diff --git a/.gitignore b/.gitignore
index 52d58e68cb6..1e9765dca9e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -124,3 +124,16 @@ website/package-lock.json
 
 # Toolchains
 /cmake/toolchain/*
+
+# ANTLR extension cache
+.antlr
+
+# ANTLR generated files
+/src/Parsers/New/*.interp
+/src/Parsers/New/*.tokens
+/src/Parsers/New/ClickHouseParserBaseVisitor.*
+
+# pytest-profiling
+/prof
+
+*.iml
diff --git a/.gitmodules b/.gitmodules
index 10a1419125d..b9a22d13c79 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -157,7 +157,7 @@
 	url = https://github.com/ClickHouse-Extras/libcpuid.git
 [submodule "contrib/openldap"]
 	path = contrib/openldap
-	url = https://github.com/openldap/openldap.git
+	url = https://github.com/ClickHouse-Extras/openldap.git
 [submodule "contrib/AMQP-CPP"]
 	path = contrib/AMQP-CPP
 	url = https://github.com/ClickHouse-Extras/AMQP-CPP.git
@@ -172,6 +172,9 @@
 [submodule "contrib/fmtlib"]
 	path = contrib/fmtlib
 	url = https://github.com/fmtlib/fmt.git
+[submodule "contrib/antlr4-runtime"]
+	path = contrib/antlr4-runtime
+	url = https://github.com/ClickHouse-Extras/antlr4-runtime.git
 [submodule "contrib/sentry-native"]
 	path = contrib/sentry-native
 	url = https://github.com/ClickHouse-Extras/sentry-native.git
@@ -200,8 +203,8 @@
 	url = https://github.com/facebook/rocksdb
 	branch = v6.14.5
 [submodule "contrib/xz"]
-    path = contrib/xz
-    url = https://github.com/xz-mirror/xz
+	path = contrib/xz
+	url = https://github.com/xz-mirror/xz
 [submodule "contrib/abseil-cpp"]
 	path = contrib/abseil-cpp
 	url = https://github.com/ClickHouse-Extras/abseil-cpp.git
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 355c664664d..c722e4a1ca0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@
 * Remove `ANALYZE` and `AST` queries, and make the setting `enable_debug_queries` obsolete since now it is part of the full-featured `EXPLAIN` query. [#16536](https://github.com/ClickHouse/ClickHouse/pull/16536) ([Ivan](https://github.com/abyss7)).
 * Aggregate functions `boundingRatio`, `rankCorr`, `retention`, `timeSeriesGroupSum`, `timeSeriesGroupRateSum`, `windowFunnel` were erroneously made case-insensitive. Now their names are made case-sensitive as designed. Only functions that are specified in the SQL standard or made for compatibility with other DBMS or functions similar to those should be case-insensitive. [#16407](https://github.com/ClickHouse/ClickHouse/pull/16407) ([alexey-milovidov](https://github.com/alexey-milovidov)).
 * Make `rankCorr` function return nan on insufficient data https://github.com/ClickHouse/ClickHouse/issues/16124. [#16135](https://github.com/ClickHouse/ClickHouse/pull/16135) ([hexiaoting](https://github.com/hexiaoting)).
+* When upgrading from versions older than 20.5, if a rolling update is performed and the cluster contains both versions 20.5 or greater and versions less than 20.5, and ClickHouse nodes with the old version are restarted while newer versions are present in the cluster, it may lead to `Part ... intersects previous part` errors. To prevent this error, first install the newer clickhouse-server packages on all cluster nodes and then do the restarts (so, when clickhouse-server is restarted, it will start up with the new version).
 
 #### New Feature
 
@@ -154,6 +155,7 @@
 * Change default value of `format_regexp_escaping_rule` setting (it's related to `Regexp` format) to `Raw` (it means - read whole subpattern as a value) to make the behaviour more like what users expect. [#15426](https://github.com/ClickHouse/ClickHouse/pull/15426) ([alexey-milovidov](https://github.com/alexey-milovidov)).
 * Add support for nested multiline comments `/* comment /* comment */ */` in SQL. This conforms to the SQL standard. [#14655](https://github.com/ClickHouse/ClickHouse/pull/14655) ([alexey-milovidov](https://github.com/alexey-milovidov)).
 * Added MergeTree settings (`max_replicated_merges_with_ttl_in_queue` and `max_number_of_merges_with_ttl_in_pool`) to control the number of merges with TTL in the background pool and replicated queue. This change breaks compatibility with older versions only if you use delete TTL. Otherwise, replication will stay compatible. You can avoid incompatibility issues if you update all shard replicas at once or execute `SYSTEM STOP TTL MERGES` until you finish the update of all replicas. If you get an incompatible entry in the replication queue, first of all, execute `SYSTEM STOP TTL MERGES` and after `ALTER TABLE ... DETACH PARTITION ...` the partition where the incompatible TTL merge was assigned. Attach it back on a single replica. [#14490](https://github.com/ClickHouse/ClickHouse/pull/14490) ([alesapin](https://github.com/alesapin)).
+* When upgrading from versions older than 20.5, if a rolling update is performed and the cluster contains both versions 20.5 or greater and versions less than 20.5, and ClickHouse nodes with the old version are restarted while newer versions are present in the cluster, it may lead to `Part ... intersects previous part` errors. To prevent this error, first install the newer clickhouse-server packages on all cluster nodes and then do the restarts (so, when clickhouse-server is restarted, it will start up with the new version).
 
 #### New Feature
 
@@ -438,6 +440,10 @@
 
 ### ClickHouse release v20.9.2.20, 2020-09-22
 
+#### Backward Incompatible Change
+
+* When upgrading from versions older than 20.5, if a rolling update is performed and the cluster contains both versions 20.5 or greater and versions less than 20.5, and ClickHouse nodes with the old version are restarted while newer versions are present in the cluster, it may lead to `Part ... intersects previous part` errors. To prevent this error, first install the newer clickhouse-server packages on all cluster nodes and then do the restarts (so, when clickhouse-server is restarted, it will start up with the new version).
+
 #### New Feature
 
 * Added column transformers `EXCEPT`, `REPLACE`, `APPLY`, which can be applied to the list of selected columns (after `*` or `COLUMNS(...)`). For example, you can write `SELECT * EXCEPT(URL) REPLACE(number + 1 AS number)`. Another example: `select * apply(length) apply(max) from wide_string_table` to find out the maximum length of all string columns. [#14233](https://github.com/ClickHouse/ClickHouse/pull/14233) ([Amos Bird](https://github.com/amosbird)).
@@ -621,6 +627,7 @@
 * Now `OPTIMIZE FINAL` query doesn't recalculate TTL for parts that were added before TTL was created. Use `ALTER TABLE ... MATERIALIZE TTL` once to calculate them, after that `OPTIMIZE FINAL` will evaluate TTL's properly. This behavior never worked for replicated tables. [#14220](https://github.com/ClickHouse/ClickHouse/pull/14220) ([alesapin](https://github.com/alesapin)).
 * Extend `parallel_distributed_insert_select` setting, adding an option to run `INSERT` into local table. The setting changes type from `Bool` to `UInt64`, so the values `false` and `true` are no longer supported. If you have these values in server configuration, the server will not start. Please replace them with `0` and `1`, respectively. [#14060](https://github.com/ClickHouse/ClickHouse/pull/14060) ([Azat Khuzhin](https://github.com/azat)).
 * Remove support for the `ODBCDriver` input/output format. This was a deprecated format once used for communication with the ClickHouse ODBC driver, now long superseded by the `ODBCDriver2` format. Resolves [#13629](https://github.com/ClickHouse/ClickHouse/issues/13629). [#13847](https://github.com/ClickHouse/ClickHouse/pull/13847) ([hexiaoting](https://github.com/hexiaoting)).
+* When upgrading from versions older than 20.5, if a rolling update is performed and the cluster contains both versions 20.5 or greater and versions less than 20.5, and ClickHouse nodes with the old version are restarted while newer versions are present in the cluster, it may lead to `Part ... intersects previous part` errors. To prevent this error, first install the newer clickhouse-server packages on all cluster nodes and then do the restarts (so, when clickhouse-server is restarted, it will start up with the new version).
 
 #### New Feature
 
@@ -765,6 +772,7 @@
 * The function `groupArrayMoving*` was not working for distributed queries. Its result was calculated within an incorrect data type (without promotion to the largest type). The function `groupArrayMovingAvg` was returning an integer number that was inconsistent with the `avg` function. This fixes [#12568](https://github.com/ClickHouse/ClickHouse/issues/12568). [#12622](https://github.com/ClickHouse/ClickHouse/pull/12622) ([alexey-milovidov](https://github.com/alexey-milovidov)).
 * Add sanity check for MergeTree settings. If the settings are incorrect, the server will refuse to start or to create a table, printing a detailed explanation to the user. [#13153](https://github.com/ClickHouse/ClickHouse/pull/13153) ([alexey-milovidov](https://github.com/alexey-milovidov)).
 * Protect from the cases when a user may set `background_pool_size` to a value lower than `number_of_free_entries_in_pool_to_execute_mutation` or `number_of_free_entries_in_pool_to_lower_max_size_of_merge`. In these cases ALTERs won't work or the maximum size of merge will be too limited. It will throw an exception explaining what to do. This closes [#10897](https://github.com/ClickHouse/ClickHouse/issues/10897). [#12728](https://github.com/ClickHouse/ClickHouse/pull/12728) ([alexey-milovidov](https://github.com/alexey-milovidov)).
+* When upgrading from versions older than 20.5, if a rolling update is performed and the cluster contains both versions 20.5 or greater and versions less than 20.5, and ClickHouse nodes with the old version are restarted while newer versions are present in the cluster, it may lead to `Part ... intersects previous part` errors. To prevent this error, first install the newer clickhouse-server packages on all cluster nodes and then do the restarts (so, when clickhouse-server is restarted, it will start up with the new version).
 
 #### New Feature
 
@@ -951,6 +959,10 @@
 
 ### ClickHouse release v20.6.3.28-stable
 
+#### Backward Incompatible Change
+
+* When upgrading from versions older than 20.5, if a rolling update is performed and the cluster contains both versions 20.5 or greater and versions less than 20.5, and ClickHouse nodes with the old version are restarted while newer versions are present in the cluster, it may lead to `Part ... intersects previous part` errors. To prevent this error, first install the newer clickhouse-server packages on all cluster nodes and then do the restarts (so, when clickhouse-server is restarted, it will start up with the new version).
+
 #### New Feature
 
 * Added an initial implementation of `EXPLAIN` query. Syntax: `EXPLAIN SELECT ...`. This fixes [#1118](https://github.com/ClickHouse/ClickHouse/issues/1118). [#11873](https://github.com/ClickHouse/ClickHouse/pull/11873) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
 
@@ -1139,6 +1151,7 @@
 * Update `zstd` to 1.4.4. It has some minor improvements in performance and compression ratio. If you run replicas with different versions of ClickHouse you may see reasonable error messages `Data after merge is not byte-identical to data on another replicas.` with explanation. These messages are OK and you should not worry. This change is backward compatible but we list it here in the changelog in case you wonder about these messages. [#10663](https://github.com/ClickHouse/ClickHouse/pull/10663) ([alexey-milovidov](https://github.com/alexey-milovidov)).
 * Added a check for meaningless codecs and a setting `allow_suspicious_codecs` to control this check. This closes [#4966](https://github.com/ClickHouse/ClickHouse/issues/4966). [#10645](https://github.com/ClickHouse/ClickHouse/pull/10645) ([alexey-milovidov](https://github.com/alexey-milovidov)).
 * Several Kafka settings changed their defaults. See [#11388](https://github.com/ClickHouse/ClickHouse/pull/11388).
+* When upgrading from versions older than 20.5, if a rolling update is performed and the cluster contains both versions 20.5 or greater and versions less than 20.5, and ClickHouse nodes with the old version are restarted while newer versions are present in the cluster, it may lead to `Part ... intersects previous part` errors. To prevent this error, first install the newer clickhouse-server packages on all cluster nodes and then do the restarts (so, when clickhouse-server is restarted, it will start up with the new version).
 
 #### New Feature
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cababc083fa..e3ddfb0c7ad 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -154,17 +154,19 @@ endif ()
 # Make sure the final executable has symbols exported
 set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -rdynamic")
 
-find_program (OBJCOPY_PATH NAMES "llvm-objcopy" "llvm-objcopy-11" "llvm-objcopy-10" "llvm-objcopy-9" "llvm-objcopy-8" "objcopy")
-if (OBJCOPY_PATH)
-    message(STATUS "Using objcopy: ${OBJCOPY_PATH}.")
+if (OS_LINUX)
+    find_program (OBJCOPY_PATH NAMES "llvm-objcopy" "llvm-objcopy-11" "llvm-objcopy-10" "llvm-objcopy-9" "llvm-objcopy-8" "objcopy")
+    if (OBJCOPY_PATH)
+        message(STATUS "Using objcopy: ${OBJCOPY_PATH}.")
 
-    if (ARCH_AMD64)
-        set(OBJCOPY_ARCH_OPTIONS -O elf64-x86-64 -B i386)
-    elseif (ARCH_AARCH64)
-        set(OBJCOPY_ARCH_OPTIONS -O elf64-aarch64 -B aarch64)
+        if (ARCH_AMD64)
+            set(OBJCOPY_ARCH_OPTIONS -O elf64-x86-64 -B i386)
+        elseif (ARCH_AARCH64)
+            set(OBJCOPY_ARCH_OPTIONS -O elf64-aarch64 -B aarch64)
+        endif ()
+    else ()
+        message(FATAL_ERROR "Cannot find objcopy.")
     endif ()
-else ()
-    message(FATAL_ERROR "Cannot find objcopy.")
 endif ()
 
 if (OS_DARWIN)
@@ -255,6 +257,8 @@
 if (WITH_COVERAGE AND COMPILER_GCC)
     set(WITHOUT_COVERAGE "-fno-profile-arcs -fno-test-coverage")
 endif()
 
+set(COMPILER_FLAGS "${COMPILER_FLAGS}")
+
 set (CMAKE_BUILD_COLOR_MAKEFILE ON)
 set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMPILER_FLAGS} ${PLATFORM_EXTRA_CXX_FLAG} ${COMMON_WARNING_FLAGS} ${CXX_WARNING_FLAGS}")
 set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3 ${CMAKE_CXX_FLAGS_ADD}")
@@ -475,9 +479,6 @@
 find_contrib_lib(cityhash)
 find_contrib_lib(farmhash)
 
-set (USE_INTERNAL_BTRIE_LIBRARY ON CACHE INTERNAL "")
-find_contrib_lib(btrie)
-
 if (ENABLE_TESTS)
     include (cmake/find/gtest.cmake)
 endif ()
diff --git a/README.md b/README.md
index 03b5c988586..97d6eb605e6 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 [![ClickHouse — open source distributed column-oriented DBMS](https://github.com/ClickHouse/ClickHouse/raw/master/website/images/logo-400x240.png)](https://clickhouse.tech)
 
-ClickHouse is an open-source column-oriented database management system that allows generating analytical data reports in real time.
+ClickHouse® is an open-source column-oriented database management system that allows generating analytical data reports in real time.
 
 ## Useful Links
 
@@ -16,7 +16,4 @@ ClickHouse is an open-source column-oriented database management system that all
 * You can also [fill this form](https://clickhouse.tech/#meet) to meet Yandex ClickHouse team in person.
 
 ## Upcoming Events
-
-* [The Second ClickHouse Meetup East (online)](https://www.eventbrite.com/e/the-second-clickhouse-meetup-east-tickets-126787955187) on October 31, 2020.
-* [ClickHouse for Enterprise Meetup (online in Russian)](https://arenadata-events.timepad.ru/event/1465249/) on November 10, 2020.
-
+* [SF Bay Area ClickHouse Virtual Office Hours (online)](https://www.meetup.com/San-Francisco-Bay-Area-ClickHouse-Meetup/events/274273549/) on 20 January 2020.
diff --git a/base/common/LineReader.cpp b/base/common/LineReader.cpp
index b2bc929a1df..a32906dd5a5 100644
--- a/base/common/LineReader.cpp
+++ b/base/common/LineReader.cpp
@@ -127,7 +127,7 @@ String LineReader::readLine(const String & first_prompt, const String & second_p
     }
 #endif
 
-    line += (line.empty() ? "" : " ") + input;
+    line += (line.empty() ? "" : "\n") + input;
 
     if (!need_next_line)
         break;
diff --git a/base/common/defines.h b/base/common/defines.h
index af5981023ff..e4c456796d3 100644
--- a/base/common/defines.h
+++ b/base/common/defines.h
@@ -76,12 +76,6 @@
 #    define NO_SANITIZE_THREAD
 #endif
 
-#if defined __GNUC__ && !defined __clang__
-#    define OPTIMIZE(x) __attribute__((__optimize__(x)))
-#else
-#    define OPTIMIZE(x)
-#endif
-
 /// A macro for suppressing warnings about unused variables or function results.
 /// Useful for structured bindings which have no standard way to declare this.
 #define UNUSED(...) (void)(__VA_ARGS__)
diff --git a/base/common/getMemoryAmount.cpp b/base/common/getMemoryAmount.cpp
index 5e600a37351..e7d284354f9 100644
--- a/base/common/getMemoryAmount.cpp
+++ b/base/common/getMemoryAmount.cpp
@@ -1,100 +1,28 @@
 #include <stdint.h>
 #include "common/getMemoryAmount.h"
 
-// http://nadeausoftware.com/articles/2012/09/c_c_tip_how_get_physical_memory_size_system
-
-/*
- * Author:  David Robert Nadeau
- * Site:    http://NadeauSoftware.com/
- * License: Creative Commons Attribution 3.0 Unported License
- *          http://creativecommons.org/licenses/by/3.0/deed.en_US
- */
-
-#if defined(WIN32) || defined(_WIN32)
-#include <Windows.h>
-#else
 #include <unistd.h>
 #include <sys/types.h>
 #include <sys/param.h>
 #if defined(BSD)
 #include <sys/sysctl.h>
 #endif
-#endif
 
-/**
- * Returns the size of physical memory (RAM) in bytes.
- * Returns 0 on unsupported platform
- */
+/** Returns the size of physical memory (RAM) in bytes.
+  * Returns 0 on unsupported platform
+  */
 uint64_t getMemoryAmountOrZero()
 {
-#if defined(_WIN32) && (defined(__CYGWIN__) || defined(__CYGWIN32__))
-    /* Cygwin under Windows. ------------------------------------ */
-    /* New 64-bit MEMORYSTATUSEX isn't available.  Use old 32.bit */
-    MEMORYSTATUS status;
-    status.dwLength = sizeof(status);
-    GlobalMemoryStatus(&status);
-    return status.dwTotalPhys;
+    int64_t num_pages = sysconf(_SC_PHYS_PAGES);
+    if (num_pages <= 0)
+        return 0;
 
-#elif defined(WIN32) || defined(_WIN32)
-    /* Windows. ------------------------------------------------- */
-    /* Use new 64-bit MEMORYSTATUSEX, not old 32-bit MEMORYSTATUS */
-    MEMORYSTATUSEX status;
-    status.dwLength = sizeof(status);
-    GlobalMemoryStatusEx(&status);
-    return status.ullTotalPhys;
+    int64_t page_size = sysconf(_SC_PAGESIZE);
+    if (page_size <= 0)
+        return 0;
 
-#else
-    /* UNIX variants. ------------------------------------------- */
-    /* Prefer sysctl() over sysconf() except sysctl() HW_REALMEM and HW_PHYSMEM */
-
-#if defined(CTL_HW) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64))
-    int mib[2];
-    mib[0] = CTL_HW;
-#if defined(HW_MEMSIZE)
-    mib[1] = HW_MEMSIZE;            /* OSX. --------------------- */
-#elif defined(HW_PHYSMEM64)
-    mib[1] = HW_PHYSMEM64;          /* NetBSD, OpenBSD. --------- */
-#endif
-    uint64_t size = 0;              /* 64-bit */
-    size_t len = sizeof(size);
-    if (sysctl(mib, 2, &size, &len, nullptr, 0) == 0)
-        return size;
-
-    return 0;                       /* Failed? */
-
-#elif defined(_SC_AIX_REALMEM)
-    /* AIX. ----------------------------------------------------- */
-    return sysconf(_SC_AIX_REALMEM) * 1024;
-
-#elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
-    /* FreeBSD, Linux, OpenBSD, and Solaris. -------------------- */
-    return uint64_t(sysconf(_SC_PHYS_PAGES))
-        *uint64_t(sysconf(_SC_PAGESIZE));
-
-#elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGE_SIZE)
-    /* Legacy. -------------------------------------------------- */
-    return uint64_t(sysconf(_SC_PHYS_PAGES))
-        * uint64_t(sysconf(_SC_PAGE_SIZE));
-
-#elif defined(CTL_HW) && (defined(HW_PHYSMEM) || defined(HW_REALMEM))
-    /* DragonFly BSD, FreeBSD, NetBSD, OpenBSD, and OSX. -------- */
-    int mib[2];
-    mib[0] = CTL_HW;
-#if defined(HW_REALMEM)
-    mib[1] = HW_REALMEM;        /* FreeBSD. ----------------- */
-#elif defined(HW_PYSMEM)
-    mib[1] = HW_PHYSMEM;        /* Others. ------------------ */
-#endif
-    unsigned int size = 0;      /* 32-bit */
-    size_t len = sizeof(size);
-    if (sysctl(mib, 2, &size, &len, nullptr, 0) == 0)
-        return size;
-
-    return 0;                   /* Failed? */
-#endif /* sysctl and sysconf variants */
-
-#endif
+    return num_pages * page_size;
 }
diff --git a/base/common/types.h b/base/common/types.h
index f3572da2972..bd5c28fe73b 100644
--- a/base/common/types.h
+++ b/base/common/types.h
@@ -8,7 +8,7 @@
 using Int16 = int16_t;
 using Int32 = int32_t;
 using Int64 = int64_t;
 
-#if __cplusplus <= 201703L
+#ifndef __cpp_char8_t
 using char8_t = unsigned char;
 #endif
diff --git a/base/common/ya.make b/base/common/ya.make
index adbbe17b486..9b38e3919be 100644
--- a/base/common/ya.make
+++ b/base/common/ya.make
@@ -5,7 +5,6 @@ LIBRARY()
 
 ADDINCL(
     GLOBAL clickhouse/base
-    GLOBAL contrib/libs/cctz/include
 )
 
 CFLAGS (GLOBAL -DARCADIA_BUILD)
@@ -24,7 +23,7 @@ ELSEIF (OS_LINUX)
 ENDIF ()
 
 PEERDIR(
-    contrib/libs/cctz/src
+    contrib/libs/cctz
     contrib/libs/cxxsupp/libcxx-filesystem
     contrib/libs/poco/Net
     contrib/libs/poco/Util
diff --git a/base/common/ya.make.in b/base/common/ya.make.in
index bcac67c7923..b5c2bbc1717 100644
--- a/base/common/ya.make.in
+++ b/base/common/ya.make.in
@@ -4,7 +4,6 @@ LIBRARY()
 
 ADDINCL(
     GLOBAL clickhouse/base
-    GLOBAL contrib/libs/cctz/include
 )
 
 CFLAGS (GLOBAL -DARCADIA_BUILD)
@@ -23,7 +22,7 @@ ELSEIF (OS_LINUX)
 ENDIF ()
 
 PEERDIR(
-    contrib/libs/cctz/src
+    contrib/libs/cctz
     contrib/libs/cxxsupp/libcxx-filesystem
     contrib/libs/poco/Net
     contrib/libs/poco/Util
diff --git a/base/daemon/SentryWriter.cpp b/base/daemon/SentryWriter.cpp
index 33f2b237dd5..b8f2e5073ab 100644
--- a/base/daemon/SentryWriter.cpp
+++ b/base/daemon/SentryWriter.cpp
@@ -6,10 +6,12 @@
 #include
 #include
+#include
 #include
 #include
 #include
+#include
 
 #if !defined(ARCADIA_BUILD)
 #    include "Common/config_version.h"
@@ -28,14 +30,13 @@
 namespace
 {
 
 bool initialized = false;
 bool anonymize = false;
+std::string server_data_path;
 
 void setExtras()
 {
-    if (!anonymize)
-    {
         sentry_set_extra("server_name", sentry_value_new_string(getFQDNOrHostName().c_str()));
-    }
+    sentry_set_tag("version", VERSION_STRING);
     sentry_set_extra("version_githash", sentry_value_new_string(VERSION_GITHASH));
     sentry_set_extra("version_describe", sentry_value_new_string(VERSION_DESCRIBE));
@@ -44,6 +45,15 @@
     sentry_set_extra("version_major", sentry_value_new_int32(VERSION_MAJOR));
     sentry_set_extra("version_minor", sentry_value_new_int32(VERSION_MINOR));
     sentry_set_extra("version_patch", sentry_value_new_int32(VERSION_PATCH));
+    sentry_set_extra("version_official", sentry_value_new_string(VERSION_OFFICIAL));
+
+    /// Sentry does not support 64-bit integers.
+    sentry_set_extra("total_ram", sentry_value_new_string(formatReadableSizeWithBinarySuffix(getMemoryAmountOrZero()).c_str()));
+    sentry_set_extra("physical_cpu_cores", sentry_value_new_int32(getNumberOfPhysicalCPUCores()));
+
+    if (!server_data_path.empty())
+        sentry_set_extra("disk_free_space", sentry_value_new_string(formatReadableSizeWithBinarySuffix(
+            Poco::File(server_data_path).freeSpace()).c_str()));
 }
 
 void sentry_logger(sentry_level_e level, const char * message, va_list args, void *)
@@ -98,6 +108,7 @@
     }
     if (enabled)
     {
+        server_data_path = config.getString("path", "");
         const std::filesystem::path & default_tmp_path = std::filesystem::path(config.getString("tmp_path", Poco::Path::temp())) / "sentry";
         const std::string & endpoint = config.getString("send_crash_reports.endpoint");
diff --git a/base/mysqlxx/Connection.cpp b/base/mysqlxx/Connection.cpp
index 8c7e11eb4a1..55757008562 100644
--- a/base/mysqlxx/Connection.cpp
+++ b/base/mysqlxx/Connection.cpp
@@ -104,6 +104,11 @@ void Connection::connect(const char* db,
     if (mysql_options(driver.get(), MYSQL_OPT_LOCAL_INFILE, &enable_local_infile_arg))
         throw ConnectionFailed(errorMessage(driver.get()), mysql_errno(driver.get()));
 
+    /// Enables auto-reconnect.
+    bool reconnect = true;
+    if (mysql_options(driver.get(), MYSQL_OPT_RECONNECT, reinterpret_cast<const char *>(&reconnect)))
+        throw ConnectionFailed(errorMessage(driver.get()), mysql_errno(driver.get()));
+
     /// Specifies particular ssl key and certificate if it needs
     if (mysql_ssl_set(driver.get(), ifNotEmpty(ssl_key), ifNotEmpty(ssl_cert), ifNotEmpty(ssl_ca), nullptr, nullptr))
         throw ConnectionFailed(errorMessage(driver.get()), mysql_errno(driver.get()));
@@ -115,11 +120,6 @@ void Connection::connect(const char* db,
     if (mysql_set_character_set(driver.get(), "UTF8"))
         throw ConnectionFailed(errorMessage(driver.get()), mysql_errno(driver.get()));
 
-    /// Enables auto-reconnect.
-    bool reconnect = true;
-    if (mysql_options(driver.get(), MYSQL_OPT_RECONNECT, reinterpret_cast<const char *>(&reconnect)))
-        throw ConnectionFailed(errorMessage(driver.get()), mysql_errno(driver.get()));
-
     is_connected = true;
 }
diff --git a/base/mysqlxx/Pool.cpp b/base/mysqlxx/Pool.cpp
index d845570f1f2..2058429d3da 100644
--- a/base/mysqlxx/Pool.cpp
+++ b/base/mysqlxx/Pool.cpp
@@ -26,6 +26,7 @@ void Pool::Entry::incrementRefCount()
     mysql_thread_init();
 }
 
+
 void Pool::Entry::decrementRefCount()
 {
     if (!data)
@@ -150,28 +151,39 @@ Pool::Entry Pool::tryGet()
 
     initialize();
 
-    /// Searching for connection which was established but wasn't used.
-    for (auto & connection : connections)
+    /// Try to pick an idle connection from already allocated
+    for (auto connection_it = connections.cbegin(); connection_it != connections.cend();)
     {
-        if (connection->ref_count == 0)
+        Connection * connection_ptr = *connection_it;
+        /// Fixme: There is a race condition here b/c we do not synchronize with Pool::Entry's copy-assignment operator
+        if (connection_ptr->ref_count == 0)
         {
-            Entry res(connection, this);
-            return res.tryForceConnected() ? res : Entry();
+            Entry res(connection_ptr, this);
+            if (res.tryForceConnected())  /// Tries to reestablish connection as well
+                return res;
+
+            auto & logger = Poco::Util::Application::instance().logger();
+            logger.information("Idle connection to mysql server cannot be recovered, dropping it.");
+
+            /// This one is disconnected, cannot be reestablished and so needs to be disposed of.
+            connection_it = connections.erase(connection_it);
+            ::delete connection_ptr;  /// TODO: Manual memory management is awkward (matches allocConnection() method)
         }
+        else
+            ++connection_it;
     }
 
-    /// Throws if pool is overflowed.
     if (connections.size() >= max_connections)
         throw Poco::Exception("mysqlxx::Pool is full");
 
-    /// Allocates new connection.
-    Connection * conn = allocConnection(true);
-    if (conn)
-        return Entry(conn, this);
+    Connection * connection_ptr = allocConnection(true);
+    if (connection_ptr)
+        return {connection_ptr, this};
 
-    return Entry();
+    return {};
 }
 
+
 void Pool::removeConnection(Connection* connection)
 {
     std::lock_guard lock(mutex);
@@ -199,11 +211,9 @@ void Pool::Entry::forceConnected() const
         throw Poco::RuntimeException("Tried to access NULL database connection.");
 
     Poco::Util::Application & app = Poco::Util::Application::instance();
-    if (data->conn.ping())
-        return;
 
     bool first = true;
-    do
+    while (!tryForceConnected())
     {
         if (first)
             first = false;
@@ -225,7 +235,26 @@ void Pool::Entry::forceConnected() const
                 pool->rw_timeout,
                 pool->enable_local_infile);
     }
-    while (!data->conn.ping());
+}
+
+
+bool Pool::Entry::tryForceConnected() const
+{
+    auto * const mysql_driver = data->conn.getDriver();
+    const auto prev_connection_id = mysql_thread_id(mysql_driver);
+    if (data->conn.ping())  /// Attempts to reestablish lost connection
+    {
+        const auto current_connection_id = mysql_thread_id(mysql_driver);
+        if (prev_connection_id != current_connection_id)
+        {
+            auto & logger = Poco::Util::Application::instance().logger();
+            logger.information("Connection to mysql server has been reestablished. Connection id changed: %d -> %d",
+                prev_connection_id, current_connection_id);
+        }
+        return true;
+    }
+
+    return false;
 }
diff --git a/base/mysqlxx/Pool.h b/base/mysqlxx/Pool.h
index 59d15e8c9a0..83b00e0081a 100644
--- a/base/mysqlxx/Pool.h
+++ b/base/mysqlxx/Pool.h
@@ -127,10 +127,7 @@ public:
         void forceConnected() const;
 
         /// Connects to database. If connection is failed then returns false.
-        bool tryForceConnected() const
-        {
-            return data->conn.ping();
-        }
+        bool tryForceConnected() const;
 
         void incrementRefCount();
         void decrementRefCount();
diff --git a/base/mysqlxx/tests/CMakeLists.txt b/base/mysqlxx/tests/CMakeLists.txt
index ec3fdfaa913..2cf19d78418 100644
--- a/base/mysqlxx/tests/CMakeLists.txt
+++ b/base/mysqlxx/tests/CMakeLists.txt
@@ -1,2 +1,5 @@
 add_executable (mysqlxx_test mysqlxx_test.cpp)
 target_link_libraries (mysqlxx_test PRIVATE mysqlxx)
+
+add_executable (mysqlxx_pool_test mysqlxx_pool_test.cpp)
+target_link_libraries (mysqlxx_pool_test PRIVATE mysqlxx)
diff --git a/base/mysqlxx/tests/mysqlxx_pool_test.cpp b/base/mysqlxx/tests/mysqlxx_pool_test.cpp
new file mode 100644
index 00000000000..3dc23e4da85
--- /dev/null
+++ b/base/mysqlxx/tests/mysqlxx_pool_test.cpp
@@ -0,0 +1,98 @@
+#include <mysqlxx/mysqlxx.h>
+
+#include <chrono>
+#include <iostream>
+#include <sstream>
+#include <thread>
+
+
+namespace
+{
+mysqlxx::Pool::Entry getWithFailover(mysqlxx::Pool & connections_pool)
+{
+    using namespace std::chrono;
+
+    constexpr size_t max_tries = 3;
+
+    mysqlxx::Pool::Entry worker_connection;
+
+    for (size_t try_no = 1; try_no <= max_tries; ++try_no)
+    {
+        try
+        {
+            worker_connection = connections_pool.tryGet();
+
+            if (!worker_connection.isNull())
+            {
+                return worker_connection;
+            }
+        }
+        catch (const Poco::Exception & e)
+        {
+            if (e.displayText().find("mysqlxx::Pool is full") != std::string::npos)
+            {
+                std::cerr << e.displayText() << std::endl;
+            }
+
+            std::cerr << "Connection to " << connections_pool.getDescription() << " failed: " << e.displayText() << std::endl;
+        }
+
+        std::clog << "Connection to all replicas failed " << try_no << " times" << std::endl;
+        std::this_thread::sleep_for(1s);
+    }
+
+    std::stringstream message;
+    message << "Connections to all replicas failed: " << connections_pool.getDescription();
+
+    throw Poco::Exception(message.str());
+}
+}
+
+int main(int, char **)
+{
+    using namespace std::chrono;
+
+    const char * remote_mysql = "localhost";
+    const std::string test_query = "SHOW DATABASES";
+
+    mysqlxx::Pool mysql_conn_pool("", remote_mysql, "default", "10203040", 3306);
+
+    size_t iteration = 0;
+    while (++iteration)
+    {
+        std::clog << "Iteration: " << iteration << std::endl;
+        try
+        {
+            std::clog << "Acquiring DB connection ...";
+            mysqlxx::Pool::Entry worker = getWithFailover(mysql_conn_pool);
+            std::clog << "ok" << std::endl;
+
+            std::clog << "Preparing query (5s sleep) ...";
+            std::this_thread::sleep_for(5s);
+            mysqlxx::Query query = worker->query();
+            query << test_query;
+            std::clog << "ok" << std::endl;
+
+            std::clog << "Querying result (5s sleep) ...";
+            std::this_thread::sleep_for(5s);
+            mysqlxx::UseQueryResult result = query.use();
+            std::clog << "ok" << std::endl;
+
+            std::clog << "Fetching result data (5s sleep) ...";
+            std::this_thread::sleep_for(5s);
+            size_t rows_count = 0;
+            while (result.fetch())
+                ++rows_count;
+            std::clog << "ok" << std::endl;
+
+            std::clog << "Read " << rows_count << " rows." << std::endl;
+        }
+        catch (const Poco::Exception & e)
+        {
+            std::cerr << "Iteration FAILED:\n" << e.displayText() << std::endl;
+        }
+
+        std::clog << "====================" << std::endl;
+        std::this_thread::sleep_for(3s);
+    }
+}
diff --git a/cmake/Modules/Findbtrie.cmake b/cmake/Modules/Findbtrie.cmake
deleted file mode 100644
index 4f3c27f5225..00000000000
--- a/cmake/Modules/Findbtrie.cmake
+++ /dev/null
@@ -1,44 +0,0 @@
-# - Try to find btrie headers and libraries.
-#
-# Usage of this module as follows:
-#
-#     find_package(btrie)
-#
-# Variables used by this module, they can change the default behaviour and need
-# to be set before calling find_package:
-#
-#  BTRIE_ROOT_DIR  Set this variable to the root installation of
-#                  btrie if the module has problems finding
-#                  the proper installation path.
-#
-# Variables defined by this module:
-#
-#  BTRIE_FOUND             System has btrie libs/headers
-#  BTRIE_LIBRARIES         The btrie library/libraries
-#  BTRIE_INCLUDE_DIR       The location of btrie headers
-
-find_path(BTRIE_ROOT_DIR
-    NAMES include/btrie.h
-)
-
-find_library(BTRIE_LIBRARIES
-    NAMES btrie
-    PATHS ${BTRIE_ROOT_DIR}/lib ${BTRIE_LIBRARIES_PATHS}
-)
-
-find_path(BTRIE_INCLUDE_DIR
-    NAMES btrie.h
-    PATHS ${BTRIE_ROOT_DIR}/include ${BTRIE_INCLUDE_PATHS}
-)
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(btrie DEFAULT_MSG
-    BTRIE_LIBRARIES
-    BTRIE_INCLUDE_DIR
-)
-
-mark_as_advanced(
-    BTRIE_ROOT_DIR
-    BTRIE_LIBRARIES
-    BTRIE_INCLUDE_DIR
-)
diff --git a/cmake/darwin/default_libs.cmake b/cmake/darwin/default_libs.cmake
index 7b57e63f4ee..4ee1bcdcfbf 100644
--- a/cmake/darwin/default_libs.cmake
+++ b/cmake/darwin/default_libs.cmake
@@ -12,13 +12,7 @@ set(CMAKE_CXX_STANDARD_LIBRARIES ${DEFAULT_LIBS})
 set(CMAKE_C_STANDARD_LIBRARIES ${DEFAULT_LIBS})
 
 # Minimal supported SDK version
-
-set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mmacosx-version-min=10.15")
-set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmacosx-version-min=10.15")
-set (CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -mmacosx-version-min=10.15")
-
-set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -mmacosx-version-min=10.15")
-set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -mmacosx-version-min=10.15")
+set(CMAKE_OSX_DEPLOYMENT_TARGET 10.15)
 
 # Global libraries
diff --git a/cmake/find/avro.cmake b/cmake/find/avro.cmake
index e0f73d99111..74ccda3489f 100644
--- a/cmake/find/avro.cmake
+++ b/cmake/find/avro.cmake
@@ -1,3 +1,4 @@
+# Needed when using Apache Avro serialization format
 option (ENABLE_AVRO "Enable Avro" ${ENABLE_LIBRARIES})
 
 if (NOT ENABLE_AVRO)
diff --git a/cmake/find/ssl.cmake b/cmake/find/ssl.cmake
index 9058857c173..f7ac9174202 100644
--- a/cmake/find/ssl.cmake
+++ b/cmake/find/ssl.cmake
@@ -1,3 +1,5 @@
+# Needed when securely connecting to an external server, e.g.
+# clickhouse-client --host ... --secure
 option(ENABLE_SSL "Enable ssl" ${ENABLE_LIBRARIES})
 
 if(NOT ENABLE_SSL)
diff --git a/cmake/warnings.cmake b/cmake/warnings.cmake
index c5f3ce47775..8122e9ef31e 100644
--- a/cmake/warnings.cmake
+++ b/cmake/warnings.cmake
@@ -23,8 +23,8 @@ option (WEVERYTHING "Enable -Weverything option with some exceptions." ON)
 
 # Control maximum size of stack frames. It can be important if the code is run in fibers with small stack size.
 # Only in release build because debug has too large stack frames.
-if ((NOT CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG") AND (NOT SANITIZE))
-    add_warning(frame-larger-than=32768)
+if ((NOT CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG") AND (NOT SANITIZE) AND (NOT CMAKE_CXX_COMPILER_ID MATCHES "AppleClang"))
+    add_warning(frame-larger-than=65536)
 endif ()
 
 if (COMPILER_CLANG)
diff --git a/contrib/AMQP-CPP b/contrib/AMQP-CPP
index d63e1f01658..03781aaff0f 160000
--- a/contrib/AMQP-CPP
+++ b/contrib/AMQP-CPP
@@ -1 +1 @@
-Subproject commit d63e1f016582e9faaaf279aa24513087a07bc6e7
+Subproject commit 03781aaff0f10ef41f902b8cf865fe0067180c10
diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt
index 92e19efe7c3..57f08cc399c 100644
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@@ -21,6 +21,7 @@ endif()
 
 set_property(DIRECTORY PROPERTY EXCLUDE_FROM_ALL 1)
 
+add_subdirectory (antlr4-runtime-cmake)
 add_subdirectory (boost-cmake)
 add_subdirectory (cctz-cmake)
 add_subdirectory (consistent-hashing-sumbur)
@@ -66,10 +67,6 @@ if (USE_INTERNAL_FARMHASH_LIBRARY)
     add_subdirectory (libfarmhash)
 endif ()
 
-if (USE_INTERNAL_BTRIE_LIBRARY)
-    add_subdirectory (libbtrie)
-endif ()
-
 if (USE_INTERNAL_ZLIB_LIBRARY)
     set (ZLIB_ENABLE_TESTS 0 CACHE INTERNAL "")
     set (SKIP_INSTALL_ALL 1 CACHE INTERNAL "")
diff --git a/contrib/antlr4-runtime b/contrib/antlr4-runtime
new file mode 160000
index 00000000000..a2fa7b76e2e
--- /dev/null
+++ b/contrib/antlr4-runtime
@@ -0,0 +1 @@
+Subproject commit a2fa7b76e2ee16d2ad955e9214a90bbf79da66fc
diff --git a/contrib/antlr4-runtime-cmake/CMakeLists.txt b/contrib/antlr4-runtime-cmake/CMakeLists.txt
new file mode 100644
index 00000000000..5baefdb1e29
--- /dev/null
+++ b/contrib/antlr4-runtime-cmake/CMakeLists.txt
@@ -0,0 +1,156 @@
+set (LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/antlr4-runtime)
+
+set (SRCS
+    ${LIBRARY_DIR}/ANTLRErrorListener.cpp
+    ${LIBRARY_DIR}/ANTLRErrorStrategy.cpp
+    ${LIBRARY_DIR}/ANTLRFileStream.cpp
+    ${LIBRARY_DIR}/ANTLRInputStream.cpp
+    ${LIBRARY_DIR}/atn/AbstractPredicateTransition.cpp
+    ${LIBRARY_DIR}/atn/ActionTransition.cpp
+    ${LIBRARY_DIR}/atn/AmbiguityInfo.cpp
+    ${LIBRARY_DIR}/atn/ArrayPredictionContext.cpp
+    ${LIBRARY_DIR}/atn/ATN.cpp
+    ${LIBRARY_DIR}/atn/ATNConfig.cpp
+    ${LIBRARY_DIR}/atn/ATNConfigSet.cpp
+    ${LIBRARY_DIR}/atn/ATNDeserializationOptions.cpp
+    ${LIBRARY_DIR}/atn/ATNDeserializer.cpp
+    ${LIBRARY_DIR}/atn/ATNSerializer.cpp
+    ${LIBRARY_DIR}/atn/ATNSimulator.cpp
+    ${LIBRARY_DIR}/atn/ATNState.cpp
+    ${LIBRARY_DIR}/atn/AtomTransition.cpp
+    ${LIBRARY_DIR}/atn/BasicBlockStartState.cpp
+    ${LIBRARY_DIR}/atn/BasicState.cpp
+    ${LIBRARY_DIR}/atn/BlockEndState.cpp
+    ${LIBRARY_DIR}/atn/BlockStartState.cpp
+    ${LIBRARY_DIR}/atn/ContextSensitivityInfo.cpp
+    ${LIBRARY_DIR}/atn/DecisionEventInfo.cpp
+    ${LIBRARY_DIR}/atn/DecisionInfo.cpp
+    ${LIBRARY_DIR}/atn/DecisionState.cpp
+    ${LIBRARY_DIR}/atn/EmptyPredictionContext.cpp
+    ${LIBRARY_DIR}/atn/EpsilonTransition.cpp
+    ${LIBRARY_DIR}/atn/ErrorInfo.cpp
+    ${LIBRARY_DIR}/atn/LexerAction.cpp
+    ${LIBRARY_DIR}/atn/LexerActionExecutor.cpp
+    ${LIBRARY_DIR}/atn/LexerATNConfig.cpp
+    ${LIBRARY_DIR}/atn/LexerATNSimulator.cpp
+    ${LIBRARY_DIR}/atn/LexerChannelAction.cpp
+    ${LIBRARY_DIR}/atn/LexerCustomAction.cpp
+    ${LIBRARY_DIR}/atn/LexerIndexedCustomAction.cpp
+    ${LIBRARY_DIR}/atn/LexerModeAction.cpp
+    ${LIBRARY_DIR}/atn/LexerMoreAction.cpp
+    ${LIBRARY_DIR}/atn/LexerPopModeAction.cpp
+    ${LIBRARY_DIR}/atn/LexerPushModeAction.cpp
+    ${LIBRARY_DIR}/atn/LexerSkipAction.cpp
+    ${LIBRARY_DIR}/atn/LexerTypeAction.cpp
+    ${LIBRARY_DIR}/atn/LL1Analyzer.cpp
+    ${LIBRARY_DIR}/atn/LookaheadEventInfo.cpp
+    ${LIBRARY_DIR}/atn/LoopEndState.cpp
+    ${LIBRARY_DIR}/atn/NotSetTransition.cpp
+    ${LIBRARY_DIR}/atn/OrderedATNConfigSet.cpp
+    ${LIBRARY_DIR}/atn/ParseInfo.cpp
+    ${LIBRARY_DIR}/atn/ParserATNSimulator.cpp
+    ${LIBRARY_DIR}/atn/PlusBlockStartState.cpp
+    ${LIBRARY_DIR}/atn/PlusLoopbackState.cpp
+    ${LIBRARY_DIR}/atn/PrecedencePredicateTransition.cpp
+    ${LIBRARY_DIR}/atn/PredicateEvalInfo.cpp
+    ${LIBRARY_DIR}/atn/PredicateTransition.cpp
+    ${LIBRARY_DIR}/atn/PredictionContext.cpp
+    ${LIBRARY_DIR}/atn/PredictionMode.cpp
+    ${LIBRARY_DIR}/atn/ProfilingATNSimulator.cpp
+    ${LIBRARY_DIR}/atn/RangeTransition.cpp
+    ${LIBRARY_DIR}/atn/RuleStartState.cpp
+    ${LIBRARY_DIR}/atn/RuleStopState.cpp
+    ${LIBRARY_DIR}/atn/RuleTransition.cpp
+    ${LIBRARY_DIR}/atn/SemanticContext.cpp
+    ${LIBRARY_DIR}/atn/SetTransition.cpp
+    ${LIBRARY_DIR}/atn/SingletonPredictionContext.cpp
+    ${LIBRARY_DIR}/atn/StarBlockStartState.cpp
+    ${LIBRARY_DIR}/atn/StarLoopbackState.cpp
+    ${LIBRARY_DIR}/atn/StarLoopEntryState.cpp
+    ${LIBRARY_DIR}/atn/TokensStartState.cpp
+    ${LIBRARY_DIR}/atn/Transition.cpp
+    ${LIBRARY_DIR}/atn/WildcardTransition.cpp
+    ${LIBRARY_DIR}/BailErrorStrategy.cpp
+    ${LIBRARY_DIR}/BaseErrorListener.cpp
+    ${LIBRARY_DIR}/BufferedTokenStream.cpp
+    ${LIBRARY_DIR}/CharStream.cpp
+    ${LIBRARY_DIR}/CommonToken.cpp
+    ${LIBRARY_DIR}/CommonTokenFactory.cpp
+    ${LIBRARY_DIR}/CommonTokenStream.cpp
+    ${LIBRARY_DIR}/ConsoleErrorListener.cpp
+    ${LIBRARY_DIR}/DefaultErrorStrategy.cpp
+    ${LIBRARY_DIR}/dfa/DFA.cpp
+    ${LIBRARY_DIR}/dfa/DFASerializer.cpp
+    ${LIBRARY_DIR}/dfa/DFAState.cpp
+    ${LIBRARY_DIR}/dfa/LexerDFASerializer.cpp
+    ${LIBRARY_DIR}/DiagnosticErrorListener.cpp
+    ${LIBRARY_DIR}/Exceptions.cpp
+    ${LIBRARY_DIR}/FailedPredicateException.cpp
+    ${LIBRARY_DIR}/InputMismatchException.cpp
+    ${LIBRARY_DIR}/InterpreterRuleContext.cpp
+    ${LIBRARY_DIR}/IntStream.cpp
+    ${LIBRARY_DIR}/Lexer.cpp
+    ${LIBRARY_DIR}/LexerInterpreter.cpp
+    ${LIBRARY_DIR}/LexerNoViableAltException.cpp
+    ${LIBRARY_DIR}/ListTokenSource.cpp
+    ${LIBRARY_DIR}/misc/InterpreterDataReader.cpp
+    ${LIBRARY_DIR}/misc/Interval.cpp
+    ${LIBRARY_DIR}/misc/IntervalSet.cpp
+    ${LIBRARY_DIR}/misc/MurmurHash.cpp
+    ${LIBRARY_DIR}/misc/Predicate.cpp
+    ${LIBRARY_DIR}/NoViableAltException.cpp
+    ${LIBRARY_DIR}/Parser.cpp
+    ${LIBRARY_DIR}/ParserInterpreter.cpp
+    ${LIBRARY_DIR}/ParserRuleContext.cpp
+    ${LIBRARY_DIR}/ProxyErrorListener.cpp
+    ${LIBRARY_DIR}/RecognitionException.cpp
+    ${LIBRARY_DIR}/Recognizer.cpp
+    ${LIBRARY_DIR}/RuleContext.cpp
+    ${LIBRARY_DIR}/RuleContextWithAltNum.cpp
+    ${LIBRARY_DIR}/RuntimeMetaData.cpp
+    ${LIBRARY_DIR}/support/Any.cpp
+    ${LIBRARY_DIR}/support/Arrays.cpp
+    ${LIBRARY_DIR}/support/CPPUtils.cpp
+    ${LIBRARY_DIR}/support/guid.cpp
+    ${LIBRARY_DIR}/support/StringUtils.cpp
+    ${LIBRARY_DIR}/Token.cpp
+    ${LIBRARY_DIR}/TokenSource.cpp
+    ${LIBRARY_DIR}/TokenStream.cpp
+    ${LIBRARY_DIR}/TokenStreamRewriter.cpp
+    ${LIBRARY_DIR}/tree/ErrorNode.cpp
+    ${LIBRARY_DIR}/tree/ErrorNodeImpl.cpp
+    ${LIBRARY_DIR}/tree/IterativeParseTreeWalker.cpp
+    ${LIBRARY_DIR}/tree/ParseTree.cpp
+    ${LIBRARY_DIR}/tree/ParseTreeListener.cpp
+    ${LIBRARY_DIR}/tree/ParseTreeVisitor.cpp
+    ${LIBRARY_DIR}/tree/ParseTreeWalker.cpp
+    ${LIBRARY_DIR}/tree/pattern/Chunk.cpp
+    ${LIBRARY_DIR}/tree/pattern/ParseTreeMatch.cpp
+    ${LIBRARY_DIR}/tree/pattern/ParseTreePattern.cpp
+    ${LIBRARY_DIR}/tree/pattern/ParseTreePatternMatcher.cpp
+    ${LIBRARY_DIR}/tree/pattern/RuleTagToken.cpp
+    ${LIBRARY_DIR}/tree/pattern/TagChunk.cpp
+    ${LIBRARY_DIR}/tree/pattern/TextChunk.cpp
+    ${LIBRARY_DIR}/tree/pattern/TokenTagToken.cpp
+    ${LIBRARY_DIR}/tree/TerminalNode.cpp
+    ${LIBRARY_DIR}/tree/TerminalNodeImpl.cpp
+    ${LIBRARY_DIR}/tree/Trees.cpp
+    ${LIBRARY_DIR}/tree/xpath/XPath.cpp
+    ${LIBRARY_DIR}/tree/xpath/XPathElement.cpp
+    ${LIBRARY_DIR}/tree/xpath/XPathLexer.cpp
+    ${LIBRARY_DIR}/tree/xpath/XPathLexerErrorListener.cpp
+    ${LIBRARY_DIR}/tree/xpath/XPathRuleAnywhereElement.cpp
+    ${LIBRARY_DIR}/tree/xpath/XPathRuleElement.cpp
+    ${LIBRARY_DIR}/tree/xpath/XPathTokenAnywhereElement.cpp
+    ${LIBRARY_DIR}/tree/xpath/XPathTokenElement.cpp
+    ${LIBRARY_DIR}/tree/xpath/XPathWildcardAnywhereElement.cpp
+    ${LIBRARY_DIR}/tree/xpath/XPathWildcardElement.cpp
+    ${LIBRARY_DIR}/UnbufferedCharStream.cpp
+    ${LIBRARY_DIR}/UnbufferedTokenStream.cpp
+    ${LIBRARY_DIR}/Vocabulary.cpp
+    ${LIBRARY_DIR}/WritableToken.cpp
+)
+
+add_library (antlr4-runtime ${SRCS})
+
+target_include_directories (antlr4-runtime SYSTEM PUBLIC ${LIBRARY_DIR})
diff --git a/contrib/cassandra b/contrib/cassandra
index a49b4e0e269..d10187efb25 160000
--- a/contrib/cassandra
+++ b/contrib/cassandra
@@ -1 +1 @@
-Subproject commit a49b4e0e2696a4b8ef286a5b9538d1cbe8490509
+Subproject commit d10187efb25b26da391def077edf3c6f2f3a23dd
diff --git a/contrib/libbtrie/CMakeLists.txt b/contrib/libbtrie/CMakeLists.txt
deleted file mode 100644
index 2b0c8e3fd75..00000000000
--- a/contrib/libbtrie/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-add_library(btrie
-    src/btrie.c
-    include/btrie.h
-)
-
-target_include_directories (btrie SYSTEM PUBLIC include)
diff --git a/contrib/libbtrie/LICENSE b/contrib/libbtrie/LICENSE
deleted file mode 100644
index d386c6f7b79..00000000000
--- a/contrib/libbtrie/LICENSE
+++ /dev/null
@@ -1,23 +0,0 @@
-Copyright (c) 2013, CobbLiu
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-  Redistributions of source code must retain the above copyright notice, this
-  list of conditions and the following disclaimer.
-
-  Redistributions in binary form must reproduce the above copyright notice, this
-  list of conditions and the following disclaimer in the documentation and/or
-  other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/contrib/libbtrie/include/btrie.h b/contrib/libbtrie/include/btrie.h
deleted file mode 100644
index 6d805108e7a..00000000000
--- a/contrib/libbtrie/include/btrie.h
+++ /dev/null
@@ -1,160 +0,0 @@
-#pragma once
-
-#if defined (__cplusplus)
-extern "C" {
-#endif
-
-#include <stdlib.h>
-#include <stdint.h>
-
-/**
- * In btrie, each leaf means one bit in ip tree.
- * Left means 0, and right means 1.
- */
-
-#define BTRIE_NULL (uintptr_t) -1
-
-#if !defined(BTRIE_MAX_PAGES)
-/// 54 ip per page. 8 bytes memory per page when empty
-#define BTRIE_MAX_PAGES 1024 * 2048 /// 128m ips , ~16mb ram when empty
-// #define BTRIE_MAX_PAGES 1024 * 65535 /// 4g ips (whole ipv4), ~512mb ram when empty
-#endif
-
-typedef struct btrie_node_s btrie_node_t;
-
-struct btrie_node_s {
-    btrie_node_t *right;
-    btrie_node_t *left;
-    btrie_node_t *parent;
-    uintptr_t value;
-};
-
-
-typedef struct btrie_s {
-    btrie_node_t *root;
-
-    btrie_node_t *free;  /* free list of btrie */
-    char *start;
-    size_t size;
-
-    /*
-     * memory pool.
-     * memory management(esp free) will be so easy by using this facility.
-     */
-    char *pools[BTRIE_MAX_PAGES];
-    size_t len;
-} btrie_t;
-
-
-/**
- * Create an empty btrie
- *
- * @Return:
- * An ip radix_tree created.
- * NULL if creation failed.
- */
-
-btrie_t *btrie_create();
-
-/**
- * Destroy the ip radix_tree
- *
- * @Return:
- * OK if deletion succeed.
- * ERROR if error occurs while deleting.
- */
-int btrie_destroy(btrie_t *tree);
-
-/**
- * Count the nodes in the radix tree.
- */
-size_t btrie_count(btrie_t *tree);
-
-/**
- * Return the allocated number of bytes.
- */
-size_t btrie_allocated(btrie_t *tree);
-
-
-/**
- * Add an ipv4 into btrie
- *
- * @Args:
- * key: ip address
- * mask: key's mask
- * value: value of this IP, may be NULL.
- *
- * @Return:
- * OK for success.
- * ERROR for failure.
- */
-int btrie_insert(btrie_t *tree, uint32_t key, uint32_t mask,
-    uintptr_t value);
-
-
-/**
- * Delete an ipv4 from btrie
- *
- * @Args:
- *
- * @Return:
- * OK for success.
- * ERROR for failure.
- */
-int btrie_delete(btrie_t *tree, uint32_t key, uint32_t mask);
-
-
-/**
- * Find an ipv4 from btrie
- *
-
- * @Args:
- *
- * @Return:
- * Value if succeed.
- * NULL if failed.
- */
-uintptr_t btrie_find(btrie_t *tree, uint32_t key);
-
-
-/**
- * Add an ipv6 into btrie
- *
- * @Args:
- * key: ip address
- * mask: key's mask
- * value: value of this IP, may be NULL.
- *
- * @Return:
- * OK for success.
- * ERROR for failure.
- */
-int btrie_insert_a6(btrie_t *tree, const uint8_t *key, const uint8_t *mask,
-    uintptr_t value);
-
-/**
- * Delete an ipv6 from btrie
- *
- * @Args:
- *
- * @Return:
- * OK for success.
- * ERROR for failure.
- */
-int btrie_delete_a6(btrie_t *tree, const uint8_t *key, const uint8_t *mask);
-
-/**
- * Find an ipv6 from btrie
- *
-
- * @Args:
- *
- * @Return:
- * Value if succeed.
- * NULL if failed.
- */
-uintptr_t btrie_find_a6(btrie_t *tree, const uint8_t *key);
-
-#if defined (__cplusplus)
-}
-#endif
\ No newline at end of file
diff --git a/contrib/libbtrie/src/btrie.c b/contrib/libbtrie/src/btrie.c
deleted file mode 100644
index f9353019ac1..00000000000
--- a/contrib/libbtrie/src/btrie.c
+++ /dev/null
@@ -1,460 +0,0 @@
-#include <stdlib.h>
-#include <string.h>
-#include <btrie.h>
-
-#define PAGE_SIZE 4096
-
-
-static btrie_node_t *
-btrie_alloc(btrie_t *tree)
-{
-    btrie_node_t *p;
-
-    if (tree->free) {
-        p = tree->free;
-        tree->free = tree->free->right;
-        return p;
-    }
-
-    if (tree->size < sizeof(btrie_node_t)) {
-        tree->start = (char *) calloc(sizeof(char), PAGE_SIZE);
-        if (tree->start == NULL) {
-            return NULL;
-        }
-
-        tree->pools[tree->len++] = tree->start;
-        tree->size = PAGE_SIZE;
-    }
-
-    p = (btrie_node_t *) tree->start;
-
-    tree->start += sizeof(btrie_node_t);
-    tree->size -= sizeof(btrie_node_t);
-
-    return p;
-}
-
-
-btrie_t *
-btrie_create()
-{
-    btrie_t *tree = (btrie_t *) malloc(sizeof(btrie_t));
-    if (tree == NULL) {
-        return NULL;
-    }
-
-    tree->free = NULL;
-    tree->start = NULL;
-    tree->size = 0;
-    memset(tree->pools, 0, sizeof(btrie_t *) * BTRIE_MAX_PAGES);
-    tree->len = 0;
-
-    tree->root = btrie_alloc(tree);
-    if (tree->root == NULL) {
-        return NULL;
-    }
-
-    tree->root->right = NULL;
-    tree->root->left = NULL;
-    tree->root->parent = NULL;
-    tree->root->value = BTRIE_NULL;
-
-    return tree;
-}
-
-static size_t
-subtree_weight(btrie_node_t *node)
-{
-    size_t weight = 1;
-    if (node->left) {
-        weight += subtree_weight(node->left);
-    }
-    if (node->right) {
-        weight += subtree_weight(node->right);
-    }
-    return weight;
-}
-
-size_t
-btrie_count(btrie_t *tree)
-{
-    if (tree->root == NULL) {
-        return 0;
-    }
-
-    return subtree_weight(tree->root);
-}
-
-size_t
-btrie_allocated(btrie_t *tree)
-{
-    return tree->len * PAGE_SIZE;
-}
-
-
-int
-btrie_insert(btrie_t *tree, uint32_t key, uint32_t mask,
-    uintptr_t value)
-{
-    uint32_t bit;
-    btrie_node_t *node, *next;
-
-    bit = 0x80000000;
-
-    node = tree->root;
-    next = tree->root;
-
-    while (bit & mask) {
-        if (key & bit) {
-            next = node->right;
-
-        } else {
-            next = node->left;
-        }
-
-        if (next == NULL) {
-            break;
-        }
-
-        bit >>= 1;
-        node = next;
-    }
-
-    if (next) {
-        if (node->value != BTRIE_NULL) {
-            return -1;
-        }
-
-        node->value = value;
-        return 0;
-    }
-
-    while (bit & mask) {
-        next = btrie_alloc(tree);
-        if (next == NULL) {
-            return -1;
-        }
-
-        next->right = NULL;
-        next->left = NULL;
-        next->parent = node;
-        next->value = BTRIE_NULL;
-
-        if (key & bit) {
-            node->right = next;
-
-        } else {
-            node->left = next;
-        }
-
-        bit >>= 1;
-        node = next;
-    }
-
-    node->value = value;
-
-    return 0;
-}
-
-
-int
-btrie_delete(btrie_t *tree, uint32_t key, uint32_t mask)
-{
-    uint32_t bit;
-    btrie_node_t *node;
-
-    bit = 0x80000000;
-    node = tree->root;
-
-    while (node && (bit & mask)) {
-        if (key & bit) {
-            node = node->right;
-
-        } else {
-            node = node->left;
-        }
-
-        bit >>= 1;
-    }
-
-    if (node == NULL) {
-        return -1;
-    }
-
-    if (node->right || node->left) {
-        if (node->value != BTRIE_NULL) {
-            node->value = BTRIE_NULL;
-            return 0;
-        }
-
-        return -1;
-    }
-
-    for ( ;; ) {
-        if (node->parent->right == node) {
-            node->parent->right = NULL;
-
-        } else {
-            node->parent->left = NULL;
-        }
-
-        node->right = tree->free;
-        tree->free = node;
-
-        node = node->parent;
-
-        if (node->right || node->left) {
-            break;
-        }
-
-        if (node->value != BTRIE_NULL) {
-            break;
-        }
-
-        if (node->parent == NULL) {
-            break;
-        }
-    }
-
-    return 0;
-}
-
-
-uintptr_t
-btrie_find(btrie_t *tree, uint32_t key)
-{
-    uint32_t bit;
-    uintptr_t value;
-    btrie_node_t *node;
-
-    bit = 0x80000000;
-    value = BTRIE_NULL;
-    node = tree->root;
-
-    while (node) {
-        if (node->value != BTRIE_NULL) {
-            value = node->value;
-        }
-
-        if (key & bit) {
-            node = node->right;
-
-        } else {
-            node = node->left;
-        }
-
-        bit >>= 1;
-    }
-
-    return value;
-}
-
-
-int
-btrie_insert_a6(btrie_t *tree, const uint8_t *key, const uint8_t *mask,
-    uintptr_t value)
-{
-    uint8_t bit;
-    unsigned int i;
-    btrie_node_t *node, *next;
-
-    i = 0;
-    bit = 0x80;
-
-    node = tree->root;
-    next = tree->root;
-
-    while (bit & mask[i]) {
-        if (key[i] & bit) {
-            next = node->right;
-
-        } else {
-            next = node->left;
-        }
-
-        if (next == NULL) {
-            break;
-        }
-
-        bit >>= 1;
-        node = next;
-
-        if (bit == 0) {
-            if (++i == 16) {
-                break;
-            }
-
-            bit = 0x80;
-        }
-    }
-
-    if (next) {
-        if (node->value != BTRIE_NULL) {
-            return -1;
-        }
-
-        node->value = value;
-        return 0;
-    }
-
-    while (bit & mask[i]) {
-        next = btrie_alloc(tree);
-        if (next == NULL) {
-            return -1;
-        }
-
-        next->right = NULL;
-        next->left = NULL;
-        next->parent = node;
-        next->value = BTRIE_NULL;
-
-        if (key[i] & bit) {
-            node->right = next;
-
-        } else {
-            node->left = next;
-        }
-
-        bit >>= 1;
-        node = next;
-
-        if (bit == 0) {
-            if (++i == 16) {
-                break;
-            }
-
-            bit = 0x80;
-        }
-    }
-
-    node->value = value;
-
-    return 0;
-}
-
-
-int
-btrie_delete_a6(btrie_t *tree, const uint8_t *key, const uint8_t *mask)
-{
-    uint8_t bit;
-    unsigned int i;
-    btrie_node_t *node;
-
-    i = 0;
-    bit = 0x80;
-    node = tree->root;
-
-    while (node && (bit & mask[i])) {
-        if (key[i] & bit) {
-            node = node->right;
-
-        } else {
-            node = node->left;
-        }
-
-        bit >>= 1;
-
-        if (bit == 0) {
-            if (++i == 16) {
-                break;
-            }
-
-            bit = 0x80;
-        }
-    }
-
-    if (node == NULL) {
-        return -1;
-    }
-
-    if (node->right || node->left) {
-        if (node->value != BTRIE_NULL) {
-            node->value = BTRIE_NULL;
-            return 0;
-        }
-
-        return -1;
-    }
-
-    for ( ;; ) {
-        if (node->parent->right == node) {
-            node->parent->right = NULL;
-
-        } else {
-            node->parent->left = NULL;
-        }
-
-        node->right = tree->free;
-        tree->free = node;
-
-        node = node->parent;
-
-        if (node->right || node->left) {
-            break;
-        }
-
-        if (node->value != BTRIE_NULL) {
-            break;
-        }
-
-        if (node->parent == NULL) {
-            break;
-        }
-    }
-
-    return 0;
-}
-
-
-uintptr_t
-btrie_find_a6(btrie_t *tree, const uint8_t *key)
-{
-    uint8_t bit;
-    uintptr_t value;
-    unsigned int i;
-    btrie_node_t *node;
-
-    i = 0;
-    bit = 0x80;
-    value = BTRIE_NULL;
-    node = tree->root;
-
-    while (node) {
-        if (node->value != BTRIE_NULL) {
-            value = node->value;
-        }
-
-        if (key[i] & bit) {
-            node = node->right;
-
-        } else {
-            node = node->left;
-        }
-
-        bit >>= 1;
-
-        if (bit == 0) {
-            i++;
-            bit = 0x80;
-        }
-    }
-
-    return value;
-}
-
-
-int
-btrie_destroy(btrie_t *tree)
-{
-    size_t i;
-
-
-    /* free memory pools */
-    for (i = 0; i < tree->len; i++) {
-        free(tree->pools[i]);
-    }
-
-    free(tree);
-
-    return 0;
-}
diff --git a/contrib/libbtrie/test/test_btrie.c b/contrib/libbtrie/test/test_btrie.c
deleted file mode 100644
index 2bbf2b2db7e..00000000000
--- a/contrib/libbtrie/test/test_btrie.c
+++ /dev/null
@@ -1,103 +0,0 @@
-#include <stdio.h>
-#include <btrie.h>
-
-int main()
-{
-    btrie_t *it;
-    int ret;
-
-    uint8_t prefix_v6[16] = {0xde, 0xad, 0xbe, 0xef};
-    uint8_t mask_v6[16] = {0xff, 0xff, 0xff};
-    uint8_t ip_v6[16] = {0xde, 0xad, 0xbe, 0xef, 0xde};
-
-    it = btrie_create();
-    if (it == NULL) {
-        printf("create error!\n");
-        return 0;
-    }
-
-    //add 101.45.69.50/16
-    ret = btrie_insert(it, 1697465650, 0xffff0000, 1);
-    if (ret != 0) {
-        printf("insert 1 error.\n");
-        goto error;
-    }
-
-    //add 10.45.69.50/16
-    ret = btrie_insert(it, 170738994, 0xffff0000, 1);
-    if (ret != 0) {
-        printf("insert 2 error.\n");
-        goto error;
-    }
-
-    //add 10.45.79.50/16
-    ret = btrie_insert(it, 170741554, 0xffff0000, 1);
-    if (ret == 0) {
-        printf("insert 3 error.\n");
-        goto error;
-    }
-
-    //add 102.45.79.50/24
-    ret = btrie_insert(it, 1714245426, 0xffffff00, 1);
-    if (ret != 0) {
-        printf("insert 4 error.\n");
-        goto error;
-    }
-
-    ret = btrie_find(it, 170741554);
-    if (ret == 1) {
-        printf("test case 1 passed\n");
-    } else {
-        printf("test case 1 error\n");
-    }
-
-    ret = btrie_find(it, 170786817);
-    if (ret != 1) {
-        printf("test case 2 passed\n");
-    } else {
-        printf("test case 2 error\n");
-    }
-
-    ret = btrie_delete(it, 1714245426, 0xffffff00);
-    if (ret != 0) {
-        printf("delete 1 error\n");
-        goto error;
-    }
-
-    ret = btrie_find(it, 1714245426);
-    if (ret != 1) {
-        printf("test case 3 passed\n");
-    } else {
-        printf("test case 3 error\n");
-    }
-
-    //add dead:beef::/32
-    ret = btrie_insert_a6(it, prefix_v6, mask_v6, 1);
-    if (ret != 0) {
-        printf("insert 5 error\n");
-        goto error;
-    }
-
-    ret = btrie_find_a6(it, ip_v6);
-    if (ret == 1) {
-        printf("test case 4 passed\n");
-    } else {
-        printf("test case 4 error\n");
-    }
-
-    // insert 4m ips
-    for (size_t ip = 1; ip < 1024 * 1024 * 4; ++ip) {
-        ret = btrie_insert(it, ip, 0xffffffff, 1);
-        if (ret != 0) {
-            printf("insert 5 error (%d) (%zu) .\n", ret, ip);
-            goto error;
-        }
-    }
-
-    return 0;
-
- error:
-    btrie_destroy(it);
-    printf("test failed\n");
-    return 1;
-}
diff --git a/contrib/librdkafka b/contrib/librdkafka
index 2090cbf56b7..9902bc4fb18 160000
--- a/contrib/librdkafka
+++ b/contrib/librdkafka
@@ -1 +1 @@
-Subproject commit 2090cbf56b715247ec2be7f768707a7ab1bf7ede
+Subproject commit 9902bc4fb18bb441fa55ca154b341cdda191e5d3
diff --git a/contrib/libunwind-cmake/CMakeLists.txt b/contrib/libunwind-cmake/CMakeLists.txt
index 82b3b9c0de5..3afff30eee7 100644
--- a/contrib/libunwind-cmake/CMakeLists.txt
+++ b/contrib/libunwind-cmake/CMakeLists.txt
@@ -22,7 +22,16 @@ set_source_files_properties(${LIBUNWIND_C_SOURCES} PROPERTIES COMPILE_FLAGS "-st
 set(LIBUNWIND_ASM_SOURCES
     ${LIBUNWIND_SOURCE_DIR}/src/UnwindRegistersRestore.S
     ${LIBUNWIND_SOURCE_DIR}/src/UnwindRegistersSave.S)
-set_source_files_properties(${LIBUNWIND_ASM_SOURCES} PROPERTIES LANGUAGE C)
+
+# CMake doesn't pass the correct architecture for Apple prior to CMake 3.19 [1]
+# Workaround these two issues by compiling as C.
+#
+# [1]: https://gitlab.kitware.com/cmake/cmake/-/issues/20771
+if (APPLE AND CMAKE_VERSION VERSION_LESS 3.19)
+    set_source_files_properties(${LIBUNWIND_ASM_SOURCES} PROPERTIES LANGUAGE C)
+else()
+    enable_language(ASM)
+endif()
 
 set(LIBUNWIND_SOURCES
     ${LIBUNWIND_CXX_SOURCES}
diff --git a/contrib/mariadb-connector-c b/contrib/mariadb-connector-c
index 1485b0de3ea..e05523ca7c1 160000
--- a/contrib/mariadb-connector-c
+++ b/contrib/mariadb-connector-c
@@ -1 +1 @@
-Subproject commit 1485b0de3eaa1508dfe49a5ba1e4aa2a71fd8335
+Subproject commit e05523ca7c1fb8d095b612a1b1cfe96e199ffb17
diff --git a/contrib/openldap b/contrib/openldap
index 34b9ba94b30..0208811b604 160000
--- a/contrib/openldap
+++ b/contrib/openldap
@@ -1 +1 @@
-Subproject commit 34b9ba94b30319ed6389a4e001d057f7983fe363
+Subproject commit 0208811b6043ca06fda8631a5e473df1ec515ccb
diff --git a/contrib/poco b/contrib/poco
index f49c6ab8d3a..f3d791f6568 160000
--- a/contrib/poco
+++ b/contrib/poco
@@ -1 +1 @@
-Subproject commit f49c6ab8d3aa71828bd1b411485c21722e8c9d82
+Subproject commit f3d791f6568b99366d089b4479f76a515beb66d5
diff --git a/debian/clickhouse-server.init b/debian/clickhouse-server.init
index 8f10153a682..3e4e888eacd 100755
--- a/debian/clickhouse-server.init
+++ b/debian/clickhouse-server.init
@@ -67,26 +67,6 @@ if uname -mpi | grep -q 'x86_64'; then
 fi
 
 
-is_running()
-{
-    pgrep --pidfile "$CLICKHOUSE_PIDFILE" $(echo "${PROGRAM}" | cut -c1-15) 1> /dev/null 2> /dev/null
-}
-
-
-wait_for_done()
-{
-    timeout=$1
-    attempts=0
-    while is_running; do
-        attempts=$(($attempts + 1))
-        if [ -n "$timeout" ] && [ $attempts -gt $timeout ]; then
-            return 1
-        fi
-        sleep 1
-    done
-}
-
-
 die()
 {
     echo $1 >&2
@@ -105,49 +85,7 @@ check_config()
 
 initdb()
 {
-    if [ -x "$CLICKHOUSE_BINDIR/$EXTRACT_FROM_CONFIG" ]; then
-        CLICKHOUSE_DATADIR_FROM_CONFIG=$(su -s $SHELL ${CLICKHOUSE_USER} -c "$CLICKHOUSE_BINDIR/$EXTRACT_FROM_CONFIG --config-file=\"$CLICKHOUSE_CONFIG\" --key=path")
-        if [ "(" "$?" -ne "0" ")" -o "(" -z "${CLICKHOUSE_DATADIR_FROM_CONFIG}" ")" ]; then
-            die "Cannot obtain value of path from config file: ${CLICKHOUSE_CONFIG}";
-        fi
-        echo "Path to data directory in ${CLICKHOUSE_CONFIG}: ${CLICKHOUSE_DATADIR_FROM_CONFIG}"
-    else
-        CLICKHOUSE_DATADIR_FROM_CONFIG=$CLICKHOUSE_DATADIR
-    fi
-
-    if ! getent passwd ${CLICKHOUSE_USER} >/dev/null; then
-        echo "Can't chown to non-existing user ${CLICKHOUSE_USER}"
-        return
-    fi
-    if ! getent group ${CLICKHOUSE_GROUP} >/dev/null; then
-        echo "Can't chown to non-existing group ${CLICKHOUSE_GROUP}"
-        return
-    fi
-
-    if ! $(su -s $SHELL ${CLICKHOUSE_USER} -c "test -r ${CLICKHOUSE_CONFIG}"); then
-        echo "Warning! clickhouse config [${CLICKHOUSE_CONFIG}] not readable by user [${CLICKHOUSE_USER}]"
-    fi
-
-    if ! $(su -s $SHELL ${CLICKHOUSE_USER} -c "test -O \"${CLICKHOUSE_DATADIR_FROM_CONFIG}\" && test -G \"${CLICKHOUSE_DATADIR_FROM_CONFIG}\""); then
-        if [ $(dirname "${CLICKHOUSE_DATADIR_FROM_CONFIG}") = "/" ]; then
-            echo "Directory ${CLICKHOUSE_DATADIR_FROM_CONFIG} seems too dangerous to chown."
-        else
-            if [ ! -e "${CLICKHOUSE_DATADIR_FROM_CONFIG}" ]; then
-                echo "Creating directory ${CLICKHOUSE_DATADIR_FROM_CONFIG}"
-                mkdir -p "${CLICKHOUSE_DATADIR_FROM_CONFIG}"
-            fi
-
-            echo "Changing owner of [${CLICKHOUSE_DATADIR_FROM_CONFIG}] to [${CLICKHOUSE_USER}:${CLICKHOUSE_GROUP}]"
-            chown -R ${CLICKHOUSE_USER}:${CLICKHOUSE_GROUP} "${CLICKHOUSE_DATADIR_FROM_CONFIG}"
-        fi
-    fi
-
-    if ! $(su -s $SHELL ${CLICKHOUSE_USER} -c "test -w ${CLICKHOUSE_LOGDIR}"); then
-        echo "Changing owner of [${CLICKHOUSE_LOGDIR}/*] to [${CLICKHOUSE_USER}:${CLICKHOUSE_GROUP}]"
-        chown -R ${CLICKHOUSE_USER}:${CLICKHOUSE_GROUP} ${CLICKHOUSE_LOGDIR}/*
-        echo "Changing owner of [${CLICKHOUSE_LOGDIR}] to [${CLICKHOUSE_LOGDIR_USER}:${CLICKHOUSE_GROUP}]"
-        chown ${CLICKHOUSE_LOGDIR_USER}:${CLICKHOUSE_GROUP} ${CLICKHOUSE_LOGDIR}
-    fi
+    ${CLICKHOUSE_GENERIC_PROGRAM} install --user "${CLICKHOUSE_USER}" --pid-path "${CLICKHOUSE_PIDDIR}" --config-path "${CLICKHOUSE_CONFDIR}" --binary-path "${CLICKHOUSE_BINDIR}"
 }
 
@@ -171,17 +109,7 @@ restart()
 
 forcestop()
 {
-    local EXIT_STATUS
-    EXIT_STATUS=0
-
-    echo -n "Stop forcefully $PROGRAM service: "
-
-    kill -KILL $(cat "$CLICKHOUSE_PIDFILE")
-
-    wait_for_done
-
-    echo "DONE"
-    return $EXIT_STATUS
+    ${CLICKHOUSE_GENERIC_PROGRAM} stop --force --pid-path "${CLICKHOUSE_PIDDIR}"
 }
 
@@ -261,16 +189,16 @@ main()
             service_or_func restart
             ;;
         condstart)
-            is_running || service_or_func start
+            service_or_func start
            ;;
        condstop)
-            is_running && service_or_func stop
+            service_or_func stop
            ;;
        condrestart)
-            is_running && service_or_func restart
+            service_or_func restart
            ;;
        condreload)
-            is_running && service_or_func restart
+            service_or_func restart
            ;;
        initdb)
            initdb
@@ -293,17 +221,7 @@
 
 status()
 {
-    if is_running; then
-        echo "$PROGRAM service is running"
-        exit 0
-    else
-        if is_cron_disabled; then
-            echo "$PROGRAM service is stopped";
-        else
-            echo "$PROGRAM: process unexpectedly terminated"
-        fi
-        exit 3
-    fi
+    ${CLICKHOUSE_GENERIC_PROGRAM} status --pid-path "${CLICKHOUSE_PIDDIR}"
 }
diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh
index c95344eeca2..63c3c679668 100755
--- a/docker/test/fasttest/run.sh
+++ b/docker/test/fasttest/run.sh
@@ -104,223 +104,249 @@ function start_server
 
 function clone_root
 {
-git clone https://github.com/ClickHouse/ClickHouse.git -- "$FASTTEST_SOURCE" | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/clone_log.txt"
+    git clone https://github.com/ClickHouse/ClickHouse.git -- "$FASTTEST_SOURCE" | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/clone_log.txt"
 
-(
-cd "$FASTTEST_SOURCE"
-if [ "$PULL_REQUEST_NUMBER" != "0" ]; then
-    if git fetch origin "+refs/pull/$PULL_REQUEST_NUMBER/merge"; then
-        git checkout FETCH_HEAD
-        echo 'Clonned merge head'
-    else
-        git fetch
-        git checkout "$COMMIT_SHA"
-        echo 'Checked out to commit'
-    fi
-else
-    if [ -v COMMIT_SHA ]; then
-        git checkout "$COMMIT_SHA"
-    fi
-fi
-)
+    (
+        cd "$FASTTEST_SOURCE"
+        if [ "$PULL_REQUEST_NUMBER" != "0" ]; then
+            if git fetch origin "+refs/pull/$PULL_REQUEST_NUMBER/merge"; then
+                git checkout FETCH_HEAD
+                echo 'Clonned merge head'
+            else
+                git fetch
+                git checkout "$COMMIT_SHA"
+                echo 'Checked out to commit'
+            fi
+        else
+            if [ -v COMMIT_SHA ]; then
+                git checkout "$COMMIT_SHA"
+            fi
+        fi
+    )
 }
 
 function clone_submodules
 {
-(
-cd "$FASTTEST_SOURCE"
+    (
+        cd "$FASTTEST_SOURCE"
 
-SUBMODULES_TO_UPDATE=(contrib/boost contrib/zlib-ng contrib/libxml2 contrib/poco contrib/libunwind contrib/ryu contrib/fmtlib contrib/base64 contrib/cctz contrib/libcpuid contrib/double-conversion contrib/libcxx contrib/libcxxabi contrib/libc-headers contrib/lz4 contrib/zstd contrib/fastops contrib/rapidjson contrib/re2 contrib/sparsehash-c11 contrib/croaring contrib/miniselect contrib/xz)
+        SUBMODULES_TO_UPDATE=(
+            contrib/antlr4-runtime
+            contrib/boost
+            contrib/zlib-ng
+            contrib/libxml2
+            contrib/poco
+            contrib/libunwind
+            contrib/ryu
+            contrib/fmtlib
+            contrib/base64
+            contrib/cctz
+            contrib/libcpuid
+            contrib/double-conversion
+            contrib/libcxx
+            contrib/libcxxabi
+            contrib/libc-headers
+            contrib/lz4
+            contrib/zstd
+            contrib/fastops
+            contrib/rapidjson
+            contrib/re2
+            contrib/sparsehash-c11
+            contrib/croaring
+            contrib/miniselect
+            contrib/xz
+        )
 
-git submodule sync
-git submodule update --init --recursive "${SUBMODULES_TO_UPDATE[@]}"
-git submodule foreach git reset --hard
-git submodule foreach git checkout @ -f
-git submodule foreach git clean -xfd
-)
+        git submodule sync
+        git submodule update --init --recursive "${SUBMODULES_TO_UPDATE[@]}"
+        git submodule foreach git reset --hard
+        git submodule foreach git checkout @ -f
+        git submodule foreach git clean -xfd
+    )
 }
 
 function run_cmake
 {
-CMAKE_LIBS_CONFIG=(
-    "-DENABLE_LIBRARIES=0"
-    "-DENABLE_TESTS=0"
-    "-DENABLE_UTILS=0"
-    "-DENABLE_EMBEDDED_COMPILER=0"
-    "-DENABLE_THINLTO=0"
-    "-DUSE_UNWIND=1"
-)
+    CMAKE_LIBS_CONFIG=(
+        "-DENABLE_LIBRARIES=0"
+        "-DENABLE_TESTS=0"
+        "-DENABLE_UTILS=0"
+        "-DENABLE_EMBEDDED_COMPILER=0"
+        "-DENABLE_THINLTO=0"
+        "-DUSE_UNWIND=1"
+    )
 
-# TODO remove this? we don't use ccache anyway. An option would be to download it
-# from S3 simultaneously with cloning.
-export CCACHE_DIR="$FASTTEST_WORKSPACE/ccache"
-export CCACHE_BASEDIR="$FASTTEST_SOURCE"
-export CCACHE_NOHASHDIR=true
-export CCACHE_COMPILERCHECK=content
-export CCACHE_MAXSIZE=15G
+    # TODO remove this? we don't use ccache anyway. An option would be to download it
+    # from S3 simultaneously with cloning.
+    export CCACHE_DIR="$FASTTEST_WORKSPACE/ccache"
+    export CCACHE_BASEDIR="$FASTTEST_SOURCE"
+    export CCACHE_NOHASHDIR=true
+    export CCACHE_COMPILERCHECK=content
+    export CCACHE_MAXSIZE=15G
 
-ccache --show-stats ||:
-ccache --zero-stats ||:
+    ccache --show-stats ||:
+    ccache --zero-stats ||:
 
-mkdir "$FASTTEST_BUILD" ||:
+    mkdir "$FASTTEST_BUILD" ||:
 
-(
-cd "$FASTTEST_BUILD"
-cmake "$FASTTEST_SOURCE" -DCMAKE_CXX_COMPILER=clang++-10 -DCMAKE_C_COMPILER=clang-10 "${CMAKE_LIBS_CONFIG[@]}" "${FASTTEST_CMAKE_FLAGS[@]}" | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/cmake_log.txt"
-)
+    (
+        cd "$FASTTEST_BUILD"
+        cmake "$FASTTEST_SOURCE" -DCMAKE_CXX_COMPILER=clang++-10 -DCMAKE_C_COMPILER=clang-10 "${CMAKE_LIBS_CONFIG[@]}" "${FASTTEST_CMAKE_FLAGS[@]}" | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/cmake_log.txt"
+    )
 }
 
 function build
 {
-(
-cd "$FASTTEST_BUILD"
-time ninja clickhouse-bundle | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/build_log.txt"
-if [ "$COPY_CLICKHOUSE_BINARY_TO_OUTPUT" -eq "1" ]; then
-    cp programs/clickhouse "$FASTTEST_OUTPUT/clickhouse"
-fi
-ccache --show-stats ||:
-)
+    (
+        cd "$FASTTEST_BUILD"
+        time ninja clickhouse-bundle | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/build_log.txt"
+        if [ "$COPY_CLICKHOUSE_BINARY_TO_OUTPUT" -eq "1" ]; then
+            cp programs/clickhouse "$FASTTEST_OUTPUT/clickhouse"
+        fi
+        ccache --show-stats ||:
+    )
 }
 
 function configure
 {
-clickhouse-client --version
-clickhouse-test --help
+    clickhouse-client --version
+    clickhouse-test --help
 
-mkdir -p "$FASTTEST_DATA"{,/client-config}
-cp -a "$FASTTEST_SOURCE/programs/server/"{config,users}.xml "$FASTTEST_DATA"
-"$FASTTEST_SOURCE/tests/config/install.sh" "$FASTTEST_DATA" "$FASTTEST_DATA/client-config"
-cp -a "$FASTTEST_SOURCE/programs/server/config.d/log_to_console.xml" "$FASTTEST_DATA/config.d"
-# doesn't support SSL
-rm -f "$FASTTEST_DATA/config.d/secure_ports.xml"
+    mkdir -p "$FASTTEST_DATA"{,/client-config}
+    cp -a "$FASTTEST_SOURCE/programs/server/"{config,users}.xml "$FASTTEST_DATA"
+    "$FASTTEST_SOURCE/tests/config/install.sh"
"$FASTTEST_DATA" "$FASTTEST_DATA/client-config" + cp -a "$FASTTEST_SOURCE/programs/server/config.d/log_to_console.xml" "$FASTTEST_DATA/config.d" + # doesn't support SSL + rm -f "$FASTTEST_DATA/config.d/secure_ports.xml" } function run_tests { -clickhouse-server --version -clickhouse-test --help + clickhouse-server --version + clickhouse-test --help -# Kill the server in case we are running locally and not in docker -stop_server ||: - -start_server - -TESTS_TO_SKIP=( - 00105_shard_collations - 00109_shard_totals_after_having - 00110_external_sort - 00302_http_compression - 00417_kill_query - 00436_convert_charset - 00490_special_line_separators_and_characters_outside_of_bmp - 00652_replicated_mutations_zookeeper - 00682_empty_parts_merge - 00701_rollup - 00834_cancel_http_readonly_queries_on_client_close - 00911_tautological_compare - 00926_multimatch - 00929_multi_match_edit_distance - 01031_mutations_interpreter_and_context - 01053_ssd_dictionary # this test mistakenly requires acces to /var/lib/clickhouse -- can't run this locally, disabled - 01083_expressions_in_engine_arguments - 01092_memory_profiler - 01098_msgpack_format - 01098_temporary_and_external_tables - 01103_check_cpu_instructions_at_startup # avoid dependency on qemu -- invonvenient when running locally - 01193_metadata_loading - 01238_http_memory_tracking # max_memory_usage_for_user can interfere another queries running concurrently - 01251_dict_is_in_infinite_loop - 01259_dictionary_custom_settings_ddl - 01268_dictionary_direct_layout - 01280_ssd_complex_key_dictionary - 01281_group_by_limit_memory_tracking # max_memory_usage_for_user can interfere another queries running concurrently - 01318_encrypt # Depends on OpenSSL - 01318_decrypt # Depends on OpenSSL - 01281_unsucceeded_insert_select_queries_counter - 01292_create_user - 01294_lazy_database_concurrent - 01305_replica_create_drop_zookeeper - 01354_order_by_tuple_collate_const - 01355_ilike - 01411_bayesian_ab_testing - 01532_collate_in_low_cardinality - 01533_collate_in_nullable - 01542_collate_in_array - 01543_collate_in_tuple - _orc_ - arrow - avro - base64 - brotli - capnproto - client - ddl_dictionaries - h3 - hashing - hdfs - java_hash - json - limit_memory - live_view - memory_leak - memory_limit - mysql - odbc - parallel_alter - parquet - protobuf - secure - sha256 - xz - - # Not sure why these two fail even in sequential mode. Disabled for now - # to make some progress. - 00646_url_engine - 00974_query_profiler - - # In fasttest, ENABLE_LIBRARIES=0, so rocksdb engine is not enabled by default - 01504_rocksdb - - # Look at DistributedFilesToInsert, so cannot run in parallel. - 01460_DistributedFilesToInsert - - 01541_max_memory_usage_for_user - - # Require python libraries like scipy, pandas and numpy - 01322_ttest_scipy - - 01545_system_errors - # Checks system.errors - 01563_distributed_query_finish -) - -time clickhouse-test -j 8 --order=random --no-long --testname --shard --zookeeper --skip "${TESTS_TO_SKIP[@]}" -- "$FASTTEST_FOCUS" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/test_log.txt" - -# substr is to remove semicolon after test name -readarray -t FAILED_TESTS < <(awk '/FAIL|TIMEOUT|ERROR/ { print substr($3, 1, length($3)-1) }' "$FASTTEST_OUTPUT/test_log.txt" | tee "$FASTTEST_OUTPUT/failed-parallel-tests.txt") - -# We will rerun sequentially any tests that have failed during parallel run. -# They might have failed because there was some interference from other tests -# running concurrently. 
If they fail even in seqential mode, we will report them. -# FIXME All tests that require exclusive access to the server must be -# explicitly marked as `sequential`, and `clickhouse-test` must detect them and -# run them in a separate group after all other tests. This is faster and also -# explicit instead of guessing. -if [[ -n "${FAILED_TESTS[*]}" ]] -then + # Kill the server in case we are running locally and not in docker stop_server ||: - # Clean the data so that there is no interference from the previous test run. - rm -rf "$FASTTEST_DATA"/{{meta,}data,user_files} ||: - start_server - echo "Going to run again: ${FAILED_TESTS[*]}" + TESTS_TO_SKIP=( + 00105_shard_collations + 00109_shard_totals_after_having + 00110_external_sort + 00302_http_compression + 00417_kill_query + 00436_convert_charset + 00490_special_line_separators_and_characters_outside_of_bmp + 00652_replicated_mutations_zookeeper + 00682_empty_parts_merge + 00701_rollup + 00834_cancel_http_readonly_queries_on_client_close + 00911_tautological_compare + 00926_multimatch + 00929_multi_match_edit_distance + 01031_mutations_interpreter_and_context + 01053_ssd_dictionary # this test mistakenly requires acces to /var/lib/clickhouse -- can't run this locally, disabled + 01083_expressions_in_engine_arguments + 01092_memory_profiler + 01098_msgpack_format + 01098_temporary_and_external_tables + 01103_check_cpu_instructions_at_startup # avoid dependency on qemu -- invonvenient when running locally + 01193_metadata_loading + 01238_http_memory_tracking # max_memory_usage_for_user can interfere another queries running concurrently + 01251_dict_is_in_infinite_loop + 01259_dictionary_custom_settings_ddl + 01268_dictionary_direct_layout + 01280_ssd_complex_key_dictionary + 01281_group_by_limit_memory_tracking # max_memory_usage_for_user can interfere another queries running concurrently + 01318_encrypt # Depends on OpenSSL + 01318_decrypt # Depends on OpenSSL + 01281_unsucceeded_insert_select_queries_counter + 01292_create_user + 01294_lazy_database_concurrent + 01305_replica_create_drop_zookeeper + 01354_order_by_tuple_collate_const + 01355_ilike + 01411_bayesian_ab_testing + 01532_collate_in_low_cardinality + 01533_collate_in_nullable + 01542_collate_in_array + 01543_collate_in_tuple + _orc_ + arrow + avro + base64 + brotli + capnproto + client + ddl_dictionaries + h3 + hashing + hdfs + java_hash + json + limit_memory + live_view + memory_leak + memory_limit + mysql + odbc + parallel_alter + parquet + protobuf + secure + sha256 + xz - clickhouse-test --order=random --no-long --testname --shard --zookeeper "${FAILED_TESTS[@]}" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee -a "$FASTTEST_OUTPUT/test_log.txt" -else - echo "No failed tests" -fi + # Not sure why these two fail even in sequential mode. Disabled for now + # to make some progress. + 00646_url_engine + 00974_query_profiler + + # In fasttest, ENABLE_LIBRARIES=0, so rocksdb engine is not enabled by default + 01504_rocksdb + + # Look at DistributedFilesToInsert, so cannot run in parallel. 
+ 01460_DistributedFilesToInsert + + 01541_max_memory_usage_for_user + + # Require python libraries like scipy, pandas and numpy + 01322_ttest_scipy + 01561_mann_whitney_scipy + + 01545_system_errors + # Checks system.errors + 01563_distributed_query_finish + ) + + time clickhouse-test -j 8 --order=random --no-long --testname --shard --zookeeper --skip "${TESTS_TO_SKIP[@]}" -- "$FASTTEST_FOCUS" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/test_log.txt" + + # substr is to remove semicolon after test name + readarray -t FAILED_TESTS < <(awk '/FAIL|TIMEOUT|ERROR/ { print substr($3, 1, length($3)-1) }' "$FASTTEST_OUTPUT/test_log.txt" | tee "$FASTTEST_OUTPUT/failed-parallel-tests.txt") + + # We will rerun sequentially any tests that have failed during parallel run. + # They might have failed because there was some interference from other tests + # running concurrently. If they fail even in seqential mode, we will report them. + # FIXME All tests that require exclusive access to the server must be + # explicitly marked as `sequential`, and `clickhouse-test` must detect them and + # run them in a separate group after all other tests. This is faster and also + # explicit instead of guessing. + if [[ -n "${FAILED_TESTS[*]}" ]] + then + stop_server ||: + + # Clean the data so that there is no interference from the previous test run. + rm -rf "$FASTTEST_DATA"/{{meta,}data,user_files} ||: + + start_server + + echo "Going to run again: ${FAILED_TESTS[*]}" + + clickhouse-test --order=random --no-long --testname --shard --zookeeper "${FAILED_TESTS[@]}" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee -a "$FASTTEST_OUTPUT/test_log.txt" + else + echo "No failed tests" + fi } case "$stage" in diff --git a/docker/test/integration/runner/Dockerfile b/docker/test/integration/runner/Dockerfile index 70b57b245d3..36188fc4a63 100644 --- a/docker/test/integration/runner/Dockerfile +++ b/docker/test/integration/runner/Dockerfile @@ -28,6 +28,7 @@ RUN apt-get update \ libssl-dev \ libcurl4-openssl-dev \ gdb \ + software-properties-common \ && rm -rf \ /var/lib/apt/lists/* \ /var/cache/debconf \ @@ -37,6 +38,22 @@ RUN apt-get update \ ENV TZ=Europe/Moscow RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone +ENV DOCKER_CHANNEL stable +ENV DOCKER_VERSION 5:19.03.13~3-0~ubuntu-bionic +RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - +RUN add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -c -s) ${DOCKER_CHANNEL}" + +RUN apt-get update \ + && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \ + docker-ce \ + && rm -rf \ + /var/lib/apt/lists/* \ + /var/cache/debconf \ + /tmp/* \ + && apt-get clean + +RUN dockerd --version; docker --version + RUN python3 -m pip install \ PyMySQL \ aerospike \ @@ -60,28 +77,6 @@ RUN python3 -m pip install \ tzlocal \ urllib3 -ENV DOCKER_CHANNEL stable -ENV DOCKER_VERSION 17.09.1-ce - -RUN set -eux; \ - \ -# this "case" statement is generated via "update.sh" - \ - if ! 
wget -nv -O docker.tgz "https://download.docker.com/linux/static/${DOCKER_CHANNEL}/x86_64/docker-${DOCKER_VERSION}.tgz"; then \ - echo >&2 "error: failed to download 'docker-${DOCKER_VERSION}' from '${DOCKER_CHANNEL}' for '${x86_64}'"; \ - exit 1; \ - fi; \ - \ - tar --extract \ - --file docker.tgz \ - --strip-components 1 \ - --directory /usr/local/bin/ \ - ; \ - rm docker.tgz; \ - \ - dockerd --version; \ - docker --version - COPY modprobe.sh /usr/local/bin/modprobe COPY dockerd-entrypoint.sh /usr/local/bin/ COPY compose/ /compose/ diff --git a/docker/test/integration/runner/compose/docker_compose_kerberized_kafka.yml b/docker/test/integration/runner/compose/docker_compose_kerberized_kafka.yml index 3ce0000b148..6e1e11344bb 100644 --- a/docker/test/integration/runner/compose/docker_compose_kerberized_kafka.yml +++ b/docker/test/integration/runner/compose/docker_compose_kerberized_kafka.yml @@ -50,7 +50,7 @@ services: - label:disable kafka_kerberos: - image: yandex/clickhouse-kerberos-kdc:${DOCKER_KERBEROS_KDC_TAG} + image: yandex/clickhouse-kerberos-kdc:${DOCKER_KERBEROS_KDC_TAG:-latest} hostname: kafka_kerberos volumes: - ${KERBERIZED_KAFKA_DIR}/secrets:/tmp/keytab diff --git a/docker/test/integration/runner/compose/docker_compose_mysql.yml b/docker/test/integration/runner/compose/docker_compose_mysql.yml index 2f09c2c01e3..90daf8a4238 100644 --- a/docker/test/integration/runner/compose/docker_compose_mysql.yml +++ b/docker/test/integration/runner/compose/docker_compose_mysql.yml @@ -7,4 +7,4 @@ services: MYSQL_ROOT_PASSWORD: clickhouse ports: - 3308:3306 - command: --server_id=100 --log-bin='mysql-bin-1.log' --default-time-zone='+3:00' --gtid-mode="ON" --enforce-gtid-consistency \ No newline at end of file + command: --server_id=100 --log-bin='mysql-bin-1.log' --default-time-zone='+3:00' --gtid-mode="ON" --enforce-gtid-consistency diff --git a/docker/test/integration/runner/compose/docker_compose_mysql_5_7_for_materialize_mysql.yml b/docker/test/integration/runner/compose/docker_compose_mysql_5_7_for_materialize_mysql.yml new file mode 100644 index 00000000000..e7d762203ee --- /dev/null +++ b/docker/test/integration/runner/compose/docker_compose_mysql_5_7_for_materialize_mysql.yml @@ -0,0 +1,10 @@ +version: '2.3' +services: + mysql1: + image: mysql:5.7 + restart: 'no' + environment: + MYSQL_ROOT_PASSWORD: clickhouse + ports: + - 3308:3306 + command: --server_id=100 --log-bin='mysql-bin-1.log' --default-time-zone='+3:00' --gtid-mode="ON" --enforce-gtid-consistency diff --git a/docker/test/integration/runner/compose/docker_compose_mysql_8_0.yml b/docker/test/integration/runner/compose/docker_compose_mysql_8_0_for_materialize_mysql.yml similarity index 93% rename from docker/test/integration/runner/compose/docker_compose_mysql_8_0.yml rename to docker/test/integration/runner/compose/docker_compose_mysql_8_0_for_materialize_mysql.yml index 1aa97f59a83..918a2b5f80f 100644 --- a/docker/test/integration/runner/compose/docker_compose_mysql_8_0.yml +++ b/docker/test/integration/runner/compose/docker_compose_mysql_8_0_for_materialize_mysql.yml @@ -2,7 +2,7 @@ version: '2.3' services: mysql8_0: image: mysql:8.0 - restart: always + restart: 'no' environment: MYSQL_ROOT_PASSWORD: clickhouse ports: diff --git a/docker/test/integration/runner/compose/docker_compose_mysql_golang_client.yml b/docker/test/integration/runner/compose/docker_compose_mysql_golang_client.yml index b172cbcb2c6..a6a338eb6a8 100644 --- a/docker/test/integration/runner/compose/docker_compose_mysql_golang_client.yml +++ 
b/docker/test/integration/runner/compose/docker_compose_mysql_golang_client.yml @@ -1,6 +1,6 @@ version: '2.3' services: golang1: - image: yandex/clickhouse-mysql-golang-client:${DOCKER_MYSQL_GOLANG_CLIENT_TAG} + image: yandex/clickhouse-mysql-golang-client:${DOCKER_MYSQL_GOLANG_CLIENT_TAG:-latest} # to keep container running command: sleep infinity diff --git a/docker/test/integration/runner/compose/docker_compose_mysql_java_client.yml b/docker/test/integration/runner/compose/docker_compose_mysql_java_client.yml index be1b3ad3f72..21d927df82c 100644 --- a/docker/test/integration/runner/compose/docker_compose_mysql_java_client.yml +++ b/docker/test/integration/runner/compose/docker_compose_mysql_java_client.yml @@ -1,6 +1,6 @@ version: '2.3' services: java1: - image: yandex/clickhouse-mysql-java-client:${DOCKER_MYSQL_JAVA_CLIENT_TAG} + image: yandex/clickhouse-mysql-java-client:${DOCKER_MYSQL_JAVA_CLIENT_TAG:-latest} # to keep container running command: sleep infinity diff --git a/docker/test/integration/runner/compose/docker_compose_mysql_js_client.yml b/docker/test/integration/runner/compose/docker_compose_mysql_js_client.yml index 83954229111..dbd85cf2382 100644 --- a/docker/test/integration/runner/compose/docker_compose_mysql_js_client.yml +++ b/docker/test/integration/runner/compose/docker_compose_mysql_js_client.yml @@ -1,6 +1,6 @@ version: '2.3' services: mysqljs1: - image: yandex/clickhouse-mysql-js-client:${DOCKER_MYSQL_JS_CLIENT_TAG} + image: yandex/clickhouse-mysql-js-client:${DOCKER_MYSQL_JS_CLIENT_TAG:-latest} # to keep container running command: sleep infinity diff --git a/docker/test/integration/runner/compose/docker_compose_mysql_php_client.yml b/docker/test/integration/runner/compose/docker_compose_mysql_php_client.yml index e61cb193b0e..f24f5337a7e 100644 --- a/docker/test/integration/runner/compose/docker_compose_mysql_php_client.yml +++ b/docker/test/integration/runner/compose/docker_compose_mysql_php_client.yml @@ -1,6 +1,6 @@ version: '2.3' services: php1: - image: yandex/clickhouse-mysql-php-client:${DOCKER_MYSQL_PHP_CLIENT_TAG} + image: yandex/clickhouse-mysql-php-client:${DOCKER_MYSQL_PHP_CLIENT_TAG:-latest} # to keep container running command: sleep infinity diff --git a/docker/test/integration/runner/compose/docker_compose_postgesql_java_client.yml b/docker/test/integration/runner/compose/docker_compose_postgesql_java_client.yml index ef18d1edd7b..38191f1bdd6 100644 --- a/docker/test/integration/runner/compose/docker_compose_postgesql_java_client.yml +++ b/docker/test/integration/runner/compose/docker_compose_postgesql_java_client.yml @@ -1,6 +1,6 @@ version: '2.2' services: java: - image: yandex/clickhouse-postgresql-java-client:${DOCKER_POSTGRESQL_JAVA_CLIENT_TAG} + image: yandex/clickhouse-postgresql-java-client:${DOCKER_POSTGRESQL_JAVA_CLIENT_TAG:-latest} # to keep container running command: sleep infinity diff --git a/docker/test/performance-comparison/Dockerfile b/docker/test/performance-comparison/Dockerfile index 004bac02918..8734e47e80f 100644 --- a/docker/test/performance-comparison/Dockerfile +++ b/docker/test/performance-comparison/Dockerfile @@ -25,12 +25,13 @@ RUN apt-get update \ python3 \ python3-dev \ python3-pip \ + python3-setuptools \ rsync \ tree \ tzdata \ vim \ wget \ - && pip3 --no-cache-dir install 'clickhouse-driver>=0.1.5' scipy \ + && pip3 --no-cache-dir install 'git+https://github.com/mymarilyn/clickhouse-driver.git' scipy \ && apt-get purge --yes python3-dev g++ \ && apt-get autoremove --yes \ && apt-get clean \ diff --git 
a/docker/test/performance-comparison/perf.py b/docker/test/performance-comparison/perf.py index 1c54479aab3..7175d0e4143 100755 --- a/docker/test/performance-comparison/perf.py +++ b/docker/test/performance-comparison/perf.py @@ -143,7 +143,8 @@ reportStageEnd('before-connect') # Open connections servers = [{'host': host or args.host[0], 'port': port or args.port[0]} for (host, port) in itertools.zip_longest(args.host, args.port)] -all_connections = [clickhouse_driver.Client(**server) for server in servers] +# Force settings_is_important to fail queries on unknown settings. +all_connections = [clickhouse_driver.Client(**server, settings_is_important=True) for server in servers] for i, s in enumerate(servers): print(f'server\t{i}\t{s["host"]}\t{s["port"]}') @@ -167,12 +168,6 @@ if not args.use_existing_tables: reportStageEnd('drop-1') # Apply settings. -# If there are errors, report them and continue -- maybe a new test uses a setting -# that is not in master, but the queries can still run. If we have multiple -# settings and one of them throws an exception, all previous settings for this -# connection will be reset, because the driver reconnects on error (not -# configurable). So the end result is uncertain, but hopefully we'll be able to -# run at least some queries. settings = root.findall('settings/*') for conn_index, c in enumerate(all_connections): for s in settings: @@ -415,4 +410,4 @@ if not args.keep_created_tables and not args.use_existing_tables: c.execute(q) print(f'drop\t{conn_index}\t{c.last_query.elapsed}\t{tsv_escape(q)}') -reportStageEnd('drop-2') + reportStageEnd('drop-2') diff --git a/docker/test/stateless/Dockerfile b/docker/test/stateless/Dockerfile index 47c45e57508..b063f8d81f6 100644 --- a/docker/test/stateless/Dockerfile +++ b/docker/test/stateless/Dockerfile @@ -8,6 +8,7 @@ RUN apt-get update -y \ apt-get install --yes --no-install-recommends \ brotli \ expect \ + zstd \ lsof \ ncdu \ netcat-openbsd \ diff --git a/docker/test/stateless_unbundled/Dockerfile b/docker/test/stateless_unbundled/Dockerfile index 1c9f9510d7e..d212290d553 100644 --- a/docker/test/stateless_unbundled/Dockerfile +++ b/docker/test/stateless_unbundled/Dockerfile @@ -8,6 +8,7 @@ RUN apt-get --allow-unauthenticated update -y \ apt-get --allow-unauthenticated install --yes --no-install-recommends \ alien \ brotli \ + zstd \ cmake \ devscripts \ expect \ diff --git a/docker/test/stateless_with_coverage/Dockerfile b/docker/test/stateless_with_coverage/Dockerfile index f7379ba5568..e31d2d1e009 100644 --- a/docker/test/stateless_with_coverage/Dockerfile +++ b/docker/test/stateless_with_coverage/Dockerfile @@ -24,6 +24,7 @@ RUN apt-get update -y \ tree \ moreutils \ brotli \ + zstd \ gdb \ lsof \ unixodbc \ diff --git a/docs/_includes/cmake_in_clickhouse_header.md b/docs/_includes/cmake_in_clickhouse_header.md index 10776e04c01..7dfda35e34a 100644 --- a/docs/_includes/cmake_in_clickhouse_header.md +++ b/docs/_includes/cmake_in_clickhouse_header.md @@ -13,9 +13,9 @@ cmake .. 
\ -DENABLE_CLICKHOUSE_SERVER=ON \ -DENABLE_CLICKHOUSE_CLIENT=ON \ -DUSE_STATIC_LIBRARIES=OFF \ - -DCLICKHOUSE_SPLIT_BINARY=ON \ -DSPLIT_SHARED_LIBRARIES=ON \ -DENABLE_LIBRARIES=OFF \ + -DUSE_UNWIND=ON \ -DENABLE_UTILS=OFF \ -DENABLE_TESTS=OFF ``` diff --git a/docs/en/development/contrib.md b/docs/en/development/contrib.md index 639b78185e4..76a2f647231 100644 --- a/docs/en/development/contrib.md +++ b/docs/en/development/contrib.md @@ -17,7 +17,6 @@ toc_title: Third-Party Libraries Used | googletest | [BSD 3-Clause License](https://github.com/google/googletest/blob/master/LICENSE) | | h3 | [Apache License 2.0](https://github.com/uber/h3/blob/master/LICENSE) | | hyperscan | [BSD 3-Clause License](https://github.com/intel/hyperscan/blob/master/LICENSE) | -| libbtrie | [BSD 2-Clause License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) | | libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) | | libdivide | [Zlib License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) | | libgsasl | [LGPL v2.1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) | diff --git a/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md b/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md index 4bfb9dc200e..ea0b265d652 100644 --- a/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md @@ -273,13 +273,15 @@ SELECT sum(Duration) AS Duration FROM UAct GROUP BY UserID -```text +``` + +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┐ │ 4324182021466249494 │ 6 │ 185 │ └─────────────────────┴───────────┴──────────┘ ``` -``` sqk +``` sql select count() FROM UAct ``` diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 584bd31e276..5f99ff99dab 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -579,6 +579,7 @@ Tags: - `disk` — a disk within a volume. - `max_data_part_size_bytes` — the maximum size of a part that can be stored on any of the volume’s disks. - `move_factor` — when the amount of available space gets lower than this factor, data automatically start to move on the next volume if any (by default, 0.1). +- `prefer_not_to_merge` — Disables merging of data parts on this volume. When this setting is enabled, merging data on this volume is not allowed. This allows controlling how ClickHouse works with slow disks. Cofiguration examples: @@ -607,6 +608,18 @@ Cofiguration examples: 0.2 + + + +
+        <small_jbod_with_external_no_merges>
+            <volumes>
+                <main>
+                    <disk>jbod1</disk>
+                </main>
+                <external>
+                    <disk>external</disk>
+                    <prefer_not_to_merge>true</prefer_not_to_merge>
+                </external>
+            </volumes>
+        </small_jbod_with_external_no_merges>
... diff --git a/docs/en/engines/table-engines/mergetree-family/replacingmergetree.md b/docs/en/engines/table-engines/mergetree-family/replacingmergetree.md index 684e7e28112..b82bc65afc2 100644 --- a/docs/en/engines/table-engines/mergetree-family/replacingmergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/replacingmergetree.md @@ -5,7 +5,7 @@ toc_title: ReplacingMergeTree # ReplacingMergeTree {#replacingmergetree} -The engine differs from [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md#table_engines-mergetree) in that it removes duplicate entries with the same [sorting key](../../../engines/table-engines/mergetree-family/mergetree.md) value. +The engine differs from [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md#table_engines-mergetree) in that it removes duplicate entries with the same [sorting key](../../../engines/table-engines/mergetree-family/mergetree.md) value (`ORDER BY` table section, not `PRIMARY KEY`). Data deduplication occurs only during a merge. Merging occurs in the background at an unknown time, so you can’t plan for it. Some of the data may remain unprocessed. Although you can run an unscheduled merge using the `OPTIMIZE` query, don’t count on using it, because the `OPTIMIZE` query will read and write a large amount of data. @@ -29,13 +29,16 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] For a description of request parameters, see [statement description](../../../sql-reference/statements/create/table.md). +!!! note "Attention" + Uniqueness of rows is determined by the `ORDER BY` table section, not `PRIMARY KEY`. + **ReplacingMergeTree Parameters** - `ver` — column with version. Type `UInt*`, `Date` or `DateTime`. Optional parameter. When merging, `ReplacingMergeTree` from all the rows with the same sorting key leaves only one: - - Last in the selection, if `ver` not set. + - The last in the selection, if `ver` not set. A selection is a set of rows in a set of parts participating in the merge. The most recently created part (the last insert) will be the last one in the selection. Thus, after deduplication, the very last row from the most recent insert will remain for each unique sorting key. - With the maximum version, if `ver` specified. **Query clauses** diff --git a/docs/en/engines/table-engines/mergetree-family/replication.md b/docs/en/engines/table-engines/mergetree-family/replication.md index 932facc9ddc..625869a3cb8 100644 --- a/docs/en/engines/table-engines/mergetree-family/replication.md +++ b/docs/en/engines/table-engines/mergetree-family/replication.md @@ -53,6 +53,42 @@ Example of setting the addresses of the ZooKeeper cluster: ``` +ClickHouse also supports to store replicas meta information in the auxiliary ZooKeeper cluster by providing ZooKeeper cluster name and path as engine arguments. +In other word, it supports to store the metadata of differnt tables in different ZooKeeper clusters. + +Example of setting the addresses of the auxiliary ZooKeeper cluster: + +``` xml + + + + example_2_1 + 2181 + + + example_2_2 + 2181 + + + example_2_3 + 2181 + + + + + example_3_1 + 2181 + + + +``` + +To store table datameta in a auxiliary ZooKeeper cluster instead of default ZooKeeper cluster, we can use the SQL to create table with +ReplicatedMergeTree engine as follow: + +``` +CREATE TABLE table_name ( ... ) ENGINE = ReplicatedMergeTree('zookeeper_name_configured_in_auxiliary_zookeepers:path', 'replica_name') ... 
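+
+-- A concrete sketch with hypothetical names, assuming the auxiliary cluster
+-- defined in the configuration example above is called 'zookeeper2':
+CREATE TABLE t (n UInt32)
+ENGINE = ReplicatedMergeTree('zookeeper2:/clickhouse/tables/01/t', 'replica_1')
+ORDER BY n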
+```

 You can specify any existing ZooKeeper cluster and the system will use a directory on it for its own data (the directory is specified when creating a replicatable table).

 If ZooKeeper isn’t set in the config file, you can’t create replicated tables, and any existing replicated tables will be read-only.

@@ -152,7 +188,7 @@ You can specify default arguments for `Replicated` table engine in the server co

 ```xml
 <default_replica_path>/clickhouse/tables/{shard}/{database}/{table}</default_replica_path>
-<default_replica_name>{replica}
+<default_replica_name>{replica}</default_replica_name>
 ```

 In this case, you can omit arguments when creating tables:
diff --git a/docs/en/getting-started/tutorial.md b/docs/en/getting-started/tutorial.md
index 8d41279fef9..3e051456a75 100644
--- a/docs/en/getting-started/tutorial.md
+++ b/docs/en/getting-started/tutorial.md
@@ -11,7 +11,7 @@ By going through this tutorial, you’ll learn how to set up a simple ClickHouse

 ## Single Node Setup {#single-node-setup}

-To postpone the complexities of a distributed environment, we’ll start with deploying ClickHouse on a single server or virtual machine. ClickHouse is usually installed from [deb](../getting-started/install.md#install-from-deb-packages) or [rpm](../getting-started/install.md#from-rpm-packages) packages, but there are [alternatives](../getting-started/install.md#from-docker-image) for the operating systems that do no support them.
+To postpone the complexities of a distributed environment, we’ll start with deploying ClickHouse on a single server or virtual machine. ClickHouse is usually installed from [deb](../getting-started/install.md#install-from-deb-packages) or [rpm](../getting-started/install.md#from-rpm-packages) packages, but there are [alternatives](../getting-started/install.md#from-docker-image) for the operating systems that do not support them.

 For example, you have chosen `deb` packages and executed:
diff --git a/docs/en/index.md b/docs/en/index.md
index 8280d5c9f97..676fd444995 100644
--- a/docs/en/index.md
+++ b/docs/en/index.md
@@ -5,7 +5,7 @@ toc_title: Overview

 # What Is ClickHouse? {#what-is-clickhouse}

-ClickHouse is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).
+ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).

 In a “normal” row-oriented DBMS, data is stored in this order:
diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md
index d310705d1c1..618ae374e8a 100644
--- a/docs/en/interfaces/formats.md
+++ b/docs/en/interfaces/formats.md
@@ -25,6 +25,7 @@ The supported formats are:
 | [Vertical](#vertical) | ✗ | ✔ |
 | [VerticalRaw](#verticalraw) | ✗ | ✔ |
 | [JSON](#json) | ✗ | ✔ |
+| [JSONAsString](#jsonasstring) | ✔ | ✗ |
 | [JSONString](#jsonstring) | ✗ | ✔ |
 | [JSONCompact](#jsoncompact) | ✗ | ✔ |
 | [JSONCompactString](#jsoncompactstring) | ✗ | ✔ |
@@ -507,6 +508,34 @@ Example:
 }
 ```

+## JSONAsString {#jsonasstring}
+
+In this format, a single JSON object is interpreted as a single value. If the input has several JSON objects (comma separated), they are interpreted as separate rows.
+
+This format can only be parsed for a table with a single field of type [String](../sql-reference/data-types/string.md). The remaining columns must be set to [DEFAULT](../sql-reference/statements/create/table.md#default) or [MATERIALIZED](../sql-reference/statements/create/table.md#materialized), or omitted. Once you collect the whole JSON object into a string, you can use [JSON functions](../sql-reference/functions/json-functions.md) to process it.
+ +**Example** + +Query: + +``` sql +DROP TABLE IF EXISTS json_as_string; +CREATE TABLE json_as_string (json String) ENGINE = Memory; +INSERT INTO json_as_string FORMAT JSONAsString {"foo":{"bar":{"x":"y"},"baz":1}},{},{"any json stucture":1} +SELECT * FROM json_as_string; +``` + +Result: + +``` text +┌─json──────────────────────────────┐ +│ {"foo":{"bar":{"x":"y"},"baz":1}} │ +│ {} │ +│ {"any json stucture":1} │ +└───────────────────────────────────┘ +``` + + ## JSONCompact {#jsoncompact} ## JSONCompactString {#jsoncompactstring} diff --git a/docs/en/introduction/adopters.md b/docs/en/introduction/adopters.md index 1cffead788a..b365bd880ac 100644 --- a/docs/en/introduction/adopters.md +++ b/docs/en/introduction/adopters.md @@ -23,6 +23,7 @@ toc_title: Adopters | BIGO | Video | Computing Platform | — | — | [Blog Article, August 2020](https://www.programmersought.com/article/44544895251/) | | Bloomberg | Finance, Media | Monitoring | 102 servers | — | [Slides, May 2018](https://www.slideshare.net/Altinity/http-analytics-for-6m-requests-per-second-using-clickhouse-by-alexander-bocharov) | | Bloxy | Blockchain | Analytics | — | — | [Slides in Russian, August 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/4_bloxy.pptx) | +| Bytedance | Social platforms | — | — | — | [The ClickHouse Meetup East, October 2020](https://www.youtube.com/watch?v=ckChUkC3Pns) | | CardsMobile | Finance | Analytics | — | — | [VC.ru](https://vc.ru/s/cardsmobile/143449-rukovoditel-gruppy-analiza-dannyh) | | CARTO | Business Intelligence | Geo analytics | — | — | [Geospatial processing with ClickHouse](https://carto.com/blog/geospatial-processing-with-clickhouse/) | | CERN | Research | Experiment | — | — | [Press release, April 2012](https://www.yandex.com/company/press_center/press_releases/2012/2012-04-10/) | @@ -96,6 +97,7 @@ toc_title: Adopters | Splunk | Business Analytics | Main product | — | — | [Slides in English, January 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup12/splunk.pdf) | | Spotify | Music | Experimentation | — | — | [Slides, July 2018](https://www.slideshare.net/glebus/using-clickhouse-for-experimentation-104247173) | | Staffcop | Information Security | Main Product | — | — | [Official website, Documentation](https://www.staffcop.ru/sce43) | +| Suning | E-Commerce | User behaviour analytics | — | — | [Blog article](https://www.sohu.com/a/434152235_411876) | | Teralytics | Mobility | Analytics | — | — | [Tech blog](https://www.teralytics.net/knowledge-hub/visualizing-mobility-data-the-scalability-challenge) | | Tencent | Big Data | Data processing | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/5.%20ClickHouse大数据集群应用_李俊飞腾讯网媒事业部.pdf) | | Tencent | Messaging | Logging | — | — | [Talk in Chinese, November 2019](https://youtu.be/T-iVQRuw-QY?t=5050) | @@ -111,7 +113,7 @@ toc_title: Adopters | Yandex Cloud | Public Cloud | Main product | — | — | [Talk in Russian, December 2019](https://www.youtube.com/watch?v=pgnak9e_E0o) | | Yandex DataLens | Business Intelligence | Main product | — | — | [Slides in Russian, December 2019](https://presentations.clickhouse.tech/meetup38/datalens.pdf) | | Yandex Market | e-Commerce | Metrics, Logging | — | — | [Talk in Russian, January 2019](https://youtu.be/_l1qP0DyBcA?t=478) | -| Yandex Metrica | Web analytics | Main product | 360 servers in one cluster, 1862 servers in one department | 66.41 PiB / 5.68 PiB | [Slides, February 
2020](https://presentations.clickhouse.tech/meetup40/introduction/#13) | +| Yandex Metrica | Web analytics | Main product | 630 servers in one cluster, 360 servers in another cluster, 1862 servers in one department | 133 PiB / 8.31 PiB / 120 trillion records | [Slides, February 2020](https://presentations.clickhouse.tech/meetup40/introduction/#13) | | ЦВТ | Software Development | Metrics, Logging | — | — | [Blog Post, March 2019, in Russian](https://vc.ru/dev/62715-kak-my-stroili-monitoring-na-prometheus-clickhouse-i-elk) | | МКБ | Bank | Web-system monitoring | — | — | [Slides in Russian, September 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup28/mkb.pdf) | | ЦФТ | Banking, Financial products, Payments | — | — | — | [Meetup in Russian, April 2020](https://team.cft.ru/events/162) | diff --git a/docs/en/operations/opentelemetry.md b/docs/en/operations/opentelemetry.md index 45533d3733f..2afeabc7956 100644 --- a/docs/en/operations/opentelemetry.md +++ b/docs/en/operations/opentelemetry.md @@ -44,11 +44,10 @@ stages, such as query planning or distributed queries. To be useful, the tracing information has to be exported to a monitoring system that supports OpenTelemetry, such as Jaeger or Prometheus. ClickHouse avoids -a dependency on a particular monitoring system, instead only -providing the tracing data conforming to the standard. A natural way to do so -in an SQL RDBMS is a system table. OpenTelemetry trace span information +a dependency on a particular monitoring system, instead only providing the +tracing data through a system table. OpenTelemetry trace span information [required by the standard](https://github.com/open-telemetry/opentelemetry-specification/blob/master/specification/overview.md#span) -is stored in the system table called `system.opentelemetry_span_log`. +is stored in the `system.opentelemetry_span_log` table. The table must be enabled in the server configuration, see the `opentelemetry_span_log` element in the default config file `config.xml`. It is enabled by default. @@ -67,3 +66,31 @@ The table has the following columns: The tags or attributes are saved as two parallel arrays, containing the keys and values. Use `ARRAY JOIN` to work with them. + +## Integration with monitoring systems + +At the moment, there is no ready tool that can export the tracing data from +ClickHouse to a monitoring system. + +For testing, it is possible to setup the export using a materialized view with the URL engine over the `system.opentelemetry_span_log` table, which would push the arriving log data to an HTTP endpoint of a trace collector. For example, to push the minimal span data to a Zipkin instance running at `http://localhost:9411`, in Zipkin v2 JSON format: + +```sql +CREATE MATERIALIZED VIEW default.zipkin_spans +ENGINE = URL('http://127.0.0.1:9411/api/v2/spans', 'JSONEachRow') +SETTINGS output_format_json_named_tuples_as_objects = 1, + output_format_json_array_of_rows = 1 AS +SELECT + lower(hex(reinterpretAsFixedString(trace_id))) AS traceId, + lower(hex(parent_span_id)) AS parentId, + lower(hex(span_id)) AS id, + operation_name AS name, + start_time_us AS timestamp, + finish_time_us - start_time_us AS duration, + cast(tuple('clickhouse'), 'Tuple(serviceName text)') AS localEndpoint, + cast(tuple( + attribute.values[indexOf(attribute.names, 'db.statement')]), + 'Tuple("db.statement" text)') AS tags +FROM system.opentelemetry_span_log +``` + +In case of any errors, the part of the log data for which the error has occurred will be silently lost. 
Check the server log for error messages if the data does not arrive.
diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md
index e111cf3ab75..533fcea5500 100644
--- a/docs/en/operations/server-configuration-parameters/settings.md
+++ b/docs/en/operations/server-configuration-parameters/settings.md
@@ -139,7 +139,7 @@ Lazy loading of dictionaries.

 If `true`, then each dictionary is created on first use. If dictionary creation failed, the function that was using the dictionary throws an exception.

-If `false`, all dictionaries are created when the server starts, and if there is an error, the server shuts down.
+If `false`, all dictionaries are created when the server starts. If a dictionary takes too long to create or is created with errors, the server boots without these dictionaries and continues trying to create them.

 The default is `true`.
diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md
index ba899754b18..8346d5ceac9 100644
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@@ -2293,6 +2293,47 @@ Result:
 └─────────────────────────┴─────────┘
 ```

+## system_events_show_zero_values {#system_events_show_zero_values}
+
+Allows selecting zero-valued events from [`system.events`](../../operations/system-tables/events.md).
+
+Some monitoring systems require passing all the metrics values to them for each checkpoint, even if the metric value is zero.
+
+Possible values:
+
+- 0 — Disabled.
+- 1 — Enabled.
+
+Default value: `0`.
+
+**Examples**
+
+Query:
+
+```sql
+SELECT * FROM system.events WHERE event='QueryMemoryLimitExceeded';
+```
+
+Result:
+
+```text
+Ok.
+```
+
+Query:
+
+```sql
+SET system_events_show_zero_values = 1;
+SELECT * FROM system.events WHERE event='QueryMemoryLimitExceeded';
+```
+
+Result:
+
+```text
+┌─event────────────────────┬─value─┬─description───────────────────────────────────────────┐
+│ QueryMemoryLimitExceeded │     0 │ Number of times when memory limit exceeded for query. │
+└──────────────────────────┴───────┴───────────────────────────────────────────────────────┘
+```
+
 ## allow_experimental_bigint_types {#allow_experimental_bigint_types}

 Enables or disables integer values exceeding the range that is supported by the int data type.
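+
+For example, a minimal sketch of enabling it (the `Int256` column type shown here is assumed to be among the extended-range types gated by this setting):
+
+```sql
+SET allow_experimental_bigint_types = 1;
+-- With the setting enabled, extended-range integer columns can be declared.
+CREATE TABLE big_numbers (x Int256) ENGINE = Memory;
+```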
diff --git a/docs/en/operations/system-tables/clusters.md b/docs/en/operations/system-tables/clusters.md index f18dfb3d1c0..cba52586e93 100644 --- a/docs/en/operations/system-tables/clusters.md +++ b/docs/en/operations/system-tables/clusters.md @@ -23,4 +23,44 @@ Please note that `errors_count` is updated once per query to the cluster, but `e - [distributed_replica_error_cap setting](../../operations/settings/settings.md#settings-distributed_replica_error_cap) - [distributed_replica_error_half_life setting](../../operations/settings/settings.md#settings-distributed_replica_error_half_life) +**Example** + +```sql +:) SELECT * FROM system.clusters LIMIT 2 FORMAT Vertical; +``` + +```text +Row 1: +────── +cluster: test_cluster +shard_num: 1 +shard_weight: 1 +replica_num: 1 +host_name: clickhouse01 +host_address: 172.23.0.11 +port: 9000 +is_local: 1 +user: default +default_database: +errors_count: 0 +estimated_recovery_time: 0 + +Row 2: +────── +cluster: test_cluster +shard_num: 1 +shard_weight: 1 +replica_num: 2 +host_name: clickhouse02 +host_address: 172.23.0.12 +port: 9000 +is_local: 0 +user: default +default_database: +errors_count: 0 +estimated_recovery_time: 0 + +2 rows in set. Elapsed: 0.002 sec. +``` + [Original article](https://clickhouse.tech/docs/en/operations/system_tables/clusters) diff --git a/docs/en/operations/system-tables/columns.md b/docs/en/operations/system-tables/columns.md index 92cbdd19ca8..92a6315d06b 100644 --- a/docs/en/operations/system-tables/columns.md +++ b/docs/en/operations/system-tables/columns.md @@ -23,4 +23,50 @@ The `system.columns` table contains the following columns (the column type is sh - `is_in_sampling_key` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Flag that indicates whether the column is in the sampling key expression. - `compression_codec` ([String](../../sql-reference/data-types/string.md)) — Compression codec name. +**Example** + +```sql +:) select * from system.columns LIMIT 2 FORMAT Vertical; +``` + +```text +Row 1: +────── +database: system +table: aggregate_function_combinators +name: name +type: String +default_kind: +default_expression: +data_compressed_bytes: 0 +data_uncompressed_bytes: 0 +marks_bytes: 0 +comment: +is_in_partition_key: 0 +is_in_sorting_key: 0 +is_in_primary_key: 0 +is_in_sampling_key: 0 +compression_codec: + +Row 2: +────── +database: system +table: aggregate_function_combinators +name: is_internal +type: UInt8 +default_kind: +default_expression: +data_compressed_bytes: 0 +data_uncompressed_bytes: 0 +marks_bytes: 0 +comment: +is_in_partition_key: 0 +is_in_sorting_key: 0 +is_in_primary_key: 0 +is_in_sampling_key: 0 +compression_codec: + +2 rows in set. Elapsed: 0.002 sec. +``` + [Original article](https://clickhouse.tech/docs/en/operations/system_tables/columns) diff --git a/docs/en/operations/system-tables/databases.md b/docs/en/operations/system-tables/databases.md index 84b696a3bf8..8ef5551d9b0 100644 --- a/docs/en/operations/system-tables/databases.md +++ b/docs/en/operations/system-tables/databases.md @@ -1,9 +1,38 @@ # system.databases {#system-databases} -This table contains a single String column called ‘name’ – the name of a database. +Contains information about the databases that are available to the current user. -Each database that the server knows about has a corresponding entry in the table. +Columns: -This system table is used for implementing the `SHOW DATABASES` query. +- `name` ([String](../../sql-reference/data-types/string.md)) — Database name. 
+- `engine` ([String](../../sql-reference/data-types/string.md)) — [Database engine](../../engines/database-engines/index.md). +- `data_path` ([String](../../sql-reference/data-types/string.md)) — Data path. +- `metadata_path` ([String](../../sql-reference/data-types/enum.md)) — Metadata path. +- `uuid` ([UUID](../../sql-reference/data-types/uuid.md)) — Database UUID. -[Original article](https://clickhouse.tech/docs/en/operations/system_tables/databases) \ No newline at end of file +The `name` column from this system table is used for implementing the `SHOW DATABASES` query. + +**Example** + +Create a database. + +``` sql +CREATE DATABASE test +``` + +Check all of the available databases to the user. + +``` sql +SELECT * FROM system.databases +``` + +``` text +┌─name───────────────────────────┬─engine─┬─data_path──────────────────┬─metadata_path───────────────────────────────────────────────────────┬─────────────────────────────────uuid─┐ +│ _temporary_and_external_tables │ Memory │ /var/lib/clickhouse/ │ │ 00000000-0000-0000-0000-000000000000 │ +│ default │ Atomic │ /var/lib/clickhouse/store/ │ /var/lib/clickhouse/store/d31/d317b4bd-3595-4386-81ee-c2334694128a/ │ d317b4bd-3595-4386-81ee-c2334694128a │ +│ test │ Atomic │ /var/lib/clickhouse/store/ │ /var/lib/clickhouse/store/39b/39bf0cc5-4c06-4717-87fe-c75ff3bd8ebb/ │ 39bf0cc5-4c06-4717-87fe-c75ff3bd8ebb │ +│ system │ Atomic │ /var/lib/clickhouse/store/ │ /var/lib/clickhouse/store/1d1/1d1c869d-e465-4b1b-a51f-be033436ebf9/ │ 1d1c869d-e465-4b1b-a51f-be033436ebf9 │ +└────────────────────────────────┴────────┴────────────────────────────┴─────────────────────────────────────────────────────────────────────┴──────────────────────────────────────┘ +``` + +[Original article](https://clickhouse.tech/docs/en/operations/system_tables/databases) diff --git a/docs/en/operations/system-tables/disks.md b/docs/en/operations/system-tables/disks.md index 9c01b6d9aa4..e9d324580d8 100644 --- a/docs/en/operations/system-tables/disks.md +++ b/docs/en/operations/system-tables/disks.md @@ -11,3 +11,21 @@ Columns: - `keep_free_space` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Amount of disk space that should stay free on disk in bytes. Defined in the `keep_free_space_bytes` parameter of disk configuration. [Original article](https://clickhouse.tech/docs/en/operations/system_tables/disks) + + +**Example** + +```sql +:) SELECT * FROM system.disks; +``` + +```text +┌─name────┬─path─────────────────┬───free_space─┬──total_space─┬─keep_free_space─┐ +│ default │ /var/lib/clickhouse/ │ 276392587264 │ 490652508160 │ 0 │ +└─────────┴──────────────────────┴──────────────┴──────────────┴─────────────────┘ + +1 rows in set. Elapsed: 0.001 sec. +``` + + + diff --git a/docs/en/operations/system-tables/functions.md b/docs/en/operations/system-tables/functions.md index d9a5e3cc363..fbcd4b7b723 100644 --- a/docs/en/operations/system-tables/functions.md +++ b/docs/en/operations/system-tables/functions.md @@ -8,3 +8,26 @@ Columns: - `is_aggregate`(`UInt8`) — Whether the function is aggregate. 
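+- `case_insensitive`(`UInt8`) — Whether the function name can be used case-insensitively (inferred from the sample output below).
+- `alias_to`(`String`) — The original function name, if the function name is an alias (inferred from the sample output below).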
[Original article](https://clickhouse.tech/docs/en/operations/system_tables/functions) + +**Example** + +```sql + SELECT * FROM system.functions LIMIT 10; +``` + +```text +┌─name─────────────────────┬─is_aggregate─┬─case_insensitive─┬─alias_to─┐ +│ sumburConsistentHash │ 0 │ 0 │ │ +│ yandexConsistentHash │ 0 │ 0 │ │ +│ demangle │ 0 │ 0 │ │ +│ addressToLine │ 0 │ 0 │ │ +│ JSONExtractRaw │ 0 │ 0 │ │ +│ JSONExtractKeysAndValues │ 0 │ 0 │ │ +│ JSONExtract │ 0 │ 0 │ │ +│ JSONExtractString │ 0 │ 0 │ │ +│ JSONExtractFloat │ 0 │ 0 │ │ +│ JSONExtractInt │ 0 │ 0 │ │ +└──────────────────────────┴──────────────┴──────────────────┴──────────┘ + +10 rows in set. Elapsed: 0.002 sec. +``` \ No newline at end of file diff --git a/docs/en/operations/system-tables/merge_tree_settings.md b/docs/en/operations/system-tables/merge_tree_settings.md index 78aab24cb41..c2c5703f869 100644 --- a/docs/en/operations/system-tables/merge_tree_settings.md +++ b/docs/en/operations/system-tables/merge_tree_settings.md @@ -10,4 +10,45 @@ Columns: - `type` (String) — Setting type (implementation specific string value). - `changed` (UInt8) — Whether the setting was explicitly defined in the config or explicitly changed. +**Example** +```sql +:) SELECT * FROM system.merge_tree_settings LIMIT 4 FORMAT Vertical; +``` + +```text +Row 1: +────── +name: index_granularity +value: 8192 +changed: 0 +description: How many rows correspond to one primary key value. +type: SettingUInt64 + +Row 2: +────── +name: min_bytes_for_wide_part +value: 0 +changed: 0 +description: Minimal uncompressed size in bytes to create part in wide format instead of compact +type: SettingUInt64 + +Row 3: +────── +name: min_rows_for_wide_part +value: 0 +changed: 0 +description: Minimal number of rows to create part in wide format instead of compact +type: SettingUInt64 + +Row 4: +────── +name: merge_max_block_size +value: 8192 +changed: 0 +description: How many rows in blocks should be formed for merge operations. +type: SettingUInt64 + +4 rows in set. Elapsed: 0.001 sec. +``` + [Original article](https://clickhouse.tech/docs/en/operations/system_tables/merge_tree_settings) diff --git a/docs/en/operations/system-tables/numbers.md b/docs/en/operations/system-tables/numbers.md index 9b7e148242c..d1737c9abbb 100644 --- a/docs/en/operations/system-tables/numbers.md +++ b/docs/en/operations/system-tables/numbers.md @@ -6,4 +6,27 @@ You can use this table for tests, or if you need to do a brute force search. Reads from this table are not parallelized. +**Example** + +```sql +:) SELECT * FROM system.numbers LIMIT 10; +``` + +```text +┌─number─┐ +│ 0 │ +│ 1 │ +│ 2 │ +│ 3 │ +│ 4 │ +│ 5 │ +│ 6 │ +│ 7 │ +│ 8 │ +│ 9 │ +└────────┘ + +10 rows in set. Elapsed: 0.001 sec. +``` + [Original article](https://clickhouse.tech/docs/en/operations/system_tables/numbers) diff --git a/docs/en/operations/system-tables/numbers_mt.md b/docs/en/operations/system-tables/numbers_mt.md index 870b256223e..b40dc9a2d6f 100644 --- a/docs/en/operations/system-tables/numbers_mt.md +++ b/docs/en/operations/system-tables/numbers_mt.md @@ -4,4 +4,27 @@ The same as [system.numbers](../../operations/system-tables/numbers.md) but read Used for tests. +**Example** + +```sql +:) SELECT * FROM system.numbers_mt LIMIT 10; +``` + +```text +┌─number─┐ +│ 0 │ +│ 1 │ +│ 2 │ +│ 3 │ +│ 4 │ +│ 5 │ +│ 6 │ +│ 7 │ +│ 8 │ +│ 9 │ +└────────┘ + +10 rows in set. Elapsed: 0.001 sec. 
+``` + [Original article](https://clickhouse.tech/docs/en/operations/system_tables/numbers_mt) diff --git a/docs/en/operations/system-tables/one.md b/docs/en/operations/system-tables/one.md index 854fab32730..a85e01bc75a 100644 --- a/docs/en/operations/system-tables/one.md +++ b/docs/en/operations/system-tables/one.md @@ -6,4 +6,18 @@ This table is used if a `SELECT` query doesn’t specify the `FROM` clause. This is similar to the `DUAL` table found in other DBMSs. +**Example** + +```sql +:) SELECT * FROM system.one LIMIT 10; +``` + +```text +┌─dummy─┐ +│ 0 │ +└───────┘ + +1 rows in set. Elapsed: 0.001 sec. +``` + [Original article](https://clickhouse.tech/docs/en/operations/system_tables/one) diff --git a/docs/en/operations/system-tables/processes.md b/docs/en/operations/system-tables/processes.md index 2af39eff862..a379fc4a07a 100644 --- a/docs/en/operations/system-tables/processes.md +++ b/docs/en/operations/system-tables/processes.md @@ -14,4 +14,51 @@ Columns: - `query` (String) – The query text. For `INSERT`, it doesn’t include the data to insert. - `query_id` (String) – Query ID, if defined. + +```sql +:) SELECT * FROM system.processes LIMIT 10 FORMAT Vertical; +``` + +```text +Row 1: +────── +is_initial_query: 1 +user: default +query_id: 35a360fa-3743-441d-8e1f-228c938268da +address: ::ffff:172.23.0.1 +port: 47588 +initial_user: default +initial_query_id: 35a360fa-3743-441d-8e1f-228c938268da +initial_address: ::ffff:172.23.0.1 +initial_port: 47588 +interface: 1 +os_user: bharatnc +client_hostname: tower +client_name: ClickHouse +client_revision: 54437 +client_version_major: 20 +client_version_minor: 7 +client_version_patch: 2 +http_method: 0 +http_user_agent: +quota_key: +elapsed: 0.000582537 +is_cancelled: 0 +read_rows: 0 +read_bytes: 0 +total_rows_approx: 0 +written_rows: 0 +written_bytes: 0 +memory_usage: 0 +peak_memory_usage: 0 +query: SELECT * from system.processes LIMIT 10 FORMAT Vertical; +thread_ids: [67] +ProfileEvents.Names: ['Query','SelectQuery','ReadCompressedBytes','CompressedReadBufferBlocks','CompressedReadBufferBytes','IOBufferAllocs','IOBufferAllocBytes','ContextLock','RWLockAcquiredReadLocks'] +ProfileEvents.Values: [1,1,36,1,10,1,89,16,1] +Settings.Names: ['use_uncompressed_cache','load_balancing','log_queries','max_memory_usage'] +Settings.Values: ['0','in_order','1','10000000000'] + +1 rows in set. Elapsed: 0.002 sec. +``` + [Original article](https://clickhouse.tech/docs/en/operations/system_tables/processes) diff --git a/docs/en/operations/system-tables/storage_policies.md b/docs/en/operations/system-tables/storage_policies.md index c8171b50aed..5adab1cb2aa 100644 --- a/docs/en/operations/system-tables/storage_policies.md +++ b/docs/en/operations/system-tables/storage_policies.md @@ -10,6 +10,7 @@ Columns: - `disks` ([Array(String)](../../sql-reference/data-types/array.md)) — Disk names, defined in the storage policy. - `max_data_part_size` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Maximum size of a data part that can be stored on volume disks (0 — no limit). - `move_factor` ([Float64](../../sql-reference/data-types/float.md)) — Ratio of free disk space. When the ratio exceeds the value of configuration parameter, ClickHouse start to move data to the next volume in order. +- `prefer_not_to_merge` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Value of the `prefer_not_to_merge` setting. When this setting is enabled, merging data on this volume is not allowed. This allows controlling how ClickHouse works with slow disks. 
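+
+For example, to check the value of `prefer_not_to_merge` for each volume of every policy (all columns used here are described above):
+
+``` sql
+SELECT policy_name, volume_name, disks, prefer_not_to_merge FROM system.storage_policies;
+```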
If the storage policy contains more than one volume, then information for each volume is stored in an individual row of the table.

diff --git a/docs/en/operations/system-tables/tables.md b/docs/en/operations/system-tables/tables.md
index e69b8aa67a0..6ad1425e032 100644
--- a/docs/en/operations/system-tables/tables.md
+++ b/docs/en/operations/system-tables/tables.md
@@ -52,4 +52,56 @@ This table contains the following columns (the column type is shown in brackets)

 The `system.tables` table is used in `SHOW TABLES` query implementation.

+```sql
+:) SELECT * FROM system.tables LIMIT 2 FORMAT Vertical;
+```
+
+```text
+Row 1:
+──────
+database:                   system
+name:                       aggregate_function_combinators
+uuid:                       00000000-0000-0000-0000-000000000000
+engine:                     SystemAggregateFunctionCombinators
+is_temporary:               0
+data_paths:                 []
+metadata_path:              /var/lib/clickhouse/metadata/system/aggregate_function_combinators.sql
+metadata_modification_time: 1970-01-01 03:00:00
+dependencies_database:      []
+dependencies_table:         []
+create_table_query:
+engine_full:
+partition_key:
+sorting_key:
+primary_key:
+sampling_key:
+storage_policy:
+total_rows:                 ᴺᵁᴸᴸ
+total_bytes:                ᴺᵁᴸᴸ
+
+Row 2:
+──────
+database:                   system
+name:                       asynchronous_metrics
+uuid:                       00000000-0000-0000-0000-000000000000
+engine:                     SystemAsynchronousMetrics
+is_temporary:               0
+data_paths:                 []
+metadata_path:              /var/lib/clickhouse/metadata/system/asynchronous_metrics.sql
+metadata_modification_time: 1970-01-01 03:00:00
+dependencies_database:      []
+dependencies_table:         []
+create_table_query:
+engine_full:
+partition_key:
+sorting_key:
+primary_key:
+sampling_key:
+storage_policy:
+total_rows:                 ᴺᵁᴸᴸ
+total_bytes:                ᴺᵁᴸᴸ
+
+2 rows in set. Elapsed: 0.004 sec.
+```
+
 [Original article](https://clickhouse.tech/docs/en/operations/system_tables/tables)

diff --git a/docs/en/operations/utilities/clickhouse-copier.md b/docs/en/operations/utilities/clickhouse-copier.md
index ec5a619b86b..4137bd6f334 100644
--- a/docs/en/operations/utilities/clickhouse-copier.md
+++ b/docs/en/operations/utilities/clickhouse-copier.md
@@ -70,11 +70,21 @@ Parameters:
+            <shard>
+                <internal_replication>false</internal_replication>
+                <replica>
+                    <host>127.0.0.1</host>
+                    <port>9000</port>
+                </replica>
+            </shard>
+            ...

diff --git a/docs/en/sql-reference/aggregate-functions/reference/avg.md b/docs/en/sql-reference/aggregate-functions/reference/avg.md
index 4ebae95b79d..e2e6aace734 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/avg.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/avg.md
@@ -4,4 +4,59 @@ toc_priority: 5

 # avg {#agg_function-avg}

-Calculates the average. Only works for numbers. The result is always Float64.
+Calculates the arithmetic mean.
+
+**Syntax**
+
+``` sql
+avg(x)
+```
+
+**Parameter**
+
+- `x` — Values.
+
+`x` must be
+[Integer](../../../sql-reference/data-types/int-uint.md),
+[floating-point](../../../sql-reference/data-types/float.md), or
+[Decimal](../../../sql-reference/data-types/decimal.md).
+
+**Returned value**
+
+- `NaN` if the supplied parameter is empty.
+- Mean otherwise.
+
+**Return type** is always [Float64](../../../sql-reference/data-types/float.md).
+
+**Example**
+
+Query:
+
+``` sql
+SELECT avg(x) FROM values('x Int8', 0, 1, 2, 3, 4, 5)
+```
+
+Result:
+
+``` text
+┌─avg(x)─┐
+│    2.5 │
+└────────┘
+```
+
+**Example**
+
+Query:
+
+``` sql
+CREATE TABLE test (t UInt8) ENGINE = Memory;
+SELECT avg(t) FROM test
+```
+
+Result:
+
+``` text
+┌─avg(t)─┐
+│    nan │
+└────────┘
+```

diff --git a/docs/en/sql-reference/aggregate-functions/reference/avgweighted.md b/docs/en/sql-reference/aggregate-functions/reference/avgweighted.md
index 20b7187a744..7b9c0de2755 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/avgweighted.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/avgweighted.md
@@ -14,17 +14,21 @@
 avgWeighted(x, weight)

 **Parameters**

-- `x` — Values. [Integer](../../../sql-reference/data-types/int-uint.md) or [floating-point](../../../sql-reference/data-types/float.md).
-- `weight` — Weights of the values. [Integer](../../../sql-reference/data-types/int-uint.md) or [floating-point](../../../sql-reference/data-types/float.md).
+- `x` — Values.
+- `weight` — Weights of the values.

-Type of `x` and `weight` must be the same.
+`x` and `weight` must both be
+[Integer](../../../sql-reference/data-types/int-uint.md),
+[floating-point](../../../sql-reference/data-types/float.md), or
+[Decimal](../../../sql-reference/data-types/decimal.md),
+but may have different types.

 **Returned value**

-- Weighted mean.
-- `NaN`. If all the weights are equal to 0.
+- `NaN` if all the weights are equal to 0 or the supplied weights parameter is empty.
+- Weighted mean otherwise.

-Type: [Float64](../../../sql-reference/data-types/float.md).
+**Return type** is always [Float64](../../../sql-reference/data-types/float.md).

 **Example**
@@ -42,3 +46,54 @@
 Result:

 │                      8 │
 └────────────────────────┘
 ```
+
+**Example**
+
+Query:
+
+``` sql
+SELECT avgWeighted(x, w)
+FROM values('x Int8, w Float64', (4, 1), (1, 0), (10, 2))
+```
+
+Result:
+
+``` text
+┌─avgWeighted(x, w)─┐
+│                 8 │
+└───────────────────┘
+```
+
+**Example**
+
+Query:
+
+``` sql
+SELECT avgWeighted(x, w)
+FROM values('x Int8, w Int8', (0, 0), (1, 0), (10, 0))
+```
+
+Result:
+
+``` text
+┌─avgWeighted(x, w)─┐
+│               nan │
+└───────────────────┘
+```
+
+**Example**
+
+Query:
+
+``` sql
+CREATE TABLE test (t UInt8) ENGINE = Memory;
+SELECT avgWeighted(t, t) FROM test
+```
+
+Result:
+
+``` text
+┌─avgWeighted(t, t)─┐
+│               nan │
+└───────────────────┘
+```

diff --git a/docs/en/sql-reference/aggregate-functions/reference/rankCorr.md b/docs/en/sql-reference/aggregate-functions/reference/rankCorr.md
new file mode 100644
index 00000000000..dc23029f239
--- /dev/null
+++ b/docs/en/sql-reference/aggregate-functions/reference/rankCorr.md
@@ -0,0 +1,53 @@
+## rankCorr {#agg_function-rankcorr}
+
+Computes a rank correlation coefficient.
+
+**Syntax**
+
+``` sql
+rankCorr(x, y)
+```
+
+**Parameters**
+
+- `x` — Arbitrary value. [Float32](../../../sql-reference/data-types/float.md#float32-float64) or [Float64](../../../sql-reference/data-types/float.md#float32-float64).
+
+- `y` — Arbitrary value. [Float32](../../../sql-reference/data-types/float.md#float32-float64) or [Float64](../../../sql-reference/data-types/float.md#float32-float64).
+
+**Returned value(s)**
+
+- Returns a rank correlation coefficient of the ranks of x and y. The value of the correlation coefficient ranges from -1 to +1. If less than two arguments are passed, the function returns an exception.
A value close to +1 denotes a high linear relationship: with an increase of one random variable, the second random variable also increases. A value close to -1 denotes a high linear relationship: with an increase of one random variable, the second random variable decreases. A value close or equal to 0 denotes no relationship between the two random variables.
+
+Type: [Float64](../../../sql-reference/data-types/float.md#float32-float64).
+
+**Example**
+
+Query:
+
+``` sql
+SELECT rankCorr(number, number) FROM numbers(100);
+```
+
+Result:
+
+``` text
+┌─rankCorr(number, number)─┐
+│                        1 │
+└──────────────────────────┘
+```
+
+Query:
+
+``` sql
+SELECT roundBankers(rankCorr(exp(number), sin(number)), 3) FROM numbers(100);
+```
+
+Result:
+
+``` text
+┌─roundBankers(rankCorr(exp(number), sin(number)), 3)─┐
+│                                              -0.037 │
+└─────────────────────────────────────────────────────┘
+```
+**See Also**
+
+- [Spearman's rank correlation coefficient](https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient)
\ No newline at end of file

diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md
index 63b356e27e6..75db3fafe36 100644
--- a/docs/en/sql-reference/functions/date-time-functions.md
+++ b/docs/en/sql-reference/functions/date-time-functions.md
@@ -25,7 +25,37 @@ SELECT

 ## toTimeZone {#totimezone}

-Convert time or date and time to the specified time zone.
+Converts a time or date-and-time value to the specified time zone. The time zone is an attribute of the `Date`/`DateTime` types. The internal value (number of seconds) of the table field or of the result set's column does not change; the column's type changes, and its string representation changes accordingly.
+
+```sql
+SELECT
+    toDateTime('2019-01-01 00:00:00', 'UTC') AS time_utc,
+    toTypeName(time_utc) AS type_utc,
+    toInt32(time_utc) AS int32utc,
+    toTimeZone(time_utc, 'Asia/Yekaterinburg') AS time_yekat,
+    toTypeName(time_yekat) AS type_yekat,
+    toInt32(time_yekat) AS int32yekat,
+    toTimeZone(time_utc, 'US/Samoa') AS time_samoa,
+    toTypeName(time_samoa) AS type_samoa,
+    toInt32(time_samoa) AS int32samoa
+FORMAT Vertical;
+```
+
+```text
+Row 1:
+──────
+time_utc:   2019-01-01 00:00:00
+type_utc:   DateTime('UTC')
+int32utc:   1546300800
+time_yekat: 2019-01-01 05:00:00
+type_yekat: DateTime('Asia/Yekaterinburg')
+int32yekat: 1546300800
+time_samoa: 2018-12-31 13:00:00
+type_samoa: DateTime('US/Samoa')
+int32samoa: 1546300800
+```
+
+`toTimeZone(time_utc, 'Asia/Yekaterinburg')` changes the `DateTime('UTC')` type to `DateTime('Asia/Yekaterinburg')`. The value (Unix timestamp) 1546300800 stays the same, but the string representation (the result of the toString() function) changes from `time_utc: 2019-01-01 00:00:00` to `time_yekat: 2019-01-01 05:00:00`.

 ## toYear {#toyear}

@@ -67,9 +97,8 @@ Leap seconds are not accounted for.

 ## toUnixTimestamp {#to-unix-timestamp}

-For DateTime argument: converts value to its internal numeric representation (Unix Timestamp).
-For String argument: parse datetime from string according to the timezone (optional second argument, server timezone is used by default) and returns the corresponding unix timestamp.
-For Date argument: the behaviour is unspecified.
+For DateTime argument: converts the value to a number of type UInt32, the Unix timestamp (https://en.wikipedia.org/wiki/Unix_time).
+For String argument: parses the input string as a date and time according to the time zone (optional second argument; the server time zone is used by default) and returns the corresponding Unix timestamp.

 **Syntax**

diff --git a/docs/en/sql-reference/functions/in-functions.md b/docs/en/sql-reference/functions/in-functions.md
index 065805a36ae..dd3c1900fdc 100644
--- a/docs/en/sql-reference/functions/in-functions.md
+++ b/docs/en/sql-reference/functions/in-functions.md
@@ -9,16 +9,4 @@ toc_title: IN Operator

 See the section [IN operators](../../sql-reference/operators/in.md#select-in-operators).

-## tuple(x, y, …), operator (x, y, …) {#tuplex-y-operator-x-y}
-
-A function that allows grouping multiple columns.
-For columns with the types T1, T2, …, it returns a Tuple(T1, T2, …) type tuple containing these columns. There is no cost to execute the function.
-Tuples are normally used as intermediate values for an argument of IN operators, or for creating a list of formal parameters of lambda functions. Tuples can’t be written to a table.
-
-## tupleElement(tuple, n), operator x.N {#tupleelementtuple-n-operator-x-n}
-
-A function that allows getting a column from a tuple.
-‘N’ is the column index, starting from 1. N must be a constant. ‘N’ must be a constant. ‘N’ must be a strict postive integer no greater than the size of the tuple.
-There is no cost to execute the function.
-
 [Original article](https://clickhouse.tech/docs/en/query_language/functions/in_functions/)

diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md
index 31ed47c3195..51a1f6b4cd7 100644
--- a/docs/en/sql-reference/functions/other-functions.md
+++ b/docs/en/sql-reference/functions/other-functions.md
@@ -1,5 +1,5 @@
 ---
-toc_priority: 66
+toc_priority: 67
 toc_title: Other
 ---

diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md
index 881139f103c..dba8a6e275c 100644
--- a/docs/en/sql-reference/functions/string-search-functions.md
+++ b/docs/en/sql-reference/functions/string-search-functions.md
@@ -536,4 +536,58 @@ For case-insensitive search or/and in UTF-8 format use functions `ngramSearchCas

 !!! note "Note"
     For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2-byte hashes to hash n-grams and then calculate the (non-)symmetric difference between these hash tables – collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function – we zero the 5-th bit (starting from zero) of each codepoint byte and first bit of zeroth byte if bytes more than one – this works for Latin and mostly for all Cyrillic letters.

+## countSubstrings(haystack, needle) {#countSubstrings}
+
+Counts the number of substring occurrences.
+
+For a case-insensitive search, use the function `countSubstringsCaseInsensitive` (or `countSubstringsCaseInsensitiveUTF8`).
+
+**Syntax**
+
+``` sql
+countSubstrings(haystack, needle[, start_pos])
+```
+
+**Parameters**
+
+- `haystack` — The string to search in. [String](../../sql-reference/syntax.md#syntax-string-literal).
+- `needle` — The substring to search for. [String](../../sql-reference/syntax.md#syntax-string-literal).
+- `start_pos` — Optional. Position of the first character in the string from which the search starts. [UInt](../../sql-reference/data-types/int-uint.md).
+
+**Returned values**
+
+- Number of occurrences.
+
+Type: `Integer`.
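+
+The optional `start_pos` argument is not exercised by the examples below; a hedged sketch of the expected behavior (assuming 1-based positions and non-overlapping counting, as the `'aaaa'` example below suggests):
+
+``` sql
+SELECT countSubstrings('aaaa', 'aa', 2); -- the search starts at position 2, i.e. in 'aaa'; expected result: 1
+```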
+
+**Examples**
+
+Query:
+
+``` sql
+SELECT countSubstrings('foobar.com', '.')
+```
+
+Result:
+
+``` text
+┌─countSubstrings('foobar.com', '.')─┐
+│                                  1 │
+└────────────────────────────────────┘
+```
+
+Query:
+
+``` sql
+SELECT countSubstrings('aaaa', 'aa')
+```
+
+Result:
+
+``` text
+┌─countSubstrings('aaaa', 'aa')─┐
+│                             2 │
+└───────────────────────────────┘
+```

 [Original article](https://clickhouse.tech/docs/en/query_language/functions/string_search_functions/)

diff --git a/docs/en/sql-reference/functions/tuple-functions.md b/docs/en/sql-reference/functions/tuple-functions.md
new file mode 100644
index 00000000000..dcbcd3e374b
--- /dev/null
+++ b/docs/en/sql-reference/functions/tuple-functions.md
@@ -0,0 +1,114 @@
+---
+toc_priority: 66
+toc_title: Tuples
+---
+
+# Functions for Working with Tuples {#tuple-functions}
+
+## tuple {#tuple}
+
+A function that allows grouping multiple columns.
+For columns with the types T1, T2, …, it returns a Tuple(T1, T2, …) type tuple containing these columns. There is no cost to execute the function.
+Tuples are normally used as intermediate values for an argument of IN operators, or for creating a list of formal parameters of lambda functions. Tuples can’t be written to a table.
+
+The function implements the operator `(x, y, …)`.
+
+**Syntax**
+
+``` sql
+tuple(x, y, …)
+```
+
+## tupleElement {#tupleelement}
+
+A function that allows getting a column from a tuple.
+‘N’ is the column index, starting from 1. ‘N’ must be a constant strictly positive integer no greater than the size of the tuple.
+There is no cost to execute the function.
+
+The function implements the operator `x.N`.
+
+**Syntax**
+
+``` sql
+tupleElement(tuple, n)
+```
+
+## untuple {#untuple}
+
+Performs syntactic substitution of [tuple](../../sql-reference/data-types/tuple.md#tuplet1-t2) elements in the call location.
+
+**Syntax**
+
+``` sql
+untuple(x)
+```
+
+You can use the `EXCEPT` expression to skip columns in the query result.
+
+**Parameters**
+
+- `x` - A `tuple` function, column, or tuple of elements. [Tuple](../../sql-reference/data-types/tuple.md).
+
+**Returned value**
+
+- None.
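+
+The `kv` table used in the following examples is not defined in this document; a minimal sketch that reproduces the input shown below (the column types are inferred from the sample data):
+
+``` sql
+CREATE TABLE kv (key UInt32, v1 UInt32, v2 UInt32, v3 UInt32, v4 UInt32, v5 UInt32, v6 Tuple(UInt8, String)) ENGINE = Memory;
+INSERT INTO kv VALUES (1, 10, 20, 40, 30, 15, (33, 'ab')), (2, 25, 65, 70, 40, 6, (44, 'cd')), (3, 57, 30, 20, 10, 5, (55, 'ef')), (4, 55, 12, 7, 80, 90, (66, 'gh')), (5, 30, 50, 70, 25, 55, (77, 'kl'));
+```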
+
+**Examples**
+
+Input table:
+
+``` text
+┌─key─┬─v1─┬─v2─┬─v3─┬─v4─┬─v5─┬─v6────────┐
+│   1 │ 10 │ 20 │ 40 │ 30 │ 15 │ (33,'ab') │
+│   2 │ 25 │ 65 │ 70 │ 40 │  6 │ (44,'cd') │
+│   3 │ 57 │ 30 │ 20 │ 10 │  5 │ (55,'ef') │
+│   4 │ 55 │ 12 │  7 │ 80 │ 90 │ (66,'gh') │
+│   5 │ 30 │ 50 │ 70 │ 25 │ 55 │ (77,'kl') │
+└─────┴────┴────┴────┴────┴────┴───────────┘
+```
+
+Example of using a `Tuple`-type column as the `untuple` function parameter:
+
+Query:
+
+``` sql
+SELECT untuple(v6) FROM kv;
+```
+
+Result:
+
+``` text
+┌─_ut_1─┬─_ut_2─┐
+│    33 │ ab    │
+│    44 │ cd    │
+│    55 │ ef    │
+│    66 │ gh    │
+│    77 │ kl    │
+└───────┴───────┘
+```
+
+Example of using an `EXCEPT` expression:
+
+Query:
+
+``` sql
+SELECT untuple((* EXCEPT (v2, v3),)) FROM kv;
+```
+
+Result:
+
+``` text
+┌─key─┬─v1─┬─v4─┬─v5─┬─v6────────┐
+│   1 │ 10 │ 30 │ 15 │ (33,'ab') │
+│   2 │ 25 │ 40 │  6 │ (44,'cd') │
+│   3 │ 57 │ 10 │  5 │ (55,'ef') │
+│   4 │ 55 │ 80 │ 90 │ (66,'gh') │
+│   5 │ 30 │ 25 │ 55 │ (77,'kl') │
+└─────┴────┴────┴────┴───────────┘
+```
+
+**See Also**
+
+- [Tuple](../../sql-reference/data-types/tuple.md)
+
+[Original article](https://clickhouse.tech/docs/en/sql-reference/functions/tuple-functions/)

diff --git a/docs/en/sql-reference/statements/alter/delete.md b/docs/en/sql-reference/statements/alter/delete.md
index 23a7bf0e8f1..6c638c0a3ac 100644
--- a/docs/en/sql-reference/statements/alter/delete.md
+++ b/docs/en/sql-reference/statements/alter/delete.md
@@ -9,7 +9,7 @@ toc_title: DELETE
 ALTER TABLE [db.]table [ON CLUSTER cluster] DELETE WHERE filter_expr
 ```

-Allows to delete data matching the specified filtering expression. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations).
+Deletes data matching the specified filtering expression. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations).

 !!! note "Note"
     The `ALTER TABLE` prefix makes this syntax different from most other systems supporting SQL. It is intended to signify that unlike similar queries in OLTP databases this is a heavy operation not designed for frequent use.

diff --git a/docs/en/sql-reference/statements/alter/index/index.md b/docs/en/sql-reference/statements/alter/index/index.md
index 4660478551f..56d81aaf52f 100644
--- a/docs/en/sql-reference/statements/alter/index/index.md
+++ b/docs/en/sql-reference/statements/alter/index/index.md
@@ -14,10 +14,9 @@ The following operations are available:

 - `ALTER TABLE [db.]table MATERIALIZE INDEX name IN PARTITION partition_name` - The query rebuilds the secondary index `name` in the partition `partition_name`. Implemented as a [mutation](../../../../sql-reference/statements/alter/index.md#mutations).

-The first two commands areare lightweight in a sense that they only change metadata or remove files.
+The first two commands are lightweight in the sense that they only change metadata or remove files.

 Also, they are replicated, syncing indices metadata via ZooKeeper.

 !!! note "Note"
-    Index manipulation is supported only for tables with [`*MergeTree`](../../../../engines/table-engines/mergetree-family/mergetree.md) engine (including
-[replicated](../../../../engines/table-engines/mergetree-family/replication.md) variants).
+    Index manipulation is supported only for tables with [`*MergeTree`](../../../../engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](../../../../engines/table-engines/mergetree-family/replication.md) variants).
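+
+A typical sequence of these commands might look as follows (a sketch; the table, column, and index names are illustrative):
+
+``` sql
+ALTER TABLE visits ADD INDEX idx_url URL TYPE minmax GRANULARITY 4;
+ALTER TABLE visits MATERIALIZE INDEX idx_url IN PARTITION 201901;
+ALTER TABLE visits DROP INDEX idx_url;
+```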
diff --git a/docs/en/sql-reference/statements/alter/partition.md b/docs/en/sql-reference/statements/alter/partition.md
index d2dd1c638cc..2d46ee609f1 100644
--- a/docs/en/sql-reference/statements/alter/partition.md
+++ b/docs/en/sql-reference/statements/alter/partition.md
@@ -21,10 +21,10 @@ The following operations with [partitions](../../../engines/table-engines/merget

-## DETACH PARTITION {#alter_detach-partition}
+## DETACH PARTITION\|PART {#alter_detach-partition}

 ``` sql
-ALTER TABLE table_name DETACH PARTITION partition_expr
+ALTER TABLE table_name DETACH PARTITION|PART partition_expr
 ```

 Moves all data for the specified partition to the `detached` directory. The server forgets about the detached data partition as if it does not exist. The server will not know about this data until you make the [ATTACH](#alter_attach-partition) query.
@@ -32,7 +32,8 @@
 Example:

 ``` sql
-ALTER TABLE visits DETACH PARTITION 201901
+ALTER TABLE mt DETACH PARTITION '2020-11-21';
+ALTER TABLE mt DETACH PART 'all_2_2_0';
 ```

 Read about setting the partition expression in a section [How to specify the partition expression](#alter-how-to-specify-part-expr).
@@ -41,10 +42,10 @@
 After the query is executed, you can do whatever you want with the data in the `detached` directory.

 This query is replicated – it moves the data to the `detached` directory on all replicas. Note that you can execute this query only on a leader replica. To find out if a replica is a leader, perform the `SELECT` query to the [system.replicas](../../../operations/system-tables/replicas.md#system_tables-replicas) table. Alternatively, it is easier to make a `DETACH` query on all replicas - all the replicas throw an exception, except the leader replica.

-## DROP PARTITION {#alter_drop-partition}
+## DROP PARTITION\|PART {#alter_drop-partition}

 ``` sql
-ALTER TABLE table_name DROP PARTITION partition_expr
+ALTER TABLE table_name DROP PARTITION|PART partition_expr
 ```

 Deletes the specified partition from the table. This query tags the partition as inactive and deletes data completely, approximately in 10 minutes.
@@ -53,6 +54,13 @@
 Read about setting the partition expression in a section [How to specify the partition expression](#alter-how-to-specify-part-expr).

 The query is replicated – it deletes data on all replicas.

+Example:
+
+``` sql
+ALTER TABLE mt DROP PARTITION '2020-11-21';
+ALTER TABLE mt DROP PART 'all_4_4_0';
+```
+
 ## DROP DETACHED PARTITION\|PART {#alter_drop-detached}

 ``` sql
@@ -233,6 +241,46 @@
 ALTER TABLE hits MOVE PART '20190301_14343_16206_438' TO VOLUME 'slow'
 ALTER TABLE hits MOVE PARTITION '2019-09-01' TO DISK 'fast_ssd'
 ```

+## UPDATE IN PARTITION {#update-in-partition}
+
+Manipulates data in the specified partition matching the specified filtering expression. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations).
+
+Syntax:
+
+``` sql
+ALTER TABLE [db.]table UPDATE column1 = expr1 [, ...] [IN PARTITION partition_id] WHERE filter_expr
+```
+
+### Example
+
+``` sql
+ALTER TABLE mt UPDATE x = x + 1 IN PARTITION 2 WHERE p = 2;
+```
+
+### See Also
+
+- [UPDATE](../../../sql-reference/statements/alter/update.md#alter-table-update-statements)
+
+## DELETE IN PARTITION {#delete-in-partition}
+
+Deletes data in the specified partition matching the specified filtering expression. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations).
+
+Syntax:
+
+``` sql
+ALTER TABLE [db.]table DELETE [IN PARTITION partition_id] WHERE filter_expr
+```
+
+### Example
+
+``` sql
+ALTER TABLE mt DELETE IN PARTITION 2 WHERE p = 2;
+```
+
+### See Also
+
+- [DELETE](../../../sql-reference/statements/alter/delete.md#alter-mutations)
+
 ## How to Set Partition Expression {#alter-how-to-specify-part-expr}

 You can specify the partition expression in `ALTER ... PARTITION` queries in different ways:

@@ -250,4 +298,6 @@ All the rules above are also true for the [OPTIMIZE](../../../sql-reference/stat
 OPTIMIZE TABLE table_not_partitioned PARTITION tuple() FINAL;
 ```

+`IN PARTITION` specifies the partition to which the [UPDATE](../../../sql-reference/statements/alter/update.md#alter-table-update-statements) or [DELETE](../../../sql-reference/statements/alter/delete.md#alter-mutations) expressions are applied as a result of the `ALTER TABLE` query. New parts are created only from the specified partition. In this way, `IN PARTITION` helps to reduce the load when the table is divided into many partitions, and you only need to update the data point-by-point.
+
 The examples of `ALTER ... PARTITION` queries are demonstrated in the tests [`00502_custom_partitioning_local`](https://github.com/ClickHouse/ClickHouse/blob/master/tests/queries/0_stateless/00502_custom_partitioning_local.sql) and [`00502_custom_partitioning_replicated_zookeeper`](https://github.com/ClickHouse/ClickHouse/blob/master/tests/queries/0_stateless/00502_custom_partitioning_replicated_zookeeper.sql).

diff --git a/docs/en/sql-reference/statements/alter/update.md b/docs/en/sql-reference/statements/alter/update.md
index 45e00236974..13ea1b2a8db 100644
--- a/docs/en/sql-reference/statements/alter/update.md
+++ b/docs/en/sql-reference/statements/alter/update.md
@@ -9,7 +9,7 @@ toc_title: UPDATE
 ALTER TABLE [db.]table UPDATE column1 = expr1 [, ...] WHERE filter_expr
 ```

-Allows to manipulate data matching the specified filtering expression. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations).
+Manipulates data matching the specified filtering expression. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations).

 !!! note "Note"
     The `ALTER TABLE` prefix makes this syntax different from most other systems supporting SQL. It is intended to signify that unlike similar queries in OLTP databases this is a heavy operation not designed for frequent use.

diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md
index 82326bf51cf..e9952fc76fd 100644
--- a/docs/en/sql-reference/statements/create/table.md
+++ b/docs/en/sql-reference/statements/create/table.md
@@ -29,6 +29,8 @@ A column description is `name type` in the simplest case. Example: `RegionID UIn

 Expressions can also be defined for default values (see below).

+If necessary, a primary key can be specified, with one or more key expressions.
+
 ### With a Schema Similar to Other Table {#with-a-schema-similar-to-other-table}

 ``` sql
@@ -97,6 +99,34 @@ If you add a new column to a table but later change its default expression, the

 It is not possible to set default values for elements in nested data structures.

+## Primary Key {#primary-key}
+
+You can define a [primary key](../../../engines/table-engines/mergetree-family/mergetree.md#primary-keys-and-indexes-in-queries) when creating a table.
Primary key can be specified in two ways:
+
+- inside the column list
+
+``` sql
+CREATE TABLE db.table_name
+(
+    name1 type1, name2 type2, ...,
+    PRIMARY KEY(expr1[, expr2,...])
+)
+ENGINE = engine;
+```
+
+- outside the column list
+
+``` sql
+CREATE TABLE db.table_name
+(
+    name1 type1, name2 type2, ...
+)
+ENGINE = engine
+PRIMARY KEY(expr1[, expr2,...]);
+```
+
+You can't combine both ways in one query.
+
 ## Constraints {#constraints}

 Along with columns descriptions constraints could be defined:

diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md
index 509b7553536..ddba1443d04 100644
--- a/docs/en/sql-reference/statements/system.md
+++ b/docs/en/sql-reference/statements/system.md
@@ -152,7 +152,7 @@ ClickHouse can manage background processes in [MergeTree](../../engines/table-en

 Provides possibility to stop background merges for tables in the MergeTree family:

 ``` sql
-SYSTEM STOP MERGES [[db.]merge_tree_family_table_name]
+SYSTEM STOP MERGES [ON VOLUME <volume_name> | [db.]merge_tree_family_table_name]
 ```

 !!! note "Note"
@@ -163,7 +163,7 @@ SYSTEM STOP MERGES [[db.]merge_tree_family_table_name]

 Provides possibility to start background merges for tables in the MergeTree family:

 ``` sql
-SYSTEM START MERGES [[db.]merge_tree_family_table_name]
+SYSTEM START MERGES [ON VOLUME <volume_name> | [db.]merge_tree_family_table_name]
 ```

 ### STOP TTL MERGES {#query_language-stop-ttl-merges}

diff --git a/docs/en/sql-reference/syntax.md b/docs/en/sql-reference/syntax.md
index 296f5c7c5f3..5d0eee76393 100644
--- a/docs/en/sql-reference/syntax.md
+++ b/docs/en/sql-reference/syntax.md
@@ -57,7 +57,7 @@ Identifiers are:

 Identifiers can be quoted or non-quoted. The latter is preferred.

-Non-quoted identifiers must match the regex `^[0-9a-zA-Z_]*[a-zA-Z_]$` and can not be equal to [keywords](#syntax-keywords). Examples: `x, _1, X_y__Z123_.`
+Non-quoted identifiers must match the regex `^[a-zA-Z_][0-9a-zA-Z_]*$` and cannot be equal to [keywords](#syntax-keywords). Examples: `x`, `_1`, `X_y__Z123_`.

 If you want to use identifiers the same as keywords or you want to use other symbols in identifiers, quote it using double quotes or backticks, for example, `"id"`, `` `id` ``.
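+
+For example (a sketch; the table name is illustrative, and `index` and `order` would clash with keywords if left unquoted):
+
+``` sql
+CREATE TABLE keywords (`index` UInt32, `order` String) ENGINE = Memory;
+SELECT `index`, `order` FROM keywords;
+```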
diff --git a/docs/es/development/contrib.md b/docs/es/development/contrib.md index 9018c19cc92..3f3013570e5 100644 --- a/docs/es/development/contrib.md +++ b/docs/es/development/contrib.md @@ -19,7 +19,6 @@ toc_title: Bibliotecas de terceros utilizadas | Más información | [Licencia de 3 cláusulas BSD](https://github.com/google/googletest/blob/master/LICENSE) | | H3 | [Licencia Apache 2.0](https://github.com/uber/h3/blob/master/LICENSE) | | hyperscan | [Licencia de 3 cláusulas BSD](https://github.com/intel/hyperscan/blob/master/LICENSE) | -| libbtrie | [Licencia BSD de 2 cláusulas](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) | | libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) | | libdivide | [Licencia Zlib](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) | | libgsasl | [Información adicional](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) | diff --git a/docs/fa/development/contrib.md b/docs/fa/development/contrib.md index 25573c28125..2ee5fc73369 100644 --- a/docs/fa/development/contrib.md +++ b/docs/fa/development/contrib.md @@ -21,7 +21,6 @@ toc_title: "\u06A9\u062A\u0627\u0628\u062E\u0627\u0646\u0647 \u0647\u0627\u06CC | googletest | [لیسانس 3 بند](https://github.com/google/googletest/blob/master/LICENSE) | | اچ 3 | [نمایی مجوز 2.0](https://github.com/uber/h3/blob/master/LICENSE) | | hyperscan | [لیسانس 3 بند](https://github.com/intel/hyperscan/blob/master/LICENSE) | -| لیبتری | [لیسانس 2 بند](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) | | شکنجه نوجوان | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) | | لیبیدوید | [مجوز زلب](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) | | نوشیدن شراب | [الجی پی ال2.1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) | diff --git a/docs/fr/development/contrib.md b/docs/fr/development/contrib.md index f4006d0a787..6909ef905bd 100644 --- a/docs/fr/development/contrib.md +++ b/docs/fr/development/contrib.md @@ -19,7 +19,6 @@ toc_title: "Biblioth\xE8ques Tierces Utilis\xE9es" | googletest | [Licence BSD 3-Clause](https://github.com/google/googletest/blob/master/LICENSE) | | h3 | [Licence Apache 2.0](https://github.com/uber/h3/blob/master/LICENSE) | | hyperscan | [Licence BSD 3-Clause](https://github.com/intel/hyperscan/blob/master/LICENSE) | -| libbtrie | [Licence BSD 2-Clause](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) | | libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) | | libdivide | [Licence Zlib](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) | | libgsasl | [LGPL v2.1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) | diff --git a/docs/ja/development/contrib.md b/docs/ja/development/contrib.md index 2e16b2bc72a..892d2c66a13 100644 --- a/docs/ja/development/contrib.md +++ b/docs/ja/development/contrib.md @@ -20,7 +20,6 @@ toc_title: "\u30B5\u30FC\u30C9\u30D1\u30FC\u30C6\u30A3\u88FD\u30E9\u30A4\u30D6\u | googletest | [BSD3条項ライセンス](https://github.com/google/googletest/blob/master/LICENSE) | | h3 | 
[Apacheライセンス2.0](https://github.com/uber/h3/blob/master/LICENSE) | | hyperscan | [BSD3条項ライセンス](https://github.com/intel/hyperscan/blob/master/LICENSE) | -| libbtrie | [BSD2条項ライセンス](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) | | libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) | | libdivide | [Zlibライセンス](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) | | libgsasl | [LGPL v2.1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) | diff --git a/docs/ru/development/contrib.md b/docs/ru/development/contrib.md index e65ab4819e8..05367267e41 100644 --- a/docs/ru/development/contrib.md +++ b/docs/ru/development/contrib.md @@ -18,7 +18,6 @@ toc_title: "\u0418\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u0435\u043c\u044b\u | googletest | [BSD 3-Clause License](https://github.com/google/googletest/blob/master/LICENSE) | | h3 | [Apache License 2.0](https://github.com/uber/h3/blob/master/LICENSE) | | hyperscan | [BSD 3-Clause License](https://github.com/intel/hyperscan/blob/master/LICENSE) | -| libbtrie | [BSD 2-Clause License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) | | libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) | | libdivide | [Zlib License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) | | libgsasl | [LGPL v2.1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) | diff --git a/docs/ru/engines/table-engines/mergetree-family/mergetree.md b/docs/ru/engines/table-engines/mergetree-family/mergetree.md index e4b6e0b1e59..f738ce13d7c 100644 --- a/docs/ru/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/ru/engines/table-engines/mergetree-family/mergetree.md @@ -183,18 +183,18 @@ ClickHouse не требует уникального первичного кл - Увеличить эффективность индекса. - Пусть первичный ключ — `(a, b)`, тогда добавление ещё одного столбца `c` повысит эффективность, если выполнены условия: + Пусть первичный ключ — `(a, b)`, тогда добавление ещё одного столбца `c` повысит эффективность, если выполнены условия: - - Есть запросы с условием на столбец `c`. - - Часто встречаются достаточно длинные (в несколько раз больше `index_granularity`) диапазоны данных с одинаковыми значениями `(a, b)`. Иначе говоря, когда добавление ещё одного столбца позволит пропускать достаточно длинные диапазоны данных. + - Есть запросы с условием на столбец `c`. + - Часто встречаются достаточно длинные (в несколько раз больше `index_granularity`) диапазоны данных с одинаковыми значениями `(a, b)`. Иначе говоря, когда добавление ещё одного столбца позволит пропускать достаточно длинные диапазоны данных. - Улучшить сжатие данных. - ClickHouse сортирует данные по первичному ключу, поэтому чем выше однородность, тем лучше сжатие. + ClickHouse сортирует данные по первичному ключу, поэтому чем выше однородность, тем лучше сжатие. - Обеспечить дополнительную логику при слиянии кусков данных в движках [CollapsingMergeTree](collapsingmergetree.md#table_engine-collapsingmergetree) и [SummingMergeTree](summingmergetree.md). - В этом случае имеет смысл указать отдельный *ключ сортировки*, отличающийся от первичного ключа. + В этом случае имеет смысл указать отдельный *ключ сортировки*, отличающийся от первичного ключа. 
Длинный первичный ключ будет негативно влиять на производительность вставки и потребление памяти, однако на производительность ClickHouse при запросах `SELECT` лишние столбцы в первичном ключе не влияют. @@ -309,11 +309,11 @@ SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234 - `bloom_filter([false_positive])` — [фильтр Блума](https://en.wikipedia.org/wiki/Bloom_filter) для указанных стоблцов. - Необязательный параметр `false_positive` — это вероятность получения ложноположительного срабатывания. Возможные значения: (0, 1). Значение по умолчанию: 0.025. + Необязательный параметр `false_positive` — это вероятность получения ложноположительного срабатывания. Возможные значения: (0, 1). Значение по умолчанию: 0.025. - Поддержанные типы данных: `Int*`, `UInt*`, `Float*`, `Enum`, `Date`, `DateTime`, `String`, `FixedString`. - - Фильтром могут пользоваться функции: [equals](../../../engines/table_engines/mergetree_family/mergetree.md), [notEquals](../../../engines/table_engines/mergetree_family/mergetree.md), [in](../../../engines/table_engines/mergetree_family/mergetree.md), [notIn](../../../engines/table_engines/mergetree_family/mergetree.md). + Поддержанные типы данных: `Int*`, `UInt*`, `Float*`, `Enum`, `Date`, `DateTime`, `String`, `FixedString`. + + Фильтром могут пользоваться функции: [equals](../../../engines/table-engines/mergetree-family/mergetree.md), [notEquals](../../../engines/table-engines/mergetree-family/mergetree.md), [in](../../../engines/table-engines/mergetree-family/mergetree.md), [notIn](../../../engines/table-engines/mergetree-family/mergetree.md). **Примеры** @@ -565,6 +565,7 @@ ALTER TABLE example_table - `disk` — диск, находящийся внутри тома. - `max_data_part_size_bytes` — максимальный размер куска данных, который может находится на любом из дисков этого тома. - `move_factor` — доля доступного свободного места на томе, если места становится меньше, то данные начнут перемещение на следующий том, если он есть (по умолчанию 0.1). +- `prefer_not_to_merge` — Отключает слияние кусков данных, хранящихся на данном томе. Если данная настройка включена, то слияние данных, хранящихся на данном томе, не допускается. Это позволяет контролировать работу ClickHouse с медленными дисками. Примеры конфигураций: @@ -593,6 +594,19 @@ ALTER TABLE example_table 0.2 + + + +
+        <small_jbod_with_external_no_merges>
+            <volumes>
+                <main>
+                    <disk>jbod1</disk>
+                </main>
+                <external>
+                    <disk>external</disk>
+                    <prefer_not_to_merge>true</prefer_not_to_merge>
+                </external>
+            </volumes>
+        </small_jbod_with_external_no_merges>
+ ... @@ -645,4 +659,4 @@ SETTINGS storage_policy = 'moving_from_ssd_to_hdd' После выполнения фоновых слияний или мутаций старые куски не удаляются сразу, а через некоторое время (табличная настройка `old_parts_lifetime`). Также они не перемещаются на другие тома или диски, поэтому до момента удаления они продолжают учитываться при подсчёте занятого дискового пространства. -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/mergetree/) +[Оригинальная статья](https://clickhouse.tech/docs/en/engines/table-engines/mergetree-family/mergetree/) diff --git a/docs/ru/engines/table-engines/mergetree-family/replacingmergetree.md b/docs/ru/engines/table-engines/mergetree-family/replacingmergetree.md index 1228371e8ea..a4e47b161ad 100644 --- a/docs/ru/engines/table-engines/mergetree-family/replacingmergetree.md +++ b/docs/ru/engines/table-engines/mergetree-family/replacingmergetree.md @@ -5,7 +5,7 @@ toc_title: ReplacingMergeTree # ReplacingMergeTree {#replacingmergetree} -Движок отличается от [MergeTree](mergetree.md#table_engines-mergetree) тем, что выполняет удаление дублирующихся записей с одинаковым значением [ключа сортировки](mergetree.md)). +Движок отличается от [MergeTree](mergetree.md#table_engines-mergetree) тем, что выполняет удаление дублирующихся записей с одинаковым значением [ключа сортировки](mergetree.md) (секция `ORDER BY`, не `PRIMARY KEY`). Дедупликация данных производится лишь во время слияний. Слияние происходят в фоне в неизвестный момент времени, на который вы не можете ориентироваться. Некоторая часть данных может остаться необработанной. Хотя вы можете вызвать внеочередное слияние с помощью запроса `OPTIMIZE`, на это не стоит рассчитывать, так как запрос `OPTIMIZE` приводит к чтению и записи большого объёма данных. @@ -28,14 +28,17 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Описание параметров запроса смотрите в [описании запроса](../../../engines/table-engines/mergetree-family/replacingmergetree.md). +!!! note "Внимание" + Уникальность строк определяется `ORDER BY` секцией таблицы, а не `PRIMARY KEY`. + **Параметры ReplacingMergeTree** - `ver` — столбец с версией, тип `UInt*`, `Date` или `DateTime`. Необязательный параметр. - При слиянии, из всех строк с одинаковым значением ключа сортировки `ReplacingMergeTree` оставляет только одну: + При слиянии `ReplacingMergeTree` оставляет только строку для каждого уникального ключа сортировки: - - Последнюю в выборке, если `ver` не задан. - - С максимальной версией, если `ver` задан. + - Последнюю в выборке, если `ver` не задан. Под выборкой здесь понимается набор строк в наборе партов, участвующих в слиянии. Последний по времени создания парт (последний инсерт) будет последним в выборке. Таким образом, после дедупликации для каждого значения ключа сортировки останется самая последняя строка из самого последнего инсерта. + - С максимальной версией, если `ver` задан. **Секции запроса** diff --git a/docs/ru/operations/server-configuration-parameters/settings.md b/docs/ru/operations/server-configuration-parameters/settings.md index 9941e4f3ac5..58aae05f188 100644 --- a/docs/ru/operations/server-configuration-parameters/settings.md +++ b/docs/ru/operations/server-configuration-parameters/settings.md @@ -127,7 +127,8 @@ ClickHouse проверяет условия для `min_part_size` и `min_part Если `true`, то каждый словарь создаётся при первом использовании. Если словарь не удалось создать, то вызов функции, использующей словарь, сгенерирует исключение. 
-Если `false`, то все словари создаются при старте сервера, и в случае ошибки сервер завершает работу. +Если `false`, то все словари создаются при старте сервера, если словарь или словари создаются слишком долго или создаются с ошибкой, то сервер загружается без +этих словарей и продолжает попытки создать эти словари. По умолчанию - `true`. diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index af0fc3e6137..b04a927f944 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -2099,6 +2099,48 @@ SELECT TOP 3 name, value FROM system.settings; └─────────────────────────┴─────────┘ ``` +## system_events_show_zero_values {#system_events_show_zero_values} + +Позволяет выбрать события с нулевыми значениями из таблицы [`system.events`](../../operations/system-tables/events.md). + +В некоторые системы мониторинга вам нужно передать значения всех измерений (для каждой контрольной точки), даже если в результате — "0". + +Возможные значения: + +- 0 — настройка отключена — вы получите все события. +- 1 — настройка включена — вы сможете отсортировать события по нулевым и остальным значениям. + +Значение по умолчанию: `0`. + +**Примеры** + +Запрос + +```sql +SELECT * FROM system.events WHERE event='QueryMemoryLimitExceeded'; +``` + +Результат + +```text +Ok. +``` + +Запрос + +```sql +SET system_events_show_zero_values = 1; +SELECT * FROM system.events WHERE event='QueryMemoryLimitExceeded'; +``` + +Результат + +```text +┌─event────────────────────┬─value─┬─description───────────────────────────────────────────┐ +│ QueryMemoryLimitExceeded │ 0 │ Number of times when memory limit exceeded for query. │ +└──────────────────────────┴───────┴───────────────────────────────────────────────────────┘ +``` + ## allow_experimental_bigint_types {#allow_experimental_bigint_types} Включает или отключает поддержку целочисленных значений, превышающих максимальное значение, допустимое для типа `int`. diff --git a/docs/ru/operations/system-tables/storage_policies.md b/docs/ru/operations/system-tables/storage_policies.md index df5c920b5ba..e62266af131 100644 --- a/docs/ru/operations/system-tables/storage_policies.md +++ b/docs/ru/operations/system-tables/storage_policies.md @@ -10,6 +10,7 @@ - `disks` ([Array(String)](../../sql-reference/data-types/array.md)) — имена дисков, содержащихся в политике хранения. - `max_data_part_size` ([UInt64](../../sql-reference/data-types/int-uint.md)) — максимальный размер куска данных, который может храниться на дисках тома (0 — без ограничений). - `move_factor` — доля доступного свободного места на томе, если места становится меньше, то данные начнут перемещение на следующий том, если он есть (по умолчанию 0.1). +- `prefer_not_to_merge` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Значение настройки `prefer_not_to_merge`. Если данная настройка включена, то слияние данных, хранящихся на данном томе, не допускается. Это позволяет контролировать работу ClickHouse с медленными дисками. Если политика хранения содержит несколько томов, то каждому тому соответствует отдельная запись в таблице. diff --git a/docs/ru/sql-reference/aggregate-functions/reference/rankCorr.md b/docs/ru/sql-reference/aggregate-functions/reference/rankCorr.md new file mode 100644 index 00000000000..48a19e87c52 --- /dev/null +++ b/docs/ru/sql-reference/aggregate-functions/reference/rankCorr.md @@ -0,0 +1,53 @@ +## rankCorr {#agg_function-rankcorr} + +Вычисляет коэффициент ранговой корреляции. 
+ +**Синтаксис** + +``` sql +rankCorr(x, y) +``` + +**Параметры** + +- `x` — Произвольное значение. [Float32](../../../sql-reference/data-types/float.md#float32-float64) или [Float64](../../../sql-reference/data-types/float.md#float32-float64). +- `y` — Произвольное значение. [Float32](../../../sql-reference/data-types/float.md#float32-float64) или [Float64](../../../sql-reference/data-types/float.md#float32-float64). + +**Возвращаемое значение** + +- Возвращает коэффициент ранговой корреляции рангов x и y. Значение коэффициента корреляции изменяется в пределах от -1 до +1. Если передается менее двух аргументов, функция возвращает исключение. Значение, близкое к +1, указывает на высокую линейную зависимость, и с увеличением одной случайной величины увеличивается и вторая случайная величина. Значение, близкое к -1, указывает на высокую линейную зависимость, и с увеличением одной случайной величины вторая случайная величина уменьшается. Значение, близкое или равное 0, означает отсутствие связи между двумя случайными величинами. + +Тип: [Float64](../../../sql-reference/data-types/float.md#float32-float64). + +**Пример** + +Запрос: + +``` sql +SELECT rankCorr(number, number) FROM numbers(100); +``` + +Результат: + +``` text +┌─rankCorr(number, number)─┐ +│ 1 │ +└──────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT roundBankers(rankCorr(exp(number), sin(number)), 3) FROM numbers(100); +``` + +Результат: + +``` text +┌─roundBankers(rankCorr(exp(number), sin(number)), 3)─┐ +│ -0.037 │ +└─────────────────────────────────────────────────────┘ +``` +**Смотрите также** + +- [Коэффициент ранговой корреляции Спирмена](https://ru.wikipedia.org/wiki/%D0%9A%D0%BE%D1%80%D1%80%D0%B5%D0%BB%D1%8F%D1%86%D0%B8%D1%8F#%D0%9A%D0%BE%D1%8D%D1%84%D1%84%D0%B8%D1%86%D0%B8%D0%B5%D0%BD%D1%82_%D1%80%D0%B0%D0%BD%D0%B3%D0%BE%D0%B2%D0%BE%D0%B9_%D0%BA%D0%BE%D1%80%D1%80%D0%B5%D0%BB%D1%8F%D1%86%D0%B8%D0%B8_%D0%A1%D0%BF%D0%B8%D1%80%D0%BC%D0%B5%D0%BD%D0%B0) \ No newline at end of file diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index deffc935870..3c9bd99de57 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -25,6 +25,40 @@ SELECT Поддерживаются только часовые пояса, отличающиеся от UTC на целое число часов. +## toTimeZone {#totimezone} + +Переводит дату или дату-с-временем в указанный часовой пояс. Часовой пояс (таймзона) это атрибут типов Date/DateTime, внутреннее значение (количество секунд) поля таблицы или колонки результата не изменяется, изменяется тип поля и автоматически его текстовое отображение. + +```sql +SELECT + toDateTime('2019-01-01 00:00:00', 'UTC') AS time_utc, + toTypeName(time_utc) AS type_utc, + toInt32(time_utc) AS int32utc, + toTimeZone(time_utc, 'Asia/Yekaterinburg') AS time_yekat, + toTypeName(time_yekat) AS type_yekat, + toInt32(time_yekat) AS int32yekat, + toTimeZone(time_utc, 'US/Samoa') AS time_samoa, + toTypeName(time_samoa) AS type_samoa, + toInt32(time_samoa) AS int32samoa +FORMAT Vertical; +``` + +```text +Row 1: +────── +time_utc: 2019-01-01 00:00:00 +type_utc: DateTime('UTC') +int32utc: 1546300800 +time_yekat: 2019-01-01 05:00:00 +type_yekat: DateTime('Asia/Yekaterinburg') +int32yekat: 1546300800 +time_samoa: 2018-12-31 13:00:00 +type_samoa: DateTime('US/Samoa') +int32samoa: 1546300800 +``` + +`toTimeZone(time_utc, 'Asia/Yekaterinburg')` изменяет тип `DateTime('UTC')` в `DateTime('Asia/Yekaterinburg')`. 
Значение (unix-время) 1546300800 остается неизменным, но текстовое отображение (результат функции toString()) меняется `time_utc: 2019-01-01 00:00:00` в `time_yekat: 2019-01-01 05:00:00`. + ## toYear {#toyear} Переводит дату или дату-с-временем в число типа UInt16, содержащее номер года (AD). @@ -57,32 +91,31 @@ SELECT ## toUnixTimestamp {#to-unix-timestamp} -For DateTime argument: converts value to its internal numeric representation (Unix Timestamp). -For String argument: parse datetime from string according to the timezone (optional second argument, server timezone is used by default) and returns the corresponding unix timestamp. -For Date argument: the behaviour is unspecified. +Переводит дату-с-временем в число типа UInt32 -- Unix Timestamp (https://en.wikipedia.org/wiki/Unix_time). +Для аргумента String, строка конвертируется в дату и время в соответствии с часовым поясом (необязательный второй аргумент, часовой пояс сервера используется по умолчанию). -**Syntax** +**Синтаксис** ``` sql toUnixTimestamp(datetime) toUnixTimestamp(str, [timezone]) ``` -**Returned value** +**Возвращаемое значение** -- Returns the unix timestamp. +- Возвращает Unix Timestamp. -Type: `UInt32`. +Тип: `UInt32`. -**Example** +**Пример** -Query: +Запрос: ``` sql SELECT toUnixTimestamp('2017-11-05 08:07:47', 'Asia/Tokyo') AS unix_timestamp ``` -Result: +Результат: ``` text ┌─unix_timestamp─┐ @@ -490,4 +523,4 @@ SELECT formatDateTime(toDate('2010-01-04'), '%g') └────────────────────────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/date_time_functions/) \ No newline at end of file +[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/date_time_functions/) diff --git a/docs/ru/sql-reference/functions/in-functions.md b/docs/ru/sql-reference/functions/in-functions.md index e137187a36b..b732f67303b 100644 --- a/docs/ru/sql-reference/functions/in-functions.md +++ b/docs/ru/sql-reference/functions/in-functions.md @@ -9,16 +9,4 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u Смотрите раздел [Операторы IN](../operators/in.md#select-in-operators). -## tuple(x, y, …), оператор (x, y, …) {#tuplex-y-operator-x-y} - -Функция, позволяющая сгруппировать несколько столбцов. -Для столбцов, имеющих типы T1, T2, … возвращает кортеж типа Tuple(T1, T2, …), содержащий эти столбцы. Выполнение функции ничего не стоит. -Кортежи обычно используются как промежуточное значение в качестве аргумента операторов IN, или для создания списка формальных параметров лямбда-функций. Кортежи не могут быть записаны в таблицу. - -## tupleElement(tuple, n), оператор x.N {#tupleelementtuple-n-operator-x-n} - -Функция, позволяющая достать столбец из кортежа. -N - индекс столбца начиная с 1. N должно быть константой. N должно быть целым строго положительным числом не большим размера кортежа. -Выполнение функции ничего не стоит. - [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/in_functions/) diff --git a/docs/ru/sql-reference/functions/tuple-functions.md b/docs/ru/sql-reference/functions/tuple-functions.md new file mode 100644 index 00000000000..f88886ec6f1 --- /dev/null +++ b/docs/ru/sql-reference/functions/tuple-functions.md @@ -0,0 +1,114 @@ +--- +toc_priority: 68 +toc_title: Функции для работы с кортежами +--- + +# Функции для работы с кортежами {#tuple-functions} + +## tuple {#tuple} + +Функция, позволяющая сгруппировать несколько столбцов. 
+Для столбцов, имеющих типы T1, T2, … возвращает кортеж типа Tuple(T1, T2, …), содержащий эти столбцы. Выполнение функции ничего не стоит. +Кортежи обычно используются как промежуточное значение в качестве аргумента операторов IN, или для создания списка формальных параметров лямбда-функций. Кортежи не могут быть записаны в таблицу. + +С помощью функции реализуется оператор `(x, y, …)`. + +**Синтаксис** + +``` sql +tuple(x, y, …) +``` + +## tupleElement {#tupleelement} + +Функция, позволяющая достать столбец из кортежа. +N - индекс столбца начиная с 1. N должно быть константой. N должно быть целым строго положительным числом не большим размера кортежа. +Выполнение функции ничего не стоит. + +С помощью функции реализуется оператор `x.N`. + +**Синтаксис** + +``` sql +tupleElement(tuple, n) +``` + +## untuple {#untuple} + +Выполняет синтаксическую подстановку элементов [кортежа](../../sql-reference/data-types/tuple.md#tuplet1-t2) в место вызова. + +**Синтаксис** + +``` sql +untuple(x) +``` + +Чтобы пропустить некоторые столбцы в результате запроса, вы можете использовать выражение `EXCEPT`. + +**Параметры** + +- `x` - функция `tuple`, столбец или кортеж элементов. [Tuple](../../sql-reference/data-types/tuple.md). + +**Возвращаемое значение** + +- Нет. + +**Примеры** + +Входная таблица: + +``` text +┌─key─┬─v1─┬─v2─┬─v3─┬─v4─┬─v5─┬─v6────────┐ +│ 1 │ 10 │ 20 │ 40 │ 30 │ 15 │ (33,'ab') │ +│ 2 │ 25 │ 65 │ 70 │ 40 │ 6 │ (44,'cd') │ +│ 3 │ 57 │ 30 │ 20 │ 10 │ 5 │ (55,'ef') │ +│ 4 │ 55 │ 12 │ 7 │ 80 │ 90 │ (66,'gh') │ +│ 5 │ 30 │ 50 │ 70 │ 25 │ 55 │ (77,'kl') │ +└─────┴────┴────┴────┴────┴────┴───────────┘ +``` + +Пример использования столбца типа `Tuple` в качестве параметра функции `untuple`: + +Запрос: + +``` sql +SELECT untuple(v6) FROM kv; +``` + +Результат: + +``` text +┌─_ut_1─┬─_ut_2─┐ +│ 33 │ ab │ +│ 44 │ cd │ +│ 55 │ ef │ +│ 66 │ gh │ +│ 77 │ kl │ +└───────┴───────┘ +``` + +Пример использования выражения `EXCEPT`: + +Запрос: + +``` sql +SELECT untuple((* EXCEPT (v2, v3),)) FROM kv; +``` + +Результат: + +``` text +┌─key─┬─v1─┬─v4─┬─v5─┬─v6────────┐ +│ 1 │ 10 │ 30 │ 15 │ (33,'ab') │ +│ 2 │ 25 │ 40 │ 6 │ (44,'cd') │ +│ 3 │ 57 │ 10 │ 5 │ (55,'ef') │ +│ 4 │ 55 │ 80 │ 90 │ (66,'gh') │ +│ 5 │ 30 │ 25 │ 55 │ (77,'kl') │ +└─────┴────┴────┴────┴───────────┘ +``` + +**Смотрите также** + +- [Tuple](../../sql-reference/data-types/tuple.md) + +[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/functions/tuple-functions/) diff --git a/docs/ru/sql-reference/statements/alter/delete.md b/docs/ru/sql-reference/statements/alter/delete.md index 29e1ae564d2..ee5f03d9d95 100644 --- a/docs/ru/sql-reference/statements/alter/delete.md +++ b/docs/ru/sql-reference/statements/alter/delete.md @@ -9,7 +9,7 @@ toc_title: DELETE ALTER TABLE [db.]table [ON CLUSTER cluster] DELETE WHERE filter_expr ``` -Позволяет удалить данные, соответствующие указанному выражению фильтрации. Реализовано как [мутация](../../../sql-reference/statements/alter/index.md#mutations). +Удаляет данные, соответствующие указанному выражению фильтрации. Реализовано как [мутация](../../../sql-reference/statements/alter/index.md#mutations). !!! note "Note" Префикс `ALTER TABLE` делает этот синтаксис отличным от большинства других систем, поддерживающих SQL. Он предназначен для обозначения того, что в отличие от аналогичных запросов в базах данных OLTP это тяжелая операция, не предназначенная для частого использования. 
diff --git a/docs/ru/sql-reference/statements/alter/partition.md b/docs/ru/sql-reference/statements/alter/partition.md index 5c4a23428ad..b43340467fc 100644 --- a/docs/ru/sql-reference/statements/alter/partition.md +++ b/docs/ru/sql-reference/statements/alter/partition.md @@ -19,10 +19,10 @@ toc_title: PARTITION - [FETCH PARTITION](#alter_fetch-partition) — скачать партицию с другого сервера; - [MOVE PARTITION\|PART](#alter_move-partition) — переместить партицию/кускок на другой диск или том. -## DETACH PARTITION {#alter_detach-partition} +## DETACH PARTITION\|PART {#alter_detach-partition} ``` sql -ALTER TABLE table_name DETACH PARTITION partition_expr +ALTER TABLE table_name DETACH PARTITION|PART partition_expr ``` Перемещает заданную партицию в директорию `detached`. Сервер не будет знать об этой партиции до тех пор, пока вы не выполните запрос [ATTACH](#alter_attach-partition). @@ -30,7 +30,8 @@ ALTER TABLE table_name DETACH PARTITION partition_expr Пример: ``` sql -ALTER TABLE visits DETACH PARTITION 201901 +ALTER TABLE mt DETACH PARTITION '2020-11-21'; +ALTER TABLE mt DETACH PART 'all_2_2_0'; ``` Подробнее о том, как корректно задать имя партиции, см. в разделе [Как задавать имя партиции в запросах ALTER](#alter-how-to-specify-part-expr). @@ -39,10 +40,10 @@ ALTER TABLE visits DETACH PARTITION 201901 Запрос реплицируется — данные будут перенесены в директорию `detached` и забыты на всех репликах. Обратите внимание, запрос может быть отправлен только на реплику-лидер. Чтобы узнать, является ли реплика лидером, выполните запрос `SELECT` к системной таблице [system.replicas](../../../operations/system-tables/replicas.md#system_tables-replicas). Либо можно выполнить запрос `DETACH` на всех репликах — тогда на всех репликах, кроме реплики-лидера, запрос вернет ошибку. -## DROP PARTITION {#alter_drop-partition} +## DROP PARTITION\|PART {#alter_drop-partition} ``` sql -ALTER TABLE table_name DROP PARTITION partition_expr +ALTER TABLE table_name DROP PARTITION|PART partition_expr ``` Удаляет партицию. Партиция помечается как неактивная и будет полностью удалена примерно через 10 минут. @@ -51,6 +52,13 @@ ALTER TABLE table_name DROP PARTITION partition_expr Запрос реплицируется — данные будут удалены на всех репликах. +Пример: + +``` sql +ALTER TABLE mt DROP PARTITION '2020-11-21'; +ALTER TABLE mt DROP PART 'all_4_4_0'; +``` + ## DROP DETACHED PARTITION\|PART {#alter_drop-detached} ``` sql @@ -235,6 +243,46 @@ ALTER TABLE hits MOVE PART '20190301_14343_16206_438' TO VOLUME 'slow' ALTER TABLE hits MOVE PARTITION '2019-09-01' TO DISK 'fast_ssd' ``` +## UPDATE IN PARTITION {#update-in-partition} + +Манипулирует данными в указанной партиции, соответствующими заданному выражению фильтрации. Реализовано как мутация [mutation](../../../sql-reference/statements/alter/index.md#mutations). + +Синтаксис: + +``` sql +ALTER TABLE [db.]table UPDATE column1 = expr1 [, ...] [IN PARTITION partition_id] WHERE filter_expr +``` + +### Пример + +``` sql +ALTER TABLE mt UPDATE x = x + 1 IN PARTITION 2 WHERE p = 2; +``` + +### Смотрите также + +- [UPDATE](../../../sql-reference/statements/alter/update.md#alter-table-update-statements) + +## DELETE IN PARTITION {#delete-in-partition} + +Удаляет данные в указанной партиции, соответствующие указанному выражению фильтрации. Реализовано как мутация [mutation](../../../sql-reference/statements/alter/index.md#mutations). 
+ +Синтаксис: + +``` sql +ALTER TABLE [db.]table DELETE [IN PARTITION partition_id] WHERE filter_expr +``` + +### Пример + +``` sql +ALTER TABLE mt DELETE IN PARTITION 2 WHERE p = 2; +``` + +### Смотрите также + +- [DELETE](../../../sql-reference/statements/alter/delete.md#alter-mutations) + ## Как задавать имя партиции в запросах ALTER {#alter-how-to-specify-part-expr} Чтобы задать нужную партицию в запросах `ALTER ... PARTITION`, можно использовать: @@ -254,6 +302,8 @@ ALTER TABLE hits MOVE PARTITION '2019-09-01' TO DISK 'fast_ssd' OPTIMIZE TABLE table_not_partitioned PARTITION tuple() FINAL; ``` +`IN PARTITION` указывает на партицию, для которой применяются выражения [UPDATE](../../../sql-reference/statements/alter/update.md#alter-table-update-statements) или [DELETE](../../../sql-reference/statements/alter/delete.md#alter-mutations) в результате запроса `ALTER TABLE`. Новые куски создаются только в указанной партиции. Таким образом, `IN PARTITION` помогает снизить нагрузку, когда таблица разбита на множество партиций, а вам нужно обновить данные лишь точечно. + Примеры запросов `ALTER ... PARTITION` можно посмотреть в тестах: [`00502_custom_partitioning_local`](https://github.com/ClickHouse/ClickHouse/blob/master/tests/queries/0_stateless/00502_custom_partitioning_local.sql) и [`00502_custom_partitioning_replicated_zookeeper`](https://github.com/ClickHouse/ClickHouse/blob/master/tests/queries/0_stateless/00502_custom_partitioning_replicated_zookeeper.sql). [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/alter/partition/) \ No newline at end of file diff --git a/docs/ru/sql-reference/statements/alter/update.md b/docs/ru/sql-reference/statements/alter/update.md index f497b2c4511..e3d6725419a 100644 --- a/docs/ru/sql-reference/statements/alter/update.md +++ b/docs/ru/sql-reference/statements/alter/update.md @@ -9,7 +9,7 @@ toc_title: UPDATE ALTER TABLE [db.]table UPDATE column1 = expr1 [, ...] WHERE filter_expr ``` -Позволяет манипулировать данными, соответствующими заданному выражению фильтрации. Реализовано как [мутация](../../../sql-reference/statements/alter/index.md#mutations). +Манипулирует данными, соответствующими заданному выражению фильтрации. Реализовано как [мутация](../../../sql-reference/statements/alter/index.md#mutations). !!! note "Note" Префикс `ALTER TABLE` делает этот синтаксис отличным от большинства других систем, поддерживающих SQL. Он предназначен для обозначения того, что в отличие от аналогичных запросов в базах данных OLTP это тяжелая операция, не предназначенная для частого использования. diff --git a/docs/ru/sql-reference/statements/system.md b/docs/ru/sql-reference/statements/system.md index 4780e9b613f..4f7ac98807d 100644 --- a/docs/ru/sql-reference/statements/system.md +++ b/docs/ru/sql-reference/statements/system.md @@ -130,7 +130,7 @@ ClickHouse может управлять фоновыми процессами Позволяет остановить фоновые мержи для таблиц семейства MergeTree: ``` sql -SYSTEM STOP MERGES [[db.]merge_tree_family_table_name] +SYSTEM STOP MERGES [ON VOLUME | [db.]merge_tree_family_table_name] ``` !!! 
note "Note" @@ -141,7 +141,7 @@ SYSTEM STOP MERGES [[db.]merge_tree_family_table_name] Включает фоновые мержи для таблиц семейства MergeTree: ``` sql -SYSTEM START MERGES [[db.]merge_tree_family_table_name] +SYSTEM START MERGES [ON VOLUME | [db.]merge_tree_family_table_name] ``` ### STOP TTL MERGES {#query_language-stop-ttl-merges} diff --git a/docs/ru/whats-new/extended-roadmap.md b/docs/ru/whats-new/extended-roadmap.md index 57a29ce90ad..aff8e1cbcfb 100644 --- a/docs/ru/whats-new/extended-roadmap.md +++ b/docs/ru/whats-new/extended-roadmap.md @@ -15,8 +15,6 @@ Задача «normalized z-Order curve» в перспективе может быть полезна для БК и Метрики, так как позволяет смешивать OrderID и PageID и избежать дублирования данных. В задаче также вводится способ индексации путём обращения функции нескольких аргументов на интервале, что имеет смысл для дальнейшего развития. -[Андрей Чулков](https://github.com/achulkov2), ВШЭ. - ### 1.2. + Wait-free каталог баз данных {#wait-free-katalog-baz-dannykh} Q2. Делает [Александр Токмаков](https://github.com/tavplubix), первый рабочий вариант в декабре 2019. Нужно для DataLens и Яндекс.Метрики. @@ -292,7 +290,8 @@ Upd. Иван Блинков сделал эту задачу путём зам ### 4.1. Уменьшение числа потоков при распределённых запросах {#umenshenie-chisla-potokov-pri-raspredelionnykh-zaprosakh} -Весна 2020. Upd. Есть прототип. Upd. Он не работает. Upd. Человек отказался от задачи, теперь сроки не определены. +Upd. Есть прототип. Upd. Он не работает. Upd. Человек отказался от задачи, теперь сроки не определены. +Upd. Павел Круглов, весна 2021. ### 4.2. Спекулятивное выполнение запросов на нескольких репликах {#spekuliativnoe-vypolnenie-zaprosov-na-neskolkikh-replikakh} @@ -306,6 +305,8 @@ Upd. Иван Блинков сделал эту задачу путём зам Upd. Сейчас обсуждается, как сделать другую задачу вместо этой. +Павел Круглов, весна 2021. + ### 4.3. Ограничение числа одновременных скачиваний с реплик {#ogranichenie-chisla-odnovremennykh-skachivanii-s-replik} Изначально делал Олег Алексеенков, но пока решение не готово, хотя там не так уж много доделывать. @@ -320,9 +321,10 @@ Upd. Сейчас обсуждается, как сделать другую з ### 4.7. Ленивая загрузка множеств для IN и JOIN с помощью k/v запросов {#lenivaia-zagruzka-mnozhestv-dlia-in-i-join-s-pomoshchiu-kv-zaprosov} -### 4.8. Разделить background pool для fetch и merge {#razdelit-background-pool-dlia-fetch-i-merge} +### 4.8. + Разделить background pool для fetch и merge {#razdelit-background-pool-dlia-fetch-i-merge} -В очереди. Исправить проблему, что восстанавливающаяся реплика перестаёт мержить. Частично компенсируется 4.3. +Исправить проблему, что восстанавливающаяся реплика перестаёт мержить. Частично компенсируется 4.3. +Ура, готово! Сделал Александр Сапин. ## 5. Операции {#operatsii} @@ -381,6 +383,7 @@ Upd. Появилась вторая версия LTS - 20.3. ### 6.5. Эксперименты с LLVM X-Ray {#eksperimenty-s-llvm-x-ray} Требует 2.2. +Перенос на 2021 или отмена. ### 6.6. + Стек трейс для любых исключений {#stek-treis-dlia-liubykh-iskliuchenii} @@ -401,6 +404,8 @@ Upd. В разработке. ### 6.10. Сбор общих системных метрик {#sbor-obshchikh-sistemnykh-metrik} +Перенос на весну 2021. + ## 7. Сопровождение разработки {#soprovozhdenie-razrabotki} @@ -461,7 +466,7 @@ UBSan включен в функциональных тестах, но не в ### 7.12. Показывать тестовое покрытие нового кода в PR {#pokazyvat-testovoe-pokrytie-novogo-koda-v-pr} Пока есть просто показ тестового покрытия всего кода. -Отложено. +Отложено на весну 2021. ### 7.13. 
+ Включение аналога -Weverything в gcc {#vkliuchenie-analoga-weverything-v-gcc} @@ -512,6 +517,7 @@ Upd. Минимальная подсветка добавлена, а все о Поводом использования libressl послужило желание нашего хорошего друга из известной компании несколько лет назад. Но сейчас ситуация состоит в том, что openssl продолжает развиваться, а libressl не особо, и можно спокойно менять обратно. Нужно для Яндекс.Облака для поддержки TLS 1.3. +Теперь нужно заменить OpenSSL на BoringSSL. ### 7.16. + tzdata внутри бинарника {#tzdata-vnutri-binarnika} @@ -612,7 +618,7 @@ Upd. Эльдар Заитов добавляет OSS Fuzz. Upd. Сделаны randomString, randomFixedString. Upd. Сделаны fuzzBits. -### 7.24. Fuzzing лексера и парсера запросов; кодеков и форматов {#fuzzing-leksera-i-parsera-zaprosov-kodekov-i-formatov} +### 7.24. + Fuzzing лексера и парсера запросов; кодеков и форматов {#fuzzing-leksera-i-parsera-zaprosov-kodekov-i-formatov} Продолжение 7.23. @@ -656,6 +662,7 @@ Upd. В Аркадии частично работает небольшая ча ### 7.30. Возможность переключения бинарных файлов на продакшене без выкладки пакетов {#vozmozhnost-perekliucheniia-binarnykh-failov-na-prodakshene-bez-vykladki-paketov} Низкий приоритет. +Сделали файл clickhouse.old. ### 7.31. Зеркалирование нагрузки между серверами {#zerkalirovanie-nagruzki-mezhdu-serverami} @@ -737,7 +744,7 @@ Upd. Задача взята в работу. ### 8.6. Kerberos аутентификация для HDFS и Kafka {#kerberos-autentifikatsiia-dlia-hdfs-i-kafka} Андрей Коняев, ArenaData. Он куда-то пропал. -Upd. В процессе работа для Kafka. +Для Kafka готово, для HDFS в процессе. ### 8.7. + Исправление мелочи HDFS на очень старых ядрах Linux {#ispravlenie-melochi-hdfs-na-ochen-starykh-iadrakh-linux} @@ -1024,14 +1031,14 @@ Upd. Сделано хранение прав. До готового к испо [Виталий Баранов](https://github.com/vitlibar). Финальная стадия разработки, рабочая версия в декабре 2019. Q1. Сделано управление правами полностью, но не реализовано их хранение, см. 12.1. -### 12.3. Подключение справочника пользователей и прав доступа из LDAP {#podkliuchenie-spravochnika-polzovatelei-i-prav-dostupa-iz-ldap} +### 12.3. + Подключение справочника пользователей и прав доступа из LDAP {#podkliuchenie-spravochnika-polzovatelei-i-prav-dostupa-iz-ldap} Аутентификация через LDAP - Денис Глазачев. [Виталий Баранов](https://github.com/vitlibar) и Денис Глазачев, Altinity. Требует 12.1. Q3. Upd. Pull request на финальной стадии. -### 12.4. Подключение IDM системы Яндекса как справочника пользователей и прав доступа {#podkliuchenie-idm-sistemy-iandeksa-kak-spravochnika-polzovatelei-i-prav-dostupa} +### 12.4. - Подключение IDM системы Яндекса как справочника пользователей и прав доступа {#podkliuchenie-idm-sistemy-iandeksa-kak-spravochnika-polzovatelei-i-prav-dostupa} Пока низкий приоритет. Нужно для Метрики. Требует 12.3. Отложено. @@ -1051,7 +1058,7 @@ Upd. Есть pull request. ### 13.1. Overcommit запросов по памяти и вытеснение {#overcommit-zaprosov-po-pamiati-i-vytesnenie} -Требует 2.1. Способ реализации обсуждается. Александр Казаков. +Требует 2.1. Способ реализации обсуждается. ### 13.2. Общий конвейер выполнения на сервер {#obshchii-konveier-vypolneniia-na-server} @@ -1059,8 +1066,6 @@ Upd. Есть pull request. ### 13.3. Пулы ресурсов {#puly-resursov} -Александр Казаков. - Требует 13.2 или сможем сделать более неудобную реализацию раньше. Обсуждается вариант неудобной реализации. Пока средний приоритет, целимся на Q1/Q2. Вариант реализации выбрал Александр Казаков. @@ -1068,6 +1073,7 @@ Upd. Не уследили, и задачу стали обсуждать мен Upd. 
Задачу смотрит Александр Казаков. Upd. Задача взята в работу. Upd. Задача как будто взята в работу. +Upd. Задачу не сделал. ## 14. Диалект SQL {#dialekt-sql} @@ -1082,19 +1088,18 @@ Upd. Задача как будто взята в работу. ### 14.3. Поддержка подстановок для множеств в правой части IN {#podderzhka-podstanovok-dlia-mnozhestv-v-pravoi-chasti-in} -### 14.4. Поддержка подстановок для идентификаторов (имён) в SQL запросе {#podderzhka-podstanovok-dlia-identifikatorov-imion-v-sql-zaprose} +### 14.4. + Поддержка подстановок для идентификаторов (имён) в SQL запросе {#podderzhka-podstanovok-dlia-identifikatorov-imion-v-sql-zaprose} -zhang2014 -Задача на паузе. +Amos Bird сделал. ### 14.5. + Поддержка задания множества как массива в правой части секции IN {#podderzhka-zadaniia-mnozhestva-kak-massiva-v-pravoi-chasti-sektsii-in} Василий Немков, Altinity, делал эту задачу, но забросил её в пользу других задач. В результате, сейчас доделывает Антон Попов. -### 14.6. Глобальный scope для WITH {#globalnyi-scope-dlia-with} +### 14.6. + Глобальный scope для WITH {#globalnyi-scope-dlia-with} -В обсуждении. Amos Bird. +Amos Bird сделал. ### 14.7. Nullable для WITH ROLLUP, WITH CUBE, WITH TOTALS {#nullable-dlia-with-rollup-with-cube-with-totals} @@ -1148,13 +1153,13 @@ Upd. Есть pull request. Готово. ### 14.17. + Ввести понятие stateful функций {#vvesti-poniatie-stateful-funktsii} -zhang2014. Для runningDifference, neighbour - их учёт в оптимизаторе запросов. В интерфейсе уже сделано. Надо проверить, что учитывается в нужных местах (например, что работает predicate pushdown сквозь ORDER BY, если таких функций нет). +Александр Кузьменков. -### 14.18. UNION DISTINCT и возможность включить его по-умолчанию {#union-distinct-i-vozmozhnost-vkliuchit-ego-po-umolchaniiu} +### 14.18. + UNION DISTINCT и возможность включить его по-умолчанию {#union-distinct-i-vozmozhnost-vkliuchit-ego-po-umolchaniiu} -Для BI систем. +Для BI систем. flynn ucasFL. ### 14.19. + Совместимость парсера типов данных с SQL {#sovmestimost-parsera-tipov-dannykh-s-sql} @@ -1278,7 +1283,7 @@ Upd. Есть pull request. Исправление фундаментальной проблемы - есть PR. Фундаментальная проблема решена. -### 18.2. Агрегатные функции для статистических тестов {#agregatnye-funktsii-dlia-statisticheskikh-testov} +### 18.2. + Агрегатные функции для статистических тестов {#agregatnye-funktsii-dlia-statisticheskikh-testov} Артём Цыганов, Руденский Константин Игоревич, Семёнов Денис, ВШЭ. @@ -1286,6 +1291,7 @@ Upd. Есть pull request. Сделали прототип двух тестов, есть pull request. Также есть pull request для корелляции рангов. Upd. Помержили корелляцию рангов, но ещё не помержили сравнение t-test, u-test. +Upd. Всё доделал Никита Михайлов. ### 18.3. Инфраструктура для тренировки моделей в ClickHouse {#infrastruktura-dlia-trenirovki-modelei-v-clickhouse} @@ -1295,7 +1301,7 @@ Upd. Помержили корелляцию рангов, но ещё не по ## 19. Улучшение работы кластера {#uluchshenie-raboty-klastera} -### 19.1. Параллельные кворумные вставки без линеаризуемости {#parallelnye-kvorumnye-vstavki-bez-linearizuemosti} +### 19.1. + Параллельные кворумные вставки без линеаризуемости {#parallelnye-kvorumnye-vstavki-bez-linearizuemosti} Upd. В работе, ожидается в начале октября. @@ -1361,6 +1367,8 @@ Upd. Задача в разработке. ### 20.2. 
Поддержка DELETE путём преобразования множества ключей в множество row_numbers на реплике, столбца флагов и индекса по диапазонам {#podderzhka-delete-putiom-preobrazovaniia-mnozhestva-kliuchei-v-mnozhestvo-row-numbers-na-replike-stolbtsa-flagov-i-indeksa-po-diapazonam} +Задача назначена на 2021. + ### 20.3. Поддержка ленивых DELETE путём запоминания выражений и преобразования к множеству ключей в фоне {#podderzhka-lenivykh-delete-putiom-zapominaniia-vyrazhenii-i-preobrazovaniia-k-mnozhestvu-kliuchei-v-fone} ### 20.4. Поддержка UPDATE с помощью преобразования в DELETE и вставок {#podderzhka-update-s-pomoshchiu-preobrazovaniia-v-delete-i-vstavok} @@ -1413,6 +1421,7 @@ ucasFL, в разработке. Готово. [Achimbab](https://github.com/achimbab). Есть pull request. Но это не совсем то. Upd. В обсуждении. +Upd. Назначено на 2021. ### 21.8. Взаимная интеграция аллокатора и кэша {#vzaimnaia-integratsiia-allokatora-i-kesha} @@ -1427,6 +1436,7 @@ Upd. В обсуждении. Upd. Есть нерабочий прототип, скорее всего будет отложено. Upd. Отложено до осени. Upd. Отложено до. +Upd. Отложено. ### 21.8.1. Отдельный аллокатор для кэшей с ASLR {#otdelnyi-allokator-dlia-keshei-s-aslr} @@ -1517,7 +1527,7 @@ Upd. Сделаны самые существенные из предложен Для сортировки по кортежам используется обычная сортировка с компаратором, который в цикле по элементам кортежа делает виртуальные вызовы `IColumn::compareAt`. Это неоптимально - как из-за короткого цикла по неизвестному в compile-time количеству элементов, так и из-за виртуальных вызовов. Чтобы обойтись без виртуальных вызовов, есть метод `IColumn::getPermutation`. Он используется в случае сортировки по одному столбцу. Есть вариант, что в случае сортировки по кортежу, что-то похожее тоже можно применить… например, сделать метод `updatePermutation`, принимающий аргументы offset и limit, и допереставляющий перестановку в диапазоне значений, в которых предыдущий столбец имел равные значения. -3. RadixSort для сортировки. +\+ 3. RadixSort для сортировки. Один наш знакомый начал делать задачу по попытке использования RadixSort для сортировки столбцов. Был сделан вариант indirect сортировки (для `getPermutation`), но не оптимизирован до конца - есть лишние ненужные перекладывания элементов. Для того, чтобы его оптимизировать, придётся добавить немного шаблонной магии (на последнем шаге что-то не копировать, вместо перекладывания индексов - складывать их в готовое место). Также этот человек добавил метод MSD Radix Sort для реализации radix partial sort. Но даже не проверил производительность. @@ -1527,7 +1537,9 @@ Upd. Сделаны самые существенные из предложен Виртуальный метод `compareAt` возвращает -1, 0, 1. Но алгоритмы сортировки сравнениями обычно рассчитаны на `operator<` и не могут получить преимущества от three-way comparison. А можно ли написать так, чтобы преимущество было? -5. pdq partial sort +\+ 5. pdq partial sort + +Upd. Данила Кутенин решил эту задачу ультимативно, используя Floyd–Rivest алгоритм. Хороший алгоритм сортировки сравнениями `pdqsort` не имеет варианта partial sort. Заметим, что на практике, почти все сортировки в запросах ClickHouse являются partial_sort, так как `ORDER BY` почти всегда идёт с `LIMIT`. Кстати, Данила Кутенин уже попробовал это и показал, что в тривиальном случае преимущества нет. Но не очевидно, что нельзя сделать лучше. @@ -1619,6 +1631,7 @@ Upd. Добавили таймауты. Altinity. Я не в курсе, какой статус. +Там предлагают очень сложное решение вместо простого. ### 22.16. 
+ Исправление низкой производительности кодека DoubleDelta {#ispravlenie-nizkoi-proizvoditelnosti-kodeka-doubledelta} @@ -1656,15 +1669,15 @@ Upd. Готово. Нужно для Метрики. Алексей Миловидов. -### 22.25. Избавиться от библиотеки btrie {#izbavitsia-ot-biblioteki-btrie} +### 22.25. + Избавиться от библиотеки btrie {#izbavitsia-ot-biblioteki-btrie} -Алексей Миловидов. Низкий приоритет. +Владимир Черкасов сделал эту задачу. ### 22.26. Плохая производительность quantileTDigest {#plokhaia-proizvoditelnost-quantiletdigest} [#2668](https://github.com/ClickHouse/ClickHouse/issues/2668) -Алексей Миловидов или будет переназначено. +Павел Круглов и Илья Щербак (ВК). ### 22.27. Проверить несколько PR, которые были закрыты zhang2014 и sundy-li {#proverit-neskolko-pr-kotorye-byli-zakryty-zhang2014-i-sundy-li} @@ -1766,7 +1779,7 @@ Upd. Отменено. Виталий Баранов. Отложено, после бэкапов. -### 24.5. Поддержка функций шифрования для отдельных значений {#podderzhka-funktsii-shifrovaniia-dlia-otdelnykh-znachenii} +### 24.5. + Поддержка функций шифрования для отдельных значений {#podderzhka-funktsii-shifrovaniia-dlia-otdelnykh-znachenii} Смотрите также 24.5. @@ -1775,6 +1788,7 @@ Upd. Отменено. Делает Василий Немков, Altinity Есть pull request в процессе ревью, исправляем проблемы производительности. +Сейчас в состоянии, что уже добавлено в продакшен, но производительность всё ещё низкая (тех долг). ### 24.6. Userspace RAID {#userspace-raid} @@ -1825,7 +1839,7 @@ RAID позволяет одновременно увеличить надёжн Upd. Есть pull request. В стадии ревью. Готово. -### 24.10. Поддержка типов half/bfloat16/unum {#podderzhka-tipov-halfbfloat16unum} +### 24.10. - Поддержка типов half/bfloat16/unum {#podderzhka-tipov-halfbfloat16unum} [#7657](https://github.com/ClickHouse/ClickHouse/issues/7657) @@ -1833,6 +1847,7 @@ Upd. Есть pull request. В стадии ревью. Готово. Есть pull request на промежуточной стадии. Отложено. +Отменено. ### 24.11. User Defined Functions {#user-defined-functions} @@ -1882,10 +1897,12 @@ Upd. Прототип bitonic sort помержен, но целесообраз Требует 2.1. Upd. Есть два прототипа от внешних контрибьюторов. +Александр Кузьменков. ### 24.15. Поддержка полуструктурированных данных {#podderzhka-polustrukturirovannykh-dannykh} Требует 1.14 и 2.10. +Антон Попов. ### 24.16. Улучшение эвристики слияний {#uluchshenie-evristiki-sliianii} @@ -1915,6 +1932,7 @@ Upd. Есть pull request - в большинстве случаев однов ### 24.21. Реализация в ClickHouse протокола распределённого консенсуса {#realizatsiia-v-clickhouse-protokola-raspredelionnogo-konsensusa} Имеет смысл только после 19.2. +Александр Сапин. ### 24.22. Вывод типов по блоку данных. Вывод формата данных по примеру {#vyvod-tipov-po-bloku-dannykh-vyvod-formata-dannykh-po-primeru} @@ -1955,13 +1973,14 @@ ClickHouse также может использоваться для быстр Михаил Филитов, ВШЭ. Upd. Есть pull request. Нужно ещё чистить код библиотеки. -### 24.26. Поддержка open tracing или аналогов {#podderzhka-open-tracing-ili-analogov} +### 24.26. + Поддержка open tracing или аналогов {#podderzhka-open-tracing-ili-analogov} [#5182](https://github.com/ClickHouse/ClickHouse/issues/5182) Александр Кожихов, ВШЭ и Яндекс.YT. Upd. Есть pull request с прототипом. Upd. Александ Кузьменков взял задачу в работу. +Сделано. ### 24.27. Реализация алгоритмов min-hash, sim-hash для нечёткого поиска полудубликатов {#realizatsiia-algoritmov-min-hash-sim-hash-dlia-nechiotkogo-poiska-poludublikatov} @@ -1995,7 +2014,7 @@ Amos Bird, но его решение слишком громоздкое и п Перепиcывание в JOIN. 
Не раньше 21.11, 21.12, 21.9. Низкий приоритет. Отложено. -### 24.32. Поддержка GRPC {#podderzhka-grpc} +### 24.32. + Поддержка GRPC {#podderzhka-grpc} Мария Конькова, ВШЭ и Яндекс. Также смотрите 24.29. @@ -2009,6 +2028,7 @@ Amos Bird, но его решение слишком громоздкое и п Задача в работе, есть pull request. [#10136](https://github.com/ClickHouse/ClickHouse/pull/10136) Upd. Задачу взял в работу Виталий Баранов. +Сделано. ## 25. DevRel {#devrel} @@ -2067,13 +2087,14 @@ Upd. Задачу взял в работу Виталий Баранов. Алексей Миловидов и все подготовленные докладчики. Upd. Участвуем. -### 25.14. Конференции в России: все HighLoad, возможно CodeFest, DUMP или UWDC, возможно C++ Russia {#konferentsii-v-rossii-vse-highload-vozmozhno-codefest-dump-ili-uwdc-vozmozhno-c-russia} +### 25.14. + Конференции в России: все HighLoad, возможно CodeFest, DUMP или UWDC, возможно C++ Russia {#konferentsii-v-rossii-vse-highload-vozmozhno-codefest-dump-ili-uwdc-vozmozhno-c-russia} Алексей Миловидов и все подготовленные докладчики. Upd. Есть Saint HighLoad online. Upd. Есть C++ Russia. CodeFest, DUMP, UWDC отменились. Upd. Добавились Highload Fwdays, Матемаркетинг. +Upd. Добавились подкасты C++ Russia. ### 25.15. Конференции зарубежные: Percona, DataOps, попытка попасть на более крупные {#konferentsii-zarubezhnye-percona-dataops-popytka-popast-na-bolee-krupnye} @@ -2096,6 +2117,7 @@ DataOps отменилась. Есть минимальный прототип. Сделал Илья Яцишин. Этот прототип не позволяет делиться ссылками на результаты запросов. Upd. На финальной стадии инструмент для экспериментирования с разными версиями ClickHouse. +Upd. По факту, задача считается не сделанной (готово только 99%, не 100%). ### 25.17. Взаимодействие с ВУЗами: ВШЭ, УрФУ, ICT Beijing {#vzaimodeistvie-s-vuzami-vshe-urfu-ict-beijing} @@ -2103,6 +2125,7 @@ Upd. На финальной стадии инструмент для экспе Благодаря Robert Hodges добавлен CMU. Upd. Взаимодействие с ВШЭ 2019/2020 успешно выполнено. Upd. Идёт подготовка к 2020/2021. +Upd. Уже взяли несколько десятков человек на 2020/2021. ### 25.18. - Лекция в ШАД {#lektsiia-v-shad} diff --git a/docs/tools/build.py b/docs/tools/build.py index bcbf3ac27cd..45d74423fa8 100755 --- a/docs/tools/build.py +++ b/docs/tools/build.py @@ -202,7 +202,11 @@ def build(args): if __name__ == '__main__': os.chdir(os.path.join(os.path.dirname(__file__), '..')) - website_dir = os.path.join('..', 'website') + + # A root path to ClickHouse source code. + src_dir = '..' 
+ + website_dir = os.path.join(src_dir, 'website') arg_parser = argparse.ArgumentParser() arg_parser.add_argument('--lang', default='en,es,fr,ru,zh,ja,tr,fa') @@ -210,6 +214,7 @@ if __name__ == '__main__': arg_parser.add_argument('--docs-dir', default='.') arg_parser.add_argument('--theme-dir', default=website_dir) arg_parser.add_argument('--website-dir', default=website_dir) + arg_parser.add_argument('--src-dir', default=src_dir) arg_parser.add_argument('--blog-dir', default=os.path.join(website_dir, 'blog')) arg_parser.add_argument('--output-dir', default='build') arg_parser.add_argument('--enable-stable-releases', action='store_true') diff --git a/docs/tools/requirements.txt b/docs/tools/requirements.txt index b21eb4892fd..4106100bfa3 100644 --- a/docs/tools/requirements.txt +++ b/docs/tools/requirements.txt @@ -18,7 +18,7 @@ Markdown==3.3.2 MarkupSafe==1.1.1 mkdocs==1.1.2 mkdocs-htmlproofer-plugin==0.0.3 -mkdocs-macros-plugin==0.5.0 +mkdocs-macros-plugin==0.4.20 nltk==3.5 nose==1.3.7 protobuf==3.14.0 diff --git a/docs/tools/website.py b/docs/tools/website.py index a658b0cfc34..4cce69bd869 100644 --- a/docs/tools/website.py +++ b/docs/tools/website.py @@ -145,13 +145,19 @@ def build_website(args): 'public', 'node_modules', 'templates', - 'locale' + 'locale', + '.gitkeep' ) ) + + # This file can be requested to check for available ClickHouse releases. + shutil.copy2( + os.path.join(args.src_dir, 'utils', 'list-versions', 'version_date.tsv'), + os.path.join(args.output_dir, 'data', 'version_date.tsv')) + shutil.copy2( os.path.join(args.website_dir, 'js', 'embedd.min.js'), - os.path.join(args.output_dir, 'js', 'embedd.min.js') - ) + os.path.join(args.output_dir, 'js', 'embedd.min.js')) for root, _, filenames in os.walk(args.output_dir): for filename in filenames: diff --git a/docs/tr/development/contrib.md b/docs/tr/development/contrib.md index 63cc289ec9b..f56cf2a625b 100644 --- a/docs/tr/development/contrib.md +++ b/docs/tr/development/contrib.md @@ -19,7 +19,6 @@ toc_title: "Kullan\u0131lan \xDC\xE7\xFCnc\xFC Taraf K\xFCt\xFCphaneleri" | googletest | [BSD 3-Clause Lisansı](https://github.com/google/googletest/blob/master/LICENSE) | | h33 | [Apache Lic 2.0ense 2.0](https://github.com/uber/h3/blob/master/LICENSE) | | hyperscan | [BSD 3-Clause Lisansı](https://github.com/intel/hyperscan/blob/master/LICENSE) | -| libbtrie | [BSD 2-Clause Lisansı](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) | | libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) | | libdivide | [Zlib Lisansı](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) | | libgsasl | [LGPL v2. 
1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) | diff --git a/docs/zh/development/contrib.md b/docs/zh/development/contrib.md index 0129ee62ce7..8e8efc3c04e 100644 --- a/docs/zh/development/contrib.md +++ b/docs/zh/development/contrib.md @@ -11,7 +11,6 @@ | FastMemcpy | [MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libmemcpy/impl/LICENSE) | | googletest | [BSD3-条款许可](https://github.com/google/googletest/blob/master/LICENSE) | | 超扫描 | [BSD3-条款许可](https://github.com/intel/hyperscan/blob/master/LICENSE) | -| libbtrie | [BSD2-条款许可](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) | | libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) | | libdivide | [Zlib许可证](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) | | libgsasl | [LGPL v2.1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) | diff --git a/docs/zh/getting-started/example-datasets/amplab-benchmark.md b/docs/zh/getting-started/example-datasets/amplab-benchmark.md index 11a1c34b91e..22f2735968b 100644 --- a/docs/zh/getting-started/example-datasets/amplab-benchmark.md +++ b/docs/zh/getting-started/example-datasets/amplab-benchmark.md @@ -1,8 +1,13 @@ -# AMPLab大数据基准测试 {#amplab-da-shu-ju-ji-zhun-ce-shi} +--- +toc_priority: 19 +toc_title: AMPLab Big Data Benchmark +--- + +# AMPLab Big Data Benchmark {#amplab-big-data-benchmark} 参考 https://amplab.cs.berkeley.edu/benchmark/ -需要您在https://aws.amazon.com注册一个免费的账号。注册时需要您提供信用卡、邮箱、电话等信息。之后可以在https://console.aws.amazon.com/iam/home?nc2=h_m_sc#security_credential获取新的访问密钥 +需要您在[Amazon](https://aws.amazon.com)注册一个免费的账号。注册时需要您提供信用卡、邮箱、电话等信息。之后可以在[Amazon AWS Console](https://console.aws.amazon.com/iam/home?nc2=h_m_sc#security_credential)获取新的访问密钥 在控制台运行以下命令: diff --git a/docs/zh/getting-started/example-datasets/criteo.md b/docs/zh/getting-started/example-datasets/criteo.md index 8135634f9c4..40ca1b4a781 100644 --- a/docs/zh/getting-started/example-datasets/criteo.md +++ b/docs/zh/getting-started/example-datasets/criteo.md @@ -1,6 +1,11 @@ -# Criteo TB级别点击日志 {#criteo-tbji-bie-dian-ji-ri-zhi} +--- +toc_priority: 18 +toc_title: Terabyte Click Logs from Criteo +--- -可以从http://labs.criteo.com/downloads/download-terabyte-click-logs/上下载数据 +# Terabyte of Click Logs from Criteo {#criteo-tbji-bie-dian-ji-ri-zhi} + +可以从 http://labs.criteo.com/downloads/download-terabyte-click-logs/ 上下载数据 创建原始数据对应的表结构: diff --git a/docs/zh/getting-started/example-datasets/index.md b/docs/zh/getting-started/example-datasets/index.md index 15f4d483312..acd554ca4da 100644 --- a/docs/zh/getting-started/example-datasets/index.md +++ b/docs/zh/getting-started/example-datasets/index.md @@ -6,15 +6,16 @@ toc_title: "\u5BFC\u8A00" # 示例数据集 {#example-datasets} -本节介绍如何获取示例数据集并将其导入ClickHouse。 +本节介绍如何获取示例数据集并将其导入ClickHouse。对于某些数据集,还可以使用示例查询。 + 对于某些数据集示例查询也可用。 -- [脱敏的Yandex.Metrica数据集](metrica.md) -- [星型基准测试](star-schema.md) -- [维基访问数据](wikistat.md) -- [Criteo TB级别点击日志](criteo.md) -- [AMPLab大数据基准测试](amplab-benchmark.md) -- [纽约出租车数据](nyc-taxi.md) -- [航班飞行数据](ontime.md) +- [Anonymized Yandex.Metrica Dataset](../../getting-started/example-datasets/metrica.md) +- [Star Schema Benchmark](../../getting-started/example-datasets/star-schema.md) +- [WikiStat](../../getting-started/example-datasets/wikistat.md) +- [Terabyte of Click Logs from 
Criteo](../../getting-started/example-datasets/criteo.md) +- [AMPLab Big Data Benchmark](../../getting-started/example-datasets/amplab-benchmark.md) +- [New York Taxi Data](../../getting-started/example-datasets/nyc-taxi.md) +- [OnTime](../../getting-started/example-datasets/ontime.md) [原始文章](https://clickhouse.tech/docs/en/getting_started/example_datasets) diff --git a/docs/zh/getting-started/example-datasets/metrica.md b/docs/zh/getting-started/example-datasets/metrica.md index 22dccc26dcb..353a24ce0cb 100644 --- a/docs/zh/getting-started/example-datasets/metrica.md +++ b/docs/zh/getting-started/example-datasets/metrica.md @@ -1,17 +1,17 @@ --- -toc_priority: 21 -toc_title: "Yandex\u6885\u7279\u91CC\u5361\u6570\u636E" +toc_priority: 15 +toc_title: Yandex.Metrica Data --- -# 脱敏的Yandex.Metrica数据集 {#anonymized-yandex-metrica-data} +# Anonymized Yandex.Metrica Data {#anonymized-yandex-metrica-data} -Dataset由两个表组成,其中包含有关命中的匿名数据 (`hits_v1`)和访问 (`visits_v1`)的Yandex的。梅特里卡 你可以阅读更多关于Yandex的。梅特里卡 [ClickHouse历史](../../introduction/history.md) 科。 +数据集由两个表组成,包含关于Yandex.Metrica的hits(`hits_v1`)和visit(`visits_v1`)的匿名数据。你可以阅读更多关于Yandex的信息。在[ClickHouse历史](../../introduction/history.md)的Metrica部分。 -数据集由两个表组成,其中任何一个都可以作为压缩表下载 `tsv.xz` 文件或作为准备的分区。 除此之外,该扩展版本 `hits` 包含1亿行的表可作为TSV在https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_100m_obfuscated_v1.tsv.xz 并作为准备的分区在https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_100m_obfuscated_v1.tar.xz. +数据集由两个表组成,他们中的任何一个都可以下载作为一个压缩`tsv.xz`的文件或准备的分区。除此之外,一个扩展版的`hits`表包含1亿行TSV在https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_100m_obfuscated_v1.tsv.xz,准备分区在https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_100m_obfuscated_v1.tar.xz。 ## 从准备好的分区获取表 {#obtaining-tables-from-prepared-partitions} -下载和导入点击表: +下载和导入`hits`表: ``` bash curl -O https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_v1.tar @@ -21,7 +21,7 @@ sudo service clickhouse-server restart clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1" ``` -下载和导入访问: +下载和导入`visits`表: ``` bash curl -O https://clickhouse-datasets.s3.yandex.net/visits/partitions/visits_v1.tar @@ -31,9 +31,9 @@ sudo service clickhouse-server restart clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" ``` -## 从压缩TSV文件获取表 {#obtaining-tables-from-compressed-tsv-file} +## 从TSV压缩文件获取表 {#obtaining-tables-from-compressed-tsv-file} -从压缩的TSV文件下载并导入命中: +从TSV压缩文件下载并导入`hits`: ``` bash curl https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv @@ -47,7 +47,7 @@ clickhouse-client --query "OPTIMIZE TABLE datasets.hits_v1 FINAL" clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1" ``` -从压缩tsv文件下载和导入访问: +从压缩tsv文件下载和导入`visits`: ``` bash curl https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv @@ -63,6 +63,6 @@ clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" ## 查询示例 {#example-queries} -[点击教程](../../getting-started/tutorial.md) 是基于Yandex的。Metrica数据集和开始使用此数据集的推荐方式是通过教程。 +[使用教程](../../getting-started/tutorial.md)是以Yandex.Metrica数据集开始教程。 -查询这些表的其他示例可以在 [有状态测试](https://github.com/ClickHouse/ClickHouse/tree/master/tests/queries/1_stateful) ClickHouse的(它们被命名为 `test.hists` 和 `test.visits` 那里)。 +可以在ClickHouse的[stateful tests](https://github.com/ClickHouse/ClickHouse/tree/master/tests/queries/1_stateful) 中找到对这些表的查询的其他示例(它们被命名为`test.hists`和`test.visits`)。 diff --git a/docs/zh/getting-started/example-datasets/nyc-taxi.md 
b/docs/zh/getting-started/example-datasets/nyc-taxi.md index fa146d1ca38..c6b41e9d396 100644 --- a/docs/zh/getting-started/example-datasets/nyc-taxi.md +++ b/docs/zh/getting-started/example-datasets/nyc-taxi.md @@ -1,15 +1,20 @@ -# 纽约市出租车数据 {#niu-yue-shi-chu-zu-che-shu-ju} +--- +toc_priority: 20 +toc_title: New York Taxi Data +--- + +# 纽约出租车数据 {#niu-yue-shi-chu-zu-che-shu-ju} 纽约市出租车数据有以下两个方式获取: -从原始数据导入 -下载预处理好的分区数据 +- 从原始数据导入 +- 下载处理好的数据 ## 怎样导入原始数据 {#zen-yang-dao-ru-yuan-shi-shu-ju} -可以参考https://github.com/toddwschneider/nyc-taxi-data和http://tech.marksblogg.com/billion-nyc-taxi-rides-redshift.html中的关于数据集结构描述与数据下载指令说明。 +可以参考 https://github.com/toddwschneider/nyc-taxi-data 和 http://tech.marksblogg.com/billion-nyc-taxi-rides-redshift.html 中的关于数据集结构描述与数据下载指令说明。 -数据集包含227GB的CSV文件。这大约需要一个小时的下载时间(1Gbit带宽下,并行下载大概是一半时间)。 +数据集包含227GB的CSV文件。在1Gbit的带宽下,下载大约需要一个小时(从s3.amazonaws.com并行下载至少可以缩减一半时间)。 下载时注意损坏的文件。可以检查文件大小并重新下载损坏的文件。 有些文件中包含一些无效的行,您可以使用如下语句修复他们: @@ -21,7 +26,7 @@ mv data/yellow_tripdata_2010-02.csv_ data/yellow_tripdata_2010-02.csv mv data/yellow_tripdata_2010-03.csv_ data/yellow_tripdata_2010-03.csv ``` -然后您必须在PostgreSQL中预处理这些数据。这将创建多边形中的点(以匹配在地图中纽约市中范围),然后通过使用JOIN查询将数据关联组合到一个规范的表中。为了完成这部分操作,您需要安装PostgreSQL的同时安装PostGIS插件。 +然后必须在PostgreSQL中对数据进行预处理。这一步会在多边形内选择数据点(将地图上的点与纽约市的行政区相匹配),并使用连接将所有数据合并到一个非规范化的平面表中。为此,您需要安装支持PostGIS的PostgreSQL。 运行`initialize_database.sh`时要小心,并手动重新检查是否正确创建了所有表。 @@ -114,7 +119,7 @@ COPY ) TO '/opt/milovidov/nyc-taxi-data/trips.tsv'; ``` -数据快照的创建速度约为每秒50 MB。 在创建快照时,PostgreSQL以每秒约28 MB的速度从磁盘读取数据。 +数据快照的创建速度约为每秒50MB。 在创建快照时,PostgreSQL以每秒约28MB的速度从磁盘读取数据。 这大约需要5个小时。 最终生成的TSV文件为590612904969 bytes。 在ClickHouse中创建临时表: @@ -186,11 +191,11 @@ real 75m56.214s 数据的读取速度为112-140 Mb/秒。 通过这种方式将数据加载到Log表中需要76分钟。 -这个表中的数据需要使用142 GB的磁盘空间. +这个表中的数据需要使用142GB的磁盘空间。 (也可以直接使用`COPY ... TO PROGRAM`从Postgres中导入数据) -由于数据中与天气相关的所有数据(precipitation……average_wind_speed)都填充了NULL。 所以,我们将从最终数据集中删除它们 +数据中所有与天气相关的字段(precipitation……average_wind_speed)都填充了NULL。 所以,我们将从最终数据集中删除它们。 首先,我们使用单台服务器创建表,后面我们将在多台节点上创建这些表。 @@ -259,7 +264,7 @@ FROM trips ``` 这需要3030秒,速度约为每秒428,000行。 -要加快速度,可以使用`Log`引擎替换’MergeTree\`引擎来创建表。 在这种情况下,下载速度超过200秒。 +要加快速度,可以使用`Log`引擎替换`MergeTree`引擎来创建表。 在这种情况下,下载速度超过200秒。 这个表需要使用126GB的磁盘空间。 @@ -286,8 +291,7 @@ $ clickhouse-client --query "select count(*) from datasets.trips_mergetree" ``` !!! info "信息" - 如果要运行下面的SQL查询,必须使用完整的表名, -`datasets.trips_mergetree`。 + 如果要运行下面的SQL查询,必须使用完整的表名,`datasets.trips_mergetree`。 ## 单台服务器运行结果 {#dan-tai-fu-wu-qi-yun-xing-jie-guo} @@ -328,9 +332,9 @@ ORDER BY year, count(*) DESC 我们使用的是如下配置的服务器: -两个英特尔(R)至强(R)CPU E5-2650v2@2.60GHz,总共有16个物理内核,128GiB RAM,硬件RAID-5上的8X6TB HD +两个`Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz`,总共有16个物理内核,128GiB RAM,8X6TB HD,RAID-5 -执行时间是取三次运行中最好的值,但是从第二次查询开始,查询就讲从文件系统的缓存中读取数据。同时在每次读取和处理后不在进行缓存。 +执行时间是取三次运行中最好的值,但是从第二次查询开始,查询就将从文件系统的缓存中读取数据。同时在每次读取和处理后不再进行缓存。 在三台服务器中创建表结构: @@ -356,12 +360,12 @@ INSERT INTO trips_mergetree_x3 SELECT * FROM trips_mergetree 在三台服务器集群中运行的结果: -Q1:0.212秒. +Q1: 0.212秒. Q2:0.438秒。 Q3:0.733秒。 -Q4:1.241秒. +Q4: 1.241秒.
-不出意料,查询是线性扩展的。 +这并不奇怪,因为查询是线性扩展的。 我们同时在140台服务器的集群中运行的结果: @@ -371,7 +375,7 @@ Q3:0.051秒。 Q4:0.072秒。 在这种情况下,查询处理时间首先由网络延迟确定。 -我们使用位于芬兰的Yandex数据中心中的客户端去位于俄罗斯的集群上运行查询,这增加了大约20毫秒的延迟。 +我们使用位于芬兰Yandex数据中心的客户机在俄罗斯的一个集群上运行查询,这增加了大约20毫秒的延迟。 ## 总结 {#zong-jie} diff --git a/docs/zh/getting-started/example-datasets/ontime.md b/docs/zh/getting-started/example-datasets/ontime.md index 51749d9013b..4c21eee51a2 100644 --- a/docs/zh/getting-started/example-datasets/ontime.md +++ b/docs/zh/getting-started/example-datasets/ontime.md @@ -1,9 +1,14 @@ -# 航班飞行数据 {#hang-ban-fei-xing-shu-ju} +--- +toc_priority: 21 +toc_title: OnTime +--- + +# OnTime {#ontime} 航班飞行数据有以下两个方式获取: - 从原始数据导入 -- 下载预处理好的分区数据 +- 下载预处理好的数据 ## 从原始数据导入 {#cong-yuan-shi-shu-ju-dao-ru} @@ -19,7 +24,7 @@ done done ``` -(引用 https://github.com/Percona-Lab/ontime-airline-performance/blob/master/download.sh ) +(参考 https://github.com/Percona-Lab/ontime-airline-performance/blob/master/download.sh ) 创建表结构: @@ -157,8 +162,7 @@ $ clickhouse-client --query "select count(*) from datasets.ontime" ``` !!! info "信息" - 如果要运行下面的SQL查询,必须使用完整的表名, -`datasets.ontime`。 + 如果要运行下面的SQL查询,必须使用完整的表名,`datasets.ontime`。 ## 查询: {#cha-xun} @@ -356,7 +360,7 @@ ORDER by rate DESC LIMIT 1000; ``` -奖金: +Bonus: ``` sql SELECT avg(cnt) @@ -402,3 +406,5 @@ LIMIT 10; - https://www.percona.com/blog/2014/04/21/using-apache-hadoop-and-impala-together-with-mysql-for-data-analysis/ - https://www.percona.com/blog/2016/01/07/apache-spark-with-air-ontime-performance-data/ - http://nickmakos.blogspot.ru/2012/08/analyzing-air-traffic-performance-with.html + +[原始文章](https://clickhouse.tech/docs/en/getting_started/example_datasets/ontime/) diff --git a/docs/zh/getting-started/example-datasets/star-schema.md b/docs/zh/getting-started/example-datasets/star-schema.md index 71dd58160a6..fcb6e90c694 100644 --- a/docs/zh/getting-started/example-datasets/star-schema.md +++ b/docs/zh/getting-started/example-datasets/star-schema.md @@ -1,4 +1,9 @@ -# 星型基准测试 {#star-schema-benchmark} +--- +toc_priority: 16 +toc_title: Star Schema Benchmark +--- + +# Star Schema Benchmark {#star-schema-benchmark} 编译 dbgen: @@ -10,6 +15,9 @@ $ make 开始生成数据: +!!! 
warning "注意" + 使用`-s 100`dbgen将生成6亿行数据(67GB), 如果使用`-s 1000`它会生成60亿行数据(这需要很多时间)) + ``` bash $ ./dbgen -s 1000 -T c $ ./dbgen -s 1000 -T l @@ -18,7 +26,7 @@ $ ./dbgen -s 1000 -T s $ ./dbgen -s 1000 -T d ``` -在ClickHouse中创建表结构: +在ClickHouse中创建数据表: ``` sql CREATE TABLE customer @@ -92,7 +100,7 @@ $ clickhouse-client --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl $ clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl ``` -将«星型模型»转换为非规范化的«平面模型»: +将`star schema`转换为`flat schema`: ``` sql SET max_memory_usage = 20000000000, allow_experimental_multiple_joins_emulation = 1; diff --git a/docs/zh/getting-started/example-datasets/wikistat.md b/docs/zh/getting-started/example-datasets/wikistat.md index 2986f90ef9f..4320d6b4926 100644 --- a/docs/zh/getting-started/example-datasets/wikistat.md +++ b/docs/zh/getting-started/example-datasets/wikistat.md @@ -1,4 +1,9 @@ -# 维基访问数据 {#wei-ji-fang-wen-shu-ju} +--- +toc_priority: 17 +toc_title: WikiStat +--- + +# WikiStat {#wikistat} 参考: http://dumps.wikimedia.org/other/pagecounts-raw/ diff --git a/docs/zh/getting-started/index.md b/docs/zh/getting-started/index.md index ac70394c785..ac6074eb72f 100644 --- a/docs/zh/getting-started/index.md +++ b/docs/zh/getting-started/index.md @@ -1,3 +1,10 @@ +--- +machine_translated: true +machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd +toc_folder_title: "\u5BFC\u8A00" +toc_priority: 2 +--- + # 入门 {#ru-men} 如果您是ClickHouse的新手,并希望亲身体验它的性能,首先您需要通过 [安装过程](install.md). diff --git a/docs/zh/getting-started/install.md b/docs/zh/getting-started/install.md index 84791197ab6..51d6ed198fa 100644 --- a/docs/zh/getting-started/install.md +++ b/docs/zh/getting-started/install.md @@ -1,34 +1,46 @@ +--- +toc_priority: 11 +toc_title: 安装部署 +--- + # 安装 {#clickhouse-an-zhuang} ## 系统要求 {#xi-tong-yao-qiu} ClickHouse可以在任何具有x86_64,AArch64或PowerPC64LE CPU架构的Linux,FreeBSD或Mac OS X上运行。 -虽然预构建的二进制文件通常是为x86  _64编译并利用SSE 4.2指令集,但除非另有说明,否则使用支持它的CPU将成为额外的系统要求。这是检查当前CPU是否支持SSE 4.2的命令: +官方预构建的二进制文件通常针对x86_64进行编译,并利用`SSE 4.2`指令集,因此,除非另有说明,支持它的CPU使用将成为额外的系统需求。下面是检查当前CPU是否支持SSE 4.2的命令: ``` bash $ grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not supported" ``` -要在不支持SSE 4.2或具有AArch64或PowerPC64LE体系结构的处理器上运行ClickHouse,您应该[通过源构建ClickHouse](#from-sources)进行适当的配置调整。 +要在不支持`SSE 4.2`或`AArch64`,`PowerPC64LE`架构的处理器上运行ClickHouse,您应该通过适当的配置调整从[源代码构建ClickHouse](#from-sources)。 -## 可用的安装选项 {#install-from-deb-packages} +## 可用安装包 {#install-from-deb-packages} -建议为Debian或Ubuntu使用官方的预编译`deb`软件包。 运行以下命令以安装软件包: +### `DEB`安装包 -然后运行: +建议使用Debian或Ubuntu的官方预编译`deb`软件包。运行以下命令来安装包: ``` bash {% include 'install/deb.sh' %} ``` -你也可以从这里手动下载安装包:https://repo.clickhouse.tech/deb/stable/main/。 +如果您想使用最新的版本,请用`testing`替代`stable`(我们只推荐您用于测试环境)。 -如果你想使用最新的测试版本,请使用`testing`替换`stable`。 +你也可以从这里手动下载安装包:[下载](https://repo.clickhouse.tech/deb/stable/main/)。 -### 来自RPM包 {#from-rpm-packages} +安装包列表: -Yandex ClickHouse团队建议使用官方预编译的`rpm`软件包,用于CentOS,RedHat和所有其他基于rpm的Linux发行版。 +- `clickhouse-common-static` — ClickHouse编译的二进制文件。 +- `clickhouse-server` — 创建`clickhouse-server`软连接,并安装默认配置服务 +- `clickhouse-client` — 创建`clickhouse-client`客户端工具软连接,并安装客户端配置文件。 +- `clickhouse-common-static-dbg` — 带有调试信息的ClickHouse二进制文件。 + +### `RPM`安装包 {#from-rpm-packages} + +推荐使用CentOS、RedHat和所有其他基于rpm的Linux发行版的官方预编译`rpm`包。 首先,您需要添加官方存储库: @@ -38,84 +50,120 @@ sudo rpm --import https://repo.clickhouse.tech/CLICKHOUSE-KEY.GPG sudo yum-config-manager --add-repo https://repo.clickhouse.tech/rpm/stable/x86_64 ``` -如果您想使用最新版本,请将`stable`替换为`testing`(建议您在测试环境中使用)。 
+如果您想使用最新的版本,请用`testing`替代`stable`(我们只推荐您用于测试环境)。`prestable`有时也可用。 -然后运行这些命令以实际安装包: +然后运行命令安装: ``` bash sudo yum install clickhouse-server clickhouse-client ``` -您也可以从此处手动下载和安装软件包:https://repo.clickhouse.tech/rpm/stable/x86_64。 +你也可以从这里手动下载安装包:[下载](https://repo.clickhouse.tech/rpm/stable/x86_64)。 -### 来自Docker {#from-docker-image} +### `Tgz`安装包 {#from-tgz-archives} -要在Docker中运行ClickHouse,请遵循[码头工人中心](https://hub.docker.com/r/yandex/clickhouse-server/)上的指南。那些图像使用官方的`deb`包。 +如果您的操作系统不支持安装`deb`或`rpm`包,建议使用官方预编译的`tgz`软件包。 + +所需的版本可以通过`curl`或`wget`从存储库`https://repo.clickhouse.tech/tgz/`下载。 + +下载后,解压缩文件并使用安装脚本进行安装。以下是一个最新版本的安装示例: + +``` bash +export LATEST_VERSION=`curl https://api.github.com/repos/ClickHouse/ClickHouse/tags 2>/dev/null | grep -Eo '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | head -n 1` +curl -O https://repo.clickhouse.tech/tgz/clickhouse-common-static-$LATEST_VERSION.tgz +curl -O https://repo.clickhouse.tech/tgz/clickhouse-common-static-dbg-$LATEST_VERSION.tgz +curl -O https://repo.clickhouse.tech/tgz/clickhouse-server-$LATEST_VERSION.tgz +curl -O https://repo.clickhouse.tech/tgz/clickhouse-client-$LATEST_VERSION.tgz + +tar -xzvf clickhouse-common-static-$LATEST_VERSION.tgz +sudo clickhouse-common-static-$LATEST_VERSION/install/doinst.sh + +tar -xzvf clickhouse-common-static-dbg-$LATEST_VERSION.tgz +sudo clickhouse-common-static-dbg-$LATEST_VERSION/install/doinst.sh + +tar -xzvf clickhouse-server-$LATEST_VERSION.tgz +sudo clickhouse-server-$LATEST_VERSION/install/doinst.sh +sudo /etc/init.d/clickhouse-server start + +tar -xzvf clickhouse-client-$LATEST_VERSION.tgz +sudo clickhouse-client-$LATEST_VERSION/install/doinst.sh +``` + +对于生产环境,建议使用最新的`stable`版本。你可以在GitHub页面https://github.com/ClickHouse/ClickHouse/tags找到它们,带有`-stable`后缀的标签即为稳定版本。 + +### `Docker`安装包 {#from-docker-image} + +要在Docker中运行ClickHouse,请遵循[Docker Hub](https://hub.docker.com/r/yandex/clickhouse-server/)上的指南。这些镜像使用官方的`deb`安装包构建。 + +### 其他环境安装包 {#from-other} + +对于非Linux操作系统和AArch64 CPU架构,ClickHouse会基于`master`分支的最新提交进行编译并提供(相对最新提交会有几小时的延迟)。 + +- [macOS](https://builds.clickhouse.tech/master/macos/clickhouse) — `curl -O 'https://builds.clickhouse.tech/master/macos/clickhouse' && chmod a+x ./clickhouse` +- [FreeBSD](https://builds.clickhouse.tech/master/freebsd/clickhouse) — `curl -O 'https://builds.clickhouse.tech/master/freebsd/clickhouse' && chmod a+x ./clickhouse` +- [AArch64](https://builds.clickhouse.tech/master/aarch64/clickhouse) — `curl -O 'https://builds.clickhouse.tech/master/aarch64/clickhouse' && chmod a+x ./clickhouse` + +下载后,您可以使用`clickhouse client`连接服务,或者使用`clickhouse local`模式处理数据,不过您必须要额外在GitHub下载[server](https://github.com/ClickHouse/ClickHouse/blob/master/programs/server/config.xml)和[users](https://github.com/ClickHouse/ClickHouse/blob/master/programs/server/users.xml)配置文件。 + +不建议在生产环境中使用这些构建版本,因为它们没有经过充分的测试,但是您可以自行承担这样做的风险。此外,它们只提供ClickHouse的部分功能。 ### 使用源码安装 {#from-sources} -具体编译方式可以参考build.md。 +要手动编译ClickHouse,请遵循[Linux](../development/build.md)或[Mac OS X](../development/build-osx.md)的说明。 -你可以编译并安装它们。 -你也可以直接使用而不进行安装。 +您可以编译并安装,也可以不安装而直接使用编译出的程序。通过手动构建,您可以取消对`SSE 4.2`指令集的要求,或为`AArch64` CPU进行构建。 -``` text -Client: programs/clickhouse-client -Server: programs/clickhouse-server -``` +    Client: programs/clickhouse-client +    Server: programs/clickhouse-server -在服务器中为数据创建如下目录: +您需要创建一个数据和元数据文件夹,并为所需的用户`chown`授权。它们的路径可以在服务器配置(`src/programs/server/config.xml`)中改变,默认情况下它们是: -``` text -/opt/clickhouse/data/default/ -/opt/clickhouse/metadata/default/ -``` +    /opt/clickhouse/data/default/ +    /opt/clickhouse/metadata/default/ -(它们可以在server config中配置。)
-为需要的用户运行’chown’ - -日志的路径可以在server config (src/programs/server/config.xml)中配置。 +在Gentoo上,你可以使用`emerge clickhouse`从源代码安装ClickHouse。 ## 启动 {#qi-dong} -可以运行如下命令在后台启动服务: +如果没有`service`,可以运行如下命令在后台启动服务: ``` bash -sudo service clickhouse-server start +$ sudo /etc/init.d/clickhouse-server start ``` -可以在`/var/log/clickhouse-server/`目录中查看日志。 +日志文件将输出在`/var/log/clickhouse-server/`文件夹。 -如果服务没有启动,请检查配置文件 `/etc/clickhouse-server/config.xml`。 +如果服务器没有启动,检查`/etc/clickhouse-server/config.xml`中的配置。 -你也可以在控制台中直接启动服务: +您也可以手动从控制台启动服务器: -``` bash -clickhouse-server --config-file=/etc/clickhouse-server/config.xml +```bash +$ clickhouse-server --config-file=/etc/clickhouse-server/config.xml ``` -在这种情况下,日志将被打印到控制台中,这在开发过程中很方便。 -如果配置文件在当前目录中,你可以不指定’–config-file’参数。它默认使用’./config.xml’。 +在这种情况下,日志将被打印到控制台,这在开发过程中很方便。 -你可以使用命令行客户端连接到服务: +如果配置文件在当前目录中,则不需要指定`——config-file`参数。默认情况下,它的路径为`./config.xml`。 + +ClickHouse支持访问限制设置。它们位于`users.xml`文件(与`config.xml`同级目录)。 +默认情况下,允许`default`用户从任何地方访问,不需要密码。可查看`user/default/networks`。 +更多信息,请参见[Configuration Files](../operations/configuration-files.md)。 + +启动服务后,您可以使用命令行客户端连接到它: ``` bash -clickhouse-client +$ clickhouse-client ``` -默认情况下它使用’default’用户无密码的与localhost:9000服务建立连接。 -客户端也可以用于连接远程服务,例如: +默认情况下,使用`default`用户并不携带密码连接到`localhost:9000`。还可以使用`--host`参数连接到指定服务器。 + +终端必须使用UTF-8编码。 +更多信息,请参阅[Command-line client](../interfaces/cli.md)。 + +示例: -``` bash -clickhouse-client --host=example.com ``` - -有关更多信息,请参考«Command-line client»部分。 - -检查系统是否工作: - -``` bash -milovidov@hostname:~/work/metrica/src/src/Client$ ./clickhouse-client +$ ./clickhouse-client ClickHouse client version 0.0.18749. Connecting to localhost:9000. Connected to ClickHouse server version 0.0.18749. @@ -135,6 +183,6 @@ SELECT 1 **恭喜,系统已经工作了!** -为了继续进行实验,你可以尝试下载测试数据集。 +为了继续进行实验,你可以尝试下载测试数据集或查看[教程](https://clickhouse.tech/tutorial.html)。 [原始文章](https://clickhouse.tech/docs/en/getting_started/install/) diff --git a/docs/zh/getting-started/tutorial.md b/docs/zh/getting-started/tutorial.md index 07f595b4354..93f368bc2dc 100644 --- a/docs/zh/getting-started/tutorial.md +++ b/docs/zh/getting-started/tutorial.md @@ -1,19 +1,19 @@ --- toc_priority: 12 -toc_title: "\u6559\u7A0B" +toc_title: 使用教程 --- -# 点击教程 {#clickhouse-tutorial} +# ClickHouse教程 {#clickhouse-tutorial} -## 从本教程中可以期待什么? {#what-to-expect-from-this-tutorial} +## 从本教程中可以获得什么? {#what-to-expect-from-this-tutorial} -通过本教程,您将学习如何设置一个简单的ClickHouse集群。 它会很小,但却是容错和可扩展的。 然后,我们将使用其中一个示例数据集来填充数据并执行一些演示查询。 +通过学习本教程,您将了解如何设置一个简单的ClickHouse集群。它会很小,但是可以容错和扩展。然后,我们将使用其中一个示例数据集来填充数据并执行一些演示查询。 ## 单节点设置 {#single-node-setup} -为了推迟分布式环境的复杂性,我们将首先在单个服务器或虚拟机上部署ClickHouse。 ClickHouse通常是从[deb](install.md#install-from-deb-packages) 或 [rpm](install.md#from-rpm-packages) 包安装,但对于不支持它们的操作系统也有 [替代方法](install.md#from-docker-image) 。 +为了延迟演示分布式环境的复杂性,我们将首先在单个服务器或虚拟机上部署ClickHouse。ClickHouse通常是从[deb](install.md#install-from-deb-packages)或[rpm](install.md#from-rpm-packages)包安装,但对于不支持它们的操作系统也有[其他方法](install.md#from-docker-image)。 -例如,您选择了从 `deb` 包安装,执行: +例如,您选择`deb`安装包,执行: ``` bash {% include 'install/deb.sh' %} @@ -21,13 +21,13 @@ toc_title: "\u6559\u7A0B" 在我们安装的软件中包含这些包: -- `clickhouse-client` 包,包含 [clickhouse-client](../interfaces/cli.md) 应用程序,它是交互式ClickHouse控制台客户端。 +- `clickhouse-client` 包,包含[clickhouse-client](../interfaces/cli.md)客户端,它是交互式ClickHouse控制台客户端。 - `clickhouse-common` 包,包含一个ClickHouse可执行文件。 - `clickhouse-server` 包,包含要作为服务端运行的ClickHouse配置文件。 -服务端配置文件位于 `/etc/clickhouse-server/`。 在进一步讨论之前,请注意 `config.xml`文件中的`` 元素. 
Path决定了数据存储的位置,因此该位置应该位于磁盘容量较大的卷上;默认值为 `/var/lib/clickhouse/`。 如果你想调整配置,考虑到它可能会在未来的软件包更新中被重写,直接编辑`config.xml` 文件并不方便。 推荐的方法是在[配置文件](../operations/configuration-files.md)目录创建文件,作为config.xml文件的“补丁”,用以复写配置元素。 +服务器配置文件位于`/etc/clickhouse-server/`。在继续之前,请注意`config.xml`中的`<path>`元素。它决定了数据存储的位置,因此应该位于磁盘容量较大的卷上;默认值是`/var/lib/clickhouse/`。直接编辑`config.xml`来调整配置并不方便,因为它可能会在将来的包更新中被重写。建议的做法是在[config.d文件夹](../operations/configuration-files.md)中创建覆盖文件,以此重写`config.xml`中的配置元素。 -你可能已经注意到了, `clickhouse-server` 安装后不会自动启动。 它也不会在更新后自动重新启动。 您启动服务端的方式取决于您的初始系统,通常情况下是这样: +你可能已经注意到了,`clickhouse-server`安装后不会自动启动。 它也不会在更新后自动重新启动。 您启动服务端的方式取决于您的初始系统,通常情况下是这样: ``` bash sudo service clickhouse-server start @@ -39,9 +39,9 @@ sudo service clickhouse-server start sudo /etc/init.d/clickhouse-server start ``` -服务端日志的默认位置是 `/var/log/clickhouse-server/`。当服务端在日志中记录 `Ready for connections` 消息,即表示服务端已准备好处理客户端连接。 +服务端日志的默认位置是`/var/log/clickhouse-server/`。当服务端在日志中记录`Ready for connections`消息,即表示服务端已准备好处理客户端连接。 -一旦 `clickhouse-server` 启动并运行,我们可以利用 `clickhouse-client` 连接到服务端,并运行一些测试查询,如 `SELECT "Hello, world!";`. +一旦`clickhouse-server`启动并运行,我们可以利用`clickhouse-client`连接到服务端,并运行一些测试查询,如`SELECT 'Hello, world!';`。
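+
+下面是一次连通性测试的示意(假设服务端运行在本机;注意ClickHouse SQL中字符串字面量使用单引号):
+
+``` bash
+# 示意:通过命令行客户端执行一条测试查询
+clickhouse-client --query "SELECT 'Hello, world!'"
+```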
@@ -80,7 +80,7 @@ clickhouse-client --query='INSERT INTO table FORMAT TabSeparated' < data.tsv ## 导入示例数据集 {#import-sample-dataset} -现在是时候用一些示例数据填充我们的ClickHouse服务端。 在本教程中,我们将使用Yandex.Metrica的匿名数据,它是在ClickHouse成为开源之前作为生产环境运行的第一个服务(关于这一点的更多内容请参阅[ClickHouse历史](../introduction/history.md))。有 [多种导入Yandex.Metrica数据集的的方法](example-datasets/metrica.md),为了本教程,我们将使用最现实的一个。 +现在是时候用一些示例数据填充我们的ClickHouse服务端。 在本教程中,我们将使用Yandex.Metrica的匿名数据,它是在ClickHouse成为开源之前作为生产环境运行的第一个服务(关于这一点的更多内容请参阅[ClickHouse历史](../introduction/history.md))。[多种导入Yandex.Metrica数据集方法](example-datasets/metrica.md),为了本教程,我们将使用最现实的一个。 ### 下载并提取表数据 {#download-and-extract-table-data} @@ -93,17 +93,17 @@ curl https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz | unx ### 创建表 {#create-tables} -与大多数数据库管理系统一样,ClickHouse在逻辑上将表分组为数据库。包含一个 `default` 数据库,但我们将创建一个新的数据库 `tutorial`: +与大多数数据库管理系统一样,ClickHouse在逻辑上将表分组为数据库。包含一个`default`数据库,但我们将创建一个新的数据库`tutorial`: ``` bash clickhouse-client --query "CREATE DATABASE IF NOT EXISTS tutorial" ``` -与创建数据库相比,创建表的语法要复杂得多(请参阅 [参考资料](../sql-reference/statements/create.md). 一般 `CREATE TABLE` 声明必须指定三个关键的事情: +与创建数据库相比,创建表的语法要复杂得多(请参阅[参考资料](../sql-reference/statements/create.md). 一般`CREATE TABLE`声明必须指定三个关键的事情: 1. 要创建的表的名称。 2. 表结构,例如:列名和对应的[数据类型](../sql-reference/data-types/index.md)。 -3. [表引擎](../engines/table-engines/index.md) 及其设置,这决定了对此表的查询操作是如何在物理层面执行的所有细节。 +3. [表引擎](../engines/table-engines/index.md)及其设置,这决定了对此表的查询操作是如何在物理层面执行的所有细节。 Yandex.Metrica是一个网络分析服务,样本数据集不包括其全部功能,因此只有两个表可以创建: @@ -455,11 +455,11 @@ SETTINGS index_granularity = 8192 您可以使用`clickhouse-client`的交互模式执行这些查询(只需在终端中启动它,而不需要提前指定查询)。或者如果你愿意,可以尝试一些[替代接口](../interfaces/index.md)。 -正如我们所看到的, `hits_v1` 使用 [基本的MergeTree引擎](../engines/table-engines/mergetree-family/mergetree.md),而 `visits_v1` 使用 [折叠树](../engines/table-engines/mergetree-family/collapsingmergetree.md) 变体。 +正如我们所看到的, `hits_v1`使用 [MergeTree引擎](../engines/table-engines/mergetree-family/mergetree.md),而`visits_v1`使用 [Collapsing](../engines/table-engines/mergetree-family/collapsingmergetree.md)引擎。 ### 导入数据 {#import-data} -数据导入到ClickHouse是通过以下方式完成的 [INSERT INTO](../sql-reference/statements/insert-into.md) 查询像许多其他SQL数据库。 然而,数据通常是在一个提供 [支持的序列化格式](../interfaces/formats.md) 而不是 `VALUES` 子句(也支持)。 +数据导入到ClickHouse是通过[INSERT INTO](../sql-reference/statements/insert-into.md)方式完成的,查询类似许多SQL数据库。然而,数据通常是在一个提供[支持序列化格式](../interfaces/formats.md)而不是`VALUES`子句(也支持)。 我们之前下载的文件是以制表符分隔的格式,所以这里是如何通过控制台客户端导入它们: @@ -468,7 +468,7 @@ clickhouse-client --query "INSERT INTO tutorial.hits_v1 FORMAT TSV" --max_insert clickhouse-client --query "INSERT INTO tutorial.visits_v1 FORMAT TSV" --max_insert_block_size=100000 < visits_v1.tsv ``` -ClickHouse有很多 [要调整的设置](../operations/settings/index.md) 在控制台客户端中指定它们的一种方法是通过参数,就像我们看到上面语句中的 `--max_insert_block_size`。找出可用的设置、含义及其默认值的最简单方法是查询 `system.settings` 表: +ClickHouse有很多[要调整的设置](../operations/settings/index.md)在控制台客户端中指定它们的一种方法是通过参数,就像我们看到上面语句中的`--max_insert_block_size`。找出可用的设置、含义及其默认值的最简单方法是查询`system.settings` 表: ``` sql SELECT name, value, changed, description @@ -479,14 +479,14 @@ FORMAT TSV max_insert_block_size 1048576 0 "The maximum block size for insertion, if we control the creation of blocks for insertion." 
``` -您也可以 [OPTIMIZE](../sql-reference/statements/misc.md#misc_operations-optimize) 导入后的表。 使用MergeTree-family引擎配置的表总是在后台合并数据部分以优化数据存储(或至少检查是否有意义)。 这些查询强制表引擎立即进行存储优化,而不是稍后一段时间执行: +您也可以[OPTIMIZE](../sql-reference/statements/misc.md#misc_operations-optimize)导入后的表。使用MergeTree-family引擎配置的表总是在后台合并数据部分以优化数据存储(或至少检查是否有意义)。 这些查询强制表引擎立即进行存储优化,而不是稍后一段时间执行: ``` bash clickhouse-client --query "OPTIMIZE TABLE tutorial.hits_v1 FINAL" clickhouse-client --query "OPTIMIZE TABLE tutorial.visits_v1 FINAL" ``` -这些查询开始一个I/O和CPU密集型操作,所以如果表一直接收到新数据,最好不要管它,让合并在后台运行。 +这些查询开始I/O和CPU密集型操作,所以如果表一直接收到新数据,最好不要管它,让合并在后台运行。 现在我们可以检查表导入是否成功: @@ -524,9 +524,9 @@ ClickHouse集群是一个同质集群。 设置步骤: 1. 在群集的所有机器上安装ClickHouse服务端 2. 在配置文件中设置群集配置 3. 在每个实例上创建本地表 -4. 创建一个 [分布式表](../engines/table-engines/special/distributed.md) +4. 创建一个[分布式表](../engines/table-engines/special/distributed.md) -[分布式表](../engines/table-engines/special/distributed.md) 实际上是一种 “视图”,映射到ClickHouse集群的本地表。 从分布式表中执行 **SELECT** 查询会使用集群所有分片的资源。 您可以为多个集群指定configs,并创建多个分布式表,为不同的集群提供视图。 +[分布式表](../engines/table-engines/special/distributed.md)实际上是一种`view`,映射到ClickHouse集群的本地表。 从分布式表中执行**SELECT**查询会使用集群所有分片的资源。 您可以为多个集群指定configs,并创建多个分布式表,为不同的集群提供视图。 具有三个分片,每个分片一个副本的集群的示例配置: @@ -555,7 +555,7 @@ ClickHouse集群是一个同质集群。 设置步骤: ``` -为了进一步演示,让我们使用和创建 `hits_v1` 表相同的 `CREATE TABLE` 语句创建一个新的本地表,但表名不同: +为了进一步演示,让我们使用和创建`hits_v1`表相同的`CREATE TABLE`语句创建一个新的本地表,但表名不同: ``` sql CREATE TABLE tutorial.hits_local (...) ENGINE = MergeTree() ... @@ -568,9 +568,9 @@ CREATE TABLE tutorial.hits_all AS tutorial.hits_local ENGINE = Distributed(perftest_3shards_1replicas, tutorial, hits_local, rand()); ``` -常见的做法是在集群的所有计算机上创建类似的分布式表。 它允许在群集的任何计算机上运行分布式查询。 还有一个替代选项可以使用以下方法为给定的SELECT查询创建临时分布式表 [远程](../sql-reference/table-functions/remote.md) 表功能。 +常见的做法是在集群的所有计算机上创建类似的分布式表。 它允许在群集的任何计算机上运行分布式查询。 还有一个替代选项可以使用以下方法为给定的SELECT查询创建临时分布式表[远程](../sql-reference/table-functions/remote.md)表功能。 -让我们运行 [INSERT SELECT](../sql-reference/statements/insert-into.md) 将该表传播到多个服务器。 +让我们运行[INSERT SELECT](../sql-reference/statements/insert-into.md)将该表传播到多个服务器。 ``` sql INSERT INTO tutorial.hits_all SELECT * FROM tutorial.hits_v1; @@ -609,10 +609,10 @@ INSERT INTO tutorial.hits_all SELECT * FROM tutorial.hits_v1; ``` -启用本机复制 [Zookeeper](http://zookeeper.apache.org/) 是必需的。 ClickHouse负责所有副本的数据一致性,并在失败后自动运行恢复过程。 建议将ZooKeeper集群部署在单独的服务器上(其中没有其他进程,包括运行的ClickHouse)。 +启用本机复制[Zookeeper](http://zookeeper.apache.org/)是必需的。 ClickHouse负责所有副本的数据一致性,并在失败后自动运行恢复过程。建议将ZooKeeper集群部署在单独的服务器上(其中没有其他进程,包括运行的ClickHouse)。 -!!! note "注" - ZooKeeper不是一个严格的要求:在某些简单的情况下,您可以通过将数据写入应用程序代码中的所有副本来复制数据。 这种方法是 **不** 建议的,在这种情况下,ClickHouse将无法保证所有副本上的数据一致性。 因此需要由您的应用来保证这一点。 +!!! note "注意" + ZooKeeper不是一个严格的要求:在某些简单的情况下,您可以通过将数据写入应用程序代码中的所有副本来复制数据。 这种方法是**不**建议的,在这种情况下,ClickHouse将无法保证所有副本上的数据一致性。 因此需要由您的应用来保证这一点。 ZooKeeper位置在配置文件中指定: @@ -653,12 +653,12 @@ ENGINE = ReplcatedMergeTree( ... 
``` -在这里,我们使用 [ReplicatedMergeTree](../engines/table-engines/mergetree-family/replication.md) 表引擎。 在参数中,我们指定包含分片和副本标识符的ZooKeeper路径。 +在这里,我们使用[ReplicatedMergeTree](../engines/table-engines/mergetree-family/replication.md)表引擎。 在参数中,我们指定包含分片和副本标识符的ZooKeeper路径。 ``` sql INSERT INTO tutorial.hits_replica SELECT * FROM tutorial.hits_local; ``` -复制在多主机模式下运行。 数据可以加载到任何副本中,然后系统会自动将其与其他实例同步。 复制是异步的,因此在给定时刻,并非所有副本都可能包含最近插入的数据。 至少应有一个副本允许数据摄取。 其他人将同步数据和修复一致性,一旦他们将再次变得活跃。 请注意,这种方法允许最近插入的数据丢失的可能性很低。 +复制在多主机模式下运行。数据可以加载到任何副本中,然后系统自动将其与其他实例同步。复制是异步的,因此在给定时刻,并非所有副本都可能包含最近插入的数据。至少应该有一个副本允许数据摄入。另一些则会在重新激活后同步数据并修复一致性。请注意,这种方法允许最近插入的数据丢失的可能性很低。 [原始文章](https://clickhouse.tech/docs/en/getting_started/tutorial/) diff --git a/docs/zh/index.md b/docs/zh/index.md index 5294dc6c8c7..2bef22f3de4 100644 --- a/docs/zh/index.md +++ b/docs/zh/index.md @@ -4,53 +4,50 @@ ClickHouse是一个用于联机分析(OLAP)的列式数据库管理系统(DBMS) 在传统的行式数据库系统中,数据按如下顺序存储: -| row | watchID | JavaEnable | title | GoodEvent | EventTime | -|-----|-------------|------------|------------|-----------|---------------------| -| #0 | 89354350662 | 1 | 投资者关系 | 1 | 2016-05-18 05:19:20 | -| #1 | 90329509958 | 0 | 联系我们 | 1 | 2016-05-18 08:10:20 | -| #2 | 89953706054 | 1 | 任务 | 1 | 2016-05-18 07:38:00 | -| #N | … | … | … | … | … | +| Row | WatchID | JavaEnable | Title | GoodEvent | EventTime | +|-----|-------------|------------|--------------------|-----------|---------------------| +| #0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | +| #1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | +| #2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | +| #N | … | … | … | … | … | 处于同一行中的数据总是被物理的存储在一起。 -常见的行式数据库系统有: MySQL、Postgres和MS SQL Server。 -{: .灰色 } +常见的行式数据库系统有:`MySQL`、`Postgres`和`MS SQL Server`。 在列式数据库系统中,数据按如下的顺序存储: -| row: | #0 | #1 | #2 | #N | +| Row: | #0 | #1 | #2 | #N | |-------------|---------------------|---------------------|---------------------|-----| -| watchID: | 89354350662 | 90329509958 | 89953706054 | … | +| WatchID: | 89354350662 | 90329509958 | 89953706054 | … | | JavaEnable: | 1 | 0 | 1 | … | -| title: | 投资者关系 | 联系我们 | 任务 | … | +| Title: | Investor Relations | Contact us | Mission | … | | GoodEvent: | 1 | 1 | 1 | … | -| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | … | +| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | … | -该示例中只展示了数据在列式数据库中数据的排列方式。 -对于存储而言,列式数据库总是将同一列的数据存储在一起,不同列的数据也总是分开存储。 +这些示例只显示了数据的排列顺序。来自不同列的值被单独存储,来自同一列的数据被存储在一起。 常见的列式数据库有: Vertica、 Paraccel (Actian Matrix,Amazon Redshift)、 Sybase IQ、 Exasol、 Infobright、 InfiniDB、 MonetDB (VectorWise, Actian Vector)、 LucidDB、 SAP HANA、 Google Dremel、 Google PowerDrill、 Druid、 kdb+。 -{: .灰色 } -不同的数据存储方式适用不同的业务场景,数据访问的场景包括:进行了何种查询、多久查询一次以及各类查询的比例; 每种查询读取多少数据————行、列和字节;读取数据和写入数据之间的关系;使用的数据集大小以及如何使用本地的数据集;是否使用事务,以及它们是如何进行隔离的;数据的复制机制与数据的完整性要求;每种类型的查询要求的延迟与吞吐量等等。 +不同的数据存储方式适用不同的业务场景,数据访问的场景包括:进行了何种查询、多久查询一次以及各类查询的比例;每种类型的查询(行、列和字节)读取多少数据;读取数据和更新之间的关系;使用的数据集大小以及如何使用本地的数据集;是否使用事务,以及它们是如何进行隔离的;数据的复制机制与数据的完整性要求;每种类型的查询要求的延迟与吞吐量等等。 -系统负载越高,依据使用场景进行定制化就越重要,并且定制将会变的越精细。没有一个系统能够同时适用所有明显不同的业务场景。如果系统适用于广泛的场景,在负载高的情况下,要兼顾所有的场景,那么将不得不做出选择。是要平衡还是要效率? +系统负载越高,依据使用场景进行定制化就越重要,并且定制将会变的越精细。没有一个系统能够同时适用所有不同的业务场景。如果系统适用于广泛的场景,在负载高的情况下,要兼顾所有的场景,那么将不得不做出选择。是要平衡还是要效率? 
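+下面用一个典型的分析查询来示意列式存储的适用场景(`hits_table`、`Title`、`EventTime`均为假设的示例标识符):这类查询只读取参与计算的少数几列,而无需扫描整行数据。
+
+``` sql
+-- hits_table、Title、EventTime 均为假设的示例标识符
+SELECT Title, count() AS views
+FROM hits_table
+WHERE EventTime >= '2016-05-18 00:00:00'
+GROUP BY Title
+ORDER BY views DESC
+LIMIT 10;
+```
+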
## OLAP场景的关键特征 {#olapchang-jing-de-guan-jian-te-zheng} -- 大多数是读请求 -- 数据总是以相当大的批(\> 1000 rows)进行写入 -- 不修改已添加的数据 -- 每次查询都从数据库中读取大量的行,但是同时又仅需要少量的列 +- 绝大多数是读请求 +- 数据以相当大的批次(\> 1000行)更新,而不是单行更新;或者根本没有更新。 +- 已添加到数据库的数据不能修改。 +- 对于读取,从数据库中提取相当多的行,但只提取列的一小部分。 - 宽表,即每个表包含着大量的列 -- 较少的查询(通常每台服务器每秒数百个查询或更少) +- 查询相对较少(通常每台服务器每秒查询数百次或更少) - 对于简单查询,允许延迟大约50毫秒 -- 列中的数据相对较小: 数字和短字符串(例如,每个URL 60个字节) -- 处理单个查询时需要高吞吐量(每个服务器每秒高达数十亿行) +- 列中的数据相对较小:数字和短字符串(例如,每个URL 60个字节) +- 处理单个查询时需要高吞吐量(每台服务器每秒可达数十亿行) - 事务不是必须的 - 对数据一致性要求低 -- 每一个查询除了一个大表外都很小 -- 查询结果明显小于源数据,换句话说,数据被过滤或聚合后能够被盛放在单台服务器的内存中 +- 每个查询有一个大表。除了他意以外,其他的都很小。 +- 查询结果明显小于源数据。换句话说,数据经过过滤或聚合,因此结果适合于单个服务器的RAM中 很容易可以看出,OLAP场景与其他通常业务场景(例如,OLTP或K/V)有很大的不同, 因此想要使用OLTP或Key-Value数据库去高效的处理分析查询场景,并不是非常完美的适用方案。例如,使用OLAP数据库去处理分析请求通常要优于使用MongoDB或Redis去处理分析请求。 diff --git a/docs/zh/introduction/adopters.md b/docs/zh/introduction/adopters.md index 38b9ca690e3..fc7dfa4efeb 100644 --- a/docs/zh/introduction/adopters.md +++ b/docs/zh/introduction/adopters.md @@ -1,6 +1,6 @@ --- -toc_priority: 8 -toc_title: "\u91C7\u7528\u8005" +toc_priority: 5 +toc_title: "ClickHouse用户" --- # ClickHouse用户 {#clickhouse-adopters} diff --git a/docs/zh/introduction/distinctive-features.md b/docs/zh/introduction/distinctive-features.md index 7396008f3b9..e9a506f2481 100644 --- a/docs/zh/introduction/distinctive-features.md +++ b/docs/zh/introduction/distinctive-features.md @@ -1,3 +1,8 @@ +--- +toc_priority: 2 +toc_title: ClickHouse的特性 +--- + # ClickHouse的特性 {#clickhouse-de-te-xing} ## 真正的列式数据库管理系统 {#zhen-zheng-de-lie-shi-shu-ju-ku-guan-li-xi-tong} @@ -12,9 +17,13 @@ 在一些列式数据库管理系统中(例如:InfiniDB CE 和 MonetDB) 并没有使用数据压缩。但是, 若想达到比较优异的性能,数据压缩确实起到了至关重要的作用。 +除了在磁盘空间和CPU消耗之间进行不同权衡的高效通用压缩编解码器之外,ClickHouse还提供针对特定类型数据的[专用编解码器](../sql-reference/statements/create/table.md#create-query-specialized-codecs),这使得ClickHouse能够与更小的数据库(如时间序列数据库)竞争并超越它们。 + ## 数据的磁盘存储 {#shu-ju-de-ci-pan-cun-chu} -许多的列式数据库(如 SAP HANA, Google PowerDrill)只能在内存中工作,这种方式会造成比实际更多的设备预算。ClickHouse被设计用于工作在传统磁盘上的系统,它提供每GB更低的存储成本,但如果有可以使用SSD和内存,它也会合理的利用这些资源。 +许多的列式数据库(如 SAP HANA, Google PowerDrill)只能在内存中工作,这种方式会造成比实际更多的设备预算。 + +ClickHouse被设计用于工作在传统磁盘上的系统,它提供每GB更低的存储成本,但如果可以使用SSD和内存,它也会合理的利用这些资源。 ## 多核心并行处理 {#duo-he-xin-bing-xing-chu-li} @@ -27,9 +36,11 @@ ClickHouse会使用服务器上一切可用的资源,从而以最自然的方 ## 支持SQL {#zhi-chi-sql} -ClickHouse支持基于SQL的声明式查询语言,该语言大部分情况下是与SQL标准兼容的。 -支持的查询包括 GROUP BY,ORDER BY,IN,JOIN以及非相关子查询。 -不支持窗口函数和相关子查询。 +ClickHouse支持一种[基于SQL的声明式查询语言](../sql-reference/index.md),它在许多情况下与[ANSI SQL标准](../sql-reference/ansi.md)相同。 + +支持的查询[GROUP BY](../sql-reference/statements/select/group-by.md), [ORDER BY](../sql-reference/statements/select/order-by.md), [FROM](../sql-reference/statements/select/from.md), [JOIN](../sql-reference/statements/select/join.md), [IN](../sql-reference/operators/in.md)以及非相关子查询。 + +相关(依赖性)子查询和窗口函数暂不受支持,但将来会被实现。 ## 向量引擎 {#xiang-liang-yin-qing} @@ -55,12 +66,20 @@ ClickHouse提供各种各样在允许牺牲数据精度的情况下对查询进 2. 基于数据的部分样本进行近似查询。这时,仅会从磁盘检索少部分比例的数据。 3. 
不使用全部的聚合条件,通过随机选择有限个数据聚合条件进行聚合。这在数据聚合条件满足某些分布条件下,在提供相当准确的聚合结果的同时降低了计算资源的使用。 +## Adaptive Join Algorithm {#adaptive-join-algorithm} + +ClickHouse支持自定义[JOIN](../sql-reference/statements/select/join.md)多个表,它更倾向于散列连接算法,如果有多个大表,则使用合并-连接算法 + ## 支持数据复制和数据完整性 {#zhi-chi-shu-ju-fu-zhi-he-shu-ju-wan-zheng-xing} ClickHouse使用异步的多主复制技术。当数据被写入任何一个可用副本后,系统会在后台将数据分发给其他副本,以保证系统在不同副本上保持相同的数据。在大多数情况下ClickHouse能在故障后自动恢复,在一些少数的复杂情况下需要手动恢复。 更多信息,参见 [数据复制](../engines/table-engines/mergetree-family/replication.md)。 +## 角色的访问控制 {#role-based-access-control} + +ClickHouse使用SQL查询实现用户帐户管理,并允许[角色的访问控制](../operations/access-rights.md),类似于ANSI SQL标准和流行的关系数据库管理系统。 + # 限制 {#clickhouseke-xian-zhi} 1. 没有完整的事务支持。 diff --git a/docs/zh/introduction/history.md b/docs/zh/introduction/history.md index 29c8c263f9f..265ade8785b 100644 --- a/docs/zh/introduction/history.md +++ b/docs/zh/introduction/history.md @@ -1,3 +1,8 @@ +--- +toc_priority: 4 +toc_title: ClickHouse历史 +--- + # ClickHouse历史 {#clickhouseli-shi} ClickHouse最初是为 [YandexMetrica](https://metrica.yandex.com/) [世界第二大Web分析平台](http://w3techs.com/technologies/overview/traffic_analysis/all) 而开发的。多年来一直作为该系统的核心组件被该系统持续使用着。目前为止,该系统在ClickHouse中有超过13万亿条记录,并且每天超过200多亿个事件被处理。它允许直接从原始数据中动态查询并生成报告。本文简要介绍了ClickHouse在其早期发展阶段的目标。 diff --git a/docs/zh/introduction/performance.md b/docs/zh/introduction/performance.md index a5960cfa52e..0ae4b9b1e1e 100644 --- a/docs/zh/introduction/performance.md +++ b/docs/zh/introduction/performance.md @@ -1,3 +1,8 @@ +--- +toc_priority: 3 +toc_title: ClickHouse性能 +--- + # 性能 {#performance} 根据Yandex的内部测试结果,ClickHouse表现出了比同类可比较产品更优的性能。你可以在 [这里](https://clickhouse.tech/benchmark/dbms/) 查看具体的测试结果。 diff --git a/docs/zh/operations/utilities/clickhouse-local.md b/docs/zh/operations/utilities/clickhouse-local.md index 4e89961e198..3ff38c01651 100644 --- a/docs/zh/operations/utilities/clickhouse-local.md +++ b/docs/zh/operations/utilities/clickhouse-local.md @@ -3,18 +3,18 @@ toc_priority: 60 toc_title: clickhouse-local --- -# ツ环板-ョツ嘉ッツ偲 {#clickhouse-local} +# ClickHouse Local {#clickhouse-local} -该 `clickhouse-local` 程序使您能够对本地文件执行快速处理,而无需部署和配置ClickHouse服务器。 +`clickhouse-local`模式可以使您能够对本地文件执行快速处理,而无需部署和配置ClickHouse服务器。 -接受表示表的数据并使用以下方式查询它们 [ツ环板ECTョツ嘉ッツ偲](../../operations/utilities/clickhouse-local.md). +[ClickHouse SQL语法](../../operations/utilities/clickhouse-local.md)支持对表格数据的查询. -`clickhouse-local` 使用与ClickHouse server相同的核心,因此它支持大多数功能以及相同的格式和表引擎。 +`clickhouse-local`使用与ClickHouse Server相同的核心,因此它支持大多数功能以及相同的格式和表引擎。 -默认情况下 `clickhouse-local` 不能访问同一主机上的数据,但它支持使用以下方式加载服务器配置 `--config-file` 争论。 +默认情况下`clickhouse-local`不能访问同一主机上的数据,但它支持使用`--config-file`方式加载服务器配置。 !!! warning "警告" - 不建议将生产服务器配置加载到 `clickhouse-local` 因为数据可以在人为错误的情况下被损坏。 + 不建议将生产服务器配置加载到`clickhouse-local`因为数据可以在人为错误的情况下被损坏。 ## 用途 {#usage} @@ -26,21 +26,21 @@ clickhouse-local --structure "table_structure" --input-format "format_of_incomin 参数: -- `-S`, `--structure` — table structure for input data. -- `-if`, `--input-format` — input format, `TSV` 默认情况下。 -- `-f`, `--file` — path to data, `stdin` 默认情况下。 -- `-q` `--query` — queries to execute with `;` 如delimeter。 -- `-N`, `--table` — table name where to put output data, `table` 默认情况下。 -- `-of`, `--format`, `--output-format` — output format, `TSV` 默认情况下。 -- `--stacktrace` — whether to dump debug output in case of exception. -- `--verbose` — more details on query execution. -- `-s` — disables `stderr` 记录。 -- `--config-file` — path to configuration file in same format as for ClickHouse server, by default the configuration empty. 
-- `--help` — arguments references for `clickhouse-local`. +- `-S`, `--structure` — 输入数据的表结构。 +- `-if`, `--input-format` — 输入格式化类型, 默认是`TSV`。 +- `-f`, `--file` — 数据路径, 默认是`stdin`。 +- `-q` `--query` — 要查询的SQL语句使用`;`做分隔符。 +- `-N`, `--table` — 数据输出的表名,默认是`table`。 +- `-of`, `--format`, `--output-format` — 输出格式化类型, 默认是`TSV`。 +- `--stacktrace` — 是否在出现异常时输出栈信息。 +- `--verbose` — debug显示查询的详细信息。 +- `-s` — 禁用`stderr`输出信息。 +- `--config-file` — 与ClickHouse服务器格式相同配置文件的路径,默认情况下配置为空。 +- `--help` — `clickhouse-local`使用帮助信息。 -还有每个ClickHouse配置变量的参数,这些变量更常用,而不是 `--config-file`. +对于每个ClickHouse配置的参数,也可以单独使用,可以不使用`--config-file`指定。 -## 例 {#examples} +## 示例 {#examples} ``` bash echo -e "1,2\n3,4" | clickhouse-local -S "a Int64, b Int64" -if "CSV" -q "SELECT * FROM table" @@ -49,7 +49,7 @@ Read 2 rows, 32.00 B in 0.000 sec., 5182 rows/sec., 80.97 KiB/sec. 3 4 ``` -前面的例子是一样的: +另一个示例,类似上一个使用示例: ``` bash $ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64) ENGINE = File(CSV, stdin); SELECT a, b FROM table; DROP TABLE table" @@ -58,7 +58,22 @@ Read 2 rows, 32.00 B in 0.000 sec., 4987 rows/sec., 77.93 KiB/sec. 3 4 ``` -现在让我们为每个Unix用户输出内存用户: +你可以使用`stdin`或`--file`参数, 打开任意数量的文件来使用多个文件[`file` table function](../../sql-reference/table-functions/file.md): + +```bash +$ echo 1 | tee 1.tsv +1 + +$ echo 2 | tee 2.tsv +2 + +$ clickhouse-local --query " + select * from file('1.tsv', TSV, 'a int') t1 + cross join file('2.tsv', TSV, 'b int') t2" +1 2 +``` + +现在让我们查询每个Unix用户使用内存: ``` bash $ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' | clickhouse-local -S "user String, mem Float64" -q "SELECT user, round(sum(mem), 2) as memTotal FROM table GROUP BY user ORDER BY memTotal DESC FORMAT Pretty" diff --git a/docs/zh/sql-reference/functions/conditional-functions.md b/docs/zh/sql-reference/functions/conditional-functions.md index b8e96620014..a804f723d6f 100644 --- a/docs/zh/sql-reference/functions/conditional-functions.md +++ b/docs/zh/sql-reference/functions/conditional-functions.md @@ -1,13 +1,108 @@ # 条件函数 {#tiao-jian-han-shu} -## 如果(cond,那么,否则),cond? 
运算符然后:else {#ifcond-then-else-cond-operator-then-else} +## if {#if} + +控制条件分支。 与大多数系统不同,ClickHouse始终评估两个表达式 `then` 和 `else`。 + +**语法** + +``` sql +SELECT if(cond, then, else) +``` + +如果条件 `cond` 的计算结果为非零值,则返回表达式 `then` 的结果,并且跳过表达式 `else` 的结果(如果存在)。 如果 `cond` 为零或 `NULL`,则将跳过 `then` 表达式的结果,并返回 `else` 表达式的结果(如果存在)。 + +**参数** + +- `cond` – 条件结果可以为零或不为零。 类型是 UInt8,Nullable(UInt8) 或 NULL。 +- `then` - 如果满足条件则返回的表达式。 +- `else` - 如果不满足条件则返回的表达式。 + +**返回值** + +该函数执行 `then` 和 `else` 表达式并返回其结果,这取决于条件 `cond` 最终是否为零。 + +**示例** + +查询: + +``` sql +SELECT if(1, plus(2, 2), plus(2, 6)) +``` + +结果: + +``` text +┌─plus(2, 2)─┐ +│ 4 │ +└────────────┘ +``` + +查询: + +``` sql +SELECT if(0, plus(2, 2), plus(2, 6)) +``` + +结果: + +``` text +┌─plus(2, 6)─┐ +│ 8 │ +└────────────┘ +``` + +- `then` 和 `else` 必须具有最低的通用类型。 + +**示例:** + +给定表`LEFT_RIGHT`: + +``` sql +SELECT * +FROM LEFT_RIGHT + +┌─left─┬─right─┐ +│ ᴺᵁᴸᴸ │ 4 │ +│ 1 │ 3 │ +│ 2 │ 2 │ +│ 3 │ 1 │ +│ 4 │ ᴺᵁᴸᴸ │ +└──────┴───────┘ +``` + +下面的查询比较了 `left` 和 `right` 的值: + +``` sql +SELECT + left, + right, + if(left < right, 'left is smaller than right', 'right is greater or equal than left') AS is_smaller +FROM LEFT_RIGHT +WHERE isNotNull(left) AND isNotNull(right) + +┌─left─┬─right─┬─is_smaller──────────────────────────┐ +│ 1 │ 3 │ left is smaller than right │ +│ 2 │ 2 │ right is greater or equal than left │ +│ 3 │ 1 │ right is greater or equal than left │ +└──────┴───────┴─────────────────────────────────────┘ +``` + +注意:在此示例中未使用'NULL'值,请检查[条件中的NULL值](#null-values-in-conditionals) 部分。 + +## 三元运算符 {#ternary-operator} + +与 `if` 函数相同。 + +语法: `cond ? then : else` 如果`cond != 0`则返回`then`,如果`cond = 0`则返回`else`。 -`cond`必须是`UInt8`类型,`then`和`else`必须存在最低的共同类型。 -`then`和`else`可以是`NULL` +- `cond`必须是`UInt8`类型,`then`和`else`必须存在最低的共同类型。 -## 多 {#multiif} +- `then`和`else`可以是`NULL` + +## multiIf {#multiif} 允许您在查询中更紧凑地编写[CASE](../operators/index.md#operator_case)运算符。 @@ -27,18 +122,74 @@ **示例** -存在如下一张表 +再次使用表 `LEFT_RIGHT` 。 - ┌─x─┬────y─┐ - │ 1 │ ᴺᵁᴸᴸ │ - │ 2 │ 3 │ - └───┴──────┘ +``` sql +SELECT + left, + right, + multiIf(left < right, 'left is smaller', left > right, 'left is greater', left = right, 'Both equal', 'Null value') AS result +FROM LEFT_RIGHT -执行查询 `SELECT multiIf(isNull(y), x, y < 3, y, NULL) FROM t_null`。结果: +┌─left─┬─right─┬─result──────────┐ +│ ᴺᵁᴸᴸ │ 4 │ Null value │ +│ 1 │ 3 │ left is smaller │ +│ 2 │ 2 │ Both equal │ +│ 3 │ 1 │ left is greater │ +│ 4 │ ᴺᵁᴸᴸ │ Null value │ +└──────┴───────┴─────────────────┘ +``` +## 直接使用条件结果 {#using-conditional-results-directly} - ┌─multiIf(isNull(y), x, less(y, 3), y, NULL)─┐ - │ 1 │ - │ ᴺᵁᴸᴸ │ - └────────────────────────────────────────────┘ +条件结果始终为 `0`、 `1` 或 `NULL`。 因此,你可以像这样直接使用条件结果: + +``` sql +SELECT left < right AS is_small +FROM LEFT_RIGHT + +┌─is_small─┐ +│ ᴺᵁᴸᴸ │ +│ 1 │ +│ 0 │ +│ 0 │ +│ ᴺᵁᴸᴸ │ +└──────────┘ +``` + +## 条件中的NULL值 {#null-values-in-conditionals} + +当条件中包含 `NULL` 值时,结果也将为 `NULL`。 + +``` sql +SELECT + NULL < 1, + 2 < NULL, + NULL < NULL, + NULL = NULL + +┌─less(NULL, 1)─┬─less(2, NULL)─┬─less(NULL, NULL)─┬─equals(NULL, NULL)─┐ +│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ +└───────────────┴───────────────┴──────────────────┴────────────────────┘ +``` + +因此,如果类型是 `Nullable`,你应该仔细构造查询。 + +以下示例说明这一点。 + +``` sql +SELECT + left, + right, + multiIf(left < right, 'left is smaller', left > right, 'right is smaller', 'Both equal') AS faulty_result +FROM LEFT_RIGHT + +┌─left─┬─right─┬─faulty_result────┐ +│ ᴺᵁᴸᴸ │ 4 │ Both equal │ +│ 1 │ 3 │ left is smaller │ +│ 2 │ 2 │ Both equal │ +│ 3 │ 1 │ right is smaller │ +│ 4 │ ᴺᵁᴸᴸ │ Both 
equal │ +└──────┴───────┴──────────────────┘ +``` [来源文章](https://clickhouse.tech/docs/en/query_language/functions/conditional_functions/) diff --git a/docs/zh/sql-reference/functions/encoding-functions.md b/docs/zh/sql-reference/functions/encoding-functions.md index 39065f1d8b9..75e0118a88d 100644 --- a/docs/zh/sql-reference/functions/encoding-functions.md +++ b/docs/zh/sql-reference/functions/encoding-functions.md @@ -1,5 +1,71 @@ # 编码函数 {#bian-ma-han-shu} +## char {#char} + +返回长度为传递参数数量的字符串,并且每个字节都有对应参数的值。接受数字Numeric类型的多个参数。如果参数的值超出了UInt8数据类型的范围,则将其转换为UInt8,并可能进行舍入和溢出。 + +**语法** + +``` sql +char(number_1, [number_2, ..., number_n]); +``` + +**参数** + +- `number_1, number_2, ..., number_n` — 数值参数解释为整数。类型: [Int](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md). + +**返回值** + +- 给定字节数的字符串。 + +类型: `String`。 + +**示例** + +查询: + +``` sql +SELECT char(104.1, 101, 108.9, 108.9, 111) AS hello +``` + +结果: + +``` text +┌─hello─┐ +│ hello │ +└───────┘ +``` + +你可以通过传递相应的字节来构造任意编码的字符串。 这是UTF-8的示例: + +查询: + +``` sql +SELECT char(0xD0, 0xBF, 0xD1, 0x80, 0xD0, 0xB8, 0xD0, 0xB2, 0xD0, 0xB5, 0xD1, 0x82) AS hello; +``` + +结果: + +``` text +┌─hello──┐ +│ привет │ +└────────┘ +``` + +查询: + +``` sql +SELECT char(0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD) AS hello; +``` + +结果: + +``` text +┌─hello─┐ +│ 你好 │ +└───────┘ +``` + ## hex {#hex} 接受`String`,`unsigned integer`,`Date`或`DateTime`类型的参数。返回包含参数的十六进制表示的字符串。使用大写字母`A-F`。不使用`0x`前缀或`h`后缀。对于字符串,所有字节都简单地编码为两个十六进制数字。数字转换为大端(«易阅读»)格式。对于数字,去除其中较旧的零,但仅限整个字节。例如,`hex(1)='01'`。 `Date`被编码为自Unix时间开始以来的天数。 `DateTime`编码为自Unix时间开始以来的秒数。 @@ -17,11 +83,11 @@ 接受FixedString(16)值。返回包含36个字符的文本格式的字符串。 -## 位掩码列表(num) {#bitmasktolistnum} +## bitmaskToList(num) {#bitmasktolistnum} 接受一个整数。返回一个字符串,其中包含一组2的幂列表,其列表中的所有值相加等于这个整数。列表使用逗号分割,按升序排列。 -## 位掩码阵列(num) {#bitmasktoarraynum} +## bitmaskToArray(num) {#bitmasktoarraynum} 接受一个整数。返回一个UInt64类型数组,其中包含一组2的幂列表,其列表中的所有值相加等于这个整数。数组中的数字按升序排列。 diff --git a/docs/zh/sql-reference/functions/math-functions.md b/docs/zh/sql-reference/functions/math-functions.md index 81c2fcecdbc..6634b095b0d 100644 --- a/docs/zh/sql-reference/functions/math-functions.md +++ b/docs/zh/sql-reference/functions/math-functions.md @@ -76,7 +76,7 @@ SELECT erf(3 / sqrt(2)) 返回x的三角余弦值。 -## 谭(x) {#tanx} +## tan(x) {#tanx} 返回x的三角正切值。 @@ -88,7 +88,7 @@ SELECT erf(3 / sqrt(2)) 返回x的反三角余弦值。 -## 阿坦(x) {#atanx} +## atan(x) {#atanx} 返回x的反三角正切值。 diff --git a/docs/zh/sql-reference/functions/random-functions.md b/docs/zh/sql-reference/functions/random-functions.md index d2d9fdf87a6..f058b98c779 100644 --- a/docs/zh/sql-reference/functions/random-functions.md +++ b/docs/zh/sql-reference/functions/random-functions.md @@ -6,7 +6,7 @@ 您可以向它传递任何类型的参数,但传递的参数将不会使用在任何随机数生成过程中。 此参数的唯一目的是防止公共子表达式消除,以便在相同的查询中使用相同的随机函数生成不同的随机数。 -## 兰德 {#rand} +## rand, rand32 {#rand} 返回一个UInt32类型的随机数字,所有UInt32类型的数字被生成的概率均相等。此函数线性同于的方式生成随机数。 diff --git a/docs/zh/sql-reference/functions/string-functions.md b/docs/zh/sql-reference/functions/string-functions.md index 0dbcc031d56..1c27176a45e 100644 --- a/docs/zh/sql-reference/functions/string-functions.md +++ b/docs/zh/sql-reference/functions/string-functions.md @@ -1,6 +1,6 @@ # 字符串函数 {#zi-fu-chuan-han-shu} -## 空 {#string-functions-empty} +## empty {#string-functions-empty} 对于空字符串返回1,对于非空字符串返回0。 结果类型是UInt8。 @@ -13,13 +13,13 @@ 结果类型是UInt8。 该函数也适用于数组。 -## 长度 {#length} +## length {#length} 返回字符串的字节长度。 结果类型是UInt64。 该函数也适用于数组。 -## 长度8 {#lengthutf8} +## lengthUTF8 {#lengthutf8} 
假定字符串以UTF-8编码组成的文本,返回此字符串的Unicode字符长度。如果传入的字符串不是UTF-8编码,则函数可能返回一个预期外的值(不会抛出异常)。 结果类型是UInt64。 @@ -29,16 +29,16 @@ 假定字符串以UTF-8编码组成的文本,返回此字符串的Unicode字符长度。如果传入的字符串不是UTF-8编码,则函数可能返回一个预期外的值(不会抛出异常)。 结果类型是UInt64。 -## 字符长度,字符长度 {#character-length-character-length} +## character_length,CHARACTER_LENGTH {#character-length-character-length} 假定字符串以UTF-8编码组成的文本,返回此字符串的Unicode字符长度。如果传入的字符串不是UTF-8编码,则函数可能返回一个预期外的值(不会抛出异常)。 结果类型是UInt64。 -## 低一点 {#lower-lcase} +## lower, lcase {#lower-lcase} 将字符串中的ASCII转换为小写。 -## 上,ucase {#upper-ucase} +## upper, ucase {#upper-ucase} 将字符串中的ASCII转换为大写。 @@ -84,7 +84,7 @@ SELECT toValidUTF8('\x61\xF0\x80\x80\x80b') └───────────────────────┘ ``` -## 反向 {#reverse} +## reverse {#reverse} 反转字符串。 @@ -118,11 +118,11 @@ SELECT format('{} {}', 'Hello', 'World') 与[concat](#concat-s1-s2)相同,区别在于,你需要保证concat(s1, s2, s3) -\> s4是单射的,它将用于GROUP BY的优化。 -## 子串(s,offset,length),mid(s,offset,length),substr(s,offset,length) {#substrings-offset-length-mids-offset-length-substrs-offset-length} +## substring(s,offset,length),mid(s,offset,length),substr(s,offset,length) {#substrings-offset-length-mids-offset-length-substrs-offset-length} 以字节为单位截取指定位置字符串,返回以’offset’位置为开头,长度为’length’的子串。’offset’从1开始(与标准SQL相同)。’offset’和’length’参数必须是常量。 -## substringf8(s,offset,length) {#substringutf8s-offset-length} +## substringUTF8(s,offset,length) {#substringutf8s-offset-length} 与’substring’相同,但其操作单位为Unicode字符,函数假设字符串是以UTF-8进行编码的文本。如果不是则可能返回一个预期外的结果(不会抛出异常)。 @@ -150,7 +150,7 @@ SELECT format('{} {}', 'Hello', 'World') 返回是否以指定的后缀结尾。如果字符串以指定的后缀结束,则返回1,否则返回0。 -## 开始使用(s,前缀) {#startswiths-prefix} +## startsWith(s,前缀) {#startswiths-prefix} 返回是否以指定的前缀开头。如果字符串以指定的前缀开头,则返回1,否则返回0。 diff --git a/docs/zh/sql-reference/statements/misc.md b/docs/zh/sql-reference/statements/misc.md index fd3eea9796e..a736ed2af5b 100644 --- a/docs/zh/sql-reference/statements/misc.md +++ b/docs/zh/sql-reference/statements/misc.md @@ -151,7 +151,7 @@ DROP [ROW] POLICY [IF EXISTS] name [,...] ON [database.]table [,...] [ON CLUSTER 删除配额。 -已删除的配额将从分配配额的所有实体撤销。 +已删除的配额将从分配该配额的所有实体撤销。 ### 语法 {#drop-quota-syntax} @@ -161,9 +161,9 @@ DROP QUOTA [IF EXISTS] name [,...] [ON CLUSTER cluster_name] ## DROP SETTINGS PROFILE {#drop-settings-profile-statement} -删除配额。 +删除settings配置。 -已删除的配额将从分配配额的所有实体撤销。 +已删除的settings配置将从分配该settings配置的所有实体撤销。 ### 语法 {#drop-settings-profile-syntax} @@ -177,7 +177,7 @@ DROP [SETTINGS] PROFILE [IF EXISTS] name [,...] [ON CLUSTER cluster_name] EXISTS [TEMPORARY] [TABLE|DICTIONARY] [db.]name [INTO OUTFILE filename] [FORMAT format] ``` -返回单 `UInt8`-type column,其中包含单个值 `0` 如果表或数据库不存在,或 `1` 如果该表存在于指定的数据库中。 +返回单个 `UInt8` 类型的列,其中包含单个值 `0` 如果表或数据库不存在,或 `1` 如果该表存在于指定的数据库中。 ## KILL QUERY {#kill-query-statement} diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt index d9c5dc78fe4..8f45bf53f53 100644 --- a/programs/CMakeLists.txt +++ b/programs/CMakeLists.txt @@ -43,13 +43,81 @@ else () ${ENABLE_CLICKHOUSE_ALL}) endif () +message(STATUS "ClickHouse modes:") + +if (NOT ENABLE_CLICKHOUSE_SERVER) + message(WARNING "ClickHouse server mode is not going to be built.") +else() + message(STATUS "Server mode: ON") +endif() + +if (NOT ENABLE_CLICKHOUSE_CLIENT) + message(WARNING "ClickHouse client mode is not going to be built. 
You won't be able to connect to the server and run + tests") +else() + message(STATUS "Client mode: ON") +endif() + +if (ENABLE_CLICKHOUSE_LOCAL) + message(STATUS "Local mode: ON") +else() + message(STATUS "Local mode: OFF") +endif() + +if (ENABLE_CLICKHOUSE_BENCHMARK) + message(STATUS "Benchmark mode: ON") +else() + message(STATUS "Benchmark mode: OFF") +endif() + +if (ENABLE_CLICKHOUSE_EXTRACT_FROM_CONFIG) + message(STATUS "Extract from config mode: ON") +else() + message(STATUS "Extract from config mode: OFF") +endif() + +if (ENABLE_CLICKHOUSE_COMPRESSOR) + message(STATUS "Compressor mode: ON") +else() + message(STATUS "Compressor mode: OFF") +endif() + +if (ENABLE_CLICKHOUSE_COPIER) + message(STATUS "Copier mode: ON") +else() + message(STATUS "Copier mode: OFF") +endif() + +if (ENABLE_CLICKHOUSE_FORMAT) + message(STATUS "Format mode: ON") +else() + message(STATUS "Format mode: OFF") +endif() + +if (ENABLE_CLICKHOUSE_OBFUSCATOR) + message(STATUS "Obfuscator mode: ON") +else() + message(STATUS "Obfuscator mode: OFF") +endif() + +if (ENABLE_CLICKHOUSE_ODBC_BRIDGE) + message(STATUS "ODBC bridge mode: ON") +else() + message(STATUS "ODBC bridge mode: OFF") +endif() + +if (ENABLE_CLICKHOUSE_INSTALL) + message(STATUS "ClickHouse install: ON") +else() + message(STATUS "ClickHouse install: OFF") +endif() + if(NOT (MAKE_STATIC_LIBRARIES OR SPLIT_SHARED_LIBRARIES)) set(CLICKHOUSE_ONE_SHARED ON) endif() configure_file (config_tools.h.in ${ConfigIncludePath}/config_tools.h) - macro(clickhouse_target_link_split_lib target name) if(NOT CLICKHOUSE_ONE_SHARED) target_link_libraries(${target} PRIVATE clickhouse-${name}-lib) @@ -112,8 +180,6 @@ add_subdirectory (obfuscator) add_subdirectory (install) add_subdirectory (git-import) -#add_subdirectory (grpc-client) - if (ENABLE_CLICKHOUSE_ODBC_BRIDGE) add_subdirectory (odbc-bridge) endif () diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 5348a9e36c5..e4858eeda8b 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -2515,7 +2515,7 @@ public: { std::string traceparent = options["opentelemetry-traceparent"].as(); std::string error; - if (!context.getClientInfo().parseTraceparentHeader( + if (!context.getClientInfo().client_trace_context.parseTraceparentHeader( traceparent, error)) { throw Exception(ErrorCodes::BAD_ARGUMENTS, @@ -2526,7 +2526,7 @@ public: if (options.count("opentelemetry-tracestate")) { - context.getClientInfo().opentelemetry_tracestate = + context.getClientInfo().client_trace_context.tracestate = options["opentelemetry-tracestate"].as(); } diff --git a/programs/copier/ClusterCopier.cpp b/programs/copier/ClusterCopier.cpp index a129dc7efcc..2f19fc47fd2 100644 --- a/programs/copier/ClusterCopier.cpp +++ b/programs/copier/ClusterCopier.cpp @@ -62,6 +62,9 @@ decltype(auto) ClusterCopier::retry(T && func, UInt64 max_tries) { std::exception_ptr exception; + if (max_tries == 0) + throw Exception("Cannot perform zero retries", ErrorCodes::LOGICAL_ERROR); + for (UInt64 try_number = 1; try_number <= max_tries; ++try_number) { try @@ -605,7 +608,7 @@ TaskStatus ClusterCopier::tryMoveAllPiecesToDestinationTable(const TaskTable & t settings_push.replication_alter_partitions_sync = 2; query_alter_ast_string += " ALTER TABLE " + getQuotedTable(original_table) + - " ATTACH PARTITION " + partition_name + + ((partition_name == "'all'") ? 
" ATTACH PARTITION ID " : " ATTACH PARTITION ") + partition_name + " FROM " + getQuotedTable(helping_table); LOG_DEBUG(log, "Executing ALTER query: {}", query_alter_ast_string); @@ -636,7 +639,7 @@ TaskStatus ClusterCopier::tryMoveAllPiecesToDestinationTable(const TaskTable & t if (!task_table.isReplicatedTable()) { query_deduplicate_ast_string += " OPTIMIZE TABLE " + getQuotedTable(original_table) + - " PARTITION " + partition_name + " DEDUPLICATE;"; + ((partition_name == "'all'") ? " PARTITION ID " : " PARTITION ") + partition_name + " DEDUPLICATE;"; LOG_DEBUG(log, "Executing OPTIMIZE DEDUPLICATE query: {}", query_alter_ast_string); @@ -807,7 +810,7 @@ bool ClusterCopier::tryDropPartitionPiece( DatabaseAndTableName helping_table = DatabaseAndTableName(original_table.first, original_table.second + "_piece_" + toString(current_piece_number)); String query = "ALTER TABLE " + getQuotedTable(helping_table); - query += " DROP PARTITION " + task_partition.name + ""; + query += ((task_partition.name == "'all'") ? " DROP PARTITION ID " : " DROP PARTITION ") + task_partition.name + ""; /// TODO: use this statement after servers will be updated up to 1.1.54310 // query += " DROP PARTITION ID '" + task_partition.name + "'"; @@ -1567,7 +1570,7 @@ void ClusterCopier::dropParticularPartitionPieceFromAllHelpingTables(const TaskT DatabaseAndTableName original_table = task_table.table_push; DatabaseAndTableName helping_table = DatabaseAndTableName(original_table.first, original_table.second + "_piece_" + toString(current_piece_number)); - String query = "ALTER TABLE " + getQuotedTable(helping_table) + " DROP PARTITION " + partition_name; + String query = "ALTER TABLE " + getQuotedTable(helping_table) + ((partition_name == "'all'") ? " DROP PARTITION ID " : " DROP PARTITION ") + partition_name; const ClusterPtr & cluster_push = task_table.cluster_push; Settings settings_push = task_cluster->settings_push; @@ -1670,14 +1673,24 @@ void ClusterCopier::createShardInternalTables(const ConnectionTimeouts & timeout std::set ClusterCopier::getShardPartitions(const ConnectionTimeouts & timeouts, TaskShard & task_shard) { + std::set res; + createShardInternalTables(timeouts, task_shard, false); TaskTable & task_table = task_shard.task_table; + const String & partition_name = queryToString(task_table.engine_push_partition_key_ast); + + if (partition_name == "'all'") + { + res.emplace("'all'"); + return res; + } + String query; { WriteBufferFromOwnString wb; - wb << "SELECT DISTINCT " << queryToString(task_table.engine_push_partition_key_ast) << " AS partition FROM" + wb << "SELECT DISTINCT " << partition_name << " AS partition FROM" << " " << getQuotedTable(task_shard.table_read_shard) << " ORDER BY partition DESC"; query = wb.str(); } @@ -1692,7 +1705,6 @@ std::set ClusterCopier::getShardPartitions(const ConnectionTimeouts & ti local_context.setSettings(task_cluster->settings_pull); Block block = getBlockWithAllStreamData(InterpreterFactory::get(query_ast, local_context)->execute().getInputStream()); - std::set res; if (block) { ColumnWithTypeAndName & column = block.getByPosition(0); @@ -1803,7 +1815,7 @@ UInt64 ClusterCopier::executeQueryOnCluster( if (execution_mode == ClusterExecutionMode::ON_EACH_NODE) max_successful_executions_per_shard = 0; - std::atomic origin_replicas_number; + std::atomic origin_replicas_number = 0; /// We need to execute query on one replica at least auto do_for_shard = [&] (UInt64 shard_index, Settings shard_settings) diff --git a/programs/grpc-client/CMakeLists.txt 
b/programs/grpc-client/CMakeLists.txt deleted file mode 100644 index d848434e918..00000000000 --- a/programs/grpc-client/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -include_directories(${CMAKE_CURRENT_BINARY_DIR}) -get_filename_component(rpc_proto "${CMAKE_CURRENT_SOURCE_DIR}/../server/grpc_protos/GrpcConnection.proto" ABSOLUTE) -protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS ${rpc_proto}) -PROTOBUF_GENERATE_GRPC_CPP(GRPC_SRCS GRPC_HDRS ${rpc_proto}) - -add_executable(grpc-client grpc_client.cpp ${PROTO_SRCS} ${PROTO_HDRS} ${GRPC_SRCS} ${GRPC_HDRS}) -target_link_libraries(grpc-client PUBLIC grpc++ PUBLIC libprotobuf PUBLIC daemon) \ No newline at end of file diff --git a/programs/grpc-client/grpc_client.cpp b/programs/grpc-client/grpc_client.cpp deleted file mode 100644 index 5345b3e7d33..00000000000 --- a/programs/grpc-client/grpc_client.cpp +++ /dev/null @@ -1,173 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "GrpcConnection.grpc.pb.h" - -class GRPCClient -{ - public: - explicit GRPCClient(std::shared_ptr channel) - : stub_(GRPCConnection::GRPC::NewStub(channel)) - {} - std::string Query(const GRPCConnection::User& userInfo, - const std::string& query, - std::vector insert_data = {}) - { - GRPCConnection::QueryRequest request; - grpc::Status status; - GRPCConnection::QueryResponse reply; - grpc::ClientContext context; - auto deadline = std::chrono::system_clock::now() + std::chrono::milliseconds(10000); - context.set_deadline(deadline); - - auto user = std::make_unique(userInfo); - auto querySettigs = std::make_unique(); - int id = rand(); - request.set_allocated_user_info(user.release()); - // interactive_delay in miliseconds - request.set_interactive_delay(1000); - - querySettigs->set_query(query); - querySettigs->set_format("Values"); - querySettigs->set_query_id(std::to_string(id)); - querySettigs->set_data_stream((insert_data.size() != 0)); - (*querySettigs->mutable_settings())["max_query_size"] ="100"; - - - request.set_allocated_query_info(querySettigs.release()); - - void* got_tag = (void*)1; - bool ok = false; - - std::unique_ptr > reader(stub_->Query(&context)); - reader->Write(request); - - auto write = [&reply, &reader, &insert_data]() - { - GRPCConnection::QueryRequest request_insert; - for (const auto& data : insert_data) - { - request_insert.set_insert_data(data); - if (reply.exception_occured().empty()) - { - reader->Write(request_insert); - } - else - { - break; - } - } - request_insert.set_insert_data(""); - if (reply.exception_occured().empty()) - { - reader->Write(request_insert); - } - // reader->WritesDone(); - }; - std::thread write_thread(write); - write_thread.detach(); - - while (reader->Read(&reply)) - { - - if (!reply.output().empty()) - { - std::cout << "Query Part:\n " << id<< reply.output()<<'\n'; - } - else if (reply.progress().read_rows() - || reply.progress().read_bytes() - || reply.progress().total_rows_to_read() - || reply.progress().written_rows() - || reply.progress().written_bytes()) - { - std::cout << "Progress " << id<< ":{\n" << "read_rows: " << reply.progress().read_rows() << '\n' - << "read_bytes: " << reply.progress().read_bytes() << '\n' - << "total_rows_to_read: " << reply.progress().total_rows_to_read() << '\n' - << "written_rows: " << reply.progress().written_rows() << '\n' - << "written_bytes: " << reply.progress().written_bytes() << '\n'; - - - } - else if (!reply.totals().empty()) - { - std::cout << "Totals:\n " << id << " " << reply.totals() <<'\n'; - } - else if 
(!reply.extremes().empty()) - { - std::cout << "Extremes:\n " << id << " " << reply.extremes() <<'\n'; - } - } - - if (status.ok() && reply.exception_occured().empty()) - { - return ""; - } - else if (status.ok() && !reply.exception_occured().empty()) - { - return reply.exception_occured(); - } - else - { - return "RPC failed"; - } - } - - private: - std::unique_ptr stub_; -}; - -int main(int argc, char** argv) -{ - GRPCConnection::User userInfo1; - userInfo1.set_user("default"); - userInfo1.set_password(""); - userInfo1.set_quota("default"); - - std::cout << "Try: " << argv[1] << std::endl; - grpc::ChannelArguments ch_args; - ch_args.SetMaxReceiveMessageSize(-1); - GRPCClient client( - grpc::CreateCustomChannel(argv[1], grpc::InsecureChannelCredentials(), ch_args)); - { - std::cout << client.Query(userInfo1, "CREATE TABLE t (a UInt8) ENGINE = Memory") << std::endl; - std::cout << client.Query(userInfo1, "CREATE TABLE t (a UInt8) ENGINE = Memory") << std::endl; - std::cout << client.Query(userInfo1, "INSERT INTO t VALUES", {"(1),(2),(3)", "(4),(6),(5)"}) << std::endl; - std::cout << client.Query(userInfo1, "INSERT INTO t_not_defined VALUES", {"(1),(2),(3)", "(4),(6),(5)"}) << std::endl; - std::cout << client.Query(userInfo1, "SELECT a FROM t ORDER BY a") << std::endl; - std::cout << client.Query(userInfo1, "DROP TABLE t") << std::endl; - } - { - std::cout << client.Query(userInfo1, "SELECT count() FROM numbers(1)") << std::endl; - std::cout << client.Query(userInfo1, "SELECT 100") << std::endl; - std::cout << client.Query(userInfo1, "SELECT count() FROM numbers(10000000000)") << std::endl; - std::cout << client.Query(userInfo1, "SELECT count() FROM numbers(100)") << std::endl; - } - { - std::cout << client.Query(userInfo1, "CREATE TABLE arrays_test (s String, arr Array(UInt8)) ENGINE = Memory;") << std::endl; - std::cout << client.Query(userInfo1, "INSERT INTO arrays_test VALUES ('Hello', [1,2]), ('World', [3,4,5]), ('Goodbye', []);") << std::endl; - std::cout << client.Query(userInfo1, "SELECT s FROM arrays_test") << std::endl; - std::cout << client.Query(userInfo1, "DROP TABLE arrays_test") << std::endl; - std::cout << client.Query(userInfo1, "") << std::endl; - } - - {//Check null return from pipe - std::cout << client.Query(userInfo1, "CREATE TABLE table2 (x UInt8, y UInt8) ENGINE = Memory;") << std::endl; - std::cout << client.Query(userInfo1, "SELECT x FROM table2") << std::endl; - std::cout << client.Query(userInfo1, "DROP TABLE table2") << std::endl; - } - {//Check Totals - std::cout << client.Query(userInfo1, "CREATE TABLE tabl (x UInt8, y UInt8) ENGINE = Memory;") << std::endl; - std::cout << client.Query(userInfo1, "INSERT INTO tabl VALUES (1, 2), (2, 4), (3, 2), (3, 3), (3, 4);") << std::endl; - std::cout << client.Query(userInfo1, "SELECT sum(x), y FROM tabl GROUP BY y WITH TOTALS") << std::endl; - std::cout << client.Query(userInfo1, "DROP TABLE tabl") << std::endl; - } - - return 0; -} diff --git a/programs/install/Install.cpp b/programs/install/Install.cpp index da22452819a..9e3942e126d 100644 --- a/programs/install/Install.cpp +++ b/programs/install/Install.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -70,7 +71,7 @@ namespace po = boost::program_options; namespace fs = std::filesystem; -auto executeScript(const std::string & command, bool throw_on_error = false) +static auto executeScript(const std::string & command, bool throw_on_error = false) { auto sh = ShellCommand::execute(command); WriteBufferFromFileDescriptor 
wb_stdout(STDOUT_FILENO); @@ -87,7 +88,7 @@ auto executeScript(const std::string & command, bool throw_on_error = false) return sh->tryWait(); } -bool ask(std::string question) +static bool ask(std::string question) { while (true) { @@ -104,6 +105,16 @@ bool ask(std::string question) } } +static bool filesEqual(std::string path1, std::string path2) +{ + MMapReadBufferFromFile in1(path1, 0); + MMapReadBufferFromFile in2(path2, 0); + + /// memcmp is faster than hashing and comparing hashes + return in1.buffer().size() == in2.buffer().size() + && 0 == memcmp(in1.buffer().begin(), in2.buffer().begin(), in1.buffer().size()); +} + int mainEntryClickHouseInstall(int argc, char ** argv) { @@ -143,57 +154,89 @@ int mainEntryClickHouseInstall(int argc, char ** argv) throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot obtain path to the binary from {}, file doesn't exist", binary_self_path.string()); + fs::path binary_self_canonical_path = fs::canonical(binary_self_path); + /// Copy binary to the destination directory. /// TODO An option to link instead of copy - useful for developers. - /// TODO Check if the binary is the same. - - size_t binary_size = fs::file_size(binary_self_path); fs::path prefix = fs::path(options["prefix"].as()); fs::path bin_dir = prefix / fs::path(options["binary-path"].as()); - size_t available_space = fs::space(bin_dir).available; - if (available_space < binary_size) - throw Exception(ErrorCodes::NOT_ENOUGH_SPACE, "Not enough space for clickhouse binary in {}, required {}, available {}.", - bin_dir.string(), ReadableSize(binary_size), ReadableSize(available_space)); - fs::path main_bin_path = bin_dir / "clickhouse"; fs::path main_bin_tmp_path = bin_dir / "clickhouse.new"; fs::path main_bin_old_path = bin_dir / "clickhouse.old"; - fmt::print("Copying ClickHouse binary to {}\n", main_bin_tmp_path.string()); + size_t binary_size = fs::file_size(binary_self_path); - try + bool old_binary_exists = fs::exists(main_bin_path); + bool already_installed = false; + + /// Check if the binary is the same file (already installed). + if (old_binary_exists && binary_self_canonical_path == fs::canonical(main_bin_path)) { - ReadBufferFromFile in(binary_self_path.string()); - WriteBufferFromFile out(main_bin_tmp_path.string()); - copyData(in, out); - out.sync(); - - if (0 != fchmod(out.getFD(), S_IRUSR | S_IRGRP | S_IROTH | S_IXUSR | S_IXGRP | S_IXOTH)) - throwFromErrno(fmt::format("Cannot chmod {}", main_bin_tmp_path.string()), ErrorCodes::SYSTEM_ERROR); - - out.finalize(); + already_installed = true; + fmt::print("ClickHouse binary is already located at {}\n", main_bin_path.string()); } - catch (const Exception & e) + /// Check if binary has the same content. + else if (old_binary_exists && binary_size == fs::file_size(main_bin_path)) { - if (e.code() == ErrorCodes::CANNOT_OPEN_FILE && geteuid() != 0) - std::cerr << "Install must be run as root: sudo ./clickhouse install\n"; - throw; + fmt::print("Found already existing ClickHouse binary at {} having the same size. 
Will check its contents.\n", + main_bin_path.string()); + + if (filesEqual(binary_self_path.string(), main_bin_path.string())) + { + already_installed = true; + fmt::print("ClickHouse binary is already located at {} and it has the same content as {}\n", + main_bin_path.string(), binary_self_canonical_path.string()); + } } - if (fs::exists(main_bin_path)) + if (already_installed) { - fmt::print("{} already exists, will rename existing binary to {} and put the new binary in place\n", - main_bin_path.string(), main_bin_old_path.string()); - - /// There is file exchange operation in Linux but it's not portable. - fs::rename(main_bin_path, main_bin_old_path); + if (0 != chmod(main_bin_path.string().c_str(), S_IRUSR | S_IRGRP | S_IROTH | S_IXUSR | S_IXGRP | S_IXOTH)) + throwFromErrno(fmt::format("Cannot chmod {}", main_bin_path.string()), ErrorCodes::SYSTEM_ERROR); } + else + { + size_t available_space = fs::space(bin_dir).available; + if (available_space < binary_size) + throw Exception(ErrorCodes::NOT_ENOUGH_SPACE, "Not enough space for clickhouse binary in {}, required {}, available {}.", + bin_dir.string(), ReadableSize(binary_size), ReadableSize(available_space)); - fmt::print("Renaming {} to {}.\n", main_bin_tmp_path.string(), main_bin_path.string()); - fs::rename(main_bin_tmp_path, main_bin_path); + fmt::print("Copying ClickHouse binary to {}\n", main_bin_tmp_path.string()); + + try + { + ReadBufferFromFile in(binary_self_path.string()); + WriteBufferFromFile out(main_bin_tmp_path.string()); + copyData(in, out); + out.sync(); + + if (0 != fchmod(out.getFD(), S_IRUSR | S_IRGRP | S_IROTH | S_IXUSR | S_IXGRP | S_IXOTH)) + throwFromErrno(fmt::format("Cannot chmod {}", main_bin_tmp_path.string()), ErrorCodes::SYSTEM_ERROR); + + out.finalize(); + } + catch (const Exception & e) + { + if (e.code() == ErrorCodes::CANNOT_OPEN_FILE && geteuid() != 0) + std::cerr << "Install must be run as root: sudo ./clickhouse install\n"; + throw; + } + + if (old_binary_exists) + { + fmt::print("{} already exists, will rename existing binary to {} and put the new binary in place\n", + main_bin_path.string(), main_bin_old_path.string()); + + /// There is file exchange operation in Linux but it's not portable. + fs::rename(main_bin_path, main_bin_old_path); + } + + fmt::print("Renaming {} to {}.\n", main_bin_tmp_path.string(), main_bin_path.string()); + fs::rename(main_bin_tmp_path, main_bin_path); + } /// Create symlinks. @@ -401,8 +444,8 @@ int mainEntryClickHouseInstall(int argc, char ** argv) ConfigurationPtr configuration(new Poco::Util::XMLConfiguration(processor.processConfig())); if (!configuration->getString("users.default.password", "").empty() - || configuration->getString("users.default.password_sha256_hex", "").empty() - || configuration->getString("users.default.password_double_sha1_hex", "").empty()) + || !configuration->getString("users.default.password_sha256_hex", "").empty() + || !configuration->getString("users.default.password_double_sha1_hex", "").empty()) { has_password_for_default_user = true; } @@ -576,7 +619,7 @@ int mainEntryClickHouseInstall(int argc, char ** argv) " || echo \"Cannot set 'net_admin' or 'ipc_lock' or 'sys_nice' capability for clickhouse binary." " This is optional. Taskstats accounting will be disabled." 
" To enable taskstats accounting you may add the required capability later manually.\"", - "/tmp/test_setcap.sh", main_bin_path.string()); + "/tmp/test_setcap.sh", fs::canonical(main_bin_path).string()); fmt::print(" {}\n", command); executeScript(command); #endif @@ -597,10 +640,6 @@ int mainEntryClickHouseInstall(int argc, char ** argv) } } - std::string maybe_sudo; - if (getuid() != 0) - maybe_sudo = "sudo "; - std::string maybe_password; if (has_password_for_default_user) maybe_password = " --password"; @@ -608,10 +647,19 @@ int mainEntryClickHouseInstall(int argc, char ** argv) fmt::print( "\nClickHouse has been successfully installed.\n" "\nStart clickhouse-server with:\n" - " {}clickhouse start\n" + " sudo clickhouse start\n" "\nStart clickhouse-client with:\n" " clickhouse-client{}\n\n", - maybe_sudo, maybe_password); + maybe_password); + } + catch (const fs::filesystem_error &) + { + std::cerr << getCurrentExceptionMessage(false) << '\n'; + + if (getuid() != 0) + std::cerr << "\nRun with sudo.\n"; + + return getCurrentExceptionCode(); } catch (...) { @@ -783,17 +831,20 @@ namespace return pid; } - int stop(const fs::path & pid_file) + int stop(const fs::path & pid_file, bool force) { UInt64 pid = isRunning(pid_file); if (!pid) return 0; - if (0 == kill(pid, 15)) /// Terminate - fmt::print("Sent termination signal.\n", pid); + int signal = force ? SIGKILL : SIGTERM; + const char * signal_name = force ? "kill" : "terminate"; + + if (0 == kill(pid, signal)) + fmt::print("Sent {} signal to process with pid {}.\n", signal_name, pid); else - throwFromErrno("Cannot send termination signal", ErrorCodes::SYSTEM_ERROR); + throwFromErrno(fmt::format("Cannot send {} signal", signal_name), ErrorCodes::SYSTEM_ERROR); size_t try_num = 0; constexpr size_t num_tries = 60; @@ -869,6 +920,7 @@ int mainEntryClickHouseStop(int argc, char ** argv) desc.add_options() ("help,h", "produce help message") ("pid-path", po::value()->default_value("/var/run/clickhouse-server"), "directory for pid file") + ("force", po::value()->default_value(false), "Stop with KILL signal instead of TERM") ; po::variables_map options; @@ -887,7 +939,7 @@ int mainEntryClickHouseStop(int argc, char ** argv) { fs::path pid_file = fs::path(options["pid-path"].as()) / "clickhouse-server.pid"; - return stop(pid_file); + return stop(pid_file, options["force"].as()); } catch (...) 
{ @@ -940,6 +992,7 @@ int mainEntryClickHouseRestart(int argc, char ** argv) ("config-path", po::value()->default_value("/etc/clickhouse-server"), "directory with configs") ("pid-path", po::value()->default_value("/var/run/clickhouse-server"), "directory for pid file") ("user", po::value()->default_value("clickhouse"), "clickhouse user") + ("force", po::value()->default_value(false), "Stop with KILL signal instead of TERM") ; po::variables_map options; @@ -962,7 +1015,7 @@ int mainEntryClickHouseRestart(int argc, char ** argv) fs::path config = fs::path(options["config-path"].as()) / "config.xml"; fs::path pid_file = fs::path(options["pid-path"].as()) / "clickhouse-server.pid"; - if (int res = stop(pid_file)) + if (int res = stop(pid_file, options["force"].as())) return res; return start(user, executable, config, pid_file); } diff --git a/programs/odbc-bridge/ODBCBlockInputStream.cpp b/programs/odbc-bridge/ODBCBlockInputStream.cpp index 00ca89bd887..3e2a2d0c7d4 100644 --- a/programs/odbc-bridge/ODBCBlockInputStream.cpp +++ b/programs/odbc-bridge/ODBCBlockInputStream.cpp @@ -79,11 +79,18 @@ namespace assert_cast(column).insert(value.convert()); break; case ValueType::vtDate: - assert_cast(column).insertValue(UInt16{LocalDate{value.convert()}.getDayNum()}); + { + Poco::DateTime date = value.convert(); + assert_cast(column).insertValue(UInt16{LocalDate(date.year(), date.month(), date.day()).getDayNum()}); break; + } case ValueType::vtDateTime: - assert_cast(column).insertValue(time_t{LocalDateTime{value.convert()}}); + { + Poco::DateTime datetime = value.convert(); + assert_cast(column).insertValue(time_t{LocalDateTime( + datetime.year(), datetime.month(), datetime.day(), datetime.hour(), datetime.minute(), datetime.second())}); break; + } case ValueType::vtUUID: assert_cast(column).insert(parse(value.convert())); break; @@ -112,6 +119,7 @@ Block ODBCBlockInputStream::readImpl() for (const auto idx : ext::range(0, row.fieldCount())) { + /// TODO This is extremely slow. const Poco::Dynamic::Var & value = row[idx]; if (!value.isEmpty()) diff --git a/programs/server/config.xml b/programs/server/config.xml index cef191a71f1..dde3702a44b 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -134,21 +134,21 @@ 4096 3 - + - true --> - + true + - + - + - + @@ -159,8 +159,8 @@ + 4194304 --> + 100 @@ -581,7 +581,7 @@ system query_log
toYYYYMM(event_date) + *_dictionary.xml diff --git a/src/AggregateFunctions/AggregateFunctionAvg.cpp b/src/AggregateFunctions/AggregateFunctionAvg.cpp index 3764fd67ff5..9b1c3d6cef6 100644 --- a/src/AggregateFunctions/AggregateFunctionAvg.cpp +++ b/src/AggregateFunctions/AggregateFunctionAvg.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -13,43 +14,37 @@ namespace ErrorCodes namespace { - -template -struct Avg +bool allowType(const DataTypePtr& type) noexcept { - using FieldType = std::conditional_t, - std::conditional_t, Decimal256, Decimal128>, - NearestFieldType>; - // using FieldType = std::conditional_t, Decimal128, NearestFieldType>; - using Function = AggregateFunctionAvg>; -}; - -template -using AggregateFuncAvg = typename Avg::Function; + const WhichDataType t(type); + return t.isInt() || t.isUInt() || t.isFloat() || t.isDecimal(); +} AggregateFunctionPtr createAggregateFunctionAvg(const std::string & name, const DataTypes & argument_types, const Array & parameters) { assertNoParameters(name, parameters); assertUnary(name, argument_types); - AggregateFunctionPtr res; - DataTypePtr data_type = argument_types[0]; - if (isDecimal(data_type)) - res.reset(createWithDecimalType(*data_type, *data_type, argument_types)); - else - res.reset(createWithNumericType(*data_type, argument_types)); + const DataTypePtr& data_type = argument_types[0]; + + if (!allowType(data_type)) + throw Exception("Illegal type " + data_type->getName() + " of argument for aggregate function " + name, + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + AggregateFunctionPtr res; + + if (isDecimal(data_type)) + res.reset(createWithDecimalType( + *data_type, argument_types, getDecimalScale(*data_type))); + else + res.reset(createWithNumericType(*data_type, argument_types)); - if (!res) - throw Exception("Illegal type " + argument_types[0]->getName() + " of argument for aggregate function " + name, - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); return res; } - } void registerAggregateFunctionAvg(AggregateFunctionFactory & factory) { factory.registerFunction("avg", createAggregateFunctionAvg, AggregateFunctionFactory::CaseInsensitive); } - } diff --git a/src/AggregateFunctions/AggregateFunctionAvg.h b/src/AggregateFunctions/AggregateFunctionAvg.h index 944d9cbfaf5..fca9df9dd98 100644 --- a/src/AggregateFunctions/AggregateFunctionAvg.h +++ b/src/AggregateFunctions/AggregateFunctionAvg.h @@ -1,78 +1,102 @@ #pragma once +#include #include #include - #include #include #include - #include +#include "Core/DecimalFunctions.h" namespace DB { -namespace ErrorCodes -{ -} +template +using DecimalOrVectorCol = std::conditional_t, ColumnDecimal, ColumnVector>; -template -struct AggregateFunctionAvgData -{ - using NumeratorType = T; - using DenominatorType = Denominator; +template constexpr bool DecimalOrExtendedInt = + IsDecimalNumber + || std::is_same_v + || std::is_same_v + || std::is_same_v + || std::is_same_v; - T numerator{0}; +/** + * Helper class to encapsulate values conversion for avg and avgWeighted. + */ +template +struct AvgFraction +{ + Numerator numerator{0}; Denominator denominator{0}; - template - ResultT NO_SANITIZE_UNDEFINED result() const + /// Allow division by zero as sometimes we need to return NaN. + /// Invoked only is either Numerator or Denominator are Decimal. 
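+    /// Illustrative note (an editor's sketch, not from the original source): by the scale rule
+    /// used below, dividing Decimal(S1) by Decimal(S2) keeps the numerator's scale S1, e.g. in SQL
+    ///     toDecimal64(1.50, 2) / toDecimal32(0.5, 1) -> 3.00 (scale 2),
+    /// which is why the quotient is converted to Float64 using num_scale.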
+ Float64 NO_SANITIZE_UNDEFINED divideIfAnyDecimal(UInt32 num_scale, UInt32 denom_scale) const { - if constexpr (std::is_floating_point_v) - if constexpr (std::numeric_limits::is_iec559) - { - if constexpr (is_big_int_v) - return static_cast(numerator) / static_cast(denominator); - else - return static_cast(numerator) / denominator; /// allow division by zero - } + if constexpr (IsDecimalNumber && IsDecimalNumber) + { + // According to the docs, num(S1) / denom(S2) would have scale S1 - if (denominator == static_cast(0)) - return static_cast(0); + if constexpr (std::is_same_v && std::is_same_v) + ///Special case as Decimal256 / Decimal128 = compile error (as Decimal128 is not parametrized by a wide + ///int), but an __int128 instead + return DecimalUtils::convertTo( + numerator / (denominator.template convertTo()), num_scale); + else + return DecimalUtils::convertTo(numerator / denominator, num_scale); + } - if constexpr (std::is_same_v) - return static_cast(numerator / static_cast(denominator)); + /// Numerator is always casted to Float64 to divide correctly if the denominator is not Float64. + Float64 num_converted; + + if constexpr (IsDecimalNumber) + num_converted = DecimalUtils::convertTo(numerator, num_scale); else - return static_cast(numerator / denominator); + num_converted = static_cast(numerator); /// all other types, including extended integral. + + std::conditional_t, + Float64, Denominator> denom_converted; + + if constexpr (IsDecimalNumber) + denom_converted = DecimalUtils::convertTo(denominator, denom_scale); + else if constexpr (DecimalOrExtendedInt) + /// no way to divide Float64 and extended integral type without an explicit cast. + denom_converted = static_cast(denominator); + else + denom_converted = denominator; /// can divide on float, no cast required. + + return num_converted / denom_converted; + } + + Float64 NO_SANITIZE_UNDEFINED divide() const + { + if constexpr (DecimalOrExtendedInt) /// if extended int + return static_cast(numerator) / static_cast(denominator); + else + return static_cast(numerator) / denominator; } }; -/// Calculates arithmetic mean of numbers. -template -class AggregateFunctionAvgBase : public IAggregateFunctionDataHelper + +/** + * @tparam Derived When deriving from this class, use the child class name as in CRTP, e.g. + * class Self : Agg. 
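+ * For instance (an illustrative restatement of the code below, not an addition to the API),
+ * AggregateFunctionAvg<T> derives as
+ *     class AggregateFunctionAvg final
+ *         : public AggregateFunctionAvgBase<AvgFieldType<T>, UInt64, AggregateFunctionAvg<T>>
+ * so the base class can call into the derived one without virtual dispatch.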
+ */ +template +class AggregateFunctionAvgBase : public + IAggregateFunctionDataHelper, Derived> { public: - using ResultType = std::conditional_t, T, Float64>; - using ResultDataType = std::conditional_t, DataTypeDecimal, DataTypeNumber>; - using ColVecType = std::conditional_t, ColumnDecimal, ColumnVector>; - using ColVecResult = std::conditional_t, ColumnDecimal, ColumnVector>; + using Fraction = AvgFraction; + using Base = IAggregateFunctionDataHelper; - /// ctor for native types - AggregateFunctionAvgBase(const DataTypes & argument_types_) : IAggregateFunctionDataHelper(argument_types_, {}), scale(0) {} + explicit AggregateFunctionAvgBase(const DataTypes & argument_types_, + UInt32 num_scale_ = 0, UInt32 denom_scale_ = 0) + : Base(argument_types_, {}), num_scale(num_scale_), denom_scale(denom_scale_) {} - /// ctor for Decimals - AggregateFunctionAvgBase(const IDataType & data_type, const DataTypes & argument_types_) - : IAggregateFunctionDataHelper(argument_types_, {}), scale(getDecimalScale(data_type)) - { - } - - DataTypePtr getReturnType() const override - { - if constexpr (IsDecimalNumber) - return std::make_shared(ResultDataType::maxPrecision(), scale); - else - return std::make_shared(); - } + DataTypePtr getReturnType() const final { return std::make_shared>(); } void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override { @@ -84,7 +108,7 @@ public: { writeBinary(this->data(place).numerator, buf); - if constexpr (std::is_unsigned_v) + if constexpr (std::is_unsigned_v) writeVarUInt(this->data(place).denominator, buf); else /// Floating point denominator type can be used writeBinary(this->data(place).denominator, buf); @@ -94,7 +118,7 @@ public: { readBinary(this->data(place).numerator, buf); - if constexpr (std::is_unsigned_v) + if constexpr (std::is_unsigned_v) readVarUInt(this->data(place).denominator, buf); else /// Floating point denominator type can be used readBinary(this->data(place).denominator, buf); @@ -102,29 +126,34 @@ public: void insertResultInto(AggregateDataPtr place, IColumn & to, Arena *) const override { - auto & column = static_cast(to); - column.getData().push_back(this->data(place).template result()); + if constexpr (IsDecimalNumber || IsDecimalNumber) + static_cast &>(to).getData().push_back( + this->data(place).divideIfAnyDecimal(num_scale, denom_scale)); + else + static_cast &>(to).getData().push_back(this->data(place).divide()); } - -protected: - UInt32 scale; +private: + UInt32 num_scale; + UInt32 denom_scale; }; -template -class AggregateFunctionAvg final : public AggregateFunctionAvgBase> +template +using AvgFieldType = std::conditional_t, + std::conditional_t, Decimal256, Decimal128>, + NearestFieldType>; + +template +class AggregateFunctionAvg final : public AggregateFunctionAvgBase, UInt64, AggregateFunctionAvg> { public: - using AggregateFunctionAvgBase>::AggregateFunctionAvgBase; + using AggregateFunctionAvgBase, UInt64, AggregateFunctionAvg>::AggregateFunctionAvgBase; - using ColVecType = std::conditional_t, ColumnDecimal, ColumnVector>; - void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena *) const override + void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena *) const final { - const auto & column = static_cast(*columns[0]); - this->data(place).numerator += column.getData()[row_num]; - this->data(place).denominator += 1; + this->data(place).numerator += static_cast &>(*columns[0]).getData()[row_num]; + ++this->data(place).denominator; } - String getName() 
const override { return "avg"; } + String getName() const final { return "avg"; } }; - } diff --git a/src/AggregateFunctions/AggregateFunctionAvgWeighted.cpp b/src/AggregateFunctions/AggregateFunctionAvgWeighted.cpp index 6722a94cdc6..983b3bf3d4c 100644 --- a/src/AggregateFunctions/AggregateFunctionAvgWeighted.cpp +++ b/src/AggregateFunctions/AggregateFunctionAvgWeighted.cpp @@ -1,3 +1,5 @@ +#include +#include #include #include #include @@ -13,47 +15,91 @@ namespace ErrorCodes namespace { - -template -struct AvgWeighted +bool allowTypes(const DataTypePtr& left, const DataTypePtr& right) noexcept { - using FieldType = std::conditional_t, - std::conditional_t, Decimal256, Decimal128>, - NearestFieldType>; - // using FieldType = std::conditional_t, Decimal128, NearestFieldType>; - using Function = AggregateFunctionAvgWeighted>; -}; + const WhichDataType l_dt(left), r_dt(right); -template -using AggregateFuncAvgWeighted = typename AvgWeighted::Function; + constexpr auto allow = [](WhichDataType t) + { + return t.isInt() || t.isUInt() || t.isFloat() || t.isDecimal(); + }; + + return allow(l_dt) && allow(r_dt); +} + +#define AT_SWITCH(LINE) \ + switch (which.idx) \ + { \ + LINE(Int8); LINE(Int16); LINE(Int32); LINE(Int64); LINE(Int128); LINE(Int256); \ + LINE(UInt8); LINE(UInt16); LINE(UInt32); LINE(UInt64); LINE(UInt128); LINE(UInt256); \ + LINE(Decimal32); LINE(Decimal64); LINE(Decimal128); LINE(Decimal256); \ + LINE(Float32); LINE(Float64); \ + default: return nullptr; \ + } + +template +static IAggregateFunction * create(const IDataType & second_type, TArgs && ... args) +{ + const WhichDataType which(second_type); + +#define LINE(Type) \ + case TypeIndex::Type: return new AggregateFunctionAvgWeighted(std::forward(args)...) + AT_SWITCH(LINE) +#undef LINE +} + +// Not using helper functions because there are no templates for binary decimal/numeric function. +template +static IAggregateFunction * create(const IDataType & first_type, const IDataType & second_type, TArgs && ... args) +{ + const WhichDataType which(first_type); + +#define LINE(Type) \ + case TypeIndex::Type: return create(second_type, std::forward(args)...) 
+    AT_SWITCH(LINE)
+#undef LINE
+}
 
 AggregateFunctionPtr createAggregateFunctionAvgWeighted(const std::string & name, const DataTypes & argument_types, const Array & parameters)
 {
     assertNoParameters(name, parameters);
     assertBinary(name, argument_types);
 
-    AggregateFunctionPtr res;
     const auto data_type = static_cast<const DataTypePtr &>(argument_types[0]);
     const auto data_type_weight = static_cast<const DataTypePtr &>(argument_types[1]);
-    if (!data_type->equals(*data_type_weight))
-        throw Exception("Different types " + data_type->getName() + " and " + data_type_weight->getName() + " of arguments for aggregate function " + name,
-            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
-    if (isDecimal(data_type))
-        res.reset(createWithDecimalType<AggregateFunctionAvgWeighted>(*data_type, *data_type, argument_types));
+
+    if (!allowTypes(data_type, data_type_weight))
+        throw Exception(
+            "Types " + data_type->getName() +
+            " and " + data_type_weight->getName() +
+            " are non-conforming as arguments for aggregate function " + name,
+            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+    AggregateFunctionPtr ptr;
+
+    const bool left_decimal = isDecimal(data_type);
+    const bool right_decimal = isDecimal(data_type_weight);
+
+    if (left_decimal && right_decimal)
+        ptr.reset(create(*data_type, *data_type_weight,
+            argument_types,
+            getDecimalScale(*data_type), getDecimalScale(*data_type_weight)));
+    else if (left_decimal)
+        ptr.reset(create(*data_type, *data_type_weight, argument_types,
+            getDecimalScale(*data_type)));
+    else if (right_decimal)
+        ptr.reset(create(*data_type, *data_type_weight, argument_types,
+            // numerator is not decimal, so its scale is 0
+            0, getDecimalScale(*data_type_weight)));
     else
-        res.reset(createWithNumericType<AggregateFunctionAvgWeighted>(*data_type, argument_types));
+        ptr.reset(create(*data_type, *data_type_weight, argument_types));
 
-    if (!res)
-        throw Exception("Illegal type " + data_type->getName() + " of argument for aggregate function " + name,
-            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
-    return res;
+    return ptr;
 }
-
 }
 
 void registerAggregateFunctionAvgWeighted(AggregateFunctionFactory & factory)
 {
     factory.registerFunction("avgWeighted", createAggregateFunctionAvgWeighted, AggregateFunctionFactory::CaseSensitive);
 }
-
 }
diff --git a/src/AggregateFunctions/AggregateFunctionAvgWeighted.h b/src/AggregateFunctions/AggregateFunctionAvgWeighted.h
index 8eb619585c7..6538367ad93 100644
--- a/src/AggregateFunctions/AggregateFunctionAvgWeighted.h
+++ b/src/AggregateFunctions/AggregateFunctionAvgWeighted.h
@@ -1,26 +1,44 @@
 #pragma once
 
+#include <type_traits>
 #include <AggregateFunctions/AggregateFunctionAvg.h>
 
 namespace DB
 {
-template <typename T>
-class AggregateFunctionAvgWeighted final : public AggregateFunctionAvgBase<AvgFieldType<T>, AggregateFunctionAvgWeighted<T>>
+template <typename T>
+using AvgWeightedFieldType = std::conditional_t<IsDecimalNumber<T>,
+    std::conditional_t<std::is_same_v<T, Decimal256>, Decimal256, Decimal128>,
+    std::conditional_t<DecimalOrExtendedInt<T>,
+        Float64, // no way to do UInt128 * UInt128, better cast to Float64
+        NearestFieldType<T>>>;
+
+template <typename T, typename U>
+using MaxFieldType = std::conditional_t<(sizeof(AvgWeightedFieldType<T>) > sizeof(AvgWeightedFieldType<U>)),
+    AvgWeightedFieldType<T>, AvgWeightedFieldType<U>>;
+
+template <typename Value, typename Weight>
+class AggregateFunctionAvgWeighted final :
+    public AggregateFunctionAvgBase<
+        MaxFieldType<Value, Weight>, AvgWeightedFieldType<Weight>, AggregateFunctionAvgWeighted<Value, Weight>>
 {
 public:
-    using AggregateFunctionAvgBase<AvgFieldType<T>, AggregateFunctionAvgWeighted<T>>::AggregateFunctionAvgBase;
+    using Base = AggregateFunctionAvgBase<
+        MaxFieldType<Value, Weight>, AvgWeightedFieldType<Weight>, AggregateFunctionAvgWeighted<Value, Weight>>;
+    using Base::Base;
+
+    using ValueT = MaxFieldType<Value, Weight>;
 
-    using ColVecType = std::conditional_t<IsDecimalNumber<T>, ColumnDecimal<T>, ColumnVector<T>>;
     void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena *) const override
     {
-        const auto & values = static_cast<const ColVecType &>(*columns[0]);
-        const auto & weights = static_cast<const ColVecType &>(*columns[1]);
+        const auto& weights = static_cast<const DecimalOrVectorCol<Weight> &>(*columns[1]);
 
-        this->data(place).numerator += static_cast<AvgFieldType<T>>(values.getData()[row_num]) * weights.getData()[row_num];
-        this->data(place).denominator += weights.getData()[row_num];
+        this->data(place).numerator += static_cast<ValueT>(
+            static_cast<const DecimalOrVectorCol<Value> &>(*columns[0]).getData()[row_num]) *
+            static_cast<ValueT>(weights.getData()[row_num]);
+
+        this->data(place).denominator += static_cast<AvgWeightedFieldType<Weight>>(weights.getData()[row_num]);
     }
 
     String getName() const override { return "avgWeighted"; }
 };
-
 }
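The rewritten creator above dispatches on which of the two argument types is a Decimal and forwards each operand's scale to the implementation (a non-decimal operand contributes scale 0). A minimal usage sketch, with hypothetical table and column names that are not part of this patch:

    SELECT avgWeighted(price, quantity) FROM sales;                  -- plain numeric value and weight
    SELECT avgWeighted(toDecimal64(price, 2), quantity) FROM sales;  -- Decimal value: its scale (2) is forwarded; the integer weight needs none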
+ *Or "the distribution F of first sample equals to the distribution G of second sample". + *Then alternative for this hypothesis (H1) is "two-sided"(F != G), "less"(F < G), "greater" (F > G). */ + enum class Alternative + { + TwoSided, + Less, + Greater + }; + + /// The behaviour equals to the similar function from scipy. + /// https://github.com/scipy/scipy/blob/ab9e9f17e0b7b2d618c4d4d8402cd4c0c200d6c0/scipy/stats/stats.py#L6978 + std::pair getResult(Alternative alternative, bool continuity_correction) + { + ConcatenatedSamples both(this->x, this->y); + RanksArray ranks; + Float64 tie_correction; + + /// Compute ranks according to both samples. + std::tie(ranks, tie_correction) = computeRanksAndTieCorrection(both); + + const Float64 n1 = this->size_x; + const Float64 n2 = this->size_y; + + Float64 r1 = 0; + for (size_t i = 0; i < n1; ++i) + r1 += ranks[i]; + + const Float64 u1 = n1 * n2 + (n1 * (n1 + 1.)) / 2. - r1; + const Float64 u2 = n1 * n2 - u1; + + /// The distribution of U-statistic under null hypothesis H0 is symmetric with respect to meanrank. + const Float64 meanrank = n1 * n2 /2. + 0.5 * continuity_correction; + const Float64 sd = std::sqrt(tie_correction * n1 * n2 * (n1 + n2 + 1) / 12.0); + + Float64 u = 0; + if (alternative == Alternative::TwoSided) + /// There is no difference which u_i to take as u, because z will be differ only in sign and we take std::abs() from it. + u = std::max(u1, u2); + else if (alternative == Alternative::Less) + u = u1; + else if (alternative == Alternative::Greater) + u = u2; + + Float64 z = (u - meanrank) / sd; + if (alternative == Alternative::TwoSided) + z = std::abs(z); + + /// In fact cdf is a probability function, so it is intergral of density from (-inf, z]. + /// But since standard normal distribution is symmetric, cdf(0) = 0.5 and we have to compute integral from [0, z]. + const Float64 cdf = integrateSimpson(0, z, [] (Float64 t) { return std::pow(M_E, -0.5 * t * t) / std::sqrt(2 * M_PI);}); + + Float64 p_value = 0; + if (alternative == Alternative::TwoSided) + p_value = 1 - 2 * cdf; + else + p_value = 0.5 - cdf; + + return {u2, p_value}; + } + +private: + using Sample = typename StatisticalSample::SampleX; + + /// We need to compute ranks according to all samples. Use this class to avoid extra copy and memory allocation. 
+    class ConcatenatedSamples
+    {
+    public:
+        ConcatenatedSamples(const Sample & first_, const Sample & second_)
+            : first(first_), second(second_) {}
+
+        const Float64 & operator[](size_t ind) const
+        {
+            if (ind < first.size())
+                return first[ind];
+            return second[ind - first.size()];
+        }
+
+        size_t size() const
+        {
+            return first.size() + second.size();
+        }
+
+    private:
+        const Sample & first;
+        const Sample & second;
+    };
+};
+
+class AggregateFunctionMannWhitney final:
+    public IAggregateFunctionDataHelper<MannWhitneyData, AggregateFunctionMannWhitney>
+{
+private:
+    using Alternative = typename MannWhitneyData::Alternative;
+    Alternative alternative;
+    bool continuity_correction{true};
+
+public:
+    explicit AggregateFunctionMannWhitney(const DataTypes & arguments, const Array & params)
+        :IAggregateFunctionDataHelper<MannWhitneyData, AggregateFunctionMannWhitney> ({arguments}, {})
+    {
+        if (params.size() > 2)
+            throw Exception("Aggregate function " + getName() + " requires at most two parameters", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
+
+        if (params.empty())
+        {
+            alternative = Alternative::TwoSided;
+            return;
+        }
+
+        if (params[0].getType() != Field::Types::String)
+            throw Exception("Aggregate function " + getName() + " requires the first parameter to be a String", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        auto param = params[0].get<String>();
+        if (param == "two-sided")
+            alternative = Alternative::TwoSided;
+        else if (param == "less")
+            alternative = Alternative::Less;
+        else if (param == "greater")
+            alternative = Alternative::Greater;
+        else
+            throw Exception("Unknown parameter in aggregate function " + getName() +
+                ". It must be one of: 'two-sided', 'less', 'greater'", ErrorCodes::BAD_ARGUMENTS);
+
+        if (params.size() != 2)
+            return;
+
+        if (params[1].getType() != Field::Types::UInt64)
+            throw Exception("Aggregate function " + getName() + " requires the second parameter to be a UInt64", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        continuity_correction = static_cast<bool>(params[1].get<UInt64>());
+    }
+
+    String getName() const override
+    {
+        return "mannWhitneyUTest";
+    }
+
+    DataTypePtr getReturnType() const override
+    {
+        DataTypes types
+        {
+            std::make_shared<DataTypeNumber<Float64>>(),
+            std::make_shared<DataTypeNumber<Float64>>(),
+        };
+
+        Strings names
+        {
+            "u_statistic",
+            "p_value"
+        };
+
+        return std::make_shared<DataTypeTuple>(
+            std::move(types),
+            std::move(names)
+        );
+    }
+
+    void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena * arena) const override
+    {
+        Float64 value = columns[0]->getFloat64(row_num);
+        UInt8 is_second = columns[1]->getUInt(row_num);
+
+        if (is_second)
+            this->data(place).addY(value, arena);
+        else
+            this->data(place).addX(value, arena);
+    }
+
+    void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override
+    {
+        auto & a = this->data(place);
+        auto & b = this->data(rhs);
+
+        a.merge(b, arena);
+    }
+
+    void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override
+    {
+        this->data(place).write(buf);
+    }
+
+    void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena * arena) const override
+    {
+        this->data(place).read(buf, arena);
+    }
+
+    void insertResultInto(AggregateDataPtr place, IColumn & to, Arena *) const override
+    {
+        if (!this->data(place).size_x || !this->data(place).size_y)
+            throw Exception("Aggregate function " + getName() + " requires both samples to be non-empty", ErrorCodes::BAD_ARGUMENTS);
+
+        auto [u_statistic, p_value] = this->data(place).getResult(alternative, continuity_correction);
+
+        /// Clamp, because the p-value is a probability.
+        p_value = std::min(1.0, std::max(0.0, p_value));
+
+        auto & column_tuple = assert_cast<ColumnTuple &>(to);
+        auto & column_stat = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(0));
+        auto & column_value = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(1));
+
+        column_stat.getData().push_back(u_statistic);
+        column_value.getData().push_back(p_value);
+    }
+
+};
+
+}
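The new function takes an optional alternative parameter ('two-sided' by default, 'less', or 'greater') and an optional continuity-correction flag, plus two arguments: the sample value and a 0/1 index selecting the sample. A usage sketch, with a hypothetical table and columns that are not part of this patch:

    SELECT mannWhitneyUTest('two-sided', 1)(value, sample_index) FROM measurements;
    -- returns the tuple (u_statistic, p_value)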
diff --git a/src/AggregateFunctions/AggregateFunctionRankCorrelation.cpp b/src/AggregateFunctions/AggregateFunctionRankCorrelation.cpp
index 796ff028424..87fc24f8f98 100644
--- a/src/AggregateFunctions/AggregateFunctionRankCorrelation.cpp
+++ b/src/AggregateFunctions/AggregateFunctionRankCorrelation.cpp
@@ -21,23 +21,10 @@ AggregateFunctionPtr createAggregateFunctionRankCorrelation(const std::string &
     assertBinary(name, argument_types);
     assertNoParameters(name, parameters);
 
-    AggregateFunctionPtr res;
-
-    if (isDecimal(argument_types[0]) || isDecimal(argument_types[1]))
-    {
+    if (!isNumber(argument_types[0]) || !isNumber(argument_types[1]))
         throw Exception("Aggregate function " + name + " only supports numerical types", ErrorCodes::NOT_IMPLEMENTED);
-    }
-    else
-    {
-        res.reset(createWithTwoNumericTypes<AggregateFunctionRankCorrelation>(*argument_types[0], *argument_types[1], argument_types));
-    }
 
-    if (!res)
-    {
-        throw Exception("Aggregate function " + name + " only supports numerical types", ErrorCodes::NOT_IMPLEMENTED);
-    }
-
-    return res;
+    return std::make_shared<AggregateFunctionRankCorrelation>(argument_types);
 }
 
 }
diff --git a/src/AggregateFunctions/AggregateFunctionRankCorrelation.h b/src/AggregateFunctions/AggregateFunctionRankCorrelation.h
index 75592cf5c9b..bdec03d5975 100644
--- a/src/AggregateFunctions/AggregateFunctionRankCorrelation.h
+++ b/src/AggregateFunctions/AggregateFunctionRankCorrelation.h
@@ -1,73 +1,56 @@
 #pragma once
 
 #include
+#include
 #include
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
 #include
 #include
-#include
-#include
-#include
-
 #include
 #include
-#include
-
-
 namespace DB
 {
 
-template