Merge branch 'master' into issue-16775

Alexey Milovidov 2021-05-24 05:56:57 +03:00
commit 6f70feed2f
101 changed files with 2622 additions and 334 deletions

View File

@@ -36,7 +36,7 @@ option(FAIL_ON_UNSUPPORTED_OPTIONS_COMBINATION
if(FAIL_ON_UNSUPPORTED_OPTIONS_COMBINATION)
    set(RECONFIGURE_MESSAGE_LEVEL FATAL_ERROR)
else()
-    set(RECONFIGURE_MESSAGE_LEVEL STATUS)
+    set(RECONFIGURE_MESSAGE_LEVEL WARNING)
endif()
enable_language(C CXX ASM)
@@ -504,7 +504,6 @@ include (cmake/find/libuv.cmake) # for amqpcpp and cassandra
include (cmake/find/amqpcpp.cmake)
include (cmake/find/capnp.cmake)
include (cmake/find/llvm.cmake)
-include (cmake/find/termcap.cmake) # for external static llvm
include (cmake/find/h3.cmake)
include (cmake/find/libxml2.cmake)
include (cmake/find/brotli.cmake)

View File

@@ -1,98 +1,31 @@
-if (APPLE OR SPLIT_SHARED_LIBRARIES OR NOT ARCH_AMD64)
+if (APPLE OR SPLIT_SHARED_LIBRARIES OR NOT ARCH_AMD64 OR SANITIZE STREQUAL "undefined")
    set (ENABLE_EMBEDDED_COMPILER OFF CACHE INTERNAL "")
endif()

option (ENABLE_EMBEDDED_COMPILER "Enable support for 'compile_expressions' option for query execution" ON)

-# Broken in macos. TODO: update clang, re-test, enable on Apple
-if (ENABLE_EMBEDDED_COMPILER AND NOT SPLIT_SHARED_LIBRARIES AND ARCH_AMD64 AND NOT (SANITIZE STREQUAL "undefined"))
-    option (USE_INTERNAL_LLVM_LIBRARY "Use bundled or system LLVM library." ${NOT_UNBUNDLED})
-endif()

if (NOT ENABLE_EMBEDDED_COMPILER)
-    if(USE_INTERNAL_LLVM_LIBRARY)
-        message (${RECONFIGURE_MESSAGE_LEVEL} "Cannot use internal LLVM library with ENABLE_EMBEDDED_COMPILER=OFF")
-    endif()
+    set (USE_EMBEDDED_COMPILER 0)
    return()
endif()

if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/llvm/llvm/CMakeLists.txt")
-    if (USE_INTERNAL_LLVM_LIBRARY)
-        message (WARNING "submodule contrib/llvm is missing. to fix try run: \n git submodule update --init --recursive")
-        message (${RECONFIGURE_MESSAGE_LEVEL} "Can't fidd internal LLVM library")
-    endif()
-    set (MISSING_INTERNAL_LLVM_LIBRARY 1)
+    message (${RECONFIGURE_MESSAGE_LEVEL} "submodule /contrib/llvm is missing. to fix try run: \n git submodule update --init --recursive")
endif ()

-if (NOT USE_INTERNAL_LLVM_LIBRARY)
-    set (LLVM_PATHS "/usr/local/lib/llvm" "/usr/lib/llvm")
-    foreach(llvm_v 11.1 11)
-        if (NOT LLVM_FOUND)
-            find_package (LLVM ${llvm_v} CONFIG PATHS ${LLVM_PATHS})
-        endif ()
-    endforeach ()
-
-    if (LLVM_FOUND)
-        # Remove dynamically-linked zlib and libedit from LLVM's dependencies:
-        set_target_properties(LLVMSupport PROPERTIES INTERFACE_LINK_LIBRARIES "-lpthread;LLVMDemangle;${ZLIB_LIBRARIES}")
-        set_target_properties(LLVMLineEditor PROPERTIES INTERFACE_LINK_LIBRARIES "LLVMSupport")
-
-        option(LLVM_HAS_RTTI "Enable if LLVM was build with RTTI enabled" ON)
-        set (USE_EMBEDDED_COMPILER 1)
-    else()
-        message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system LLVM")
-        set (USE_EMBEDDED_COMPILER 0)
-    endif()
-
-    if (LLVM_FOUND AND OS_LINUX AND USE_LIBCXX AND NOT FORCE_LLVM_WITH_LIBCXX)
-        message(WARNING "Option USE_INTERNAL_LLVM_LIBRARY is not set but the LLVM library from OS packages "
-            "in Linux is incompatible with libc++ ABI. LLVM Will be disabled. Force: -DFORCE_LLVM_WITH_LIBCXX=ON")
-        message (${RECONFIGURE_MESSAGE_LEVEL} "Unsupported LLVM configuration, cannot enable LLVM")
-        set (LLVM_FOUND 0)
-        set (USE_EMBEDDED_COMPILER 0)
-    endif ()
-endif()
-
-if(NOT LLVM_FOUND AND NOT MISSING_INTERNAL_LLVM_LIBRARY)
-    if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR)
-        message(WARNING "Option ENABLE_EMBEDDED_COMPILER is set but internal LLVM library cannot build if build directory is the same as source directory.")
-        set (LLVM_FOUND 0)
-        set (USE_EMBEDDED_COMPILER 0)
-    elseif (SPLIT_SHARED_LIBRARIES)
-        # llvm-tablegen cannot find shared libraries that we build. Probably can be easily fixed.
-        message(WARNING "Option USE_INTERNAL_LLVM_LIBRARY is not compatible with SPLIT_SHARED_LIBRARIES. Build of LLVM will be disabled.")
-        set (LLVM_FOUND 0)
-        set (USE_EMBEDDED_COMPILER 0)
-    elseif (NOT ARCH_AMD64)
-        # It's not supported yet, but you can help.
-        message(WARNING "Option USE_INTERNAL_LLVM_LIBRARY is only available for x86_64. Build of LLVM will be disabled.")
-        set (LLVM_FOUND 0)
-        set (USE_EMBEDDED_COMPILER 0)
-    elseif (SANITIZE STREQUAL "undefined")
-        # llvm-tblgen, that is used during LLVM build, doesn't work with UBSan.
-        message(WARNING "Option USE_INTERNAL_LLVM_LIBRARY does not work with UBSan, because 'llvm-tblgen' tool from LLVM has undefined behaviour. Build of LLVM will be disabled.")
-        set (LLVM_FOUND 0)
-        set (USE_EMBEDDED_COMPILER 0)
-    else ()
-        set (USE_INTERNAL_LLVM_LIBRARY ON)
-        set (LLVM_FOUND 1)
-        set (USE_EMBEDDED_COMPILER 1)
-        set (LLVM_VERSION "9.0.0bundled")
-        set (LLVM_INCLUDE_DIRS
-            "${ClickHouse_SOURCE_DIR}/contrib/llvm/llvm/include"
-            "${ClickHouse_BINARY_DIR}/contrib/llvm/llvm/include"
-        )
-        set (LLVM_LIBRARY_DIRS "${ClickHouse_BINARY_DIR}/contrib/llvm/llvm")
-    endif()
-endif()
-
-if (LLVM_FOUND)
-    message(STATUS "LLVM include Directory: ${LLVM_INCLUDE_DIRS}")
-    message(STATUS "LLVM library Directory: ${LLVM_LIBRARY_DIRS}")
-    message(STATUS "LLVM C++ compiler flags: ${LLVM_CXXFLAGS}")
-else()
-    message (${RECONFIGURE_MESSAGE_LEVEL} "Can't enable LLVM")
-endif()
+set (USE_EMBEDDED_COMPILER 1)
+
+set (LLVM_FOUND 1)
+set (LLVM_VERSION "12.0.0bundled")
+set (LLVM_INCLUDE_DIRS
+    "${ClickHouse_SOURCE_DIR}/contrib/llvm/llvm/include"
+    "${ClickHouse_BINARY_DIR}/contrib/llvm/llvm/include"
+)
+set (LLVM_LIBRARY_DIRS "${ClickHouse_BINARY_DIR}/contrib/llvm/llvm")
+
+message(STATUS "LLVM include Directory: ${LLVM_INCLUDE_DIRS}")
+message(STATUS "LLVM library Directory: ${LLVM_LIBRARY_DIRS}")
+message(STATUS "LLVM C++ compiler flags: ${LLVM_CXXFLAGS}")

# This list was generated by listing all LLVM libraries, compiling the binary and removing all libraries while it still compiles.
set (REQUIRED_LLVM_LIBRARIES

View File

@@ -1,17 +0,0 @@
if (ENABLE_EMBEDDED_COMPILER AND NOT USE_INTERNAL_LLVM_LIBRARY AND USE_STATIC_LIBRARIES)
find_library (TERMCAP_LIBRARY tinfo)
if (NOT TERMCAP_LIBRARY)
find_library (TERMCAP_LIBRARY ncurses)
endif()
if (NOT TERMCAP_LIBRARY)
find_library (TERMCAP_LIBRARY termcap)
endif()
if (NOT TERMCAP_LIBRARY)
message (FATAL_ERROR "Statically Linking external LLVM requires termcap")
endif()
target_link_libraries(LLVMSupport INTERFACE ${TERMCAP_LIBRARY})
message (STATUS "Using termcap: ${TERMCAP_LIBRARY}")
endif()

View File

@@ -220,11 +220,12 @@ elseif(GTEST_SRC_DIR)
    target_compile_definitions(gtest INTERFACE GTEST_HAS_POSIX_RE=0)
endif()

-if (USE_EMBEDDED_COMPILER AND USE_INTERNAL_LLVM_LIBRARY)
+if (USE_EMBEDDED_COMPILER)
    # ld: unknown option: --color-diagnostics
    if (APPLE)
        set (LINKER_SUPPORTS_COLOR_DIAGNOSTICS 0 CACHE INTERNAL "")
    endif ()
    set (LLVM_ENABLE_EH 1 CACHE INTERNAL "")
    set (LLVM_ENABLE_RTTI 1 CACHE INTERNAL "")
    set (LLVM_ENABLE_PIC 0 CACHE INTERNAL "")
@@ -239,8 +240,6 @@ if (USE_EMBEDDED_COMPILER AND USE_INTERNAL_LLVM_LIBRARY)
    set (CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD_bak})
    unset (CMAKE_CXX_STANDARD_bak)
-    target_include_directories(LLVMSupport SYSTEM BEFORE PRIVATE ${ZLIB_INCLUDE_DIR})
endif ()

if (USE_INTERNAL_LIBGSASL_LIBRARY)

contrib/llvm vendored

@@ -1 +1 @@
-Subproject commit cfaf365cf96918999d09d976ec736b4518cf5d02
+Subproject commit a7198805de67374eb3fb4c6b89797fa2d1cd7e50

View File

@@ -5,9 +5,9 @@ toc_title: Configuration Files
# Configuration Files {#configuration_files}

-ClickHouse supports multi-file configuration management. The main server configuration file is `/etc/clickhouse-server/config.xml`. Other files must be in the `/etc/clickhouse-server/config.d` directory.
+ClickHouse supports multi-file configuration management. The main server configuration file is `/etc/clickhouse-server/config.xml` or `/etc/clickhouse-server/config.yaml`. Other files must be in the `/etc/clickhouse-server/config.d` directory. Note that any configuration file can be written either in XML or YAML, but mixing the two formats in one file is not supported. For example, you can have the main configs as `config.xml` and `users.xml` and write additional files in the `config.d` and `users.d` directories in `.yaml`.

-All the configuration files should be in XML format. Also, they should have the same root element, usually `<yandex>`.
+All the configuration files should be in XML or YAML format. All XML files should have the same root element, usually `<yandex>`. As for YAML, `yandex:` should not be present; the parser will insert it automatically.

## Override {#override}
@@ -32,7 +32,7 @@ Users configuration can be splitted into separate files similar to `config.xml`
Directory name is defined as `users_config` setting without `.xml` postfix concatenated with `.d`.
Directory `users.d` is used by default, as `users_config` defaults to `users.xml`.

-## Example {#example}
+## XML example {#example}

For example, you can have separate config file for each user like this:

@@ -55,6 +55,70 @@ $ cat /etc/clickhouse-server/users.d/alice.xml
</yandex>
```
## YAML examples {#example}
Here you can see default config written in YAML: [config-example.yaml](https://github.com/ClickHouse/ClickHouse/blob/master/programs/server/config-example.yaml).
There are some differences between YAML and XML formats in terms of ClickHouse configurations. Here are some tips for writing a configuration in YAML format.
You should use a Scalar node to write a key-value pair:
``` yaml
key: value
```
To create a node containing other nodes, you should use a Map:
``` yaml
map_key:
key1: val1
key2: val2
key3: val3
```
To create a list of values or nodes assigned to one tag, you should use a Sequence:
``` yaml
seq_key:
- val1
- val2
- key1: val3
- map:
key2: val4
key3: val5
```
If you want to write an attribute for a Sequence or Map node, you should use an `@` prefix before the attribute key. Note that `@` is reserved by the YAML standard, so you should also wrap it in double quotes:
``` yaml
map:
"@attr1": value1
"@attr2": value2
key: 123
```
From that Map we will get these XML nodes:
``` xml
<map attr1="value1" attr2="value2">
<key>123</key>
</map>
```
You can also set attributes for a Sequence:
``` yaml
seq:
- "@attr1": value1
- "@attr2": value2
- 123
- abc
```
So we can get a YAML config equal to this XML one:
``` xml
<seq attr1="value1" attr2="value2">123</seq>
<seq attr1="value1" attr2="value2">abc</seq>
```
## Implementation Details {#implementation-details}

For each config file, the server also generates `file-preprocessed.xml` files when starting. These files contain all the completed substitutions and overrides, and they are intended for informational use. If ZooKeeper substitutions were used in the config files but ZooKeeper is not available on the server start, the server loads the configuration from the preprocessed file.
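
To connect the override mechanism with the YAML syntax described above, a per-user file from the `users.d` directory can also be written in YAML. The following is a minimal sketch; the file name `alice.yaml`, the profile name, and the other values are illustrative assumptions rather than settings taken from this change:

``` yaml
# /etc/clickhouse-server/users.d/alice.yaml (hypothetical)
# No top-level `yandex:` key is needed - the parser inserts it automatically.
users:
  alice:
    profile: analytics            # assumed profile name, defined elsewhere in the users.* configs
    networks:
      "@replace": replace         # attribute written with the quoted '@' prefix, as described above
      ip: '::/0'
    password: plaintext_password_placeholder
    quota: default
```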

View File

@@ -16,6 +16,9 @@ A query may simultaneously specify `PREWHERE` and `WHERE`. In this case, `PREWHE
If the `optimize_move_to_prewhere` setting is set to 0, heuristics to automatically move parts of expressions from `WHERE` to `PREWHERE` are disabled.

+!!! note "Attention"
+    The `PREWHERE` section is executed before `FINAL`, so the results of `FROM FINAL` queries may be skewed when using `PREWHERE` with fields not in the `ORDER BY` section of a table.

## Limitations {#limitations}

`PREWHERE` is only supported by tables from the `*MergeTree` family.

View File

@@ -12,6 +12,5 @@ ClickHouse supports special functions for
ClickHouse supports:

-- [Built-in dictionaries](internal-dicts.md#internal_dicts) with a specific [set of functions](../../sql-reference/dictionaries/external-dictionaries/index.md).
+- [Built-in dictionaries](internal-dicts.md#internal_dicts) with a specific [set of functions](../../sql-reference/functions/ext-dict-functions.md).
-- [Pluggable (external) dictionaries](external-dictionaries/external-dicts.md#dicts-external-dicts) with a [set of functions](../../sql-reference/dictionaries/external-dictionaries/index.md).
+- [Pluggable (external) dictionaries](external-dictionaries/external-dicts.md#dicts-external-dicts) with a [set of functions](../../sql-reference/functions/ext-dict-functions.md).

View File

@@ -16,6 +16,9 @@ Prewhere is an optimization for more efficient
If the `optimize_move_to_prewhere` setting is 0, the heuristic that automatically moves parts of expressions from `WHERE` to `PREWHERE` is disabled.

+!!! note "Attention"
+    The `PREWHERE` section is executed before `FINAL`, so the results of `FROM FINAL` queries may be skewed when using `PREWHERE` with fields that are not in the table's `ORDER BY` section.

## Limitations {#limitations}

`PREWHERE` is only supported by table engines from the `*MergeTree` family.

View File

@@ -8,15 +8,15 @@ toc_title: "\u5BFC\u8A00"
# Dictionaries {#dictionaries}

-A dictionary is a mapping (`key -> attributes`) that serves as a convenient reference list of various kinds.
+A dictionary is a mapping (`键 -> 属性`) that serves as a convenient reference list of various kinds.

-ClickHouse supports special functions for dictionaries that can be used in queries. It is easier and more effective to use dictionaries with functions than a `JOIN` with reference tables.
+ClickHouse supports some special functions for using dictionaries in queries. Using dictionaries with functions is simpler and more effective than combining `JOIN` operations with reference tables.

[NULL](../../sql-reference/syntax.md#null-literal) values cannot be stored in a dictionary.

ClickHouse supports:

-- [Built-in dictionaries](internal-dicts.md#internal_dicts) with a specific [set of functions](../../sql-reference/functions/ym-dict-functions.md).
+- [Built-in dictionaries](internal-dicts.md#internal_dicts), which have a specific [set of functions](../../sql-reference/functions/ym-dict-functions.md).
-- [Plug-in (external) dictionaries](external-dictionaries/external-dicts.md#dicts-external-dicts) with a [set of functions](../../sql-reference/functions/ext-dict-functions.md).
+- [Plug-in (external) dictionaries](external-dictionaries/external-dicts.md#dicts-external-dicts), which have a [set of functions](../../sql-reference/functions/ext-dict-functions.md).

[Original article](https://clickhouse.tech/docs/en/query_language/dicts/) <!--hide-->

View File

@@ -49,6 +49,12 @@ option (ENABLE_CLICKHOUSE_GIT_IMPORT "A tool to analyze Git repositories"
option (ENABLE_CLICKHOUSE_KEEPER "ClickHouse alternative to ZooKeeper" ${ENABLE_CLICKHOUSE_ALL})

+if (NOT USE_NURAFT)
+    # RECONFIGURE_MESSAGE_LEVEL should not be used here,
+    # since USE_NURAFT is set to OFF for FreeBSD and Darwin.
+    message (STATUS "clickhouse-keeper will not be built (lack of NuRaft)")
+    set(ENABLE_CLICKHOUSE_KEEPER OFF)
+endif()

if (CLICKHOUSE_SPLIT_BINARY)
    option(ENABLE_CLICKHOUSE_INSTALL "Install ClickHouse without .deb/.rpm/.tgz packages (having the binary only)" OFF)
@@ -259,7 +265,10 @@ add_subdirectory (obfuscator)
add_subdirectory (install)
add_subdirectory (git-import)
add_subdirectory (bash-completion)
-add_subdirectory (keeper)
+if (ENABLE_CLICKHOUSE_KEEPER)
+    add_subdirectory (keeper)
+endif()

if (ENABLE_CLICKHOUSE_ODBC_BRIDGE)
    add_subdirectory (odbc-bridge)
@@ -278,7 +287,18 @@ if (CLICKHOUSE_ONE_SHARED)
endif()

if (CLICKHOUSE_SPLIT_BINARY)
-    set (CLICKHOUSE_ALL_TARGETS clickhouse-server clickhouse-client clickhouse-local clickhouse-benchmark clickhouse-extract-from-config clickhouse-compressor clickhouse-format clickhouse-obfuscator clickhouse-git-import clickhouse-copier clickhouse-keeper)
+    set (CLICKHOUSE_ALL_TARGETS
+        clickhouse-server
+        clickhouse-client
+        clickhouse-local
+        clickhouse-benchmark
+        clickhouse-extract-from-config
+        clickhouse-compressor
+        clickhouse-format
+        clickhouse-obfuscator
+        clickhouse-git-import
+        clickhouse-copier
+    )

    if (ENABLE_CLICKHOUSE_ODBC_BRIDGE)
        list (APPEND CLICKHOUSE_ALL_TARGETS clickhouse-odbc-bridge)
@@ -288,6 +308,10 @@ if (CLICKHOUSE_SPLIT_BINARY)
        list (APPEND CLICKHOUSE_ALL_TARGETS clickhouse-library-bridge)
    endif ()

+    if (ENABLE_CLICKHOUSE_KEEPER)
+        list (APPEND CLICKHOUSE_ALL_TARGETS clickhouse-keeper)
+    endif ()

    set_target_properties(${CLICKHOUSE_ALL_TARGETS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ..)
    add_custom_target (clickhouse-bundle ALL DEPENDS ${CLICKHOUSE_ALL_TARGETS})

View File

@@ -30,9 +30,7 @@
# include <Poco/Net/SecureServerSocket.h>
#endif

-#if USE_NURAFT
-# include <Server/KeeperTCPHandlerFactory.h>
-#endif
+#include <Server/KeeperTCPHandlerFactory.h>

#if defined(OS_LINUX)
# include <unistd.h>
@@ -357,7 +355,6 @@ int Keeper::main(const std::vector<std::string> & /*args*/)
auto servers = std::make_shared<std::vector<ProtocolServerAdapter>>();

-#if USE_NURAFT
/// Initialize test keeper RAFT. Do nothing if no nu_keeper_server in config.
global_context->initializeKeeperStorageDispatcher();
for (const auto & listen_host : listen_hosts)
@@ -398,9 +395,6 @@ int Keeper::main(const std::vector<std::string> & /*args*/)
#endif
});
}
-#else
-throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "ClickHouse keeper built without NuRaft library. Cannot use coordination.");
-#endif

for (auto & server : *servers)
server.start();

View File

@@ -0,0 +1,945 @@
# NOTE: User and query level settings are set up in "users.xml" file.
# If you have accidentally specified user-level settings here, server won't start.
# You can either move the settings to the right place inside "users.xml" file
# or add skip_check_for_incorrect_settings: 1 here.
logger:
# Possible levels [1]:
# - none (turns off logging)
# - fatal
# - critical
# - error
# - warning
# - notice
# - information
# - debug
# - trace
# [1]: https://github.com/pocoproject/poco/blob/poco-1.9.4-release/Foundation/include/Poco/Logger.h#L105-L114
level: trace
log: /var/log/clickhouse-server/clickhouse-server.log
errorlog: /var/log/clickhouse-server/clickhouse-server.err.log
# Rotation policy
# See https://github.com/pocoproject/poco/blob/poco-1.9.4-release/Foundation/include/Poco/FileChannel.h#L54-L85
size: 1000M
count: 10
# console: 1
# Default behavior is autodetection (log to console if not daemon mode and is tty)
# Per level overrides (legacy):
# For example to suppress logging of the ConfigReloader you can use:
# NOTE: levels.logger is reserved, see below.
# levels:
# ConfigReloader: none
# Per level overrides:
# For example to suppress logging of the RBAC for default user you can use:
# (But please note that the logger name may be changed from version to version, even after a minor upgrade)
# levels:
# - logger:
# name: 'ContextAccess (default)'
# level: none
# - logger:
# name: 'DatabaseOrdinary (test)'
# level: none
# It is the name that will be shown in the clickhouse-client.
# By default, anything with "production" will be highlighted in red in query prompt.
# display_name: production
# Port for HTTP API. See also 'https_port' for secure connections.
# This interface is also used by ODBC and JDBC drivers (DataGrip, Dbeaver, ...)
# and by most of web interfaces (embedded UI, Grafana, Redash, ...).
http_port: 8123
# Port for interaction by native protocol with:
# - clickhouse-client and other native ClickHouse tools (clickhouse-benchmark, clickhouse-copier);
# - clickhouse-server with other clickhouse-servers for distributed query processing;
# - ClickHouse drivers and applications supporting native protocol
# (this protocol is also informally called as "the TCP protocol");
# See also 'tcp_port_secure' for secure connections.
tcp_port: 9000
# Compatibility with MySQL protocol.
# ClickHouse will pretend to be MySQL for applications connecting to this port.
mysql_port: 9004
# Compatibility with PostgreSQL protocol.
# ClickHouse will pretend to be PostgreSQL for applications connecting to this port.
postgresql_port: 9005
# HTTP API with TLS (HTTPS).
# You have to configure certificate to enable this interface.
# See the openSSL section below.
# https_port: 8443
# Native interface with TLS.
# You have to configure certificate to enable this interface.
# See the openSSL section below.
# tcp_port_secure: 9440
# Native interface wrapped with PROXYv1 protocol
# PROXYv1 header sent for every connection.
# ClickHouse will extract information about proxy-forwarded client address from the header.
# tcp_with_proxy_port: 9011
# Port for communication between replicas. Used for data exchange.
# It provides low-level data access between servers.
# This port should not be accessible from untrusted networks.
# See also 'interserver_http_credentials'.
# Data transferred over connections to this port should not go through untrusted networks.
# See also 'interserver_https_port'.
interserver_http_port: 9009
# Port for communication between replicas with TLS.
# You have to configure certificate to enable this interface.
# See the openSSL section below.
# See also 'interserver_http_credentials'.
# interserver_https_port: 9010
# Hostname that is used by other replicas to request this server.
# If not specified, then it is determined analogously to the 'hostname -f' command.
# This setting could be used to switch replication to another network interface
# (the server may be connected to multiple networks via multiple addresses)
# interserver_http_host: example.yandex.ru
# You can specify credentials for authentication between replicas.
# This is required when interserver_https_port is accessible from untrusted networks,
# and also recommended to avoid SSRF attacks from possibly compromised services in your network.
# interserver_http_credentials:
# user: interserver
# password: ''
# Listen specified address.
# Use :: (wildcard IPv6 address), if you want to accept connections both with IPv4 and IPv6 from everywhere.
# Notes:
# If you open connections from wildcard address, make sure that at least one of the following measures applied:
# - server is protected by firewall and not accessible from untrusted networks;
# - all users are restricted to subset of network addresses (see users.xml);
# - all users have strong passwords, only secure (TLS) interfaces are accessible, or connections are only made via TLS interfaces.
# - users without password have readonly access.
# See also: https://www.shodan.io/search?query=clickhouse
# listen_host: '::'
# Same for hosts without support for IPv6:
# listen_host: 0.0.0.0
# Default values - try listen localhost on IPv4 and IPv6.
# listen_host: '::1'
# listen_host: 127.0.0.1
# Don't exit if IPv6 or IPv4 networks are unavailable while trying to listen.
# listen_try: 0
# Allow multiple servers to listen on the same address:port. This is not recommended.
# listen_reuse_port: 0
# listen_backlog: 64
max_connections: 4096
# For 'Connection: keep-alive' in HTTP 1.1
keep_alive_timeout: 3
# gRPC protocol (see src/Server/grpc_protos/clickhouse_grpc.proto for the API)
# grpc_port: 9100
grpc:
enable_ssl: false
# The following two files are used only if enable_ssl=1
ssl_cert_file: /path/to/ssl_cert_file
ssl_key_file: /path/to/ssl_key_file
# Whether server will request client for a certificate
ssl_require_client_auth: false
# The following file is used only if ssl_require_client_auth=1
ssl_ca_cert_file: /path/to/ssl_ca_cert_file
# Default compression algorithm (applied if client doesn't specify another algorithm).
# Supported algorithms: none, deflate, gzip, stream_gzip
compression: deflate
# Default compression level (applied if client doesn't specify another level).
# Supported levels: none, low, medium, high
compression_level: medium
# Send/receive message size limits in bytes. -1 means unlimited
max_send_message_size: -1
max_receive_message_size: -1
# Enable if you want very detailed logs
verbose_logs: false
# Used with https_port and tcp_port_secure. Full ssl options list: https://github.com/ClickHouse-Extras/poco/blob/master/NetSSL_OpenSSL/include/Poco/Net/SSLManager.h#L71
openSSL:
server:
# Used for https server AND secure tcp port
# openssl req -subj "/CN=localhost" -new -newkey rsa:2048 -days 365 -nodes -x509 -keyout /etc/clickhouse-server/server.key -out /etc/clickhouse-server/server.crt
certificateFile: /etc/clickhouse-server/server.crt
privateKeyFile: /etc/clickhouse-server/server.key
# dhparams are optional. You can delete the dhParamsFile: element.
# To generate dhparams, use the following command:
# openssl dhparam -out /etc/clickhouse-server/dhparam.pem 4096
# Only file format with BEGIN DH PARAMETERS is supported.
dhParamsFile: /etc/clickhouse-server/dhparam.pem
verificationMode: none
loadDefaultCAFile: true
cacheSessions: true
disableProtocols: 'sslv2,sslv3'
preferServerCiphers: true
client:
# Used for connecting to https dictionary source and secured Zookeeper communication
loadDefaultCAFile: true
cacheSessions: true
disableProtocols: 'sslv2,sslv3'
preferServerCiphers: true
# Use for self-signed: verificationMode: none
invalidCertificateHandler:
# Use for self-signed: name: AcceptCertificateHandler
name: RejectCertificateHandler
# Default root page on http[s] server. For example load UI from https://tabix.io/ when opening http://localhost:8123
# http_server_default_response: |-
# <html ng-app="SMI2"><head><base href="http://ui.tabix.io/"></head><body><div ui-view="" class="content-ui"></div><script src="http://loader.tabix.io/master.js"></script></body></html>
# Maximum number of concurrent queries.
max_concurrent_queries: 100
# Maximum memory usage (resident set size) for server process.
# Zero value or unset means default. Default is "max_server_memory_usage_to_ram_ratio" of available physical RAM.
# If the value is larger than "max_server_memory_usage_to_ram_ratio" of available physical RAM, it will be cut down.
# The constraint is checked on query execution time.
# If a query tries to allocate memory and the current memory usage plus allocation is greater
# than specified threshold, exception will be thrown.
# It is not practical to set this constraint to small values like just a few gigabytes,
# because memory allocator will keep this amount of memory in caches and the server will deny service of queries.
max_server_memory_usage: 0
# Maximum number of threads in the Global thread pool.
# This will default to a maximum of 10000 threads if not specified.
# This setting will be useful in scenarios where there are a large number
# of distributed queries that are running concurrently but are idling most
# of the time, in which case a higher number of threads might be required.
max_thread_pool_size: 10000
# On memory constrained environments you may have to set this to value larger than 1.
max_server_memory_usage_to_ram_ratio: 0.9
# Simple server-wide memory profiler. Collect a stack trace at every peak allocation step (in bytes).
# Data will be stored in system.trace_log table with query_id = empty string.
# Zero means disabled.
total_memory_profiler_step: 4194304
# Collect random allocations and deallocations and write them into system.trace_log with 'MemorySample' trace_type.
# The probability is for every alloc/free regardless to the size of the allocation.
# Note that sampling happens only when the amount of untracked memory exceeds the untracked memory limit,
# which is 4 MiB by default but can be lowered if 'total_memory_profiler_step' is lowered.
# You may want to set 'total_memory_profiler_step' to 1 for extra fine grained sampling.
total_memory_tracker_sample_probability: 0
# Set limit on number of open files (default: maximum). This setting makes sense on Mac OS X because getrlimit() fails to retrieve
# correct maximum value.
# max_open_files: 262144
# Size of cache of uncompressed blocks of data, used in tables of MergeTree family.
# In bytes. Cache is single for server. Memory is allocated only on demand.
# Cache is used when 'use_uncompressed_cache' user setting turned on (off by default).
# Uncompressed cache is advantageous only for very short queries and in rare cases.
# Note: uncompressed cache can be pointless for lz4, because memory bandwidth
# is slower than multi-core decompression on some server configurations.
# Enabling it can sometimes paradoxically make queries slower.
uncompressed_cache_size: 8589934592
# Approximate size of mark cache, used in tables of MergeTree family.
# In bytes. Cache is single for server. Memory is allocated only on demand.
# You should not lower this value.
mark_cache_size: 5368709120
# If you enable the `min_bytes_to_use_mmap_io` setting,
# the data in MergeTree tables can be read with mmap to avoid copying from kernel to userspace.
# It makes sense only for large files and helps only if data reside in page cache.
# To avoid frequent open/mmap/munmap/close calls (which are very expensive due to consequent page faults)
# and to reuse mappings from several threads and queries,
# the cache of mapped files is maintained. Its size is the number of mapped regions (usually equal to the number of mapped files).
# The amount of data in mapped files can be monitored
# in system.metrics, system.metric_log by the MMappedFiles, MMappedFileBytes metrics
# and in system.asynchronous_metrics, system.asynchronous_metrics_log by the MMapCacheCells metric,
# and also in system.events, system.processes, system.query_log, system.query_thread_log by the
# CreatedReadBufferMMap, CreatedReadBufferMMapFailed, MMappedFileCacheHits, MMappedFileCacheMisses events.
# Note that the amount of data in mapped files does not consume memory directly and is not accounted
# in query or server memory usage - because this memory can be discarded similar to OS page cache.
# The cache is dropped (the files are closed) automatically on removal of old parts in MergeTree,
# also it can be dropped manually by the SYSTEM DROP MMAP CACHE query.
mmap_cache_size: 1000
# Cache size for compiled expressions.
compiled_expression_cache_size: 1073741824
# Path to data directory, with trailing slash.
path: /var/lib/clickhouse/
# Path to temporary data for processing hard queries.
tmp_path: /var/lib/clickhouse/tmp/
# Policy from the <storage_configuration> for the temporary files.
# If not set <tmp_path> is used, otherwise <tmp_path> is ignored.
# Notes:
# - move_factor is ignored
# - keep_free_space_bytes is ignored
# - max_data_part_size_bytes is ignored
# - you must have exactly one volume in that policy
# tmp_policy: tmp
# Directory with user provided files that are accessible by 'file' table function.
user_files_path: /var/lib/clickhouse/user_files/
# LDAP server definitions.
ldap_servers: ''
# List LDAP servers with their connection parameters here to later 1) use them as authenticators for dedicated local users,
# who have 'ldap' authentication mechanism specified instead of 'password', or to 2) use them as remote user directories.
# Parameters:
# host - LDAP server hostname or IP, this parameter is mandatory and cannot be empty.
# port - LDAP server port, default is 636 if enable_tls is set to true, 389 otherwise.
# bind_dn - template used to construct the DN to bind to.
# The resulting DN will be constructed by replacing all '{user_name}' substrings of the template with the actual
# user name during each authentication attempt.
# user_dn_detection - section with LDAP search parameters for detecting the actual user DN of the bound user.
# This is mainly used in search filters for further role mapping when the server is Active Directory. The
# resulting user DN will be used when replacing '{user_dn}' substrings wherever they are allowed. By default,
# user DN is set equal to bind DN, but once the search is performed, it will be updated to the actual detected
# user DN value.
# base_dn - template used to construct the base DN for the LDAP search.
# The resulting DN will be constructed by replacing all '{user_name}' and '{bind_dn}' substrings
# of the template with the actual user name and bind DN during the LDAP search.
# scope - scope of the LDAP search.
# Accepted values are: 'base', 'one_level', 'children', 'subtree' (the default).
# search_filter - template used to construct the search filter for the LDAP search.
# The resulting filter will be constructed by replacing all '{user_name}', '{bind_dn}', and '{base_dn}'
# substrings of the template with the actual user name, bind DN, and base DN during the LDAP search.
# Note, that the special characters must be escaped properly in XML.
# verification_cooldown - a period of time, in seconds, after a successful bind attempt, during which a user will be assumed
# to be successfully authenticated for all consecutive requests without contacting the LDAP server.
# Specify 0 (the default) to disable caching and force contacting the LDAP server for each authentication request.
# enable_tls - flag to trigger use of secure connection to the LDAP server.
# Specify 'no' for plain text (ldap://) protocol (not recommended).
# Specify 'yes' for LDAP over SSL/TLS (ldaps://) protocol (recommended, the default).
# Specify 'starttls' for legacy StartTLS protocol (plain text (ldap://) protocol, upgraded to TLS).
# tls_minimum_protocol_version - the minimum protocol version of SSL/TLS.
# Accepted values are: 'ssl2', 'ssl3', 'tls1.0', 'tls1.1', 'tls1.2' (the default).
# tls_require_cert - SSL/TLS peer certificate verification behavior.
# Accepted values are: 'never', 'allow', 'try', 'demand' (the default).
# tls_cert_file - path to certificate file.
# tls_key_file - path to certificate key file.
# tls_ca_cert_file - path to CA certificate file.
# tls_ca_cert_dir - path to the directory containing CA certificates.
# tls_cipher_suite - allowed cipher suite (in OpenSSL notation).
# Example:
# my_ldap_server:
# host: localhost
# port: 636
# bind_dn: 'uid={user_name},ou=users,dc=example,dc=com'
# verification_cooldown: 300
# enable_tls: yes
# tls_minimum_protocol_version: tls1.2
# tls_require_cert: demand
# tls_cert_file: /path/to/tls_cert_file
# tls_key_file: /path/to/tls_key_file
# tls_ca_cert_file: /path/to/tls_ca_cert_file
# tls_ca_cert_dir: /path/to/tls_ca_cert_dir
# tls_cipher_suite: ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:AES256-GCM-SHA384
# Example (typical Active Directory with configured user DN detection for further role mapping):
# my_ad_server:
# host: localhost
# port: 389
# bind_dn: 'EXAMPLE\{user_name}'
# user_dn_detection:
# base_dn: CN=Users,DC=example,DC=com
# search_filter: '(&amp;(objectClass=user)(sAMAccountName={user_name}))'
# enable_tls: no
# To enable Kerberos authentication support for HTTP requests (GSS-SPNEGO), for those users who are explicitly configured
# to authenticate via Kerberos, define a single 'kerberos' section here.
# Parameters:
# principal - canonical service principal name, that will be acquired and used when accepting security contexts.
# This parameter is optional, if omitted, the default principal will be used.
# This parameter cannot be specified together with 'realm' parameter.
# realm - a realm, that will be used to restrict authentication to only those requests whose initiator's realm matches it.
# This parameter is optional, if omitted, no additional filtering by realm will be applied.
# This parameter cannot be specified together with 'principal' parameter.
# Example:
# kerberos: ''
# Example:
# kerberos:
# principal: HTTP/clickhouse.example.com@EXAMPLE.COM
# Example:
# kerberos:
# realm: EXAMPLE.COM
# Sources to read users, roles, access rights, profiles of settings, quotas.
user_directories:
users_xml:
# Path to configuration file with predefined users.
path: users.yaml
local_directory:
# Path to folder where users created by SQL commands are stored.
path: /var/lib/clickhouse/access/
# To add an LDAP server as a remote user directory of users that are not defined locally, define a single 'ldap' section
# with the following parameters:
# server - one of LDAP server names defined in 'ldap_servers' config section above.
# This parameter is mandatory and cannot be empty.
# roles - section with a list of locally defined roles that will be assigned to each user retrieved from the LDAP server.
# If no roles are specified here or assigned during role mapping (below), user will not be able to perform any
# actions after authentication.
# role_mapping - section with LDAP search parameters and mapping rules.
# When a user authenticates, while still bound to LDAP, an LDAP search is performed using search_filter and the
# name of the logged in user. For each entry found during that search, the value of the specified attribute is
# extracted. For each attribute value that has the specified prefix, the prefix is removed, and the rest of the
# value becomes the name of a local role defined in ClickHouse, which is expected to be created beforehand by
# CREATE ROLE command.
# There can be multiple 'role_mapping' sections defined inside the same 'ldap' section. All of them will be
# applied.
# base_dn - template used to construct the base DN for the LDAP search.
# The resulting DN will be constructed by replacing all '{user_name}', '{bind_dn}', and '{user_dn}'
# substrings of the template with the actual user name, bind DN, and user DN during each LDAP search.
# scope - scope of the LDAP search.
# Accepted values are: 'base', 'one_level', 'children', 'subtree' (the default).
# search_filter - template used to construct the search filter for the LDAP search.
# The resulting filter will be constructed by replacing all '{user_name}', '{bind_dn}', '{user_dn}', and
# '{base_dn}' substrings of the template with the actual user name, bind DN, user DN, and base DN during
# each LDAP search.
# Note, that the special characters must be escaped properly in XML.
# attribute - attribute name whose values will be returned by the LDAP search. 'cn', by default.
# prefix - prefix, that will be expected to be in front of each string in the original list of strings returned by
# the LDAP search. Prefix will be removed from the original strings and resulting strings will be treated
# as local role names. Empty, by default.
# Example:
# ldap:
# server: my_ldap_server
# roles:
# my_local_role1: ''
# my_local_role2: ''
# role_mapping:
# base_dn: 'ou=groups,dc=example,dc=com'
# scope: subtree
# search_filter: '(&amp;(objectClass=groupOfNames)(member={bind_dn}))'
# attribute: cn
# prefix: clickhouse_
# Example (typical Active Directory with role mapping that relies on the detected user DN):
# ldap:
# server: my_ad_server
# role_mapping:
# base_dn: 'CN=Users,DC=example,DC=com'
# attribute: CN
# scope: subtree
# search_filter: '(&amp;(objectClass=group)(member={user_dn}))'
# prefix: clickhouse_
# Default profile of settings.
default_profile: default
# Comma-separated list of prefixes for user-defined settings.
# custom_settings_prefixes: ''
# System profile of settings. These settings are used by internal processes (Distributed DDL worker and so on).
# system_profile: default
# Buffer profile of settings.
# These settings are used by Buffer storage to flush data to the underlying table.
# Default: used from system_profile directive.
# buffer_profile: default
# Default database.
default_database: default
# Server time zone could be set here.
# Time zone is used when converting between String and DateTime types,
# when printing DateTime in text formats and parsing DateTime from text,
# it is used in date and time related functions, if specific time zone was not passed as an argument.
# Time zone is specified as identifier from IANA time zone database, like UTC or Africa/Abidjan.
# If not specified, system time zone at server startup is used.
# Please note, that server could display time zone alias instead of specified name.
# Example: W-SU is an alias for Europe/Moscow and Zulu is an alias for UTC.
# timezone: Europe/Moscow
# You can specify umask here (see "man umask"). Server will apply it on startup.
# Number is always parsed as octal. Default umask is 027 (other users cannot read logs, data files, etc; group can only read).
# umask: 022
# Perform mlockall after startup to lower first queries latency
# and to prevent clickhouse executable from being paged out under high IO load.
# Enabling this option is recommended but will lead to increased startup time for up to a few seconds.
mlock_executable: true
# Reallocate memory for machine code ("text") using huge pages. Highly experimental.
remap_executable: false
# Uncomment below in order to use JDBC table engine and function.
# To install and run JDBC bridge in background:
# * [Debian/Ubuntu]
# export MVN_URL=https://repo1.maven.org/maven2/ru/yandex/clickhouse/clickhouse-jdbc-bridge
# export PKG_VER=$(curl -sL $MVN_URL/maven-metadata.xml | grep '<release>' | sed -e 's|.*>\(.*\)<.*|\1|')
# wget https://github.com/ClickHouse/clickhouse-jdbc-bridge/releases/download/v$PKG_VER/clickhouse-jdbc-bridge_$PKG_VER-1_all.deb
# apt install --no-install-recommends -f ./clickhouse-jdbc-bridge_$PKG_VER-1_all.deb
# clickhouse-jdbc-bridge &
# * [CentOS/RHEL]
# export MVN_URL=https://repo1.maven.org/maven2/ru/yandex/clickhouse/clickhouse-jdbc-bridge
# export PKG_VER=$(curl -sL $MVN_URL/maven-metadata.xml | grep '<release>' | sed -e 's|.*>\(.*\)<.*|\1|')
# wget https://github.com/ClickHouse/clickhouse-jdbc-bridge/releases/download/v$PKG_VER/clickhouse-jdbc-bridge-$PKG_VER-1.noarch.rpm
# yum localinstall -y clickhouse-jdbc-bridge-$PKG_VER-1.noarch.rpm
# clickhouse-jdbc-bridge &
# Please refer to https://github.com/ClickHouse/clickhouse-jdbc-bridge#usage for more information.
# jdbc_bridge:
# host: 127.0.0.1
# port: 9019
# Configuration of clusters that could be used in Distributed tables.
# https://clickhouse.tech/docs/en/operations/table_engines/distributed/
remote_servers:
# Test only shard config for testing distributed storage
test_shard_localhost:
# Inter-server per-cluster secret for Distributed queries
# default: no secret (no authentication will be performed)
# If set, then Distributed queries will be validated on shards, so at least:
# - such cluster should exist on the shard,
# - such cluster should have the same secret.
# And also (and which is more important), the initial_user will
# be used as current user for the query.
# Right now the protocol is pretty simple and it only takes into account:
# - cluster name
# - query
# Also it would be nice if the following were implemented:
# - source hostname (see interserver_http_host), but then it will depend on DNS;
# it could use the IP address instead, but then you need to get it correct on the initiator node.
# - target hostname / ip address (same notes as for source hostname)
# - time-based security tokens
# secret: ''
shard:
# Optional. Whether to write data to just one of the replicas. Default: false (write data to all replicas).
# internal_replication: false
# Optional. Shard weight when writing data. Default: 1.
# weight: 1
replica:
host: localhost
port: 9000
# Optional. Priority of the replica for load_balancing. Default: 1 (less value has more priority).
# priority: 1
test_cluster_two_shards_localhost:
shard:
- replica:
host: localhost
port: 9000
- replica:
host: localhost
port: 9000
test_cluster_two_shards:
shard:
- replica:
host: 127.0.0.1
port: 9000
- replica:
host: 127.0.0.2
port: 9000
test_cluster_two_shards_internal_replication:
shard:
- internal_replication: true
replica:
host: 127.0.0.1
port: 9000
- internal_replication: true
replica:
host: 127.0.0.2
port: 9000
test_shard_localhost_secure:
shard:
replica:
host: localhost
port: 9440
secure: 1
test_unavailable_shard:
shard:
- replica:
host: localhost
port: 9000
- replica:
host: localhost
port: 1
# The list of hosts allowed to use in URL-related storage engines and table functions.
# If this section is not present in configuration, all hosts are allowed.
# remote_url_allow_hosts:
# Host should be specified exactly as in URL. The name is checked before DNS resolution.
# Example: "yandex.ru", "yandex.ru." and "www.yandex.ru" are different hosts.
# If port is explicitly specified in URL, the host:port is checked as a whole.
# If host specified here without port, any port with this host allowed.
# "yandex.ru" -> "yandex.ru:443", "yandex.ru:80" etc. is allowed, but "yandex.ru:80" -> only "yandex.ru:80" is allowed.
# If the host is specified as IP address, it is checked as specified in URL. Example: "[2a02:6b8:a::a]".
# If there are redirects and support for redirects is enabled, every redirect (the Location field) is checked.
# Regular expression can be specified. RE2 engine is used for regexps.
# Regexps are not aligned: don't forget to add ^ and $. Also don't forget to escape dot (.) metacharacter
# (forgetting to do so is a common source of error).
# If an element has an 'incl' attribute, then its value will be replaced with the corresponding substitution from another file.
# By default, path to file with substitutions is /etc/metrika.xml. It could be changed in config in 'include_from' element.
# Values for substitutions are specified in /yandex/name_of_substitution elements in that file.
# ZooKeeper is used to store metadata about replicas, when using Replicated tables.
# Optional. If you don't use replicated tables, you could omit that.
# See https://clickhouse.tech/docs/en/engines/table-engines/mergetree-family/replication/
# zookeeper:
# - node:
# host: example1
# port: 2181
# - node:
# host: example2
# port: 2181
# - node:
# host: example3
# port: 2181
# Substitutions for parameters of replicated tables.
# Optional. If you don't use replicated tables, you could omit that.
# See https://clickhouse.tech/docs/en/engines/table-engines/mergetree-family/replication/#creating-replicated-tables
# macros:
# shard: 01
# replica: example01-01-1
# Reloading interval for embedded dictionaries, in seconds. Default: 3600.
builtin_dictionaries_reload_interval: 3600
# Maximum session timeout, in seconds. Default: 3600.
max_session_timeout: 3600
# Default session timeout, in seconds. Default: 60.
default_session_timeout: 60
# Sending data to Graphite for monitoring. Several sections can be defined.
# interval - send every X second
# root_path - prefix for keys
# hostname_in_path - append hostname to root_path (default = true)
# metrics - send data from table system.metrics
# events - send data from table system.events
# asynchronous_metrics - send data from table system.asynchronous_metrics
# graphite:
# host: localhost
# port: 42000
# timeout: 0.1
# interval: 60
# root_path: one_min
# hostname_in_path: true
# metrics: true
# events: true
# events_cumulative: false
# asynchronous_metrics: true
# graphite:
# host: localhost
# port: 42000
# timeout: 0.1
# interval: 1
# root_path: one_sec
# metrics: true
# events: true
# events_cumulative: false
# asynchronous_metrics: false
# Serve endpoint for Prometheus monitoring.
# endpoint - metrics path (relative to root, starting with "/")
# port - port to set up the server. If not defined or 0, then http_port is used
# metrics - send data from table system.metrics
# events - send data from table system.events
# asynchronous_metrics - send data from table system.asynchronous_metrics
# status_info - send data from different component from CH, ex: Dictionaries status
# prometheus:
# endpoint: /metrics
# port: 9363
# metrics: true
# events: true
# asynchronous_metrics: true
# status_info: true
# Query log. Used only for queries with setting log_queries = 1.
query_log:
# What table to insert data into. If the table does not exist, it will be created.
# When query log structure is changed after system update,
# then old table will be renamed and new table will be created automatically.
database: system
table: query_log
# PARTITION BY expr: https://clickhouse.yandex/docs/en/table_engines/mergetree-family/custom_partitioning_key/
# Example:
# event_date
# toMonday(event_date)
# toYYYYMM(event_date)
# toStartOfHour(event_time)
partition_by: toYYYYMM(event_date)
# Table TTL specification: https://clickhouse.tech/docs/en/engines/table-engines/mergetree-family/mergetree/#mergetree-table-ttl
# Example:
# event_date + INTERVAL 1 WEEK
# event_date + INTERVAL 7 DAY DELETE
# event_date + INTERVAL 2 WEEK TO DISK 'bbb'
# ttl: 'event_date + INTERVAL 30 DAY DELETE'
# Instead of partition_by, you can provide full engine expression (starting with ENGINE = ) with parameters,
# Example: engine: 'ENGINE = MergeTree PARTITION BY toYYYYMM(event_date) ORDER BY (event_date, event_time) SETTINGS index_granularity = 1024'
# Interval of flushing data.
flush_interval_milliseconds: 7500
# Trace log. Stores stack traces collected by query profilers.
# See query_profiler_real_time_period_ns and query_profiler_cpu_time_period_ns settings.
trace_log:
database: system
table: trace_log
partition_by: toYYYYMM(event_date)
flush_interval_milliseconds: 7500
# Query thread log. Has information about all threads participated in query execution.
# Used only for queries with setting log_query_threads = 1.
query_thread_log:
database: system
table: query_thread_log
partition_by: toYYYYMM(event_date)
flush_interval_milliseconds: 7500
# Uncomment if use part log.
# Part log contains information about all actions with parts in MergeTree tables (creation, deletion, merges, downloads).
# part_log:
# database: system
# table: part_log
# flush_interval_milliseconds: 7500
# Uncomment to write text log into table.
# Text log contains all information from usual server log but stores it in structured and efficient way.
# The level of the messages that goes to the table can be limited (<level>), if not specified all messages will go to the table.
# text_log:
# database: system
# table: text_log
# flush_interval_milliseconds: 7500
# level: ''
# Metric log contains rows with current values of ProfileEvents, CurrentMetrics collected with "collect_interval_milliseconds" interval.
metric_log:
database: system
table: metric_log
flush_interval_milliseconds: 7500
collect_interval_milliseconds: 1000
# Asynchronous metric log contains values of metrics from
# system.asynchronous_metrics.
asynchronous_metric_log:
database: system
table: asynchronous_metric_log
# Asynchronous metrics are updated once a minute, so there is
# no need to flush more often.
flush_interval_milliseconds: 60000
# OpenTelemetry log contains OpenTelemetry trace spans.
opentelemetry_span_log:
# The default table creation code is insufficient, this <engine> spec
# is a workaround. There is no 'event_time' for this log, but two times,
# start and finish. It is sorted by finish time, to avoid inserting
# data too far away in the past (probably we can sometimes insert a span
# that is seconds earlier than the last span in the table, due to a race
# between several spans inserted in parallel). This gives the spans a
# global order that we can use to e.g. retry insertion into some external
# system.
engine: |-
engine MergeTree
partition by toYYYYMM(finish_date)
order by (finish_date, finish_time_us, trace_id)
database: system
table: opentelemetry_span_log
flush_interval_milliseconds: 7500
# Crash log. Stores stack traces for fatal errors.
# This table is normally empty.
crash_log:
database: system
table: crash_log
partition_by: ''
flush_interval_milliseconds: 1000
# Parameters for embedded dictionaries, used in Yandex.Metrica.
# See https://clickhouse.yandex/docs/en/dicts/internal_dicts/
# Path to file with region hierarchy.
# path_to_regions_hierarchy_file: /opt/geo/regions_hierarchy.txt
# Path to directory with files containing names of regions
# path_to_regions_names_files: /opt/geo/
# top_level_domains_path: /var/lib/clickhouse/top_level_domains/
# Custom TLD lists.
# Format: name: /path/to/file
# Changes will not be applied w/o server restart.
# Path to the list is under top_level_domains_path (see above).
top_level_domains_lists: ''
# public_suffix_list: /path/to/public_suffix_list.dat
# Configuration of external dictionaries. See:
# https://clickhouse.tech/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts
dictionaries_config: '*_dictionary.xml'
# Uncomment if you want data to be compressed 30-100% better.
# Don't do that if you just started using ClickHouse.
# compression:
# # Set of variants. Checked in order. Last matching case wins. If nothing matches, lz4 will be used.
# case:
# Conditions. All must be satisfied. Some conditions may be omitted.
# # min_part_size: 10000000000 # Min part size in bytes.
# # min_part_size_ratio: 0.01 # Min size of part relative to whole table size.
# # What compression method to use.
# method: zstd
# Allow to execute distributed DDL queries (CREATE, DROP, ALTER, RENAME) on cluster.
# Works only if ZooKeeper is enabled. Comment it if such functionality isn't required.
distributed_ddl:
# Path in ZooKeeper to queue with DDL queries
path: /clickhouse/task_queue/ddl
# Settings from this profile will be used to execute DDL queries
# profile: default
# Controls how much ON CLUSTER queries can be run simultaneously.
# pool_size: 1
# Cleanup settings (active tasks will not be removed)
# Controls task TTL (default 1 week)
# task_max_lifetime: 604800
# Controls how often cleanup should be performed (in seconds)
# cleanup_delay_period: 60
# Controls how many tasks could be in the queue
# max_tasks_in_queue: 1000
# Settings to fine tune MergeTree tables. See documentation in source code, in MergeTreeSettings.h
# merge_tree:
# max_suspicious_broken_parts: 5
# Protection from accidental DROP.
# If the size of a MergeTree table is greater than max_table_size_to_drop (in bytes), then the table cannot be dropped with any DROP query.
# If you want to delete one table and don't want to change the clickhouse-server config, you could create the special file <clickhouse-path>/flags/force_drop_table and run DROP once.
# By default max_table_size_to_drop is 50GB; max_table_size_to_drop=0 allows to DROP any tables.
# The same for max_partition_size_to_drop.
# Uncomment to disable protection.
# max_table_size_to_drop: 0
# max_partition_size_to_drop: 0
# Example of parameters for GraphiteMergeTree table engine
graphite_rollup_example:
pattern:
regexp: click_cost
function: any
retention:
- age: 0
precision: 3600
- age: 86400
precision: 60
default:
function: max
retention:
- age: 0
precision: 60
- age: 3600
precision: 300
- age: 86400
precision: 3600
# Directory in <clickhouse-path> containing schema files for various input formats.
# The directory will be created if it doesn't exist.
format_schema_path: /var/lib/clickhouse/format_schemas/
# Default query masking rules, matching lines would be replaced with something else in the logs
# (both text logs and system.query_log).
# name - name for the rule (optional)
# regexp - RE2 compatible regular expression (mandatory)
# replace - substitution string for sensitive data (optional, by default - six asterisks)
query_masking_rules:
rule:
name: hide encrypt/decrypt arguments
regexp: '((?:aes_)?(?:encrypt|decrypt)(?:_mysql)?)\s*\(\s*(?:''(?:\\''|.)+''|.*?)\s*\)'
# or more secure, but also more invasive:
# (aes_\w+)\s*\(.*\)
replace: \1(???)
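# A small standalone sketch of what such a rule does to a log line, using RE2
# directly with the simpler, more invasive pattern mentioned in the comment above;
# the program itself is illustrative and not part of the server.

#include <iostream>
#include <string>
#include <re2/re2.h>

int main()
{
    // The simpler pattern from the comment above, with the rule's rewrite string.
    RE2 rule(R"((aes_\w+)\s*\(.*\))");
    std::string log_line = "SELECT aes_encrypt_mysql('aes-256-ecb', 'top secret', 'key')";
    RE2::GlobalReplace(&log_line, rule, R"(\1(???))");
    std::cout << log_line << std::endl;   // prints: SELECT aes_encrypt_mysql(???)
}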
# Uncomment to use custom http handlers.
# rules are checked from top to bottom, first match runs the handler
# url - to match the request URL; you can use the 'regex:' prefix for a regex match (optional)
# methods - to match the request method; you can use commas to separate multiple method matches (optional)
# headers - to match request headers; each child element is matched (the child element name is the header name); you can use the 'regex:' prefix for a regex match (optional)
# handler is request handler
# type - supported types: static, dynamic_query_handler, predefined_query_handler
# query - use with predefined_query_handler type, executes query when the handler is called
# query_param_name - use with dynamic_query_handler type, extracts and executes the value corresponding to the <query_param_name> value in HTTP request params
# status - use with static type, response status code
# content_type - use with static type, response content-type
# response_content - use with static type; the response content sent to the client. When the value uses the 'file://' or 'config://' prefix, the content is read from the file or from the configuration and sent to the client.
# http_handlers:
# - rule:
# url: /
# methods: POST,GET
# headers:
# pragma: no-cache
# handler:
# type: dynamic_query_handler
# query_param_name: query
# - rule:
# url: /predefined_query
# methods: POST,GET
# handler:
# type: predefined_query_handler
# query: 'SELECT * FROM system.settings'
# - rule:
# handler:
# type: static
# status: 200
# content_type: 'text/plain; charset=UTF-8'
# response_content: config://http_server_default_response
send_crash_reports:
# Changing <enabled> to true allows sending crash reports to
# the ClickHouse core developers team via Sentry https://sentry.io
# Doing so at least in pre-production environments is highly appreciated
enabled: false
# Change <anonymize> to true if you don't feel comfortable attaching the server hostname to the crash report
anonymize: false
    # The default endpoint should be changed to a different Sentry DSN only if you have
    # in-house engineers or hired consultants who are going to debug ClickHouse issues for you
endpoint: 'https://6f33034cfe684dd7a3ab9875e57b1c8d@o388870.ingest.sentry.io/5226277'
# Uncomment to disable ClickHouse internal DNS caching.
# disable_internal_dns_cache: 1

View File

@ -1,86 +0,0 @@
# We can use 3 main node types in YAML: Scalar, Map and Sequence.
# A Scalar is a simple key-value pair:
scalar: 123
# Here we have a key "scalar" and value "123"
# If we rewrite this in XML, we will get <scalar>123</scalar>
# We can also represent an empty value with '':
key: ''
# A Map is a node that contains other nodes:
map:
key1: value1
key2: value2
small_map:
key3: value3
# This map can be converted into:
# <map>
# <key1>value1</key1>
# <key2>value2</key2>
# <small_map>
# <key3>value3</key3>
# </small_map>
# </map>
# A Sequence is a node that also contains other nodes.
# The main difference from a Map is that a Sequence can also contain simple values.
sequence:
- val1
- val2
- key: 123
- map:
mkey1: foo
mkey2: bar
# We can represent it in XML this way:
# <sequence>val1</sequence>
# <sequence>val2</sequence>
# <sequence>
# <key>123</key>
# </sequence>
# <sequence>
# <map>
# <mkey1>foo</mkey1>
# <mkey2>bar</mkey2>
# </map>
# </sequence>
# YAML does not have direct support for structures like XML attributes.
# We represent them as nodes with an @ prefix in the key. Note that @ is reserved by the YAML standard,
# so you will need to put double quotes around the key. Both Map and Sequence can have
# attributes as child nodes.
map:
"@attr1": value1
"@attr2": value2
key: 123
# This gives us:
# <map attr1="value1" attr2="value2">
# <key>123</key>
# </map>
sequence:
- "@attr1": value1
- "@attr2": value2
- 123
- abc
# And this gives us:
# <sequence attr1="value1" attr2="value2">123</sequence>
# <sequence attr1="value1" attr2="value2">abc</sequence>
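# For illustration only (not the actual ClickHouse YAML parser): a Poco DOM sketch
# of what the attribute convention above produces for the map example. The program
# and its element names are assumptions used purely to show the "@" rule.

#include <iostream>
#include <Poco/AutoPtr.h>
#include <Poco/DOM/Document.h>
#include <Poco/DOM/Element.h>
#include <Poco/DOM/Text.h>
#include <Poco/DOM/DOMWriter.h>
#include <Poco/XML/XMLWriter.h>

int main()
{
    Poco::AutoPtr<Poco::XML::Document> doc = new Poco::XML::Document;
    Poco::AutoPtr<Poco::XML::Element> map = doc->createElement("map");
    // Keys starting with "@" become attributes of the enclosing element...
    map->setAttribute("attr1", "value1");
    map->setAttribute("attr2", "value2");
    // ...while ordinary keys become child elements.
    Poco::AutoPtr<Poco::XML::Element> key = doc->createElement("key");
    Poco::AutoPtr<Poco::XML::Text> text = doc->createTextNode("123");
    key->appendChild(text);
    map->appendChild(key);
    doc->appendChild(map);

    Poco::XML::DOMWriter writer;
    writer.setOptions(Poco::XML::XMLWriter::PRETTY_PRINT);
    writer.writeNode(std::cout, doc);   // roughly: <map attr1="value1" attr2="value2"><key>123</key></map>
}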

View File

@ -226,6 +226,10 @@ bool KeeperStorageDispatcher::putRequest(const Coordination::ZooKeeperRequestPtr
request_info.session_id = session_id; request_info.session_id = session_id;
std::lock_guard lock(push_request_mutex); std::lock_guard lock(push_request_mutex);
if (shutdown_called)
return false;
/// Put close requests without timeouts /// Put close requests without timeouts
if (request->getOpNum() == Coordination::OpNum::Close) if (request->getOpNum() == Coordination::OpNum::Close)
requests_queue->push(std::move(request_info)); requests_queue->push(std::move(request_info));
@ -316,6 +320,8 @@ void KeeperStorageDispatcher::shutdown()
break; break;
} }
} }
std::lock_guard lock(session_to_response_callback_mutex);
session_to_response_callback.clear(); session_to_response_callback.clear();
} }
catch (...) catch (...)
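The two hunks above close a shutdown race: a request can no longer be queued once shutdown has started, and pending response callbacks are cleared under their own mutex. A self-contained sketch of the first pattern, with invented types rather than the Keeper classes:

#include <mutex>
#include <queue>

struct RequestGate
{
    std::mutex push_mutex;           // same mutex that serializes normal pushes
    bool shutdown_called = false;
    std::queue<int> requests;        // int stands in for the real request type

    bool put(int request)
    {
        std::lock_guard lock(push_mutex);
        if (shutdown_called)
            return false;            // reject late requests instead of enqueueing them
        requests.push(request);
        return true;
    }

    void shutdown()
    {
        std::lock_guard lock(push_mutex);
        shutdown_called = true;      // once set, put() can never succeed again
    }
};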

View File

@ -443,7 +443,12 @@ class IColumn;
\ \
M(Bool, optimize_rewrite_sum_if_to_count_if, true, "Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent", 0) \ M(Bool, optimize_rewrite_sum_if_to_count_if, true, "Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent", 0) \
M(UInt64, insert_shard_id, 0, "If non zero, when insert into a distributed table, the data will be inserted into the shard `insert_shard_id` synchronously. Possible values range from 1 to `shards_number` of corresponding distributed table", 0) \ M(UInt64, insert_shard_id, 0, "If non zero, when insert into a distributed table, the data will be inserted into the shard `insert_shard_id` synchronously. Possible values range from 1 to `shards_number` of corresponding distributed table", 0) \
M(Bool, allow_experimental_query_deduplication, false, "Allow sending parts' UUIDs for a query in order to deduplicate data parts if any", 0) \ \
/** Experimental feature for moving data between shards. */ \
\
M(Bool, allow_experimental_query_deduplication, false, "Experimental data deduplication for SELECT queries based on part UUIDs", 0) \
M(Bool, experimental_query_deduplication_send_all_part_uuids, false, "If false only part UUIDs for currently moving parts are sent. If true all read part UUIDs are sent (useful only for testing).", 0) \
\
M(Bool, engine_file_empty_if_not_exists, false, "Allows to select data from a file engine table without file", 0) \ M(Bool, engine_file_empty_if_not_exists, false, "Allows to select data from a file engine table without file", 0) \
M(Bool, engine_file_truncate_on_insert, false, "Enables or disables truncate before insert in file engine tables", 0) \ M(Bool, engine_file_truncate_on_insert, false, "Enables or disables truncate before insert in file engine tables", 0) \
M(Bool, allow_experimental_database_replicated, false, "Allow to create databases with Replicated engine", 0) \ M(Bool, allow_experimental_database_replicated, false, "Allow to create databases with Replicated engine", 0) \

View File

@ -9,7 +9,6 @@
# include <DataTypes/IDataType.h> # include <DataTypes/IDataType.h>
# include <DataTypes/DataTypeNullable.h> # include <DataTypes/DataTypeNullable.h>
# include <DataTypes/DataTypeFixedString.h>
# include <Columns/ColumnConst.h> # include <Columns/ColumnConst.h>
# include <Columns/ColumnNullable.h> # include <Columns/ColumnNullable.h>
# pragma GCC diagnostic push # pragma GCC diagnostic push
@ -41,7 +40,8 @@ static inline llvm::Type * toNativeType(llvm::IRBuilderBase & builder, const IDa
{ {
const auto & data_type_nullable = static_cast<const DataTypeNullable&>(type); const auto & data_type_nullable = static_cast<const DataTypeNullable&>(type);
auto * wrapped = toNativeType(builder, *data_type_nullable.getNestedType()); auto * wrapped = toNativeType(builder, *data_type_nullable.getNestedType());
return wrapped ? llvm::StructType::get(wrapped, /* is null = */ builder.getInt1Ty()) : nullptr; auto * is_null_type = builder.getInt1Ty();
return wrapped ? llvm::StructType::get(wrapped, is_null_type) : nullptr;
} }
/// LLVM doesn't have unsigned types, it has unsigned instructions. /// LLVM doesn't have unsigned types, it has unsigned instructions.
@ -57,11 +57,6 @@ static inline llvm::Type * toNativeType(llvm::IRBuilderBase & builder, const IDa
return builder.getFloatTy(); return builder.getFloatTy();
else if (data_type.isFloat64()) else if (data_type.isFloat64())
return builder.getDoubleTy(); return builder.getDoubleTy();
else if (data_type.isFixedString())
{
const auto & data_type_fixed_string = static_cast<const DataTypeFixedString &>(type);
return llvm::VectorType::get(builder.getInt8Ty(), data_type_fixed_string.getN());
}
return nullptr; return nullptr;
} }
@ -76,7 +71,7 @@ static inline bool canBeNativeType(const IDataType & type)
return canBeNativeType(*data_type_nullable.getNestedType()); return canBeNativeType(*data_type_nullable.getNestedType());
} }
return data_type.isNativeInt() || data_type.isNativeUInt() || data_type.isFloat() || data_type.isFixedString() || data_type.isDate(); return data_type.isNativeInt() || data_type.isNativeUInt() || data_type.isFloat() || data_type.isDate();
} }
static inline llvm::Type * toNativeType(llvm::IRBuilderBase & builder, const DataTypePtr & type) static inline llvm::Type * toNativeType(llvm::IRBuilderBase & builder, const DataTypePtr & type)
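A self-contained sketch of the Nullable mapping kept by this change: a nullable value is lowered to a struct of the wrapped native type plus an i1 "is null" flag. The function below is illustrative only, not ClickHouse code.

#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/DerivedTypes.h>

/// Illustrative only: Nullable(Int64) lowered the same way as in toNativeType() above.
llvm::Type * nullableInt64Type(llvm::LLVMContext & context)
{
    llvm::IRBuilder<> b(context);
    llvm::Type * wrapped = b.getInt64Ty();   // native type of the nested column
    llvm::Type * is_null = b.getInt1Ty();    // null indicator
    return llvm::StructType::get(wrapped, is_null);
}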

View File

@ -87,7 +87,7 @@ static void compileFunction(llvm::Module & module, const IFunctionBase & functio
for (size_t i = 0; i <= arg_types.size(); ++i) for (size_t i = 0; i <= arg_types.size(); ++i)
{ {
const auto & type = i == arg_types.size() ? function.getResultType() : arg_types[i]; const auto & type = i == arg_types.size() ? function.getResultType() : arg_types[i];
auto * data = b.CreateLoad(b.CreateConstInBoundsGEP1_32(data_type, columns_arg, i)); auto * data = b.CreateLoad(data_type, b.CreateConstInBoundsGEP1_32(data_type, columns_arg, i));
columns[i].data_init = b.CreatePointerCast(b.CreateExtractValue(data, {0}), toNativeType(b, removeNullable(type))->getPointerTo()); columns[i].data_init = b.CreatePointerCast(b.CreateExtractValue(data, {0}), toNativeType(b, removeNullable(type))->getPointerTo());
columns[i].null_init = type->isNullable() ? b.CreateExtractValue(data, {1}) : nullptr; columns[i].null_init = type->isNullable() ? b.CreateExtractValue(data, {1}) : nullptr;
} }
@ -122,14 +122,14 @@ static void compileFunction(llvm::Module & module, const IFunctionBase & functio
auto & column = columns[i]; auto & column = columns[i];
auto type = arg_types[i]; auto type = arg_types[i];
auto * value = b.CreateLoad(column.data); auto * value = b.CreateLoad(toNativeType(b, removeNullable(type)), column.data);
if (!column.null) if (!type->isNullable())
{ {
arguments.emplace_back(value); arguments.emplace_back(value);
continue; continue;
} }
auto * is_null = b.CreateICmpNE(b.CreateLoad(column.null), b.getInt8(0)); auto * is_null = b.CreateICmpNE(b.CreateLoad(b.getInt8Ty(), column.null), b.getInt8(0));
auto * nullable_unitilized = llvm::Constant::getNullValue(toNativeType(b, type)); auto * nullable_unitilized = llvm::Constant::getNullValue(toNativeType(b, type));
auto * nullable_value = b.CreateInsertValue(b.CreateInsertValue(nullable_unitilized, value, {0}), is_null, {1}); auto * nullable_value = b.CreateInsertValue(b.CreateInsertValue(nullable_unitilized, value, {0}), is_null, {1});
arguments.emplace_back(nullable_value); arguments.emplace_back(nullable_value);
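The recurring change in this hunk is passing the element type explicitly to CreateLoad, presumably because the untyped overload is deprecated in newer LLVM. A minimal illustration with names of my own:

#include <llvm/IR/IRBuilder.h>

/// Illustrative helper: load a column value with an explicitly supplied element type.
llvm::Value * loadColumnValue(llvm::IRBuilder<> & b, llvm::Type * value_type, llvm::Value * data_ptr)
{
    /// The old single-argument CreateLoad(ptr) inferred the type from the pointee;
    /// the typed overload keeps working once pointer types become opaque.
    return b.CreateLoad(value_type, data_ptr);
}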

View File

@ -38,7 +38,7 @@ int main(int argc, char **argv)
// b.CreateCall(func_declaration); // b.CreateCall(func_declaration);
// auto * load_argument = b.CreateLoad(argument); // auto * load_argument = b.CreateLoad(value_type, argument);
// auto * value = b.CreateAdd(load_argument, load_argument); // auto * value = b.CreateAdd(load_argument, load_argument);
// b.CreateRet(value); // b.CreateRet(value);
// }); // });

View File

@ -84,6 +84,7 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected
ParserKeyword s_to_disk("TO DISK"); ParserKeyword s_to_disk("TO DISK");
ParserKeyword s_to_volume("TO VOLUME"); ParserKeyword s_to_volume("TO VOLUME");
ParserKeyword s_to_table("TO TABLE"); ParserKeyword s_to_table("TO TABLE");
ParserKeyword s_to_shard("TO SHARD");
ParserKeyword s_delete("DELETE"); ParserKeyword s_delete("DELETE");
ParserKeyword s_update("UPDATE"); ParserKeyword s_update("UPDATE");
@ -366,6 +367,10 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected
return false; return false;
command->move_destination_type = DataDestinationType::TABLE; command->move_destination_type = DataDestinationType::TABLE;
} }
else if (s_to_shard.ignore(pos))
{
command->move_destination_type = DataDestinationType::SHARD;
}
else else
return false; return false;

View File

@ -194,7 +194,7 @@ KeeperTCPHandler::KeeperTCPHandler(IServer & server_, const Poco::Net::StreamSoc
, server(server_) , server(server_)
, log(&Poco::Logger::get("NuKeeperTCPHandler")) , log(&Poco::Logger::get("NuKeeperTCPHandler"))
, global_context(Context::createCopy(server.context())) , global_context(Context::createCopy(server.context()))
, nu_keeper_storage_dispatcher(global_context->getKeeperStorageDispatcher()) , keeper_dispatcher(global_context->getKeeperStorageDispatcher())
, operation_timeout(0, global_context->getConfigRef().getUInt("test_keeper_server.operation_timeout_ms", Coordination::DEFAULT_OPERATION_TIMEOUT_MS) * 1000) , operation_timeout(0, global_context->getConfigRef().getUInt("test_keeper_server.operation_timeout_ms", Coordination::DEFAULT_OPERATION_TIMEOUT_MS) * 1000)
, session_timeout(0, global_context->getConfigRef().getUInt("test_keeper_server.session_timeout_ms", Coordination::DEFAULT_SESSION_TIMEOUT_MS) * 1000) , session_timeout(0, global_context->getConfigRef().getUInt("test_keeper_server.session_timeout_ms", Coordination::DEFAULT_SESSION_TIMEOUT_MS) * 1000)
, poll_wrapper(std::make_unique<SocketInterruptablePollWrapper>(socket_)) , poll_wrapper(std::make_unique<SocketInterruptablePollWrapper>(socket_))
@ -286,12 +286,12 @@ void KeeperTCPHandler::runImpl()
return; return;
} }
if (nu_keeper_storage_dispatcher->hasLeader()) if (keeper_dispatcher->hasLeader())
{ {
try try
{ {
LOG_INFO(log, "Requesting session ID for the new client"); LOG_INFO(log, "Requesting session ID for the new client");
session_id = nu_keeper_storage_dispatcher->getSessionID(session_timeout.totalMilliseconds()); session_id = keeper_dispatcher->getSessionID(session_timeout.totalMilliseconds());
LOG_INFO(log, "Received session ID {}", session_id); LOG_INFO(log, "Received session ID {}", session_id);
} }
catch (const Exception & e) catch (const Exception & e)
@ -318,7 +318,7 @@ void KeeperTCPHandler::runImpl()
UInt8 single_byte = 1; UInt8 single_byte = 1;
[[maybe_unused]] int result = write(response_fd, &single_byte, sizeof(single_byte)); [[maybe_unused]] int result = write(response_fd, &single_byte, sizeof(single_byte));
}; };
nu_keeper_storage_dispatcher->registerSession(session_id, response_callback); keeper_dispatcher->registerSession(session_id, response_callback);
session_stopwatch.start(); session_stopwatch.start();
bool close_received = false; bool close_received = false;
@ -368,7 +368,7 @@ void KeeperTCPHandler::runImpl()
if (response->error == Coordination::Error::ZSESSIONEXPIRED) if (response->error == Coordination::Error::ZSESSIONEXPIRED)
{ {
LOG_DEBUG(log, "Session #{} expired because server shutting down or quorum is not alive", session_id); LOG_DEBUG(log, "Session #{} expired because server shutting down or quorum is not alive", session_id);
nu_keeper_storage_dispatcher->finishSession(session_id); keeper_dispatcher->finishSession(session_id);
return; return;
} }
@ -381,7 +381,7 @@ void KeeperTCPHandler::runImpl()
if (session_stopwatch.elapsedMicroseconds() > static_cast<UInt64>(session_timeout.totalMicroseconds())) if (session_stopwatch.elapsedMicroseconds() > static_cast<UInt64>(session_timeout.totalMicroseconds()))
{ {
LOG_DEBUG(log, "Session #{} expired", session_id); LOG_DEBUG(log, "Session #{} expired", session_id);
nu_keeper_storage_dispatcher->finishSession(session_id); keeper_dispatcher->finishSession(session_id);
break; break;
} }
} }
@ -389,7 +389,7 @@ void KeeperTCPHandler::runImpl()
catch (const Exception & ex) catch (const Exception & ex)
{ {
LOG_INFO(log, "Got exception processing session #{}: {}", session_id, getExceptionMessage(ex, true)); LOG_INFO(log, "Got exception processing session #{}: {}", session_id, getExceptionMessage(ex, true));
nu_keeper_storage_dispatcher->finishSession(session_id); keeper_dispatcher->finishSession(session_id);
} }
} }
@ -407,7 +407,7 @@ std::pair<Coordination::OpNum, Coordination::XID> KeeperTCPHandler::receiveReque
request->xid = xid; request->xid = xid;
request->readImpl(*in); request->readImpl(*in);
if (!nu_keeper_storage_dispatcher->putRequest(request, session_id)) if (!keeper_dispatcher->putRequest(request, session_id))
throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Session {} already disconnected", session_id); throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Session {} already disconnected", session_id);
return std::make_pair(opnum, xid); return std::make_pair(opnum, xid);
} }

View File

@ -38,7 +38,7 @@ private:
IServer & server; IServer & server;
Poco::Logger * log; Poco::Logger * log;
ContextPtr global_context; ContextPtr global_context;
std::shared_ptr<KeeperStorageDispatcher> nu_keeper_storage_dispatcher; std::shared_ptr<KeeperStorageDispatcher> keeper_dispatcher;
Poco::Timespan operation_timeout; Poco::Timespan operation_timeout;
Poco::Timespan session_timeout; Poco::Timespan session_timeout;
int64_t session_id{-1}; int64_t session_id{-1};

View File

@ -10,6 +10,7 @@ enum class DataDestinationType
VOLUME, VOLUME,
TABLE, TABLE,
DELETE, DELETE,
SHARD,
}; };
} }

View File

@ -152,6 +152,7 @@ MergeTreeData::MergeTreeData(
, log_name(table_id_.getNameForLogs()) , log_name(table_id_.getNameForLogs())
, log(&Poco::Logger::get(log_name)) , log(&Poco::Logger::get(log_name))
, storage_settings(std::move(storage_settings_)) , storage_settings(std::move(storage_settings_))
, pinned_part_uuids(std::make_shared<PinnedPartUUIDs>())
, data_parts_by_info(data_parts_indexes.get<TagByInfo>()) , data_parts_by_info(data_parts_indexes.get<TagByInfo>())
, data_parts_by_state_and_info(data_parts_indexes.get<TagByStateAndInfo>()) , data_parts_by_state_and_info(data_parts_indexes.get<TagByStateAndInfo>())
, parts_mover(this) , parts_mover(this)
@ -2997,6 +2998,11 @@ void MergeTreeData::movePartitionToVolume(const ASTPtr & partition, const String
throw Exception("Cannot move parts because moves are manually disabled", ErrorCodes::ABORTED); throw Exception("Cannot move parts because moves are manually disabled", ErrorCodes::ABORTED);
} }
void MergeTreeData::movePartitionToShard(const ASTPtr & /*partition*/, bool /*move_part*/, const String & /*to*/, ContextPtr /*query_context*/)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "MOVE PARTITION TO SHARD is not supported by storage {}", getName());
}
void MergeTreeData::fetchPartition( void MergeTreeData::fetchPartition(
const ASTPtr & /*partition*/, const ASTPtr & /*partition*/,
const StorageMetadataPtr & /*metadata_snapshot*/, const StorageMetadataPtr & /*metadata_snapshot*/,
@ -3046,11 +3052,23 @@ Pipe MergeTreeData::alterPartition(
break; break;
case PartitionCommand::MoveDestinationType::TABLE: case PartitionCommand::MoveDestinationType::TABLE:
{
checkPartitionCanBeDropped(command.partition); checkPartitionCanBeDropped(command.partition);
String dest_database = query_context->resolveDatabase(command.to_database); String dest_database = query_context->resolveDatabase(command.to_database);
auto dest_storage = DatabaseCatalog::instance().getTable({dest_database, command.to_table}, query_context); auto dest_storage = DatabaseCatalog::instance().getTable({dest_database, command.to_table}, query_context);
movePartitionToTable(dest_storage, command.partition, query_context); movePartitionToTable(dest_storage, command.partition, query_context);
break; }
break;
case PartitionCommand::MoveDestinationType::SHARD:
{
if (!getSettings()->part_moves_between_shards_enable)
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
"Moving parts between shards is experimental and work in progress"
", see part_moves_between_shards_enable setting");
movePartitionToShard(command.partition, command.part, command.move_destination_name, query_context);
}
break;
} }
} }
break; break;
@ -4581,6 +4599,12 @@ catch (...)
tryLogCurrentException(log, __PRETTY_FUNCTION__); tryLogCurrentException(log, __PRETTY_FUNCTION__);
} }
StorageMergeTree::PinnedPartUUIDsPtr MergeTreeData::getPinnedPartUUIDs() const
{
std::lock_guard lock(pinned_part_uuids_mutex);
return pinned_part_uuids;
}
MergeTreeData::CurrentlyMovingPartsTagger::CurrentlyMovingPartsTagger(MergeTreeMovingParts && moving_parts_, MergeTreeData & data_) MergeTreeData::CurrentlyMovingPartsTagger::CurrentlyMovingPartsTagger(MergeTreeMovingParts && moving_parts_, MergeTreeData & data_)
: parts_to_move(std::move(moving_parts_)), data(data_) : parts_to_move(std::move(moving_parts_)), data(data_)
{ {
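The getter above returns a shared_ptr snapshot taken under a short lock. A minimal standalone sketch of this snapshot pattern (types invented for illustration; the real code uses a shared_mutex and PinnedPartUUIDs):

#include <memory>
#include <mutex>
#include <set>

struct PinnedSetHolder
{
    using Snapshot = std::shared_ptr<const std::set<int>>;

    Snapshot get() const
    {
        std::lock_guard lock(mutex);
        return snapshot;                 // readers keep an immutable snapshot, no lock held while reading
    }

    void replace(std::set<int> new_set)
    {
        auto fresh = std::make_shared<const std::set<int>>(std::move(new_set));
        std::lock_guard lock(mutex);
        snapshot = std::move(fresh);     // writers swap in a whole new set
    }

    mutable std::mutex mutex;
    Snapshot snapshot = std::make_shared<const std::set<int>>();
};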

View File

@ -20,6 +20,7 @@
#include <Storages/IndicesDescription.h> #include <Storages/IndicesDescription.h>
#include <Storages/MergeTree/MergeTreePartsMover.h> #include <Storages/MergeTree/MergeTreePartsMover.h>
#include <Storages/MergeTree/MergeTreeWriteAheadLog.h> #include <Storages/MergeTree/MergeTreeWriteAheadLog.h>
#include <Storages/MergeTree/PinnedPartUUIDs.h>
#include <Interpreters/PartLog.h> #include <Interpreters/PartLog.h>
#include <Disks/StoragePolicy.h> #include <Disks/StoragePolicy.h>
#include <Interpreters/Aggregator.h> #include <Interpreters/Aggregator.h>
@ -128,6 +129,8 @@ public:
using DataPartStates = std::initializer_list<DataPartState>; using DataPartStates = std::initializer_list<DataPartState>;
using DataPartStateVector = std::vector<DataPartState>; using DataPartStateVector = std::vector<DataPartState>;
using PinnedPartUUIDsPtr = std::shared_ptr<const PinnedPartUUIDs>;
constexpr static auto FORMAT_VERSION_FILE_NAME = "format_version.txt"; constexpr static auto FORMAT_VERSION_FILE_NAME = "format_version.txt";
constexpr static auto DETACHED_DIR_NAME = "detached"; constexpr static auto DETACHED_DIR_NAME = "detached";
@ -801,6 +804,8 @@ public:
/// Mutex for currently_moving_parts /// Mutex for currently_moving_parts
mutable std::mutex moving_parts_mutex; mutable std::mutex moving_parts_mutex;
PinnedPartUUIDsPtr getPinnedPartUUIDs() const;
/// Return main processing background job, like merge/mutate/fetch and so on /// Return main processing background job, like merge/mutate/fetch and so on
virtual std::optional<JobAndPool> getDataProcessingJob() = 0; virtual std::optional<JobAndPool> getDataProcessingJob() = 0;
/// Return job to move parts between disks/volumes and so on. /// Return job to move parts between disks/volumes and so on.
@ -855,6 +860,10 @@ protected:
/// Use get and set to receive readonly versions. /// Use get and set to receive readonly versions.
MultiVersion<MergeTreeSettings> storage_settings; MultiVersion<MergeTreeSettings> storage_settings;
/// Used to determine which UUIDs to send to root query executor for deduplication.
mutable std::shared_mutex pinned_part_uuids_mutex;
PinnedPartUUIDsPtr pinned_part_uuids;
/// Work with data parts /// Work with data parts
struct TagByInfo{}; struct TagByInfo{};
@ -989,7 +998,6 @@ protected:
virtual void replacePartitionFrom(const StoragePtr & source_table, const ASTPtr & partition, bool replace, ContextPtr context) = 0; virtual void replacePartitionFrom(const StoragePtr & source_table, const ASTPtr & partition, bool replace, ContextPtr context) = 0;
virtual void movePartitionToTable(const StoragePtr & dest_table, const ASTPtr & partition, ContextPtr context) = 0; virtual void movePartitionToTable(const StoragePtr & dest_table, const ASTPtr & partition, ContextPtr context) = 0;
/// Makes sense only for replicated tables
virtual void fetchPartition( virtual void fetchPartition(
const ASTPtr & partition, const ASTPtr & partition,
const StorageMetadataPtr & metadata_snapshot, const StorageMetadataPtr & metadata_snapshot,
@ -997,6 +1005,8 @@ protected:
bool fetch_part, bool fetch_part,
ContextPtr query_context); ContextPtr query_context);
virtual void movePartitionToShard(const ASTPtr & partition, bool move_part, const String & to, ContextPtr query_context);
void writePartLog( void writePartLog(
PartLogElement::Type type, PartLogElement::Type type,
const ExecutionStatus & execution_status, const ExecutionStatus & execution_status,

View File

@ -528,6 +528,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts(
selectPartsToReadWithUUIDFilter( selectPartsToReadWithUUIDFilter(
parts, parts,
part_values, part_values,
data.getPinnedPartUUIDs(),
minmax_idx_condition, minmax_idx_condition,
minmax_columns_types, minmax_columns_types,
partition_pruner, partition_pruner,
@ -2246,6 +2247,7 @@ void MergeTreeDataSelectExecutor::selectPartsToRead(
void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter( void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter(
MergeTreeData::DataPartsVector & parts, MergeTreeData::DataPartsVector & parts,
const std::unordered_set<String> & part_values, const std::unordered_set<String> & part_values,
MergeTreeData::PinnedPartUUIDsPtr pinned_part_uuids,
const std::optional<KeyCondition> & minmax_idx_condition, const std::optional<KeyCondition> & minmax_idx_condition,
const DataTypes & minmax_columns_types, const DataTypes & minmax_columns_types,
std::optional<PartitionPruner> & partition_pruner, std::optional<PartitionPruner> & partition_pruner,
@ -2253,6 +2255,8 @@ void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter(
ContextPtr query_context, ContextPtr query_context,
PartFilterCounters & counters) const PartFilterCounters & counters) const
{ {
const Settings & settings = query_context->getSettings();
/// process_parts prepare parts that have to be read for the query, /// process_parts prepare parts that have to be read for the query,
/// returns false if duplicated parts' UUID have been met /// returns false if duplicated parts' UUID have been met
auto select_parts = [&] (MergeTreeData::DataPartsVector & selected_parts) -> bool auto select_parts = [&] (MergeTreeData::DataPartsVector & selected_parts) -> bool
@ -2309,9 +2313,12 @@ void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter(
/// populate UUIDs and exclude ignored parts if enabled /// populate UUIDs and exclude ignored parts if enabled
if (part->uuid != UUIDHelpers::Nil) if (part->uuid != UUIDHelpers::Nil)
{ {
auto result = temp_part_uuids.insert(part->uuid); if (settings.experimental_query_deduplication_send_all_part_uuids || pinned_part_uuids->contains(part->uuid))
if (!result.second) {
throw Exception("Found a part with the same UUID on the same replica.", ErrorCodes::LOGICAL_ERROR); auto result = temp_part_uuids.insert(part->uuid);
if (!result.second)
throw Exception("Found a part with the same UUID on the same replica.", ErrorCodes::LOGICAL_ERROR);
}
} }
selected_parts.push_back(part_or_projection); selected_parts.push_back(part_or_projection);
@ -2335,7 +2342,8 @@ void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter(
/// Process parts that have to be read for a query. /// Process parts that have to be read for a query.
auto needs_retry = !select_parts(parts); auto needs_retry = !select_parts(parts);
/// If any duplicated part UUIDs met during the first step, try to ignore them in second pass /// If any duplicated part UUIDs met during the first step, try to ignore them in second pass.
/// This may happen when `prefer_localhost_replica` is set and "distributed" stage runs in the same process with "remote" stage.
if (needs_retry) if (needs_retry)
{ {
LOG_DEBUG(log, "Found duplicate uuids locally, will retry part selection without them"); LOG_DEBUG(log, "Found duplicate uuids locally, will retry part selection without them");

View File

@ -166,6 +166,7 @@ private:
void selectPartsToReadWithUUIDFilter( void selectPartsToReadWithUUIDFilter(
MergeTreeData::DataPartsVector & parts, MergeTreeData::DataPartsVector & parts,
const std::unordered_set<String> & part_values, const std::unordered_set<String> & part_values,
MergeTreeData::PinnedPartUUIDsPtr pinned_part_uuids,
const std::optional<KeyCondition> & minmax_idx_condition, const std::optional<KeyCondition> & minmax_idx_condition,
const DataTypes & minmax_columns_types, const DataTypes & minmax_columns_types,
std::optional<PartitionPruner> & partition_pruner, std::optional<PartitionPruner> & partition_pruner,

View File

@ -128,6 +128,10 @@ struct Settings;
M(UInt64, min_marks_to_honor_max_concurrent_queries, 0, "Minimal number of marks to honor the MergeTree-level's max_concurrent_queries (0 - disabled). Queries will still be limited by other max_concurrent_queries settings.", 0) \ M(UInt64, min_marks_to_honor_max_concurrent_queries, 0, "Minimal number of marks to honor the MergeTree-level's max_concurrent_queries (0 - disabled). Queries will still be limited by other max_concurrent_queries settings.", 0) \
M(UInt64, min_bytes_to_rebalance_partition_over_jbod, 0, "Minimal amount of bytes to enable part rebalance over JBOD array (0 - disabled).", 0) \ M(UInt64, min_bytes_to_rebalance_partition_over_jbod, 0, "Minimal amount of bytes to enable part rebalance over JBOD array (0 - disabled).", 0) \
\ \
/** Experimental/work in progress feature. Unsafe for production. */ \
M(UInt64, part_moves_between_shards_enable, 0, "Experimental/Incomplete feature to move parts between shards. Does not take into account sharding expressions.", 0) \
M(UInt64, part_moves_between_shards_delay_seconds, 30, "Time to wait before/after moving parts between shards.", 0) \
\
/** Obsolete settings. Kept for backward compatibility only. */ \ /** Obsolete settings. Kept for backward compatibility only. */ \
M(UInt64, min_relative_delay_to_yield_leadership, 120, "Obsolete setting, does nothing.", 0) \ M(UInt64, min_relative_delay_to_yield_leadership, 120, "Obsolete setting, does nothing.", 0) \
M(UInt64, check_delay_period, 60, "Obsolete setting, does nothing.", 0) \ M(UInt64, check_delay_period, 60, "Obsolete setting, does nothing.", 0) \

View File

@ -0,0 +1,459 @@
#include <Storages/MergeTree/PartMovesBetweenShardsOrchestrator.h>
#include <Storages/MergeTree/PinnedPartUUIDs.h>
#include <Storages/StorageReplicatedMergeTree.h>
#include <boost/range/adaptor/map.hpp>
#include <Poco/JSON/JSON.h>
#include <Poco/JSON/Object.h>
#include <Poco/JSON/Parser.h>
namespace DB
{
PartMovesBetweenShardsOrchestrator::PartMovesBetweenShardsOrchestrator(StorageReplicatedMergeTree & storage_)
: storage(storage_)
, zookeeper_path(storage.zookeeper_path)
, logger_name(storage.getStorageID().getFullTableName() + " (PartMovesBetweenShardsOrchestrator)")
, log(&Poco::Logger::get(logger_name))
, entries_znode_path(zookeeper_path + "/part_moves_shard")
{
/// Schedule pool is not designed for long-running tasks. TODO replace with a separate thread?
task = storage.getContext()->getSchedulePool().createTask(logger_name, [this]{ run(); });
}
void PartMovesBetweenShardsOrchestrator::run()
{
if (!storage.getSettings()->part_moves_between_shards_enable)
return;
if (need_stop)
return;
auto sleep_ms = 10;
try
{
fetchStateFromZK();
if (step())
fetchStateFromZK();
else
sleep_ms = 3 * 1000;
}
catch (...)
{
tryLogCurrentException(log, __PRETTY_FUNCTION__);
}
task->scheduleAfter(sleep_ms);
}
void PartMovesBetweenShardsOrchestrator::shutdown()
{
need_stop = true;
task->deactivate();
LOG_TRACE(log, "PartMovesBetweenShardsOrchestrator thread finished");
}
void PartMovesBetweenShardsOrchestrator::fetchStateFromZK()
{
std::lock_guard lock(state_mutex);
entries.clear();
auto zk = storage.getZooKeeper();
Strings task_names = zk->getChildren(entries_znode_path);
for (auto const & task_name : task_names)
{
PartMovesBetweenShardsOrchestrator::Entry e;
Coordination::Stat stat;
e.znode_path = entries_znode_path + "/" + task_name;
auto entry_str = zk->get(e.znode_path, &stat);
e.fromString(entry_str);
e.version = stat.version;
e.znode_name = task_name;
entries[task_name] = std::move(e);
}
}
bool PartMovesBetweenShardsOrchestrator::step()
{
if (!storage.is_leader)
return false;
auto zk = storage.getZooKeeper();
std::optional<Entry> entry_to_process;
/// Try find an entry to process and copy it.
{
std::lock_guard lock(state_mutex);
for (auto const & entry : entries | boost::adaptors::map_values)
{
if (entry.state.value == EntryState::DONE || entry.state.value == EntryState::CANCELLED)
continue;
entry_to_process.emplace(entry);
break;
}
}
if (!entry_to_process.has_value())
return false;
/// Since some state transitions are long running (waiting on replicas' acknowledgement), we create this lock to avoid
/// other replicas trying to do the same work. All state transitions should be idempotent, so it is safe to lose the
/// lock and have another replica retry.
///
/// Note: This blocks all other entries from being executed. Technical debt.
zkutil::EphemeralNodeHolder::Ptr entry_node_holder;
try
{
entry_node_holder = zkutil::EphemeralNodeHolder::create(entry_to_process->znode_path + "/lock_holder", *zk, storage.replica_name);
}
catch (const Coordination::Exception & e)
{
if (e.code == Coordination::Error::ZNODEEXISTS)
{
LOG_DEBUG(log, "Task {} is being processed by another replica", entry_to_process->znode_name);
return false;
}
throw;
}
try
{
/// Use the same ZooKeeper connection. If we'd lost the lock, then the connection
/// would have expired and all subsequent operations would fail.
stepEntry(entry_to_process.value(), zk);
}
catch (...)
{
tryLogCurrentException(log, __PRETTY_FUNCTION__);
Entry entry_copy = entry_to_process.value();
entry_copy.last_exception_msg = getCurrentExceptionMessage(false);
entry_copy.update_time = std::time(nullptr);
zk->set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
return false;
}
return true;
}
void PartMovesBetweenShardsOrchestrator::stepEntry(const Entry & entry, zkutil::ZooKeeperPtr zk)
{
switch (entry.state.value)
{
case EntryState::DONE:
break;
case EntryState::CANCELLED:
break;
case EntryState::TODO:
{
/// State transition.
Entry entry_copy = entry;
entry_copy.state = EntryState::SYNC_SOURCE;
entry_copy.update_time = std::time(nullptr);
zk->set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
}
break;
case EntryState::SYNC_SOURCE:
{
{
/// Log entry.
Coordination::Requests ops;
ops.emplace_back(zkutil::makeCheckRequest(entry.znode_path, entry.version));
ReplicatedMergeTreeLogEntryData log_entry;
log_entry.type = ReplicatedMergeTreeLogEntryData::SYNC_PINNED_PART_UUIDS;
log_entry.create_time = std::time(nullptr);
log_entry.source_replica = storage.replica_name;
ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/log", "", -1));
ops.emplace_back(zkutil::makeCreateRequest(
zookeeper_path + "/log/log-", log_entry.toString(), zkutil::CreateMode::PersistentSequential));
Coordination::Responses responses;
Coordination::Error rc = zk->tryMulti(ops, responses);
zkutil::KeeperMultiException::check(rc, ops, responses);
String log_znode_path = dynamic_cast<const Coordination::CreateResponse &>(*responses.back()).path_created;
log_entry.znode_name = log_znode_path.substr(log_znode_path.find_last_of('/') + 1);
/// This wait in the background schedule pool is wasteful. It'd be
/// better to have some notification which will call the `step`
/// function when all replicas have finished. TODO.
storage.waitForAllReplicasToProcessLogEntry(log_entry, true);
}
{
/// State transition.
Entry entry_copy = entry;
entry_copy.state = EntryState::SYNC_DESTINATION;
entry_copy.update_time = std::time(nullptr);
zk->set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
}
}
break;
case EntryState::SYNC_DESTINATION:
{
{
/// Log entry.
Coordination::Requests ops;
ops.emplace_back(zkutil::makeCheckRequest(entry.znode_path, entry.version));
ReplicatedMergeTreeLogEntryData log_entry;
log_entry.type = ReplicatedMergeTreeLogEntryData::SYNC_PINNED_PART_UUIDS;
log_entry.create_time = std::time(nullptr);
log_entry.source_replica = storage.replica_name;
log_entry.source_shard = zookeeper_path;
ops.emplace_back(zkutil::makeSetRequest(entry.to_shard + "/log", "", -1));
ops.emplace_back(zkutil::makeCreateRequest(
entry.to_shard + "/log/log-", log_entry.toString(), zkutil::CreateMode::PersistentSequential));
Coordination::Responses responses;
Coordination::Error rc = zk->tryMulti(ops, responses);
zkutil::KeeperMultiException::check(rc, ops, responses);
String log_znode_path = dynamic_cast<const Coordination::CreateResponse &>(*responses.back()).path_created;
log_entry.znode_name = log_znode_path.substr(log_znode_path.find_last_of('/') + 1);
storage.waitForAllTableReplicasToProcessLogEntry(entry.to_shard, log_entry, true);
}
{
/// State transition.
Entry entry_copy = entry;
entry_copy.state = EntryState::DESTINATION_FETCH;
entry_copy.update_time = std::time(nullptr);
zk->set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
}
}
break;
case EntryState::DESTINATION_FETCH:
{
/// Make sure table structure doesn't change when there are part movements in progress.
{
Coordination::Requests ops;
ops.emplace_back(zkutil::makeCheckRequest(entry.znode_path, entry.version));
/// Log entry.
ReplicatedMergeTreeLogEntryData log_entry;
log_entry.type = ReplicatedMergeTreeLogEntryData::CLONE_PART_FROM_SHARD;
log_entry.create_time = std::time(nullptr);
log_entry.new_part_name = entry.part_name;
log_entry.source_replica = storage.replica_name;
log_entry.source_shard = zookeeper_path;
ops.emplace_back(zkutil::makeSetRequest(entry.to_shard + "/log", "", -1));
ops.emplace_back(zkutil::makeCreateRequest(
entry.to_shard + "/log/log-", log_entry.toString(), zkutil::CreateMode::PersistentSequential));
Coordination::Responses responses;
Coordination::Error rc = zk->tryMulti(ops, responses);
zkutil::KeeperMultiException::check(rc, ops, responses);
String log_znode_path = dynamic_cast<const Coordination::CreateResponse &>(*responses.back()).path_created;
log_entry.znode_name = log_znode_path.substr(log_znode_path.find_last_of('/') + 1);
storage.waitForAllTableReplicasToProcessLogEntry(entry.to_shard, log_entry, true);
}
{
/// State transition.
Entry entry_copy = entry;
entry_copy.state = EntryState::DESTINATION_ATTACH;
entry_copy.update_time = std::time(nullptr);
zk->set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
}
}
break;
case EntryState::DESTINATION_ATTACH:
{
/// There is a chance that attach on destination will fail and this task will be left in the queue forever.
{
Coordination::Requests ops;
ops.emplace_back(zkutil::makeCheckRequest(entry.znode_path, entry.version));
auto part = storage.getActiveContainingPart(entry.part_name);
/// Allocating a block number in the destination shard's ZooKeeper path
/// TODO Maybe we can do better.
auto block_number_lock = storage.allocateBlockNumber(part->info.partition_id, zk, "", entry.to_shard);
auto block_number = block_number_lock->getNumber();
auto part_info = part->info;
part_info.min_block = block_number;
part_info.max_block = block_number;
part_info.level = 0;
part_info.mutation = 0;
/// Attach log entry (all replicas already fetched part)
ReplicatedMergeTreeLogEntryData log_entry;
log_entry.type = ReplicatedMergeTreeLogEntryData::ATTACH_PART;
log_entry.part_checksum = part->checksums.getTotalChecksumHex();
log_entry.create_time = std::time(nullptr);
log_entry.new_part_name = part_info.getPartName();
ops.emplace_back(zkutil::makeSetRequest(entry.to_shard + "/log", "", -1));
ops.emplace_back(zkutil::makeCreateRequest(
entry.to_shard + "/log/log-", log_entry.toString(), zkutil::CreateMode::PersistentSequential));
Coordination::Responses responses;
Coordination::Error rc = zk->tryMulti(ops, responses);
zkutil::KeeperMultiException::check(rc, ops, responses);
String log_znode_path = dynamic_cast<const Coordination::CreateResponse &>(*responses.back()).path_created;
log_entry.znode_name = log_znode_path.substr(log_znode_path.find_last_of('/') + 1);
storage.waitForAllTableReplicasToProcessLogEntry(entry.to_shard, log_entry, true);
}
{
/// State transition.
Entry entry_copy = entry;
entry_copy.state = EntryState::SOURCE_DROP_PRE_DELAY;
entry_copy.update_time = std::time(nullptr);
zk->set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
}
}
break;
case EntryState::SOURCE_DROP_PRE_DELAY:
{
std::this_thread::sleep_for(std::chrono::seconds(storage.getSettings()->part_moves_between_shards_delay_seconds));
/// State transition.
Entry entry_copy = entry;
entry_copy.state = EntryState::SOURCE_DROP;
entry_copy.update_time = std::time(nullptr);
zk->set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
}
break;
case EntryState::SOURCE_DROP:
{
{
ReplicatedMergeTreeLogEntry log_entry;
if (storage.dropPart(zk, entry.part_name, log_entry, false, false))
storage.waitForAllReplicasToProcessLogEntry(log_entry, true);
}
{
/// State transition.
Entry entry_copy = entry;
entry_copy.state = EntryState::SOURCE_DROP_POST_DELAY;
entry_copy.update_time = std::time(nullptr);
zk->set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
}
}
break;
case EntryState::SOURCE_DROP_POST_DELAY:
{
std::this_thread::sleep_for(std::chrono::seconds(storage.getSettings()->part_moves_between_shards_delay_seconds));
/// State transition.
Entry entry_copy = entry;
entry_copy.state = EntryState::REMOVE_UUID_PIN;
entry_copy.update_time = std::time(nullptr);
zk->set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
}
break;
case EntryState::REMOVE_UUID_PIN:
{
{
PinnedPartUUIDs src_pins;
PinnedPartUUIDs dst_pins;
{
String s = zk->get(zookeeper_path + "/pinned_part_uuids", &src_pins.stat);
src_pins.fromString(s);
}
{
String s = zk->get(entry.to_shard + "/pinned_part_uuids", &dst_pins.stat);
dst_pins.fromString(s);
}
src_pins.part_uuids.erase(entry.part_uuid);
dst_pins.part_uuids.erase(entry.part_uuid);
Coordination::Requests ops;
ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/pinned_part_uuids", src_pins.toString(), src_pins.stat.version));
ops.emplace_back(zkutil::makeSetRequest(entry.to_shard + "/pinned_part_uuids", dst_pins.toString(), dst_pins.stat.version));
zk->multi(ops);
}
/// State transition.
Entry entry_copy = entry;
entry_copy.state = EntryState::DONE;
entry_copy.update_time = std::time(nullptr);
zk->set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
}
break;
}
}
std::vector<PartMovesBetweenShardsOrchestrator::Entry> PartMovesBetweenShardsOrchestrator::getEntries() const
{
std::lock_guard lock(state_mutex);
std::vector<Entry> res;
for (const auto & e : entries)
res.push_back(e.second);
return res;
}
String PartMovesBetweenShardsOrchestrator::Entry::toString() const
{
Poco::JSON::Object json;
json.set(JSON_KEY_CREATE_TIME, DB::toString(create_time));
json.set(JSON_KEY_UPDATE_TIME, DB::toString(update_time));
json.set(JSON_KEY_TASK_UUID, DB::toString(task_uuid));
json.set(JSON_KEY_PART_NAME, part_name);
json.set(JSON_KEY_PART_UUID, DB::toString(part_uuid));
json.set(JSON_KEY_TO_SHARD, to_shard);
json.set(JSON_KEY_STATE, state.toString());
json.set(JSON_KEY_LAST_EX_MSG, last_exception_msg);
std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
oss.exceptions(std::ios::failbit);
json.stringify(oss);
return oss.str();
}
void PartMovesBetweenShardsOrchestrator::Entry::fromString(const String & buf)
{
Poco::JSON::Parser parser;
auto json = parser.parse(buf).extract<Poco::JSON::Object::Ptr>();
create_time = parseFromString<time_t>(json->getValue<std::string>(JSON_KEY_CREATE_TIME));
update_time = parseFromString<time_t>(json->getValue<std::string>(JSON_KEY_UPDATE_TIME));
task_uuid = parseFromString<UUID>(json->getValue<std::string>(JSON_KEY_TASK_UUID));
part_name = json->getValue<std::string>(JSON_KEY_PART_NAME);
part_uuid = parseFromString<UUID>(json->getValue<std::string>(JSON_KEY_PART_UUID));
to_shard = json->getValue<std::string>(JSON_KEY_TO_SHARD);
state.value = EntryState::fromString(json->getValue<std::string>(JSON_KEY_STATE));
last_exception_msg = json->getValue<std::string>(JSON_KEY_LAST_EX_MSG);
}
}
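A self-contained sketch of the optimistic "copy, advance, compare-and-set via znode version" pattern that stepEntry() repeats for every transition. The types and the templated client are stand-ins (the real code uses the orchestrator's Entry and zkutil::ZooKeeper), so this is an illustration of the pattern, not part of the patch.

#include <ctime>
#include <string>

struct TaskEntry
{
    int state = 0;
    std::time_t update_time = 0;
    std::string znode_path;
    int32_t version = 0;
    std::string toString() const { return std::to_string(state); }   // placeholder serialization
};

template <typename KeeperClient>
void advanceState(const TaskEntry & entry, int next_state, KeeperClient & zk)
{
    TaskEntry entry_copy = entry;                 // never mutate the cached entry in place
    entry_copy.state = next_state;
    entry_copy.update_time = std::time(nullptr);
    /// Passing the version read earlier turns set() into a compare-and-set:
    /// if another replica advanced the entry first, this call fails and the
    /// whole step is retried after re-reading state from ZooKeeper.
    zk.set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
}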

View File

@ -0,0 +1,162 @@
#pragma once
#include <vector>
#include <common/logger_useful.h>
#include <common/types.h>
#include <Common/ZooKeeper/ZooKeeper.h>
#include <Core/UUID.h>
#include <Core/BackgroundSchedulePool.h>
#include <IO/WriteHelpers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
class StorageReplicatedMergeTree;
/// Cross shard part movement workflow orchestration.
class PartMovesBetweenShardsOrchestrator
{
public:
struct EntryState
{
enum Value
{
TODO,
SYNC_SOURCE,
SYNC_DESTINATION,
DESTINATION_FETCH,
DESTINATION_ATTACH,
SOURCE_DROP_PRE_DELAY,
SOURCE_DROP,
SOURCE_DROP_POST_DELAY,
REMOVE_UUID_PIN,
DONE,
CANCELLED,
};
EntryState(): value(TODO) {}
EntryState(Value value_): value(value_) {}
Value value;
String toString() const
{
switch (value)
{
case TODO: return "TODO";
case SYNC_SOURCE: return "SYNC_SOURCE";
case SYNC_DESTINATION: return "SYNC_DESTINATION";
case DESTINATION_FETCH: return "DESTINATION_FETCH";
case DESTINATION_ATTACH: return "DESTINATION_ATTACH";
case SOURCE_DROP_PRE_DELAY: return "SOURCE_DROP_PRE_DELAY";
case SOURCE_DROP: return "SOURCE_DROP";
case SOURCE_DROP_POST_DELAY: return "SOURCE_DROP_POST_DELAY";
case REMOVE_UUID_PIN: return "REMOVE_UUID_PIN";
case DONE: return "DONE";
case CANCELLED: return "CANCELLED";
}
throw Exception("Unknown EntryState: " + DB::toString<int>(value), ErrorCodes::LOGICAL_ERROR);
}
static EntryState::Value fromString(String in)
{
if (in == "TODO") return TODO;
else if (in == "SYNC_SOURCE") return SYNC_SOURCE;
else if (in == "SYNC_DESTINATION") return SYNC_DESTINATION;
else if (in == "DESTINATION_FETCH") return DESTINATION_FETCH;
else if (in == "DESTINATION_ATTACH") return DESTINATION_ATTACH;
else if (in == "SOURCE_DROP_PRE_DELAY") return SOURCE_DROP_PRE_DELAY;
else if (in == "SOURCE_DROP") return SOURCE_DROP;
else if (in == "SOURCE_DROP_POST_DELAY") return SOURCE_DROP_POST_DELAY;
else if (in == "REMOVE_UUID_PIN") return REMOVE_UUID_PIN;
else if (in == "DONE") return DONE;
else if (in == "CANCELLED") return CANCELLED;
else throw Exception("Unknown state: " + in, ErrorCodes::LOGICAL_ERROR);
}
};
struct Entry
{
friend class PartMovesBetweenShardsOrchestrator;
time_t create_time = 0;
time_t update_time = 0;
/// Globally unique identifier used for attaching parts on destination.
/// Using `part_uuid` results in part names being reused when moving parts back and forth.
UUID task_uuid;
String part_name;
UUID part_uuid;
String to_shard;
EntryState state;
String last_exception_msg;
String znode_name;
private:
/// Transient value for CAS.
uint32_t version = 0;
String znode_path;
public:
String toString() const;
void fromString(const String & buf);
};
private:
static constexpr auto JSON_KEY_CREATE_TIME = "create_time";
static constexpr auto JSON_KEY_UPDATE_TIME = "update_time";
static constexpr auto JSON_KEY_TASK_UUID = "task_uuid";
static constexpr auto JSON_KEY_PART_NAME = "part_name";
static constexpr auto JSON_KEY_PART_UUID = "part_uuid";
static constexpr auto JSON_KEY_TO_SHARD = "to_shard";
static constexpr auto JSON_KEY_STATE = "state";
static constexpr auto JSON_KEY_LAST_EX_MSG = "last_exception";
public:
PartMovesBetweenShardsOrchestrator(StorageReplicatedMergeTree & storage_);
void start() { task->activateAndSchedule(); }
void wakeup() { task->schedule(); }
void shutdown();
void fetchStateFromZK();
/// We could have one thread per Entry and worry about concurrency issues.
/// Or we could have a single thread trying to run one step at a time.
bool step();
std::vector<Entry> getEntries() const;
private:
void run();
void stepEntry(const Entry & entry, zkutil::ZooKeeperPtr zk);
private:
StorageReplicatedMergeTree & storage;
String zookeeper_path;
String logger_name;
Poco::Logger * log = nullptr;
std::atomic<bool> need_stop{false};
BackgroundSchedulePool::TaskHolder task;
mutable std::mutex state_mutex;
std::map<String, Entry> entries;
public:
String entries_znode_path;
};
}

View File

@ -0,0 +1,36 @@
#include "PinnedPartUUIDs.h"
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Poco/JSON/JSON.h>
#include <Poco/JSON/Object.h>
#include <Poco/JSON/Parser.h>
namespace DB
{
String PinnedPartUUIDs::toString() const
{
std::vector<UUID> vec(part_uuids.begin(), part_uuids.end());
Poco::JSON::Object json;
json.set(JSON_KEY_UUIDS, DB::toString(vec));
std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
oss.exceptions(std::ios::failbit);
json.stringify(oss);
return oss.str();
}
void PinnedPartUUIDs::fromString(const String & buf)
{
Poco::JSON::Parser parser;
auto json = parser.parse(buf).extract<Poco::JSON::Object::Ptr>();
std::vector<UUID> vec = parseFromString<std::vector<UUID>>(json->getValue<std::string>(PinnedPartUUIDs::JSON_KEY_UUIDS));
part_uuids.clear();
std::copy(vec.begin(), vec.end(), std::inserter(part_uuids, part_uuids.begin()));
}
}

View File

@ -0,0 +1,27 @@
#pragma once
#include <Common/ZooKeeper/IKeeper.h>
#include <Core/UUID.h>
#include <set>
namespace DB
{
struct PinnedPartUUIDs
{
std::set<UUID> part_uuids;
Coordination::Stat stat{};
bool contains(const UUID & part_uuid) const
{
return part_uuids.contains(part_uuid);
}
String toString() const;
void fromString(const String & buf);
private:
static constexpr auto JSON_KEY_UUIDS = "part_uuids";
};
}

View File

@ -54,6 +54,12 @@ void ReplicatedMergeTreeLogEntryData::writeText(WriteBuffer & out) const
out << "get\n" << new_part_name; out << "get\n" << new_part_name;
break; break;
case CLONE_PART_FROM_SHARD:
out << "clone_part_from_shard\n"
<< new_part_name << "\n"
<< "source_shard: " << source_shard;
break;
case ATTACH_PART: case ATTACH_PART:
out << "attach\n" << new_part_name << "\n" out << "attach\n" << new_part_name << "\n"
<< "part_checksum: " << part_checksum; << "part_checksum: " << part_checksum;
@ -142,6 +148,10 @@ void ReplicatedMergeTreeLogEntryData::writeText(WriteBuffer & out) const
out << metadata_str; out << metadata_str;
break; break;
case SYNC_PINNED_PART_UUIDS:
out << "sync_pinned_part_uuids\n";
break;
default: default:
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown log entry type: {}", static_cast<int>(type)); throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown log entry type: {}", static_cast<int>(type));
} }
@ -306,6 +316,16 @@ void ReplicatedMergeTreeLogEntryData::readText(ReadBuffer & in)
metadata_str.resize(metadata_size); metadata_str.resize(metadata_size);
in.readStrict(&metadata_str[0], metadata_size); in.readStrict(&metadata_str[0], metadata_size);
} }
else if (type_str == "sync_pinned_part_uuids")
{
type = SYNC_PINNED_PART_UUIDS;
}
else if (type_str == "clone_part_from_shard")
{
type = CLONE_PART_FROM_SHARD;
in >> new_part_name;
in >> "\nsource_shard: " >> source_shard;
}
if (!trailing_newline_found) if (!trailing_newline_found)
in >> "\n"; in >> "\n";
@ -419,6 +439,14 @@ Strings ReplicatedMergeTreeLogEntryData::getVirtualPartNames(MergeTreeDataFormat
return res; return res;
} }
/// Doesn't produce any part.
if (type == SYNC_PINNED_PART_UUIDS)
return {};
/// Doesn't produce any part by itself.
if (type == CLONE_PART_FROM_SHARD)
return {};
return {new_part_name}; return {new_part_name};
} }

View File

@ -43,6 +43,8 @@ struct ReplicatedMergeTreeLogEntryData
REPLACE_RANGE, /// Drop certain range of partitions and replace them by new ones REPLACE_RANGE, /// Drop certain range of partitions and replace them by new ones
MUTATE_PART, /// Apply one or several mutations to the part. MUTATE_PART, /// Apply one or several mutations to the part.
ALTER_METADATA, /// Apply alter modification according to global /metadata and /columns paths ALTER_METADATA, /// Apply alter modification according to global /metadata and /columns paths
SYNC_PINNED_PART_UUIDS, /// Synchronization point for ensuring that all replicas have up to date in-memory state.
CLONE_PART_FROM_SHARD, /// Clone part from another shard.
}; };
static String typeToString(Type type) static String typeToString(Type type)
@ -58,6 +60,8 @@ struct ReplicatedMergeTreeLogEntryData
case ReplicatedMergeTreeLogEntryData::REPLACE_RANGE: return "REPLACE_RANGE"; case ReplicatedMergeTreeLogEntryData::REPLACE_RANGE: return "REPLACE_RANGE";
case ReplicatedMergeTreeLogEntryData::MUTATE_PART: return "MUTATE_PART"; case ReplicatedMergeTreeLogEntryData::MUTATE_PART: return "MUTATE_PART";
case ReplicatedMergeTreeLogEntryData::ALTER_METADATA: return "ALTER_METADATA"; case ReplicatedMergeTreeLogEntryData::ALTER_METADATA: return "ALTER_METADATA";
case ReplicatedMergeTreeLogEntryData::SYNC_PINNED_PART_UUIDS: return "SYNC_PINNED_PART_UUIDS";
case ReplicatedMergeTreeLogEntryData::CLONE_PART_FROM_SHARD: return "CLONE_PART_FROM_SHARD";
default: default:
throw Exception("Unknown log entry type: " + DB::toString<int>(type), ErrorCodes::LOGICAL_ERROR); throw Exception("Unknown log entry type: " + DB::toString<int>(type), ErrorCodes::LOGICAL_ERROR);
} }
@ -76,6 +80,7 @@ struct ReplicatedMergeTreeLogEntryData
Type type = EMPTY; Type type = EMPTY;
String source_replica; /// Empty string means that this entry was added to the queue immediately, and not copied from the log. String source_replica; /// Empty string means that this entry was added to the queue immediately, and not copied from the log.
String source_shard;
String part_checksum; /// Part checksum for ATTACH_PART, empty otherwise. String part_checksum; /// Part checksum for ATTACH_PART, empty otherwise.

View File

@ -1804,6 +1804,17 @@ ReplicatedMergeTreeMergePredicate::ReplicatedMergeTreeMergePredicate(
merges_version = queue_.pullLogsToQueue(zookeeper); merges_version = queue_.pullLogsToQueue(zookeeper);
{
/// We avoid returning a version here to be used in a lightweight transaction.
///
/// When pinned parts set is changed a log entry is added to the queue in the same transaction.
/// The log entry serves as a synchronization point, and it also increments `merges_version`.
///
/// If pinned parts are fetched after logs are pulled then we can safely say that it contains all locks up to `merges_version`.
String s = zookeeper->get(queue.zookeeper_path + "/pinned_part_uuids");
pinned_part_uuids.fromString(s);
}
Coordination::GetResponse quorum_status_response = quorum_status_future.get(); Coordination::GetResponse quorum_status_response = quorum_status_future.get();
if (quorum_status_response.error == Coordination::Error::ZOK) if (quorum_status_response.error == Coordination::Error::ZOK)
{ {
@ -1872,6 +1883,13 @@ bool ReplicatedMergeTreeMergePredicate::canMergeTwoParts(
for (const MergeTreeData::DataPartPtr & part : {left, right}) for (const MergeTreeData::DataPartPtr & part : {left, right})
{ {
if (pinned_part_uuids.part_uuids.contains(part->uuid))
{
if (out_reason)
*out_reason = "Part " + part->name + " has uuid " + toString(part->uuid) + " which is currently pinned";
return false;
}
if (part->name == inprogress_quorum_part) if (part->name == inprogress_quorum_part)
{ {
if (out_reason) if (out_reason)
@ -1967,6 +1985,13 @@ bool ReplicatedMergeTreeMergePredicate::canMergeSinglePart(
const MergeTreeData::DataPartPtr & part, const MergeTreeData::DataPartPtr & part,
String * out_reason) const String * out_reason) const
{ {
if (pinned_part_uuids.part_uuids.contains(part->uuid))
{
if (out_reason)
*out_reason = "Part " + part->name + " has uuid " + toString(part->uuid) + " which is currently pinned";
return false;
}
if (part->name == inprogress_quorum_part) if (part->name == inprogress_quorum_part)
{ {
if (out_reason) if (out_reason)

View File

@ -8,6 +8,7 @@
#include <Storages/MergeTree/ActiveDataPartSet.h> #include <Storages/MergeTree/ActiveDataPartSet.h>
#include <Storages/MergeTree/MergeTreeData.h> #include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MergeTree/MergeTreeMutationStatus.h> #include <Storages/MergeTree/MergeTreeMutationStatus.h>
#include <Storages/MergeTree/PinnedPartUUIDs.h>
#include <Storages/MergeTree/ReplicatedMergeTreeQuorumAddedParts.h> #include <Storages/MergeTree/ReplicatedMergeTreeQuorumAddedParts.h>
#include <Storages/MergeTree/ReplicatedMergeTreeAltersSequence.h> #include <Storages/MergeTree/ReplicatedMergeTreeAltersSequence.h>
@ -486,6 +487,9 @@ private:
/// (loaded at some later time than prev_virtual_parts). /// (loaded at some later time than prev_virtual_parts).
std::unordered_map<String, std::set<Int64>> committing_blocks; std::unordered_map<String, std::set<Int64>> committing_blocks;
/// List of UUIDs for parts that have their identity "pinned".
PinnedPartUUIDs pinned_part_uuids;
/// Quorum state taken at some later time than prev_virtual_parts. /// Quorum state taken at some later time than prev_virtual_parts.
String inprogress_quorum_part; String inprogress_quorum_part;

View File

@ -18,7 +18,6 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR; extern const int LOGICAL_ERROR;
} }
std::optional<PartitionCommand> PartitionCommand::parse(const ASTAlterCommand * command_ast) std::optional<PartitionCommand> PartitionCommand::parse(const ASTAlterCommand * command_ast)
{ {
if (command_ast->type == ASTAlterCommand::DROP_PARTITION) if (command_ast->type == ASTAlterCommand::DROP_PARTITION)
@ -65,8 +64,11 @@ std::optional<PartitionCommand> PartitionCommand::parse(const ASTAlterCommand *
res.to_database = command_ast->to_database; res.to_database = command_ast->to_database;
res.to_table = command_ast->to_table; res.to_table = command_ast->to_table;
break; break;
default: case DataDestinationType::SHARD:
res.move_destination_type = PartitionCommand::MoveDestinationType::SHARD;
break; break;
case DataDestinationType::DELETE:
throw Exception("ALTER with this destination type is not handled. This is a bug.", ErrorCodes::LOGICAL_ERROR);
} }
if (res.move_destination_type != PartitionCommand::MoveDestinationType::TABLE) if (res.move_destination_type != PartitionCommand::MoveDestinationType::TABLE)
res.move_destination_name = command_ast->move_destination_name; res.move_destination_name = command_ast->move_destination_name;

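The default: branch is presumably dropped in favour of explicit cases so that the switch over DataDestinationType stays exhaustive: with -Wswitch (part of -Wall in GCC and Clang), adding a new destination such as SHARD is then flagged at compile time in every switch that forgets to handle it, instead of silently falling into default:. A minimal sketch with stand-in types:
enum class DestinationStub { DISK, VOLUME, TABLE, DELETE, SHARD };
const char * destinationName(DestinationStub type)
{
    switch (type)
    {
        case DestinationStub::DISK:   return "DISK";
        case DestinationStub::VOLUME: return "VOLUME";
        case DestinationStub::TABLE:  return "TABLE";
        case DestinationStub::DELETE: return "DELETE";
        case DestinationStub::SHARD:  return "SHARD";
        // No default: a newly added enumerator triggers a -Wswitch warning here.
    }
    return "UNKNOWN";
}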
View File

@ -64,6 +64,7 @@ struct PartitionCommand
DISK, DISK,
VOLUME, VOLUME,
TABLE, TABLE,
SHARD,
}; };
std::optional<MoveDestinationType> move_destination_type; std::optional<MoveDestinationType> move_destination_type;

View File

@ -43,6 +43,7 @@ static const auto RETRIES_MAX = 20;
static const uint32_t QUEUE_SIZE = 100000; static const uint32_t QUEUE_SIZE = 100000;
static const auto MAX_FAILED_READ_ATTEMPTS = 10; static const auto MAX_FAILED_READ_ATTEMPTS = 10;
static const auto RESCHEDULE_MS = 500; static const auto RESCHEDULE_MS = 500;
static const auto BACKOFF_THRESHOLD = 32000;
static const auto MAX_THREAD_WORK_DURATION_MS = 60000; static const auto MAX_THREAD_WORK_DURATION_MS = 60000;
namespace ErrorCodes namespace ErrorCodes
@ -100,6 +101,7 @@ StorageRabbitMQ::StorageRabbitMQ(
, semaphore(0, num_consumers) , semaphore(0, num_consumers)
, unique_strbase(getRandomName()) , unique_strbase(getRandomName())
, queue_size(std::max(QUEUE_SIZE, static_cast<uint32_t>(getMaxBlockSize()))) , queue_size(std::max(QUEUE_SIZE, static_cast<uint32_t>(getMaxBlockSize())))
, milliseconds_to_wait(RESCHEDULE_MS)
{ {
event_handler = std::make_shared<RabbitMQHandler>(loop.getLoop(), log); event_handler = std::make_shared<RabbitMQHandler>(loop.getLoop(), log);
restoreConnection(false); restoreConnection(false);
@ -852,7 +854,17 @@ void StorageRabbitMQ::streamingToViewsFunc()
LOG_DEBUG(log, "Started streaming to {} attached views", dependencies_count); LOG_DEBUG(log, "Started streaming to {} attached views", dependencies_count);
if (streamToViews()) if (streamToViews())
{
/// Reschedule with backoff.
if (milliseconds_to_wait < BACKOFF_THRESHOLD)
milliseconds_to_wait *= 2;
event_handler->updateLoopState(Loop::STOP);
break; break;
}
else
{
milliseconds_to_wait = RESCHEDULE_MS;
}
auto end_time = std::chrono::steady_clock::now(); auto end_time = std::chrono::steady_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time); auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
@ -871,9 +883,8 @@ void StorageRabbitMQ::streamingToViewsFunc()
} }
} }
/// Wait for attached views
if (!stream_cancelled) if (!stream_cancelled)
streaming_task->scheduleAfter(RESCHEDULE_MS); streaming_task->scheduleAfter(milliseconds_to_wait);
} }
@ -1019,6 +1030,7 @@ bool StorageRabbitMQ::streamToViews()
looping_task->activateAndSchedule(); looping_task->activateAndSchedule();
} }
/// Do not reschedule, do not stop event loop.
return false; return false;
} }

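For reference, the reschedule delay added above starts at RESCHEDULE_MS, doubles (while still below BACKOFF_THRESHOLD) each time streamToViews() signals that the task should stop and be rescheduled, and is reset to RESCHEDULE_MS otherwise. A standalone sketch of the resulting delay sequence (illustrative only; the constant values are the ones from this diff):
#include <cstdint>
#include <iostream>
int main()
{
    constexpr uint64_t reschedule_ms = 500;       // RESCHEDULE_MS
    constexpr uint64_t backoff_threshold = 32000; // BACKOFF_THRESHOLD
    uint64_t milliseconds_to_wait = reschedule_ms;
    // Simulate consecutive backoff steps; a successful streaming round would
    // set milliseconds_to_wait back to reschedule_ms instead.
    for (int step = 0; step < 8; ++step)
    {
        std::cout << milliseconds_to_wait << '\n'; // 500 1000 2000 4000 8000 16000 32000 32000
        if (milliseconds_to_wait < backoff_threshold)
            milliseconds_to_wait *= 2;
    }
    return 0;
}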
View File

@ -136,6 +136,8 @@ private:
BackgroundSchedulePool::TaskHolder looping_task; BackgroundSchedulePool::TaskHolder looping_task;
BackgroundSchedulePool::TaskHolder connection_task; BackgroundSchedulePool::TaskHolder connection_task;
uint64_t milliseconds_to_wait;
std::atomic<bool> stream_cancelled{false}; std::atomic<bool> stream_cancelled{false};
size_t read_attempts = 0; size_t read_attempts = 0;
mutable bool drop_table = false; mutable bool drop_table = false;

View File

@ -16,6 +16,7 @@
#include <Storages/StorageReplicatedMergeTree.h> #include <Storages/StorageReplicatedMergeTree.h>
#include <Storages/MergeTree/IMergeTreeDataPart.h> #include <Storages/MergeTree/IMergeTreeDataPart.h>
#include <Storages/MergeTree/MergeList.h> #include <Storages/MergeTree/MergeList.h>
#include <Storages/MergeTree/PinnedPartUUIDs.h>
#include <Storages/MergeTree/PartitionPruner.h> #include <Storages/MergeTree/PartitionPruner.h>
#include <Storages/MergeTree/ReplicatedMergeTreeTableMetadata.h> #include <Storages/MergeTree/ReplicatedMergeTreeTableMetadata.h>
#include <Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h> #include <Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h>
@ -130,6 +131,7 @@ namespace ErrorCodes
extern const int NO_SUCH_DATA_PART; extern const int NO_SUCH_DATA_PART;
extern const int INTERSERVER_SCHEME_DOESNT_MATCH; extern const int INTERSERVER_SCHEME_DOESNT_MATCH;
extern const int DUPLICATE_DATA_PART; extern const int DUPLICATE_DATA_PART;
extern const int BAD_ARGUMENTS;
} }
namespace ActionLocks namespace ActionLocks
@ -282,6 +284,7 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree(
, cleanup_thread(*this) , cleanup_thread(*this)
, part_check_thread(*this) , part_check_thread(*this)
, restarting_thread(*this) , restarting_thread(*this)
, part_moves_between_shards_orchestrator(*this)
, allow_renaming(allow_renaming_) , allow_renaming(allow_renaming_)
, replicated_fetches_pool_size(getContext()->getSettingsRef().background_fetches_pool_size) , replicated_fetches_pool_size(getContext()->getSettingsRef().background_fetches_pool_size)
{ {
@ -459,6 +462,7 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree(
} }
createNewZooKeeperNodes(); createNewZooKeeperNodes();
syncPinnedPartUUIDs();
} }
@ -580,6 +584,9 @@ void StorageReplicatedMergeTree::createNewZooKeeperNodes()
zookeeper->createIfNotExists(zookeeper_path + "/zero_copy_s3/shared", String()); zookeeper->createIfNotExists(zookeeper_path + "/zero_copy_s3/shared", String());
} }
/// Part movement.
zookeeper->createIfNotExists(zookeeper_path + "/part_moves_shard", String());
zookeeper->createIfNotExists(zookeeper_path + "/pinned_part_uuids", getPinnedPartUUIDs()->toString());
/// For ALTER PARTITION with multi-leaders /// For ALTER PARTITION with multi-leaders
zookeeper->createIfNotExists(zookeeper_path + "/alter_partition_version", String()); zookeeper->createIfNotExists(zookeeper_path + "/alter_partition_version", String());
} }
@ -1224,6 +1231,27 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks)
} }
void StorageReplicatedMergeTree::syncPinnedPartUUIDs()
{
auto zookeeper = getZooKeeper();
Coordination::Stat stat;
String s = zookeeper->get(zookeeper_path + "/pinned_part_uuids", &stat);
std::lock_guard lock(pinned_part_uuids_mutex);
/// It is unclear whether this can be called concurrently, so the update is applied only if the fetched version is newer.
if (pinned_part_uuids->stat.version < stat.version)
{
auto new_pinned_part_uuids = std::make_shared<PinnedPartUUIDs>();
new_pinned_part_uuids->fromString(s);
new_pinned_part_uuids->stat = stat;
pinned_part_uuids = new_pinned_part_uuids;
}
}
void StorageReplicatedMergeTree::checkPartChecksumsAndAddCommitOps(const zkutil::ZooKeeperPtr & zookeeper, void StorageReplicatedMergeTree::checkPartChecksumsAndAddCommitOps(const zkutil::ZooKeeperPtr & zookeeper,
const DataPartPtr & part, Coordination::Requests & ops, String part_name, NameSet * absent_replicas_paths) const DataPartPtr & part, Coordination::Requests & ops, String part_name, NameSet * absent_replicas_paths)
{ {
@ -1512,6 +1540,12 @@ bool StorageReplicatedMergeTree::executeLogEntry(LogEntry & entry)
break; break;
case LogEntry::ALTER_METADATA: case LogEntry::ALTER_METADATA:
return executeMetadataAlter(entry); return executeMetadataAlter(entry);
case LogEntry::SYNC_PINNED_PART_UUIDS:
syncPinnedPartUUIDs();
return true;
case LogEntry::CLONE_PART_FROM_SHARD:
executeClonePartFromShard(entry);
return true;
default: default:
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected log entry type: {}", static_cast<int>(entry.type)); throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected log entry type: {}", static_cast<int>(entry.type));
} }
@ -2543,6 +2577,59 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry)
} }
void StorageReplicatedMergeTree::executeClonePartFromShard(const LogEntry & entry)
{
auto zookeeper = getZooKeeper();
Strings replicas = zookeeper->getChildren(entry.source_shard + "/replicas");
std::shuffle(replicas.begin(), replicas.end(), thread_local_rng);
String replica;
for (const String & candidate : replicas)
{
if (zookeeper->exists(entry.source_shard + "/replicas/" + candidate + "/is_active"))
{
replica = candidate;
break;
}
}
if (replica.empty())
throw Exception(ErrorCodes::NO_REPLICA_HAS_PART, "No active replica found on shard {} to clone part {}", entry.source_shard, entry.new_part_name);
LOG_INFO(log, "Will clone part from shard " + entry.source_shard + " and replica " + replica);
MutableDataPartPtr part;
{
auto metadata_snapshot = getInMemoryMetadataPtr();
String source_replica_path = entry.source_shard + "/replicas/" + replica;
ReplicatedMergeTreeAddress address(getZooKeeper()->get(source_replica_path + "/host"));
auto timeouts = ConnectionTimeouts::getHTTPTimeouts(getContext());
auto credentials = getContext()->getInterserverCredentials();
String interserver_scheme = getContext()->getInterserverScheme();
auto get_part = [&, address, timeouts, credentials, interserver_scheme]()
{
if (interserver_scheme != address.scheme)
throw Exception("Interserver schemes are different: '" + interserver_scheme
+ "' != '" + address.scheme + "', can't fetch part from " + address.host,
ErrorCodes::LOGICAL_ERROR);
return fetcher.fetchPart(
metadata_snapshot, entry.new_part_name, source_replica_path,
address.host, address.replication_port,
timeouts, credentials->getUser(), credentials->getPassword(), interserver_scheme, true);
};
part = get_part();
// The fetched part is valuable and should not be cleaned like a temp part.
part->is_temp = false;
part->renameTo("detached/" + entry.new_part_name, true);
LOG_INFO(log, "Cloned part {} to detached directory", part->name);
}
}
void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coordination::Stat source_is_lost_stat, zkutil::ZooKeeperPtr & zookeeper) void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coordination::Stat source_is_lost_stat, zkutil::ZooKeeperPtr & zookeeper)
{ {
String source_path = zookeeper_path + "/replicas/" + source_replica; String source_path = zookeeper_path + "/replicas/" + source_replica;
@ -4146,6 +4233,8 @@ void StorageReplicatedMergeTree::startup()
/// between the assignment of queue_task_handle and queueTask that use the queue_task_handle. /// between the assignment of queue_task_handle and queueTask that use the queue_task_handle.
background_executor.start(); background_executor.start();
startBackgroundMovesIfNeeded(); startBackgroundMovesIfNeeded();
part_moves_between_shards_orchestrator.start();
} }
catch (...) catch (...)
{ {
@ -4176,6 +4265,7 @@ void StorageReplicatedMergeTree::shutdown()
restarting_thread.shutdown(); restarting_thread.shutdown();
background_executor.finish(); background_executor.finish();
part_moves_between_shards_orchestrator.shutdown();
{ {
auto lock = queue.lockQueue(); auto lock = queue.lockQueue();
@ -5097,8 +5187,14 @@ bool StorageReplicatedMergeTree::existsNodeCached(const std::string & path) cons
std::optional<EphemeralLockInZooKeeper> std::optional<EphemeralLockInZooKeeper>
StorageReplicatedMergeTree::allocateBlockNumber( StorageReplicatedMergeTree::allocateBlockNumber(
const String & partition_id, const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_block_id_path) const const String & partition_id, const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_block_id_path, const String & zookeeper_path_prefix) const
{ {
String zookeeper_table_path;
if (zookeeper_path_prefix.empty())
zookeeper_table_path = zookeeper_path;
else
zookeeper_table_path = zookeeper_path_prefix;
/// Lets check for duplicates in advance, to avoid superfluous block numbers allocation /// Lets check for duplicates in advance, to avoid superfluous block numbers allocation
Coordination::Requests deduplication_check_ops; Coordination::Requests deduplication_check_ops;
if (!zookeeper_block_id_path.empty()) if (!zookeeper_block_id_path.empty())
@ -5107,7 +5203,7 @@ StorageReplicatedMergeTree::allocateBlockNumber(
deduplication_check_ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_block_id_path, -1)); deduplication_check_ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_block_id_path, -1));
} }
String block_numbers_path = zookeeper_path + "/block_numbers"; String block_numbers_path = zookeeper_table_path + "/block_numbers";
String partition_path = block_numbers_path + "/" + partition_id; String partition_path = block_numbers_path + "/" + partition_id;
if (!existsNodeCached(partition_path)) if (!existsNodeCached(partition_path))
@ -5130,7 +5226,7 @@ StorageReplicatedMergeTree::allocateBlockNumber(
try try
{ {
lock = EphemeralLockInZooKeeper( lock = EphemeralLockInZooKeeper(
partition_path + "/block-", zookeeper_path + "/temp", *zookeeper, &deduplication_check_ops); partition_path + "/block-", zookeeper_table_path + "/temp", *zookeeper, &deduplication_check_ops);
} }
catch (const zkutil::KeeperMultiException & e) catch (const zkutil::KeeperMultiException & e)
{ {
@ -5402,6 +5498,11 @@ void StorageReplicatedMergeTree::getQueue(LogEntriesData & res, String & replica
queue.getEntries(res); queue.getEntries(res);
} }
std::vector<PartMovesBetweenShardsOrchestrator::Entry> StorageReplicatedMergeTree::getPartMovesBetweenShardsEntries()
{
return part_moves_between_shards_orchestrator.getEntries();
}
time_t StorageReplicatedMergeTree::getAbsoluteDelay() const time_t StorageReplicatedMergeTree::getAbsoluteDelay() const
{ {
time_t min_unprocessed_insert_time = 0; time_t min_unprocessed_insert_time = 0;
@ -6613,6 +6714,100 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta
cleanLastPartNode(partition_id); cleanLastPartNode(partition_id);
} }
void StorageReplicatedMergeTree::movePartitionToShard(
const ASTPtr & partition, bool move_part, const String & to, ContextPtr /*query_context*/)
{
/// This is a lightweight operation that only optimistically checks if it could succeed and queues tasks.
if (!move_part)
throw Exception("MOVE PARTITION TO SHARD is not supported, use MOVE PART instead", ErrorCodes::NOT_IMPLEMENTED);
if (normalizeZooKeeperPath(zookeeper_path) == normalizeZooKeeperPath(to))
throw Exception("Source and destination are the same", ErrorCodes::BAD_ARGUMENTS);
auto zookeeper = getZooKeeper();
String part_name = partition->as<ASTLiteral &>().value.safeGet<String>();
auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version);
auto part = getPartIfExists(part_info, {MergeTreeDataPartState::Committed});
if (!part)
throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "Part {} not found locally", part_name);
if (part->uuid == UUIDHelpers::Nil)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Part {} does not have an uuid assigned and it can't be moved between shards", part_name);
ReplicatedMergeTreeMergePredicate merge_pred = queue.getMergePredicate(zookeeper);
/// The following block is pretty much copy & paste from StorageReplicatedMergeTree::dropPart to avoid conflicts while this is WIP.
/// Extract it to a common method and re-use it before merging.
{
if (partIsLastQuorumPart(part->info))
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Part {} is last inserted part with quorum in partition. Would not be able to drop", part_name);
}
/// canMergeSinglePart overlaps with dropPart; try to reuse the same code.
String out_reason;
if (!merge_pred.canMergeSinglePart(part, &out_reason))
throw Exception(ErrorCodes::PART_IS_TEMPORARILY_LOCKED, "Part is busy, reason: " + out_reason);
}
{
/// Optimistic check that the destination table structure is compatible.
checkTableStructure(to, getInMemoryMetadataPtr());
}
PinnedPartUUIDs src_pins;
PinnedPartUUIDs dst_pins;
{
String s = zookeeper->get(zookeeper_path + "/pinned_part_uuids", &src_pins.stat);
src_pins.fromString(s);
}
{
String s = zookeeper->get(to + "/pinned_part_uuids", &dst_pins.stat);
dst_pins.fromString(s);
}
if (src_pins.part_uuids.contains(part->uuid) || dst_pins.part_uuids.contains(part->uuid))
throw Exception(ErrorCodes::PART_IS_TEMPORARILY_LOCKED, "Part {} has it's uuid ({}) already pinned.", part_name, toString(part->uuid));
src_pins.part_uuids.insert(part->uuid);
dst_pins.part_uuids.insert(part->uuid);
PartMovesBetweenShardsOrchestrator::Entry part_move_entry;
part_move_entry.create_time = std::time(nullptr);
part_move_entry.update_time = part_move_entry.create_time;
part_move_entry.task_uuid = UUIDHelpers::generateV4();
part_move_entry.part_name = part->name;
part_move_entry.part_uuid = part->uuid;
part_move_entry.to_shard = to;
Coordination::Requests ops;
ops.emplace_back(zkutil::makeCheckRequest(zookeeper_path + "/log", merge_pred.getVersion())); /// Make sure no new events were added to the log.
ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/pinned_part_uuids", src_pins.toString(), src_pins.stat.version));
ops.emplace_back(zkutil::makeSetRequest(to + "/pinned_part_uuids", dst_pins.toString(), dst_pins.stat.version));
ops.emplace_back(zkutil::makeCreateRequest(
part_moves_between_shards_orchestrator.entries_znode_path + "/task-",
part_move_entry.toString(),
zkutil::CreateMode::PersistentSequential));
Coordination::Responses responses;
Coordination::Error rc = zookeeper->tryMulti(ops, responses);
zkutil::KeeperMultiException::check(rc, ops, responses);
String task_znode_path = dynamic_cast<const Coordination::CreateResponse &>(*responses.back()).path_created;
LOG_DEBUG(log, "Created task for part movement between shards at " + task_znode_path);
/// Force a refresh of the local state. This makes the task immediately visible in the `system.part_moves_between_shards` table.
part_moves_between_shards_orchestrator.fetchStateFromZK();
// TODO: Add support for `replication_alter_partitions_sync`.
}
void StorageReplicatedMergeTree::getCommitPartOps( void StorageReplicatedMergeTree::getCommitPartOps(
Coordination::Requests & ops, Coordination::Requests & ops,
MutableDataPartPtr & part, MutableDataPartPtr & part,

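The refresh in syncPinnedPartUUIDs() above replaces the cached set only when the znode's ZooKeeper version is newer than the cached one, so a stale read can never roll the state back, and readers keep a consistent snapshot through the shared_ptr they hold. A simplified standalone sketch of that version-guarded swap (stand-in types, not the real PinnedPartUUIDs / Coordination::Stat):
#include <memory>
#include <mutex>
#include <set>
#include <string>
// Stand-in for PinnedPartUUIDs: parsed uuids plus the znode version they came from.
struct PinnedSnapshot
{
    int version = -1;
    std::set<std::string> part_uuids;
};
class PinnedCache
{
public:
    // Apply a freshly fetched snapshot only if it is newer than the cached one.
    void refresh(int fetched_version, std::set<std::string> fetched_uuids)
    {
        std::lock_guard lock(mutex);
        if (current->version < fetched_version)
        {
            auto updated = std::make_shared<PinnedSnapshot>();
            updated->version = fetched_version;
            updated->part_uuids = std::move(fetched_uuids);
            current = std::move(updated); // readers holding the old shared_ptr are unaffected
        }
    }
    std::shared_ptr<const PinnedSnapshot> get() const
    {
        std::lock_guard lock(mutex);
        return current;
    }
private:
    mutable std::mutex mutex;
    std::shared_ptr<PinnedSnapshot> current = std::make_shared<PinnedSnapshot>();
};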
View File

@ -20,6 +20,7 @@
#include <Storages/MergeTree/DataPartsExchange.h> #include <Storages/MergeTree/DataPartsExchange.h>
#include <Storages/MergeTree/ReplicatedMergeTreeAddress.h> #include <Storages/MergeTree/ReplicatedMergeTreeAddress.h>
#include <Storages/MergeTree/LeaderElection.h> #include <Storages/MergeTree/LeaderElection.h>
#include <Storages/MergeTree/PartMovesBetweenShardsOrchestrator.h>
#include <DataTypes/DataTypesNumber.h> #include <DataTypes/DataTypesNumber.h>
#include <Interpreters/Cluster.h> #include <Interpreters/Cluster.h>
#include <Interpreters/PartLog.h> #include <Interpreters/PartLog.h>
@ -183,6 +184,8 @@ public:
using LogEntriesData = std::vector<ReplicatedMergeTreeLogEntryData>; using LogEntriesData = std::vector<ReplicatedMergeTreeLogEntryData>;
void getQueue(LogEntriesData & res, String & replica_name); void getQueue(LogEntriesData & res, String & replica_name);
std::vector<PartMovesBetweenShardsOrchestrator::Entry> getPartMovesBetweenShardsEntries();
/// Get replica delay relative to current time. /// Get replica delay relative to current time.
time_t getAbsoluteDelay() const; time_t getAbsoluteDelay() const;
@ -252,6 +255,7 @@ private:
friend struct ReplicatedMergeTreeLogEntry; friend struct ReplicatedMergeTreeLogEntry;
friend class ScopedPartitionMergeLock; friend class ScopedPartitionMergeLock;
friend class ReplicatedMergeTreeQueue; friend class ReplicatedMergeTreeQueue;
friend class PartMovesBetweenShardsOrchestrator;
friend class MergeTreeData; friend class MergeTreeData;
using MergeStrategyPicker = ReplicatedMergeTreeMergeStrategyPicker; using MergeStrategyPicker = ReplicatedMergeTreeMergeStrategyPicker;
@ -305,7 +309,6 @@ private:
DataPartsExchange::Fetcher fetcher; DataPartsExchange::Fetcher fetcher;
/// When activated, replica is initialized and startup() method could exit /// When activated, replica is initialized and startup() method could exit
Poco::Event startup_event; Poco::Event startup_event;
@ -350,6 +353,8 @@ private:
/// A thread that processes reconnection to ZooKeeper when the session expires. /// A thread that processes reconnection to ZooKeeper when the session expires.
ReplicatedMergeTreeRestartingThread restarting_thread; ReplicatedMergeTreeRestartingThread restarting_thread;
PartMovesBetweenShardsOrchestrator part_moves_between_shards_orchestrator;
/// True if replica was created for existing table with fixed granularity /// True if replica was created for existing table with fixed granularity
bool other_replicas_fixed_granularity = false; bool other_replicas_fixed_granularity = false;
@ -387,6 +392,10 @@ private:
*/ */
void checkParts(bool skip_sanity_checks); void checkParts(bool skip_sanity_checks);
/// Synchronize the list of part uuids which are currently pinned. These should be sent to the root query executor
/// to be used for deduplication.
void syncPinnedPartUUIDs();
/** Check that the part's checksum is the same as the checksum of the same part on some other replica. /** Check that the part's checksum is the same as the checksum of the same part on some other replica.
* If no one has such a part, nothing checks. * If no one has such a part, nothing checks.
* Not very reliable: if two replicas add a part almost at the same time, no checks will occur. * Not very reliable: if two replicas add a part almost at the same time, no checks will occur.
@ -457,6 +466,7 @@ private:
bool executeFetch(LogEntry & entry); bool executeFetch(LogEntry & entry);
bool executeReplaceRange(const LogEntry & entry); bool executeReplaceRange(const LogEntry & entry);
void executeClonePartFromShard(const LogEntry & entry);
/** Updates the queue. /** Updates the queue.
*/ */
@ -585,9 +595,11 @@ private:
bool partIsInsertingWithParallelQuorum(const MergeTreePartInfo & part_info) const; bool partIsInsertingWithParallelQuorum(const MergeTreePartInfo & part_info) const;
/// Creates new block number if block with such block_id does not exist /// Creates new block number if block with such block_id does not exist
/// If zookeeper_path_prefix is specified, then the block number is allocated at that path
/// (can be used if we want to allocate blocks on other replicas)
std::optional<EphemeralLockInZooKeeper> allocateBlockNumber( std::optional<EphemeralLockInZooKeeper> allocateBlockNumber(
const String & partition_id, const zkutil::ZooKeeperPtr & zookeeper, const String & partition_id, const zkutil::ZooKeeperPtr & zookeeper,
const String & zookeeper_block_id_path = "") const; const String & zookeeper_block_id_path = "", const String & zookeeper_path_prefix = "") const;
/** Wait until all replicas, including this, execute the specified action from the log. /** Wait until all replicas, including this, execute the specified action from the log.
* If replicas are added at the same time, it can not wait for the added replica. * If replicas are added at the same time, it can not wait for the added replica.
@ -639,6 +651,7 @@ private:
PartitionCommandsResultInfo attachPartition(const ASTPtr & partition, const StorageMetadataPtr & metadata_snapshot, bool part, ContextPtr query_context) override; PartitionCommandsResultInfo attachPartition(const ASTPtr & partition, const StorageMetadataPtr & metadata_snapshot, bool part, ContextPtr query_context) override;
void replacePartitionFrom(const StoragePtr & source_table, const ASTPtr & partition, bool replace, ContextPtr query_context) override; void replacePartitionFrom(const StoragePtr & source_table, const ASTPtr & partition, bool replace, ContextPtr query_context) override;
void movePartitionToTable(const StoragePtr & dest_table, const ASTPtr & partition, ContextPtr query_context) override; void movePartitionToTable(const StoragePtr & dest_table, const ASTPtr & partition, ContextPtr query_context) override;
void movePartitionToShard(const ASTPtr & partition, bool move_part, const String & to, ContextPtr query_context) override;
void fetchPartition( void fetchPartition(
const ASTPtr & partition, const ASTPtr & partition,
const StorageMetadataPtr & metadata_snapshot, const StorageMetadataPtr & metadata_snapshot,

View File

@ -0,0 +1,135 @@
#include <Access/ContextAccess.h>
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeUUID.h>
#include <DataTypes/DataTypesNumber.h>
#include <Databases/IDatabase.h>
#include <Storages/StorageReplicatedMergeTree.h>
#include <Storages/System/StorageSystemPartMovesBetweenShards.h>
#include <Storages/VirtualColumnUtils.h>
#include <Common/typeid_cast.h>
namespace DB
{
NamesAndTypesList StorageSystemPartMovesBetweenShards::getNamesAndTypes()
{
return {
/// Table properties.
{ "database", std::make_shared<DataTypeString>() },
{ "table", std::make_shared<DataTypeString>() },
/// Constant element properties.
{ "task_name", std::make_shared<DataTypeString>() },
{ "task_uuid", std::make_shared<DataTypeUUID>() },
{ "create_time", std::make_shared<DataTypeDateTime>() },
{ "part_name", std::make_shared<DataTypeString>() },
{ "part_uuid", std::make_shared<DataTypeUUID>() },
{ "to_shard", std::make_shared<DataTypeString>() },
/// Processing status of item.
{ "update_time", std::make_shared<DataTypeDateTime>() },
{ "state", std::make_shared<DataTypeString>() },
{ "num_tries", std::make_shared<DataTypeUInt32>() },
{ "last_exception", std::make_shared<DataTypeString>() },
};
}
void StorageSystemPartMovesBetweenShards::fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo & query_info) const
{
const auto access = context->getAccess();
const bool check_access_for_databases = !access->isGranted(AccessType::SHOW_TABLES);
std::map<String, std::map<String, StoragePtr>> replicated_tables;
for (const auto & db : DatabaseCatalog::instance().getDatabases())
{
/// Check if database can contain replicated tables
if (!db.second->canContainMergeTreeTables())
continue;
const bool check_access_for_tables = check_access_for_databases && !access->isGranted(AccessType::SHOW_TABLES, db.first);
for (auto iterator = db.second->getTablesIterator(context); iterator->isValid(); iterator->next())
{
const auto & table = iterator->table();
if (!table)
continue;
if (!dynamic_cast<const StorageReplicatedMergeTree *>(table.get()))
continue;
if (check_access_for_tables && !access->isGranted(AccessType::SHOW_TABLES, db.first, iterator->name()))
continue;
replicated_tables[db.first][iterator->name()] = table;
}
}
MutableColumnPtr col_database_mut = ColumnString::create();
MutableColumnPtr col_table_mut = ColumnString::create();
for (auto & db : replicated_tables)
{
for (auto & table : db.second)
{
col_database_mut->insert(db.first);
col_table_mut->insert(table.first);
}
}
ColumnPtr col_database_to_filter = std::move(col_database_mut);
ColumnPtr col_table_to_filter = std::move(col_table_mut);
/// Determine what tables are needed by the conditions in the query.
{
Block filtered_block
{
{ col_database_to_filter, std::make_shared<DataTypeString>(), "database" },
{ col_table_to_filter, std::make_shared<DataTypeString>(), "table" },
};
VirtualColumnUtils::filterBlockWithQuery(query_info.query, filtered_block, context);
if (!filtered_block.rows())
return;
col_database_to_filter = filtered_block.getByName("database").column;
col_table_to_filter = filtered_block.getByName("table").column;
}
for (size_t i = 0, tables_size = col_database_to_filter->size(); i < tables_size; ++i)
{
String database = (*col_database_to_filter)[i].safeGet<const String &>();
String table = (*col_table_to_filter)[i].safeGet<const String &>();
auto moves = dynamic_cast<StorageReplicatedMergeTree &>(*replicated_tables[database][table]).getPartMovesBetweenShardsEntries();
for (auto & entry : moves)
{
size_t col_num = 0;
/// Table properties.
res_columns[col_num++]->insert(database);
res_columns[col_num++]->insert(table);
/// Constant element properties.
res_columns[col_num++]->insert(entry.znode_name);
res_columns[col_num++]->insert(entry.task_uuid);
res_columns[col_num++]->insert(entry.create_time);
res_columns[col_num++]->insert(entry.part_name);
res_columns[col_num++]->insert(entry.part_uuid);
res_columns[col_num++]->insert(entry.to_shard);
/// Processing status of item.
res_columns[col_num++]->insert(entry.update_time);
res_columns[col_num++]->insert(entry.state.toString());
res_columns[col_num++]->insert(0);
res_columns[col_num++]->insert(entry.last_exception_msg);
}
}
}
}

View File

@ -0,0 +1,27 @@
#pragma once
#include <ext/shared_ptr_helper.h>
#include <Storages/System/IStorageSystemOneBlock.h>
namespace DB
{
class Context;
class StorageSystemPartMovesBetweenShards final : public ext::shared_ptr_helper<StorageSystemPartMovesBetweenShards>, public IStorageSystemOneBlock<StorageSystemPartMovesBetweenShards>
{
friend struct ext::shared_ptr_helper<StorageSystemPartMovesBetweenShards>;
public:
std::string getName() const override { return "SystemShardMoves"; }
static NamesAndTypesList getNamesAndTypes();
protected:
using IStorageSystemOneBlock::IStorageSystemOneBlock;
void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo & query_info) const override;
};
}

View File

@ -25,6 +25,7 @@
#include <Storages/System/StorageSystemMutations.h> #include <Storages/System/StorageSystemMutations.h>
#include <Storages/System/StorageSystemNumbers.h> #include <Storages/System/StorageSystemNumbers.h>
#include <Storages/System/StorageSystemOne.h> #include <Storages/System/StorageSystemOne.h>
#include <Storages/System/StorageSystemPartMovesBetweenShards.h>
#include <Storages/System/StorageSystemParts.h> #include <Storages/System/StorageSystemParts.h>
#include <Storages/System/StorageSystemProjectionParts.h> #include <Storages/System/StorageSystemProjectionParts.h>
#include <Storages/System/StorageSystemPartsColumns.h> #include <Storages/System/StorageSystemPartsColumns.h>
@ -148,6 +149,7 @@ void attachSystemTablesServer(IDatabase & system_database, bool has_zookeeper)
attach<StorageSystemGraphite>(system_database, "graphite_retentions"); attach<StorageSystemGraphite>(system_database, "graphite_retentions");
attach<StorageSystemMacros>(system_database, "macros"); attach<StorageSystemMacros>(system_database, "macros");
attach<StorageSystemReplicatedFetches>(system_database, "replicated_fetches"); attach<StorageSystemReplicatedFetches>(system_database, "replicated_fetches");
attach<StorageSystemPartMovesBetweenShards>(system_database, "part_moves_between_shards");
if (has_zookeeper) if (has_zookeeper)
attach<StorageSystemZooKeeper>(system_database, "zookeeper"); attach<StorageSystemZooKeeper>(system_database, "zookeeper");

View File

@ -92,7 +92,9 @@ SRCS(
MergeTree/MergeType.cpp MergeTree/MergeType.cpp
MergeTree/MergedBlockOutputStream.cpp MergeTree/MergedBlockOutputStream.cpp
MergeTree/MergedColumnOnlyOutputStream.cpp MergeTree/MergedColumnOnlyOutputStream.cpp
MergeTree/PartMovesBetweenShardsOrchestrator.cpp
MergeTree/PartitionPruner.cpp MergeTree/PartitionPruner.cpp
MergeTree/PinnedPartUUIDs.cpp
MergeTree/ReplicatedFetchList.cpp MergeTree/ReplicatedFetchList.cpp
MergeTree/ReplicatedMergeTreeAddress.cpp MergeTree/ReplicatedMergeTreeAddress.cpp
MergeTree/ReplicatedMergeTreeAltersSequence.cpp MergeTree/ReplicatedMergeTreeAltersSequence.cpp
@ -177,6 +179,7 @@ SRCS(
System/StorageSystemMutations.cpp System/StorageSystemMutations.cpp
System/StorageSystemNumbers.cpp System/StorageSystemNumbers.cpp
System/StorageSystemOne.cpp System/StorageSystemOne.cpp
System/StorageSystemPartMovesBetweenShards.cpp
System/StorageSystemParts.cpp System/StorageSystemParts.cpp
System/StorageSystemPartsBase.cpp System/StorageSystemPartsBase.cpp
System/StorageSystemPartsColumns.cpp System/StorageSystemPartsColumns.cpp

View File

@ -185,9 +185,10 @@ def run_single_test(args, ext, server_logs_level, client_options, case_file, std
'stderr': stderr_file, 'stderr': stderr_file,
} }
# >> append to stdout and stderr, because there are also output of per test database creation # >> append to stderr (but not stdout since it is not used there),
# because there is also output from per-test database creation
if not args.database: if not args.database:
pattern = '{test} >> {stdout} 2>> {stderr}' pattern = '{test} > {stdout} 2>> {stderr}'
else: else:
pattern = '{test} > {stdout} 2> {stderr}' pattern = '{test} > {stdout} 2> {stderr}'

View File

@ -0,0 +1,7 @@
<yandex>
<merge_tree>
<assign_part_uuids>1</assign_part_uuids>
<part_moves_between_shards_enable>1</part_moves_between_shards_enable>
<part_moves_between_shards_delay_seconds>3</part_moves_between_shards_delay_seconds>
</merge_tree>
</yandex>

View File

@ -0,0 +1,26 @@
<yandex>
<remote_servers>
<test_cluster>
<shard>
<replica>
<host>s0r0</host>
<port>9000</port>
</replica>
<replica>
<host>s0r1</host>
<port>9000</port>
</replica>
</shard>
<shard>
<replica>
<host>s1r0</host>
<port>9000</port>
</replica>
<replica>
<host>s1r1</host>
<port>9000</port>
</replica>
</shard>
</test_cluster>
</remote_servers>
</yandex>

View File

@ -0,0 +1,159 @@
import random
import time
import pytest
from helpers.client import QueryRuntimeException
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import TSV
cluster = ClickHouseCluster(__file__)
s0r0 = cluster.add_instance(
's0r0',
main_configs=['configs/remote_servers.xml', 'configs/merge_tree.xml'],
with_zookeeper=True)
s0r1 = cluster.add_instance(
's0r1',
main_configs=['configs/remote_servers.xml', 'configs/merge_tree.xml'],
with_zookeeper=True)
s1r0 = cluster.add_instance(
's1r0',
main_configs=['configs/remote_servers.xml', 'configs/merge_tree.xml'],
with_zookeeper=True)
s1r1 = cluster.add_instance(
's1r1',
main_configs=['configs/remote_servers.xml', 'configs/merge_tree.xml'],
with_zookeeper=True)
@pytest.fixture(scope="module")
def started_cluster():
try:
cluster.start()
yield cluster
finally:
cluster.shutdown()
def test_move(started_cluster):
for shard_ix, rs in enumerate([[s0r0, s0r1], [s1r0, s1r1]]):
for replica_ix, r in enumerate(rs):
r.query("""
CREATE TABLE test_move(v UInt64)
ENGINE ReplicatedMergeTree('/clickhouse/shard_{}/tables/test_move', '{}')
ORDER BY tuple()
""".format(shard_ix, replica_ix))
s0r0.query("SYSTEM STOP MERGES test_move")
s0r0.query("INSERT INTO test_move VALUES (1)")
s0r0.query("INSERT INTO test_move VALUES (2)")
assert "2" == s0r0.query("SELECT count() FROM test_move").strip()
assert "0" == s1r0.query("SELECT count() FROM test_move").strip()
s0r0.query("ALTER TABLE test_move MOVE PART 'all_0_0_0' TO SHARD '/clickhouse/shard_1/tables/test_move'")
print(s0r0.query("SELECT * FROM system.part_moves_between_shards"))
s0r0.query("SYSTEM START MERGES test_move")
s0r0.query("OPTIMIZE TABLE test_move FINAL")
while True:
time.sleep(3)
print(s0r0.query("SELECT * FROM system.part_moves_between_shards"))
# Eventually.
if "DONE" == s0r0.query("SELECT state FROM system.part_moves_between_shards WHERE table = 'test_move'").strip():
break
for n in [s0r0, s0r1]:
assert "1" == n.query("SELECT count() FROM test_move").strip()
for n in [s1r0, s1r1]:
assert "1" == n.query("SELECT count() FROM test_move").strip()
# Move part back
s1r0.query("ALTER TABLE test_move MOVE PART 'all_0_0_0' TO SHARD '/clickhouse/shard_0/tables/test_move'")
while True:
time.sleep(3)
print(s1r0.query("SELECT * FROM system.part_moves_between_shards"))
# Eventually.
if "DONE" == s1r0.query("SELECT state FROM system.part_moves_between_shards WHERE table = 'test_move'").strip():
break
for n in [s0r0, s0r1]:
assert "2" == n.query("SELECT count() FROM test_move").strip()
for n in [s1r0, s1r1]:
assert "0" == n.query("SELECT count() FROM test_move").strip()
def test_deduplication_while_move(started_cluster):
for shard_ix, rs in enumerate([[s0r0, s0r1], [s1r0, s1r1]]):
for replica_ix, r in enumerate(rs):
r.query("""
CREATE TABLE test_deduplication(v UInt64)
ENGINE ReplicatedMergeTree('/clickhouse/shard_{}/tables/test_deduplication', '{}')
ORDER BY tuple()
""".format(shard_ix, replica_ix))
r.query("""
CREATE TABLE t_d AS test_deduplication
ENGINE Distributed('test_cluster', '', test_deduplication)
""")
s0r0.query("SYSTEM STOP MERGES test_deduplication")
s0r0.query("INSERT INTO test_deduplication VALUES (1)")
s0r0.query("INSERT INTO test_deduplication VALUES (2)")
s0r1.query("SYSTEM SYNC REPLICA test_deduplication", timeout=20)
assert "2" == s0r0.query("SELECT count() FROM test_deduplication").strip()
assert "0" == s1r0.query("SELECT count() FROM test_deduplication").strip()
s0r0.query("ALTER TABLE test_deduplication MOVE PART 'all_0_0_0' TO SHARD '/clickhouse/shard_1/tables/test_deduplication'")
s0r0.query("SYSTEM START MERGES test_deduplication")
expected = """
1
2
"""
# Verify that we get consistent results at all times while the part is moving from one shard to another.
while "DONE" != s0r0.query("SELECT state FROM system.part_moves_between_shards WHERE table = 'test_deduplication' ORDER BY create_time DESC LIMIT 1").strip():
n = random.choice(list(started_cluster.instances.values()))
assert TSV(n.query("SELECT * FROM t_d ORDER BY v", settings={
"allow_experimental_query_deduplication": 1
})) == TSV(expected)
def test_move_not_permitted(started_cluster):
for ix, n in enumerate([s0r0, s1r0]):
n.query("DROP TABLE IF EXISTS not_permitted")
n.query("""
CREATE TABLE not_permitted(v_{} UInt64)
ENGINE ReplicatedMergeTree('/clickhouse/shard_{}/tables/not_permitted', 'r')
ORDER BY tuple()
""".format(ix, ix))
s0r0.query("INSERT INTO not_permitted VALUES (1)")
with pytest.raises(QueryRuntimeException) as exc:
s0r0.query("ALTER TABLE not_permitted MOVE PART 'all_0_0_0' TO SHARD '/clickhouse/shard_1/tables/not_permitted'")
assert "DB::Exception: Table columns structure in ZooKeeper is different from local table structure." in str(exc.value)
with pytest.raises(QueryRuntimeException) as exc:
s0r0.query("ALTER TABLE not_permitted MOVE PART 'all_0_0_0' TO SHARD '/clickhouse/shard_0/tables/not_permitted'")
assert "DB::Exception: Source and destination are the same" in str(exc.value)

View File

@ -0,0 +1,7 @@
<yandex>
<profiles>
<default>
<experimental_query_deduplication_send_all_part_uuids>1</experimental_query_deduplication_send_all_part_uuids>
</default>
</profiles>
</yandex>

View File

@ -11,15 +11,18 @@ cluster = ClickHouseCluster(__file__)
node1 = cluster.add_instance( node1 = cluster.add_instance(
'node1', 'node1',
main_configs=['configs/remote_servers.xml', 'configs/deduplication_settings.xml']) main_configs=['configs/remote_servers.xml', 'configs/deduplication_settings.xml'],
user_configs=['configs/profiles.xml'])
node2 = cluster.add_instance( node2 = cluster.add_instance(
'node2', 'node2',
main_configs=['configs/remote_servers.xml', 'configs/deduplication_settings.xml']) main_configs=['configs/remote_servers.xml', 'configs/deduplication_settings.xml'],
user_configs=['configs/profiles.xml'])
node3 = cluster.add_instance( node3 = cluster.add_instance(
'node3', 'node3',
main_configs=['configs/remote_servers.xml', 'configs/deduplication_settings.xml']) main_configs=['configs/remote_servers.xml', 'configs/deduplication_settings.xml'],
user_configs=['configs/profiles.xml'])
@pytest.fixture(scope="module") @pytest.fixture(scope="module")

View File

@ -25,6 +25,6 @@ DROP DATABASE IF EXISTS test_DatabaseMemory;
CREATE DATABASE test_DatabaseMemory ENGINE = Memory; CREATE DATABASE test_DatabaseMemory ENGINE = Memory;
CREATE TABLE test_DatabaseMemory.A (A UInt8) ENGINE = Null; CREATE TABLE test_DatabaseMemory.A (A UInt8) ENGINE = Null;
SELECT sum(ignore(*, metadata_modification_time, engine_full, create_table_query)) FROM system.tables; SELECT sum(ignore(*, metadata_modification_time, engine_full, create_table_query)) FROM system.tables WHERE database = 'test_DatabaseMemory';
DROP DATABASE test_DatabaseMemory; DROP DATABASE test_DatabaseMemory;

View File

@ -57,7 +57,8 @@ ${CLICKHOUSE_LOCAL} -q "CREATE TABLE sophisticated_default
a UInt8 DEFAULT 3, a UInt8 DEFAULT 3,
b UInt8 ALIAS a + 5, b UInt8 ALIAS a + 5,
c UInt8 c UInt8
) ENGINE = Memory; SELECT count() FROM system.tables WHERE name='sophisticated_default';" ) ENGINE = Memory;
SELECT count() FROM system.tables WHERE name='sophisticated_default' AND database = currentDatabase();"
# Help is not skipped # Help is not skipped
[[ $(${CLICKHOUSE_LOCAL} --help | wc -l) -gt 100 ]] [[ $(${CLICKHOUSE_LOCAL} --help | wc -l) -gt 100 ]]

View File

@ -1,3 +1,5 @@
-- NOTE: database = currentDatabase() is not mandatory
SELECT avg(blockSize()) <= 10 FROM system.tables SETTINGS max_block_size = 10; SELECT avg(blockSize()) <= 10 FROM system.tables SETTINGS max_block_size = 10;
SELECT avg(blockSize()) <= 10 FROM system.tables LIMIT 10 SETTINGS max_block_size = 10; SELECT avg(blockSize()) <= 10 FROM system.tables LIMIT 10 SETTINGS max_block_size = 10;
SELECT (SELECT count() FROM system.tables SETTINGS max_block_size = 10) = (SELECT count() FROM system.tables SETTINGS max_block_size = 9); SELECT (SELECT count() FROM system.tables SETTINGS max_block_size = 10) = (SELECT count() FROM system.tables SETTINGS max_block_size = 9);

View File

@ -1,3 +1,5 @@
-- NOTE: database = currentDatabase() is not mandatory
SELECT count() > 0 FROM (SELECT * FROM system.columns LIMIT 0); SELECT count() > 0 FROM (SELECT * FROM system.columns LIMIT 0);
SELECT count() > 0 FROM (SELECT * FROM system.columns LIMIT 1); SELECT count() > 0 FROM (SELECT * FROM system.columns LIMIT 1);
SELECT count() > 0 FROM (SELECT * FROM system.columns LIMIT 2); SELECT count() > 0 FROM (SELECT * FROM system.columns LIMIT 2);

View File

@ -13,17 +13,15 @@ CREATE TABLE check_system_tables
SETTINGS min_bytes_for_wide_part = 0; SETTINGS min_bytes_for_wide_part = 0;
SELECT name, partition_key, sorting_key, primary_key, sampling_key, storage_policy, total_rows SELECT name, partition_key, sorting_key, primary_key, sampling_key, storage_policy, total_rows
FROM system.tables FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase()
WHERE name = 'check_system_tables'
FORMAT PrettyCompactNoEscapes; FORMAT PrettyCompactNoEscapes;
SELECT name, is_in_partition_key, is_in_sorting_key, is_in_primary_key, is_in_sampling_key SELECT name, is_in_partition_key, is_in_sorting_key, is_in_primary_key, is_in_sampling_key
FROM system.columns FROM system.columns WHERE table = 'check_system_tables' AND database = currentDatabase()
WHERE table = 'check_system_tables'
FORMAT PrettyCompactNoEscapes; FORMAT PrettyCompactNoEscapes;
INSERT INTO check_system_tables VALUES (1, 1, 1); INSERT INTO check_system_tables VALUES (1, 1, 1);
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables'; SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
DROP TABLE IF EXISTS check_system_tables; DROP TABLE IF EXISTS check_system_tables;
@ -39,13 +37,11 @@ CREATE TABLE check_system_tables
ORDER BY date; ORDER BY date;
SELECT name, partition_key, sorting_key, primary_key, sampling_key SELECT name, partition_key, sorting_key, primary_key, sampling_key
FROM system.tables FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase()
WHERE name = 'check_system_tables'
FORMAT PrettyCompactNoEscapes; FORMAT PrettyCompactNoEscapes;
SELECT name, is_in_partition_key, is_in_sorting_key, is_in_primary_key, is_in_sampling_key SELECT name, is_in_partition_key, is_in_sorting_key, is_in_primary_key, is_in_sampling_key
FROM system.columns FROM system.columns WHERE table = 'check_system_tables' AND database = currentDatabase()
WHERE table = 'check_system_tables'
FORMAT PrettyCompactNoEscapes; FORMAT PrettyCompactNoEscapes;
DROP TABLE IF EXISTS check_system_tables; DROP TABLE IF EXISTS check_system_tables;
@ -59,29 +55,27 @@ CREATE TABLE check_system_tables
) ENGINE = MergeTree(Event, intHash32(UserId), (Counter, Event, intHash32(UserId)), 8192); ) ENGINE = MergeTree(Event, intHash32(UserId), (Counter, Event, intHash32(UserId)), 8192);
SELECT name, partition_key, sorting_key, primary_key, sampling_key SELECT name, partition_key, sorting_key, primary_key, sampling_key
FROM system.tables FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase()
WHERE name = 'check_system_tables'
FORMAT PrettyCompactNoEscapes; FORMAT PrettyCompactNoEscapes;
SELECT name, is_in_partition_key, is_in_sorting_key, is_in_primary_key, is_in_sampling_key SELECT name, is_in_partition_key, is_in_sorting_key, is_in_primary_key, is_in_sampling_key
FROM system.columns FROM system.columns WHERE table = 'check_system_tables' AND database = currentDatabase()
WHERE table = 'check_system_tables'
FORMAT PrettyCompactNoEscapes; FORMAT PrettyCompactNoEscapes;
DROP TABLE IF EXISTS check_system_tables; DROP TABLE IF EXISTS check_system_tables;
SELECT 'Check total_bytes/total_rows for TinyLog'; SELECT 'Check total_bytes/total_rows for TinyLog';
CREATE TABLE check_system_tables (key UInt8) ENGINE = TinyLog(); CREATE TABLE check_system_tables (key UInt8) ENGINE = TinyLog();
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables'; SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
INSERT INTO check_system_tables VALUES (1); INSERT INTO check_system_tables VALUES (1);
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables'; SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
DROP TABLE check_system_tables; DROP TABLE check_system_tables;
SELECT 'Check total_bytes/total_rows for Memory'; SELECT 'Check total_bytes/total_rows for Memory';
CREATE TABLE check_system_tables (key UInt16) ENGINE = Memory(); CREATE TABLE check_system_tables (key UInt16) ENGINE = Memory();
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables'; SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
INSERT INTO check_system_tables VALUES (1); INSERT INTO check_system_tables VALUES (1);
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables'; SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
DROP TABLE check_system_tables; DROP TABLE check_system_tables;
SELECT 'Check total_bytes/total_rows for Buffer'; SELECT 'Check total_bytes/total_rows for Buffer';
@ -96,33 +90,33 @@ CREATE TABLE check_system_tables (key UInt16) ENGINE = Buffer(
100, 100, /* min_rows /max_rows */ 100, 100, /* min_rows /max_rows */
0, 1e6 /* min_bytes/max_bytes */ 0, 1e6 /* min_bytes/max_bytes */
); );
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables'; SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
INSERT INTO check_system_tables SELECT * FROM numbers_mt(50); INSERT INTO check_system_tables SELECT * FROM numbers_mt(50);
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables'; SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
SELECT 'Check lifetime_bytes/lifetime_rows for Buffer'; SELECT 'Check lifetime_bytes/lifetime_rows for Buffer';
SELECT lifetime_bytes, lifetime_rows FROM system.tables WHERE name = 'check_system_tables'; SELECT lifetime_bytes, lifetime_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
OPTIMIZE TABLE check_system_tables; -- flush OPTIMIZE TABLE check_system_tables; -- flush
SELECT lifetime_bytes, lifetime_rows FROM system.tables WHERE name = 'check_system_tables'; SELECT lifetime_bytes, lifetime_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
INSERT INTO check_system_tables SELECT * FROM numbers_mt(50); INSERT INTO check_system_tables SELECT * FROM numbers_mt(50);
SELECT lifetime_bytes, lifetime_rows FROM system.tables WHERE name = 'check_system_tables'; SELECT lifetime_bytes, lifetime_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
OPTIMIZE TABLE check_system_tables; -- flush OPTIMIZE TABLE check_system_tables; -- flush
SELECT lifetime_bytes, lifetime_rows FROM system.tables WHERE name = 'check_system_tables'; SELECT lifetime_bytes, lifetime_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
INSERT INTO check_system_tables SELECT * FROM numbers_mt(101); -- direct block write (due to min_rows exceeded) INSERT INTO check_system_tables SELECT * FROM numbers_mt(101); -- direct block write (due to min_rows exceeded)
SELECT lifetime_bytes, lifetime_rows FROM system.tables WHERE name = 'check_system_tables'; SELECT lifetime_bytes, lifetime_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
DROP TABLE check_system_tables; DROP TABLE check_system_tables;
DROP TABLE check_system_tables_null; DROP TABLE check_system_tables_null;
SELECT 'Check total_bytes/total_rows for Set'; SELECT 'Check total_bytes/total_rows for Set';
CREATE TABLE check_system_tables Engine=Set() AS SELECT * FROM numbers(50); CREATE TABLE check_system_tables Engine=Set() AS SELECT * FROM numbers(50);
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables'; SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
INSERT INTO check_system_tables SELECT number+50 FROM numbers(50); INSERT INTO check_system_tables SELECT number+50 FROM numbers(50);
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables'; SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
DROP TABLE check_system_tables; DROP TABLE check_system_tables;
SELECT 'Check total_bytes/total_rows for Join'; SELECT 'Check total_bytes/total_rows for Join';
CREATE TABLE check_system_tables Engine=Join(ANY, LEFT, number) AS SELECT * FROM numbers(50); CREATE TABLE check_system_tables Engine=Join(ANY, LEFT, number) AS SELECT * FROM numbers(50);
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables'; SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
INSERT INTO check_system_tables SELECT number+50 FROM numbers(50); INSERT INTO check_system_tables SELECT number+50 FROM numbers(50);
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables'; SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
DROP TABLE check_system_tables; DROP TABLE check_system_tables;

View File

@ -1,3 +1,5 @@
-- NOTE: database = currentDatabase() is not mandatory
DROP TABLE IF EXISTS table1; DROP TABLE IF EXISTS table1;
DROP TABLE IF EXISTS table2; DROP TABLE IF EXISTS table2;

View File

@ -1,5 +1,7 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
set -e set -e
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

View File

@ -7,12 +7,12 @@ DROP TABLE IF EXISTS alter_compression_codec2;
CREATE TABLE alter_compression_codec1 ( CREATE TABLE alter_compression_codec1 (
somedate Date CODEC(LZ4), somedate Date CODEC(LZ4),
id UInt64 CODEC(NONE) id UInt64 CODEC(NONE)
) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_00910/alter_compression_codecs', '1') PARTITION BY somedate ORDER BY id; ) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_00910/'||currentDatabase()||'alter_compression_codecs', '1') PARTITION BY somedate ORDER BY id;
CREATE TABLE alter_compression_codec2 ( CREATE TABLE alter_compression_codec2 (
somedate Date CODEC(LZ4), somedate Date CODEC(LZ4),
id UInt64 CODEC(NONE) id UInt64 CODEC(NONE)
) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_00910/alter_compression_codecs', '2') PARTITION BY somedate ORDER BY id; ) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_00910/'||currentDatabase()||'alter_compression_codecs', '2') PARTITION BY somedate ORDER BY id;
INSERT INTO alter_compression_codec1 VALUES('2018-01-01', 1); INSERT INTO alter_compression_codec1 VALUES('2018-01-01', 1);
INSERT INTO alter_compression_codec1 VALUES('2018-01-01', 2); INSERT INTO alter_compression_codec1 VALUES('2018-01-01', 2);
@ -25,8 +25,8 @@ ALTER TABLE alter_compression_codec1 ADD COLUMN alter_column String DEFAULT 'def
SYSTEM SYNC REPLICA alter_compression_codec1; SYSTEM SYNC REPLICA alter_compression_codec1;
SYSTEM SYNC REPLICA alter_compression_codec2; SYSTEM SYNC REPLICA alter_compression_codec2;
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec1' AND name = 'alter_column'; SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec1' AND name = 'alter_column' AND database = currentDatabase();
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec2' AND name = 'alter_column'; SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec2' AND name = 'alter_column' AND database = currentDatabase();
INSERT INTO alter_compression_codec1 VALUES('2018-01-01', 3, '3'); INSERT INTO alter_compression_codec1 VALUES('2018-01-01', 3, '3');
INSERT INTO alter_compression_codec1 VALUES('2018-01-01', 4, '4'); INSERT INTO alter_compression_codec1 VALUES('2018-01-01', 4, '4');
@ -37,8 +37,8 @@ SELECT * FROM alter_compression_codec1 ORDER BY id;
SELECT * FROM alter_compression_codec2 ORDER BY id; SELECT * FROM alter_compression_codec2 ORDER BY id;
ALTER TABLE alter_compression_codec1 MODIFY COLUMN alter_column CODEC(NONE); ALTER TABLE alter_compression_codec1 MODIFY COLUMN alter_column CODEC(NONE);
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec1' AND name = 'alter_column'; SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec1' AND name = 'alter_column' AND database = currentDatabase();
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec2' AND name = 'alter_column'; SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec2' AND name = 'alter_column' AND database = currentDatabase();
INSERT INTO alter_compression_codec2 VALUES('2018-01-01', 5, '5'); INSERT INTO alter_compression_codec2 VALUES('2018-01-01', 5, '5');
INSERT INTO alter_compression_codec2 VALUES('2018-01-01', 6, '6'); INSERT INTO alter_compression_codec2 VALUES('2018-01-01', 6, '6');
@ -50,8 +50,8 @@ SET allow_suspicious_codecs = 1;
ALTER TABLE alter_compression_codec1 MODIFY COLUMN alter_column CODEC(ZSTD, LZ4HC, LZ4, LZ4, NONE); ALTER TABLE alter_compression_codec1 MODIFY COLUMN alter_column CODEC(ZSTD, LZ4HC, LZ4, LZ4, NONE);
SYSTEM SYNC REPLICA alter_compression_codec1; SYSTEM SYNC REPLICA alter_compression_codec1;
SYSTEM SYNC REPLICA alter_compression_codec2; SYSTEM SYNC REPLICA alter_compression_codec2;
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec1' AND name = 'alter_column'; SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec1' AND name = 'alter_column' AND database = currentDatabase();
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec2' AND name = 'alter_column'; SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec2' AND name = 'alter_column' AND database = currentDatabase();
INSERT INTO alter_compression_codec1 VALUES('2018-01-01', 7, '7'); INSERT INTO alter_compression_codec1 VALUES('2018-01-01', 7, '7');
INSERT INTO alter_compression_codec2 VALUES('2018-01-01', 8, '8'); INSERT INTO alter_compression_codec2 VALUES('2018-01-01', 8, '8');
@ -62,8 +62,8 @@ SELECT * FROM alter_compression_codec2 ORDER BY id;
ALTER TABLE alter_compression_codec1 MODIFY COLUMN alter_column FixedString(100); ALTER TABLE alter_compression_codec1 MODIFY COLUMN alter_column FixedString(100);
SYSTEM SYNC REPLICA alter_compression_codec2; SYSTEM SYNC REPLICA alter_compression_codec2;
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec1' AND name = 'alter_column'; SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec1' AND name = 'alter_column' AND database = currentDatabase();
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec2' AND name = 'alter_column'; SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec2' AND name = 'alter_column' AND database = currentDatabase();
DROP TABLE IF EXISTS alter_compression_codec1; DROP TABLE IF EXISTS alter_compression_codec1;
DROP TABLE IF EXISTS alter_compression_codec2; DROP TABLE IF EXISTS alter_compression_codec2;
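The recurring change above (and in most of the files below) is appending AND database = currentDatabase() to lookups in system.columns, system.parts and similar tables. As a hedged illustration of why this matters (the snippet is not part of the diff and assumes the usual $CLICKHOUSE_CLIENT helper from shell_config.sh): the system tables are server-wide, so when tests run in parallel, each in its own database, an unfiltered lookup can match rows created by another test.

# Illustration only: an unfiltered lookup may match the same table name in several test databases.
$CLICKHOUSE_CLIENT -q "SELECT compression_codec FROM system.columns
                       WHERE table = 'alter_compression_codec1' AND name = 'alter_column'"
# Restricting to the test's own database keeps the result deterministic.
$CLICKHOUSE_CLIENT -q "SELECT compression_codec FROM system.columns
                       WHERE table = 'alter_compression_codec1' AND name = 'alter_column'
                         AND database = currentDatabase()"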

View File

@ -8,6 +8,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
set -e set -e
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
for _ in {1..100}; do \ for _ in {1..100}; do \
$CLICKHOUSE_CLIENT -q "SELECT name FROM system.tables UNION ALL SELECT name FROM system.columns format Null"; $CLICKHOUSE_CLIENT -q "SELECT name FROM system.tables UNION ALL SELECT name FROM system.columns format Null";
done done

View File

@ -13,6 +13,7 @@ $CLICKHOUSE_CLIENT -q "CREATE TABLE alter_table (a UInt8, b Int16, c Float32, d
function thread1() function thread1()
{ {
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
while true; do $CLICKHOUSE_CLIENT --query "SELECT name FROM system.columns UNION ALL SELECT name FROM system.columns FORMAT Null"; done while true; do $CLICKHOUSE_CLIENT --query "SELECT name FROM system.columns UNION ALL SELECT name FROM system.columns FORMAT Null"; done
} }

View File

@ -15,6 +15,7 @@ $CLICKHOUSE_CLIENT -q "CREATE TABLE alter_table (a UInt8, b Int16, c Float32, d
function thread1() function thread1()
{ {
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
while true; do $CLICKHOUSE_CLIENT --query "SELECT * FROM system.parts FORMAT Null"; done while true; do $CLICKHOUSE_CLIENT --query "SELECT * FROM system.parts FORMAT Null"; done
} }

View File

@ -16,6 +16,7 @@ $CLICKHOUSE_CLIENT -n -q "
function thread1() function thread1()
{ {
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
while true; do $CLICKHOUSE_CLIENT --query "SELECT * FROM system.parts FORMAT Null"; done while true; do $CLICKHOUSE_CLIENT --query "SELECT * FROM system.parts FORMAT Null"; done
} }

View File

@ -8,6 +8,7 @@ set -e
function thread1() function thread1()
{ {
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
while true; do while true; do
$CLICKHOUSE_CLIENT --query "SELECT * FROM system.parts FORMAT Null"; $CLICKHOUSE_CLIENT --query "SELECT * FROM system.parts FORMAT Null";
done done

View File

@ -28,6 +28,7 @@ function thread2()
function thread3() function thread3()
{ {
while true; do while true; do
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
$CLICKHOUSE_CLIENT --query "SELECT * FROM system.tables" --format Null $CLICKHOUSE_CLIENT --query "SELECT * FROM system.tables" --format Null
done done
} }

View File

@ -16,6 +16,7 @@ $CLICKHOUSE_CLIENT --query "CREATE TABLE b (x UInt8) ENGINE = MergeTree ORDER BY
function thread1() function thread1()
{ {
while true; do while true; do
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
seq 1 100 | awk '{ print "SELECT x FROM a WHERE x IN (SELECT toUInt8(count()) FROM system.tables);" }' | $CLICKHOUSE_CLIENT -n seq 1 100 | awk '{ print "SELECT x FROM a WHERE x IN (SELECT toUInt8(count()) FROM system.tables);" }' | $CLICKHOUSE_CLIENT -n
done done
} }
@ -23,6 +24,7 @@ function thread1()
function thread2() function thread2()
{ {
while true; do while true; do
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
seq 1 100 | awk '{ print "SELECT x FROM b WHERE x IN (SELECT toUInt8(count()) FROM system.tables);" }' | $CLICKHOUSE_CLIENT -n seq 1 100 | awk '{ print "SELECT x FROM b WHERE x IN (SELECT toUInt8(count()) FROM system.tables);" }' | $CLICKHOUSE_CLIENT -n
done done
} }

View File

@ -1,2 +1 @@
default
CREATE QUOTA default KEYED BY user_name FOR INTERVAL 1 hour TRACKING ONLY TO default, readonly CREATE QUOTA default KEYED BY user_name FOR INTERVAL 1 hour TRACKING ONLY TO default, readonly

View File

@ -1,2 +1 @@
SHOW QUOTAS;
SHOW CREATE QUOTA default; SHOW CREATE QUOTA default;

View File

@ -82,14 +82,14 @@ wait
echo "Finishing alters" echo "Finishing alters"
columns1=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_1'" 2> /dev/null) columns1=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_1' and database='$CLICKHOUSE_DATABASE'" 2> /dev/null)
columns2=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_2'" 2> /dev/null) columns2=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_2' and database='$CLICKHOUSE_DATABASE'" 2> /dev/null)
columns3=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_3'" 2> /dev/null) columns3=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_3' and database='$CLICKHOUSE_DATABASE'" 2> /dev/null)
while [ "$columns1" != "$columns2" ] || [ "$columns2" != "$columns3" ]; do while [ "$columns1" != "$columns2" ] || [ "$columns2" != "$columns3" ]; do
columns1=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_1'" 2> /dev/null) columns1=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_1' and database='$CLICKHOUSE_DATABASE'" 2> /dev/null)
columns2=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_2'" 2> /dev/null) columns2=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_2' and database='$CLICKHOUSE_DATABASE'" 2> /dev/null)
columns3=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_3'" 2> /dev/null) columns3=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_3' and database='$CLICKHOUSE_DATABASE'" 2> /dev/null)
sleep 1 sleep 1
done done

View File

@ -10,6 +10,7 @@ url="${url_without_session}session_id=test_01098"
${CLICKHOUSE_CURL} -m 30 -sSk "$url" --data "DROP TEMPORARY TABLE IF EXISTS tmp_table" ${CLICKHOUSE_CURL} -m 30 -sSk "$url" --data "DROP TEMPORARY TABLE IF EXISTS tmp_table"
${CLICKHOUSE_CURL} -m 30 -sSk "$url" --data "CREATE TEMPORARY TABLE tmp_table AS SELECT number AS n FROM numbers(42)" ${CLICKHOUSE_CURL} -m 30 -sSk "$url" --data "CREATE TEMPORARY TABLE tmp_table AS SELECT number AS n FROM numbers(42)"
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
id=$(echo "SELECT uuid FROM system.tables WHERE name='tmp_table' AND is_temporary" | ${CLICKHOUSE_CURL} -m 31 -sSgk "$url" -d @-) id=$(echo "SELECT uuid FROM system.tables WHERE name='tmp_table' AND is_temporary" | ${CLICKHOUSE_CURL} -m 31 -sSgk "$url" -d @-)
internal_table_name="_temporary_and_external_tables.\`_tmp_$id\`" internal_table_name="_temporary_and_external_tables.\`_tmp_$id\`"

View File

@ -4,6 +4,7 @@ CREATE DATABASE test_01109 ENGINE=Atomic;
USE test_01109; USE test_01109;
CREATE TABLE t0 ENGINE=MergeTree() ORDER BY tuple() AS SELECT rowNumberInAllBlocks(), * FROM (SELECT toLowCardinality(arrayJoin(['exchange', 'tables']))); CREATE TABLE t0 ENGINE=MergeTree() ORDER BY tuple() AS SELECT rowNumberInAllBlocks(), * FROM (SELECT toLowCardinality(arrayJoin(['exchange', 'tables'])));
-- NOTE: database = currentDatabase() is not mandatory
CREATE TABLE t1 ENGINE=Log() AS SELECT * FROM system.tables AS t JOIN system.databases AS d ON t.database=d.name; CREATE TABLE t1 ENGINE=Log() AS SELECT * FROM system.tables AS t JOIN system.databases AS d ON t.database=d.name;
CREATE TABLE t2 ENGINE=MergeTree() ORDER BY tuple() AS SELECT rowNumberInAllBlocks() + (SELECT count() FROM t0), * FROM (SELECT arrayJoin(['hello', 'world'])); CREATE TABLE t2 ENGINE=MergeTree() ORDER BY tuple() AS SELECT rowNumberInAllBlocks() + (SELECT count() FROM t0), * FROM (SELECT arrayJoin(['hello', 'world']));

View File

@ -1,7 +1,7 @@
1 1
CREATE TABLE default.table_for_rename_replicated\n(\n `date` Date,\n `key` UInt64,\n `value1` String,\n `value2` String,\n `value3` String\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/tables/01213_alter_rename_column_zookeeper_default/table_for_rename_replicated\', \'1\')\nPARTITION BY date\nORDER BY key\nSETTINGS index_granularity = 8192 CREATE TABLE default.table_for_rename_replicated\n(\n `date` Date,\n `key` UInt64,\n `value1` String,\n `value2` String,\n `value3` String\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/tables/01213_alter_rename_column_zookeeper_long_default/table_for_rename_replicated\', \'1\')\nPARTITION BY date\nORDER BY key\nSETTINGS index_granularity = 8192
renamed_value1 renamed_value1
CREATE TABLE default.table_for_rename_replicated\n(\n `date` Date,\n `key` UInt64,\n `renamed_value1` String,\n `value2` String,\n `value3` String\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/tables/01213_alter_rename_column_zookeeper_default/table_for_rename_replicated\', \'1\')\nPARTITION BY date\nORDER BY key\nSETTINGS index_granularity = 8192 CREATE TABLE default.table_for_rename_replicated\n(\n `date` Date,\n `key` UInt64,\n `renamed_value1` String,\n `value2` String,\n `value3` String\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/tables/01213_alter_rename_column_zookeeper_long_default/table_for_rename_replicated\', \'1\')\nPARTITION BY date\nORDER BY key\nSETTINGS index_granularity = 8192
1 1
date key renamed_value1 value2 value3 date key renamed_value1 value2 value3
2019-10-02 1 1 1 1 2019-10-02 1 1 1 1

View File

@ -32,11 +32,11 @@ $CLICKHOUSE_CLIENT --query "SHOW CREATE TABLE table_for_rename_replicated;"
$CLICKHOUSE_CLIENT --query "ALTER TABLE table_for_rename_replicated RENAME COLUMN value1 to renamed_value1" --replication_alter_partitions_sync=0 $CLICKHOUSE_CLIENT --query "ALTER TABLE table_for_rename_replicated RENAME COLUMN value1 to renamed_value1" --replication_alter_partitions_sync=0
while [[ -z $($CLICKHOUSE_CLIENT --query "SELECT name FROM system.columns WHERE name = 'renamed_value1' and table = 'table_for_rename_replicated'" 2>/dev/null) ]]; do while [[ -z $($CLICKHOUSE_CLIENT --query "SELECT name FROM system.columns WHERE name = 'renamed_value1' and table = 'table_for_rename_replicated' AND database = '$CLICKHOUSE_DATABASE'" 2>/dev/null) ]]; do
sleep 0.5 sleep 0.5
done done
$CLICKHOUSE_CLIENT --query "SELECT name FROM system.columns WHERE name = 'renamed_value1' and table = 'table_for_rename_replicated'" $CLICKHOUSE_CLIENT --query "SELECT name FROM system.columns WHERE name = 'renamed_value1' and table = 'table_for_rename_replicated' AND database = '$CLICKHOUSE_DATABASE'"
# SHOW CREATE TABLE takes the query from the .sql file on disk. # SHOW CREATE TABLE takes the query from the .sql file on disk.
# The previous select takes metadata from memory, so even when it reports that renamed_value1 already exists in the table, the on-disk version can still be the old one. # The previous select takes metadata from memory, so even when it reports that renamed_value1 already exists in the table, the on-disk version can still be the old one.
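A minimal sketch of how a test could also wait for the on-disk metadata to catch up (illustrative only, not part of this diff; it reuses the table name and the $CLICKHOUSE_CLIENT helper from the test above):

# Poll SHOW CREATE TABLE until the renamed column is visible in the on-disk .sql as well.
while ! $CLICKHOUSE_CLIENT --query "SHOW CREATE TABLE table_for_rename_replicated" | grep -q renamed_value1; do
    sleep 0.5
done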

View File

@ -6,6 +6,7 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh # shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh . "$CUR_DIR"/../shell_config.sh
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
${CLICKHOUSE_CLIENT} --query "CREATE TABLE system.columns AS numbers(10);" 2>&1 | grep -F "Code: 57" > /dev/null && echo 'OK' || echo 'FAIL' ${CLICKHOUSE_CLIENT} --query "CREATE TABLE system.columns AS numbers(10);" 2>&1 | grep -F "Code: 57" > /dev/null && echo 'OK' || echo 'FAIL'
${CLICKHOUSE_CLIENT} --query "CREATE TABLE system.columns engine=Memory AS numbers(10);" 2>&1 | grep -F "Code: 62" > /dev/null && echo 'OK' || echo 'FAIL' ${CLICKHOUSE_CLIENT} --query "CREATE TABLE system.columns engine=Memory AS numbers(10);" 2>&1 | grep -F "Code: 62" > /dev/null && echo 'OK' || echo 'FAIL'
${CLICKHOUSE_CLIENT} --query "CREATE TABLE system.columns AS numbers(10) engine=Memory;" 2>&1 | grep -F "Code: 62" > /dev/null && echo 'OK' || echo 'FAIL' ${CLICKHOUSE_CLIENT} --query "CREATE TABLE system.columns AS numbers(10) engine=Memory;" 2>&1 | grep -F "Code: 62" > /dev/null && echo 'OK' || echo 'FAIL'

View File

@ -15,13 +15,13 @@ ALTER TABLE mt_01451 DETACH PART 'all_2_2_0';
SELECT v FROM mt_01451 ORDER BY v; SELECT v FROM mt_01451 ORDER BY v;
SELECT name FROM system.detached_parts WHERE table = 'mt_01451'; SELECT name FROM system.detached_parts WHERE table = 'mt_01451' AND database = currentDatabase();
ALTER TABLE mt_01451 ATTACH PART 'all_2_2_0'; ALTER TABLE mt_01451 ATTACH PART 'all_2_2_0';
SELECT v FROM mt_01451 ORDER BY v; SELECT v FROM mt_01451 ORDER BY v;
SELECT name FROM system.detached_parts WHERE table = 'mt_01451'; SELECT name FROM system.detached_parts WHERE table = 'mt_01451' AND database = currentDatabase();
SELECT '-- drop part --'; SELECT '-- drop part --';
@ -37,6 +37,6 @@ OPTIMIZE TABLE mt_01451 FINAL;
SELECT v FROM mt_01451 ORDER BY v; SELECT v FROM mt_01451 ORDER BY v;
SELECT name FROM system.parts WHERE table = 'mt_01451' AND active; SELECT name FROM system.parts WHERE table = 'mt_01451' AND active AND database = currentDatabase();
DROP TABLE mt_01451; DROP TABLE mt_01451;

View File

@ -3,8 +3,8 @@ SET replication_alter_partitions_sync = 2;
DROP TABLE IF EXISTS replica1; DROP TABLE IF EXISTS replica1;
DROP TABLE IF EXISTS replica2; DROP TABLE IF EXISTS replica2;
CREATE TABLE replica1 (v UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/01451/attach', 'r1') order by tuple() settings max_replicated_merges_in_queue = 0; CREATE TABLE replica1 (v UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/'||currentDatabase()||'test/01451/attach', 'r1') order by tuple() settings max_replicated_merges_in_queue = 0;
CREATE TABLE replica2 (v UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/01451/attach', 'r2') order by tuple() settings max_replicated_merges_in_queue = 0; CREATE TABLE replica2 (v UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/'||currentDatabase()||'test/01451/attach', 'r2') order by tuple() settings max_replicated_merges_in_queue = 0;
INSERT INTO replica1 VALUES (0); INSERT INTO replica1 VALUES (0);
INSERT INTO replica1 VALUES (1); INSERT INTO replica1 VALUES (1);
@ -19,14 +19,14 @@ ALTER TABLE replica2 DETACH PART 'all_1_1_0';
SELECT v FROM replica1 ORDER BY v; SELECT v FROM replica1 ORDER BY v;
SELECT name FROM system.detached_parts WHERE table = 'replica2'; SELECT name FROM system.detached_parts WHERE table = 'replica2' AND database = currentDatabase();
ALTER TABLE replica2 ATTACH PART 'all_1_1_0'; ALTER TABLE replica2 ATTACH PART 'all_1_1_0';
SYSTEM SYNC REPLICA replica1; SYSTEM SYNC REPLICA replica1;
SELECT v FROM replica1 ORDER BY v; SELECT v FROM replica1 ORDER BY v;
SELECT name FROM system.detached_parts WHERE table = 'replica2'; SELECT name FROM system.detached_parts WHERE table = 'replica2' AND database = currentDatabase();
SELECT '-- drop part --'; SELECT '-- drop part --';
@ -43,7 +43,7 @@ OPTIMIZE TABLE replica1 FINAL;
SELECT v FROM replica1 ORDER BY v; SELECT v FROM replica1 ORDER BY v;
SELECT name FROM system.parts WHERE table = 'replica2' AND active; SELECT name FROM system.parts WHERE table = 'replica2' AND active AND database = currentDatabase();
DROP TABLE replica1; DROP TABLE replica1;
DROP TABLE replica2; DROP TABLE replica2;
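The CREATE TABLE change above applies the same isolation idea to ZooKeeper: embedding currentDatabase() in the ReplicatedMergeTree path gives every test database its own replica metadata, so concurrent runs cannot collide. A sketch of the pattern with a hypothetical table name and path suffix:

# Hypothetical example: the ZooKeeper path becomes unique per test database.
$CLICKHOUSE_CLIENT -q "CREATE TABLE r1 (v UInt8)
    ENGINE = ReplicatedMergeTree('/clickhouse/tables/' || currentDatabase() || '/01451/example', 'r1')
    ORDER BY tuple()"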

View File

@ -1,7 +1,8 @@
-- NOTE: database = currentDatabase() is not mandatory
SELECT database FROM system.tables WHERE database LIKE '%' format Null; SELECT database FROM system.tables WHERE database LIKE '%' format Null;
SELECT database AS db FROM system.tables WHERE db LIKE '%' format Null; SELECT database AS db FROM system.tables WHERE db LIKE '%' format Null;
SELECT CAST(database, 'String') AS db FROM system.tables WHERE db LIKE '%' format Null; SELECT CAST(database, 'String') AS db FROM system.tables WHERE db LIKE '%' format Null;
SELECT CAST('a string', 'Nullable(String)') AS str WHERE str LIKE '%' format Null; SELECT CAST('a string', 'Nullable(String)') AS str WHERE str LIKE '%' format Null;
SELECT CAST(database, 'Nullable(String)') AS ndb FROM system.tables WHERE ndb LIKE '%' format Null; SELECT CAST(database, 'Nullable(String)') AS ndb FROM system.tables WHERE ndb LIKE '%' format Null;
SELECT 'all tests passed'; SELECT 'all tests passed';

View File

@ -14,7 +14,7 @@ INSERT INTO optimize_final SELECT toDate('2000-01-01'), number + 5 FROM numbers(
OPTIMIZE TABLE optimize_final FINAL; OPTIMIZE TABLE optimize_final FINAL;
SELECT table, partition, active, level from system.parts where table = 'optimize_final' and active = 1; SELECT table, partition, active, level from system.parts where table = 'optimize_final' and database = currentDatabase() and active = 1;
DROP TABLE optimize_final; DROP TABLE optimize_final;

View File

@ -38,5 +38,5 @@ DROP TABLE IF EXISTS empty;
CREATE TABLE empty (key UInt32, val UInt32, date Datetime) ENGINE=SummingMergeTree(val) PARTITION BY date ORDER BY key; CREATE TABLE empty (key UInt32, val UInt32, date Datetime) ENGINE=SummingMergeTree(val) PARTITION BY date ORDER BY key;
INSERT INTO empty VALUES (1, 1, '2020-01-01'), (1, 1, '2020-01-01'), (1, -2, '2020-01-01'); INSERT INTO empty VALUES (1, 1, '2020-01-01'), (1, 1, '2020-01-01'), (1, -2, '2020-01-01');
SELECT * FROM empty ORDER BY key; SELECT * FROM empty ORDER BY key;
SELECT table, partition, active FROM system.parts where table = 'empty' and active = 1; SELECT table, partition, active FROM system.parts where table = 'empty' and active = 1 and database = currentDatabase();
DROP TABLE empty; DROP TABLE empty;

View File

@ -4,6 +4,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh # shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh . "$CURDIR"/../shell_config.sh
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
verify_sql="SELECT verify_sql="SELECT
(SELECT sumIf(value, metric = 'PartsCommitted'), sumIf(value, metric = 'PartsOutdated') FROM system.metrics) (SELECT sumIf(value, metric = 'PartsCommitted'), sumIf(value, metric = 'PartsOutdated') FROM system.metrics)
= (SELECT sum(active), sum(NOT active) FROM system.parts)" = (SELECT sum(active), sum(NOT active) FROM system.parts)"

View File

@ -7,6 +7,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
set -e set -e
set -o pipefail set -o pipefail
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
verify_sql="SELECT verify_sql="SELECT
(SELECT sumIf(value, metric = 'PartsInMemory'), sumIf(value, metric = 'PartsCompact'), sumIf(value, metric = 'PartsWide') FROM system.metrics) = (SELECT sumIf(value, metric = 'PartsInMemory'), sumIf(value, metric = 'PartsCompact'), sumIf(value, metric = 'PartsWide') FROM system.metrics) =
(SELECT countIf(part_type == 'InMemory'), countIf(part_type == 'Compact'), countIf(part_type == 'Wide') FROM system.parts)" (SELECT countIf(part_type == 'InMemory'), countIf(part_type == 'Compact'), countIf(part_type == 'Wide') FROM system.parts)"

View File

@ -4,6 +4,8 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh # shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh . "$CURDIR"/../shell_config.sh
# NOTE: database = $CLICKHOUSE_DATABASE is superfluous
function test_completion_word() function test_completion_word()
{ {
local w=$1 && shift local w=$1 && shift
@ -13,7 +15,7 @@ function test_completion_word()
local compword_end=${w:$((w_len-3))} local compword_end=${w:$((w_len-3))}
# NOTE: here and below you should escape variables of the expect. # NOTE: here and below you should escape variables of the expect.
timeout 22s expect << EOF timeout 60s expect << EOF
log_user 0 log_user 0
set timeout 3 set timeout 3
match_max 100000 match_max 100000

View File

@ -1,11 +1,11 @@
-- TinyTinyLog -- TinyLog
DROP TABLE IF EXISTS nested_01800_tiny_log; DROP TABLE IF EXISTS nested_01800_tiny_log;
CREATE TABLE nested_01800_tiny_log (`column` Nested(name String, names Array(String), types Array(Enum8('PU' = 1, 'US' = 2, 'OTHER' = 3)))) ENGINE = TinyLog; CREATE TABLE nested_01800_tiny_log (`column` Nested(name String, names Array(String), types Array(Enum8('PU' = 1, 'US' = 2, 'OTHER' = 3)))) ENGINE = TinyLog;
INSERT INTO nested_01800_tiny_log VALUES (['Hello', 'World'], [['a'], ['b', 'c']], [['PU', 'US'], ['OTHER']]); INSERT INTO nested_01800_tiny_log VALUES (['Hello', 'World'], [['a'], ['b', 'c']], [['PU', 'US'], ['OTHER']]);
SELECT 10 FROM nested_01800_tiny_log FORMAT Null; SELECT 10 FROM nested_01800_tiny_log FORMAT Null;
DROP TABLE nested_01800_tiny_log; DROP TABLE nested_01800_tiny_log;
-- StripeStripeLog -- StripeLog
DROP TABLE IF EXISTS nested_01800_stripe_log; DROP TABLE IF EXISTS nested_01800_stripe_log;
CREATE TABLE nested_01800_stripe_log (`column` Nested(name String, names Array(String), types Array(Enum8('PU' = 1, 'US' = 2, 'OTHER' = 3)))) ENGINE = StripeLog; CREATE TABLE nested_01800_stripe_log (`column` Nested(name String, names Array(String), types Array(Enum8('PU' = 1, 'US' = 2, 'OTHER' = 3)))) ENGINE = StripeLog;
INSERT INTO nested_01800_stripe_log VALUES (['Hello', 'World'], [['a'], ['b', 'c']], [['PU', 'US'], ['OTHER']]); INSERT INTO nested_01800_stripe_log VALUES (['Hello', 'World'], [['a'], ['b', 'c']], [['PU', 'US'], ['OTHER']]);

View File

@ -28,7 +28,7 @@ SELECT
name, name,
comment comment
FROM system.tables FROM system.tables
WHERE name IN ('t1', 't2', 't3') order by name; WHERE name IN ('t1', 't2', 't3') AND database = currentDatabase() order by name;
SHOW CREATE TABLE t1; SHOW CREATE TABLE t1;

View File

@ -1 +1,2 @@
-- NOTE: database = currentDatabase() is not mandatory
SELECT sum(data_compressed_bytes) > 0, sum(data_uncompressed_bytes) > 0, sum(marks_bytes) > 0 FROM system.columns; SELECT sum(data_compressed_bytes) > 0, sum(data_uncompressed_bytes) > 0, sum(marks_bytes) > 0 FROM system.columns;

View File

@ -94,7 +94,28 @@ tests_with_query_log=( $(
xargs grep --with-filename -e system.query_log -e system.query_thread_log | cut -d: -f1 | sort -u xargs grep --with-filename -e system.query_log -e system.query_thread_log | cut -d: -f1 | sort -u
) ) ) )
for test_case in "${tests_with_query_log[@]}"; do for test_case in "${tests_with_query_log[@]}"; do
grep -qE current_database.*currentDatabase "$test_case" || echo "Queries to system.query_log/system.query_thread_log does not have current_database = currentDatabase() condition in $test_case" grep -qE current_database.*currentDatabase "$test_case" || {
grep -qE 'current_database.*\$CLICKHOUSE_DATABASE' "$test_case"
} || echo "Queries to system.query_log/system.query_thread_log does not have current_database = currentDatabase() condition in $test_case"
done
# Queries to system.tables/system.parts/system.detached_parts/system.parts_columns/system.columns should have database = currentDatabase() condition
# NOTE: it is not that accurate, but at least something.
tests_with_database_column=( $(
find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' |
grep -vP $EXCLUDE_DIRS |
grep -v -x -e $ROOT_PATH/tests/queries/query_test.py |
xargs grep --with-filename -e system.tables -e system.parts -e system.detached_parts -e system.parts_columns -e system.columns | cut -d: -f1 | sort -u
) )
for test_case in "${tests_with_database_column[@]}"; do
grep -qE database.*currentDatabase "$test_case" || {
grep -qE 'database.*\$CLICKHOUSE_DATABASE' "$test_case"
} || {
# explicit database
grep -qE "database[ ]*=[ ]*'" "$test_case"
} || {
echo "Queries to system.tables/system.parts/system.detached_parts/system.parts_columns/system.columns does not have database = currentDatabase()/\$CLICKHOUSE_DATABASE condition in $test_case"
}
done done
# Queries with ReplicatedMergeTree # Queries with ReplicatedMergeTree
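For reference, a sketch of what the new check accepts and reports (hypothetical test lines, not taken from this diff):

# A test file with queries like these passes: the database is constrained in one of the recognized forms.
$CLICKHOUSE_CLIENT -q "SELECT name FROM system.tables WHERE database = currentDatabase()"
$CLICKHOUSE_CLIENT -q "SELECT name FROM system.parts WHERE database = '$CLICKHOUSE_DATABASE'"
# A file that touches system.columns (or the other listed tables) with no database condition
# at all is reported by the echo above.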

View File

@ -8,11 +8,6 @@ source default-config
mkdir -p "${WORKSPACE}/build" mkdir -p "${WORKSPACE}/build"
pushd "${WORKSPACE}/build" pushd "${WORKSPACE}/build"
if [[ "${ENABLE_EMBEDDED_COMPILER}" == 1 ]]; then
[[ "$USE_LLVM_LIBRARIES_FROM_SYSTEM" == 0 ]] && CMAKE_FLAGS="$CMAKE_FLAGS -DUSE_INTERNAL_LLVM_LIBRARY=1"
[[ "$USE_LLVM_LIBRARIES_FROM_SYSTEM" != 0 ]] && CMAKE_FLAGS="$CMAKE_FLAGS -DUSE_INTERNAL_LLVM_LIBRARY=0"
fi
cmake -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DENABLE_EMBEDDED_COMPILER=${ENABLE_EMBEDDED_COMPILER} $CMAKE_FLAGS ../sources cmake -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DENABLE_EMBEDDED_COMPILER=${ENABLE_EMBEDDED_COMPILER} $CMAKE_FLAGS ../sources
[[ "$BUILD_TARGETS" != 'all' ]] && BUILD_TARGETS_STRING="--target $BUILD_TARGETS" [[ "$BUILD_TARGETS" != 'all' ]] && BUILD_TARGETS_STRING="--target $BUILD_TARGETS"

View File

@ -27,7 +27,6 @@ CLANG_SOURCES_BRANCH=trunk # or tags/RELEASE_600/final
GCC_SOURCES_VERSION=latest # or gcc-7.1.0 GCC_SOURCES_VERSION=latest # or gcc-7.1.0
# install-libraries # install-libraries
USE_LLVM_LIBRARIES_FROM_SYSTEM=0 # 0 or 1
ENABLE_EMBEDDED_COMPILER=1 ENABLE_EMBEDDED_COMPILER=1
# build # build

View File

@ -5,7 +5,3 @@ source default-config
./install-os-packages.sh libicu-dev ./install-os-packages.sh libicu-dev
./install-os-packages.sh libreadline-dev ./install-os-packages.sh libreadline-dev
if [[ "$ENABLE_EMBEDDED_COMPILER" == 1 && "$USE_LLVM_LIBRARIES_FROM_SYSTEM" == 1 ]]; then
./install-os-packages.sh llvm-libs-5.0
fi
Some files were not shown because too many files have changed in this diff