Merge branch 'master' of https://github.com/ClickHouse/ClickHouse into pg-ch-replica

This commit is contained in:
kssenii 2021-02-20 18:31:26 +00:00
commit d590c32708
167 changed files with 4936 additions and 1225 deletions

View File

@ -416,7 +416,9 @@ static void sanitizerDeathCallback()
else
log_message = "Terminate called without an active exception";
static const size_t buf_size = 1024;
/// POSIX.1 says that write(2)s of less than PIPE_BUF bytes must be atomic - man 7 pipe
/// And the buffer should not be too small because our exception messages can be large.
static constexpr size_t buf_size = PIPE_BUF;
if (log_message.size() > buf_size - 16)
log_message.resize(buf_size - 16);

View File

@ -107,6 +107,18 @@ function start_server
fi
echo "ClickHouse server pid '$server_pid' started and responded"
echo "
handle all noprint
handle SIGSEGV stop print
handle SIGBUS stop print
handle SIGABRT stop print
continue
thread apply all backtrace
continue
" > script.gdb
gdb -batch -command script.gdb -p "$server_pid" &
}
function clone_root
@ -259,6 +271,7 @@ function run_tests
00929_multi_match_edit_distance
01681_hyperscan_debug_assertion
01176_mysql_client_interactive # requires mysql client
01031_mutations_interpreter_and_context
01053_ssd_dictionary # this test mistakenly requires acces to /var/lib/clickhouse -- can't run this locally, disabled
01083_expressions_in_engine_arguments
@ -326,7 +339,7 @@ function run_tests
# Look at DistributedFilesToInsert, so cannot run in parallel.
01460_DistributedFilesToInsert
01541_max_memory_usage_for_user
01541_max_memory_usage_for_user_long
# Require python libraries like scipy, pandas and numpy
01322_ttest_scipy

View File

@ -60,4 +60,8 @@ fi
# more idiologically correct.
read -ra ADDITIONAL_OPTIONS <<< "${ADDITIONAL_OPTIONS:-}"
if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
ADDITIONAL_OPTIONS+=('--replicated-database')
fi
clickhouse-test --testname --shard --zookeeper --no-stateless --hung-check --print-time "$SKIP_LIST_OPT" "${ADDITIONAL_OPTIONS[@]}" "$SKIP_TESTS_OPTION" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt

View File

@ -3,6 +3,9 @@ FROM yandex/clickhouse-test-base
ARG odbc_driver_url="https://github.com/ClickHouse/clickhouse-odbc/releases/download/v1.1.4.20200302/clickhouse-odbc-1.1.4-Linux.tar.gz"
RUN echo "deb [trusted=yes] http://repo.mysql.com/apt/ubuntu/ bionic mysql-5.7" >> /etc/apt/sources.list \
&& apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 8C718D3B5072E1F5
RUN apt-get update -y \
&& env DEBIAN_FRONTEND=noninteractive \
apt-get install --yes --no-install-recommends \
@ -24,7 +27,8 @@ RUN apt-get update -y \
telnet \
tree \
unixodbc \
wget
wget \
mysql-client=5.7*
RUN pip3 install numpy scipy pandas

View File

@ -57,6 +57,10 @@ function run_tests()
ADDITIONAL_OPTIONS+=('4')
fi
if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
ADDITIONAL_OPTIONS+=('--replicated-database')
fi
clickhouse-test --testname --shard --zookeeper --hung-check --print-time \
--test-runs "$NUM_TRIES" \
"$SKIP_LIST_OPT" "${ADDITIONAL_OPTIONS[@]}" 2>&1 \

View File

@ -23,12 +23,15 @@ def get_options(i):
if 0 < i:
options += " --order=random"
if i % 2 == 1:
if i % 3 == 1:
options += " --db-engine=Ordinary"
if i % 3 == 2:
options += ''' --db-engine="Replicated('/test/db/test_{}', 's1', 'r1')"'''.format(i)
# If database name is not specified, new database is created for each functional test.
# Run some threads with one database for all tests.
if i % 3 == 1:
if i % 2 == 1:
options += " --database=test_{}".format(i)
if i == 13:

View File

@ -59,6 +59,21 @@ Optional parameters:
- `rabbitmq_max_block_size`
- `rabbitmq_flush_interval_ms`
Format settings can also be specified alongside the RabbitMQ-related settings.
Example:
``` sql
CREATE TABLE queue (
key UInt64,
value UInt64,
date DateTime
) ENGINE = RabbitMQ SETTINGS rabbitmq_host_port = 'localhost:5672',
rabbitmq_exchange_name = 'exchange1',
rabbitmq_format = 'JSONEachRow',
rabbitmq_num_consumers = 5,
date_time_input_format = 'best_effort';
```
The RabbitMQ server configuration should be added using the ClickHouse config file.
@ -79,18 +94,6 @@ Additional configuration:
</rabbitmq>
```
Example:
``` sql
CREATE TABLE queue (
key UInt64,
value UInt64
) ENGINE = RabbitMQ SETTINGS rabbitmq_host_port = 'localhost:5672',
rabbitmq_exchange_name = 'exchange1',
rabbitmq_format = 'JSONEachRow',
rabbitmq_num_consumers = 5;
```
## Description {#description}
`SELECT` is not particularly useful for reading messages (except for debugging), because each message can be read only once. It is more practical to create real-time threads using [materialized views](../../../sql-reference/statements/create/view.md). To do this:
@ -114,6 +117,7 @@ Exchange type options:
- `consistent_hash` - Data is evenly distributed between all bound tables (where the exchange name is the same). Note that this exchange type must be enabled with RabbitMQ plugin: `rabbitmq-plugins enable rabbitmq_consistent_hash_exchange`.
Setting `rabbitmq_queue_base` may be used for the following cases:
- to let different tables share queues, so that multiple consumers can be registered for the same queues, which improves performance. If the `rabbitmq_num_consumers` and/or `rabbitmq_num_queues` settings are used, the queues match exactly only when these parameters are the same for all such tables.
- to be able to restore reading from certain durable queues when not all messages were successfully consumed. To resume consumption from one specific queue - set its name in `rabbitmq_queue_base` setting and do not specify `rabbitmq_num_consumers` and `rabbitmq_num_queues` (defaults to 1). To resume consumption from all queues, which were declared for a specific table - just specify the same settings: `rabbitmq_queue_base`, `rabbitmq_num_consumers`, `rabbitmq_num_queues`. By default, queue names will be unique to tables.
- to reuse queues as they are declared durable and not auto-deleted. (Can be deleted via any of RabbitMQ CLI tools.)

View File

@ -0,0 +1,13 @@
---
toc_folder_title: External User Authenticators and Directories
toc_priority: 48
toc_title: Introduction
---
# External User Authenticators and Directories {#external-authenticators}
ClickHouse supports authenticating and managing users using external services.
The following external authenticators and directories are supported:
- [LDAP](./ldap.md#external-authenticators-ldap) [Authenticator](./ldap.md#ldap-external-authenticator) and [Directory](./ldap.md#ldap-external-user-directory)

View File

@ -0,0 +1,156 @@
# LDAP {#external-authenticators-ldap}
LDAP server can be used to authenticate ClickHouse users. There are two different approaches for doing this:
- use LDAP as an external authenticator for existing users, which are defined in `users.xml` or in local access control paths
- use LDAP as an external user directory and allow locally undefined users to be authenticated if they exist on the LDAP server
For both of these approaches, an internally named LDAP server must be defined in the ClickHouse config so that other parts of config are able to refer to it.
## LDAP Server Definition {#ldap-server-definition}
To define an LDAP server, you must add an `ldap_servers` section to `config.xml`. For example:
```xml
<yandex>
<!-- ... -->
<ldap_servers>
<my_ldap_server>
<host>localhost</host>
<port>636</port>
<bind_dn>uid={user_name},ou=users,dc=example,dc=com</bind_dn>
<verification_cooldown>300</verification_cooldown>
<enable_tls>yes</enable_tls>
<tls_minimum_protocol_version>tls1.2</tls_minimum_protocol_version>
<tls_require_cert>demand</tls_require_cert>
<tls_cert_file>/path/to/tls_cert_file</tls_cert_file>
<tls_key_file>/path/to/tls_key_file</tls_key_file>
<tls_ca_cert_file>/path/to/tls_ca_cert_file</tls_ca_cert_file>
<tls_ca_cert_dir>/path/to/tls_ca_cert_dir</tls_ca_cert_dir>
<tls_cipher_suite>ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:AES256-GCM-SHA384</tls_cipher_suite>
</my_ldap_server>
</ldap_servers>
</yandex>
```
Note that you can define multiple LDAP servers inside the `ldap_servers` section using distinct names.
Parameters:
- `host` - LDAP server hostname or IP, this parameter is mandatory and cannot be empty.
- `port` - LDAP server port, default is `636` if `enable_tls` is set to `yes`, `389` otherwise.
- `bind_dn` - template used to construct the DN to bind to.
- The resulting DN will be constructed by replacing all `{user_name}` substrings of the
template with the actual user name during each authentication attempt.
- `verification_cooldown` - a period of time, in seconds, after a successful bind attempt,
during which the user will be assumed to be successfully authenticated for all consecutive
requests without contacting the LDAP server.
- Specify `0` (the default) to disable caching and force contacting the LDAP server for each authentication request.
- `enable_tls` - flag to trigger use of secure connection to the LDAP server.
- Specify `no` for plain text `ldap://` protocol (not recommended).
- Specify `yes` for LDAP over SSL/TLS `ldaps://` protocol (recommended, the default).
- Specify `starttls` for legacy StartTLS protocol (plain text `ldap://` protocol, upgraded to TLS).
- `tls_minimum_protocol_version` - the minimum protocol version of SSL/TLS.
- Accepted values are: `ssl2`, `ssl3`, `tls1.0`, `tls1.1`, `tls1.2` (the default).
- `tls_require_cert` - SSL/TLS peer certificate verification behavior.
- Accepted values are: `never`, `allow`, `try`, `demand` (the default).
- `tls_cert_file` - path to certificate file.
- `tls_key_file` - path to certificate key file.
- `tls_ca_cert_file` - path to CA certificate file.
- `tls_ca_cert_dir` - path to the directory containing CA certificates.
- `tls_cipher_suite` - allowed cipher suite (in OpenSSL notation).
## LDAP External Authenticator {#ldap-external-authenticator}
A remote LDAP server can be used as a method for verifying passwords for locally defined users (users defined in `users.xml` or in local access control paths). In order to achieve this, specify previously defined LDAP server name instead of `password` or similar sections in the user definition.
At each login attempt, ClickHouse will try to "bind" to the specified DN defined by the `bind_dn` parameter in the [LDAP server definition](#ldap-server-definition) using the provided credentials, and if successful, the user will be considered authenticated. This is often called a "simple bind" method.
For example,
```xml
<yandex>
<!-- ... -->
<users>
<!-- ... -->
<my_user>
<!-- ... -->
<ldap>
<server>my_ldap_server</server>
</ldap>
</my_user>
</users>
</yandex>
```
Note that user `my_user` refers to `my_ldap_server`. This LDAP server must be configured in the main `config.xml` file as described previously.
When SQL-driven [Access Control and Account Management](../access-rights.md#access-control) is enabled in ClickHouse, users that are authenticated by LDAP servers can also be created using the [CREATE USER](../../sql-reference/statements/create/user.md#create-user-statement) statement.
```sql
CREATE USER my_user IDENTIFIED WITH ldap_server BY 'my_ldap_server'
```
## LDAP External User Directory {#ldap-external-user-directory}
In addition to the locally defined users, a remote LDAP server can be used as a source of user definitions. In order to achieve this, specify the name of a previously defined LDAP server (see [LDAP Server Definition](#ldap-server-definition)) in the `ldap` section inside the `user_directories` section of the `config.xml` file.
At each login attempt, ClickHouse will try to find the user definition locally and authenticate it as usual, but if the user is not defined, ClickHouse will assume it exists in the external LDAP directory, and will try to "bind" to the specified DN at the LDAP server using the provided credentials. If successful, the user will be considered existing and authenticated. The user will be assigned roles from the list specified in the `roles` section. Additionally, LDAP "search" can be performed and results can be transformed and treated as role names and then be assigned to the user if the `role_mapping` section is also configured. All this implies that the SQL-driven [Access Control and Account Management](../access-rights.md#access-control) is enabled and roles are created using the [CREATE ROLE](../../sql-reference/statements/create/role.md#create-role-statement) statement.
Example (goes into `config.xml`):
```xml
<yandex>
<!-- ... -->
<user_directories>
<!-- ... -->
<ldap>
<server>my_ldap_server</server>
<roles>
<my_local_role1 />
<my_local_role2 />
</roles>
<role_mapping>
<base_dn>ou=groups,dc=example,dc=com</base_dn>
<scope>subtree</scope>
<search_filter>(&amp;(objectClass=groupOfNames)(member={bind_dn}))</search_filter>
<attribute>cn</attribute>
<prefix>clickhouse_</prefix>
</role_mapping>
</ldap>
</user_directories>
</yandex>
```
Note that `my_ldap_server` referred to in the `ldap` section inside the `user_directories` section must be a previously
defined LDAP server that is configured in the `config.xml` (see [LDAP Server Definition](#ldap-server-definition)).
Parameters:
- `server` - one of LDAP server names defined in the `ldap_servers` config section above.
This parameter is mandatory and cannot be empty.
- `roles` - section with a list of locally defined roles that will be assigned to each user retrieved from the LDAP server.
- If no roles are specified here or assigned during role mapping (below), user will not be able
to perform any actions after authentication.
- `role_mapping` - section with LDAP search parameters and mapping rules.
- When a user authenticates, while still bound to LDAP, an LDAP search is performed using `search_filter`
and the name of the logged in user. For each entry found during that search, the value of the specified
attribute is extracted. For each attribute value that has the specified prefix, the prefix is removed,
and the rest of the value becomes the name of a local role defined in ClickHouse,
which is expected to be created beforehand by the [CREATE ROLE](../../sql-reference/statements/create/role.md#create-role-statement) statement.
- There can be multiple `role_mapping` sections defined inside the same `ldap` section. All of them will be applied.
- `base_dn` - template used to construct the base DN for the LDAP search.
- The resulting DN will be constructed by replacing all `{user_name}` and `{bind_dn}`
substrings of the template with the actual user name and bind DN during each LDAP search.
- `scope` - scope of the LDAP search.
- Accepted values are: `base`, `one_level`, `children`, `subtree` (the default).
- `search_filter` - template used to construct the search filter for the LDAP search.
- The resulting filter will be constructed by replacing all `{user_name}`, `{bind_dn}`, and `{base_dn}`
substrings of the template with the actual user name, bind DN, and base DN during each LDAP search.
- Note that special characters must be escaped properly in XML.
- `attribute` - attribute name whose values will be returned by the LDAP search.
- `prefix` - prefix, that will be expected to be in front of each string in the original
list of strings returned by the LDAP search. Prefix will be removed from the original
strings and resulting strings will be treated as local role names. Empty, by default.

View File

@ -32,6 +32,7 @@ The null hypothesis is that two populations are stochastically equal. Also one-s
**Returned values**
[Tuple](../../../sql-reference/data-types/tuple.md) with two elements:
- calculated U-statistic. [Float64](../../../sql-reference/data-types/float.md).
- calculated p-value. [Float64](../../../sql-reference/data-types/float.md).

View File

@ -24,6 +24,7 @@ The null hypothesis is that means of populations are equal. Normal distribution
**Returned values**
[Tuple](../../../sql-reference/data-types/tuple.md) with two elements:
- calculated t-statistic. [Float64](../../../sql-reference/data-types/float.md).
- calculated p-value. [Float64](../../../sql-reference/data-types/float.md).

View File

@ -24,6 +24,7 @@ The null hypothesis is that means of populations are equal. Normal distribution
**Returned values**
[Tuple](../../../sql-reference/data-types/tuple.md) with two elements:
- calculated t-statistic. [Float64](../../../sql-reference/data-types/float.md).
- calculated p-value. [Float64](../../../sql-reference/data-types/float.md).

View File

@ -600,4 +600,46 @@ Hello, &quot;world&quot;!
&apos;foo&apos;
```
## decodeXMLComponent {#decode-xml-component}
Replaces predefined XML entities with the corresponding characters. The predefined entities are `&quot;`, `&amp;`, `&apos;`, `&gt;` and `&lt;`.
This function also replaces numeric character references with Unicode characters. Both decimal (like `&#10003;`) and hexadecimal (`&#x2713;`) forms are supported.
**Syntax**
``` sql
decodeXMLComponent(x)
```
**Parameters**
- `x` — A sequence of characters. [String](../../sql-reference/data-types/string.md).
**Returned value**
- The sequence of characters after replacement.
Type: [String](../../sql-reference/data-types/string.md).
**Example**
Query:
``` sql
SELECT decodeXMLComponent('&apos;foo&apos;');
SELECT decodeXMLComponent('&lt; &#x3A3; &gt;');
```
Result:
``` text
'foo'
< Σ >
```
**See Also**
- [List of XML and HTML character entity references](https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references)
[Original article](https://clickhouse.tech/docs/en/query_language/functions/string_functions/) <!--hide-->

View File

@ -12,7 +12,7 @@ Syntax:
``` sql
CREATE USER [IF NOT EXISTS | OR REPLACE] name1 [ON CLUSTER cluster_name1]
[, name2 [ON CLUSTER cluster_name2] ...]
[IDENTIFIED [WITH {NO_PASSWORD|PLAINTEXT_PASSWORD|SHA256_PASSWORD|SHA256_HASH|DOUBLE_SHA1_PASSWORD|DOUBLE_SHA1_HASH}] BY {'password'|'hash'}]
[IDENTIFIED [WITH {NO_PASSWORD|PLAINTEXT_PASSWORD|SHA256_PASSWORD|SHA256_HASH|DOUBLE_SHA1_PASSWORD|DOUBLE_SHA1_HASH|LDAP_SERVER}] BY {'password'|'hash'}]
[HOST {LOCAL | NAME 'name' | REGEXP 'name_regexp' | IP 'address' | LIKE 'pattern'} [,...] | ANY | NONE]
[DEFAULT ROLE role [,...]]
[SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [READONLY|WRITABLE] | PROFILE 'profile_name'] [,...]
@ -30,6 +30,7 @@ There are multiple ways of user identification:
- `IDENTIFIED WITH sha256_hash BY 'hash'`
- `IDENTIFIED WITH double_sha1_password BY 'qwerty'`
- `IDENTIFIED WITH double_sha1_hash BY 'hash'`
- `IDENTIFIED WITH ldap_server BY 'server'`
## User Host {#user-host}

View File

@ -52,6 +52,21 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
- `rabbitmq_max_block_size`
- `rabbitmq_flush_interval_ms`
Настройки форматов данных также могут быть добавлены в списке RabbitMQ настроек.
Пример:
``` sql
CREATE TABLE queue (
key UInt64,
value UInt64,
date DateTime
) ENGINE = RabbitMQ SETTINGS rabbitmq_host_port = 'localhost:5672',
rabbitmq_exchange_name = 'exchange1',
rabbitmq_format = 'JSONEachRow',
rabbitmq_num_consumers = 5,
date_time_input_format = 'best_effort';
```
Конфигурация сервера RabbitMQ добавляется с помощью конфигурационного файла ClickHouse.
@ -72,18 +87,6 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
</rabbitmq>
```
Example:
``` sql
CREATE TABLE queue (
key UInt64,
value UInt64
) ENGINE = RabbitMQ SETTINGS rabbitmq_host_port = 'localhost:5672',
rabbitmq_exchange_name = 'exchange1',
rabbitmq_format = 'JSONEachRow',
rabbitmq_num_consumers = 5;
```
## Описание {#description}
Запрос `SELECT` не очень полезен для чтения сообщений (за исключением отладки), поскольку каждое сообщение может быть прочитано только один раз. Практичнее создавать потоки реального времени с помощью [материализованных представлений](../../../sql-reference/statements/create/view.md). Для этого:
@ -107,6 +110,7 @@ Example:
- `consistent_hash` - данные равномерно распределяются между всеми связанными таблицами, где имя точки обмена совпадает. Обратите внимание, что этот тип обмена должен быть включен с помощью плагина RabbitMQ: `rabbitmq-plugins enable rabbitmq_consistent_hash_exchange`.
Настройка `rabbitmq_queue_base` может быть использована в следующих случаях:
1. чтобы восстановить чтение из ранее созданных очередей, если оно прекратилось по какой-либо причине, но очереди остались непустыми. Для восстановления чтения из одной конкретной очереди, нужно написать ее имя в `rabbitmq_queue_base` настройку и не указывать настройки `rabbitmq_num_consumers` и `rabbitmq_num_queues`. Чтобы восстановить чтение из всех очередей, которые были созданы для конкретной таблицы, необходимо совпадение следующих настроек: `rabbitmq_queue_base`, `rabbitmq_num_consumers`, `rabbitmq_num_queues`. По умолчанию, если настройка `rabbitmq_queue_base` не указана, будут использованы уникальные для каждой таблицы имена очередей.
2. чтобы объявить одни и те же очереди для разных таблиц, что позволяет создавать несколько параллельных подписчиков на каждую из очередей. То есть обеспечивается лучшая производительность. В данном случае, для таких таблиц также необходимо совпадение настроек: `rabbitmq_num_consumers`, `rabbitmq_num_queues`.
3. чтобы повторно использовать созданные c `durable` настройкой очереди, так как они не удаляются автоматически (но могут быть удалены с помощью любого RabbitMQ CLI).

View File

@ -31,6 +31,7 @@ mannWhitneyUTest[(alternative[, continuity_correction])](sample_data, sample_ind
**Возвращаемые значения**
[Кортеж](../../../sql-reference/data-types/tuple.md) с двумя элементами:
- вычисленное значение критерия Манна — Уитни. [Float64](../../../sql-reference/data-types/float.md).
- вычисленное p-значение. [Float64](../../../sql-reference/data-types/float.md).

View File

@ -24,6 +24,7 @@ studentTTest(sample_data, sample_index)
**Возвращаемые значения**
[Кортеж](../../../sql-reference/data-types/tuple.md) с двумя элементами:
- вычисленное значение критерия Стьюдента. [Float64](../../../sql-reference/data-types/float.md).
- вычисленное p-значение. [Float64](../../../sql-reference/data-types/float.md).

View File

@ -24,6 +24,7 @@ welchTTest(sample_data, sample_index)
**Возвращаемые значения**
[Кортеж](../../../sql-reference/data-types/tuple.md) с двумя элементами:
- вычисленное значение критерия Уэлча. [Float64](../../../sql-reference/data-types/float.md).
- вычисленное p-значение. [Float64](../../../sql-reference/data-types/float.md).

View File

@ -597,4 +597,46 @@ Hello, &quot;world&quot;!
&apos;foo&apos;
```
## decodeXMLComponent {#decode-xml-component}
Заменяет символами предопределенные мнемоники XML: `&quot;` `&amp;` `&apos;` `&gt;` `&lt;`
Также эта функция заменяет числовые ссылки соответствующими символами юникод. Поддерживаются десятичная (например, `&#10003;`) и шестнадцатеричная (`&#x2713;`) формы.
**Синтаксис**
``` sql
decodeXMLComponent(x)
```
**Параметры**
- `x` — последовательность символов. [String](../../sql-reference/data-types/string.md).
**Возвращаемое значение**
- Строка с произведенными заменами.
Тип: [String](../../sql-reference/data-types/string.md).
**Пример**
Запрос:
``` sql
SELECT decodeXMLComponent('&apos;foo&apos;');
SELECT decodeXMLComponent('&lt; &#x3A3; &gt;');
```
Результат:
``` text
'foo'
< Σ >
```
**Смотрите также**
- [Мнемоники в HTML](https://ru.wikipedia.org/wiki/%D0%9C%D0%BD%D0%B5%D0%BC%D0%BE%D0%BD%D0%B8%D0%BA%D0%B8_%D0%B2_HTML)
[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/string_functions/) <!--hide-->

View File

@ -160,7 +160,15 @@ void ODBCColumnsInfoHandler::handleRequest(HTTPServerRequest & request, HTTPServ
}
WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout);
writeStringBinary(columns.toString(), out);
try
{
writeStringBinary(columns.toString(), out);
out.finalize();
}
catch (...)
{
out.finalize();
}
}
catch (...)
{

View File

@ -50,7 +50,15 @@ void IdentifierQuoteHandler::handleRequest(HTTPServerRequest & request, HTTPServ
auto identifier = getIdentifierQuote(hdbc);
WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout);
writeStringBinary(identifier, out);
try
{
writeStringBinary(identifier, out);
out.finalize();
}
catch (...)
{
out.finalize();
}
}
catch (...)
{

View File

@ -187,9 +187,27 @@ void ODBCHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse
auto message = getCurrentExceptionMessage(true);
response.setStatusAndReason(
Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); // can't call process_error, because of too soon response sending
writeStringBinary(message, out);
tryLogCurrentException(log);
try
{
writeStringBinary(message, out);
out.finalize();
}
catch (...)
{
tryLogCurrentException(log);
}
tryLogCurrentException(log);
}
try
{
out.finalize();
}
catch (...)
{
tryLogCurrentException(log);
}
}

View File

@ -61,7 +61,15 @@ void SchemaAllowedHandler::handleRequest(HTTPServerRequest & request, HTTPServer
bool result = isSchemaAllowed(hdbc);
WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout);
writeBoolText(result, out);
try
{
writeBoolText(result, out);
out.finalize();
}
catch (...)
{
out.finalize();
}
}
catch (...)
{

View File

@ -103,6 +103,7 @@ namespace CurrentMetrics
extern const Metric Revision;
extern const Metric VersionInteger;
extern const Metric MemoryTracking;
extern const Metric MaxDDLEntryID;
}
@ -1012,7 +1013,8 @@ int Server::main(const std::vector<std::string> & /*args*/)
int pool_size = config().getInt("distributed_ddl.pool_size", 1);
if (pool_size < 1)
throw Exception("distributed_ddl.pool_size should be greater then 0", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
global_context->setDDLWorker(std::make_unique<DDLWorker>(pool_size, ddl_zookeeper_path, *global_context, &config(), "distributed_ddl"));
global_context->setDDLWorker(std::make_unique<DDLWorker>(pool_size, ddl_zookeeper_path, *global_context, &config(),
"distributed_ddl", "DDLWorker", &CurrentMetrics::MaxDDLEntryID));
}
std::unique_ptr<DNSCacheUpdater> dns_cache_updater;

View File

@ -284,6 +284,10 @@
In bytes. Cache is single for server. Memory is allocated only on demand.
Cache is used when 'use_uncompressed_cache' user setting turned on (off by default).
Uncompressed cache is advantageous only for very short queries and in rare cases.
Note: uncompressed cache can be pointless for lz4, because memory bandwidth
is slower than multi-core decompression on some server configurations.
Enabling it can sometimes paradoxically make queries slower.
-->
<uncompressed_cache_size>8589934592</uncompressed_cache_size>

View File

@ -7,9 +7,6 @@
<!-- Maximum memory usage for processing single query, in bytes. -->
<max_memory_usage>10000000000</max_memory_usage>
<!-- Use cache of uncompressed blocks of data. Meaningful only for processing many of very short queries. -->
<use_uncompressed_cache>0</use_uncompressed_cache>
<!-- How to choose between replicas during distributed query processing.
random - choose random replica from set of replicas with minimum number of errors
nearest_hostname - from set of replicas with minimum number of errors, choose replica

View File

@ -236,6 +236,11 @@ public:
return true;
}
/// Whether this function is a "state" function is decided entirely by the wrapped nested function.
bool isState() const override { return nested_func->isState(); }
AggregateFunctionPtr getNestedFunction() const override { return nested_func; }
};

View File

@ -52,7 +52,7 @@ struct MovingSumData : public MovingData<T>
{
static constexpr auto name = "groupArrayMovingSum";
T get(size_t idx, UInt64 window_size) const
T NO_SANITIZE_UNDEFINED get(size_t idx, UInt64 window_size) const
{
if (idx < window_size)
return this->value[idx];
@ -66,7 +66,7 @@ struct MovingAvgData : public MovingData<T>
{
static constexpr auto name = "groupArrayMovingAvg";
T get(size_t idx, UInt64 window_size) const
T NO_SANITIZE_UNDEFINED get(size_t idx, UInt64 window_size) const
{
if (idx < window_size)
return this->value[idx] / window_size;
@ -114,7 +114,7 @@ public:
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeResult>());
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
void NO_SANITIZE_UNDEFINED add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
auto value = static_cast<const ColumnSource &>(*columns[0]).getData()[row_num];
this->data(place).add(static_cast<ResultT>(value), arena);

View File

@ -115,7 +115,14 @@ public:
"Values for {} are expected to be Numeric, Float or Decimal, passed type {}",
getName(), value_type->getName()};
result_type = value_type_without_nullable->promoteNumericType();
WhichDataType value_type_to_check(value_type);
/// Do not promote decimal because of implementation issues of this function design
/// If we decide to make this function more efficient we should promote decimal type during summ
if (value_type_to_check.isDecimal())
result_type = value_type_without_nullable;
else
result_type = value_type_without_nullable->promoteNumericType();
}
types.emplace_back(std::make_shared<DataTypeArray>(result_type));

View File

@ -7,6 +7,7 @@
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnsCommon.h>
#include <Columns/ColumnCompressed.h>
#include <common/unaligned.h>
#include <common/sort.h>
@ -369,8 +370,12 @@ void ColumnArray::compareColumn(const IColumn & rhs, size_t rhs_row_num,
compare_results, direction, nan_direction_hint);
}
namespace
{
template <bool positive>
struct ColumnArray::Cmp
struct Cmp
{
const ColumnArray & parent;
int nan_direction_hint;
@ -390,6 +395,9 @@ struct ColumnArray::Cmp
}
};
}
void ColumnArray::reserve(size_t n)
{
getOffsets().reserve(n);
@ -912,6 +920,21 @@ void ColumnArray::updatePermutationWithCollation(const Collator & collator, bool
updatePermutationImpl(limit, res, equal_range, Cmp<true>(*this, nan_direction_hint, &collator));
}
/// Compresses the nested data column and the offsets column independently.
/// The result reports the original row count and the summed compressed byte size,
/// and rebuilds a ColumnArray from both decompressed parts when decompress() is called.
ColumnPtr ColumnArray::compress() const
{
    ColumnPtr compressed_nested = data->compress();
    ColumnPtr compressed_offsets = offsets->compress();

    /// Must be computed before the pointers are moved into the closure below.
    const size_t total_bytes = compressed_nested->byteSize() + compressed_offsets->byteSize();

    auto restore = [nested = std::move(compressed_nested), offs = std::move(compressed_offsets)]
    {
        return ColumnArray::create(nested->decompress(), offs->decompress());
    };

    return ColumnCompressed::create(size(), total_bytes, std::move(restore));
}
ColumnPtr ColumnArray::replicate(const Offsets & replicate_offsets) const
{
if (replicate_offsets.empty())

View File

@ -123,6 +123,8 @@ public:
void gather(ColumnGathererStream & gatherer_stream) override;
ColumnPtr compress() const override;
void forEachSubcolumn(ColumnCallback callback) override
{
callback(offsets);
@ -183,9 +185,6 @@ private:
template <typename Comparator>
void updatePermutationImpl(size_t limit, Permutation & res, EqualRanges & equal_range, Comparator cmp) const;
template <bool positive>
struct Cmp;
};

View File

@ -0,0 +1,61 @@
#include <Columns/ColumnCompressed.h>
#pragma GCC diagnostic ignored "-Wold-style-cast"
#include <lz4.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_COMPRESS;
extern const int CANNOT_DECOMPRESS;
}
/** Compress a raw memory region with LZ4.
  *
  * @param data             Start of the region to compress.
  * @param data_size        Size of the region in bytes.
  * @param always_compress  If false, the result is dropped when compression saves less than half of the size.
  * @return Exactly-sized buffer with the compressed data,
  *         or an empty pointer if compression was judged inefficient (only when !always_compress).
  * @throws Exception(CANNOT_COMPRESS) if the input is too large for LZ4 or LZ4 reports a failure.
  */
std::shared_ptr<Memory<>> ColumnCompressed::compressBuffer(const void * data, size_t data_size, bool always_compress)
{
    /// LZ4 takes sizes as `int`. Reject oversized input explicitly: LZ4_COMPRESSBOUND evaluates to 0
    /// for inputs above LZ4_MAX_INPUT_SIZE, which would slip past the bound check below
    /// and let `data_size` be silently narrowed.
    if (data_size > static_cast<size_t>(LZ4_MAX_INPUT_SIZE))
        throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column of size {}", formatReadableSizeWithBinarySuffix(data_size));

    const size_t max_dest_size = LZ4_COMPRESSBOUND(data_size);

    if (max_dest_size > static_cast<size_t>(std::numeric_limits<int>::max()))
        throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column of size {}", formatReadableSizeWithBinarySuffix(data_size));

    /// Worst-case sized scratch buffer; the real compressed size is known only after compression.
    Memory<> compressed(max_dest_size);

    const int compressed_size = LZ4_compress_default(
        reinterpret_cast<const char *>(data),
        compressed.data(),
        static_cast<int>(data_size),
        static_cast<int>(max_dest_size));

    if (compressed_size <= 0)
        throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column");

    /// If compression is inefficient.
    if (!always_compress && static_cast<size_t>(compressed_size) * 2 > data_size)
        return {};

    /// Shrink to fit.
    auto shrank = std::make_shared<Memory<>>(compressed_size);
    memcpy(shrank->data(), compressed.data(), compressed_size);

    return shrank;
}
/// Decompress an LZ4 buffer produced by compressBuffer into caller-provided memory.
/// decompressed_size is passed to LZ4 as the capacity of the destination.
/// Throws CANNOT_DECOMPRESS if LZ4 reports a non-positive result.
void ColumnCompressed::decompressBuffer(
    const void * compressed_data, void * decompressed_data, size_t compressed_size, size_t decompressed_size)
{
    const char * src = reinterpret_cast<const char *>(compressed_data);
    char * dst = reinterpret_cast<char *>(decompressed_data);

    auto written = LZ4_decompress_safe(src, dst, compressed_size, decompressed_size);
    if (written > 0)
        return;

    throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress column");
}
}

View File

@ -0,0 +1,121 @@
#pragma once
#include <optional>
#include <Core/Field.h>
#include <Columns/IColumn.h>
#include <IO/BufferWithOwnMemory.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
/** Wrapper for compressed column data.
* The only supported operations are:
* - decompress (reconstruct the source column)
* - get size in rows or bytes.
*
* It is needed to implement in-memory compression
* - to keep compressed data in Block or pass around.
*
* It's often beneficial to store compressed data in-memory and decompress on the fly
* because it allows to lower memory throughput. More specifically, if:
*
* decompression speed * num CPU cores >= memory read throughput
*
* Also in-memory compression allows to keep more data in RAM.
*/
class ColumnCompressed : public COWHelper<IColumn, ColumnCompressed>
{
public:
    /// Deferred decompression: called by decompress() to reconstruct the source column.
    using Lazy = std::function<ColumnPtr()>;

    /// rows_  - value reported by size();
    /// bytes_ - value reported by byteSize() and allocatedBytes();
    /// lazy_  - callback that reconstructs the source column.
    ColumnCompressed(size_t rows_, size_t bytes_, Lazy lazy_)
        : rows(rows_), bytes(bytes_), lazy(std::move(lazy_)) /// Move: avoids copying the callback and everything it captured.
    {
    }

    const char * getFamilyName() const override { return "Compressed"; }

    size_t size() const override { return rows; }
    size_t byteSize() const override { return bytes; }
    size_t allocatedBytes() const override { return bytes; }

    /// Reconstruct the source column by invoking the stored callback.
    ColumnPtr decompress() const override
    {
        return lazy();
    }

    /** Wrap uncompressed column without compression.
      * Method can be used when compression is not worth doing.
      * But returning ColumnCompressed is still needed to keep uniform block structure.
      */
    static ColumnPtr wrap(ColumnPtr column)
    {
        return ColumnCompressed::create(
            column->size(),
            column->allocatedBytes(),
            [column = std::move(column)]{ return column; });
    }

    /// Helper methods for compression.

    /// If data is not worth to be compressed and not 'always_compress' - returns nullptr.
    /// Note: shared_ptr is to allow to be captured by std::function.
    static std::shared_ptr<Memory<>> compressBuffer(const void * data, size_t data_size, bool always_compress);

    static void decompressBuffer(
        const void * compressed_data, void * decompressed_data, size_t compressed_size, size_t decompressed_size);

    /// All other methods throw exception.

    TypeIndex getDataType() const override { throwMustBeDecompressed(); }
    Field operator[](size_t) const override { throwMustBeDecompressed(); }
    void get(size_t, Field &) const override { throwMustBeDecompressed(); }
    StringRef getDataAt(size_t) const override { throwMustBeDecompressed(); }
    void insert(const Field &) override { throwMustBeDecompressed(); }
    void insertRangeFrom(const IColumn &, size_t, size_t) override { throwMustBeDecompressed(); }
    void insertData(const char *, size_t) override { throwMustBeDecompressed(); }
    void insertDefault() override { throwMustBeDecompressed(); }
    void popBack(size_t) override { throwMustBeDecompressed(); }
    StringRef serializeValueIntoArena(size_t, Arena &, char const *&) const override { throwMustBeDecompressed(); }
    const char * deserializeAndInsertFromArena(const char *) override { throwMustBeDecompressed(); }
    void updateHashWithValue(size_t, SipHash &) const override { throwMustBeDecompressed(); }
    void updateWeakHash32(WeakHash32 &) const override { throwMustBeDecompressed(); }
    void updateHashFast(SipHash &) const override { throwMustBeDecompressed(); }
    ColumnPtr filter(const Filter &, ssize_t) const override { throwMustBeDecompressed(); }
    ColumnPtr permute(const Permutation &, size_t) const override { throwMustBeDecompressed(); }
    ColumnPtr index(const IColumn &, size_t) const override { throwMustBeDecompressed(); }
    int compareAt(size_t, size_t, const IColumn &, int) const override { throwMustBeDecompressed(); }
    void compareColumn(const IColumn &, size_t, PaddedPODArray<UInt64> *, PaddedPODArray<Int8> &, int, int) const override
    {
        throwMustBeDecompressed();
    }
    void getPermutation(bool, size_t, int, Permutation &) const override { throwMustBeDecompressed(); }
    void updatePermutation(bool, size_t, int, Permutation &, EqualRanges &) const override { throwMustBeDecompressed(); }
    ColumnPtr replicate(const Offsets &) const override { throwMustBeDecompressed(); }
    MutableColumns scatter(ColumnIndex, const Selector &) const override { throwMustBeDecompressed(); }
    void gather(ColumnGathererStream &) override { throwMustBeDecompressed(); }
    void getExtremes(Field &, Field &) const override { throwMustBeDecompressed(); }
    size_t byteSizeAt(size_t) const override { throwMustBeDecompressed(); }

protected:
    size_t rows;
    size_t bytes;

    Lazy lazy;

private:
    /// Common failure path for every operation that needs the actual column data.
    [[noreturn]] void throwMustBeDecompressed() const
    {
        throw Exception("ColumnCompressed must be decompressed before use", ErrorCodes::LOGICAL_ERROR);
    }
};
}

View File

@ -14,6 +14,7 @@
#include <Columns/ColumnsCommon.h>
#include <Columns/ColumnDecimal.h>
#include <Columns/ColumnCompressed.h>
#include <DataStreams/ColumnGathererStream.h>
@ -346,6 +347,30 @@ void ColumnDecimal<T>::gather(ColumnGathererStream & gatherer)
gatherer.gather(*this);
}
/// Compress the raw decimal values with LZ4.
/// Columns smaller than 4096 bytes, and columns for which compressBuffer returns nullptr,
/// are wrapped without compression. Otherwise the returned ColumnCompressed
/// recreates a ColumnDecimal with the same size and scale on decompress().
template <typename T>
ColumnPtr ColumnDecimal<T>::compress() const
{
    const size_t data_size = data.size();
    const size_t source_size = data_size * sizeof(T);

    /// Don't compress small blocks.
    if (source_size < 4096) /// A wild guess.
        return ColumnCompressed::wrap(this->getPtr());

    auto compressed = ColumnCompressed::compressBuffer(data.data(), source_size, false);

    if (!compressed)
        return ColumnCompressed::wrap(this->getPtr());

    /// Read the size before `compressed` is moved into the closure: the order of evaluation
    /// of function arguments is unspecified, so reading it inside the argument list
    /// could dereference an already moved-from pointer.
    const size_t compressed_size = compressed->size();

    return ColumnCompressed::create(data_size, compressed_size,
        [compressed = std::move(compressed), column_size = data_size, scale = this->scale]
        {
            auto res = ColumnDecimal<T>::create(column_size, scale);
            ColumnCompressed::decompressBuffer(
                compressed->data(), res->getData().data(), compressed->size(), column_size * sizeof(T));
            return res;
        });
}
template <typename T>
void ColumnDecimal<T>::getExtremes(Field & min, Field & max) const
{

View File

@ -172,6 +172,8 @@ public:
return false;
}
ColumnPtr compress() const override;
void insertValue(const T value) { data.push_back(value); }
Container & getData() { return data; }

View File

@ -1,6 +1,7 @@
#include <Columns/ColumnFixedString.h>
#include <Columns/ColumnsCommon.h>
#include <Columns/ColumnCompressed.h>
#include <DataStreams/ColumnGathererStream.h>
#include <IO/WriteHelpers.h>
#include <Common/Arena.h>
@ -446,6 +447,34 @@ void ColumnFixedString::getExtremes(Field & min, Field & max) const
get(max_idx, max);
}
/// Compress the character buffer with LZ4.
/// Columns smaller than 4096 bytes, and columns for which compressBuffer returns nullptr,
/// are wrapped without compression. Otherwise the returned ColumnCompressed
/// recreates a ColumnFixedString with the same `n` and row count on decompress().
ColumnPtr ColumnFixedString::compress() const
{
    size_t source_size = chars.size();

    /// Don't compress small blocks.
    if (source_size < 4096) /// A wild guess.
        return ColumnCompressed::wrap(this->getPtr());

    auto compressed = ColumnCompressed::compressBuffer(chars.data(), source_size, false);

    if (!compressed)
        return ColumnCompressed::wrap(this->getPtr());

    size_t column_size = size();

    /// Read the size before `compressed` is moved into the closure: the order of evaluation
    /// of function arguments is unspecified, so reading it inside the argument list
    /// could dereference an already moved-from pointer.
    const size_t compressed_size = compressed->size();

    return ColumnCompressed::create(column_size, compressed_size,
        [compressed = std::move(compressed), column_size, n = n]
        {
            size_t chars_size = n * column_size;
            auto res = ColumnFixedString::create(n);
            res->getChars().resize(chars_size);
            ColumnCompressed::decompressBuffer(
                compressed->data(), res->getChars().data(), compressed->size(), chars_size);
            return res;
        });
}
void ColumnFixedString::alignStringLength(ColumnFixedString::Chars & data, size_t n, size_t old_size)
{
size_t length = data.size() - old_size;

View File

@ -156,6 +156,8 @@ public:
void gather(ColumnGathererStream & gatherer_stream) override;
ColumnPtr compress() const override;
void reserve(size_t size) override
{
chars.reserve(n * size);

View File

@ -1,4 +1,5 @@
#include <Columns/ColumnMap.h>
#include <Columns/ColumnCompressed.h>
#include <Columns/IColumnImpl.h>
#include <DataStreams/ColumnGathererStream.h>
#include <IO/WriteBufferFromString.h>
@ -243,4 +244,13 @@ bool ColumnMap::structureEquals(const IColumn & rhs) const
return false;
}
/// Compress the nested column; on decompress() a ColumnMap is rebuilt
/// around the decompressed nested column.
ColumnPtr ColumnMap::compress() const
{
    auto compressed = nested->compress();

    /// Read the size before `compressed` is moved into the closure: the order of evaluation
    /// of function arguments is unspecified, so reading it inside the argument list
    /// could dereference an already moved-from pointer.
    const size_t byte_size = compressed->byteSize();

    return ColumnCompressed::create(size(), byte_size, [compressed = std::move(compressed)]
    {
        return ColumnMap::create(compressed->decompress());
    });
}
}

View File

@ -91,6 +91,8 @@ public:
const ColumnTuple & getNestedData() const { return assert_cast<const ColumnTuple &>(getNestedColumn().getData()); }
ColumnTuple & getNestedData() { return assert_cast<ColumnTuple &>(getNestedColumn().getData()); }
ColumnPtr compress() const override;
};
}

View File

@ -7,6 +7,7 @@
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnCompressed.h>
#include <DataStreams/ColumnGathererStream.h>
@ -511,6 +512,20 @@ void ColumnNullable::protect()
getNullMapColumn().protect();
}
/// Compress the nested column and the null map separately.
/// The returned ColumnCompressed holds the two compressed parts, reports the sum
/// of their sizes and rebuilds a ColumnNullable from them on decompress().
ColumnPtr ColumnNullable::compress() const
{
    ColumnPtr nested_compressed = nested_column->compress();
    ColumnPtr null_map_compressed = null_map->compress();

    /// Size of what is actually stored, i.e. of the compressed parts
    /// (taken before they are moved into the closure).
    const size_t byte_size = nested_compressed->byteSize() + null_map_compressed->byteSize();

    /// Capture the compressed parts, not the member columns: capturing the members
    /// would keep the uncompressed data alive and make the compression above pointless.
    return ColumnCompressed::create(size(), byte_size,
        [nested_compressed = std::move(nested_compressed), null_map_compressed = std::move(null_map_compressed)]
        {
            return ColumnNullable::create(nested_compressed->decompress(), null_map_compressed->decompress());
        });
}
namespace
{

View File

@ -117,6 +117,8 @@ public:
void gather(ColumnGathererStream & gatherer_stream) override;
ColumnPtr compress() const override;
void forEachSubcolumn(ColumnCallback callback) override
{
callback(nested_column);

View File

@ -2,6 +2,7 @@
#include <Columns/Collator.h>
#include <Columns/ColumnsCommon.h>
#include <Columns/ColumnCompressed.h>
#include <DataStreams/ColumnGathererStream.h>
#include <Common/Arena.h>
#include <Common/HashTable/Hash.h>
@ -525,6 +526,47 @@ void ColumnString::getExtremes(Field & min, Field & max) const
}
/// Compress chars and offsets with LZ4 as two independent buffers.
/// If chars are smaller than 4096 bytes or compressBuffer returns nullptr for them,
/// the column is wrapped without compression. Offsets are compressed with
/// always_compress = true once chars were worth compressing.
ColumnPtr ColumnString::compress() const
{
    size_t source_chars_size = chars.size();
    size_t source_offsets_size = offsets.size() * sizeof(Offset);

    /// Don't compress small blocks.
    if (source_chars_size < 4096) /// A wild guess.
        return ColumnCompressed::wrap(this->getPtr());

    auto chars_compressed = ColumnCompressed::compressBuffer(chars.data(), source_chars_size, false);

    /// Return original column if not compressible.
    if (!chars_compressed)
        return ColumnCompressed::wrap(this->getPtr());

    auto offsets_compressed = ColumnCompressed::compressBuffer(offsets.data(), source_offsets_size, true);

    /// Read the sizes before the buffers are moved into the closure: the order of evaluation
    /// of function arguments is unspecified, so reading them inside the argument list
    /// could dereference already moved-from pointers.
    const size_t compressed_size = chars_compressed->size() + offsets_compressed->size();

    return ColumnCompressed::create(offsets.size(), compressed_size,
        [
            chars_compressed = std::move(chars_compressed),
            offsets_compressed = std::move(offsets_compressed),
            source_chars_size,
            source_offsets_elements = offsets.size()
        ]
        {
            auto res = ColumnString::create();

            res->getChars().resize(source_chars_size);
            res->getOffsets().resize(source_offsets_elements);

            ColumnCompressed::decompressBuffer(
                chars_compressed->data(), res->getChars().data(), chars_compressed->size(), source_chars_size);

            ColumnCompressed::decompressBuffer(
                offsets_compressed->data(), res->getOffsets().data(), offsets_compressed->size(), source_offsets_elements * sizeof(Offset));

            return res;
        });
}
int ColumnString::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, int, const Collator & collator) const
{
const ColumnString & rhs = assert_cast<const ColumnString &>(rhs_);

View File

@ -261,6 +261,8 @@ public:
void gather(ColumnGathererStream & gatherer_stream) override;
ColumnPtr compress() const override;
void reserve(size_t n) override;
void getExtremes(Field & min, Field & max) const override;

View File

@ -1,6 +1,7 @@
#include <Columns/ColumnTuple.h>
#include <Columns/IColumnImpl.h>
#include <Columns/ColumnCompressed.h>
#include <Core/Field.h>
#include <DataStreams/ColumnGathererStream.h>
#include <IO/Operators.h>
@ -486,7 +487,7 @@ bool ColumnTuple::structureEquals(const IColumn & rhs) const
bool ColumnTuple::isCollationSupported() const
{
for (const auto& column : columns)
for (const auto & column : columns)
{
if (column->isCollationSupported())
return true;
@ -495,4 +496,25 @@ bool ColumnTuple::isCollationSupported() const
}
/// Compress every element column separately.
/// The returned ColumnCompressed reports the sum of the compressed element sizes
/// and rebuilds a ColumnTuple from the decompressed elements on decompress().
ColumnPtr ColumnTuple::compress() const
{
    size_t byte_size = 0;
    Columns compressed;
    compressed.reserve(columns.size());
    for (const auto & column : columns)
    {
        auto compressed_column = column->compress();
        byte_size += compressed_column->byteSize();
        compressed.emplace_back(std::move(compressed_column));
    }

    /// The closure must not modify what it captured: overwriting the captured columns with
    /// their decompressed versions would keep the decompressed data alive inside the
    /// "compressed" column after the first call and would mutate shared state
    /// from the const decompress().
    return ColumnCompressed::create(size(), byte_size,
        [compressed = std::move(compressed)]
        {
            Columns decompressed;
            decompressed.reserve(compressed.size());
            for (const auto & column : compressed)
                decompressed.emplace_back(column->decompress());
            return ColumnTuple::create(decompressed);
        });
}
}

View File

@ -89,6 +89,7 @@ public:
void forEachSubcolumn(ColumnCallback callback) override;
bool structureEquals(const IColumn & rhs) const override;
bool isCollationSupported() const override;
ColumnPtr compress() const override;
size_t tupleSize() const { return columns.size(); }

View File

@ -28,13 +28,18 @@ namespace ErrorCodes
extern const int ILLEGAL_COLUMN;
}
/** Stores another column with unique values
* and also an index that allows to find position by value.
*
* This column is not used on its own but only as an implementation detail of ColumnLowCardinality.
*/
template <typename ColumnType>
class ColumnUnique final : public COWHelper<IColumnUnique, ColumnUnique<ColumnType>>
{
friend class COWHelper<IColumnUnique, ColumnUnique<ColumnType>>;
private:
explicit ColumnUnique(MutableColumnPtr && holder, bool is_nullable);
ColumnUnique(MutableColumnPtr && holder, bool is_nullable);
explicit ColumnUnique(const IDataType & type);
ColumnUnique(const ColumnUnique & other);

View File

@ -2,6 +2,7 @@
#include <pdqsort.h>
#include <Columns/ColumnsCommon.h>
#include <Columns/ColumnCompressed.h>
#include <DataStreams/ColumnGathererStream.h>
#include <IO/WriteHelpers.h>
#include <Common/Arena.h>
@ -520,6 +521,33 @@ void ColumnVector<T>::getExtremes(Field & min, Field & max) const
max = NearestFieldType<T>(cur_max);
}
#pragma GCC diagnostic ignored "-Wold-style-cast"
/// Compress the raw values with LZ4.
/// Columns smaller than 4096 bytes, and columns for which compressBuffer returns nullptr,
/// are wrapped without compression. Otherwise the returned ColumnCompressed
/// recreates a ColumnVector of the same size on decompress().
template <typename T>
ColumnPtr ColumnVector<T>::compress() const
{
    const size_t data_size = data.size();
    const size_t source_size = data_size * sizeof(T);

    /// Don't compress small blocks.
    if (source_size < 4096) /// A wild guess.
        return ColumnCompressed::wrap(this->getPtr());

    auto compressed = ColumnCompressed::compressBuffer(data.data(), source_size, false);

    if (!compressed)
        return ColumnCompressed::wrap(this->getPtr());

    /// Read the size before `compressed` is moved into the closure: the order of evaluation
    /// of function arguments is unspecified, so reading it inside the argument list
    /// could dereference an already moved-from pointer.
    const size_t compressed_size = compressed->size();

    return ColumnCompressed::create(data_size, compressed_size,
        [compressed = std::move(compressed), column_size = data_size]
        {
            auto res = ColumnVector<T>::create(column_size);
            ColumnCompressed::decompressBuffer(
                compressed->data(), res->getData().data(), compressed->size(), column_size * sizeof(T));
            return res;
        });
}
/// Explicit template instantiations - to avoid code bloat in headers.
template class ColumnVector<UInt8>;
template class ColumnVector<UInt16>;

View File

@ -298,6 +298,8 @@ public:
return typeid(rhs) == typeid(ColumnVector<T>);
}
ColumnPtr compress() const override;
/// Replace elements that match the filter with zeroes. If inverted replaces not matched elements.
void applyZeroMap(const IColumn::Filter & filt, bool inverted = false);

View File

@ -357,6 +357,21 @@ public:
throw Exception("Method structureEquals is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
/// Compress column in memory to some representation that allows to decompress it back.
/// Return itself if compression is not applicable for this column type.
/// The base implementation does no compression and returns this column unchanged.
virtual Ptr compress() const
{
/// No compression by default.
return getPtr();
}
/// If it's ColumnCompressed, decompress it and return.
/// Otherwise return itself.
/// The base implementation returns this column unchanged.
virtual Ptr decompress() const
{
return getPtr();
}
static MutablePtr mutate(Ptr ptr)
{
@ -460,7 +475,7 @@ using Columns = std::vector<ColumnPtr>;
using MutableColumns = std::vector<MutableColumnPtr>;
using ColumnRawPtrs = std::vector<const IColumn *>;
//using MutableColumnRawPtrs = std::vector<IColumn *>;
template <typename ... Args>
struct IsMutableColumns;

View File

@ -316,8 +316,8 @@ template <typename IndexType, typename ColumnType>
class ReverseIndex
{
public:
explicit ReverseIndex(UInt64 num_prefix_rows_to_skip_, UInt64 base_index_)
: num_prefix_rows_to_skip(num_prefix_rows_to_skip_), base_index(base_index_), saved_hash_ptr(nullptr) {}
ReverseIndex(UInt64 num_prefix_rows_to_skip_, UInt64 base_index_)
: num_prefix_rows_to_skip(num_prefix_rows_to_skip_), base_index(base_index_), saved_hash_ptr(nullptr) {}
void setColumn(ColumnType * column_);
@ -329,14 +329,16 @@ public:
/// Returns the found data's index in the dictionary. If index is not built, builds it.
UInt64 getInsertionPoint(StringRef data)
{
if (!index) buildIndex();
if (!index)
buildIndex();
return getIndexImpl(data);
}
/// Returns the found data's index in the dictionary if the #index is built, otherwise, returns a std::nullopt.
std::optional<UInt64> getIndex(StringRef data) const
{
if (!index) return {};
if (!index)
return {};
return getIndexImpl(data);
}

View File

@ -7,18 +7,21 @@ ADDINCL(
contrib/libs/icu/common
contrib/libs/icu/i18n
contrib/libs/pdqsort
contrib/libs/lz4
)
PEERDIR(
clickhouse/src/Common
contrib/libs/icu
contrib/libs/pdqsort
contrib/libs/lz4
)
SRCS(
Collator.cpp
ColumnAggregateFunction.cpp
ColumnArray.cpp
ColumnCompressed.cpp
ColumnConst.cpp
ColumnDecimal.cpp
ColumnFixedString.cpp

View File

@ -6,12 +6,14 @@ ADDINCL(
contrib/libs/icu/common
contrib/libs/icu/i18n
contrib/libs/pdqsort
contrib/libs/lz4
)
PEERDIR(
clickhouse/src/Common
contrib/libs/icu
contrib/libs/pdqsort
contrib/libs/lz4
)
SRCS(

View File

@ -537,6 +537,7 @@
M(568, RAFT_ERROR) \
M(569, MULTIPLE_COLUMNS_SERIALIZED_TO_SAME_PROTOBUF_FIELD) \
M(570, DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD) \
M(571, DATABASE_REPLICATION_FAILED) \
\
M(999, KEEPER_EXCEPTION) \
M(1000, POCO_EXCEPTION) \

View File

@ -59,7 +59,7 @@ static void addRootPath(String & path, const String & root_path)
throw Exception("Path cannot be empty", Error::ZBADARGUMENTS);
if (path[0] != '/')
throw Exception("Path must begin with /", Error::ZBADARGUMENTS);
throw Exception("Path must begin with /, got " + path, Error::ZBADARGUMENTS);
if (root_path.empty())
return;

View File

@ -194,6 +194,7 @@ public:
void removeChildren(const std::string & path);
using WaitCondition = std::function<bool()>;
/// Wait for the node to disappear or return immediately if it doesn't exist.
/// If condition is specified, it is used to return early (when condition returns false)
/// The function returns true if waited and false if waiting was interrupted by condition.
@ -314,8 +315,15 @@ public:
return std::make_shared<EphemeralNodeHolder>(path, zookeeper, false, false, "");
}
/// Mark the node as already removed: clears need_remove, so that the destructor
/// returns early instead of calling zookeeper.tryRemove(path).
void setAlreadyRemoved()
{
need_remove = false;
}
~EphemeralNodeHolder()
{
if (!need_remove)
return;
try
{
zookeeper.tryRemove(path);
@ -331,6 +339,7 @@ private:
std::string path;
ZooKeeper & zookeeper;
CurrentMetrics::Increment metric_increment{CurrentMetrics::EphemeralNode};
bool need_remove = true;
};
using EphemeralNodeHolderPtr = EphemeralNodeHolder::Ptr;

View File

@ -62,10 +62,10 @@ ColumnDefinition::ColumnDefinition()
ColumnDefinition::ColumnDefinition(
String schema_, String table_, String org_table_, String name_, String org_name_, uint16_t character_set_, uint32_t column_length_,
ColumnType column_type_, uint16_t flags_, uint8_t decimals_)
ColumnType column_type_, uint16_t flags_, uint8_t decimals_, bool with_defaults_)
: schema(std::move(schema_)), table(std::move(table_)), org_table(std::move(org_table_)), name(std::move(name_)),
org_name(std::move(org_name_)), character_set(character_set_), column_length(column_length_), column_type(column_type_),
flags(flags_), decimals(decimals_)
flags(flags_), decimals(decimals_), is_comm_field_list_response(with_defaults_)
{
}
@ -77,8 +77,15 @@ ColumnDefinition::ColumnDefinition(
size_t ColumnDefinition::getPayloadSize() const
{
return 12 + getLengthEncodedStringSize("def") + getLengthEncodedStringSize(schema) + getLengthEncodedStringSize(table) + getLengthEncodedStringSize(org_table) + \
getLengthEncodedStringSize(name) + getLengthEncodedStringSize(org_name) + getLengthEncodedNumberSize(next_length);
return 12 +
getLengthEncodedStringSize("def") +
getLengthEncodedStringSize(schema) +
getLengthEncodedStringSize(table) +
getLengthEncodedStringSize(org_table) +
getLengthEncodedStringSize(name) +
getLengthEncodedStringSize(org_name) +
getLengthEncodedNumberSize(next_length) +
is_comm_field_list_response;
}
void ColumnDefinition::readPayloadImpl(ReadBuffer & payload)
@ -115,6 +122,13 @@ void ColumnDefinition::writePayloadImpl(WriteBuffer & buffer) const
buffer.write(reinterpret_cast<const char *>(&flags), 2);
buffer.write(reinterpret_cast<const char *>(&decimals), 1);
writeChar(0x0, 2, buffer);
if (is_comm_field_list_response)
{
/// We should write length encoded int with string size
/// followed by string with some "default values" (possibly it's column defaults).
/// But we just send NULL for simplicity.
writeChar(0xfb, buffer);
}
}
ColumnDefinition getColumnDefinition(const String & column_name, const TypeIndex type_index)

View File

@ -101,6 +101,9 @@ public:
ColumnType column_type;
uint16_t flags;
uint8_t decimals = 0x00;
/// https://dev.mysql.com/doc/internals/en/com-query-response.html#column-definition
/// There are extra fields in the packet for column defaults
bool is_comm_field_list_response = false;
protected:
size_t getPayloadSize() const override;
@ -114,7 +117,7 @@ public:
ColumnDefinition(
String schema_, String table_, String org_table_, String name_, String org_name_, uint16_t character_set_, uint32_t column_length_,
ColumnType column_type_, uint16_t flags_, uint8_t decimals_);
ColumnType column_type_, uint16_t flags_, uint8_t decimals_, bool with_defaults_ = false);
/// Should be used when column metadata (original name, table, original table, database) is unknown.
ColumnDefinition(

View File

@ -69,7 +69,7 @@ class IColumn;
M(UInt64, s3_max_redirects, 10, "Max number of S3 redirects hops allowed.", 0) \
M(UInt64, s3_max_connections, 1024, "The maximum number of connections per server.", 0) \
M(Bool, extremes, false, "Calculate minimums and maximums of the result columns. They can be output in JSON-formats.", IMPORTANT) \
M(Bool, use_uncompressed_cache, true, "Whether to use the cache of uncompressed blocks.", 0) \
M(Bool, use_uncompressed_cache, false, "Whether to use the cache of uncompressed blocks.", 0) \
M(Bool, replace_running_query, false, "Whether the running request should be canceled with the same id as the new one.", 0) \
M(UInt64, background_buffer_flush_schedule_pool_size, 16, "Number of threads performing background flush for tables with Buffer engine. Only has meaning at server startup.", 0) \
M(UInt64, background_pool_size, 16, "Number of threads performing background work for tables (for example, merging in merge tree). Only has meaning at server startup.", 0) \
@ -421,6 +421,9 @@ class IColumn;
M(Bool, optimize_rewrite_sum_if_to_count_if, true, "Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent", 0) \
M(UInt64, insert_shard_id, 0, "If non zero, when insert into a distributed table, the data will be inserted into the shard `insert_shard_id` synchronously. Possible values range from 1 to `shards_number` of corresponding distributed table", 0) \
M(Bool, allow_experimental_query_deduplication, false, "Allow sending parts' UUIDs for a query in order to deduplicate data parts if any", 0) \
M(Bool, allow_experimental_database_replicated, false, "Allow to create databases with Replicated engine", 0) \
M(UInt64, database_replicated_initial_query_timeout_sec, 300, "How long initial DDL query should wait for Replicated database to precess previous DDL queue entries", 0) \
M(Bool, database_replicated_ddl_output, true, "Return table with query execution status as a result of DDL query", 0) \
\
/** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. */ \
\

View File

@ -146,9 +146,13 @@ bool RemoteQueryExecutorReadContext::checkTimeoutImpl() const
events[0].data.fd = events[1].data.fd = events[2].data.fd = -1;
/// Wait for epoll_fd will not block if it was polled externally.
int num_events = epoll_wait(epoll_fd, events, 3, 0);
if (num_events == -1)
throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET);
int num_events = 0;
while (num_events <= 0)
{
num_events = epoll_wait(epoll_fd, events, 3, -1);
if (num_events == -1 && errno != EINTR)
throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET);
}
bool is_socket_ready = false;
bool is_pipe_alarmed = false;

View File

@ -1,7 +1,9 @@
#pragma once
#include <DataTypes/IDataType.h>
#include <Columns/IColumnUnique.h>
namespace DB
{

View File

@ -4,13 +4,14 @@
#include <Poco/Path.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadBufferFromFile.h>
#include <Parsers/formatAST.h>
#include <Common/renameat2.h>
#include <Storages/StorageMaterializedView.h>
#include <Interpreters/Context.h>
#include <Interpreters/ExternalDictionariesLoader.h>
#include <filesystem>
#include <Interpreters/DDLTask.h>
namespace DB
{
@ -34,7 +35,6 @@ public:
UUID uuid() const override { return table()->getStorageID().uuid; }
};
DatabaseAtomic::DatabaseAtomic(String name_, String metadata_path_, UUID uuid, const String & logger_name, const Context & context_)
: DatabaseOrdinary(name_, std::move(metadata_path_), "store/", logger_name, context_)
, path_to_table_symlinks(global_context.getPath() + "data/" + escapeForFileName(name_) + "/")
@ -106,7 +106,7 @@ StoragePtr DatabaseAtomic::detachTable(const String & name)
return table;
}
void DatabaseAtomic::dropTable(const Context &, const String & table_name, bool no_delay)
void DatabaseAtomic::dropTable(const Context & context, const String & table_name, bool no_delay)
{
String table_metadata_path = getObjectMetadataPath(table_name);
String table_metadata_path_drop;
@ -115,6 +115,16 @@ void DatabaseAtomic::dropTable(const Context &, const String & table_name, bool
std::unique_lock lock(mutex);
table = getTableUnlocked(table_name, lock);
table_metadata_path_drop = DatabaseCatalog::instance().getPathForDroppedMetadata(table->getStorageID());
auto txn = context.getZooKeeperMetadataTransaction();
if (txn && !context.isInternalSubquery())
txn->commit(); /// Commit point (a sort of) for Replicated database
/// NOTE: replica will be lost if server crashes before the following rename
/// We apply changes in ZooKeeper before applying changes in local metadata file
/// to reduce probability of failures between these operations
/// (it's more likely to lose connection, than to fail before applying local changes).
/// TODO better detection and recovery
Poco::File(table_metadata_path).renameTo(table_metadata_path_drop); /// Mark table as dropped
DatabaseWithDictionaries::detachTableUnlocked(table_name, lock); /// Should never throw
table_name_to_path.erase(table_name);
@ -124,7 +134,7 @@ void DatabaseAtomic::dropTable(const Context &, const String & table_name, bool
/// Remove the inner table (if any) to avoid deadlock
/// (due to attempt to execute DROP from the worker thread)
if (auto * mv = dynamic_cast<StorageMaterializedView *>(table.get()))
mv->dropInnerTable(no_delay);
mv->dropInnerTable(no_delay, context);
/// Notify DatabaseCatalog that table was dropped. It will remove table data in background.
/// Cleanup is performed outside of database to allow easily DROP DATABASE without waiting for cleanup to complete.
DatabaseCatalog::instance().enqueueDroppedTableCleanup(table->getStorageID(), table, table_metadata_path_drop, no_delay);
@ -144,6 +154,8 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n
if (exchange && dictionary)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot exchange dictionaries");
if (exchange && !supportsRenameat2())
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "RENAME EXCHANGE is not supported");
auto & other_db = dynamic_cast<DatabaseAtomic &>(to_database);
bool inside_database = this == &other_db;
@ -232,6 +244,13 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n
}
/// Table renaming actually begins here
auto txn = context.getZooKeeperMetadataTransaction();
if (txn && !context.isInternalSubquery())
txn->commit(); /// Commit point (a sort of) for Replicated database
/// NOTE: replica will be lost if server crashes before the following rename
/// TODO better detection and recovery
if (exchange)
renameExchange(old_metadata_path, new_metadata_path);
else
@ -267,7 +286,8 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n
}
void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table,
const String & table_metadata_tmp_path, const String & table_metadata_path)
const String & table_metadata_tmp_path, const String & table_metadata_path,
const Context & query_context)
{
DetachedTables not_in_use;
auto table_data_path = getTableDataPath(query);
@ -284,6 +304,14 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora
/// We will get an exception if some table with the same UUID exists (even if it's detached table or table from another database)
DatabaseCatalog::instance().addUUIDMapping(query.uuid);
locked_uuid = true;
auto txn = query_context.getZooKeeperMetadataTransaction();
if (txn && !query_context.isInternalSubquery())
txn->commit(); /// Commit point (a sort of) for Replicated database
/// NOTE: replica will be lost if server crashes before the following renameNoReplace(...)
/// TODO better detection and recovery
/// It throws if `table_metadata_path` already exists (it's possible if table was detached)
renameNoReplace(table_metadata_tmp_path, table_metadata_path); /// Commit point (a sort of)
attachTableUnlocked(query.table, table, lock); /// Should never throw
@ -300,7 +328,8 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora
tryCreateSymlink(query.table, table_data_path);
}
void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path)
void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path,
const String & /*statement*/, const Context & query_context)
{
bool check_file_exists = true;
SCOPE_EXIT({ std::error_code code; if (check_file_exists) std::filesystem::remove(table_metadata_tmp_path, code); });
@ -311,6 +340,13 @@ void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String &
if (table_id.uuid != actual_table_id.uuid)
throw Exception("Cannot alter table because it was renamed", ErrorCodes::CANNOT_ASSIGN_ALTER);
auto txn = query_context.getZooKeeperMetadataTransaction();
if (txn && !query_context.isInternalSubquery())
txn->commit(); /// Commit point (a sort of) for Replicated database
/// NOTE: replica will be lost if server crashes before the following rename
/// TODO better detection and recovery
check_file_exists = renameExchangeIfSupported(table_metadata_tmp_path, table_metadata_path);
if (!check_file_exists)
std::filesystem::rename(table_metadata_tmp_path, table_metadata_path);
@ -329,6 +365,12 @@ void DatabaseAtomic::assertDetachedTableNotInUse(const UUID & uuid)
", because it was detached but still used by some query. Retry later.", ErrorCodes::TABLE_ALREADY_EXISTS);
}
/// Drop the entry for the given UUID from detached_tables while holding the database mutex.
/// No check of whether the detached table is still referenced is performed here.
void DatabaseAtomic::setDetachedTableNotInUseForce(const UUID & uuid)
{
    std::unique_lock guard(mutex);
    detached_tables.erase(uuid);
}
DatabaseAtomic::DetachedTables DatabaseAtomic::cleanupDetachedTables()
{
DetachedTables not_in_use;

View File

@ -58,11 +58,12 @@ public:
void tryRemoveSymlink(const String & table_name);
void waitDetachedTableNotInUse(const UUID & uuid) override;
void setDetachedTableNotInUseForce(const UUID & uuid);
private:
void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path) override;
protected:
void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & statement, const Context & query_context) override;
void commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table,
const String & table_metadata_tmp_path, const String & table_metadata_path) override;
const String & table_metadata_tmp_path, const String & table_metadata_path, const Context & query_context) override;
void assertDetachedTableNotInUse(const UUID & uuid);
typedef std::unordered_map<UUID, StoragePtr> DetachedTables;

View File

@ -1,6 +1,7 @@
#include <Databases/DatabaseFactory.h>
#include <Databases/DatabaseAtomic.h>
#include <Databases/DatabaseReplicated.h>
#include <Databases/DatabaseDictionary.h>
#include <Databases/DatabaseLazy.h>
#include <Databases/DatabaseMemory.h>
@ -13,6 +14,7 @@
#include <Poco/File.h>
#include <Poco/Path.h>
#include <Interpreters/Context.h>
#include <Common/Macros.h>
#if !defined(ARCADIA_BUILD)
# include "config_core.h"
@ -98,14 +100,16 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String
const String & engine_name = engine_define->engine->name;
const UUID & uuid = create.uuid;
if (engine_name != "MySQL" && engine_name != "MaterializeMySQL"
&& engine_name != "PostgreSQL" && engine_name != "PostgreSQLReplica"
&& engine_name != "Lazy" && engine_define->engine->arguments)
bool engine_may_have_arguments = engine_name == "MySQL" || engine_name == "MaterializeMySQL" || engine_name == "Lazy" ||
engine_name == "Replicated" || engine_name == "PostgreSQL" || engine_name == "PostgreSQLReplica";
if (engine_define->engine->arguments && !engine_may_have_arguments)
throw Exception("Database engine " + engine_name + " cannot have arguments", ErrorCodes::BAD_ARGUMENTS);
if (engine_define->engine->parameters || engine_define->partition_by || engine_define->primary_key ||
engine_define->order_by || engine_define->sample_by ||
(!endsWith(engine_name, "MySQL") && (engine_name != "PostgreSQLReplica") && engine_define->settings))
bool has_unexpected_element = engine_define->engine->parameters || engine_define->partition_by ||
engine_define->primary_key || engine_define->order_by ||
engine_define->sample_by;
bool may_have_settings = endsWith(engine_name, "MySQL") || engine_name == "Replicated" || engine_name == "PostgreSQLReplica";
if (has_unexpected_element || (!may_have_settings && engine_define->settings))
throw Exception("Database engine " + engine_name + " cannot have parameters, primary_key, order_by, sample_by, settings",
ErrorCodes::UNKNOWN_ELEMENT_IN_AST);
@ -189,6 +193,32 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String
return std::make_shared<DatabaseLazy>(database_name, metadata_path, cache_expiration_time_seconds, context);
}
else if (engine_name == "Replicated")
{
const ASTFunction * engine = engine_define->engine;
if (!engine->arguments || engine->arguments->children.size() != 3)
throw Exception("Replicated database requires 3 arguments: zookeeper path, shard name and replica name", ErrorCodes::BAD_ARGUMENTS);
const auto & arguments = engine->arguments->children;
String zookeeper_path = safeGetLiteralValue<String>(arguments[0], "Replicated");
String shard_name = safeGetLiteralValue<String>(arguments[1], "Replicated");
String replica_name = safeGetLiteralValue<String>(arguments[2], "Replicated");
zookeeper_path = context.getMacros()->expand(zookeeper_path);
shard_name = context.getMacros()->expand(shard_name);
replica_name = context.getMacros()->expand(replica_name);
DatabaseReplicatedSettings database_replicated_settings{};
if (engine_define->settings)
database_replicated_settings.loadFromQuery(*engine_define);
return std::make_shared<DatabaseReplicated>(database_name, metadata_path, uuid,
zookeeper_path, shard_name, replica_name,
std::move(database_replicated_settings), context);
}
#if USE_LIBPQXX
else if (engine_name == "PostgreSQL")

View File

@ -129,6 +129,60 @@ String getObjectDefinitionFromCreateQuery(const ASTPtr & query)
return statement_buf.str();
}
void applyMetadataChangesToCreateQuery(const ASTPtr & query, const StorageInMemoryMetadata & metadata)
{
auto & ast_create_query = query->as<ASTCreateQuery &>();
bool has_structure = ast_create_query.columns_list && ast_create_query.columns_list->columns;
if (ast_create_query.as_table_function && !has_structure)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot alter table {} because it was created AS table function"
" and doesn't have structure in metadata", backQuote(ast_create_query.table));
assert(has_structure);
ASTPtr new_columns = InterpreterCreateQuery::formatColumns(metadata.columns);
ASTPtr new_indices = InterpreterCreateQuery::formatIndices(metadata.secondary_indices);
ASTPtr new_constraints = InterpreterCreateQuery::formatConstraints(metadata.constraints);
ast_create_query.columns_list->replace(ast_create_query.columns_list->columns, new_columns);
ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->indices, new_indices);
ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->constraints, new_constraints);
if (metadata.select.select_query)
{
query->replace(ast_create_query.select, metadata.select.select_query);
}
/// MaterializedView is one type of CREATE query without storage.
if (ast_create_query.storage)
{
ASTStorage & storage_ast = *ast_create_query.storage;
bool is_extended_storage_def
= storage_ast.partition_by || storage_ast.primary_key || storage_ast.order_by || storage_ast.sample_by || storage_ast.settings;
if (is_extended_storage_def)
{
if (metadata.sorting_key.definition_ast)
storage_ast.set(storage_ast.order_by, metadata.sorting_key.definition_ast);
if (metadata.primary_key.definition_ast)
storage_ast.set(storage_ast.primary_key, metadata.primary_key.definition_ast);
if (metadata.sampling_key.definition_ast)
storage_ast.set(storage_ast.sample_by, metadata.sampling_key.definition_ast);
if (metadata.table_ttl.definition_ast)
storage_ast.set(storage_ast.ttl_table, metadata.table_ttl.definition_ast);
else if (storage_ast.ttl_table != nullptr) /// TTL was removed
storage_ast.ttl_table = nullptr;
if (metadata.settings_changes)
storage_ast.set(storage_ast.settings, metadata.settings_changes);
}
}
}
DatabaseOnDisk::DatabaseOnDisk(
const String & name,
const String & metadata_path_,
@ -214,7 +268,7 @@ void DatabaseOnDisk::createTable(
out.close();
}
commitCreateTable(create, table, table_metadata_tmp_path, table_metadata_path);
commitCreateTable(create, table, table_metadata_tmp_path, table_metadata_path, context);
removeDetachedPermanentlyFlag(table_name, table_metadata_path);
}
@ -238,7 +292,8 @@ void DatabaseOnDisk::removeDetachedPermanentlyFlag(const String & table_name, co
}
void DatabaseOnDisk::commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table,
const String & table_metadata_tmp_path, const String & table_metadata_path)
const String & table_metadata_tmp_path, const String & table_metadata_path,
const Context & /*query_context*/)
{
try
{
@ -256,7 +311,7 @@ void DatabaseOnDisk::commitCreateTable(const ASTCreateQuery & query, const Stora
}
}
void DatabaseOnDisk::detachTablePermanently(const String & table_name)
void DatabaseOnDisk::detachTablePermanently(const Context &, const String & table_name)
{
auto table = detachTable(table_name);
@ -352,6 +407,8 @@ void DatabaseOnDisk::renameTable(
from_ordinary_to_atomic = true;
else if (typeid_cast<DatabaseAtomic *>(this) && typeid_cast<DatabaseOrdinary *>(&to_database))
from_atomic_to_ordinary = true;
else if (dynamic_cast<DatabaseAtomic *>(this) && typeid_cast<DatabaseOrdinary *>(&to_database) && getEngineName() == "Replicated")
from_atomic_to_ordinary = true;
else
throw Exception("Moving tables between databases of different engines is not supported", ErrorCodes::NOT_IMPLEMENTED);
}
@ -363,6 +420,7 @@ void DatabaseOnDisk::renameTable(
/// DatabaseLazy::detachTable may return nullptr even if table exists, so we need tryGetTable for this case.
StoragePtr table = tryGetTable(table_name, global_context);
detachTable(table_name);
UUID prev_uuid = UUIDHelpers::Nil;
try
{
table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout);
@ -375,7 +433,7 @@ void DatabaseOnDisk::renameTable(
if (from_ordinary_to_atomic)
create.uuid = UUIDHelpers::generateV4();
if (from_atomic_to_ordinary)
create.uuid = UUIDHelpers::Nil;
std::swap(create.uuid, prev_uuid);
if (auto * target_db = dynamic_cast<DatabaseOnDisk *>(&to_database))
target_db->checkMetadataFilenameAvailability(to_table_name);
@ -400,12 +458,16 @@ void DatabaseOnDisk::renameTable(
Poco::File(table_metadata_path).remove();
/// Special case: usually no actions with symlinks are required when detaching/attaching table,
/// but not when moving from Atomic database to Ordinary
if (from_atomic_to_ordinary && table->storesDataOnDisk())
if (from_atomic_to_ordinary)
{
auto & atomic_db = assert_cast<DatabaseAtomic &>(*this);
atomic_db.tryRemoveSymlink(table_name);
auto & atomic_db = dynamic_cast<DatabaseAtomic &>(*this);
/// Special case: usually no actions with symlinks are required when detaching/attaching table,
/// but not when moving from Atomic database to Ordinary
if (table->storesDataOnDisk())
atomic_db.tryRemoveSymlink(table_name);
/// Forget about UUID, now it's possible to reuse it for new table
DatabaseCatalog::instance().removeUUIDMappingFinally(prev_uuid);
atomic_db.setDetachedTableNotInUseForce(prev_uuid);
}
}

View File

@ -25,6 +25,8 @@ std::pair<String, StoragePtr> createTableFromAST(
*/
String getObjectDefinitionFromCreateQuery(const ASTPtr & query);
void applyMetadataChangesToCreateQuery(const ASTPtr & query, const StorageInMemoryMetadata & metadata);
/* Class to provide basic operations with tables when metadata is stored on disk in .sql files.
*/
@ -39,7 +41,7 @@ public:
const StoragePtr & table,
const ASTPtr & query) override;
void detachTablePermanently(const String & table_name) override;
void detachTablePermanently(const Context & context, const String & table_name) override;
void dropTable(
const Context & context,
@ -90,7 +92,7 @@ protected:
ASTPtr getCreateQueryFromMetadata(const String & metadata_path, bool throw_on_error) const;
virtual void commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table,
const String & table_metadata_tmp_path, const String & table_metadata_path);
const String & table_metadata_tmp_path, const String & table_metadata_path, const Context & query_context);
const String metadata_path;
const String data_path;

View File

@ -33,11 +33,6 @@ static constexpr size_t PRINT_MESSAGE_EACH_N_OBJECTS = 256;
static constexpr size_t PRINT_MESSAGE_EACH_N_SECONDS = 5;
static constexpr size_t METADATA_FILE_BUFFER_SIZE = 32768;
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
namespace
{
void tryAttachTable(
@ -272,55 +267,7 @@ void DatabaseOrdinary::alterTable(const Context & context, const StorageID & tab
0,
context.getSettingsRef().max_parser_depth);
auto & ast_create_query = ast->as<ASTCreateQuery &>();
bool has_structure = ast_create_query.columns_list && ast_create_query.columns_list->columns;
if (ast_create_query.as_table_function && !has_structure)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot alter table {} because it was created AS table function"
" and doesn't have structure in metadata", backQuote(table_name));
assert(has_structure);
ASTPtr new_columns = InterpreterCreateQuery::formatColumns(metadata.columns);
ASTPtr new_indices = InterpreterCreateQuery::formatIndices(metadata.secondary_indices);
ASTPtr new_constraints = InterpreterCreateQuery::formatConstraints(metadata.constraints);
ast_create_query.columns_list->replace(ast_create_query.columns_list->columns, new_columns);
ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->indices, new_indices);
ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->constraints, new_constraints);
if (metadata.select.select_query)
{
ast->replace(ast_create_query.select, metadata.select.select_query);
}
/// MaterializedView is one type of CREATE query without storage.
if (ast_create_query.storage)
{
ASTStorage & storage_ast = *ast_create_query.storage;
bool is_extended_storage_def
= storage_ast.partition_by || storage_ast.primary_key || storage_ast.order_by || storage_ast.sample_by || storage_ast.settings;
if (is_extended_storage_def)
{
if (metadata.sorting_key.definition_ast)
storage_ast.set(storage_ast.order_by, metadata.sorting_key.definition_ast);
if (metadata.primary_key.definition_ast)
storage_ast.set(storage_ast.primary_key, metadata.primary_key.definition_ast);
if (metadata.sampling_key.definition_ast)
storage_ast.set(storage_ast.sample_by, metadata.sampling_key.definition_ast);
if (metadata.table_ttl.definition_ast)
storage_ast.set(storage_ast.ttl_table, metadata.table_ttl.definition_ast);
else if (storage_ast.ttl_table != nullptr) /// TTL was removed
storage_ast.ttl_table = nullptr;
if (metadata.settings_changes)
storage_ast.set(storage_ast.settings, metadata.settings_changes);
}
}
applyMetadataChangesToCreateQuery(ast, metadata);
statement = getObjectDefinitionFromCreateQuery(ast);
{
@ -332,10 +279,10 @@ void DatabaseOrdinary::alterTable(const Context & context, const StorageID & tab
out.close();
}
commitAlterTable(table_id, table_metadata_tmp_path, table_metadata_path);
commitAlterTable(table_id, table_metadata_tmp_path, table_metadata_path, statement, context);
}
void DatabaseOrdinary::commitAlterTable(const StorageID &, const String & table_metadata_tmp_path, const String & table_metadata_path)
void DatabaseOrdinary::commitAlterTable(const StorageID &, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & /*statement*/, const Context & /*query_context*/)
{
try
{

View File

@ -30,7 +30,7 @@ public:
const StorageInMemoryMetadata & metadata) override;
protected:
virtual void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path);
virtual void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & statement, const Context & query_context);
void startupTables(ThreadPool & thread_pool);
};

View File

@ -0,0 +1,719 @@
#include <DataTypes/DataTypeString.h>
#include <Databases/DatabaseReplicated.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/Context.h>
#include <Interpreters/executeQuery.h>
#include <Parsers/queryToString.h>
#include <Common/Exception.h>
#include <Common/Stopwatch.h>
#include <Common/ZooKeeper/KeeperException.h>
#include <Common/ZooKeeper/Types.h>
#include <Common/ZooKeeper/ZooKeeper.h>
#include <Databases/DatabaseReplicatedWorker.h>
#include <Interpreters/DDLTask.h>
#include <Interpreters/executeDDLQueryOnCluster.h>
#include <Interpreters/Cluster.h>
#include <common/getFQDNOrHostName.h>
#include <Parsers/ASTAlterQuery.h>
#include <Parsers/ParserCreateQuery.h>
#include <Parsers/parseQuery.h>
#include <Interpreters/InterpreterCreateQuery.h>
#include <Parsers/formatAST.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NO_ZOOKEEPER;
extern const int LOGICAL_ERROR;
extern const int BAD_ARGUMENTS;
extern const int REPLICA_IS_ALREADY_EXIST;
extern const int DATABASE_REPLICATION_FAILED;
extern const int UNKNOWN_DATABASE;
extern const int UNKNOWN_TABLE;
extern const int NOT_IMPLEMENTED;
extern const int INCORRECT_QUERY;
extern const int ALL_CONNECTION_TRIES_FAILED;
}
/// getCluster() skips replicas whose ZooKeeper node data is equal to this mark.
static constexpr const char * DROPPED_MARK = "DROPPED";
/// Appended to the database name to get the name of the database that recoverLostReplica() moves broken tables into.
static constexpr const char * BROKEN_TABLES_SUFFIX = "_broken_tables";
/// Returns the ZooKeeper client provided by the global context.
zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const
{
return global_context.getZooKeeper();
}
/// Builds the identifier that a replica writes into its ZooKeeper node: "<address of this server>:<database UUID>",
/// where the address is made from the FQDN (or host name) and the TCP port of the server.
static inline String getHostID(const Context & global_context, const UUID & db_uuid)
{
    String host_id = Cluster::Address::toString(getFQDNOrHostName(), global_context.getTCPPort());
    host_id += ':';
    host_id += toString(db_uuid);
    return host_id;
}
/// The destructor is defaulted here, in the .cpp file.
/// NOTE(review): presumably this is needed to destroy ddl_worker where DatabaseReplicatedDDLWorker is a complete type - confirm against the header.
DatabaseReplicated::~DatabaseReplicated() = default;
/// Validates engine arguments and normalizes the ZooKeeper path (no trailing '/', always a leading '/').
/// Does not access ZooKeeper. Throws BAD_ARGUMENTS if any argument is empty
/// or if shard/replica name contains '/' or '|' ('|' is the separator used by getFullReplicaName()).
DatabaseReplicated::DatabaseReplicated(
    const String & name_,
    const String & metadata_path_,
    UUID uuid,
    const String & zookeeper_path_,
    const String & shard_name_,
    const String & replica_name_,
    DatabaseReplicatedSettings db_settings_,
    const Context & context_)
    : DatabaseAtomic(name_, metadata_path_, uuid, "DatabaseReplicated (" + name_ + ")", context_)
    , zookeeper_path(zookeeper_path_)
    , shard_name(shard_name_)
    , replica_name(replica_name_)
    , db_settings(std::move(db_settings_))
{
    const bool has_empty_argument = zookeeper_path.empty() || shard_name.empty() || replica_name.empty();
    if (has_empty_argument)
        throw Exception("ZooKeeper path, shard and replica names must be non-empty", ErrorCodes::BAD_ARGUMENTS);

    const auto shard_or_replica_contains = [this](char symbol)
    {
        return shard_name.find(symbol) != std::string::npos || replica_name.find(symbol) != std::string::npos;
    };

    if (shard_or_replica_contains('/'))
        throw Exception("Shard and replica names should not contain '/'", ErrorCodes::BAD_ARGUMENTS);
    if (shard_or_replica_contains('|'))
        throw Exception("Shard and replica names should not contain '|'", ErrorCodes::BAD_ARGUMENTS);

    /// Strip a single trailing slash
    if (zookeeper_path.back() == '/')
        zookeeper_path.pop_back();

    /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it.
    if (zookeeper_path.front() != '/')
        zookeeper_path.insert(0, 1, '/');
}
/// Returns "<shard_name>|<replica_name>": the name of the replica node under "<zookeeper_path>/replicas".
/// It is the inverse of parseFullReplicaName().
String DatabaseReplicated::getFullReplicaName() const
{
    String full_name = shard_name;
    full_name += '|';
    full_name += replica_name;
    return full_name;
}
std::pair<String, String> DatabaseReplicated::parseFullReplicaName(const String & name)
{
String shard;
String replica;
auto pos = name.find('|');
if (pos == std::string::npos || name.find('|', pos + 1) != std::string::npos)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Incorrect replica identifier: {}", name);
shard = name.substr(0, pos);
replica = name.substr(pos + 1);
return {shard, replica};
}
/// Builds a Cluster from the replicas registered under "<zookeeper_path>/replicas".
/// Names of replica nodes have form "shard|replica" (see parseFullReplicaName), data of a replica node
/// is its host ID (see getHostID) or DROPPED_MARK; replicas with DROPPED_MARK are skipped.
/// The list of replicas and their host IDs cannot be read atomically, so the child version (cversion)
/// of the "replicas" node is checked before and after reading, and reading is retried (up to max_retries times)
/// if replicas were created or removed in between.
/// Throws LOGICAL_ERROR if there are no replicas and ALL_CONNECTION_TRIES_FAILED if all attempts were inconsistent.
ClusterPtr DatabaseReplicated::getCluster() const
{
    /// TODO Maintain up-to-date Cluster and allow to use it in Distributed tables
    Strings hosts;
    Strings host_ids;

    auto zookeeper = global_context.getZooKeeper();
    constexpr int max_retries = 10;
    int iteration = 0;
    bool success = false;
    while (++iteration <= max_retries)
    {
        host_ids.resize(0);
        Coordination::Stat stat;
        hosts = zookeeper->getChildren(zookeeper_path + "/replicas", &stat);
        if (hosts.empty())
            throw Exception(ErrorCodes::LOGICAL_ERROR, "No hosts found");
        Int32 cver = stat.cversion;
        /// Sorted names group replicas of the same shard together
        std::sort(hosts.begin(), hosts.end());

        std::vector<zkutil::ZooKeeper::FutureGet> futures;
        futures.reserve(hosts.size());
        host_ids.reserve(hosts.size());
        for (const auto & host : hosts)
            futures.emplace_back(zookeeper->asyncTryGet(zookeeper_path + "/replicas/" + host));

        success = true;
        for (auto & future : futures)
        {
            auto res = future.get();
            if (res.error != Coordination::Error::ZOK)
                success = false;
            host_ids.emplace_back(res.data);
        }

        /// Creation or removal of a replica changes the child version (cversion) of the parent node,
        /// not its data version, so cversion must be compared with cversion.
        zookeeper->get(zookeeper_path + "/replicas", &stat);
        if (cver != stat.cversion)
            success = false;
        if (success)
            break;
    }
    if (!success)
        throw Exception(ErrorCodes::ALL_CONNECTION_TRIES_FAILED, "Cannot get consistent cluster snapshot, "
                                                                 "because replicas are created or removed concurrently");

    assert(!hosts.empty());
    assert(hosts.size() == host_ids.size());
    String current_shard = parseFullReplicaName(hosts.front()).first;
    std::vector<Strings> shards;
    shards.emplace_back();
    for (size_t i = 0; i < hosts.size(); ++i)
    {
        const auto & id = host_ids[i];
        if (id == DROPPED_MARK)
            continue;
        auto [shard, replica] = parseFullReplicaName(hosts[i]);
        /// The part of host ID before the first ':' is the host name
        auto pos = id.find(':');
        String host = id.substr(0, pos);
        if (shard != current_shard)
        {
            current_shard = shard;
            /// Do not leave an empty shard behind if all its replicas were skipped
            if (!shards.back().empty())
                shards.emplace_back();
        }
        shards.back().emplace_back(unescapeForFileName(host));
    }

    /// TODO make it configurable
    String username = "default";
    String password;

    return std::make_shared<Cluster>(global_context.getSettingsRef(), shards, username, password, global_context.getTCPPort(), false);
}
void DatabaseReplicated::tryConnectToZooKeeperAndInitDatabase(bool force_attach)
{
try
{
if (!global_context.hasZooKeeper())
{
throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER);
}
auto current_zookeeper = global_context.getZooKeeper();
if (!current_zookeeper->exists(zookeeper_path))
{
/// Create new database, multiple nodes can execute it concurrently
createDatabaseNodesInZooKeeper(current_zookeeper);
}
replica_path = zookeeper_path + "/replicas/" + getFullReplicaName();
String replica_host_id;
if (current_zookeeper->tryGet(replica_path, replica_host_id))
{
String host_id = getHostID(global_context, db_uuid);
if (replica_host_id != host_id)
throw Exception(ErrorCodes::REPLICA_IS_ALREADY_EXIST,
"Replica {} of shard {} of replicated database at {} already exists. Replica host ID: '{}', current host ID: '{}'",
replica_name, shard_name, zookeeper_path, replica_host_id, host_id);
}
else
{
/// Throws if replica with the same name already exists
createReplicaNodesInZooKeeper(current_zookeeper);
}
is_readonly = false;
}
catch (...)
{
if (!force_attach)
throw;
/// It's server startup, ignore error.
/// Worker thread will try to setup ZooKeeper connection
tryLogCurrentException(log);
}
}
/// Creates the initial layout of the database in ZooKeeper with a single multi-request.
/// Returns true if the nodes were created by this call and false if they already exist
/// (some other replica has created the database). Throws on any other ZooKeeper error.
bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper)
{
    current_zookeeper->createAncestors(zookeeper_path);

    Coordination::Requests requests;
    const auto add_persistent_node = [&requests](const String & path, const String & data)
    {
        requests.emplace_back(zkutil::makeCreateRequest(path, data, zkutil::CreateMode::Persistent));
    };

    add_persistent_node(zookeeper_path, "");
    add_persistent_node(zookeeper_path + "/log", "");
    add_persistent_node(zookeeper_path + "/replicas", "");
    add_persistent_node(zookeeper_path + "/counter", "");
    /// We create and remove counter/cnt- node to increment sequential number of counter/ node and make log entry numbers start from 1.
    /// New replicas are created with log pointer equal to 0 and log pointer is a number of the last executed entry.
    /// It means that we cannot have log entry with number 0.
    add_persistent_node(zookeeper_path + "/counter/cnt-", "");
    requests.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/counter/cnt-", -1));
    add_persistent_node(zookeeper_path + "/metadata", "");
    add_persistent_node(zookeeper_path + "/max_log_ptr", "1");
    add_persistent_node(zookeeper_path + "/logs_to_keep", "1000");

    Coordination::Responses responses;
    const auto code = current_zookeeper->tryMulti(requests, responses);

    /// Created new database (it's the first replica)
    if (code == Coordination::Error::ZOK)
        return true;

    /// Database exists, we will add new replica
    if (code == Coordination::Error::ZNODEEXISTS)
        return false;

    /// Other codes are unexpected, will throw
    zkutil::KeeperMultiException::check(code, requests, responses);
    assert(false);
    __builtin_unreachable();
}
/// Registers this replica in ZooKeeper: creates the replica node (with host ID as data) and its log_ptr node ("0"),
/// and atomically appends an empty entry to the log. Throws if the multi-request fails.
void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper)
{
    /// Write host name to replica_path, it will protect from multiple replicas with the same name
    const String host_id = getHostID(global_context, db_uuid);

    /// On replica creation add empty entry to log. Can be used to trigger some actions on other replicas (e.g. update cluster info).
    DDLLogEntry empty_entry{};

    /// Take the next sequential number from the counter and use it as the number of the new log entry
    const String counter_prefix = zookeeper_path + "/counter/cnt-";
    const String counter_node = current_zookeeper->create(counter_prefix, "", zkutil::CreateMode::EphemeralSequential);
    const String entry_number = counter_node.substr(counter_prefix.size());
    const String entry_path = zookeeper_path + "/log/query-" + entry_number;

    Coordination::Requests requests;
    requests.emplace_back(zkutil::makeCreateRequest(replica_path, host_id, zkutil::CreateMode::Persistent));
    requests.emplace_back(zkutil::makeCreateRequest(replica_path + "/log_ptr", "0", zkutil::CreateMode::Persistent));
    requests.emplace_back(zkutil::makeCreateRequest(entry_path, empty_entry.toString(), zkutil::CreateMode::Persistent));
    requests.emplace_back(zkutil::makeRemoveRequest(counter_node, -1));
    current_zookeeper->multi(requests);
}
/// Loads the database: tries to initialize it in ZooKeeper, then loads local tables using DatabaseAtomic,
/// then creates and starts the DDL worker of this database.
void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach)
{
/// If force_attach is set, ZooKeeper errors are logged and not rethrown (see tryConnectToZooKeeperAndInitDatabase)
tryConnectToZooKeeperAndInitDatabase(force_attach);
DatabaseAtomic::loadStoredObjects(context, has_force_restore_data_flag, force_attach);
/// The worker is started only after local tables are loaded
ddl_worker = std::make_unique<DatabaseReplicatedDDLWorker>(this, global_context);
ddl_worker->startup();
}
/// Writes a DDL query to the replication log of the database and executes it using the DDL worker.
/// Returns empty BlockIO if distributed_ddl_task_timeout is 0 or database_replicated_ddl_output is disabled,
/// otherwise BlockIO with a stream that reports the status of the query on replicas.
/// Throws NO_ZOOKEEPER if the database is readonly, INCORRECT_QUERY if it's not an initial query
/// and NOT_IMPLEMENTED for unsupported types of ALTER.
BlockIO DatabaseReplicated::tryEnqueueReplicatedDDL(const ASTPtr & query, const Context & query_context)
{
    if (is_readonly)
        throw Exception(ErrorCodes::NO_ZOOKEEPER, "Database is in readonly mode, because it cannot connect to ZooKeeper");

    const bool is_initial_query = query_context.getClientInfo().query_kind == ClientInfo::QueryKind::INITIAL_QUERY;
    if (!is_initial_query)
        throw Exception(ErrorCodes::INCORRECT_QUERY, "It's not initial query. ON CLUSTER is not allowed for Replicated database.");

    /// Replicas will set correct name of current database in query context (database name can be different on replicas)
    if (auto * query_with_table = query->as<ASTQueryWithTableAndOutput>())
        query_with_table->database.clear();

    if (const auto * alter = query->as<ASTAlterQuery>())
    {
        for (const auto & child : alter->command_list->children)
        {
            const auto & alter_command = child->as<ASTAlterCommand&>();
            if (!isSupportedAlterType(alter_command.type))
                throw Exception("Unsupported type of ALTER query", ErrorCodes::NOT_IMPLEMENTED);
        }
    }

    /// The query is serialized after the database name was cleared
    const String serialized_query = queryToString(query);
    LOG_DEBUG(log, "Proposing query: {}", serialized_query);

    /// TODO maybe write current settings to log entry?
    DDLLogEntry entry;
    entry.query = serialized_query;
    entry.initiator = ddl_worker->getCommonHostID();
    String node_path = ddl_worker->tryEnqueueAndExecuteEntry(entry, query_context);

    const auto & settings = query_context.getSettingsRef();

    BlockIO io;
    /// Zero timeout: do not wait for other replicas
    if (settings.distributed_ddl_task_timeout == 0)
        return io;

    Strings hosts_to_wait = getZooKeeper()->getChildren(zookeeper_path + "/replicas");
    auto status_stream = std::make_shared<DDLQueryStatusInputStream>(node_path, entry, query_context, hosts_to_wait);
    if (settings.database_replicated_ddl_output)
        io.in = std::move(status_stream);
    return io;
}
/// Parses table metadata (text of a CREATE query) and returns the UUID of the table
/// if its engine name starts with "Replicated" and ends with "MergeTree". Returns Nil UUID otherwise.
static UUID getTableUUIDIfReplicated(const String & metadata, const Context & context)
{
    /// Cheap check to avoid parsing metadata that cannot belong to a replicated table
    if (metadata.find("ReplicatedMergeTree") == std::string::npos)
        return UUIDHelpers::Nil;

    const auto & settings = context.getSettingsRef();
    ParserCreateQuery parser;
    ASTPtr ast = parseQuery(parser, metadata, settings.max_query_size, settings.max_parser_depth);
    const auto & create = ast->as<const ASTCreateQuery &>();

    const bool has_engine = create.storage && create.storage->engine;
    if (!has_engine)
        return UUIDHelpers::Nil;

    /// The substring can be found anywhere in the metadata, so check the actual engine name
    const String & engine_name = create.storage->engine->name;
    const bool is_replicated_merge_tree = startsWith(engine_name, "Replicated") && endsWith(engine_name, "MergeTree");
    if (!is_replicated_merge_tree)
        return UUIDHelpers::Nil;

    assert(create.uuid != UUIDHelpers::Nil);
    return create.uuid;
}
/// Makes the local set of tables match the metadata snapshot from ZooKeeper and then sets log_ptr of the replica to max_log_ptr.
/// our_log_ptr equal to 0 means that it's a new replica (the local database must be empty in this case).
/// Steps:
///  1. Find local tables that do not exist in ZooKeeper or have different metadata. Replicated tables are matched by UUID
///     and are only renamed if the name differs.
///  2. Drop such dictionaries and tables that do not store data on disk, move other tables to the "<database>_broken_tables" database.
///  3. Rename replicated tables to actual names.
///  4. Create tables that exist in ZooKeeper, but not locally.
/// Throws DATABASE_REPLICATION_FAILED if the share of tables to recreate is greater than db_settings.max_broken_tables_ratio.
void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 our_log_ptr, UInt32 max_log_ptr)
{
/// Let's compare local (possibly outdated) metadata with (most actual) metadata stored in ZooKeeper
/// and try to update the set of local tables.
/// We could drop all local tables and create the new ones just like it's new replica.
/// But it will cause all ReplicatedMergeTree tables to fetch all data parts again and data in other tables will be lost.
bool new_replica = our_log_ptr == 0;
if (new_replica)
LOG_INFO(log, "Will create new replica from log pointer {}", max_log_ptr);
else
LOG_WARNING(log, "Will recover replica with staled log pointer {} from log pointer {}", our_log_ptr, max_log_ptr);
if (new_replica && !empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "It's new replica, but database is not empty");
/// max_log_ptr is passed by non-const reference here, so the callee may change it
auto table_name_to_metadata = tryGetConsistentMetadataSnapshot(current_zookeeper, max_log_ptr);
/// For ReplicatedMergeTree tables we can compare only UUIDs to ensure that it's the same table.
/// Metadata can be different, it's handled on table replication level.
/// We need to handle renamed tables only.
/// TODO maybe we should also update MergeTree SETTINGS if required?
std::unordered_map<UUID, String> zk_replicated_id_to_name;
for (const auto & zk_table : table_name_to_metadata)
{
UUID zk_replicated_id = getTableUUIDIfReplicated(zk_table.second, global_context);
if (zk_replicated_id != UUIDHelpers::Nil)
zk_replicated_id_to_name.emplace(zk_replicated_id, zk_table.first);
}
/// We will drop or move tables which exist only in local metadata
Strings tables_to_detach;
std::vector<std::pair<String, String>> replicated_tables_to_rename;
size_t total_tables = 0;
/// NOTE(review): replicated_ids is not used anywhere in this function
std::vector<UUID> replicated_ids;
for (auto existing_tables_it = getTablesIterator(global_context, {}); existing_tables_it->isValid(); existing_tables_it->next(), ++total_tables)
{
String name = existing_tables_it->name();
UUID local_replicated_id = UUIDHelpers::Nil;
if (existing_tables_it->table()->supportsReplication())
{
/// Check if replicated tables have the same UUID
local_replicated_id = existing_tables_it->table()->getStorageID().uuid;
auto it = zk_replicated_id_to_name.find(local_replicated_id);
if (it != zk_replicated_id_to_name.end())
{
if (name != it->second)
{
/// Need just update table name
replicated_tables_to_rename.emplace_back(name, it->second);
}
continue;
}
}
auto in_zk = table_name_to_metadata.find(name);
if (in_zk == table_name_to_metadata.end() || in_zk->second != readMetadataFile(name))
{
/// Local table does not exist in ZooKeeper or has different metadata
tables_to_detach.emplace_back(std::move(name));
}
}
String db_name = getDatabaseName();
String to_db_name = getDatabaseName() + BROKEN_TABLES_SUFFIX;
/// Refuse to recover automatically if too large part of local tables is broken
if (total_tables * db_settings.max_broken_tables_ratio < tables_to_detach.size())
throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Too many tables to recreate: {} of {}", tables_to_detach.size(), total_tables);
else if (!tables_to_detach.empty())
{
LOG_WARNING(log, "Will recreate {} broken tables to recover replica", tables_to_detach.size());
/// It's too dangerous to automatically drop tables, so we will move them to special database.
/// We use Ordinary engine for destination database, because it's the only way to discard table UUID
/// and make possible creation of new table with the same UUID.
String query = fmt::format("CREATE DATABASE IF NOT EXISTS {} ENGINE=Ordinary", backQuoteIfNeed(to_db_name));
Context query_context = global_context;
executeQuery(query, query_context, true);
}
size_t dropped_dicts = 0;
size_t moved_tables = 0;
std::vector<UUID> dropped_tables;
for (const auto & table_name : tables_to_detach)
{
DDLGuardPtr table_guard = DatabaseCatalog::instance().getDDLGuard(db_name, table_name);
/// The name of the database is checked again after the guard is acquired
if (getDatabaseName() != db_name)
throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database was renamed, will retry");
/// NOTE(review): the result of tryGetTable is dereferenced below without a check for nullptr - confirm that it cannot be null here
auto table = tryGetTable(table_name, global_context);
if (isDictionaryExist(table_name))
{
/// We can safely drop any dictionaries because they do not store data
LOG_DEBUG(log, "Will DROP DICTIONARY {}", backQuoteIfNeed(table_name));
DatabaseAtomic::removeDictionary(global_context, table_name);
++dropped_dicts;
}
else if (!table->storesDataOnDisk())
{
LOG_DEBUG(log, "Will DROP TABLE {}, because it does not store data on disk and can be safely dropped", backQuoteIfNeed(table_name));
/// UUID is saved to wait until the table is finally dropped (see waitTableFinallyDropped below)
dropped_tables.push_back(tryGetTableUUID(table_name));
table->shutdown();
DatabaseAtomic::dropTable(global_context, table_name, true);
}
else
{
/// Table probably stores some data. Let's move it to another database.
String to_name = fmt::format("{}_{}_{}", table_name, max_log_ptr, thread_local_rng() % 1000);
LOG_DEBUG(log, "Will RENAME TABLE {} TO {}.{}", backQuoteIfNeed(table_name), backQuoteIfNeed(to_db_name), backQuoteIfNeed(to_name));
/// NOTE(review): presumably this assert documents the order in which DDL guards are acquired - confirm
assert(db_name < to_db_name);
DDLGuardPtr to_table_guard = DatabaseCatalog::instance().getDDLGuard(to_db_name, to_name);
auto to_db_ptr = DatabaseCatalog::instance().getDatabase(to_db_name);
DatabaseAtomic::renameTable(global_context, table_name, *to_db_ptr, to_name, false, false);
++moved_tables;
}
}
if (!tables_to_detach.empty())
LOG_WARNING(log, "Cleaned {} outdated objects: dropped {} dictionaries and {} tables, moved {} tables",
tables_to_detach.size(), dropped_dicts, dropped_tables.size(), moved_tables);
/// Now database is cleared from outdated tables, let's rename ReplicatedMergeTree tables to actual names
for (const auto & old_to_new : replicated_tables_to_rename)
{
const String & from = old_to_new.first;
const String & to = old_to_new.second;
LOG_DEBUG(log, "Will RENAME TABLE {} TO {}", backQuoteIfNeed(from), backQuoteIfNeed(to));
/// TODO Maybe we should do it in two steps: rename all tables to temporary names and then rename them to actual names?
/// Guards are acquired in the order of table names (smaller name first)
DDLGuardPtr table_guard = DatabaseCatalog::instance().getDDLGuard(db_name, std::min(from, to));
DDLGuardPtr to_table_guard = DatabaseCatalog::instance().getDDLGuard(db_name, std::max(from, to));
DatabaseAtomic::renameTable(global_context, from, *this, to, false, false);
}
/// Wait for dropped tables before creating new ones
for (const auto & id : dropped_tables)
DatabaseCatalog::instance().waitTableFinallyDropped(id);
/// Create tables that exist in the snapshot from ZooKeeper, but do not exist locally
for (const auto & name_and_meta : table_name_to_metadata)
{
if (isTableExist(name_and_meta.first, global_context))
{
assert(name_and_meta.second == readMetadataFile(name_and_meta.first));
continue;
}
auto query_ast = parseQueryFromMetadataInZooKeeper(name_and_meta.first, name_and_meta.second);
/// Each CREATE query is executed in its own context, which is a copy of global context marked as secondary query
Context query_context = global_context;
query_context.makeQueryContext();
query_context.getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY;
query_context.setCurrentDatabase(database_name);
query_context.setCurrentQueryId(""); // generate random query_id
LOG_INFO(log, "Executing {}", serializeAST(*query_ast));
InterpreterCreateQuery(query_ast, query_context).execute();
}
/// Local metadata is up-to-date now, save the new log pointer of the replica
current_zookeeper->set(replica_path + "/log_ptr", toString(max_log_ptr));
}
std::map<String, String> DatabaseReplicated::tryGetConsistentMetadataSnapshot(const ZooKeeperPtr & zookeeper, UInt32 & max_log_ptr)
{
std::map<String, String> table_name_to_metadata;
constexpr int max_retries = 10;
int iteration = 0;
while (++iteration <= max_retries)
{
table_name_to_metadata.clear();
LOG_DEBUG(log, "Trying to get consistent metadata snapshot for log pointer {}", max_log_ptr);
Strings table_names = zookeeper->getChildren(zookeeper_path + "/metadata");
std::vector<zkutil::ZooKeeper::FutureGet> futures;
futures.reserve(table_names.size());
for (const auto & table : table_names)
futures.emplace_back(zookeeper->asyncTryGet(zookeeper_path + "/metadata/" + table));
for (size_t i = 0; i < table_names.size(); ++i)
{
auto res = futures[i].get();
if (res.error != Coordination::Error::ZOK)
break;
table_name_to_metadata.emplace(unescapeForFileName(table_names[i]), res.data);
}
UInt32 new_max_log_ptr = parse<UInt32>(zookeeper->get(zookeeper_path + "/max_log_ptr"));
if (new_max_log_ptr == max_log_ptr && table_names.size() == table_name_to_metadata.size())
break;
if (max_log_ptr < new_max_log_ptr)
{
LOG_DEBUG(log, "Log pointer moved from {} to {}, will retry", max_log_ptr, new_max_log_ptr);
max_log_ptr = new_max_log_ptr;
}
else
{
assert(max_log_ptr == new_max_log_ptr);
assert(table_names.size() != table_name_to_metadata.size());
LOG_DEBUG(log, "Cannot get metadata of some tables due to ZooKeeper error, will retry");
}
}
if (max_retries < iteration)
throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Cannot get consistent metadata snapshot");
LOG_DEBUG(log, "Got consistent metadata snapshot for log pointer {}", max_log_ptr);
return table_name_to_metadata;
}
/// Parses the CREATE query stored in ZooKeeper metadata node `node_name`.
/// The stored query must have a non-nil UUID, the table name placeholder and no database name;
/// otherwise LOGICAL_ERROR is thrown. The returned AST has the database name, the table name
/// (unescaped `node_name`) filled in and `attach` reset to false.
ASTPtr DatabaseReplicated::parseQueryFromMetadataInZooKeeper(const String & node_name, const String & query)
{
    String description = "in ZooKeeper " + zookeeper_path + "/metadata/" + node_name;

    ParserCreateQuery parser;
    auto ast = parseQuery(parser, query, description, 0, global_context.getSettingsRef().max_parser_depth);
    auto & create = ast->as<ASTCreateQuery &>();

    const bool has_nil_uuid = create.uuid == UUIDHelpers::Nil;
    const bool has_unexpected_table_name = create.table != TABLE_WITH_UUID_NAME_PLACEHOLDER;
    const bool has_database_name = !create.database.empty();
    if (has_nil_uuid || has_unexpected_table_name || has_database_name)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Got unexpected query from {}: {}", node_name, query);

    create.database = getDatabaseName();
    create.table = unescapeForFileName(node_name);
    create.attach = false;
    return ast;
}
void DatabaseReplicated::drop(const Context & context_)
{
auto current_zookeeper = getZooKeeper();
current_zookeeper->set(replica_path, DROPPED_MARK);
DatabaseAtomic::drop(context_);
current_zookeeper->tryRemoveRecursive(replica_path);
/// TODO it may leave garbage in ZooKeeper if the last node lost connection here
if (current_zookeeper->tryRemove(zookeeper_path + "/replicas") == Coordination::Error::ZOK)
{
/// It was the last replica, remove all metadata
current_zookeeper->tryRemoveRecursive(zookeeper_path);
}
}
/// Shuts down the DDL worker of this database, if it exists.
void DatabaseReplicated::stopReplication()
{
    if (!ddl_worker)
        return;

    ddl_worker->shutdown();
}
/// Stops replication, destroys the DDL worker and then shuts down the underlying Atomic database.
void DatabaseReplicated::shutdown()
{
    stopReplication();
    ddl_worker.reset();
    DatabaseAtomic::shutdown();
}
void DatabaseReplicated::dropTable(const Context & context, const String & table_name, bool no_delay)
{
auto txn = context.getZooKeeperMetadataTransaction();
assert(!ddl_worker->isCurrentlyActive() || txn);
if (txn && txn->isInitialQuery())
{
String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name);
txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1));
}
DatabaseAtomic::dropTable(context, table_name, no_delay);
}
void DatabaseReplicated::renameTable(const Context & context, const String & table_name, IDatabase & to_database,
const String & to_table_name, bool exchange, bool dictionary)
{
auto txn = context.getZooKeeperMetadataTransaction();
assert(txn);
if (txn->isInitialQuery())
{
if (this != &to_database)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Moving tables between databases is not supported for Replicated engine");
if (table_name == to_table_name)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Cannot rename table to itself");
if (!isTableExist(table_name, context))
throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", table_name);
if (exchange && !to_database.isTableExist(to_table_name, context))
throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", to_table_name);
String statement = readMetadataFile(table_name);
String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name);
String metadata_zk_path_to = zookeeper_path + "/metadata/" + escapeForFileName(to_table_name);
txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1));
if (exchange)
{
String statement_to = readMetadataFile(to_table_name);
txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path_to, -1));
txn->addOp(zkutil::makeCreateRequest(metadata_zk_path, statement_to, zkutil::CreateMode::Persistent));
}
txn->addOp(zkutil::makeCreateRequest(metadata_zk_path_to, statement, zkutil::CreateMode::Persistent));
}
DatabaseAtomic::renameTable(context, table_name, to_database, to_table_name, exchange, dictionary);
}
void DatabaseReplicated::commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table,
const String & table_metadata_tmp_path, const String & table_metadata_path,
const Context & query_context)
{
auto txn = query_context.getZooKeeperMetadataTransaction();
assert(!ddl_worker->isCurrentlyActive() || txn);
if (txn && txn->isInitialQuery())
{
String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(query.table);
String statement = getObjectDefinitionFromCreateQuery(query.clone());
/// zk::multi(...) will throw if `metadata_zk_path` exists
txn->addOp(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent));
}
DatabaseAtomic::commitCreateTable(query, table, table_metadata_tmp_path, table_metadata_path, query_context);
}
void DatabaseReplicated::commitAlterTable(const StorageID & table_id,
const String & table_metadata_tmp_path, const String & table_metadata_path,
const String & statement, const Context & query_context)
{
auto txn = query_context.getZooKeeperMetadataTransaction();
if (txn && txn->isInitialQuery())
{
String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_id.table_name);
txn->addOp(zkutil::makeSetRequest(metadata_zk_path, statement, -1));
}
DatabaseAtomic::commitAlterTable(table_id, table_metadata_tmp_path, table_metadata_path, statement, query_context);
}
void DatabaseReplicated::createDictionary(const Context & context,
const String & dictionary_name,
const ASTPtr & query)
{
auto txn = context.getZooKeeperMetadataTransaction();
assert(!ddl_worker->isCurrentlyActive() || txn);
if (txn && txn->isInitialQuery())
{
String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(dictionary_name);
String statement = getObjectDefinitionFromCreateQuery(query->clone());
txn->addOp(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent));
}
DatabaseAtomic::createDictionary(context, dictionary_name, query);
}
void DatabaseReplicated::removeDictionary(const Context & context, const String & dictionary_name)
{
auto txn = context.getZooKeeperMetadataTransaction();
assert(!ddl_worker->isCurrentlyActive() || txn);
if (txn && txn->isInitialQuery())
{
String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(dictionary_name);
txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1));
}
DatabaseAtomic::removeDictionary(context, dictionary_name);
}
void DatabaseReplicated::detachTablePermanently(const Context & context, const String & table_name)
{
auto txn = context.getZooKeeperMetadataTransaction();
assert(!ddl_worker->isCurrentlyActive() || txn);
if (txn && txn->isInitialQuery())
{
String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name);
txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1));
}
DatabaseAtomic::detachTablePermanently(context, table_name);
}
/// Returns the whole content of the local metadata file of object `table_name`.
String DatabaseReplicated::readMetadataFile(const String & table_name) const
{
    constexpr size_t read_buffer_size = 4096;

    ReadBufferFromFile metadata_file(getObjectMetadataPath(table_name), read_buffer_size);
    String content;
    readStringUntilEOF(content, metadata_file);
    return content;
}
}

View File

@ -0,0 +1,91 @@
#pragma once
#include <Databases/DatabaseAtomic.h>
#include <Databases/DatabaseReplicatedSettings.h>
#include <Common/ZooKeeper/ZooKeeper.h>
#include <Core/BackgroundSchedulePool.h>
#include <DataStreams/BlockIO.h>
#include <DataStreams/OneBlockInputStream.h>
#include <Interpreters/Context.h>
namespace DB
{
class DatabaseReplicatedDDLWorker;
using ZooKeeperPtr = std::shared_ptr<zkutil::ZooKeeper>;
class Cluster;
using ClusterPtr = std::shared_ptr<Cluster>;
/// Database engine "Replicated": an Atomic database whose object metadata is also kept in ZooKeeper
/// under `zookeeper_path` (see the overridden DDL methods below, which add ZooKeeper operations
/// for nodes in `<zookeeper_path>/metadata`).
class DatabaseReplicated : public DatabaseAtomic
{
public:
DatabaseReplicated(const String & name_, const String & metadata_path_, UUID uuid,
const String & zookeeper_path_, const String & shard_name_, const String & replica_name_,
DatabaseReplicatedSettings db_settings_,
const Context & context);
~DatabaseReplicated() override;
String getEngineName() const override { return "Replicated"; }
/// If the current query is initial, then the following methods add ZooKeeper operations
/// which update metadata to the current ZooKeeperMetadataTransaction.
void dropTable(const Context &, const String & table_name, bool no_delay) override;
void renameTable(const Context & context, const String & table_name, IDatabase & to_database,
const String & to_table_name, bool exchange, bool dictionary) override;
void commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table,
const String & table_metadata_tmp_path, const String & table_metadata_path,
const Context & query_context) override;
void commitAlterTable(const StorageID & table_id,
const String & table_metadata_tmp_path, const String & table_metadata_path,
const String & statement, const Context & query_context) override;
void createDictionary(const Context & context,
const String & dictionary_name,
const ASTPtr & query) override;
void removeDictionary(const Context & context, const String & dictionary_name) override;
void detachTablePermanently(const Context & context, const String & table_name) override;
/// Try to execute a DDL query on the current host as the initial query. If the query succeeds,
/// then it will be executed on all replicas.
BlockIO tryEnqueueReplicatedDDL(const ASTPtr & query, const Context & query_context);
/// Shuts down the DDL worker (if any).
void stopReplication();
String getFullReplicaName() const;
static std::pair<String, String> parseFullReplicaName(const String & name);
/// Returns cluster consisting of database replicas
ClusterPtr getCluster() const;
void drop(const Context & /*context*/) override;
void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) override;
void shutdown() override;
friend struct DatabaseReplicatedTask;
friend class DatabaseReplicatedDDLWorker;
private:
void tryConnectToZooKeeperAndInitDatabase(bool force_attach);
bool createDatabaseNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper);
void createReplicaNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper);
void recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 our_log_ptr, UInt32 max_log_ptr);
/// Reads all nodes from `<zookeeper_path>/metadata`, retrying until `max_log_ptr` does not change during the read.
/// `max_log_ptr` may be advanced to the newer value observed in ZooKeeper.
std::map<String, String> tryGetConsistentMetadataSnapshot(const ZooKeeperPtr & zookeeper, UInt32 & max_log_ptr);
/// Parses a CREATE query stored in a metadata node and fills in the database and table names.
ASTPtr parseQueryFromMetadataInZooKeeper(const String & node_name, const String & query);
/// Returns the content of the local metadata file of the object.
String readMetadataFile(const String & table_name) const;
/// Root path of the database in ZooKeeper.
String zookeeper_path;
String shard_name;
String replica_name;
/// Path of this replica's node in ZooKeeper (it has a `log_ptr` child node).
String replica_path;
DatabaseReplicatedSettings db_settings;
zkutil::ZooKeeperPtr getZooKeeper() const;
/// While it is true, the DDL worker calls tryConnectToZooKeeperAndInitDatabase() during its initialization.
std::atomic_bool is_readonly = true;
std::unique_ptr<DatabaseReplicatedDDLWorker> ddl_worker;
};
}

View File

@ -0,0 +1,23 @@
#include <Databases/DatabaseReplicatedSettings.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTCreateQuery.h>
namespace DB
{
IMPLEMENT_SETTINGS_TRAITS(DatabaseReplicatedSettingsTraits, LIST_OF_DATABASE_REPLICATED_SETTINGS)
/// Applies the SETTINGS clause of the storage definition, if it is present.
/// Otherwise attaches an empty (non-standalone) SETTINGS clause to the definition.
void DatabaseReplicatedSettings::loadFromQuery(ASTStorage & storage_def)
{
    if (!storage_def.settings)
    {
        auto empty_settings = std::make_shared<ASTSetQuery>();
        empty_settings->is_standalone = false;
        storage_def.set(storage_def.settings, empty_settings);
        return;
    }

    applyChanges(storage_def.settings->changes);
}
}

View File

@ -0,0 +1,26 @@
#pragma once
#include <Core/Defines.h>
#include <Core/BaseSettings.h>
namespace DB
{
class ASTStorage;
/// List of settings of DatabaseReplicatedSettings.
/// Each entry is M(type, name, default value, description, <last argument>).
/// NOTE(review): the meaning of the last argument (always 0 here) is defined by BaseSettings - confirm there.
#define LIST_OF_DATABASE_REPLICATED_SETTINGS(M) \
M(Float, max_broken_tables_ratio, 0.5, "Do not recover replica automatically if the ratio of staled tables to all tables is greater", 0) \
M(UInt64, max_replication_lag_to_enqueue, 10, "Replica will throw exception on attempt to execute query if its replication lag greater", 0) \
M(UInt64, wait_entry_commited_timeout_sec, 3600, "Replicas will try to cancel query if timeout exceed, but initiator host has not executed it yet", 0) \
DECLARE_SETTINGS_TRAITS(DatabaseReplicatedSettingsTraits, LIST_OF_DATABASE_REPLICATED_SETTINGS)
/** Settings for the Replicated database engine.
* Could be loaded from a CREATE DATABASE query (SETTINGS clause).
*/
struct DatabaseReplicatedSettings : public BaseSettings<DatabaseReplicatedSettingsTraits>
{
/// Applies changes from the SETTINGS clause of `storage_def`;
/// if the clause is absent, an empty one is attached to `storage_def`.
void loadFromQuery(ASTStorage & storage_def);
};
}

View File

@ -0,0 +1,260 @@
#include <Databases/DatabaseReplicatedWorker.h>
#include <Databases/DatabaseReplicated.h>
#include <Interpreters/DDLTask.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int DATABASE_REPLICATION_FAILED;
extern const int NOT_A_LEADER;
extern const int UNFINISHED;
}
/// Creates a DDL worker for database `db`.
/// The queue of the worker is `<zookeeper_path>/log` of the database,
/// and the name passed to the base class is "DDLWorker(<database name>)".
DatabaseReplicatedDDLWorker::DatabaseReplicatedDDLWorker(DatabaseReplicated * db, const Context & context_)
: DDLWorker(/* pool_size */ 1, db->zookeeper_path + "/log", context_, nullptr, {}, fmt::format("DDLWorker({})", db->getDatabaseName()))
, database(db)
{
/// Pool size must be 1 to avoid reordering of log entries.
/// TODO Make a dependency graph of DDL queries. It will allow to execute independent entries in parallel.
/// We also need similar graph to load tables on server startup in order of topsort.
}
/// Initializes replication before the main loop of the worker starts.
/// On any error the exception is logged and initialization is retried in 5 seconds,
/// until it succeeds or `stop_flag` is set.
void DatabaseReplicatedDDLWorker::initializeMainThread()
{
    while (true)
    {
        if (stop_flag)
            return;

        try
        {
            auto zookeeper = getAndSetZooKeeper();

            const bool database_needs_init = database->is_readonly;
            if (database_needs_init)
                database->tryConnectToZooKeeperAndInitDatabase(false);

            initializeReplication();
            initialized = true;
            return;
        }
        catch (...)
        {
            tryLogCurrentException(log, fmt::format("Error on initialization of {}", database->getDatabaseName()));
            sleepForSeconds(5);
        }
    }
}
/// Shuts down the base DDLWorker and then wakes up all threads waiting on `wait_current_task_change`
/// (the waiting predicate in tryEnqueueAndExecuteEntry() checks `stop_flag`).
void DatabaseReplicatedDDLWorker::shutdown()
{
DDLWorker::shutdown();
wait_current_task_change.notify_all();
}
void DatabaseReplicatedDDLWorker::initializeReplication()
{
/// Check if we need to recover replica.
/// Invariant: replica is lost if it's log_ptr value is less then max_log_ptr - logs_to_keep.
String log_ptr_str = current_zookeeper->get(database->replica_path + "/log_ptr");
UInt32 our_log_ptr = parse<UInt32>(log_ptr_str);
UInt32 max_log_ptr = parse<UInt32>(current_zookeeper->get(database->zookeeper_path + "/max_log_ptr"));
logs_to_keep = parse<UInt32>(current_zookeeper->get(database->zookeeper_path + "/logs_to_keep"));
if (our_log_ptr == 0 || our_log_ptr + logs_to_keep < max_log_ptr)
database->recoverLostReplica(current_zookeeper, our_log_ptr, max_log_ptr);
else
last_skipped_entry_name.emplace(log_ptr_str);
}
/// Writes `entry` into the log of the database in ZooKeeper (as a not yet committed query)
/// and returns the path of the created log node.
String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry)
{
    auto zookeeper = getAndSetZooKeeper();

    /// We cannot create sequential node and it's ephemeral child in a single transaction, so allocate sequential number another way
    const String counter_prefix = database->zookeeper_path + "/counter/cnt-";
    const String counter_path = zookeeper->create(counter_prefix, "", zkutil::CreateMode::EphemeralSequential);
    const String sequential_number = counter_path.substr(counter_prefix.size());

    String node_path = queue_dir + "/query-" + sequential_number;
    const String try_path = node_path + "/try";
    const String active_path = node_path + "/active";
    const String finished_path = node_path + "/finished";

    Coordination::Requests requests;
    /// Query is not committed yet, but we have to write it into log to avoid reordering
    requests.emplace_back(zkutil::makeCreateRequest(node_path, entry.toString(), zkutil::CreateMode::Persistent));
    /// '/try' will be replaced with '/committed' or will be removed due to expired session or other error
    requests.emplace_back(zkutil::makeCreateRequest(try_path, database->getFullReplicaName(), zkutil::CreateMode::Ephemeral));
    /// We don't need it anymore
    requests.emplace_back(zkutil::makeRemoveRequest(counter_path, -1));
    /// Create status dirs
    requests.emplace_back(zkutil::makeCreateRequest(active_path, "", zkutil::CreateMode::Persistent));
    requests.emplace_back(zkutil::makeCreateRequest(finished_path, "", zkutil::CreateMode::Persistent));

    zookeeper->multi(requests);
    return node_path;
}
/// Writes `entry` into the log of the database and executes it on the current host as the initial query.
/// Before execution it waits until the worker thread reaches the new entry
/// (at most `database_replicated_initial_query_timeout_sec` seconds from settings of `query_context`).
/// Returns the path of the log entry in ZooKeeper.
/// Throws:
///  - NOT_A_LEADER if the replication lag of this replica is greater than `max_replication_lag_to_enqueue`;
///  - UNFINISHED if the wait for the worker thread timed out;
///  - DATABASE_REPLICATION_FAILED if ZooKeeper session expired or the worker was stopped;
///  - LOGICAL_ERROR if the task was processed, but is not marked as executed.
String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entry, const Context & query_context)
{
    /// NOTE Possibly it would be better to execute initial query on the most up-to-date node,
    /// but it requires more complex logic around /try node.

    auto zookeeper = getAndSetZooKeeper();
    UInt32 our_log_ptr = parse<UInt32>(zookeeper->get(database->replica_path + "/log_ptr"));
    UInt32 max_log_ptr = parse<UInt32>(zookeeper->get(database->zookeeper_path + "/max_log_ptr"));
    assert(our_log_ptr <= max_log_ptr);
    if (database->db_settings.max_replication_lag_to_enqueue < max_log_ptr - our_log_ptr)
        throw Exception(ErrorCodes::NOT_A_LEADER, "Cannot enqueue query on this replica, "
                        "because it has replication lag of {} queries. Try other replica.", max_log_ptr - our_log_ptr);

    String entry_path = enqueueQuery(entry);
    /// The holder owns the ephemeral '/try' node created by enqueueQuery(...).
    auto try_node = zkutil::EphemeralNodeHolder::existing(entry_path + "/try", *zookeeper);
    String entry_name = entry_path.substr(entry_path.rfind('/') + 1);
    auto task = std::make_unique<DatabaseReplicatedTask>(entry_name, entry_path, database);
    task->entry = entry;
    task->parseQueryFromEntry(context);
    assert(!task->entry.query.empty());
    assert(!zookeeper->exists(task->getFinishedNodePath()));
    task->is_initial_query = true;

    LOG_DEBUG(log, "Waiting for worker thread to process all entries before {}", entry_name);
    UInt64 timeout = query_context.getSettingsRef().database_replicated_initial_query_timeout_sec;
    {
        std::unique_lock lock{mutex};
        /// `current_task` is advanced by the worker thread in initAndCheckTask(...).
        bool processed = wait_current_task_change.wait_for(lock, std::chrono::seconds(timeout), [&]()
        {
            assert(zookeeper->expired() || current_task <= entry_name);
            return zookeeper->expired() || current_task == entry_name || stop_flag;
        });

        if (!processed)
            throw Exception(ErrorCodes::UNFINISHED, "Timeout: Cannot enqueue query on this replica, "
                            "most likely because replica is busy with previous queue entries");
    }

    if (zookeeper->expired() || stop_flag)
        throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "ZooKeeper session expired or replication stopped, try again");

    processTask(*task, zookeeper);

    if (!task->was_executed)
    {
        /// The format string has three placeholders: entry name, status code and status message.
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Entry {} was executed, but was not committed: code {}: {}",
                        entry_name, task->execution_status.code, task->execution_status.message);
    }

    try_node->setAlreadyRemoved();

    return entry_path;
}
/// Prepares the log entry `entry_name` for execution by the worker thread.
/// Returns the task to execute, or an empty pointer if the entry must not be executed on this replica;
/// in the latter case `out_reason` contains the explanation.
/// Throws UNFINISHED if the worker is stopped while waiting for the initiator of the entry.
DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper)
{
{
/// Publish the entry we are working on and wake up threads waiting in tryEnqueueAndExecuteEntry(...).
std::lock_guard lock{mutex};
if (current_task < entry_name)
{
current_task = entry_name;
wait_current_task_change.notify_all();
}
}
/// NOTE(review): the log pointer is read via `current_zookeeper`, while the rest of the method uses the `zookeeper` argument.
UInt32 our_log_ptr = parse<UInt32>(current_zookeeper->get(database->replica_path + "/log_ptr"));
UInt32 entry_num = DatabaseReplicatedTask::getLogEntryNumber(entry_name);
if (entry_num <= our_log_ptr)
{
out_reason = fmt::format("Task {} already executed according to log pointer {}", entry_name, our_log_ptr);
return {};
}
String entry_path = queue_dir + "/" + entry_name;
auto task = std::make_unique<DatabaseReplicatedTask>(entry_name, entry_path, database);
String initiator_name;
zkutil::EventPtr wait_committed_or_failed = std::make_shared<Poco::Event>();
String try_node_path = entry_path + "/try";
/// If the '/try' node exists, read the name of the initiator from it and pass the event to be notified about changes of the node.
if (zookeeper->tryGet(try_node_path, initiator_name, nullptr, wait_committed_or_failed))
{
task->is_initial_query = initiator_name == task->host_id_str;
/// Query is not committed yet. We cannot just skip it and execute next one, because reordering may break replication.
LOG_TRACE(log, "Waiting for initiator {} to commit or rollback entry {}", initiator_name, entry_path);
/// Each iteration waits for at most one second, so the number of iterations is the timeout in seconds.
constexpr size_t wait_time_ms = 1000;
size_t max_iterations = database->db_settings.wait_entry_commited_timeout_sec;
size_t iteration = 0;
while (!wait_committed_or_failed->tryWait(wait_time_ms))
{
if (stop_flag)
{
/// We cannot return task to process and we cannot return nullptr too,
/// because nullptr means "task should not be executed".
/// We can only exit by exception.
throw Exception(ErrorCodes::UNFINISHED, "Replication was stopped");
}
if (max_iterations <= ++iteration)
{
/// What can we do if initiator hangs for some reason? Seems like we can remove /try node.
/// Initiator will fail to commit ZooKeeperMetadataTransaction (including ops for replicated table) if /try does not exist.
/// But it's questionable.
/// We use tryRemove(...) because multiple hosts (including initiator) may try to do it concurrently.
auto code = zookeeper->tryRemove(try_node_path);
if (code != Coordination::Error::ZOK && code != Coordination::Error::ZNONODE)
throw Coordination::Exception(code, try_node_path);
if (!zookeeper->exists(entry_path + "/committed"))
{
out_reason = fmt::format("Entry {} was forcefully cancelled due to timeout", entry_name);
return {};
}
}
}
}
/// There is no '/try' node at this point (or the wait is over): the entry must be executed only if it was committed.
if (!zookeeper->exists(entry_path + "/committed"))
{
out_reason = fmt::format("Entry {} hasn't been committed", entry_name);
return {};
}
/// The entry was initiated by this host, so it was already executed here by tryEnqueueAndExecuteEntry(...).
if (task->is_initial_query)
{
assert(!zookeeper->exists(entry_path + "/try"));
assert(zookeeper->exists(entry_path + "/committed") == (zookeeper->get(task->getFinishedNodePath()) == ExecutionStatus(0).serializeText()));
out_reason = fmt::format("Entry {} has been executed as initial query", entry_name);
return {};
}
String node_data;
if (!zookeeper->tryGet(entry_path, node_data))
{
LOG_ERROR(log, "Cannot get log entry {}", entry_path);
throw Exception(ErrorCodes::LOGICAL_ERROR, "should be unreachable");
}
task->entry.parse(node_data);
/// An entry with an empty query is skipped.
if (task->entry.query.empty())
{
out_reason = fmt::format("Entry {} is a dummy task", entry_name);
return {};
}
task->parseQueryFromEntry(context);
if (zookeeper->exists(task->getFinishedNodePath()))
{
out_reason = fmt::format("Task {} has been already processed", entry_name);
return {};
}
return task;
}
/// Returns true if the log entry is older than `max_log_ptr - logs_to_keep`, so it can be removed from the log.
/// The second argument (stat of the node) is not used.
bool DatabaseReplicatedDDLWorker::canRemoveQueueEntry(const String & entry_name, const Coordination::Stat &)
{
    UInt32 entry_number = DDLTaskBase::getLogEntryNumber(entry_name);
    UInt32 max_log_ptr = parse<UInt32>(getAndSetZooKeeper()->get(database->zookeeper_path + "/max_log_ptr"));

    /// The sum is calculated in 64 bits. In 32 bits it wraps around for large values of `logs_to_keep`
    /// (including its initial value, the maximum of UInt32), and entries would be reported as removable.
    return static_cast<UInt64>(entry_number) + logs_to_keep < max_log_ptr;
}
}

View File

@ -0,0 +1,46 @@
#pragma once
#include <Interpreters/DDLWorker.h>
namespace DB
{
class DatabaseReplicated;
/// It's similar to DDLWorker, but has the following differences:
/// 1. DDL queue in ZooKeeper is not shared between multiple clusters and databases,
/// each DatabaseReplicated has its own queue in ZooKeeper and DatabaseReplicatedDDLWorker object.
/// 2. Shards and replicas are identified by shard_name and replica_name arguments of database engine,
/// not by address:port pairs. Cluster (of multiple database replicas) is identified by its zookeeper_path.
/// 3. After creation of an entry in DDL queue initiator tries to execute the entry locally
/// and other hosts wait for query to finish on initiator host.
/// If query succeeds on initiator, then all hosts must execute it, so they will retry until query succeeds.
/// We assume that cluster is homogeneous, so if replicas are in consistent state and query succeeds on one host,
/// then all hosts can execute it (maybe after several retries).
/// 4. Each database replica stores its log pointer in ZooKeeper. Cleanup thread removes old entry
/// if its number < max_log_ptr - logs_to_keep.
class DatabaseReplicatedDDLWorker : public DDLWorker
{
public:
DatabaseReplicatedDDLWorker(DatabaseReplicated * db, const Context & context_);
/// Writes the entry into the log of the database in ZooKeeper and returns the path of the created node.
String enqueueQuery(DDLLogEntry & entry) override;
/// Writes the entry into the log and executes it on the current host as the initial query.
/// Returns the path of the entry in ZooKeeper.
String tryEnqueueAndExecuteEntry(DDLLogEntry & entry, const Context & query_context);
void shutdown() override;
private:
void initializeMainThread() override;
void initializeReplication();
DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) override;
bool canRemoveQueueEntry(const String & entry_name, const Coordination::Stat & stat) override;
DatabaseReplicated * const database;
/// Protects `current_task`.
mutable std::mutex mutex;
/// Notified when `current_task` is changed and on shutdown.
std::condition_variable wait_current_task_change;
/// Name of the greatest log entry passed to initAndCheckTask(...) so far.
String current_task;
/// Read from `<zookeeper_path>/logs_to_keep` in initializeReplication().
UInt32 logs_to_keep = std::numeric_limits<UInt32>::max();
};
}

View File

@ -4,6 +4,7 @@
#include <Interpreters/ExternalDictionariesLoader.h>
#include <Interpreters/ExternalLoaderTempConfigRepository.h>
#include <Interpreters/ExternalLoaderDatabaseConfigRepository.h>
#include <Interpreters/DDLTask.h>
#include <Dictionaries/getDictionaryConfigurationFromAST.h>
#include <Dictionaries/DictionaryStructure.h>
#include <Parsers/ASTCreateQuery.h>
@ -193,6 +194,10 @@ void DatabaseWithDictionaries::createDictionary(const Context & context, const S
detachDictionary(dictionary_name);
});
auto txn = context.getZooKeeperMetadataTransaction();
if (txn && !context.isInternalSubquery())
txn->commit(); /// Commit point (a sort of) for Replicated database
/// If it was ATTACH query and file with dictionary metadata already exist
/// (so, ATTACH is done after DETACH), then rename atomically replaces old file with new one.
Poco::File(dictionary_metadata_tmp_path).renameTo(dictionary_metadata_path);
@ -205,7 +210,7 @@ void DatabaseWithDictionaries::createDictionary(const Context & context, const S
succeeded = true;
}
void DatabaseWithDictionaries::removeDictionary(const Context &, const String & dictionary_name)
void DatabaseWithDictionaries::removeDictionary(const Context & context, const String & dictionary_name)
{
DictionaryAttachInfo attach_info;
detachDictionaryImpl(dictionary_name, attach_info);
@ -213,6 +218,11 @@ void DatabaseWithDictionaries::removeDictionary(const Context &, const String &
try
{
String dictionary_metadata_path = getObjectMetadataPath(dictionary_name);
auto txn = context.getZooKeeperMetadataTransaction();
if (txn && !context.isInternalSubquery())
txn->commit(); /// Commit point (a sort of) for Replicated database
Poco::File(dictionary_metadata_path).remove();
CurrentStatusInfo::unset(CurrentStatusInfo::DictionaryStatus,
StorageID(attach_info.create_query).getInternalDictionaryName());

View File

@ -249,7 +249,7 @@ public:
/// Forget about the table without deleting it's data, but rename metadata file to prevent reloading it
/// with next restart. The database may not support this method.
virtual void detachTablePermanently(const String & /*name*/)
virtual void detachTablePermanently(const Context & /*context*/, const String & /*name*/)
{
throw Exception("There is no DETACH TABLE PERMANENTLY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED);
}

View File

@ -395,7 +395,7 @@ void DatabaseConnectionMySQL::loadStoredObjects(Context &, bool, bool /*force_at
}
}
void DatabaseConnectionMySQL::detachTablePermanently(const String & table_name)
void DatabaseConnectionMySQL::detachTablePermanently(const Context &, const String & table_name)
{
std::lock_guard<std::mutex> lock{mutex};
@ -429,9 +429,9 @@ void DatabaseConnectionMySQL::detachTablePermanently(const String & table_name)
table_iter->second.second->is_dropped = true;
}
void DatabaseConnectionMySQL::dropTable(const Context &, const String & table_name, bool /*no_delay*/)
void DatabaseConnectionMySQL::dropTable(const Context & context, const String & table_name, bool /*no_delay*/)
{
detachTablePermanently(table_name);
detachTablePermanently(context, table_name);
}
DatabaseConnectionMySQL::~DatabaseConnectionMySQL()

View File

@ -72,9 +72,9 @@ public:
StoragePtr detachTable(const String & table_name) override;
void detachTablePermanently(const String & table_name) override;
void detachTablePermanently(const Context & context, const String & table_name) override;
void dropTable(const Context &, const String & table_name, bool no_delay) override;
void dropTable(const Context & context, const String & table_name, bool no_delay) override;
void attachTable(const String & table_name, const StoragePtr & storage, const String & relative_table_path) override;

View File

@ -16,6 +16,9 @@ SRCS(
DatabaseMemory.cpp
DatabaseOnDisk.cpp
DatabaseOrdinary.cpp
DatabaseReplicated.cpp
DatabaseReplicatedSettings.cpp
DatabaseReplicatedWorker.cpp
DatabaseWithDictionaries.cpp
DatabasesCommon.cpp
MySQL/ConnectionMySQLSettings.cpp

View File

@ -271,9 +271,13 @@ static T inline packFixedShuffle(
size_t idx,
const uint8_t * __restrict masks)
{
__m128i res{};
assert(num_srcs > 0);
for (size_t i = 0; i < num_srcs; ++i)
__m128i res = _mm_shuffle_epi8(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(srcs[0] + elem_sizes[0] * idx)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(masks)));
for (size_t i = 1; i < num_srcs; ++i)
{
res = _mm_xor_si128(res,
_mm_shuffle_epi8(

View File

@ -558,7 +558,7 @@ void NO_INLINE Aggregator::executeImplBatch(
/// Generic case.
PODArray<AggregateDataPtr> places(rows);
std::unique_ptr<AggregateDataPtr[]> places(new AggregateDataPtr[rows]);
/// For all rows.
for (size_t i = 0; i < rows; ++i)
@ -589,9 +589,9 @@ void NO_INLINE Aggregator::executeImplBatch(
for (AggregateFunctionInstruction * inst = aggregate_instructions; inst->that; ++inst)
{
if (inst->offsets)
inst->batch_that->addBatchArray(rows, places.data(), inst->state_offset, inst->batch_arguments, inst->offsets, aggregates_pool);
inst->batch_that->addBatchArray(rows, places.get(), inst->state_offset, inst->batch_arguments, inst->offsets, aggregates_pool);
else
inst->batch_that->addBatch(rows, places.data(), inst->state_offset, inst->batch_arguments, aggregates_pool);
inst->batch_that->addBatch(rows, places.get(), inst->state_offset, inst->batch_arguments, aggregates_pool);
}
}

View File

@ -342,6 +342,7 @@ struct ContextShared
ReplicatedFetchList replicated_fetch_list;
ConfigurationPtr users_config; /// Config with the users, profiles and quotas sections.
InterserverIOHandler interserver_io_handler; /// Handler for interserver communication.
mutable std::optional<BackgroundSchedulePool> buffer_flush_schedule_pool; /// A thread pool that can do background flush for Buffer tables.
mutable std::optional<BackgroundSchedulePool> schedule_pool; /// A thread pool that can run different jobs in background (used in replicated tables)
mutable std::optional<BackgroundSchedulePool> distributed_schedule_pool; /// A thread pool that can run different jobs in background (used for distributed sends)
@ -1552,6 +1553,7 @@ void Context::setDDLWorker(std::unique_ptr<DDLWorker> ddl_worker)
auto lock = getLock();
if (shared->ddl_worker)
throw Exception("DDL background thread has already been initialized", ErrorCodes::LOGICAL_ERROR);
ddl_worker->startup();
shared->ddl_worker = std::move(ddl_worker);
}
@ -2551,6 +2553,19 @@ StorageID Context::resolveStorageIDImpl(StorageID storage_id, StorageNamespace w
return StorageID::createEmpty();
}
/// Attaches a ZooKeeper metadata transaction to this context.
/// A transaction must not already be attached. Unless `attach_existing` is set,
/// this context is expected to be the query context itself.
/// `attach_existing` is only read inside assert(), hence [[maybe_unused]] for builds where asserts are compiled out.
void Context::initZooKeeperMetadataTransaction(ZooKeeperMetadataTransactionPtr txn, [[maybe_unused]] bool attach_existing)
{
/// Only one transaction per context.
assert(!metadata_transaction);
assert(attach_existing || query_context == this);
metadata_transaction = std::move(txn);
}
/// Returns the metadata transaction attached to this context, or nullptr if none was attached.
ZooKeeperMetadataTransactionPtr Context::getZooKeeperMetadataTransaction() const
{
/// If a transaction is attached, the context is expected to have a query context.
assert(!metadata_transaction || hasQueryContext());
return metadata_transaction;
}
PartUUIDsPtr Context::getPartUUIDs()
{
auto lock = getLock();

View File

@ -117,6 +117,8 @@ using VolumePtr = std::shared_ptr<IVolume>;
struct NamedSession;
struct BackgroundTaskSchedulingSettings;
class ZooKeeperMetadataTransaction;
using ZooKeeperMetadataTransactionPtr = std::shared_ptr<ZooKeeperMetadataTransaction>;
#if USE_EMBEDDED_COMPILER
class CompiledExpressionCache;
@ -279,6 +281,12 @@ private:
/// to be customized in HTTP and TCP servers by overloading the customizeContext(DB::Context&)
/// methods.
ZooKeeperMetadataTransactionPtr metadata_transaction; /// Distributed DDL context. I'm not sure if it's a suitable place for this,
/// but it's the easiest way to pass this through the whole stack from executeQuery(...)
/// to DatabaseOnDisk::commitCreateTable(...) or IStorage::alter(...) without changing
/// thousands of signatures.
/// And I hope it will be replaced with more common Transaction sometime.
/// Use copy constructor or createGlobal() instead
Context();
@ -534,6 +542,7 @@ public:
const Context & getQueryContext() const;
Context & getQueryContext();
bool hasQueryContext() const { return query_context != nullptr; }
bool isInternalSubquery() const { return hasQueryContext() && query_context != this; }
const Context & getSessionContext() const;
Context & getSessionContext();
@ -737,6 +746,11 @@ public:
IHostContextPtr & getHostContext();
const IHostContextPtr & getHostContext() const;
/// Initialize context of distributed DDL query with Replicated database.
void initZooKeeperMetadataTransaction(ZooKeeperMetadataTransactionPtr txn, bool attach_existing = false);
/// Returns context of current distributed DDL query or nullptr.
ZooKeeperMetadataTransactionPtr getZooKeeperMetadataTransaction() const;
struct MySQLWireContext
{
uint8_t sequence_id = 0;

View File

@ -0,0 +1,344 @@
#include <Interpreters/DDLTask.h>
#include <Common/DNSResolver.h>
#include <Common/isLocalAddress.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <IO/Operators.h>
#include <IO/ReadBufferFromString.h>
#include <Poco/Net/NetException.h>
#include <common/logger_useful.h>
#include <Parsers/ParserQuery.h>
#include <Parsers/parseQuery.h>
#include <Parsers/ASTQueryWithOnCluster.h>
#include <Parsers/ASTQueryWithTableAndOutput.h>
#include <Databases/DatabaseReplicated.h>
namespace DB
{
namespace ErrorCodes
{
extern const int UNKNOWN_FORMAT_VERSION;
extern const int UNKNOWN_TYPE_OF_QUERY;
extern const int INCONSISTENT_CLUSTER_DEFINITION;
}
/// Builds a HostID from its serialized string form.
/// Splitting the string into host name and port is delegated to Cluster::Address::fromString.
HostID HostID::fromString(const String & host_port_str)
{
    auto [parsed_host, parsed_port] = Cluster::Address::fromString(host_port_str);

    HostID parsed;
    parsed.host_name = std::move(parsed_host);
    parsed.port = parsed_port;
    return parsed;
}
/// Resolves this host and checks whether the resulting address is local for the given server port.
/// Network resolution errors are treated as "not local" instead of being propagated.
bool HostID::isLocalAddress(UInt16 clickhouse_port) const
{
    bool is_local = false;
    try
    {
        const auto resolved_address = DNSResolver::instance().resolveAddress(host_name, port);
        is_local = DB::isLocalAddress(resolved_address, clickhouse_port);
    }
    catch (const Poco::Net::NetException &)
    {
        /// Avoid "Host not found" exceptions: an unresolvable host is simply not local.
    }
    return is_local;
}
/// Serializes the entry as four text lines: version, query (escaped), hosts and initiator.
/// DDLLogEntry::parse reads the same layout back.
String DDLLogEntry::toString() const
{
    Strings serialized_hosts;
    serialized_hosts.reserve(hosts.size());
    for (const HostID & host : hosts)
        serialized_hosts.emplace_back(HostID::applyToString(host));

    /// Copy the constant into a local before streaming it.
    auto format_version = CURRENT_VERSION;

    WriteBufferFromOwnString out;
    out << "version: " << format_version << "\n";
    out << "query: " << escape << query << "\n";
    out << "hosts: " << serialized_hosts << "\n";
    out << "initiator: " << initiator << "\n";
    return out.str();
}
void DDLLogEntry::parse(const String & data)
{
ReadBufferFromString rb(data);
int version;
rb >> "version: " >> version >> "\n";
if (version != CURRENT_VERSION)
throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown DDLLogEntry format version: {}", version);
Strings host_id_strings;
rb >> "query: " >> escape >> query >> "\n";
rb >> "hosts: " >> host_id_strings >> "\n";
if (!rb.eof())
rb >> "initiator: " >> initiator >> "\n";
else
initiator.clear();
assertEOF(rb);
hosts.resize(host_id_strings.size());
std::transform(host_id_strings.begin(), host_id_strings.end(), hosts.begin(), HostID::fromString);
}
void DDLTaskBase::parseQueryFromEntry(const Context & context)
{
const char * begin = entry.query.data();
const char * end = begin + entry.query.size();
ParserQuery parser_query(end);
String description;
query = parseQuery(parser_query, begin, end, description, 0, context.getSettingsRef().max_parser_depth);
}
/// Creates a fresh query context for executing the task: a copy of `from_context`,
/// turned into a query context, with an empty query id passed in and the query marked as secondary.
/// The ZooKeeper session is not used by the base implementation.
std::unique_ptr<Context> DDLTaskBase::makeQueryContext(Context & from_context, const ZooKeeperPtr & /*zookeeper*/)
{
    auto task_context = std::make_unique<Context>(from_context);
    Context & ctx = *task_context;

    ctx.makeQueryContext();
    ctx.setCurrentQueryId(""); // generate random query_id
    ctx.getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY;

    return task_context;
}
/// Looks for this server among the hosts listed in the entry.
/// The first matching host is remembered in `host_id` / `host_id_str`; further matches only produce a warning.
/// Returns true if at least one host of the entry refers to this server.
bool DDLTask::findCurrentHostID(const Context & global_context, Poco::Logger * log)
{
    bool found = false;

    for (const HostID & candidate : entry.hosts)
    {
        auto secure_port = global_context.getTCPPortSecure();

        /// The port is considered local if it matches TCP or TCP secure port that the server is listening.
        const bool matches_secure_port = secure_port && candidate.isLocalAddress(*secure_port);
        if (!matches_secure_port && !candidate.isLocalAddress(global_context.getTCPPort()))
            continue;

        if (!found)
        {
            found = true;
            host_id = candidate;
            host_id_str = candidate.toString();
            continue;
        }

        /// This check could be slow a little bit
        LOG_WARNING(log, "There are two the same ClickHouse instances in task {}: {} and {}. Will use the first one only.",
            entry_name, host_id.readableString(), candidate.readableString());
    }

    return found;
}
/// Resolves the cluster named in the ON CLUSTER clause of the parsed query, locates this host in it
/// and replaces `query` with its version without ON CLUSTER.
/// Throws UNKNOWN_TYPE_OF_QUERY if the query is not an ON CLUSTER query, and
/// INCONSISTENT_CLUSTER_DEFINITION if the cluster is unknown or this host cannot be found in it.
void DDLTask::setClusterInfo(const Context & context, Poco::Logger * log)
{
auto * query_on_cluster = dynamic_cast<ASTQueryWithOnCluster *>(query.get());
if (!query_on_cluster)
throw Exception("Received unknown DDL query", ErrorCodes::UNKNOWN_TYPE_OF_QUERY);
cluster_name = query_on_cluster->cluster;
cluster = context.tryGetCluster(cluster_name);
if (!cluster)
throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION,
"DDL task {} contains current host {} in cluster {}, but there are no such cluster here.",
entry_name, host_id.readableString(), cluster_name);
/// Try to find host from task host list in cluster
/// At the first, try find exact match (host name and ports should be literally equal)
/// If the attempt fails, try find it resolving host name of each instance
if (!tryFindHostInCluster())
{
LOG_WARNING(log, "Not found the exact match of host {} from task {} in cluster {} definition. Will try to find it using host name resolving.",
host_id.readableString(), entry_name, cluster_name);
if (!tryFindHostInClusterViaResolving(context))
throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, "Not found host {} in definition of cluster {}",
host_id.readableString(), cluster_name);
LOG_INFO(log, "Resolved host {} from task {} as host {} in definition of cluster {}",
host_id.readableString(), entry_name, address_in_cluster.readableString(), cluster_name);
}
/// The rewritten AST uses the default database of the address found above.
query = query_on_cluster->getRewrittenASTWithoutOnCluster(address_in_cluster.default_database);
/// `query` has just been reassigned, so the raw pointer obtained from the previous AST is reset.
query_on_cluster = nullptr;
}
/// Searches the cluster definition for an address whose host name and port are literally equal to `host_id`.
/// On a match, remembers its shard/replica indexes and the address itself.
/// Several matches are allowed only if their default databases differ (circular replication);
/// in that case the query must have a database-qualified table name.
/// Returns true if a match was found.
/// Throws INCONSISTENT_CLUSTER_DEFINITION for duplicate addresses with the same default database
/// or for an unqualified table name on a circular replicated cluster.
bool DDLTask::tryFindHostInCluster()
{
const auto & shards = cluster->getShardsAddresses();
bool found_exact_match = false;
/// Default database of the most recently recorded match.
String default_database;
for (size_t shard_num = 0; shard_num < shards.size(); ++shard_num)
{
for (size_t replica_num = 0; replica_num < shards[shard_num].size(); ++replica_num)
{
const Cluster::Address & address = shards[shard_num][replica_num];
if (address.host_name == host_id.host_name && address.port == host_id.port)
{
if (found_exact_match)
{
if (default_database == address.default_database)
{
throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION,
"There are two exactly the same ClickHouse instances {} in cluster {}",
address.readableString(), cluster_name);
}
else
{
/* Circular replication is used.
* It is when every physical node contains
* replicas of different shards of the same table.
* To distinguish one replica from another on the same node,
* every shard is placed into separate database.
* */
is_circular_replicated = true;
auto * query_with_table = dynamic_cast<ASTQueryWithTableAndOutput *>(query.get());
if (!query_with_table || query_with_table->database.empty())
{
throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION,
"For a distributed DDL on circular replicated cluster its table name must be qualified by database name.");
}
/// The previously recorded match already corresponds to the database of the query: keep it and stop.
if (default_database == query_with_table->database)
return true;
}
}
/// First match, or a later match that replaces the previous one in the circular replication case.
found_exact_match = true;
host_shard_num = shard_num;
host_replica_num = replica_num;
address_in_cluster = address;
default_database = address.default_database;
}
}
}
return found_exact_match;
}
bool DDLTask::tryFindHostInClusterViaResolving(const Context & context)
{
const auto & shards = cluster->getShardsAddresses();
bool found_via_resolving = false;
for (size_t shard_num = 0; shard_num < shards.size(); ++shard_num)
{
for (size_t replica_num = 0; replica_num < shards[shard_num].size(); ++replica_num)
{
const Cluster::Address & address = shards[shard_num][replica_num];
if (auto resolved = address.getResolvedAddress();
resolved && (isLocalAddress(*resolved, context.getTCPPort())
|| (context.getTCPPortSecure() && isLocalAddress(*resolved, *context.getTCPPortSecure()))))
{
if (found_via_resolving)
{
throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION,
"There are two the same ClickHouse instances in cluster {} : {} and {}",
cluster_name, address_in_cluster.readableString(), address.readableString());
}
else
{
found_via_resolving = true;
host_shard_num = shard_num;
host_replica_num = replica_num;
address_in_cluster = address;
}
}
}
}
return found_via_resolving;
}
/// Generate unique name for shard node, it will be used to execute the query by only single host.
/// Shard node name has format 'replica_name1,replica_name2,...,replica_nameN' (names are sorted),
/// where replica_name is the readableString() of the replica address.
/// Throws std::out_of_range (from at()) if `host_shard_num` is not a valid shard index.
String DDLTask::getShardID() const
{
    /// Bind by reference: the addresses are only read here, there is no need to copy the whole vector.
    const auto & shard_addresses = cluster->getShardsAddresses().at(host_shard_num);

    Strings replica_names;
    replica_names.reserve(shard_addresses.size());
    for (const Cluster::Address & address : shard_addresses)
        replica_names.emplace_back(address.readableString());
    std::sort(replica_names.begin(), replica_names.end());

    /// Join with commas, appending in place instead of building a temporary string per element.
    String res;
    for (size_t i = 0; i < replica_names.size(); ++i)
    {
        if (i != 0)
            res += ',';
        res += replica_names[i];
    }
    return res;
}
/// Creates a task bound to the given Replicated database.
/// The host id of the task is the full replica name reported by the database.
/// `database_` is dereferenced here, so it must not be null.
DatabaseReplicatedTask::DatabaseReplicatedTask(const String & name, const String & path, DatabaseReplicated * database_)
: DDLTaskBase(name, path)
, database(database_)
{
host_id_str = database->getFullReplicaName();
}
/// For a Replicated database the shard id is the shard name stored in the database object.
String DatabaseReplicatedTask::getShardID() const
{
return database->shard_name;
}
/// Creates the query context for executing the task in a Replicated database.
/// On top of the base context it sets the current database and attaches a ZooKeeperMetadataTransaction
/// pre-filled with the operations that record the execution of this log entry.
std::unique_ptr<Context> DatabaseReplicatedTask::makeQueryContext(Context & from_context, const ZooKeeperPtr & zookeeper)
{
auto query_context = DDLTaskBase::makeQueryContext(from_context, zookeeper);
query_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY;
query_context->setCurrentDatabase(database->getDatabaseName());
auto txn = std::make_shared<ZooKeeperMetadataTransaction>(zookeeper, database->zookeeper_path, is_initial_query);
query_context->initZooKeeperMetadataTransaction(txn);
/// Operations added only for the initial query:
/// remove the "try" node of the entry, create its "committed" node (holding this host id)
/// and set max_log_ptr of the database to the number of this entry.
if (is_initial_query)
{
txn->addOp(zkutil::makeRemoveRequest(entry_path + "/try", -1));
txn->addOp(zkutil::makeCreateRequest(entry_path + "/committed", host_id_str, zkutil::CreateMode::Persistent));
txn->addOp(zkutil::makeSetRequest(database->zookeeper_path + "/max_log_ptr", toString(getLogEntryNumber(entry_name)), -1));
}
/// Always set log_ptr of this replica to the number of this entry.
txn->addOp(zkutil::makeSetRequest(database->replica_path + "/log_ptr", toString(getLogEntryNumber(entry_name)), -1));
/// Hand over the operations accumulated in the task; the task's own list is emptied afterwards.
for (auto & op : ops)
txn->addOp(std::move(op));
ops.clear();
return query_context;
}
/// Builds the log entry name for the given number: "query-" followed by the number
/// left-padded with zeros to 10 digits.
String DDLTaskBase::getLogEntryName(UInt32 log_entry_number)
{
    constexpr size_t seq_node_digits = 10;

    const String digits = toString(log_entry_number);

    String entry_name_result = "query-";
    entry_name_result.append(seq_node_digits - digits.size(), '0');
    entry_name_result += digits;
    return entry_name_result;
}
/// Extracts the numeric part of a log entry name, i.e. everything after the "query-" prefix.
/// The prefix itself is only checked by an assert.
UInt32 DDLTaskBase::getLogEntryNumber(const String & log_entry_name)
{
    constexpr const char * prefix = "query-";
    assert(startsWith(log_entry_name, prefix));

    const String digits = log_entry_name.substr(strlen(prefix));
    return parse<UInt32>(digits);
}
/// Executes all accumulated operations as one ZooKeeper multi-request.
/// Must be called on a transaction that is still in the CREATED state.
void ZooKeeperMetadataTransaction::commit()
{
assert(state == CREATED);
/// Mark as FAILED before the request: if multi() throws, the state stays FAILED.
state = FAILED;
current_zookeeper->multi(ops);
state = COMMITTED;
}
}

195
src/Interpreters/DDLTask.h Normal file
View File

@ -0,0 +1,195 @@
#pragma once
#include <Core/Types.h>
#include <Interpreters/Cluster.h>
#include <Common/ZooKeeper/Types.h>
namespace Poco
{
class Logger;
}
namespace zkutil
{
class ZooKeeper;
}
namespace DB
{
class ASTQueryWithOnCluster;
using ZooKeeperPtr = std::shared_ptr<zkutil::ZooKeeper>;
class DatabaseReplicated;
class ZooKeeperMetadataTransaction;
using ZooKeeperMetadataTransactionPtr = std::shared_ptr<ZooKeeperMetadataTransaction>;
/// Identifies a host of a distributed DDL task by host name and port.
struct HostID
{
    String host_name;
    /// Zero-initialized so that a default-constructed HostID has a determinate value:
    /// DDLTask keeps a default-constructed HostID and formats it in log and exception messages.
    UInt16 port = 0;

    HostID() = default;

    explicit HostID(const Cluster::Address & address)
        : host_name(address.host_name), port(address.port) {}

    /// Parses the string form produced by toString().
    static HostID fromString(const String & host_port_str);

    /// Serialized form, as produced by Cluster::Address::toString.
    String toString() const
    {
        return Cluster::Address::toString(host_name, port);
    }

    /// Plain "host_name:port" form.
    String readableString() const
    {
        return host_name + ":" + DB::toString(port);
    }

    /// Returns true if this host resolves to a local address for the given server port.
    bool isLocalAddress(UInt16 clickhouse_port) const;

    /// Free-function form of toString(), convenient as a callable for algorithms.
    static String applyToString(const HostID & host_id)
    {
        return host_id.toString();
    }
};
/// An entry of the distributed DDL log: the query text, the hosts it is addressed to and, optionally, its initiator.
struct DDLLogEntry
{
String query;
std::vector<HostID> hosts;
String initiator; // optional
/// Version number written by toString() and required by parse().
static constexpr int CURRENT_VERSION = 1;
/// Serializes the entry to its text form.
String toString() const;
/// Fills the entry from its text form; throws on an unknown format version.
void parse(const String & data);
};
/// Common state of a DDL task created from a log entry.
/// Derived classes define how the shard id is obtained and may customize the query context.
struct DDLTaskBase
{
/// Name and path of the entry, both supplied to the constructor.
const String entry_name;
const String entry_path;
DDLLogEntry entry;
/// Identifier of the current host; used as the last component of the "active" and "finished" node paths.
String host_id_str;
/// AST of the query; set by parseQueryFromEntry().
ASTPtr query;
bool is_initial_query = false;
bool is_circular_replicated = false;
bool execute_on_leader = false;
/// Pending ZooKeeper requests; DatabaseReplicatedTask::makeQueryContext moves them into the metadata transaction.
Coordination::Requests ops;
ExecutionStatus execution_status;
bool was_executed = false;
std::atomic_bool completely_processed = false;
DDLTaskBase(const String & name, const String & path) : entry_name(name), entry_path(path) {}
DDLTaskBase(const DDLTaskBase &) = delete;
virtual ~DDLTaskBase() = default;
/// Parses entry.query into `query`.
void parseQueryFromEntry(const Context & context);
virtual String getShardID() const = 0;
/// Creates the context in which the query of the task is executed.
virtual std::unique_ptr<Context> makeQueryContext(Context & from_context, const ZooKeeperPtr & zookeeper);
/// Paths of the per-host and per-shard nodes under the entry path.
inline String getActiveNodePath() const { return entry_path + "/active/" + host_id_str; }
inline String getFinishedNodePath() const { return entry_path + "/finished/" + host_id_str; }
inline String getShardNodePath() const { return entry_path + "/shards/" + getShardID(); }
/// Conversion between the number of a log entry and its "query-NNNNNNNNNN" name.
static String getLogEntryName(UInt32 log_entry_number);
static UInt32 getLogEntryNumber(const String & log_entry_name);
};
/// Task of an ON CLUSTER query: the current host is located in the host list of the entry
/// and then in the definition of the cluster named in the query.
struct DDLTask : public DDLTaskBase
{
    DDLTask(const String & name, const String & path) : DDLTaskBase(name, path) {}

    /// Finds this server among entry.hosts; returns false if it is not there.
    bool findCurrentHostID(const Context & global_context, Poco::Logger * log);

    /// Resolves the cluster of the query, finds this host in it and strips ON CLUSTER from the query.
    void setClusterInfo(const Context & context, Poco::Logger * log);

    /// Sorted, comma-separated list of the replicas of the shard this host belongs to.
    String getShardID() const override;

private:
    bool tryFindHostInCluster();
    bool tryFindHostInClusterViaResolving(const Context & context);

    HostID host_id;
    String cluster_name;
    ClusterPtr cluster;
    Cluster::Address address_in_cluster;
    /// Indexes of this host inside the cluster definition, filled by the tryFindHostInCluster* methods.
    /// Initialized so that they never hold indeterminate values (getShardID() passes host_shard_num to at()).
    size_t host_shard_num = 0;
    size_t host_replica_num = 0;
};
/// Task of a query executed in a Replicated database.
struct DatabaseReplicatedTask : public DDLTaskBase
{
DatabaseReplicatedTask(const String & name, const String & path, DatabaseReplicated * database_);
/// Returns the shard name of the database.
String getShardID() const override;
/// Extends the base context with the current database and a ZooKeeperMetadataTransaction.
std::unique_ptr<Context> makeQueryContext(Context & from_context, const ZooKeeperPtr & zookeeper) override;
/// Raw pointer to the database this task belongs to.
DatabaseReplicated * database;
};
/// The main purpose of ZooKeeperMetadataTransaction is to execute all ZooKeeper operations related to a query
/// in a single transaction, once all required checks have been performed and we are ready to "commit" the changes.
/// For example: create an ALTER_METADATA entry in the ReplicatedMergeTree log,
/// create the path/to/entry/finished/host_id node in the distributed DDL queue to mark the query as executed, and
/// update the metadata in path/to/replicated_database/metadata/table_name.
/// It's used for DatabaseReplicated.
/// TODO we can also use it for ordinary ON CLUSTER queries
class ZooKeeperMetadataTransaction
{
    enum State
    {
        CREATED,
        COMMITTED,
        FAILED
    };

    State state = CREATED;
    ZooKeeperPtr current_zookeeper;
    String zookeeper_path;
    bool is_initial_query;
    /// Operations accumulated so far; executed by commit() or handed over by moveOpsTo().
    Coordination::Requests ops;

public:
    ZooKeeperMetadataTransaction(const ZooKeeperPtr & current_zookeeper_, const String & zookeeper_path_, bool is_initial_query_)
        : current_zookeeper(current_zookeeper_)
        , zookeeper_path(zookeeper_path_)
        , is_initial_query(is_initial_query_)
    {
    }

    bool isInitialQuery() const { return is_initial_query; }

    /// True once the transaction has left the CREATED state (committed, failed or handed over).
    bool isExecuted() const { return state != CREATED; }

    String getDatabaseZooKeeperPath() const { return zookeeper_path; }

    /// Appends an operation to the transaction. Must not be called after the transaction was executed.
    void addOp(Coordination::RequestPtr && op)
    {
        assert(!isExecuted());
        /// `op` is a named rvalue reference, i.e. an lvalue here: move it explicitly to avoid copying the pointer.
        ops.emplace_back(std::move(op));
    }

    /// Moves all accumulated operations to the end of `other_ops` and marks the transaction as committed:
    /// from now on the caller is responsible for executing them.
    void moveOpsTo(Coordination::Requests & other_ops)
    {
        assert(!isExecuted());
        std::move(ops.begin(), ops.end(), std::back_inserter(other_ops));
        ops.clear();
        state = COMMITTED;
    }

    /// Executes all accumulated operations in a single ZooKeeper multi-request.
    void commit();

    /// A transaction must be executed (or handed over) before destruction, unless the stack is being unwound.
    /// std::uncaught_exceptions() replaces std::uncaught_exception(), which is deprecated in C++17 and removed in C++20.
    ~ZooKeeperMetadataTransaction() { assert(isExecuted() || std::uncaught_exceptions()); }
};
}

File diff suppressed because it is too large Load Diff

View File

@ -1,15 +1,11 @@
#pragma once
#include <DataStreams/BlockIO.h>
#include <Interpreters/Cluster.h>
#include <Interpreters/Context.h>
#include <Storages/IStorage_fwd.h>
#include <Poco/Net/NetException.h>
#include <Common/CurrentThread.h>
#include <Common/DNSResolver.h>
#include <Common/ThreadPool.h>
#include <Common/isLocalAddress.h>
#include <common/logger_useful.h>
#include <Storages/IStorage_fwd.h>
#include <Parsers/IAST_fwd.h>
#include <Interpreters/Context.h>
#include <atomic>
#include <chrono>
@ -19,90 +15,39 @@
namespace zkutil
{
class ZooKeeper;
class ZooKeeper;
}
namespace Poco
{
class Logger;
namespace Util { class AbstractConfiguration; }
}
namespace Coordination
{
struct Stat;
}
namespace DB
{
class Context;
class ASTAlterQuery;
struct DDLLogEntry;
struct DDLTaskBase;
using DDLTaskPtr = std::unique_ptr<DDLTaskBase>;
using ZooKeeperPtr = std::shared_ptr<zkutil::ZooKeeper>;
class AccessRightsElements;
struct HostID
{
String host_name;
UInt16 port;
HostID() = default;
explicit HostID(const Cluster::Address & address) : host_name(address.host_name), port(address.port) { }
static HostID fromString(const String & host_port_str)
{
HostID res;
std::tie(res.host_name, res.port) = Cluster::Address::fromString(host_port_str);
return res;
}
String toString() const { return Cluster::Address::toString(host_name, port); }
String readableString() const { return host_name + ":" + DB::toString(port); }
bool isLocalAddress(UInt16 clickhouse_port) const
{
try
{
return DB::isLocalAddress(DNSResolver::instance().resolveAddress(host_name, port), clickhouse_port);
}
catch (const Poco::Net::NetException &)
{
/// Avoid "Host not found" exceptions
return false;
}
}
static String applyToString(const HostID & host_id) { return host_id.toString(); }
};
struct DDLLogEntry
{
String query;
std::vector<HostID> hosts;
String initiator; // optional
static constexpr int CURRENT_VERSION = 1;
public:
String toString();
void parse(const String & data);
};
struct DDLTask;
using DDLTaskPtr = std::unique_ptr<DDLTask>;
/// Pushes distributed DDL query to the queue
BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context);
BlockIO executeDDLQueryOnCluster(
const ASTPtr & query_ptr,
const Context & context,
const AccessRightsElements & query_requires_access,
bool query_requires_grant_option = false);
BlockIO executeDDLQueryOnCluster(
const ASTPtr & query_ptr,
const Context & context,
AccessRightsElements && query_requires_access,
bool query_requires_grant_option = false);
class DDLWorker
{
public:
DDLWorker(int pool_size_, const std::string & zk_root_dir, Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix);
~DDLWorker();
DDLWorker(int pool_size_, const std::string & zk_root_dir, const Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix,
const String & logger_name = "DDLWorker", const CurrentMetrics::Metric * max_entry_metric_ = nullptr);
virtual ~DDLWorker();
/// Pushes query into DDL queue, returns path to created node
String enqueueQuery(DDLLogEntry & entry);
virtual String enqueueQuery(DDLLogEntry & entry);
/// Host ID (name:port) for logging purposes
/// Note that in each task hosts are identified individually by name:port from initiator server cluster config
@ -111,30 +56,32 @@ public:
return host_fqdn_id;
}
private:
using ZooKeeperPtr = std::shared_ptr<zkutil::ZooKeeper>;
void startup();
virtual void shutdown();
bool isCurrentlyActive() const { return initialized && !stop_flag; }
protected:
/// Returns cached ZooKeeper session (possibly expired).
ZooKeeperPtr tryGetZooKeeper() const;
/// If necessary, creates a new session and caches it.
ZooKeeperPtr getAndSetZooKeeper();
/// ZooKeeper recover loop (while not stopped).
void recoverZooKeeper();
void checkCurrentTasks();
/// Iterates through queue tasks in ZooKeeper, runs execution of new tasks
void scheduleTasks();
void saveTask(const String & entry_name);
DDLTaskBase & saveTask(DDLTaskPtr && task);
/// Reads entry and check that the host belongs to host list of the task
/// Returns non-empty DDLTaskPtr if entry parsed and the check is passed
DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper);
virtual DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper);
void updateMaxDDLEntryID(const DDLTask & task);
void enqueueTask(DDLTaskPtr task);
void processTask(DDLTask & task);
void processTask(DDLTaskBase & task, const ZooKeeperPtr & zookeeper);
void updateMaxDDLEntryID(const String & entry_name);
/// Check that query should be executed on leader replica only
static bool taskShouldBeExecutedOnLeader(const ASTPtr ast_ddl, StoragePtr storage);
static bool taskShouldBeExecutedOnLeader(const ASTPtr & ast_ddl, StoragePtr storage);
/// Executes query only on leader replica in case of replicated table.
/// Queries like TRUNCATE/ALTER .../OPTIMIZE have to be executed only on one node of shard.
@ -142,27 +89,27 @@ private:
/// query via RemoteBlockOutputStream to leader, so to avoid such "2-phase" query execution we
/// execute query directly on leader.
bool tryExecuteQueryOnLeaderReplica(
DDLTask & task,
DDLTaskBase & task,
StoragePtr storage,
const String & rewritten_query,
const String & node_path,
const ZooKeeperPtr & zookeeper);
void parseQueryAndResolveHost(DDLTask & task);
bool tryExecuteQuery(const String & query, const DDLTask & task, ExecutionStatus & status);
bool tryExecuteQuery(const String & query, DDLTaskBase & task, const ZooKeeperPtr & zookeeper);
/// Checks and cleanups queue's nodes
void cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zookeeper);
virtual bool canRemoveQueueEntry(const String & entry_name, const Coordination::Stat & stat);
/// Init task node
static void createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper);
void createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper);
virtual void initializeMainThread();
void runMainThread();
void runCleanupThread();
private:
protected:
Context context;
Poco::Logger * log;
@ -174,10 +121,12 @@ private:
ZooKeeperPtr current_zookeeper;
/// Save state of executed task to avoid duplicate execution on ZK error
std::vector<std::string> last_tasks;
std::optional<String> last_skipped_entry_name;
std::list<DDLTaskPtr> current_tasks;
std::shared_ptr<Poco::Event> queue_updated_event = std::make_shared<Poco::Event>();
std::shared_ptr<Poco::Event> cleanup_event = std::make_shared<Poco::Event>();
std::atomic<bool> initialized = false;
std::atomic<bool> stop_flag = false;
ThreadFromGlobalPool main_thread;
@ -195,9 +144,7 @@ private:
size_t max_tasks_in_queue = 1000;
std::atomic<UInt64> max_id = 0;
friend class DDLQueryStatusInputStream;
friend struct DDLTask;
const CurrentMetrics::Metric * max_entry_metric;
};

View File

@ -609,7 +609,7 @@ DatabaseCatalog::updateDependency(const StorageID & old_from, const StorageID &
view_dependencies[{new_from.getDatabaseName(), new_from.getTableName()}].insert(new_where);
}
std::unique_ptr<DDLGuard> DatabaseCatalog::getDDLGuard(const String & database, const String & table)
DDLGuardPtr DatabaseCatalog::getDDLGuard(const String & database, const String & table)
{
std::unique_lock lock(ddl_guards_mutex);
auto db_guard_iter = ddl_guards.try_emplace(database).first;
@ -956,36 +956,38 @@ DDLGuard::DDLGuard(Map & map_, std::shared_mutex & db_mutex_, std::unique_lock<s
++it->second.counter;
guards_lock.unlock();
table_lock = std::unique_lock(*it->second.mutex);
bool is_database = elem.empty();
if (!is_database)
is_database_guard = elem.empty();
if (!is_database_guard)
{
bool locked_database_for_read = db_mutex.try_lock_shared();
if (!locked_database_for_read)
{
removeTableLock();
releaseTableLock();
throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database {} is currently dropped or renamed", database_name);
}
}
}
void DDLGuard::removeTableLock()
void DDLGuard::releaseTableLock() noexcept
{
if (table_lock_removed)
return;
table_lock_removed = true;
guards_lock.lock();
--it->second.counter;
if (!it->second.counter)
{
table_lock.unlock();
UInt32 counter = --it->second.counter;
table_lock.unlock();
if (counter == 0)
map.erase(it);
}
guards_lock.unlock();
}
DDLGuard::~DDLGuard()
{
bool is_database = it->first.empty();
if (!is_database)
if (!is_database_guard)
db_mutex.unlock_shared();
removeTableLock();
releaseTableLock();
}
}

View File

@ -54,16 +54,21 @@ public:
DDLGuard(Map & map_, std::shared_mutex & db_mutex_, std::unique_lock<std::mutex> guards_lock_, const String & elem, const String & database_name);
~DDLGuard();
/// Unlocks table name, keeps holding read lock for database name
void releaseTableLock() noexcept;
private:
Map & map;
std::shared_mutex & db_mutex;
Map::iterator it;
std::unique_lock<std::mutex> guards_lock;
std::unique_lock<std::mutex> table_lock;
void removeTableLock();
bool table_lock_removed = false;
bool is_database_guard = false;
};
using DDLGuardPtr = std::unique_ptr<DDLGuard>;
/// Creates temporary table in `_temporary_and_external_tables` with randomly generated unique StorageID.
/// Such table can be accessed from everywhere by its ID.
@ -117,7 +122,7 @@ public:
void loadDatabases();
/// Get an object that protects the table from concurrently executing multiple DDL operations.
std::unique_ptr<DDLGuard> getDDLGuard(const String & database, const String & table);
DDLGuardPtr getDDLGuard(const String & database, const String & table);
/// Get an object that protects the database from concurrent DDL queries all tables in the database
std::unique_lock<std::shared_mutex> getExclusiveDDLGuardForDatabase(const String & database);

View File

@ -1,5 +1,5 @@
#include <Interpreters/InterpreterAlterQuery.h>
#include <Interpreters/DDLWorker.h>
#include <Interpreters/executeDDLQueryOnCluster.h>
#include <Interpreters/MutationsInterpreter.h>
#include <Interpreters/AddDefaultDatabaseVisitor.h>
#include <Interpreters/Context.h>
@ -16,6 +16,9 @@
#include <Common/typeid_cast.h>
#include <boost/range/algorithm_ext/push_back.hpp>
#include <algorithm>
#include <Databases/IDatabase.h>
#include <Databases/DatabaseReplicated.h>
#include <Databases/DatabaseFactory.h>
namespace DB
@ -25,6 +28,7 @@ namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int INCORRECT_QUERY;
extern const int NOT_IMPLEMENTED;
}
@ -38,11 +42,21 @@ BlockIO InterpreterAlterQuery::execute()
BlockIO res;
const auto & alter = query_ptr->as<ASTAlterQuery &>();
if (!alter.cluster.empty())
return executeDDLQueryOnCluster(query_ptr, context, getRequiredAccess());
context.checkAccess(getRequiredAccess());
auto table_id = context.resolveStorageID(alter, Context::ResolveOrdinary);
DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name);
if (typeid_cast<DatabaseReplicated *>(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY)
{
auto guard = DatabaseCatalog::instance().getDDLGuard(table_id.database_name, table_id.table_name);
guard->releaseTableLock();
return typeid_cast<DatabaseReplicated *>(database.get())->tryEnqueueReplicatedDDL(query_ptr, context);
}
StoragePtr table = DatabaseCatalog::instance().getTable(table_id, context);
auto alter_lock = table->lockForAlter(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout);
auto metadata_snapshot = table->getInMemoryMetadataPtr();
@ -80,6 +94,14 @@ BlockIO InterpreterAlterQuery::execute()
throw Exception("Wrong parameter type in ALTER query", ErrorCodes::LOGICAL_ERROR);
}
if (typeid_cast<DatabaseReplicated *>(database.get()))
{
int command_types_count = !mutation_commands.empty() + !partition_commands.empty() + !live_view_commands.empty() + !alter_commands.empty();
if (1 < command_types_count)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "For Replicated databases it's not allowed "
"to execute ALTERs of different types in single query");
}
if (!mutation_commands.empty())
{
MutationsInterpreter(table, metadata_snapshot, mutation_commands, context, false).validate();

View File

@ -30,7 +30,8 @@
#include <Storages/StorageInMemoryMetadata.h>
#include <Interpreters/Context.h>
#include <Interpreters/DDLWorker.h>
#include <Interpreters/executeDDLQueryOnCluster.h>
#include <Interpreters/Cluster.h>
#include <Interpreters/ExpressionAnalyzer.h>
#include <Interpreters/InterpreterCreateQuery.h>
#include <Interpreters/InterpreterSelectWithUnionQuery.h>
@ -46,6 +47,7 @@
#include <DataTypes/DataTypeNullable.h>
#include <Databases/DatabaseFactory.h>
#include <Databases/DatabaseReplicated.h>
#include <Databases/IDatabase.h>
#include <Databases/DatabaseOnDisk.h>
@ -79,6 +81,7 @@ namespace ErrorCodes
extern const int ILLEGAL_SYNTAX_FOR_DATA_TYPE;
extern const int ILLEGAL_COLUMN;
extern const int LOGICAL_ERROR;
extern const int UNKNOWN_DATABASE;
extern const int PATH_ACCESS_DENIED;
extern const int NOT_IMPLEMENTED;
extern const int UNKNOWN_TABLE;
@ -146,7 +149,7 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create)
throw Exception(ErrorCodes::UNKNOWN_DATABASE_ENGINE, "Unknown database engine: {}", serializeAST(*create.storage));
}
if (create.storage->engine->name == "Atomic")
if (create.storage->engine->name == "Atomic" || create.storage->engine->name == "Replicated")
{
if (create.attach && create.uuid == UUIDHelpers::Nil)
throw Exception(ErrorCodes::INCORRECT_QUERY, "UUID must be specified for ATTACH. "
@ -205,6 +208,12 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create)
"Enable allow_experimental_database_materialize_mysql to use it.", ErrorCodes::UNKNOWN_DATABASE_ENGINE);
}
if (create.storage->engine->name == "Replicated" && !context.getSettingsRef().allow_experimental_database_replicated && !internal)
{
throw Exception("Replicated is an experimental database engine. "
"Enable allow_experimental_database_replicated to use it.", ErrorCodes::UNKNOWN_DATABASE_ENGINE);
}
DatabasePtr database = DatabaseFactory::get(create, metadata_path / "", context);
if (create.uuid != UUIDHelpers::Nil)
@ -556,6 +565,11 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::setProperties(AS
validateTableStructure(create, properties);
/// Set the table engine if it was not specified explicitly.
setEngine(create);
assert(as_database_saved.empty() && as_table_saved.empty());
std::swap(create.as_database, as_database_saved);
std::swap(create.as_table, as_table_saved);
return properties;
}
@ -702,6 +716,12 @@ void InterpreterCreateQuery::assertOrSetUUID(ASTCreateQuery & create, const Data
const auto * kind = create.is_dictionary ? "Dictionary" : "Table";
const auto * kind_upper = create.is_dictionary ? "DICTIONARY" : "TABLE";
if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY && !internal)
{
if (create.uuid == UUIDHelpers::Nil)
throw Exception("Table UUID is not specified in DDL log", ErrorCodes::LOGICAL_ERROR);
}
bool from_path = create.attach_from_path.has_value();
if (database->getUUID() != UUIDHelpers::Nil)
@ -776,11 +796,11 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create)
ErrorCodes::BAD_DATABASE_FOR_TEMPORARY_TABLE);
String current_database = context.getCurrentDatabase();
auto database_name = create.database.empty() ? current_database : create.database;
// If this is a stub ATTACH query, read the query definition from the database
if (create.attach && !create.storage && !create.columns_list)
{
auto database_name = create.database.empty() ? current_database : create.database;
auto database = DatabaseCatalog::instance().getDatabase(database_name);
bool if_not_exists = create.if_not_exists;
@ -800,19 +820,30 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create)
if (create.attach_from_path)
{
fs::path data_path = fs::path(*create.attach_from_path).lexically_normal();
fs::path user_files = fs::path(context.getUserFilesPath()).lexically_normal();
if (data_path.is_relative())
data_path = (user_files / data_path).lexically_normal();
if (!startsWith(data_path, user_files))
throw Exception(ErrorCodes::PATH_ACCESS_DENIED,
"Data directory {} must be inside {} to attach it", String(data_path), String(user_files));
fs::path root_path = fs::path(context.getPath()).lexically_normal();
/// Data path must be relative to root_path
create.attach_from_path = fs::relative(data_path, root_path) / "";
if (context.getClientInfo().query_kind == ClientInfo::QueryKind::INITIAL_QUERY)
{
fs::path data_path = fs::path(*create.attach_from_path).lexically_normal();
if (data_path.is_relative())
data_path = (user_files / data_path).lexically_normal();
if (!startsWith(data_path, user_files))
throw Exception(ErrorCodes::PATH_ACCESS_DENIED,
"Data directory {} must be inside {} to attach it", String(data_path), String(user_files));
/// Data path must be relative to root_path
create.attach_from_path = fs::relative(data_path, root_path) / "";
}
else
{
fs::path data_path = (root_path / *create.attach_from_path).lexically_normal();
if (!startsWith(data_path, user_files))
throw Exception(ErrorCodes::PATH_ACCESS_DENIED,
"Data directory {} must be inside {} to attach it", String(data_path), String(user_files));
}
}
else if (create.attach && !create.attach_short_syntax)
else if (create.attach && !create.attach_short_syntax && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY)
{
auto * log = &Poco::Logger::get("InterpreterCreateQuery");
LOG_WARNING(log, "ATTACH TABLE query with full table definition is not recommended: "
@ -836,11 +867,29 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create)
/// Set and retrieve list of columns, indices and constraints. Set table engine if needed. Rewrite query in canonical way.
TableProperties properties = setProperties(create);
DatabasePtr database;
bool need_add_to_database = !create.temporary;
if (need_add_to_database)
database = DatabaseCatalog::instance().getDatabase(database_name);
if (need_add_to_database && database->getEngineName() == "Replicated")
{
auto guard = DatabaseCatalog::instance().getDDLGuard(create.database, create.table);
database = DatabaseCatalog::instance().getDatabase(create.database);
if (typeid_cast<DatabaseReplicated *>(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY)
{
assertOrSetUUID(create, database);
guard->releaseTableLock();
return typeid_cast<DatabaseReplicated *>(database.get())->tryEnqueueReplicatedDDL(query_ptr, context);
}
}
if (create.replace_table)
return doCreateOrReplaceTable(create, properties);
/// Actually creates table
bool created = doCreateTable(create, properties);
if (!created) /// Table already exists
return {};
@ -880,7 +929,8 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create,
drop_ast->table = create.table;
drop_ast->no_ddl_lock = true;
InterpreterDropQuery interpreter(drop_ast, context);
Context drop_context = context;
InterpreterDropQuery interpreter(drop_ast, drop_context);
interpreter.execute();
}
else
@ -1037,6 +1087,14 @@ BlockIO InterpreterCreateQuery::createDictionary(ASTCreateQuery & create)
auto guard = DatabaseCatalog::instance().getDDLGuard(database_name, dictionary_name);
DatabasePtr database = DatabaseCatalog::instance().getDatabase(database_name);
if (typeid_cast<DatabaseReplicated *>(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY)
{
if (!create.attach)
assertOrSetUUID(create, database);
guard->releaseTableLock();
return typeid_cast<DatabaseReplicated *>(database.get())->tryEnqueueReplicatedDDL(query_ptr, context);
}
if (database->isDictionaryExist(dictionary_name))
{
/// TODO Check structure of dictionary
@ -1189,15 +1247,14 @@ AccessRightsElements InterpreterCreateQuery::getRequiredAccess() const
return required_access;
}
void InterpreterCreateQuery::extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr & ast, const Context &) const
void InterpreterCreateQuery::extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr &, const Context &) const
{
const auto & create = ast->as<const ASTCreateQuery &>();
elem.query_kind = "Create";
if (!create.as_table.empty())
if (!as_table_saved.empty())
{
String database = backQuoteIfNeed(create.as_database.empty() ? context.getCurrentDatabase() : create.as_database);
String database = backQuoteIfNeed(as_database_saved.empty() ? context.getCurrentDatabase() : as_database_saved);
elem.query_databases.insert(database);
elem.query_tables.insert(database + "." + backQuoteIfNeed(create.as_table));
elem.query_tables.insert(database + "." + backQuoteIfNeed(as_table_saved));
}
}

View File

@ -95,5 +95,8 @@ private:
/// Is this an internal query - not from the user.
bool internal = false;
bool force_attach = false;
mutable String as_database_saved;
mutable String as_table_saved;
};
}

View File

@ -2,7 +2,7 @@
#include <Parsers/ASTCreateQuotaQuery.h>
#include <Parsers/ASTRolesOrUsersSet.h>
#include <Interpreters/Context.h>
#include <Interpreters/DDLWorker.h>
#include <Interpreters/executeDDLQueryOnCluster.h>
#include <Access/AccessControlManager.h>
#include <Access/AccessFlags.h>
#include <ext/range.h>

View File

@ -1,7 +1,7 @@
#include <Interpreters/InterpreterCreateRoleQuery.h>
#include <Parsers/ASTCreateRoleQuery.h>
#include <Interpreters/Context.h>
#include <Interpreters/DDLWorker.h>
#include <Interpreters/executeDDLQueryOnCluster.h>
#include <Access/AccessControlManager.h>
#include <Access/Role.h>

View File

@ -4,7 +4,7 @@
#include <Parsers/ASTRolesOrUsersSet.h>
#include <Parsers/formatAST.h>
#include <Interpreters/Context.h>
#include <Interpreters/DDLWorker.h>
#include <Interpreters/executeDDLQueryOnCluster.h>
#include <Access/AccessControlManager.h>
#include <Access/AccessFlags.h>
#include <boost/range/algorithm/sort.hpp>

View File

@ -2,7 +2,7 @@
#include <Parsers/ASTCreateSettingsProfileQuery.h>
#include <Parsers/ASTRolesOrUsersSet.h>
#include <Interpreters/Context.h>
#include <Interpreters/DDLWorker.h>
#include <Interpreters/executeDDLQueryOnCluster.h>
#include <Access/AccessControlManager.h>
#include <Access/SettingsProfile.h>
#include <Access/AccessFlags.h>

View File

@ -1,7 +1,7 @@
#include <Interpreters/InterpreterCreateUserQuery.h>
#include <Interpreters/Context.h>
#include <Interpreters/InterpreterSetRoleQuery.h>
#include <Interpreters/DDLWorker.h>
#include <Interpreters/executeDDLQueryOnCluster.h>
#include <Parsers/ASTCreateUserQuery.h>
#include <Parsers/ASTUserNameWithHost.h>
#include <Parsers/ASTRolesOrUsersSet.h>

View File

@ -2,7 +2,7 @@
#include <Parsers/ASTDropAccessEntityQuery.h>
#include <Parsers/ASTRowPolicyName.h>
#include <Interpreters/Context.h>
#include <Interpreters/DDLWorker.h>
#include <Interpreters/executeDDLQueryOnCluster.h>
#include <Access/AccessControlManager.h>
#include <Access/AccessFlags.h>
#include <Access/User.h>

View File

@ -2,7 +2,7 @@
#include <Databases/IDatabase.h>
#include <Interpreters/Context.h>
#include <Interpreters/DDLWorker.h>
#include <Interpreters/executeDDLQueryOnCluster.h>
#include <Interpreters/InterpreterDropQuery.h>
#include <Interpreters/ExternalDictionariesLoader.h>
#include <Interpreters/QueryLog.h>
@ -12,6 +12,7 @@
#include <Common/escapeForFileName.h>
#include <Common/quoteString.h>
#include <Common/typeid_cast.h>
#include <Databases/DatabaseReplicated.h>
#if !defined(ARCADIA_BUILD)
# include "config_core.h"
@ -35,6 +36,7 @@ namespace ErrorCodes
extern const int UNKNOWN_TABLE;
extern const int UNKNOWN_DICTIONARY;
extern const int NOT_IMPLEMENTED;
extern const int INCORRECT_QUERY;
}
@ -121,32 +123,55 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat
if (database && table)
{
if (query_ptr->as<ASTDropQuery &>().is_view && !table->isView())
if (query.as<ASTDropQuery &>().is_view && !table->isView())
throw Exception("Table " + table_id.getNameForLogs() + " is not a View", ErrorCodes::LOGICAL_ERROR);
/// Now get UUID, so we can wait for table data to be finally dropped
table_id.uuid = database->tryGetTableUUID(table_id.table_name);
/// Prevents recursive drop from drop database query. The original query must specify a table.
bool is_drop_or_detach_database = query_ptr->as<ASTDropQuery>()->table.empty();
bool is_replicated_ddl_query = typeid_cast<DatabaseReplicated *>(database.get()) &&
context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY &&
!is_drop_or_detach_database;
if (is_replicated_ddl_query)
{
if (query.kind == ASTDropQuery::Kind::Detach && !query.permanently)
throw Exception(ErrorCodes::INCORRECT_QUERY, "DETACH TABLE is not allowed for Replicated databases. "
"Use DETACH TABLE PERMANENTLY or SYSTEM RESTART REPLICA");
if (query.kind == ASTDropQuery::Kind::Detach)
context.checkAccess(table->isView() ? AccessType::DROP_VIEW : AccessType::DROP_TABLE, table_id);
else if (query.kind == ASTDropQuery::Kind::Truncate)
context.checkAccess(AccessType::TRUNCATE, table_id);
else if (query.kind == ASTDropQuery::Kind::Drop)
context.checkAccess(table->isView() ? AccessType::DROP_VIEW : AccessType::DROP_TABLE, table_id);
ddl_guard->releaseTableLock();
table.reset();
return typeid_cast<DatabaseReplicated *>(database.get())->tryEnqueueReplicatedDDL(query.clone(), context);
}
if (query.kind == ASTDropQuery::Kind::Detach)
{
context.checkAccess(table->isView() ? AccessType::DROP_VIEW : AccessType::DROP_TABLE, table_id);
table->checkTableCanBeDetached();
table->shutdown();
TableExclusiveLockHolder table_lock;
if (database->getUUID() == UUIDHelpers::Nil)
table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout);
if (query.permanently)
{
/// Drop table from memory, don't touch data; the metadata file is renamed and will be skipped during server restart
database->detachTablePermanently(table_id.table_name);
database->detachTablePermanently(context, table_id.table_name);
}
else
{
/// Drop table from memory, don't touch data and metadata
database->detachTable(table_id.table_name);
}
}
else if (query.kind == ASTDropQuery::Kind::Truncate)
{
@ -202,6 +227,21 @@ BlockIO InterpreterDropQuery::executeToDictionary(
DatabasePtr database = tryGetDatabase(database_name, if_exists);
bool is_drop_or_detach_database = query_ptr->as<ASTDropQuery>()->table.empty();
bool is_replicated_ddl_query = typeid_cast<DatabaseReplicated *>(database.get()) &&
context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY &&
!is_drop_or_detach_database;
if (is_replicated_ddl_query)
{
if (kind == ASTDropQuery::Kind::Detach)
throw Exception(ErrorCodes::INCORRECT_QUERY, "DETACH DICTIONARY is not allowed for Replicated databases.");
context.checkAccess(AccessType::DROP_DICTIONARY, database_name, dictionary_name);
ddl_guard->releaseTableLock();
return typeid_cast<DatabaseReplicated *>(database.get())->tryEnqueueReplicatedDDL(query_ptr, context);
}
if (!database || !database->isDictionaryExist(dictionary_name))
{
if (!if_exists)
@ -315,6 +355,8 @@ BlockIO InterpreterDropQuery::executeToDatabaseImpl(const ASTDropQuery & query,
if (database->getEngineName() == "MaterializeMySQL")
stopDatabaseSynchronization(database);
#endif
if (auto * replicated = typeid_cast<DatabaseReplicated *>(database.get()))
replicated->stopReplication();
if (database->shouldBeEmptyOnDetach())
{

View File

@ -2,7 +2,7 @@
#include <Parsers/ASTGrantQuery.h>
#include <Parsers/ASTRolesOrUsersSet.h>
#include <Interpreters/Context.h>
#include <Interpreters/DDLWorker.h>
#include <Interpreters/executeDDLQueryOnCluster.h>
#include <Access/AccessControlManager.h>
#include <Access/ContextAccess.h>
#include <Access/RolesOrUsersSet.h>

View File

@ -2,7 +2,7 @@
#include <Parsers/ASTKillQueryQuery.h>
#include <Parsers/queryToString.h>
#include <Interpreters/Context.h>
#include <Interpreters/DDLWorker.h>
#include <Interpreters/executeDDLQueryOnCluster.h>
#include <Interpreters/ProcessList.h>
#include <Interpreters/executeQuery.h>
#include <Interpreters/CancellationCode.h>

Some files were not shown because too many files have changed in this diff Show More