Merge branch 'master' into use-iobject-storage-for-table-engines-1

This commit is contained in:
Kseniia Sumarokova 2024-03-12 15:18:19 +01:00 committed by GitHub
commit f7ac982a6b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
163 changed files with 434 additions and 9232 deletions

View File

@ -61,8 +61,8 @@ if (ENABLE_CHECK_HEAVY_BUILDS)
# set CPU time limit to 1000 seconds
set (RLIMIT_CPU 1000)
# -fsanitize=memory is too heavy
if (SANITIZE STREQUAL "memory")
# -fsanitize=memory and address are too heavy
if (SANITIZE)
set (RLIMIT_DATA 10000000000) # 10G
endif()

View File

@ -28,7 +28,6 @@ lrwxrwxrwx 1 root root 10 clickhouse-benchmark -> clickhouse
lrwxrwxrwx 1 root root 10 clickhouse-clang -> clickhouse
lrwxrwxrwx 1 root root 10 clickhouse-client -> clickhouse
lrwxrwxrwx 1 root root 10 clickhouse-compressor -> clickhouse
lrwxrwxrwx 1 root root 10 clickhouse-copier -> clickhouse
lrwxrwxrwx 1 root root 10 clickhouse-extract-from-config -> clickhouse
lrwxrwxrwx 1 root root 10 clickhouse-format -> clickhouse
lrwxrwxrwx 1 root root 10 clickhouse-lld -> clickhouse

View File

@ -33,6 +33,9 @@ ENV TSAN_OPTIONS='halt_on_error=1 abort_on_error=1 history_size=7 memory_limit_m
ENV UBSAN_OPTIONS='print_stacktrace=1'
ENV MSAN_OPTIONS='abort_on_error=1 poison_in_dtor=1'
# for external_symbolizer_path
RUN ln -s /usr/bin/llvm-symbolizer-${LLVM_VERSION} /usr/bin/llvm-symbolizer
RUN echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && locale-gen en_US.UTF-8
ENV LC_ALL en_US.UTF-8

View File

@ -14,7 +14,6 @@ RUN apt-get update \
libclang-${LLVM_VERSION}-dev \
libclang-rt-${LLVM_VERSION}-dev \
lld-${LLVM_VERSION} \
llvm-${LLVM_VERSION} \
llvm-${LLVM_VERSION}-dev \
lsof \
ninja-build \
@ -37,8 +36,6 @@ RUN pip3 install numpy==1.26.3 scipy==1.12.0 pandas==1.5.3 Jinja2==3.1.3
# This symlink is required by gcc to find the lld linker
RUN ln -s /usr/bin/lld-${LLVM_VERSION} /usr/bin/ld.lld
# for external_symbolizer_path
RUN ln -s /usr/bin/llvm-symbolizer-${LLVM_VERSION} /usr/bin/llvm-symbolizer
# FIXME: workaround for "The imported target "merge-fdata" references the file" error
# https://salsa.debian.org/pkg-llvm-team/llvm-toolchain/-/commit/992e52c0b156a5ba9c6a8a54f8c4857ddd3d371d
RUN sed -i '/_IMPORT_CHECK_FILES_FOR_\(mlir-\|llvm-bolt\|merge-fdata\|MLIR\)/ {s|^|#|}' /usr/lib/llvm-${LLVM_VERSION}/lib/cmake/llvm/LLVMExports-*.cmake

View File

@ -343,7 +343,7 @@ quit
# which is confusing.
task_exit_code=$fuzzer_exit_code
echo "failure" > status.txt
echo "Achtung!" > description.txt
echo "Let op!" > description.txt
echo "Fuzzer went wrong with error code: ($fuzzer_exit_code). Its process died somehow when the server stayed alive. The server log probably won't tell you much so try to find information in other files." >>description.txt
{ rg -ao "Found error:.*" fuzzer.log || rg -ao "Exception:.*" fuzzer.log; } | tail -1 >>description.txt
fi

View File

@ -26,6 +26,8 @@ RUN apt-get update \
&& export CODENAME="$(lsb_release --codename --short | tr 'A-Z' 'a-z')" \
&& echo "deb https://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-${LLVM_VERSION} main" >> \
/etc/apt/sources.list \
&& apt-get update \
&& apt-get install --yes --no-install-recommends --verbose-versions llvm-${LLVM_VERSION} \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/*

View File

@ -170,7 +170,7 @@ RESTORE TABLE test.table PARTITIONS '2', '3'
### Backups as tar archives
Backups can also be stored as tar archives. The functionality is the same as for zip, except that a password is not supported.
Write a backup as a tar:
```
@ -444,10 +444,6 @@ Often data that is ingested into ClickHouse is delivered through some sort of pe
Some local filesystems provide snapshot functionality (for example, [ZFS](https://en.wikipedia.org/wiki/ZFS)), but they might not be the best choice for serving live queries. A possible solution is to create additional replicas with this kind of filesystem and exclude them from the [Distributed](../engines/table-engines/special/distributed.md) tables that are used for `SELECT` queries. Snapshots on such replicas will be out of reach of any queries that modify data. As a bonus, these replicas might have special hardware configurations with more disks attached per server, which would be cost-effective.
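One way to confirm that such snapshot replicas are really excluded from the cluster behind the `Distributed` tables used for `SELECT` queries is to inspect `system.clusters`; a minimal sketch, where `serving_cluster` is an illustrative cluster name:
``` sql
-- List the hosts that the serving cluster (and therefore its Distributed tables) will query.
SELECT cluster, shard_num, replica_num, host_name
FROM system.clusters
WHERE cluster = 'serving_cluster';
```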
### clickhouse-copier {#clickhouse-copier}
[clickhouse-copier](../operations/utilities/clickhouse-copier.md) is a versatile tool that was initially created to re-shard petabyte-sized tables. It can also be used for backup and restore purposes because it reliably copies data between ClickHouse tables and clusters.
For smaller volumes of data, a simple `INSERT INTO ... SELECT ...` to remote tables might work as well.
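A minimal sketch of such a small-volume copy using the `remote` table function (the host, database and table names are placeholders, not values from this guide):
``` sql
-- Copy a small table to another ClickHouse server over the native protocol.
INSERT INTO FUNCTION remote('backup-host:9000', test.table_backup)
SELECT * FROM test.table;
```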
### Manipulations with Parts {#manipulations-with-parts}

View File

Part was moved to another disk and should be deleted in its own destructor.
Not active data part with an identity refcounter; it is being deleted right now by a cleaner.
### PartsInMemory
In-memory parts.
### PartsOutdated
Not active data part, but it can still be used by currently running SELECTs; it can be deleted after those SELECTs finish.
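These part-related metrics can be inspected on a live server through the `system.metrics` table; a minimal sketch (the `Parts%` filter is just an illustrative way to narrow the list):
``` sql
-- Show the current values of part-related metrics such as PartsOutdated.
SELECT metric, value, description
FROM system.metrics
WHERE metric LIKE 'Parts%';
```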

View File

@ -1,187 +0,0 @@
---
slug: /en/operations/utilities/clickhouse-copier
sidebar_position: 59
sidebar_label: clickhouse-copier
---
# clickhouse-copier
Copies data from the tables in one cluster to tables in another (or the same) cluster.
:::note
To get a consistent copy, the data in the source tables and partitions should not change during the entire process.
:::
You can run multiple `clickhouse-copier` instances on different servers to perform the same job. ClickHouse Keeper, or ZooKeeper, is used for syncing the processes.
After starting, `clickhouse-copier`:
- Connects to ClickHouse Keeper and receives:
- Copying jobs.
- The state of the copying jobs.
- It performs the jobs.
Each running process chooses the “closest” shard of the source cluster and copies the data into the destination cluster, resharding the data if necessary.
`clickhouse-copier` tracks the changes in ClickHouse Keeper and applies them on the fly.
To reduce network traffic, we recommend running `clickhouse-copier` on the same server where the source data is located.
## Running Clickhouse-copier {#running-clickhouse-copier}
The utility should be run manually:
``` bash
$ clickhouse-copier --daemon --config keeper.xml --task-path /task/path --base-dir /path/to/dir
```
Parameters:
- `daemon` — Starts `clickhouse-copier` in daemon mode.
- `config` — The path to the `keeper.xml` file with the parameters for the connection to ClickHouse Keeper.
- `task-path` — The path to the ClickHouse Keeper node. This node is used for syncing `clickhouse-copier` processes and storing tasks. Tasks are stored in `$task-path/description`.
- `task-file` — Optional path to file with task configuration for initial upload to ClickHouse Keeper.
- `task-upload-force` — Force upload `task-file` even if node already exists. Default is false.
- `base-dir` — The path to logs and auxiliary files. When it starts, `clickhouse-copier` creates `clickhouse-copier_YYYYMMHHSS_<PID>` subdirectories in `$base-dir`. If this parameter is omitted, the directories are created in the directory where `clickhouse-copier` was launched.
## Format of keeper.xml {#format-of-zookeeper-xml}
``` xml
<clickhouse>
<logger>
<level>trace</level>
<size>100M</size>
<count>3</count>
</logger>
<zookeeper>
<node index="1">
<host>127.0.0.1</host>
<port>2181</port>
</node>
</zookeeper>
</clickhouse>
```
## Configuration of Copying Tasks {#configuration-of-copying-tasks}
``` xml
<clickhouse>
<!-- Configuration of clusters as in an ordinary server config -->
<remote_servers>
<source_cluster>
<!--
source cluster & destination clusters accept exactly the same
parameters as parameters for the usual Distributed table
see https://clickhouse.com/docs/en/engines/table-engines/special/distributed/
-->
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>127.0.0.1</host>
<port>9000</port>
<!--
<user>default</user>
<password>default</password>
<secure>1</secure>
-->
</replica>
</shard>
...
</source_cluster>
<destination_cluster>
...
</destination_cluster>
</remote_servers>
<!-- How many simultaneously active workers are possible. If you run more workers, the superfluous ones will sleep. -->
<max_workers>2</max_workers>
<!-- Setting used to fetch (pull) data from source cluster tables -->
<settings_pull>
<readonly>1</readonly>
</settings_pull>
<!-- Setting used to insert (push) data to destination cluster tables -->
<settings_push>
<readonly>0</readonly>
</settings_push>
<!-- Common settings for fetch (pull) and insert (push) operations. The copier process context also uses them.
They are overridden by <settings_pull/> and <settings_push/> respectively. -->
<settings>
<connect_timeout>3</connect_timeout>
<!-- Sync insert is set forcibly, leave it here just in case. -->
<distributed_foreground_insert>1</distributed_foreground_insert>
</settings>
<!-- Copying tasks description.
You can specify several table tasks in the same task description (in the same ZooKeeper node); they will be performed
sequentially.
-->
<tables>
<!-- A table task, copies one table. -->
<table_hits>
<!-- Source cluster name (from <remote_servers/> section) and tables in it that should be copied -->
<cluster_pull>source_cluster</cluster_pull>
<database_pull>test</database_pull>
<table_pull>hits</table_pull>
<!-- Destination cluster name and tables in which the data should be inserted -->
<cluster_push>destination_cluster</cluster_push>
<database_push>test</database_push>
<table_push>hits2</table_push>
<!-- Engine of destination tables.
If the destination tables have not been created, workers create them using the column definitions from the source tables and the engine
definition from here.
NOTE: If the first worker starts inserting data and detects that the destination partition is not empty, then the partition will
be dropped and refilled; take this into account if you already have some data in the destination tables. You can directly
specify the partitions that should be copied in <enabled_partitions/>; they should be in the same quoted format as the partition column of
the system.parts table.
-->
<engine>
ENGINE=ReplicatedMergeTree('/clickhouse/tables/{cluster}/{shard}/hits2', '{replica}')
PARTITION BY toMonday(date)
ORDER BY (CounterID, EventDate)
</engine>
<!-- Sharding key used to insert data to destination cluster -->
<sharding_key>jumpConsistentHash(intHash64(UserID), 2)</sharding_key>
<!-- Optional expression that filters data while pulling it from the source servers -->
<where_condition>CounterID != 0</where_condition>
<!-- This section specifies the partitions that should be copied; other partitions will be ignored.
Partition names should have the same format as
the partition column of the system.parts table (i.e. quoted text).
Since the partition keys of the source and destination clusters could be different,
these partition names specify destination partitions.
NOTE: Although this section is optional (if it is not specified, all partitions will be copied),
it is strongly recommended to specify the partitions explicitly.
If you already have some ready partitions on the destination cluster, they
will be removed at the start of the copying since they will be interpreted
as unfinished data from the previous copying!!!
-->
<enabled_partitions>
<partition>'2018-02-26'</partition>
<partition>'2018-03-05'</partition>
...
</enabled_partitions>
</table_hits>
<!-- Next table to copy. It is not copied until the previous table has finished copying. -->
<table_visits>
...
</table_visits>
...
</tables>
</clickhouse>
```
`clickhouse-copier` tracks the changes in `/task/path/description` and applies them on the fly. For instance, if you change the value of `max_workers`, the number of processes running tasks will also change.
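Before launching a task like the one above, it can be useful to inspect the source partitions and preview the sharding expression directly in `clickhouse-client`; a sketch that reuses the table, filter and sharding key from the sample configuration (all names are illustrative):
``` sql
-- List partitions of the source table, in the same format as the partition column of system.parts,
-- to help fill in <enabled_partitions>.
SELECT DISTINCT partition
FROM system.parts
WHERE database = 'test' AND table = 'hits' AND active;
-- Preview how rows matching the <where_condition> would spread over the 2 destination shards.
SELECT
    jumpConsistentHash(intHash64(UserID), 2) AS destination_shard,
    count() AS rows
FROM test.hits
WHERE CounterID != 0
GROUP BY destination_shard;
```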

View File

@ -2,13 +2,11 @@
slug: /en/operations/utilities/
sidebar_position: 56
sidebar_label: List of tools and utilities
pagination_next: 'en/operations/utilities/clickhouse-copier'
---
# List of tools and utilities
- [clickhouse-local](../../operations/utilities/clickhouse-local.md) — Allows running SQL queries on data without starting the ClickHouse server, similar to how `awk` does this.
- [clickhouse-copier](../../operations/utilities/clickhouse-copier.md) — Copies (and reshards) data from one cluster to another cluster.
- [clickhouse-benchmark](../../operations/utilities/clickhouse-benchmark.md) — Loads server with the custom queries and settings.
- [clickhouse-format](../../operations/utilities/clickhouse-format.md) — Enables formatting input queries.
- [ClickHouse obfuscator](../../operations/utilities/clickhouse-obfuscator.md) — Obfuscates data.

View File

@ -335,7 +335,7 @@ The `ALTER` query lets you create and delete separate elements (columns) in nest
There is no support for deleting columns in the primary key or the sampling key (columns that are used in the `ENGINE` expression). Changing the type for columns that are included in the primary key is only possible if this change does not cause the data to be modified (for example, you are allowed to add values to an Enum or to change a type from `DateTime` to `UInt32`).
If the `ALTER` query is not sufficient to make the table changes you need, you can create a new table, copy the data to it using the [INSERT SELECT](/docs/en/sql-reference/statements/insert-into.md/#inserting-the-results-of-select) query, then switch the tables using the [RENAME](/docs/en/sql-reference/statements/rename.md/#rename-table) query and delete the old table. You can use the [clickhouse-copier](/docs/en/operations/utilities/clickhouse-copier.md) as an alternative to the `INSERT SELECT` query.
If the `ALTER` query is not sufficient to make the table changes you need, you can create a new table, copy the data to it using the [INSERT SELECT](/docs/en/sql-reference/statements/insert-into.md/#inserting-the-results-of-select) query, then switch the tables using the [RENAME](/docs/en/sql-reference/statements/rename.md/#rename-table) query and delete the old table.
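A minimal sketch of that create/copy/rename workflow (the table structure and names are illustrative):
``` sql
-- Rebuild a table when a single ALTER cannot express the required change.
CREATE TABLE test.visits_new
(
    CounterID UInt32,
    StartDate Date,
    UserID UInt64
)
ENGINE = MergeTree
ORDER BY (CounterID, StartDate);
-- Copy the data into the new table.
INSERT INTO test.visits_new
SELECT CounterID, StartDate, UserID
FROM test.visits;
-- Swap the tables and drop the old one.
RENAME TABLE test.visits TO test.visits_old, test.visits_new TO test.visits;
DROP TABLE test.visits_old;
```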
The `ALTER` query blocks all reads and writes for the table. In other words, if a long `SELECT` is running at the time of the `ALTER` query, the `ALTER` query will wait for it to complete. At the same time, all new queries to the same table will wait while this `ALTER` is running.

View File

@ -585,10 +585,6 @@ ENGINE = Distributed(perftest_3shards_1replicas, tutorial, hits_local, rand());
INSERT INTO tutorial.hits_all SELECT * FROM tutorial.hits_v1;
```
:::danger Внимание!
Этот подход не годится для сегментирования больших таблиц. Есть инструмент [clickhouse-copier](../operations/utilities/clickhouse-copier.md), специально предназначенный для перераспределения любых больших таблиц.
:::
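One way to check how the insert above distributed rows across the shards is to group by the virtual `_shard_num` column of the Distributed table; a minimal sketch using the tutorial's table:
``` sql
-- Count rows per shard behind the Distributed table tutorial.hits_all.
SELECT _shard_num, count() AS rows
FROM tutorial.hits_all
GROUP BY _shard_num
ORDER BY _shard_num;
```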
Как и следовало ожидать, вычислительно сложные запросы работают втрое быстрее, если они выполняются на трёх серверах, а не на одном.
В данном случае мы использовали кластер из трёх сегментов с одной репликой для каждого.

View File

@ -24,10 +24,6 @@ sidebar_label: "Резервное копирование данных"
Некоторые локальные файловые системы позволяют делать снимки (например, [ZFS](https://en.wikipedia.org/wiki/ZFS)), но они могут быть не лучшим выбором для обслуживания живых запросов. Возможным решением является создание дополнительных реплик с такой файловой системой и исключение их из [Distributed](../engines/table-engines/special/distributed.md) таблиц, используемых для запросов `SELECT`. Снимки на таких репликах будут недоступны для запросов, изменяющих данные. В качестве бонуса, эти реплики могут иметь особые конфигурации оборудования с большим количеством дисков, подключенных к серверу, что будет экономически эффективным.
## clickhouse-copier {#clickhouse-copier}
[clickhouse-copier](utilities/clickhouse-copier.md) — это универсальный инструмент, который изначально был создан для перешардирования таблиц с петабайтами данных. Его также можно использовать для резервного копирования и восстановления, поскольку он надёжно копирует данные между таблицами и кластерами ClickHouse.
Для небольших объёмов данных можно применять `INSERT INTO ... SELECT ...` в удалённые таблицы.
## Манипуляции с партициями {#manipuliatsii-s-partitsiiami}

View File

@ -1,183 +0,0 @@
---
slug: /ru/operations/utilities/clickhouse-copier
sidebar_position: 59
sidebar_label: clickhouse-copier
---
# clickhouse-copier {#clickhouse-copier}
Копирует данные из таблиц одного кластера в таблицы другого (или этого же) кластера.
Можно запустить несколько `clickhouse-copier` на разных серверах для выполнения одного и того же задания. Для синхронизации между процессами используется ZooKeeper.
После запуска, `clickhouse-copier`:
- Соединяется с ZooKeeper и получает:
- Задания на копирование.
- Состояние заданий на копирование.
- Выполняет задания.
Каждый запущенный процесс выбирает "ближайший" шард исходного кластера и копирует данные в кластер назначения, при необходимости перешардируя их.
`clickhouse-copier` отслеживает изменения в ZooKeeper и применяет их «на лету».
Для снижения сетевого трафика рекомендуем запускать `clickhouse-copier` на том же сервере, где находятся исходные данные.
## Запуск Clickhouse-copier {#zapusk-clickhouse-copier}
Утилиту следует запускать вручную следующим образом:
``` bash
$ clickhouse-copier --daemon --config zookeeper.xml --task-path /task/path --base-dir /path/to/dir
```
Параметры запуска:
- `daemon` - запускает `clickhouse-copier` в режиме демона.
- `config` - путь к файлу `zookeeper.xml` с параметрами соединения с ZooKeeper.
- `task-path` - путь к ноде ZooKeeper. Нода используется для синхронизации между процессами `clickhouse-copier` и для хранения заданий. Задания хранятся в `$task-path/description`.
- `task-file` - необязательный путь к файлу с описанием конфигурации заданий для загрузки в ZooKeeper.
- `task-upload-force` - загрузить `task-file` в ZooKeeper, даже если он уже был загружен.
- `base-dir` - путь к логам и вспомогательным файлам. При запуске `clickhouse-copier` создает в `$base-dir` подкаталоги `clickhouse-copier_YYYYMMHHSS_<PID>`. Если параметр не указан, то каталоги будут создаваться в каталоге, где `clickhouse-copier` был запущен.
## Формат Zookeeper.xml {#format-zookeeper-xml}
``` xml
<clickhouse>
<logger>
<level>trace</level>
<size>100M</size>
<count>3</count>
</logger>
<zookeeper>
<node index="1">
<host>127.0.0.1</host>
<port>2181</port>
</node>
</zookeeper>
</clickhouse>
```
## Конфигурация заданий на копирование {#konfiguratsiia-zadanii-na-kopirovanie}
``` xml
<clickhouse>
<!-- Configuration of clusters as in an ordinary server config -->
<remote_servers>
<source_cluster>
<!--
source cluster & destination clusters accept exactly the same
parameters as parameters for the usual Distributed table
see https://clickhouse.com/docs/ru/engines/table-engines/special/distributed/
-->
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>127.0.0.1</host>
<port>9000</port>
<!--
<user>default</user>
<password>default</password>
<secure>1</secure>
-->
</replica>
</shard>
...
</source_cluster>
<destination_cluster>
...
</destination_cluster>
</remote_servers>
<!-- How many simultaneously active workers are possible. If you run more workers, the superfluous ones will sleep. -->
<max_workers>2</max_workers>
<!-- Setting used to fetch (pull) data from source cluster tables -->
<settings_pull>
<readonly>1</readonly>
</settings_pull>
<!-- Setting used to insert (push) data to destination cluster tables -->
<settings_push>
<readonly>0</readonly>
</settings_push>
<!-- Common settings for fetch (pull) and insert (push) operations. The copier process context also uses them.
They are overridden by <settings_pull/> and <settings_push/> respectively. -->
<settings>
<connect_timeout>3</connect_timeout>
<!-- Sync insert is set forcibly, leave it here just in case. -->
<distributed_foreground_insert>1</distributed_foreground_insert>
</settings>
<!-- Copying tasks description.
You can specify several table tasks in the same task description (in the same ZooKeeper node); they will be performed
sequentially.
-->
<tables>
<!-- A table task, copies one table. -->
<table_hits>
<!-- Source cluster name (from <remote_servers/> section) and tables in it that should be copied -->
<cluster_pull>source_cluster</cluster_pull>
<database_pull>test</database_pull>
<table_pull>hits</table_pull>
<!-- Destination cluster name and tables in which the data should be inserted -->
<cluster_push>destination_cluster</cluster_push>
<database_push>test</database_push>
<table_push>hits2</table_push>
<!-- Engine of destination tables.
If the destination tables have not been created, workers create them using the column definitions from the source tables and the engine
definition from here.
NOTE: If the first worker starts inserting data and detects that the destination partition is not empty, then the partition will
be dropped and refilled; take this into account if you already have some data in the destination tables. You can directly
specify the partitions that should be copied in <enabled_partitions/>; they should be in the same quoted format as the partition column of
the system.parts table.
-->
<engine>
ENGINE=ReplicatedMergeTree('/clickhouse/tables/{cluster}/{shard}/hits2', '{replica}')
PARTITION BY toMonday(date)
ORDER BY (CounterID, EventDate)
</engine>
<!-- Sharding key used to insert data to destination cluster -->
<sharding_key>jumpConsistentHash(intHash64(UserID), 2)</sharding_key>
<!-- Optional expression that filters data while pulling it from the source servers -->
<where_condition>CounterID != 0</where_condition>
<!-- This section specifies the partitions that should be copied; other partitions will be ignored.
Partition names should have the same format as
the partition column of the system.parts table (i.e. quoted text).
Since the partition keys of the source and destination clusters could be different,
these partition names specify destination partitions.
NOTE: Although this section is optional (if it is not specified, all partitions will be copied),
it is strongly recommended to specify the partitions explicitly.
If you already have some ready partitions on the destination cluster, they
will be removed at the start of the copying since they will be interpreted
as unfinished data from the previous copying!!!
-->
<enabled_partitions>
<partition>'2018-02-26'</partition>
<partition>'2018-03-05'</partition>
...
</enabled_partitions>
</table_hits>
<!-- Next table to copy. It is not copied until the previous table has finished copying. -->
<table_visits>
...
</table_visits>
...
</tables>
</clickhouse>
```
`clickhouse-copier` отслеживает изменения `/task/path/description` и применяет их «на лету». Если вы поменяете, например, значение `max_workers`, то количество процессов, выполняющих задания, также изменится.

View File

@ -7,7 +7,6 @@ sidebar_position: 56
# Утилиты ClickHouse {#utility-clickhouse}
- [clickhouse-local](clickhouse-local.md) - позволяет выполнять SQL-запросы над данными без остановки сервера ClickHouse, подобно утилите `awk`.
- [clickhouse-copier](clickhouse-copier.md) - копирует (и перешардирует) данные с одного кластера на другой.
- [clickhouse-benchmark](../../operations/utilities/clickhouse-benchmark.md) — устанавливает соединение с сервером ClickHouse и запускает циклическое выполнение указанных запросов.
- [clickhouse-format](../../operations/utilities/clickhouse-format.md) — позволяет форматировать входящие запросы.
- [ClickHouse obfuscator](../../operations/utilities/clickhouse-obfuscator.md) — обфусцирует данные.

View File

@ -94,7 +94,7 @@ RENAME COLUMN [IF EXISTS] name to new_name
Переименовывает столбец `name` в `new_name`. Если указано выражение `IF EXISTS`, то запрос не будет возвращать ошибку при условии, что столбец `name` не существует. Поскольку переименование не затрагивает физические данные колонки, запрос выполняется практически мгновенно.
**ЗАМЕЧАНИЕ**: Столбцы, являющиеся частью основного ключа или ключа сортировки (заданные с помощью `ORDER BY` или `PRIMARY KEY`), не могут быть переименованы. Попытка переименовать эти столбцы приведет к `SQL Error [524]`.
Пример:
@ -254,7 +254,7 @@ SELECT groupArray(x), groupArray(s) FROM tmp;
Отсутствует возможность удалять столбцы, входящие в первичный ключ или ключ для сэмплирования (в общем, входящие в выражение `ENGINE`). Изменение типа у столбцов, входящих в первичный ключ возможно только в том случае, если это изменение не приводит к изменению данных (например, разрешено добавление значения в Enum или изменение типа с `DateTime` на `UInt32`).
Если возможностей запроса `ALTER` не хватает для нужного изменения таблицы, вы можете создать новую таблицу, скопировать туда данные с помощью запроса [INSERT SELECT](../insert-into.md#inserting-the-results-of-select), затем поменять таблицы местами с помощью запроса [RENAME](../rename.md#rename-table), и удалить старую таблицу. В качестве альтернативы для запроса `INSERT SELECT`, можно использовать инструмент [clickhouse-copier](../../../sql-reference/statements/alter/index.md).
Если возможностей запроса `ALTER` не хватает для нужного изменения таблицы, вы можете создать новую таблицу, скопировать туда данные с помощью запроса [INSERT SELECT](../insert-into.md#inserting-the-results-of-select), затем поменять таблицы местами с помощью запроса [RENAME](../rename.md#rename-table), и удалить старую таблицу.
Запрос `ALTER` блокирует все чтения и записи для таблицы. То есть если на момент запроса `ALTER` выполнялся долгий `SELECT`, то запрос `ALTER` сначала дождётся его выполнения. И в это время все новые запросы к той же таблице будут ждать, пока завершится этот `ALTER`.

View File

@ -582,8 +582,6 @@ ENGINE = Distributed(perftest_3shards_1replicas, tutorial, hits_local, rand());
INSERT INTO tutorial.hits_all SELECT * FROM tutorial.hits_v1;
```
!!! warning "注意:"
这种方法不适合大型表的分片。 有一个单独的工具 [clickhouse-copier](../operations/utilities/clickhouse-copier.md) 这可以重新分片任意大表。
正如您所期望的那样如果计算量大的查询使用3台服务器而不是一个则运行速度快N倍。

View File

@ -24,12 +24,6 @@ sidebar_label: "\u6570\u636E\u5907\u4EFD"
某些本地文件系统提供快照功能(例如, [ZFS](https://en.wikipedia.org/wiki/ZFS)),但它们可能不是提供实时查询的最佳选择。 一个可能的解决方案是使用这种文件系统创建额外的副本,并将它们与用于`SELECT` 查询的 [分布式](../engines/table-engines/special/distributed.md) 表分离。 任何修改数据的查询都无法访问此类副本上的快照。 作为回报,这些副本可能具有特殊的硬件配置,每个服务器附加更多的磁盘,这将是经济高效的。
## clickhouse-copier {#clickhouse-copier}
[clickhouse-copier](utilities/clickhouse-copier.md) 是一个多功能工具最初创建它是为了用于重新切分pb大小的表。 因为它能够在ClickHouse表和集群之间可靠地复制数据所以它也可用于备份和还原数据。
对于较小的数据量,一个简单的 `INSERT INTO ... SELECT ...` 到远程表也可以工作。
## part操作 {#manipulations-with-parts}
ClickHouse允许使用 `ALTER TABLE ... FREEZE PARTITION ...` 查询以创建表分区的本地副本。 这是利用硬链接(hardlink)到 `/var/lib/clickhouse/shadow/` 文件夹中实现的,所以它通常不会因为旧数据而占用额外的磁盘空间。 创建的文件副本不由ClickHouse服务器处理所以你可以把它们留在那里你将有一个简单的备份不需要任何额外的外部系统但它仍然容易出现硬件问题。 出于这个原因,最好将它们远程复制到另一个位置,然后删除本地副本。 分布式文件系统和对象存储仍然是一个不错的选择,但是具有足够大容量的正常附加文件服务器也可以工作(在这种情况下,传输将通过网络文件系统或者也许是 [rsync](https://en.wikipedia.org/wiki/Rsync) 来进行).
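A minimal sketch of the `ALTER TABLE ... FREEZE PARTITION` call described above (the table name and partition value are illustrative; the hard-linked copy typically appears under `/var/lib/clickhouse/shadow/<increment>/`):
``` sql
-- Create a local, hard-linked snapshot of one partition for backup purposes.
ALTER TABLE test.hits FREEZE PARTITION '2024-03-04';
```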

View File

@ -1,172 +0,0 @@
---
slug: /zh/operations/utilities/clickhouse-copier
---
# clickhouse-copier {#clickhouse-copier}
将数据从一个群集中的表复制到另一个(或相同)群集中的表。
您可以运行多个 `clickhouse-copier` 不同服务器上的实例执行相同的作业。 ZooKeeper用于同步进程。
开始后, `clickhouse-copier`:
- 连接到ZooKeeper并且接收:
- 复制作业。
- 复制作业的状态。
- 它执行的工作。
每个正在运行的进程都会选择源集群的“最接近”分片,然后将数据复制到目标集群,并在必要时重新分片数据。
`clickhouse-copier` 跟踪ZooKeeper中的更改并实时应用它们。
为了减少网络流量,我们建议运行 `clickhouse-copier` 在源数据所在的同一服务器上。
## 运行Clickhouse-copier {#running-clickhouse-copier}
该实用程序应手动运行:
``` bash
clickhouse-copier --daemon --config zookeeper.xml --task-path /task/path --base-dir /path/to/dir
```
参数:
- `daemon` — 在守护进程模式下启动`clickhouse-copier`。
- `config``zookeeper.xml`文件的路径其中包含用于连接ZooKeeper的参数。
- `task-path` — ZooKeeper节点的路径。 该节点用于同步`clickhouse-copier`进程和存储任务。 任务存储在`$task-path/description`中。
- `task-file` — 可选的非必须参数, 指定一个包含任务配置的参数文件, 用于初始上传到ZooKeeper。
- `task-upload-force` — 即使节点已经存在,也强制上载`task-file`。
- `base-dir` — 日志和辅助文件的路径。 启动时,`clickhouse-copier`在`$base-dir`中创建`clickhouse-copier_YYYYMMHHSS_<PID>`子目录。 如果省略此参数,则会在启动`clickhouse-copier`的目录中创建目录。
## Zookeeper.xml格式 {#format-of-zookeeper-xml}
``` xml
<clickhouse>
<logger>
<level>trace</level>
<size>100M</size>
<count>3</count>
</logger>
<zookeeper>
<node index="1">
<host>127.0.0.1</host>
<port>2181</port>
</node>
</zookeeper>
</clickhouse>
```
## 复制任务的配置 {#configuration-of-copying-tasks}
``` xml
<clickhouse>
<!-- Configuration of clusters as in an ordinary server config -->
<remote_servers>
<source_cluster>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>127.0.0.1</host>
<port>9000</port>
</replica>
</shard>
...
</source_cluster>
<destination_cluster>
...
</destination_cluster>
</remote_servers>
<!-- How many simultaneously active workers are possible. If you run more workers, the superfluous ones will sleep. -->
<max_workers>2</max_workers>
<!-- Setting used to fetch (pull) data from source cluster tables -->
<settings_pull>
<readonly>1</readonly>
</settings_pull>
<!-- Setting used to insert (push) data to destination cluster tables -->
<settings_push>
<readonly>0</readonly>
</settings_push>
<!-- Common settings for fetch (pull) and insert (push) operations. The copier process context also uses them.
They are overridden by <settings_pull/> and <settings_push/> respectively. -->
<settings>
<connect_timeout>3</connect_timeout>
<!-- Sync insert is set forcibly, leave it here just in case. -->
<distributed_foreground_insert>1</distributed_foreground_insert>
</settings>
<!-- Copying tasks description.
You can specify several table tasks in the same task description (in the same ZooKeeper node); they will be performed
sequentially.
-->
<tables>
<!-- A table task, copies one table. -->
<table_hits>
<!-- Source cluster name (from <remote_servers/> section) and tables in it that should be copied -->
<cluster_pull>source_cluster</cluster_pull>
<database_pull>test</database_pull>
<table_pull>hits</table_pull>
<!-- Destination cluster name and tables in which the data should be inserted -->
<cluster_push>destination_cluster</cluster_push>
<database_push>test</database_push>
<table_push>hits2</table_push>
<!-- Engine of destination tables.
If the destination tables have not been created, workers create them using the column definitions from the source tables and the engine
definition from here.
NOTE: If the first worker starts inserting data and detects that the destination partition is not empty, then the partition will
be dropped and refilled; take this into account if you already have some data in the destination tables. You can directly
specify the partitions that should be copied in <enabled_partitions/>; they should be in the same quoted format as the partition column of
the system.parts table.
-->
<engine>
ENGINE=ReplicatedMergeTree('/clickhouse/tables/{cluster}/{shard}/hits2', '{replica}')
PARTITION BY toMonday(date)
ORDER BY (CounterID, EventDate)
</engine>
<!-- Sharding key used to insert data to destination cluster -->
<sharding_key>jumpConsistentHash(intHash64(UserID), 2)</sharding_key>
<!-- Optional expression that filters data while pulling it from the source servers -->
<where_condition>CounterID != 0</where_condition>
<!-- This section specifies the partitions that should be copied; other partitions will be ignored.
Partition names should have the same format as
the partition column of the system.parts table (i.e. quoted text).
Since the partition keys of the source and destination clusters could be different,
these partition names specify destination partitions.
NOTE: Although this section is optional (if it is not specified, all partitions will be copied),
it is strongly recommended to specify the partitions explicitly.
If you already have some ready partitions on the destination cluster, they
will be removed at the start of the copying since they will be interpreted
as unfinished data from the previous copying!!!
-->
<enabled_partitions>
<partition>'2018-02-26'</partition>
<partition>'2018-03-05'</partition>
...
</enabled_partitions>
</table_hits>
<!-- Next table to copy. It is not copied until the previous table has finished copying. -->
<table_visits>
...
</table_visits>
...
</tables>
</clickhouse>
```
`clickhouse-copier` 跟踪更改 `/task/path/description` 并在飞行中应用它们。 例如,如果你改变的值 `max_workers`,运行任务的进程数也会发生变化。

View File

@ -4,5 +4,4 @@ slug: /zh/operations/utilities/
# 实用工具 {#clickhouse-utility}
- [本地查询](clickhouse-local.md) — 在不停止ClickHouse服务的情况下对数据执行查询操作(类似于 `awk` 命令)。
- [跨集群复制](clickhouse-copier.md) — 在不同集群间复制数据。
- [性能测试](clickhouse-benchmark.md) — 连接到Clickhouse服务器执行性能测试。

View File

@ -150,7 +150,7 @@ ALTER TABLE visits MODIFY COLUMN browser Array(String)
不支持对primary key或者sampling key中的列`ENGINE` 表达式中用到的列进行删除操作。改变包含在primary key中的列的类型时如果操作不会导致数据的变化例如往Enum中添加一个值或者将`DateTime` 类型改成 `UInt32`),那么这种操作是可行的。
如果 `ALTER` 操作不足以完成你想要的表变动操作,你可以创建一张新的表,通过 [INSERT SELECT](../../sql-reference/statements/insert-into.md#inserting-the-results-of-select)将数据拷贝进去,然后通过 [RENAME](../../sql-reference/statements/misc.md#misc_operations-rename)将新的表改成和原有表一样的名称,并删除原有的表。你可以使用 [clickhouse-copier](../../operations/utilities/clickhouse-copier.md) 代替 `INSERT SELECT`
如果 `ALTER` 操作不足以完成你想要的表变动操作,你可以创建一张新的表,通过 [INSERT SELECT](../../sql-reference/statements/insert-into.md#inserting-the-results-of-select)将数据拷贝进去,然后通过 [RENAME](../../sql-reference/statements/misc.md#misc_operations-rename)将新的表改成和原有表一样的名称,并删除原有的表。
`ALTER` 操作会阻塞对表的所有读写操作。换句话说,当一个大的 `SELECT` 语句和 `ALTER`同时执行时,`ALTER`会等待,直到 `SELECT` 执行结束。与此同时,当 `ALTER` 运行时,新的 sql 语句将会等待。

View File

@ -50,8 +50,6 @@ contents:
dst: /etc/init.d/clickhouse-server
- src: clickhouse-server.service
dst: /lib/systemd/system/clickhouse-server.service
- src: root/usr/bin/clickhouse-copier
dst: /usr/bin/clickhouse-copier
- src: root/usr/bin/clickhouse-server
dst: /usr/bin/clickhouse-server
# clickhouse-keeper part

View File

@ -122,7 +122,6 @@ add_subdirectory (local)
add_subdirectory (benchmark)
add_subdirectory (extract-from-config)
add_subdirectory (compressor)
add_subdirectory (copier)
add_subdirectory (format)
add_subdirectory (obfuscator)
add_subdirectory (install)
@ -200,7 +199,6 @@ clickhouse_program_install(clickhouse-server server)
clickhouse_program_install(clickhouse-client client chc)
clickhouse_program_install(clickhouse-local local chl ch)
clickhouse_program_install(clickhouse-benchmark benchmark)
clickhouse_program_install(clickhouse-copier copier)
clickhouse_program_install(clickhouse-extract-from-config extract-from-config)
clickhouse_program_install(clickhouse-compressor compressor)
clickhouse_program_install(clickhouse-format format)

View File

@ -1,15 +0,0 @@
#pragma once
#include <base/types.h>
#include <Poco/Util/AbstractConfiguration.h>
#include <utility>
namespace DB
{
using ConfigurationPtr = Poco::AutoPtr<Poco::Util::AbstractConfiguration>;
using DatabaseAndTableName = std::pair<String, String>;
using ListOfDatabasesAndTableNames = std::vector<DatabaseAndTableName>;
}

View File

@ -1,28 +0,0 @@
set(CLICKHOUSE_COPIER_SOURCES
"${CMAKE_CURRENT_SOURCE_DIR}/ClusterCopierApp.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/ClusterCopier.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/Internals.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/ShardPartition.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/ShardPartitionPiece.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/StatusAccumulator.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/TaskCluster.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/TaskShard.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/TaskTable.cpp")
set (CLICKHOUSE_COPIER_LINK
PRIVATE
clickhouse_common_zookeeper
clickhouse_common_config
clickhouse_parsers
clickhouse_functions
clickhouse_table_functions
clickhouse_aggregate_functions
string_utils
PUBLIC
daemon
)
set(CLICKHOUSE_COPIER_INCLUDE SYSTEM PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
clickhouse_program_add(copier)

File diff suppressed because it is too large.

View File

@ -1,240 +0,0 @@
#pragma once
#include "Aliases.h"
#include "Internals.h"
#include "TaskCluster.h"
#include "TaskShard.h"
#include "TaskTable.h"
#include "ShardPartition.h"
#include "ShardPartitionPiece.h"
#include "ZooKeeperStaff.h"
namespace DB
{
class ClusterCopier : WithMutableContext
{
public:
ClusterCopier(const String & task_path_,
const String & host_id_,
const String & proxy_database_name_,
ContextMutablePtr context_,
LoggerRawPtr log_)
: WithMutableContext(context_),
task_zookeeper_path(task_path_),
host_id(host_id_),
working_database_name(proxy_database_name_),
log(log_) {}
void init();
template <typename T>
decltype(auto) retry(T && func, UInt64 max_tries = 100);
void discoverShardPartitions(const ConnectionTimeouts & timeouts, const TaskShardPtr & task_shard);
/// Compute set of partitions, assume set of partitions aren't changed during the processing
void discoverTablePartitions(const ConnectionTimeouts & timeouts, TaskTable & task_table, UInt64 num_threads = 0);
void uploadTaskDescription(const std::string & task_path, const std::string & task_file, bool force);
void reloadTaskDescription();
void updateConfigIfNeeded();
void process(const ConnectionTimeouts & timeouts);
/// Disables DROP PARTITION commands that used to clear data after errors
void setSafeMode(bool is_safe_mode_ = true)
{
is_safe_mode = is_safe_mode_;
}
void setCopyFaultProbability(double copy_fault_probability_)
{
copy_fault_probability = copy_fault_probability_;
}
void setMoveFaultProbability(double move_fault_probability_)
{
move_fault_probability = move_fault_probability_;
}
void setExperimentalUseSampleOffset(bool value)
{
experimental_use_sample_offset = value;
}
void setMaxTableTries(UInt64 tries)
{
max_table_tries = tries;
}
void setMaxShardPartitionTries(UInt64 tries)
{
max_shard_partition_tries = tries;
}
void setMaxShardPartitionPieceTriesForAlter(UInt64 tries)
{
max_shard_partition_piece_tries_for_alter = tries;
}
void setRetryDelayMs(std::chrono::milliseconds ms)
{
retry_delay_ms = ms;
}
protected:
String getWorkersPath() const
{
return task_cluster->task_zookeeper_path + "/task_active_workers";
}
String getWorkersPathVersion() const
{
return getWorkersPath() + "_version";
}
String getCurrentWorkerNodePath() const
{
return getWorkersPath() + "/" + host_id;
}
zkutil::EphemeralNodeHolder::Ptr createTaskWorkerNodeAndWaitIfNeed(
const zkutil::ZooKeeperPtr & zookeeper,
const String & description,
bool unprioritized);
/*
* Checks that partition piece or some other entity is clean.
* The only requirement is that you have to pass is_dirty_flag_path and is_dirty_cleaned_path to the function.
* And is_dirty_flag_path is a parent of is_dirty_cleaned_path.
* */
static bool checkPartitionPieceIsClean(
const zkutil::ZooKeeperPtr & zookeeper,
const CleanStateClock & clean_state_clock,
const String & task_status_path);
bool checkAllPiecesInPartitionAreDone(const TaskTable & task_table, const String & partition_name, const TasksShard & shards_with_partition);
/** Checks that the whole partition of a table was copied. We should do it carefully due to dirty lock.
* State of some task could change during the processing.
* We have to ensure that all shards have the finished state and there is no dirty flag.
* Moreover, we have to check status twice and check zxid, because state can change during the checking.
*/
/* The same as the function above.
* Assume that we don't know on which shards we have a certain piece of the partition.
* We'll check them all (that is, the shards that contain the whole partition),
* and shards that don't have the certain piece MUST mark that piece as is_done = true.
* */
bool checkPartitionPieceIsDone(const TaskTable & task_table, const String & partition_name,
size_t piece_number, const TasksShard & shards_with_partition);
/* After successful insertion into the helping tables, it will move all pieces to the destination table (via ALTER ... ATTACH). */
TaskStatus tryMoveAllPiecesToDestinationTable(const TaskTable & task_table, const String & partition_name);
/// Removes MATERIALIZED and ALIAS columns from create table query
static ASTPtr removeAliasMaterializedAndTTLColumnsFromCreateQuery(const ASTPtr & query_ast, bool allow_to_copy_alias_and_materialized_columns);
bool tryDropPartitionPiece(ShardPartition & task_partition, size_t current_piece_number,
const zkutil::ZooKeeperPtr & zookeeper, const CleanStateClock & clean_state_clock);
bool tryProcessTable(const ConnectionTimeouts & timeouts, TaskTable & task_table);
TaskStatus tryCreateDestinationTable(const ConnectionTimeouts & timeouts, TaskTable & task_table);
/// Job for copying partition from particular shard.
TaskStatus tryProcessPartitionTask(const ConnectionTimeouts & timeouts,
ShardPartition & task_partition,
bool is_unprioritized_task);
TaskStatus iterateThroughAllPiecesInPartition(const ConnectionTimeouts & timeouts,
ShardPartition & task_partition,
bool is_unprioritized_task);
TaskStatus processPartitionPieceTaskImpl(const ConnectionTimeouts & timeouts,
ShardPartition & task_partition,
size_t current_piece_number,
bool is_unprioritized_task);
void dropAndCreateLocalTable(const ASTPtr & create_ast);
void dropLocalTableIfExists(const DatabaseAndTableName & table_name) const;
void dropHelpingTables(const TaskTable & task_table);
void dropHelpingTablesByPieceNumber(const TaskTable & task_table, size_t current_piece_number);
/// Used to reduce disk space usage.
/// After all pieces have been successfully moved to the original destination
/// table, we can get rid of the partition pieces (partitions in the helping tables).
void dropParticularPartitionPieceFromAllHelpingTables(const TaskTable & task_table, const String & partition_name);
String getRemoteCreateTable(const DatabaseAndTableName & table, Connection & connection, const Settings & settings);
ASTPtr getCreateTableForPullShard(const ConnectionTimeouts & timeouts, TaskShard & task_shard);
/// If it is implicitly asked to create split Distributed table for certain piece on current shard, we will do it.
void createShardInternalTables(const ConnectionTimeouts & timeouts, TaskShard & task_shard, bool create_split = true);
std::set<String> getShardPartitions(const ConnectionTimeouts & timeouts, TaskShard & task_shard);
bool checkShardHasPartition(const ConnectionTimeouts & timeouts, TaskShard & task_shard, const String & partition_quoted_name);
bool checkPresentPartitionPiecesOnCurrentShard(const ConnectionTimeouts & timeouts,
TaskShard & task_shard, const String & partition_quoted_name, size_t current_piece_number);
/*
* This enum is used in the executeQueryOnCluster function.
* You can execute a query on each shard (it does not matter on which replica of a shard it is executed)
* or you can execute a query on each replica of each shard.
* The first mode is useful for INSERT queries.
* */
enum ClusterExecutionMode
{
ON_EACH_SHARD,
ON_EACH_NODE
};
/** Executes simple query (without output streams, for example DDL queries) on each shard of the cluster
* Returns number of shards for which at least one replica executed query successfully
*/
UInt64 executeQueryOnCluster(
const ClusterPtr & cluster,
const String & query,
const Settings & current_settings,
ClusterExecutionMode execution_mode = ClusterExecutionMode::ON_EACH_SHARD) const;
private:
String task_zookeeper_path;
String task_description_path;
String host_id;
String working_database_name;
/// Auto update config stuff
UInt64 task_description_current_version = 1;
std::atomic<UInt64> task_description_version{1};
Coordination::WatchCallback task_description_watch_callback;
/// ZooKeeper session used to set the callback
zkutil::ZooKeeperPtr task_description_watch_zookeeper;
ConfigurationPtr task_cluster_initial_config;
ConfigurationPtr task_cluster_current_config;
std::unique_ptr<TaskCluster> task_cluster;
bool is_safe_mode = false;
double copy_fault_probability = 0.0;
double move_fault_probability = 0.0;
bool experimental_use_sample_offset{false};
LoggerRawPtr log;
UInt64 max_table_tries = 3;
UInt64 max_shard_partition_tries = 3;
UInt64 max_shard_partition_piece_tries_for_alter = 10;
std::chrono::milliseconds retry_delay_ms{1000};
};
}

View File

@ -1,252 +0,0 @@
#include "ClusterCopierApp.h"
#include <Common/ZooKeeper/ZooKeeper.h>
#include <Common/StatusFile.h>
#include <Common/TerminalSize.h>
#include <Databases/registerDatabases.h>
#include <IO/ConnectionTimeouts.h>
#include <Interpreters/registerInterpreters.h>
#include <Formats/registerFormats.h>
#include <Common/scope_guard_safe.h>
#include <unistd.h>
#include <filesystem>
namespace fs = std::filesystem;
namespace DB
{
/// ClusterCopierApp
void ClusterCopierApp::initialize(Poco::Util::Application & self)
{
is_help = config().has("help");
if (is_help)
return;
config_xml_path = config().getString("config-file");
task_path = config().getString("task-path");
log_level = config().getString("log-level", "info");
is_safe_mode = config().has("safe-mode");
is_status_mode = config().has("status");
if (config().has("copy-fault-probability"))
copy_fault_probability = std::max(std::min(config().getDouble("copy-fault-probability"), 1.0), 0.0);
if (config().has("move-fault-probability"))
move_fault_probability = std::max(std::min(config().getDouble("move-fault-probability"), 1.0), 0.0);
base_dir = (config().has("base-dir")) ? config().getString("base-dir") : fs::current_path().string();
max_table_tries = std::max<size_t>(config().getUInt("max-table-tries", 3), 1);
max_shard_partition_tries = std::max<size_t>(config().getUInt("max-shard-partition-tries", 3), 1);
max_shard_partition_piece_tries_for_alter = std::max<size_t>(config().getUInt("max-shard-partition-piece-tries-for-alter", 10), 1);
retry_delay_ms = std::chrono::milliseconds(std::max<size_t>(config().getUInt("retry-delay-ms", 1000), 100));
if (config().has("experimental-use-sample-offset"))
experimental_use_sample_offset = config().getBool("experimental-use-sample-offset");
// process_id is '<hostname>#<start_timestamp>_<pid>'
time_t timestamp = Poco::Timestamp().epochTime();
auto curr_pid = Poco::Process::id();
process_id = std::to_string(DateLUT::serverTimezoneInstance().toNumYYYYMMDDhhmmss(timestamp)) + "_" + std::to_string(curr_pid);
host_id = escapeForFileName(getFQDNOrHostName()) + '#' + process_id;
process_path = fs::weakly_canonical(fs::path(base_dir) / ("clickhouse-copier_" + process_id));
fs::create_directories(process_path);
/// Override variables for BaseDaemon
if (config().has("log-level"))
config().setString("logger.level", config().getString("log-level"));
if (config().has("base-dir") || !config().has("logger.log"))
config().setString("logger.log", fs::path(process_path) / "log.log");
if (config().has("base-dir") || !config().has("logger.errorlog"))
config().setString("logger.errorlog", fs::path(process_path) / "log.err.log");
Base::initialize(self);
}
void ClusterCopierApp::handleHelp(const std::string &, const std::string &)
{
uint16_t terminal_width = 0;
if (isatty(STDIN_FILENO))
terminal_width = getTerminalWidth();
Poco::Util::HelpFormatter help_formatter(options());
if (terminal_width)
help_formatter.setWidth(terminal_width);
help_formatter.setCommand(commandName());
help_formatter.setHeader("Copies tables from one cluster to another");
help_formatter.setUsage("--config-file <config-file> --task-path <task-path>");
help_formatter.format(std::cerr);
help_formatter.setFooter("See also: https://clickhouse.com/docs/en/operations/utilities/clickhouse-copier/");
stopOptionsProcessing();
}
void ClusterCopierApp::defineOptions(Poco::Util::OptionSet & options)
{
Base::defineOptions(options);
options.addOption(Poco::Util::Option("task-path", "", "path to task in ZooKeeper")
.argument("task-path").binding("task-path"));
options.addOption(Poco::Util::Option("task-file", "", "path to task file for uploading in ZooKeeper to task-path")
.argument("task-file").binding("task-file"));
options.addOption(Poco::Util::Option("task-upload-force", "", "Force upload task-file even node already exists. Default is false.")
.argument("task-upload-force").binding("task-upload-force"));
options.addOption(Poco::Util::Option("safe-mode", "", "disables ALTER DROP PARTITION in case of errors")
.binding("safe-mode"));
options.addOption(Poco::Util::Option("copy-fault-probability", "", "the copying fails with specified probability (used to test partition state recovering)")
.argument("copy-fault-probability").binding("copy-fault-probability"));
options.addOption(Poco::Util::Option("move-fault-probability", "", "the moving fails with specified probability (used to test partition state recovering)")
.argument("move-fault-probability").binding("move-fault-probability"));
options.addOption(Poco::Util::Option("log-level", "", "sets log level")
.argument("log-level").binding("log-level"));
options.addOption(Poco::Util::Option("base-dir", "", "base directory for copiers, consecutive copier launches will populate /base-dir/launch_id/* directories")
.argument("base-dir").binding("base-dir"));
options.addOption(Poco::Util::Option("experimental-use-sample-offset", "", "Use SAMPLE OFFSET query instead of cityHash64(PRIMARY KEY) % n == k")
.argument("experimental-use-sample-offset").binding("experimental-use-sample-offset"));
options.addOption(Poco::Util::Option("status", "", "Get for status for current execution").binding("status"));
options.addOption(Poco::Util::Option("max-table-tries", "", "Number of tries for the copy table task")
.argument("max-table-tries").binding("max-table-tries"));
options.addOption(Poco::Util::Option("max-shard-partition-tries", "", "Number of tries for the copy one partition task")
.argument("max-shard-partition-tries").binding("max-shard-partition-tries"));
options.addOption(Poco::Util::Option("max-shard-partition-piece-tries-for-alter", "", "Number of tries for final ALTER ATTACH to destination table")
.argument("max-shard-partition-piece-tries-for-alter").binding("max-shard-partition-piece-tries-for-alter"));
options.addOption(Poco::Util::Option("retry-delay-ms", "", "Delay between task retries")
.argument("retry-delay-ms").binding("retry-delay-ms"));
using Me = std::decay_t<decltype(*this)>;
options.addOption(Poco::Util::Option("help", "", "produce this help message").binding("help")
.callback(Poco::Util::OptionCallback<Me>(this, &Me::handleHelp)));
}
void ClusterCopierApp::mainImpl()
{
/// Status command
{
if (is_status_mode)
{
SharedContextHolder shared_context = Context::createShared();
auto context = Context::createGlobal(shared_context.get());
context->makeGlobalContext();
SCOPE_EXIT_SAFE(context->shutdown());
auto zookeeper = context->getZooKeeper();
auto status_json = zookeeper->get(task_path + "/status");
LOG_INFO(&logger(), "{}", status_json);
std::cout << status_json << std::endl;
context->resetZooKeeper();
return;
}
}
StatusFile status_file(process_path + "/status", StatusFile::write_full_info);
ThreadStatus thread_status;
auto * log = &logger();
LOG_INFO(log, "Starting clickhouse-copier (id {}, host_id {}, path {}, revision {})", process_id, host_id, process_path, ClickHouseRevision::getVersionRevision());
SharedContextHolder shared_context = Context::createShared();
auto context = Context::createGlobal(shared_context.get());
context->makeGlobalContext();
SCOPE_EXIT_SAFE(context->shutdown());
context->setConfig(loaded_config.configuration);
context->setApplicationType(Context::ApplicationType::LOCAL);
context->setPath(process_path + "/");
registerInterpreters();
registerFunctions();
registerAggregateFunctions();
registerTableFunctions();
registerDatabases();
registerStorages();
registerDictionaries();
registerDisks(/* global_skip_access_check= */ true);
registerFormats();
static const std::string default_database = "_local";
DatabaseCatalog::instance().attachDatabase(default_database, std::make_shared<DatabaseMemory>(default_database, context));
context->setCurrentDatabase(default_database);
/// Disable queries logging, since:
/// - There are bits that is not allowed for global context, like adding factories info (for the query_log)
/// - And anyway it is useless for copier.
context->setSetting("log_queries", false);
auto local_context = Context::createCopy(context);
/// Initialize query scope just in case.
CurrentThread::QueryScope query_scope(local_context);
auto copier = std::make_unique<ClusterCopier>(
task_path, host_id, default_database, local_context, log);
copier->setSafeMode(is_safe_mode);
copier->setCopyFaultProbability(copy_fault_probability);
copier->setMoveFaultProbability(move_fault_probability);
copier->setMaxTableTries(max_table_tries);
copier->setMaxShardPartitionTries(max_shard_partition_tries);
copier->setMaxShardPartitionPieceTriesForAlter(max_shard_partition_piece_tries_for_alter);
copier->setRetryDelayMs(retry_delay_ms);
copier->setExperimentalUseSampleOffset(experimental_use_sample_offset);
auto task_file = config().getString("task-file", "");
if (!task_file.empty())
copier->uploadTaskDescription(task_path, task_file, config().getBool("task-upload-force", false));
zkutil::validateZooKeeperConfig(config());
copier->init();
copier->process(ConnectionTimeouts::getTCPTimeoutsWithoutFailover(context->getSettingsRef()));
/// Reset ZooKeeper before removing ClusterCopier.
/// Otherwise zookeeper watch can call callback which use already removed ClusterCopier object.
context->resetZooKeeper();
}
int ClusterCopierApp::main(const std::vector<std::string> &)
{
if (is_help)
return 0;
try
{
mainImpl();
}
catch (...)
{
tryLogCurrentException(&Poco::Logger::root(), __PRETTY_FUNCTION__);
auto code = getCurrentExceptionCode();
return (code) ? code : -1;
}
return 0;
}
}
#pragma GCC diagnostic ignored "-Wunused-function"
#pragma GCC diagnostic ignored "-Wmissing-declarations"
int mainEntryClickHouseClusterCopier(int argc, char ** argv)
{
try
{
DB::ClusterCopierApp app;
return app.run(argc, argv);
}
catch (...)
{
std::cerr << DB::getCurrentExceptionMessage(true) << "\n";
auto code = DB::getCurrentExceptionCode();
return (code) ? code : -1;
}
}

View File

@ -1,99 +0,0 @@
#pragma once
#include <Poco/Util/ServerApplication.h>
#include <Daemon/BaseDaemon.h>
#include "ClusterCopier.h"
/* clickhouse cluster copier util
* Copies table data from one cluster to new tables of another (possibly the same) cluster in a distributed, fault-tolerant manner.
*
* See overview in the docs: docs/en/utils/clickhouse-copier.md
*
* Implementation details:
*
* cluster-copier workers pull each partition of each shard of the source cluster and push it to the destination cluster through a
* Distributed table (to perform data resharding). So, a worker job is a partition of a source shard.
* A job has three states: Active, Finished and Abandoned. Abandoned means that the worker died and did not finish the job.
*
* If an error occurs during the copying (a worker fails or does not finish the INSERT), then the whole partition (on
* all destination servers) should be dropped and refilled. So, the unit of copying is a partition across all destination shards.
* If a failure is detected, a special /is_dirty node is created in ZooKeeper, signalling that other workers copying the same partition
* should stop; after that, a refilling procedure should start.
*
* ZooKeeper task node has the following structure:
* /task/path_root - path passed in --task-path parameter
* /description - contains user-defined XML config of the task
* /task_active_workers - contains ephemeral nodes of all currently active workers, used to implement max_workers limitation
* /server_fqdn#PID_timestamp - cluster-copier worker ID
* ...
* /tables - directory with table tasks
* /cluster.db.table1 - directory of table_hits task
* /partition1 - directory for partition1
* /shards - directory for source cluster shards
* /1 - worker job for the first shard of partition1 of table test.hits
* Contains info about current status (Active or Finished) and worker ID.
* /2
* ...
* /partition_active_workers
* /1 - for each job in /shards a corresponding ephemeral node created in /partition_active_workers
* It is used to detect Abandoned jobs (if there is Active node in /shards and there is no node in
* /partition_active_workers).
* Also, it is used to track active workers in the partition (when we need to refill the partition we do
* not DROP PARTITION while there are active workers)
* /2
* ...
* /is_dirty - the node is set if some worker detected that an error occurred (the INSERT is failed or an Abandoned node is
* detected). If the node appeared workers in this partition should stop and start cleaning and refilling
* partition procedure.
* During this procedure a single 'cleaner' worker is selected. The worker waits for stopping all partition
* workers, removes /shards node, executes DROP PARTITION on each destination node and removes /is_dirty node.
* /cleaner- An ephemeral node used to select 'cleaner' worker. Contains ID of the worker.
* /cluster.db.table2
* ...
*/
namespace DB
{
class ClusterCopierApp : public BaseDaemon
{
public:
void initialize(Poco::Util::Application & self) override;
void handleHelp(const std::string &, const std::string &);
void defineOptions(Poco::Util::OptionSet & options) override;
int main(const std::vector<std::string> &) override;
private:
using Base = BaseDaemon;
void mainImpl();
std::string config_xml_path;
std::string task_path;
std::string log_level = "info";
bool is_safe_mode = false;
bool is_status_mode = false;
double copy_fault_probability = 0.0;
double move_fault_probability = 0.0;
bool is_help = false;
UInt64 max_table_tries = 3;
UInt64 max_shard_partition_tries = 3;
UInt64 max_shard_partition_piece_tries_for_alter = 10;
std::chrono::milliseconds retry_delay_ms{1000};
bool experimental_use_sample_offset{false};
std::string base_dir;
std::string process_path;
std::string process_id;
std::string host_id;
};
}
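The ZooKeeper layout described in the comment above boils down to plain path concatenation; the helpers removed further below (TaskTable::getPartitionPath, ShardPartition::getShardStatusPath and friends) do exactly that. A minimal standalone sketch, where the task root, table ID, partition name and shard number are made-up example values:
#include <iostream>
#include <string>
int main()
{
    // Hypothetical values; the real ones come from --task-path and the task description.
    const std::string task_root = "/clickhouse/copier/task1";  // /task/path_root
    const std::string table_id  = "dst_cluster.db.hits";       // escaped cluster.db.table
    const std::string partition = "201701";
    const int shard_number = 1;
    // Mirrors TaskTable::getPartitionPath and ShardPartition::getShardStatusPath below.
    const std::string partition_path = task_root + "/tables/" + table_id + "/" + partition;
    const std::string shard_status   = partition_path + "/shards/" + std::to_string(shard_number);
    const std::string is_dirty       = partition_path + "/is_dirty";
    std::cout << shard_status << '\n' << is_dirty << '\n';
    // Prints:
    //   /clickhouse/copier/task1/tables/dst_cluster.db.hits/201701/shards/1
    //   /clickhouse/copier/task1/tables/dst_cluster.db.hits/201701/is_dirty
}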

View File

@ -1,22 +0,0 @@
#pragma once
#include <base/types.h>
#include <map>
namespace DB
{
/// Contains info about all shards that contain a partition
struct ClusterPartition
{
double elapsed_time_seconds = 0;
UInt64 bytes_copied = 0;
UInt64 rows_copied = 0;
UInt64 blocks_copied = 0;
UInt64 total_tries = 0;
};
using ClusterPartitions = std::map<String, ClusterPartition, std::greater<>>;
}

View File

@ -1,280 +0,0 @@
#include "Internals.h"
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h>
#include <Processors/Executors/PullingPipelineExecutor.h>
#include <Processors/Transforms/SquashingChunksTransform.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/extractKeyExpressionList.h>
#include <QueryPipeline/QueryPipelineBuilder.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
using ConfigurationPtr = Poco::AutoPtr<Poco::Util::AbstractConfiguration>;
ConfigurationPtr getConfigurationFromXMLString(const std::string & xml_data)
{
std::stringstream ss(xml_data); // STYLE_CHECK_ALLOW_STD_STRING_STREAM
Poco::XML::InputSource input_source{ss};
return {new Poco::Util::XMLConfiguration{&input_source}};
}
String getQuotedTable(const String & database, const String & table)
{
if (database.empty())
return backQuoteIfNeed(table);
return backQuoteIfNeed(database) + "." + backQuoteIfNeed(table);
}
String getQuotedTable(const DatabaseAndTableName & db_and_table)
{
return getQuotedTable(db_and_table.first, db_and_table.second);
}
// Creates an AST representing 'ENGINE = Distributed(cluster, db, table, [sharding_key])'
std::shared_ptr<ASTStorage> createASTStorageDistributed(
const String & cluster_name, const String & database, const String & table,
const ASTPtr & sharding_key_ast)
{
auto args = std::make_shared<ASTExpressionList>();
args->children.emplace_back(std::make_shared<ASTLiteral>(cluster_name));
args->children.emplace_back(std::make_shared<ASTIdentifier>(database));
args->children.emplace_back(std::make_shared<ASTIdentifier>(table));
if (sharding_key_ast)
args->children.emplace_back(sharding_key_ast);
auto engine = std::make_shared<ASTFunction>();
engine->name = "Distributed";
engine->arguments = args;
auto storage = std::make_shared<ASTStorage>();
storage->set(storage->engine, engine);
return storage;
}
Block getBlockWithAllStreamData(QueryPipelineBuilder builder)
{
builder.addTransform(std::make_shared<SquashingChunksTransform>(
builder.getHeader(),
std::numeric_limits<size_t>::max(),
std::numeric_limits<size_t>::max()));
auto cur_pipeline = QueryPipelineBuilder::getPipeline(std::move(builder));
Block block;
PullingPipelineExecutor executor(cur_pipeline);
executor.pull(block);
return block;
}
bool isExtendedDefinitionStorage(const ASTPtr & storage_ast)
{
const auto & storage = storage_ast->as<ASTStorage &>();
return storage.partition_by || storage.order_by || storage.sample_by;
}
ASTPtr extractPartitionKey(const ASTPtr & storage_ast)
{
String storage_str = queryToString(storage_ast);
const auto & storage = storage_ast->as<ASTStorage &>();
const auto & engine = storage.engine->as<ASTFunction &>();
if (!endsWith(engine.name, "MergeTree"))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported engine was specified in {}, only *MergeTree engines are supported", storage_str);
}
if (isExtendedDefinitionStorage(storage_ast))
{
if (storage.partition_by)
return storage.partition_by->clone();
static const char * all = "all";
return std::make_shared<ASTLiteral>(Field(all, strlen(all)));
}
else
{
bool is_replicated = startsWith(engine.name, "Replicated");
size_t min_args = is_replicated ? 3 : 1;
if (!engine.arguments)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected arguments in {}", storage_str);
ASTPtr arguments_ast = engine.arguments->clone();
ASTs & arguments = arguments_ast->children;
if (arguments.size() < min_args)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected at least {} arguments in {}", min_args, storage_str);
ASTPtr & month_arg = is_replicated ? arguments[2] : arguments[1];
return makeASTFunction("toYYYYMM", month_arg->clone());
}
}
ASTPtr extractPrimaryKey(const ASTPtr & storage_ast)
{
String storage_str = queryToString(storage_ast);
const auto & storage = storage_ast->as<ASTStorage &>();
const auto & engine = storage.engine->as<ASTFunction &>();
if (!endsWith(engine.name, "MergeTree"))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported engine was specified in {}, only *MergeTree engines are supported", storage_str);
}
if (!isExtendedDefinitionStorage(storage_ast))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Is not extended deginition storage {} Will be fixed later.", storage_str);
}
if (storage.primary_key)
return storage.primary_key->clone();
return nullptr;
}
ASTPtr extractOrderBy(const ASTPtr & storage_ast)
{
String storage_str = queryToString(storage_ast);
const auto & storage = storage_ast->as<ASTStorage &>();
const auto & engine = storage.engine->as<ASTFunction &>();
if (!endsWith(engine.name, "MergeTree"))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported engine was specified in {}, only *MergeTree engines are supported", storage_str);
}
if (!isExtendedDefinitionStorage(storage_ast))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Is not extended deginition storage {} Will be fixed later.", storage_str);
}
if (storage.order_by)
return storage.order_by->clone();
throw Exception(ErrorCodes::BAD_ARGUMENTS, "ORDER BY cannot be empty");
}
/// Wraps only identifiers with backticks.
std::string wrapIdentifiersWithBackticks(const ASTPtr & root)
{
if (auto identifier = std::dynamic_pointer_cast<ASTIdentifier>(root))
return backQuote(identifier->name());
if (auto function = std::dynamic_pointer_cast<ASTFunction>(root))
return function->name + '(' + wrapIdentifiersWithBackticks(function->arguments) + ')';
if (auto expression_list = std::dynamic_pointer_cast<ASTExpressionList>(root))
{
Names function_arguments(expression_list->children.size());
for (size_t i = 0; i < expression_list->children.size(); ++i)
function_arguments[i] = wrapIdentifiersWithBackticks(expression_list->children[i]);
return boost::algorithm::join(function_arguments, ", ");
}
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Primary key could be represented only as columns or functions from columns.");
}
Names extractPrimaryKeyColumnNames(const ASTPtr & storage_ast)
{
const auto sorting_key_ast = extractOrderBy(storage_ast);
const auto primary_key_ast = extractPrimaryKey(storage_ast);
const auto sorting_key_expr_list = extractKeyExpressionList(sorting_key_ast);
const auto primary_key_expr_list = primary_key_ast
? extractKeyExpressionList(primary_key_ast) : sorting_key_expr_list->clone();
/// Maybe we have to handle the VersionedCollapsing engine separately. But in our case it looks pointless.
size_t primary_key_size = primary_key_expr_list->children.size();
size_t sorting_key_size = sorting_key_expr_list->children.size();
if (primary_key_size > sorting_key_size)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Primary key must be a prefix of the sorting key, but its length: "
"{} is greater than the sorting key length: {}",
primary_key_size, sorting_key_size);
Names primary_key_columns;
NameSet primary_key_columns_set;
for (size_t i = 0; i < sorting_key_size; ++i)
{
/// Column name could be represented as a f_1(f_2(...f_n(column_name))).
/// Each f_i could take one or more parameters.
/// We will wrap identifiers with backticks to allow non-standard identifier names.
String sorting_key_column = sorting_key_expr_list->children[i]->getColumnName();
if (i < primary_key_size)
{
String pk_column = primary_key_expr_list->children[i]->getColumnName();
if (pk_column != sorting_key_column)
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Primary key must be a prefix of the sorting key, "
"but the column in the position {} is {}, not {}", i, sorting_key_column, pk_column);
if (!primary_key_columns_set.emplace(pk_column).second)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Primary key contains duplicate columns");
primary_key_columns.push_back(wrapIdentifiersWithBackticks(primary_key_expr_list->children[i]));
}
}
return primary_key_columns;
}
bool isReplicatedTableEngine(const ASTPtr & storage_ast)
{
const auto & storage = storage_ast->as<ASTStorage &>();
const auto & engine = storage.engine->as<ASTFunction &>();
if (!endsWith(engine.name, "MergeTree"))
{
String storage_str = queryToString(storage_ast);
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported engine was specified in {}, only *MergeTree engines are supported", storage_str);
}
return startsWith(engine.name, "Replicated");
}
ShardPriority getReplicasPriority(const Cluster::Addresses & replicas, const std::string & local_hostname, UInt8 random)
{
ShardPriority res;
if (replicas.empty())
return res;
res.is_remote = 1;
for (const auto & replica : replicas)
{
if (isLocalAddress(DNSResolver::instance().resolveHostAllInOriginOrder(replica.host_name).front()))
{
res.is_remote = 0;
break;
}
}
res.hostname_difference = std::numeric_limits<size_t>::max();
for (const auto & replica : replicas)
{
size_t difference = getHostNamePrefixDistance(local_hostname, replica.host_name);
res.hostname_difference = std::min(difference, res.hostname_difference);
}
res.random = random;
return res;
}
}

View File

@ -1,198 +0,0 @@
#pragma once
#include <chrono>
#include <optional>
#include <Poco/Util/XMLConfiguration.h>
#include <Poco/Logger.h>
#include <Poco/ConsoleChannel.h>
#include <Poco/FormattingChannel.h>
#include <Poco/PatternFormatter.h>
#include <Poco/UUIDGenerator.h>
#include <Poco/Process.h>
#include <Poco/FileChannel.h>
#include <Poco/SplitterChannel.h>
#include <Poco/Util/HelpFormatter.h>
#include <boost/algorithm/string.hpp>
#include <Common/logger_useful.h>
#include <Common/ThreadPool.h>
#include <Common/Exception.h>
#include <Common/ZooKeeper/ZooKeeper.h>
#include <Common/ZooKeeper/KeeperException.h>
#include <base/getFQDNOrHostName.h>
#include <Common/isLocalAddress.h>
#include <Common/typeid_cast.h>
#include <Common/ClickHouseRevision.h>
#include <Common/formatReadable.h>
#include <Common/DNSResolver.h>
#include <Common/CurrentThread.h>
#include <Common/getNumberOfPhysicalCPUCores.h>
#include <Common/ThreadStatus.h>
#include <Client/Connection.h>
#include <Interpreters/Context.h>
#include <Interpreters/Cluster.h>
#include <Interpreters/InterpreterFactory.h>
#include <Interpreters/InterpreterExistsQuery.h>
#include <Interpreters/InterpreterShowCreateQuery.h>
#include <Interpreters/InterpreterDropQuery.h>
#include <Interpreters/InterpreterCreateQuery.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/NestedUtils.h>
#include <Parsers/ParserCreateQuery.h>
#include <Parsers/parseQuery.h>
#include <Parsers/ParserQuery.h>
#include <Parsers/ASTCreateQuery.h>
#include <Parsers/queryToString.h>
#include <Parsers/ASTDropQuery.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTExpressionList.h>
#include <Formats/FormatSettings.h>
#include <QueryPipeline/RemoteQueryExecutor.h>
#include <IO/ConnectionTimeouts.h>
#include <IO/Operators.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadBufferFromFile.h>
#include <Functions/registerFunctions.h>
#include <TableFunctions/registerTableFunctions.h>
#include <AggregateFunctions/registerAggregateFunctions.h>
#include <Storages/registerStorages.h>
#include <Storages/StorageDistributed.h>
#include <Dictionaries/registerDictionaries.h>
#include <Disks/registerDisks.h>
#include <Databases/DatabaseMemory.h>
#include "Aliases.h"
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
ConfigurationPtr getConfigurationFromXMLString(const std::string & xml_data);
String getQuotedTable(const String & database, const String & table);
String getQuotedTable(const DatabaseAndTableName & db_and_table);
enum class TaskState
{
Started = 0,
Finished,
Unknown
};
/// Used to mark status of shard partition tasks
struct TaskStateWithOwner
{
TaskStateWithOwner() = default;
TaskStateWithOwner(TaskState state_, const String & owner_) : state(state_), owner(owner_) {}
TaskState state{TaskState::Unknown};
String owner;
static String getData(TaskState state, const String &owner)
{
return TaskStateWithOwner(state, owner).toString();
}
String toString() const
{
WriteBufferFromOwnString wb;
wb << static_cast<UInt32>(state) << "\n" << escape << owner;
return wb.str();
}
static TaskStateWithOwner fromString(const String & data)
{
ReadBufferFromString rb(data);
TaskStateWithOwner res;
UInt32 state;
rb >> state >> "\n" >> escape >> res.owner;
if (state >= static_cast<int>(TaskState::Unknown))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown state {}", data);
res.state = static_cast<TaskState>(state);
return res;
}
};
struct ShardPriority
{
UInt8 is_remote = 1;
size_t hostname_difference = 0;
UInt8 random = 0;
static bool greaterPriority(const ShardPriority & current, const ShardPriority & other)
{
return std::forward_as_tuple(current.is_remote, current.hostname_difference, current.random)
< std::forward_as_tuple(other.is_remote, other.hostname_difference, other.random);
}
};
/// Execution status of a task.
/// Is used for: partition copying task status, partition piece copying task status, partition moving task status.
enum class TaskStatus
{
Active,
Finished,
Error,
};
struct MultiTransactionInfo
{
int32_t code;
Coordination::Requests requests;
Coordination::Responses responses;
};
// Creates an AST representing 'ENGINE = Distributed(cluster, db, table, [sharding_key])'
std::shared_ptr<ASTStorage> createASTStorageDistributed(
const String & cluster_name, const String & database, const String & table,
const ASTPtr & sharding_key_ast = nullptr);
Block getBlockWithAllStreamData(QueryPipelineBuilder builder);
bool isExtendedDefinitionStorage(const ASTPtr & storage_ast);
ASTPtr extractPartitionKey(const ASTPtr & storage_ast);
/*
* Choosing a Primary Key that Differs from the Sorting Key
* It is possible to specify a primary key (an expression with values that are written in the index file for each mark)
* that is different from the sorting key (an expression for sorting the rows in data parts).
* In this case the primary key expression tuple must be a prefix of the sorting key expression tuple.
* This feature is helpful when using the SummingMergeTree and AggregatingMergeTree table engines.
* In a common case when using these engines, the table has two types of columns: dimensions and measures.
* Typical queries aggregate values of measure columns with arbitrary GROUP BY and filtering by dimensions.
* Because SummingMergeTree and AggregatingMergeTree aggregate rows with the same value of the sorting key,
* it is natural to add all dimensions to it. As a result, the key expression consists of a long list of columns
* and this list must be frequently updated with newly added dimensions.
* In this case it makes sense to leave only a few columns in the primary key that will provide efficient
* range scans and add the remaining dimension columns to the sorting key tuple.
* ALTER of the sorting key is a lightweight operation because when a new column is simultaneously added to
* the table and to the sorting key, existing data parts don't need to be changed.
* Since the old sorting key is a prefix of the new sorting key and there is no data in the newly added column,
* the data is sorted by both the old and new sorting keys at the moment of table modification.
*
* */
ASTPtr extractPrimaryKey(const ASTPtr & storage_ast);
ASTPtr extractOrderBy(const ASTPtr & storage_ast);
Names extractPrimaryKeyColumnNames(const ASTPtr & storage_ast);
bool isReplicatedTableEngine(const ASTPtr & storage_ast);
ShardPriority getReplicasPriority(const Cluster::Addresses & replicas, const std::string & local_hostname, UInt8 random);
}
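As a rough usage sketch of the declarations above (not part of the original sources; it assumes the code sits inside namespace DB with this header included): parse a *MergeTree storage definition the same way TaskTable's constructor does, then pull out its keys. The engine string here is only an example.
/// Sketch only: assumes namespace DB and the includes/declarations of this header.
static void inspectStorageKeys()
{
    const String engine_str =
        "ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/hits', '{replica}') "
        "PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate)";
    ParserStorage parser_storage{ParserStorage::TABLE_ENGINE};
    ASTPtr storage_ast = parseQuery(parser_storage, engine_str, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH);
    ASTPtr partition_key = extractPartitionKey(storage_ast);
    ASTPtr order_by = extractOrderBy(storage_ast);
    Names primary_key_columns = extractPrimaryKeyColumnNames(storage_ast);
    bool is_replicated = isReplicatedTableEngine(storage_ast);
    /// Expected results for this example:
    ///   partition_key       -> toYYYYMM(EventDate)
    ///   primary_key_columns -> `CounterID`, `EventDate` (the primary key defaults to the full sorting key)
    ///   is_replicated       -> true
    (void)partition_key; (void)order_by; (void)primary_key_columns; (void)is_replicated;
}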

View File

@ -1,70 +0,0 @@
#include "ShardPartition.h"
#include "TaskShard.h"
#include "TaskTable.h"
namespace DB
{
ShardPartition::ShardPartition(TaskShard & parent, String name_quoted_, size_t number_of_splits)
: task_shard(parent)
, name(std::move(name_quoted_))
{
pieces.reserve(number_of_splits);
}
String ShardPartition::getPartitionCleanStartPath() const
{
return getPartitionPath() + "/clean_start";
}
String ShardPartition::getPartitionPieceCleanStartPath(size_t current_piece_number) const
{
assert(current_piece_number < task_shard.task_table.number_of_splits);
return getPartitionPiecePath(current_piece_number) + "/clean_start";
}
String ShardPartition::getPartitionPath() const
{
return task_shard.task_table.getPartitionPath(name);
}
String ShardPartition::getPartitionPiecePath(size_t current_piece_number) const
{
assert(current_piece_number < task_shard.task_table.number_of_splits);
return task_shard.task_table.getPartitionPiecePath(name, current_piece_number);
}
String ShardPartition::getShardStatusPath() const
{
// schema: /<root...>/tables/<table>/<partition>/shards/<shard>
// e.g. /root/table_test.hits/201701/shards/1
return getPartitionShardsPath() + "/" + toString(task_shard.numberInCluster());
}
String ShardPartition::getPartitionShardsPath() const
{
return getPartitionPath() + "/shards";
}
String ShardPartition::getPartitionActiveWorkersPath() const
{
return getPartitionPath() + "/partition_active_workers";
}
String ShardPartition::getActiveWorkerPath() const
{
return getPartitionActiveWorkersPath() + "/" + toString(task_shard.numberInCluster());
}
String ShardPartition::getCommonPartitionIsDirtyPath() const
{
return getPartitionPath() + "/is_dirty";
}
String ShardPartition::getCommonPartitionIsCleanedPath() const
{
return getCommonPartitionIsDirtyPath() + "/cleaned";
}
}

View File

@ -1,54 +0,0 @@
#pragma once
#include "ShardPartitionPiece.h"
#include <base/types.h>
#include <map>
namespace DB
{
struct TaskShard;
/// Describes a destination partition of a single shard.
/// In short, once we have discovered which shards contain the partition currently being processed,
/// this class describes that partition (by name) as stored on the shard (parent).
struct ShardPartition
{
ShardPartition(TaskShard &parent, String name_quoted_, size_t number_of_splits = 10);
String getPartitionPath() const;
String getPartitionPiecePath(size_t current_piece_number) const;
String getPartitionCleanStartPath() const;
String getPartitionPieceCleanStartPath(size_t current_piece_number) const;
String getCommonPartitionIsDirtyPath() const;
String getCommonPartitionIsCleanedPath() const;
String getPartitionActiveWorkersPath() const;
String getActiveWorkerPath() const;
String getPartitionShardsPath() const;
String getShardStatusPath() const;
/// What partition pieces are present in the current shard.
/// FYI: a piece is the part of a partition whose hash modulo number_of_splits equals a concrete constant (less than number_of_splits, obviously).
/// For example: SELECT ... FROM ... WHERE partition = current_partition AND cityHash64(*) % number_of_splits == const;
/// Absent pieces have the field is_absent_piece set to true.
PartitionPieces pieces;
TaskShard & task_shard;
String name;
};
using TasksPartition = std::map<String, ShardPartition, std::greater<>>;
}
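A tiny illustration of the piece assignment mentioned in the comment above (a sketch only; std::hash stands in for the cityHash64-of-primary-key expression used by the copier):
#include <cstddef>
#include <functional>
#include <string>
/// Every row lands in exactly one of number_of_splits pieces.
static size_t pieceNumberFor(const std::string & primary_key_value, size_t number_of_splits)
{
    return std::hash<std::string>{}(primary_key_value) % number_of_splits;
}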

View File

@ -1,64 +0,0 @@
#include "ShardPartitionPiece.h"
#include "ShardPartition.h"
#include "TaskShard.h"
#include <IO/WriteHelpers.h>
namespace DB
{
ShardPartitionPiece::ShardPartitionPiece(ShardPartition & parent, size_t current_piece_number_, bool is_present_piece_)
: is_absent_piece(!is_present_piece_)
, current_piece_number(current_piece_number_)
, shard_partition(parent)
{
}
String ShardPartitionPiece::getPartitionPiecePath() const
{
return shard_partition.getPartitionPath() + "/piece_" + toString(current_piece_number);
}
String ShardPartitionPiece::getPartitionPieceCleanStartPath() const
{
return getPartitionPiecePath() + "/clean_start";
}
String ShardPartitionPiece::getPartitionPieceIsDirtyPath() const
{
return getPartitionPiecePath() + "/is_dirty";
}
String ShardPartitionPiece::getPartitionPieceIsCleanedPath() const
{
return getPartitionPieceIsDirtyPath() + "/cleaned";
}
String ShardPartitionPiece::getPartitionPieceActiveWorkersPath() const
{
return getPartitionPiecePath() + "/partition_piece_active_workers";
}
String ShardPartitionPiece::getActiveWorkerPath() const
{
return getPartitionPieceActiveWorkersPath() + "/" + toString(shard_partition.task_shard.numberInCluster());
}
/// On which shards the current partition is present.
String ShardPartitionPiece::getPartitionPieceShardsPath() const
{
return getPartitionPiecePath() + "/shards";
}
String ShardPartitionPiece::getShardStatusPath() const
{
return getPartitionPieceShardsPath() + "/" + toString(shard_partition.task_shard.numberInCluster());
}
String ShardPartitionPiece::getPartitionPieceCleanerPath() const
{
return getPartitionPieceIsDirtyPath() + "/cleaner";
}
}

View File

@ -1,43 +0,0 @@
#pragma once
#include <base/types.h>
#include <vector>
namespace DB
{
struct ShardPartition;
struct ShardPartitionPiece
{
ShardPartitionPiece(ShardPartition & parent, size_t current_piece_number_, bool is_present_piece_);
String getPartitionPiecePath() const;
String getPartitionPieceCleanStartPath() const;
String getPartitionPieceIsDirtyPath() const;
String getPartitionPieceIsCleanedPath() const;
String getPartitionPieceActiveWorkersPath() const;
String getActiveWorkerPath() const ;
/// On which shards the current partition is present.
String getPartitionPieceShardsPath() const;
String getShardStatusPath() const;
String getPartitionPieceCleanerPath() const;
bool is_absent_piece;
const size_t current_piece_number;
ShardPartition & shard_partition;
};
using PartitionPieces = std::vector<ShardPartitionPiece>;
}

View File

@ -1,48 +0,0 @@
#include "StatusAccumulator.h"
#include <Poco/JSON/Parser.h>
#include <Poco/JSON/JSON.h>
#include <Poco/JSON/Object.h>
#include <Poco/JSON/Stringifier.h>
#include <iostream>
namespace DB
{
StatusAccumulator::MapPtr StatusAccumulator::fromJSON(String state_json)
{
Poco::JSON::Parser parser;
auto state = parser.parse(state_json).extract<Poco::JSON::Object::Ptr>();
MapPtr result_ptr = std::make_shared<Map>();
for (const auto & table_name : state->getNames())
{
auto table_status_json = state->getValue<String>(table_name);
auto table_status = parser.parse(table_status_json).extract<Poco::JSON::Object::Ptr>();
/// Map entry will be created if it is absent
auto & map_table_status = (*result_ptr)[table_name];
map_table_status.all_partitions_count += table_status->getValue<size_t>("all_partitions_count");
map_table_status.processed_partitions_count += table_status->getValue<size_t>("processed_partitions_count");
}
return result_ptr;
}
String StatusAccumulator::serializeToJSON(MapPtr statuses)
{
Poco::JSON::Object result_json;
for (const auto & [table_name, table_status] : *statuses)
{
Poco::JSON::Object status_json;
status_json.set("all_partitions_count", table_status.all_partitions_count);
status_json.set("processed_partitions_count", table_status.processed_partitions_count);
result_json.set(table_name, status_json);
}
std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
oss.exceptions(std::ios::failbit);
Poco::JSON::Stringifier::stringify(result_json, oss);
auto result = oss.str();
return result;
}
}
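To make the JSON shape concrete (the table name and counts below are invented for this example): fromJSON expects a top-level object keyed by table name whose values are JSON-encoded strings carrying all_partitions_count and processed_partitions_count, while serializeToJSON writes the accumulated map back as nested objects. A hedged sketch:
/// Sketch only: assumes namespace DB and the StatusAccumulator declaration above.
static void statusAccumulatorExample()
{
    /// Note that the per-table value is itself a JSON string, hence the escaped quotes.
    String snapshot = R"({"dst.hits": "{\"all_partitions_count\": 12, \"processed_partitions_count\": 7}"})";
    StatusAccumulator::MapPtr statuses = StatusAccumulator::fromJSON(snapshot);
    String merged = StatusAccumulator::serializeToJSON(statuses);
    /// merged is now: {"dst.hits":{"all_partitions_count":12,"processed_partitions_count":7}}
    (void)merged;
}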

View File

@ -1,27 +0,0 @@
#pragma once
#include <base/types.h>
#include <memory>
#include <unordered_map>
namespace DB
{
class StatusAccumulator
{
public:
struct TableStatus
{
size_t all_partitions_count;
size_t processed_partitions_count;
};
using Map = std::unordered_map<String, TableStatus>;
using MapPtr = std::shared_ptr<Map>;
static MapPtr fromJSON(String state_json);
static String serializeToJSON(MapPtr statuses);
};
}

View File

@ -1,74 +0,0 @@
#include "TaskCluster.h"
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
TaskCluster::TaskCluster(const String & task_zookeeper_path_, const String & default_local_database_)
: task_zookeeper_path(task_zookeeper_path_)
, default_local_database(default_local_database_)
{}
void DB::TaskCluster::loadTasks(const Poco::Util::AbstractConfiguration & config, const String & base_key)
{
String prefix = base_key.empty() ? "" : base_key + ".";
clusters_prefix = prefix + "remote_servers";
if (!config.has(clusters_prefix))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "You should specify list of clusters in {}", clusters_prefix);
Poco::Util::AbstractConfiguration::Keys tables_keys;
config.keys(prefix + "tables", tables_keys);
for (const auto & table_key : tables_keys)
{
table_tasks.emplace_back(*this, config, prefix + "tables", table_key);
}
}
void DB::TaskCluster::reloadSettings(const Poco::Util::AbstractConfiguration & config, const String & base_key)
{
String prefix = base_key.empty() ? "" : base_key + ".";
max_workers = config.getUInt64(prefix + "max_workers");
settings_common = Settings();
if (config.has(prefix + "settings"))
settings_common.loadSettingsFromConfig(prefix + "settings", config);
settings_common.prefer_localhost_replica = false;
settings_pull = settings_common;
if (config.has(prefix + "settings_pull"))
settings_pull.loadSettingsFromConfig(prefix + "settings_pull", config);
settings_push = settings_common;
if (config.has(prefix + "settings_push"))
settings_push.loadSettingsFromConfig(prefix + "settings_push", config);
auto set_default_value = [] (auto && setting, auto && default_value)
{
setting = setting.changed ? setting.value : default_value;
};
/// Override important settings
settings_pull.readonly = 1;
settings_pull.prefer_localhost_replica = false;
settings_push.distributed_foreground_insert = true;
settings_push.prefer_localhost_replica = false;
set_default_value(settings_pull.load_balancing, LoadBalancing::NEAREST_HOSTNAME);
set_default_value(settings_pull.max_threads, 1);
set_default_value(settings_pull.max_block_size, 8192UL);
set_default_value(settings_pull.preferred_block_size_bytes, 0);
set_default_value(settings_push.distributed_background_insert_timeout, 0);
set_default_value(settings_push.alter_sync, 2);
}
}

View File

@ -1,51 +0,0 @@
#pragma once
#include "TaskTable.h"
#include <Core/Settings.h>
#include <base/types.h>
#include <Poco/Util/AbstractConfiguration.h>
#include <pcg_random.hpp>
namespace DB
{
struct TaskCluster
{
TaskCluster(const String & task_zookeeper_path_, const String & default_local_database_);
void loadTasks(const Poco::Util::AbstractConfiguration & config, const String & base_key = "");
/// Set (or update) settings and max_workers param
void reloadSettings(const Poco::Util::AbstractConfiguration & config, const String & base_key = "");
/// Base node for all tasks. Its structure:
/// workers/ - directory with active workers (their number is less than or equal to max_workers)
/// description - node with task configuration
/// table_table1/ - directories with per-partition copying status
String task_zookeeper_path;
/// Database used to create temporary Distributed tables
String default_local_database;
/// Limits number of simultaneous workers
UInt64 max_workers = 0;
/// Base settings for pull and push
Settings settings_common;
/// Settings used to fetch data
Settings settings_pull;
/// Settings used to insert data
Settings settings_push;
String clusters_prefix;
/// Subtasks
TasksTable table_tasks;
pcg64 random_engine;
};
}

View File

@ -1,37 +0,0 @@
#include "TaskShard.h"
#include "TaskTable.h"
namespace DB
{
TaskShard::TaskShard(TaskTable & parent, const Cluster::ShardInfo & info_)
: task_table(parent)
, info(info_)
{
list_of_split_tables_on_shard.assign(task_table.number_of_splits, DatabaseAndTableName());
}
UInt32 TaskShard::numberInCluster() const
{
return info.shard_num;
}
UInt32 TaskShard::indexInCluster() const
{
return info.shard_num - 1;
}
String DB::TaskShard::getDescription() const
{
return fmt::format("N{} (having a replica {}, pull table {} of cluster {}",
numberInCluster(), getHostNameExample(), getQuotedTable(task_table.table_pull), task_table.cluster_pull_name);
}
String DB::TaskShard::getHostNameExample() const
{
const auto & replicas = task_table.cluster_pull->getShardsAddresses().at(indexInCluster());
return replicas.at(0).readableString();
}
}

View File

@ -1,56 +0,0 @@
#pragma once
#include "Aliases.h"
#include "Internals.h"
#include "ClusterPartition.h"
#include "ShardPartition.h"
namespace DB
{
struct TaskTable;
struct TaskShard
{
TaskShard(TaskTable & parent, const Cluster::ShardInfo & info_);
TaskTable & task_table;
Cluster::ShardInfo info;
UInt32 numberInCluster() const;
UInt32 indexInCluster() const;
String getDescription() const;
String getHostNameExample() const;
/// Used to sort shards by their proximity
ShardPriority priority;
/// Column with unique destination partitions (computed from engine_push_partition_key expr.) in the shard
ColumnWithTypeAndName partition_key_column;
/// There is a task for each destination partition
TasksPartition partition_tasks;
/// Which partitions have been checked for existence
/// If some partition from this list exists, it is in partition_tasks
std::set<String> checked_partitions;
/// Last CREATE TABLE query of the table of the shard
ASTPtr current_pull_table_create_query;
ASTPtr current_push_table_create_query;
/// Internal distributed tables
DatabaseAndTableName table_read_shard;
DatabaseAndTableName main_table_split_shard;
ListOfDatabasesAndTableNames list_of_split_tables_on_shard;
};
using TaskShardPtr = std::shared_ptr<TaskShard>;
using TasksShard = std::vector<TaskShardPtr>;
}

View File

@ -1,222 +0,0 @@
#include "TaskTable.h"
#include "ClusterPartition.h"
#include "TaskCluster.h"
#include <Parsers/ASTFunction.h>
#include <Common/escapeForFileName.h>
#include <boost/algorithm/string/join.hpp>
namespace DB
{
namespace ErrorCodes
{
extern const int UNKNOWN_ELEMENT_IN_CONFIG;
extern const int LOGICAL_ERROR;
}
TaskTable::TaskTable(TaskCluster & parent, const Poco::Util::AbstractConfiguration & config,
const String & prefix_, const String & table_key)
: task_cluster(parent)
{
String table_prefix = prefix_ + "." + table_key + ".";
name_in_config = table_key;
number_of_splits = config.getUInt64(table_prefix + "number_of_splits", 3);
allow_to_copy_alias_and_materialized_columns = config.getBool(table_prefix + "allow_to_copy_alias_and_materialized_columns", false);
allow_to_drop_target_partitions = config.getBool(table_prefix + "allow_to_drop_target_partitions", false);
cluster_pull_name = config.getString(table_prefix + "cluster_pull");
cluster_push_name = config.getString(table_prefix + "cluster_push");
table_pull.first = config.getString(table_prefix + "database_pull");
table_pull.second = config.getString(table_prefix + "table_pull");
table_push.first = config.getString(table_prefix + "database_push");
table_push.second = config.getString(table_prefix + "table_push");
/// Used as node name in ZooKeeper
table_id = escapeForFileName(cluster_push_name)
+ "." + escapeForFileName(table_push.first)
+ "." + escapeForFileName(table_push.second);
engine_push_str = config.getString(table_prefix + "engine", "rand()");
{
ParserStorage parser_storage{ParserStorage::TABLE_ENGINE};
engine_push_ast = parseQuery(parser_storage, engine_push_str, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH);
engine_push_partition_key_ast = extractPartitionKey(engine_push_ast);
primary_key_comma_separated = boost::algorithm::join(extractPrimaryKeyColumnNames(engine_push_ast), ", ");
is_replicated_table = isReplicatedTableEngine(engine_push_ast);
}
sharding_key_str = config.getString(table_prefix + "sharding_key");
auxiliary_engine_split_asts.reserve(number_of_splits);
{
ParserExpressionWithOptionalAlias parser_expression(false);
sharding_key_ast = parseQuery(parser_expression, sharding_key_str, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH);
main_engine_split_ast = createASTStorageDistributed(cluster_push_name, table_push.first, table_push.second,
sharding_key_ast);
for (const auto piece_number : collections::range(0, number_of_splits))
{
auxiliary_engine_split_asts.emplace_back
(
createASTStorageDistributed(cluster_push_name, table_push.first,
table_push.second + "_piece_" + toString(piece_number), sharding_key_ast)
);
}
}
where_condition_str = config.getString(table_prefix + "where_condition", "");
if (!where_condition_str.empty())
{
ParserExpressionWithOptionalAlias parser_expression(false);
where_condition_ast = parseQuery(parser_expression, where_condition_str, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH);
// Will use canonical expression form
where_condition_str = queryToString(where_condition_ast);
}
String enabled_partitions_prefix = table_prefix + "enabled_partitions";
has_enabled_partitions = config.has(enabled_partitions_prefix);
if (has_enabled_partitions)
{
Strings keys;
config.keys(enabled_partitions_prefix, keys);
if (keys.empty())
{
/// Parse the list of partitions from a space-separated string
String partitions_str = config.getString(table_prefix + "enabled_partitions");
boost::trim_if(partitions_str, isWhitespaceASCII);
boost::split(enabled_partitions, partitions_str, isWhitespaceASCII, boost::token_compress_on);
}
else
{
/// Parse sequence of <partition>...</partition>
for (const String &key : keys)
{
if (!startsWith(key, "partition"))
throw Exception(ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG, "Unknown key {} in {}", key, enabled_partitions_prefix);
enabled_partitions.emplace_back(config.getString(enabled_partitions_prefix + "." + key));
}
}
std::copy(enabled_partitions.begin(), enabled_partitions.end(), std::inserter(enabled_partitions_set, enabled_partitions_set.begin()));
}
}
String TaskTable::getPartitionPath(const String & partition_name) const
{
return task_cluster.task_zookeeper_path // root
+ "/tables/" + table_id // tables/dst_cluster.merge.hits
+ "/" + escapeForFileName(partition_name); // 201701
}
String TaskTable::getPartitionAttachIsActivePath(const String & partition_name) const
{
return getPartitionPath(partition_name) + "/attach_active";
}
String TaskTable::getPartitionAttachIsDonePath(const String & partition_name) const
{
return getPartitionPath(partition_name) + "/attach_is_done";
}
String TaskTable::getPartitionPiecePath(const String & partition_name, size_t piece_number) const
{
assert(piece_number < number_of_splits);
return getPartitionPath(partition_name) + "/piece_" + toString(piece_number); // 1...number_of_splits
}
String TaskTable::getCertainPartitionIsDirtyPath(const String &partition_name) const
{
return getPartitionPath(partition_name) + "/is_dirty";
}
String TaskTable::getCertainPartitionPieceIsDirtyPath(const String & partition_name, const size_t piece_number) const
{
return getPartitionPiecePath(partition_name, piece_number) + "/is_dirty";
}
String TaskTable::getCertainPartitionIsCleanedPath(const String & partition_name) const
{
return getCertainPartitionIsDirtyPath(partition_name) + "/cleaned";
}
String TaskTable::getCertainPartitionPieceIsCleanedPath(const String & partition_name, const size_t piece_number) const
{
return getCertainPartitionPieceIsDirtyPath(partition_name, piece_number) + "/cleaned";
}
String TaskTable::getCertainPartitionTaskStatusPath(const String & partition_name) const
{
return getPartitionPath(partition_name) + "/shards";
}
String TaskTable::getCertainPartitionPieceTaskStatusPath(const String & partition_name, const size_t piece_number) const
{
return getPartitionPiecePath(partition_name, piece_number) + "/shards";
}
bool TaskTable::isReplicatedTable() const
{
return is_replicated_table;
}
String TaskTable::getStatusAllPartitionCount() const
{
return task_cluster.task_zookeeper_path + "/status/all_partitions_count";
}
String TaskTable::getStatusProcessedPartitionsCount() const
{
return task_cluster.task_zookeeper_path + "/status/processed_partitions_count";
}
ASTPtr TaskTable::rewriteReplicatedCreateQueryToPlain() const
{
ASTPtr prev_engine_push_ast = engine_push_ast->clone();
auto & new_storage_ast = prev_engine_push_ast->as<ASTStorage &>();
auto & new_engine_ast = new_storage_ast.engine->as<ASTFunction &>();
/// Remove "Replicated" from name
new_engine_ast.name = new_engine_ast.name.substr(10);
if (new_engine_ast.arguments)
{
auto & replicated_table_arguments = new_engine_ast.arguments->children;
/// In some cases of Atomic database engine usage ReplicatedMergeTree tables
/// could be created without arguments.
if (!replicated_table_arguments.empty())
{
/// Delete first two arguments of Replicated...MergeTree() table.
replicated_table_arguments.erase(replicated_table_arguments.begin());
replicated_table_arguments.erase(replicated_table_arguments.begin());
}
}
return new_storage_ast.clone();
}
ClusterPartition & TaskTable::getClusterPartition(const String & partition_name)
{
auto it = cluster_partitions.find(partition_name);
if (it == cluster_partitions.end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "There are no cluster partition {} in {}", partition_name, table_id);
return it->second;
}
}

View File

@ -1,173 +0,0 @@
#pragma once
#include "Aliases.h"
#include "TaskShard.h"
namespace DB
{
struct ClusterPartition;
struct TaskCluster;
struct TaskTable
{
TaskTable(TaskCluster & parent, const Poco::Util::AbstractConfiguration & config, const String & prefix, const String & table_key);
TaskCluster & task_cluster;
/// These functions are used in checkPartitionIsDone() or checkPartitionPieceIsDone().
/// They are implemented here to avoid calling task_table.tasks_shard[partition_name].second.pieces[current_piece_number] etc.
String getPartitionPath(const String & partition_name) const;
String getPartitionAttachIsActivePath(const String & partition_name) const;
String getPartitionAttachIsDonePath(const String & partition_name) const;
String getPartitionPiecePath(const String & partition_name, size_t piece_number) const;
String getCertainPartitionIsDirtyPath(const String & partition_name) const;
String getCertainPartitionPieceIsDirtyPath(const String & partition_name, size_t piece_number) const;
String getCertainPartitionIsCleanedPath(const String & partition_name) const;
String getCertainPartitionPieceIsCleanedPath(const String & partition_name, size_t piece_number) const;
String getCertainPartitionTaskStatusPath(const String & partition_name) const;
String getCertainPartitionPieceTaskStatusPath(const String & partition_name, size_t piece_number) const;
bool isReplicatedTable() const;
/// These nodes are used for check-status option
String getStatusAllPartitionCount() const;
String getStatusProcessedPartitionsCount() const;
/// Partitions will be split into number-of-splits pieces.
/// Each piece will be copied independently. (10 by default)
size_t number_of_splits;
bool allow_to_copy_alias_and_materialized_columns{false};
bool allow_to_drop_target_partitions{false};
String name_in_config;
/// Used as task ID
String table_id;
/// Column names in primary key
String primary_key_comma_separated;
/// Source cluster and table
String cluster_pull_name;
DatabaseAndTableName table_pull;
/// Destination cluster and table
String cluster_push_name;
DatabaseAndTableName table_push;
/// Storage of destination table
/// (tables that are stored on each shard of target cluster)
String engine_push_str;
ASTPtr engine_push_ast;
ASTPtr engine_push_partition_key_ast;
/// First argument of Replicated...MergeTree()
String engine_push_zk_path;
bool is_replicated_table;
ASTPtr rewriteReplicatedCreateQueryToPlain() const;
/*
* A Distributed table definition used to split data
* Distributed table will be created on each shard of default
* cluster to perform data copying and resharding
* */
String sharding_key_str;
ASTPtr sharding_key_ast;
ASTPtr main_engine_split_ast;
/*
* To copy a partition piece from one cluster to another we have to use a Distributed table.
* Since a separate table (engine_push) is used for each partition piece,
* we have to use many Distributed tables.
* */
ASTs auxiliary_engine_split_asts;
/// Additional WHERE expression to filter input data
String where_condition_str;
ASTPtr where_condition_ast;
/// Resolved clusters
ClusterPtr cluster_pull;
ClusterPtr cluster_push;
/// Filter partitions that should be copied
bool has_enabled_partitions = false;
Strings enabled_partitions;
NameSet enabled_partitions_set;
/**
* Prioritized list of shards
* all_shards contains information about all shards in the table.
* So we have to check whether a particular shard has the current partition or not while processing.
*/
TasksShard all_shards;
TasksShard local_shards;
/// All partitions of the current table.
ClusterPartitions cluster_partitions;
NameSet finished_cluster_partitions;
/// Partition names to process in user-specified order
Strings ordered_partition_names;
ClusterPartition & getClusterPartition(const String & partition_name);
Stopwatch watch;
UInt64 bytes_copied = 0;
UInt64 rows_copied = 0;
template <typename RandomEngine>
void initShards(RandomEngine &&random_engine);
};
using TasksTable = std::list<TaskTable>;
template<typename RandomEngine>
inline void TaskTable::initShards(RandomEngine && random_engine)
{
const String & fqdn_name = getFQDNOrHostName();
std::uniform_int_distribution<uint8_t> get_urand(0, std::numeric_limits<UInt8>::max());
// Compute the priority
for (const auto & shard_info : cluster_pull->getShardsInfo())
{
TaskShardPtr task_shard = std::make_shared<TaskShard>(*this, shard_info);
const auto & replicas = cluster_pull->getShardsAddresses().at(task_shard->indexInCluster());
task_shard->priority = getReplicasPriority(replicas, fqdn_name, get_urand(random_engine));
all_shards.emplace_back(task_shard);
}
// Sort by priority
std::sort(all_shards.begin(), all_shards.end(),
[](const TaskShardPtr & lhs, const TaskShardPtr & rhs)
{
return ShardPriority::greaterPriority(lhs->priority, rhs->priority);
});
// Cut local shards
auto it_first_remote = std::lower_bound(all_shards.begin(), all_shards.end(), 1,
[](const TaskShardPtr & lhs, UInt8 is_remote)
{
return lhs->priority.is_remote < is_remote;
});
local_shards.assign(all_shards.begin(), it_first_remote);
}
}

View File

@ -1,221 +0,0 @@
#pragma once
/** Allows to compare two incremental counters of type UInt32 in presence of possible overflow.
* We assume that we compare values that are not too far away.
* For example, when we increment 0xFFFFFFFF, we get 0. So, 0xFFFFFFFF is less than 0.
*/
class WrappingUInt32
{
public:
UInt32 value;
explicit WrappingUInt32(UInt32 _value)
: value(_value)
{}
bool operator<(const WrappingUInt32 & other) const
{
return value != other.value && *this <= other;
}
bool operator<=(const WrappingUInt32 & other) const
{
const UInt32 HALF = static_cast<UInt32>(1) << 31;
return (value <= other.value && other.value - value < HALF)
|| (value > other.value && value - other.value > HALF);
}
bool operator==(const WrappingUInt32 & other) const
{
return value == other.value;
}
};
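A quick sanity check of the wrap-around ordering described above; a standalone sketch that assumes the WrappingUInt32 class is visible and UInt32 is a 32-bit unsigned type.
#include <cassert>
static void wrappingUInt32Demo()
{
    assert(WrappingUInt32(0xFFFFFFFF) < WrappingUInt32(0));      /// wrapped forward by one, so it is "less"
    assert(WrappingUInt32(5) < WrappingUInt32(10));              /// ordinary case
    assert(!(WrappingUInt32(0) < WrappingUInt32(0x80000001)));   /// more than half the range apart: the ordering flips
}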
/** Conforming Zxid definition.
* cf. https://github.com/apache/zookeeper/blob/631d1b284f0edb1c4f6b0fb221bf2428aec71aaa/zookeeper-docs/src/main/resources/markdown/zookeeperInternals.md#guarantees-properties-and-definitions
*
* But it is better to read this: https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html
*
* Actually here is the definition of Zxid.
* Every change to the ZooKeeper state receives a stamp in the form of a zxid (ZooKeeper Transaction Id).
* This exposes the total ordering of all changes to ZooKeeper. Each change will have a unique zxid
* and if zxid1 is smaller than zxid2 then zxid1 happened before zxid2.
*/
class Zxid
{
public:
WrappingUInt32 epoch;
WrappingUInt32 counter;
explicit Zxid(UInt64 _zxid)
: epoch(static_cast<UInt32>(_zxid >> 32))
, counter(static_cast<UInt32>(_zxid))
{}
bool operator<=(const Zxid & other) const
{
return (epoch < other.epoch)
|| (epoch == other.epoch && counter <= other.counter);
}
bool operator==(const Zxid & other) const
{
return epoch == other.epoch && counter == other.counter;
}
};
/* When multiple ClusterCopiers discover that the target partition is not empty,
* they will attempt to clean up this partition before proceeding to copying.
*
* Instead of purging is_dirty, the history of cleaning work is preserved and partition hygiene is established
* based on a happens-before relation between the events.
* This relation is encoded by LogicalClock based on the mzxid of the is_dirty ZNode and is_dirty/cleaned.
* The fact of the partition hygiene is encoded by CleanStateClock.
*
* For reference, here is what mzxid means:
*
* ZooKeeper Stat Structure:
* The Stat structure for each znode in ZooKeeper is made up of the following fields:
*
* -- czxid
* The zxid of the change that caused this znode to be created.
*
* -- mzxid
* The zxid of the change that last modified this znode.
*
* -- ctime
* The time in milliseconds from epoch when this znode was created.
*
* -- mtime
* The time in milliseconds from epoch when this znode was last modified.
*
* -- version
* The number of changes to the data of this znode.
*
* -- cversion
* The number of changes to the children of this znode.
*
* -- aversion
* The number of changes to the ACL of this znode.
*
* -- ephemeralOwner
* The session id of the owner of this znode if the znode is an ephemeral node.
* If it is not an ephemeral node, it will be zero.
*
* -- dataLength
* The length of the data field of this znode.
*
* -- numChildren
* The number of children of this znode.
* */
class LogicalClock
{
public:
std::optional<Zxid> zxid;
LogicalClock() = default;
explicit LogicalClock(UInt64 _zxid)
: zxid(_zxid)
{}
bool hasHappened() const
{
return bool(zxid);
}
/// happens-before relation with a reasonable time bound
bool happensBefore(const LogicalClock & other) const
{
return !zxid
|| (other.zxid && *zxid <= *other.zxid);
}
bool operator<=(const LogicalClock & other) const
{
return happensBefore(other);
}
/// strict equality check
bool operator==(const LogicalClock & other) const
{
return zxid == other.zxid;
}
};
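To illustrate the zxid decomposition and the happens-before relation that CleanStateClock below relies on (the zxid values here are made up): an unset clock precedes everything, and two set clocks compare by epoch and counter.
/// Sketch only: assumes the Zxid and LogicalClock classes above and <cassert>.
static void logicalClockDemo()
{
    LogicalClock never;                             /// no zxid: the event has not happened yet
    LogicalClock discovery(0x0000000500000010ULL);  /// epoch 5, counter 0x10
    LogicalClock cleaned(0x0000000500000042ULL);    /// epoch 5, counter 0x42
    assert(never.happensBefore(discovery));         /// "not happened" precedes everything
    assert(discovery.happensBefore(cleaned));       /// the smaller zxid happened first
    assert(!cleaned.happensBefore(discovery));
    /// This is the check behind CleanStateClock::is_clean():
    /// discovery_zxid <= clean_state_zxid means the last cleaning covers the last discovery of dirt.
}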
class CleanStateClock
{
public:
LogicalClock discovery_zxid;
std::optional<UInt32> discovery_version;
LogicalClock clean_state_zxid;
std::optional<UInt32> clean_state_version;
std::shared_ptr<std::atomic_bool> stale;
bool is_clean() const
{
return !is_stale()
&& (!discovery_zxid.hasHappened() || (clean_state_zxid.hasHappened() && discovery_zxid <= clean_state_zxid));
}
bool is_stale() const
{
return stale->load();
}
CleanStateClock(
const zkutil::ZooKeeperPtr & zookeeper,
const String & discovery_path,
const String & clean_state_path)
: stale(std::make_shared<std::atomic_bool>(false))
{
Coordination::Stat stat{};
String _some_data;
auto watch_callback =
[my_stale = stale] (const Coordination::WatchResponse & rsp)
{
auto logger = getLogger("ClusterCopier");
if (rsp.error == Coordination::Error::ZOK)
{
switch (rsp.type) /// NOLINT(bugprone-switch-missing-default-case)
{
case Coordination::CREATED:
LOG_DEBUG(logger, "CleanStateClock change: CREATED, at {}", rsp.path);
my_stale->store(true);
break;
case Coordination::CHANGED:
LOG_DEBUG(logger, "CleanStateClock change: CHANGED, at {}", rsp.path);
my_stale->store(true);
}
}
};
if (zookeeper->tryGetWatch(discovery_path, _some_data, &stat, watch_callback))
{
discovery_zxid = LogicalClock(stat.mzxid);
discovery_version = stat.version;
}
if (zookeeper->tryGetWatch(clean_state_path, _some_data, &stat, watch_callback))
{
clean_state_zxid = LogicalClock(stat.mzxid);
clean_state_version = stat.version;
}
}
bool operator==(const CleanStateClock & other) const
{
return !is_stale()
&& !other.is_stale()
&& discovery_zxid == other.discovery_zxid
&& discovery_version == other.discovery_version
&& clean_state_zxid == other.clean_state_zxid
&& clean_state_version == other.clean_state_version;
}
bool operator!=(const CleanStateClock & other) const
{
return !(*this == other);
}
};

View File

@ -1 +0,0 @@
int mainEntryClickHouseClusterCopier(int argc, char ** argv);

View File

@ -94,7 +94,7 @@
<http_port>8123</http_port>
<!-- Port for interaction by native protocol with:
- clickhouse-client and other native ClickHouse tools (clickhouse-benchmark, clickhouse-copier);
- clickhouse-client and other native ClickHouse tools (clickhouse-benchmark);
- clickhouse-server with other clickhouse-servers for distributed query processing;
- ClickHouse drivers and applications supporting native protocol
(this protocol is also informally called as "the TCP protocol");

View File

@ -56,7 +56,7 @@ logger:
http_port: 8123
# Port for interaction by native protocol with:
# - clickhouse-client and other native ClickHouse tools (clickhouse-benchmark, clickhouse-copier);
# - clickhouse-client and other native ClickHouse tools (clickhouse-benchmark);
# - clickhouse-server with other clickhouse-servers for distributed query processing;
# - ClickHouse drivers and applications supporting native protocol
# (this protocol is also informally called as "the TCP protocol");

View File

@ -94,7 +94,7 @@
<http_port>8123</http_port>
<!-- Port for interaction by native protocol with:
- clickhouse-client and other native ClickHouse tools (clickhouse-benchmark, clickhouse-copier);
- clickhouse-client and other native ClickHouse tools (clickhouse-benchmark);
- clickhouse-server with other clickhouse-servers for distributed query processing;
- ClickHouse drivers and applications supporting native protocol
(this protocol is also informally called as "the TCP protocol");

View File

@ -433,7 +433,6 @@ int mainEntryClickHouseInstall(int argc, char ** argv)
"clickhouse-client",
"clickhouse-local",
"clickhouse-benchmark",
"clickhouse-copier",
"clickhouse-obfuscator",
"clickhouse-git-import",
"clickhouse-compressor",

View File

@ -31,7 +31,6 @@ int mainEntryClickHouseBenchmark(int argc, char ** argv);
int mainEntryClickHouseExtractFromConfig(int argc, char ** argv);
int mainEntryClickHouseCompressor(int argc, char ** argv);
int mainEntryClickHouseFormat(int argc, char ** argv);
int mainEntryClickHouseClusterCopier(int argc, char ** argv);
int mainEntryClickHouseObfuscator(int argc, char ** argv);
int mainEntryClickHouseGitImport(int argc, char ** argv);
int mainEntryClickHouseStaticFilesDiskUploader(int argc, char ** argv);
@ -80,7 +79,6 @@ std::pair<std::string_view, MainFunc> clickhouse_applications[] =
{"extract-from-config", mainEntryClickHouseExtractFromConfig},
{"compressor", mainEntryClickHouseCompressor},
{"format", mainEntryClickHouseFormat},
{"copier", mainEntryClickHouseClusterCopier},
{"obfuscator", mainEntryClickHouseObfuscator},
{"git-import", mainEntryClickHouseGitImport},
{"static-files-disk-uploader", mainEntryClickHouseStaticFilesDiskUploader},

View File

@ -135,7 +135,7 @@
<http_port>8123</http_port>
<!-- Port for interaction by native protocol with:
- clickhouse-client and other native ClickHouse tools (clickhouse-benchmark, clickhouse-copier);
- clickhouse-client and other native ClickHouse tools (clickhouse-benchmark);
- clickhouse-server with other clickhouse-servers for distributed query processing;
- ClickHouse drivers and applications supporting native protocol
(this protocol is also informally called as "the TCP protocol");

View File

@ -55,7 +55,7 @@ logger:
http_port: 8123
# Port for interaction by native protocol with:
# - clickhouse-client and other native ClickHouse tools (clickhouse-benchmark, clickhouse-copier);
# - clickhouse-client and other native ClickHouse tools (clickhouse-benchmark);
# - clickhouse-server with other clickhouse-servers for distributed query processing;
# - ClickHouse drivers and applications supporting native protocol
# (this protocol is also informally called as "the TCP protocol");

View File

@ -21,7 +21,6 @@ namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int ALL_CONNECTION_TRIES_FAILED;
extern const int BAD_ARGUMENTS;
}
@ -192,20 +191,11 @@ std::vector<ConnectionPoolWithFailover::TryResult> ConnectionPoolWithFailover::g
max_entries = nested_pools.size();
}
else if (pool_mode == PoolMode::GET_ONE)
{
max_entries = 1;
}
else if (pool_mode == PoolMode::GET_MANY)
{
if (settings.max_parallel_replicas == 0)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The value of the setting max_parallel_replicas must be greater than 0");
max_entries = settings.max_parallel_replicas;
}
else
{
throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Unknown pool allocation mode");
}
if (!priority_func)
priority_func = makeGetPriorityFunc(settings);

View File

@ -19,7 +19,6 @@ namespace ErrorCodes
extern const int ALL_CONNECTION_TRIES_FAILED;
extern const int ALL_REPLICAS_ARE_STALE;
extern const int LOGICAL_ERROR;
extern const int BAD_ARGUMENTS;
}
HedgedConnectionsFactory::HedgedConnectionsFactory(
@ -83,10 +82,7 @@ std::vector<Connection *> HedgedConnectionsFactory::getManyConnections(PoolMode
}
case PoolMode::GET_MANY:
{
if (max_parallel_replicas == 0)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The value of the setting max_parallel_replicas must be greater than 0");
max_entries = std::min(max_parallel_replicas, shuffled_pools.size());
max_entries = max_parallel_replicas;
break;
}
}

View File

@ -158,7 +158,7 @@ private:
/// checking the number of requested replicas that are still in process).
size_t requested_connections_count = 0;
const size_t max_parallel_replicas = 1;
const size_t max_parallel_replicas = 0;
const bool skip_unavailable_shards = 0;
};

View File

@ -20,12 +20,12 @@ namespace DB
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int ILLEGAL_COLUMN;
extern const int DUPLICATE_COLUMN;
extern const int NUMBER_OF_DIMENSIONS_MISMATCHED;
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int EXPERIMENTAL_FEATURE_ERROR;
}
namespace
@ -247,7 +247,7 @@ void ColumnObject::Subcolumn::checkTypes() const
prefix_types.push_back(current_type);
auto prefix_common_type = getLeastSupertype(prefix_types);
if (!prefix_common_type->equals(*current_type))
throw Exception(ErrorCodes::LOGICAL_ERROR,
throw Exception(ErrorCodes::EXPERIMENTAL_FEATURE_ERROR,
"Data type {} of column at position {} cannot represent all columns from i-th prefix",
current_type->getName(), i);
}
@ -635,7 +635,7 @@ void ColumnObject::checkConsistency() const
{
if (num_rows != leaf->data.size())
{
throw Exception(ErrorCodes::LOGICAL_ERROR, "Sizes of subcolumns are inconsistent in ColumnObject."
throw Exception(ErrorCodes::EXPERIMENTAL_FEATURE_ERROR, "Sizes of subcolumns are inconsistent in ColumnObject."
" Subcolumn '{}' has {} rows, but expected size is {}",
leaf->path.getPath(), leaf->data.size(), num_rows);
}
@ -919,7 +919,7 @@ void ColumnObject::addSubcolumn(const PathInData & key, size_t new_size)
void ColumnObject::addNestedSubcolumn(const PathInData & key, const FieldInfo & field_info, size_t new_size)
{
if (!key.hasNested())
throw Exception(ErrorCodes::LOGICAL_ERROR,
throw Exception(ErrorCodes::EXPERIMENTAL_FEATURE_ERROR,
"Cannot add Nested subcolumn, because path doesn't contain Nested");
bool inserted = false;

View File

@ -226,7 +226,6 @@
M(PartsDeleteOnDestroy, "Part was moved to another disk and should be deleted in own destructor.") \
M(PartsWide, "Wide parts.") \
M(PartsCompact, "Compact parts.") \
M(PartsInMemory, "In-memory parts.") \
M(MMappedFiles, "Total number of mmapped files.") \
M(MMappedFileBytes, "Sum size of mmapped file regions.") \
M(AsynchronousReadWait, "Number of threads waiting for asynchronous read.") \

View File

@ -598,6 +598,7 @@
M(714, UNEXPECTED_CLUSTER) \
M(715, CANNOT_DETECT_FORMAT) \
M(716, CANNOT_FORGET_PARTITION) \
M(717, EXPERIMENTAL_FEATURE_ERROR) \
\
M(999, KEEPER_EXCEPTION) \
M(1000, POCO_EXCEPTION) \

View File

@ -221,7 +221,7 @@ private:
{
std::lock_guard lock(mutex);
auto it = cells.find(key);
if (it != cells.end() && !it->second.expired)
if (it != cells.end())
{
if (!it->second.expired)
{

View File

@ -45,6 +45,33 @@ struct MyWeight
size_t operator()(const int & x) const { return static_cast<size_t>(x); }
};
TEST(LRUResourceCache, remove2)
{
using MyCache = DB::LRUResourceCache<int, int, MyWeight>;
auto mcache = MyCache(10, 10);
for (int i = 1; i < 5; ++i)
{
auto load_int = [&] { return std::make_shared<int>(i); };
mcache.getOrSet(i, load_int);
}
auto n = mcache.size();
ASSERT_EQ(n, 4);
auto w = mcache.weight();
ASSERT_EQ(w, 10);
auto holder4 = mcache.get(4);
ASSERT_TRUE(holder4 != nullptr);
mcache.tryRemove(4);
auto holder_reget_4 = mcache.get(4);
ASSERT_TRUE(holder_reget_4 == nullptr);
mcache.getOrSet(4, [&]() { return std::make_shared<int>(4); });
holder4.reset();
auto holder1 = mcache.getOrSet(1, [&]() { return std::make_shared<int>(1); });
ASSERT_TRUE(holder1 != nullptr);
auto holder7 = mcache.getOrSet(7, [&] { return std::make_shared<int>(7); });
ASSERT_TRUE(holder7 != nullptr);
}
TEST(LRUResourceCache, evictOnWweight)
{
using MyCache = DB::LRUResourceCache<int, int, MyWeight>;

View File

@ -497,7 +497,7 @@ public:
switch (which)
{
case Types::Null: return false;
case Types::Null: return get<Null>() < rhs.get<Null>();
case Types::Bool: [[fallthrough]];
case Types::UInt64: return get<UInt64>() < rhs.get<UInt64>();
case Types::UInt128: return get<UInt128>() < rhs.get<UInt128>();
@ -541,7 +541,7 @@ public:
switch (which)
{
case Types::Null: return true;
case Types::Null: return get<Null>() <= rhs.get<Null>();
case Types::Bool: [[fallthrough]];
case Types::UInt64: return get<UInt64>() <= rhs.get<UInt64>();
case Types::UInt128: return get<UInt128>() <= rhs.get<UInt128>();
@ -590,7 +590,7 @@ public:
switch (which)
{
case Types::Null: return true;
case Types::Null: return get<Null>() == rhs.get<Null>();
case Types::Bool: [[fallthrough]];
case Types::UInt64: return get<UInt64>() == rhs.get<UInt64>();
case Types::Int64: return get<Int64>() == rhs.get<Int64>();

View File

@ -23,9 +23,9 @@ struct Null
{
enum class Value
{
Null,
PositiveInfinity,
NegativeInfinity,
NegativeInfinity = -1,
Null = 0,
PositiveInfinity = 1,
};
Value value{Value::Null};
@ -34,15 +34,12 @@ struct Null
bool isPositiveInfinity() const { return value == Value::PositiveInfinity; }
bool isNegativeInfinity() const { return value == Value::NegativeInfinity; }
bool operator==(const Null & other) const
auto operator<=>(const Null & other) const
{
return value == other.value;
return static_cast<int>(value) <=> static_cast<int>(other.value);
}
bool operator!=(const Null & other) const
{
return !(*this == other);
}
bool operator==(const Null &) const = default;
};
using UInt128 = ::UInt128;
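
The hunk above replaces the hand-written ==/!= on Null with explicit enum values plus a spaceship/defaulted-equality pair, which gives Null a total order (NegativeInfinity < Null < PositiveInfinity) that the Field comparison hunks earlier can rely on. A minimal standalone sketch of the same pattern, with illustrative names rather than the ClickHouse types:

```cpp
#include <cassert>
#include <compare>

struct NullLike
{
    enum class Value
    {
        NegativeInfinity = -1,
        Null = 0,
        PositiveInfinity = 1,
    };
    Value value{Value::Null};

    /// Ordering follows the explicit enum values: NegativeInfinity < Null < PositiveInfinity.
    auto operator<=>(const NullLike & other) const
    {
        return static_cast<int>(value) <=> static_cast<int>(other.value);
    }
    /// Equality is defaulted, exactly as in the hunk above.
    bool operator==(const NullLike &) const = default;
};

int main()
{
    NullLike neg{NullLike::Value::NegativeInfinity};
    NullLike null_value{};
    NullLike pos{NullLike::Value::PositiveInfinity};

    assert(neg < null_value && null_value < pos); /// total order usable in Field comparisons
    assert(null_value == NullLike{});             /// defaulted == compares the stored value
    return 0;
}
```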

View File

@ -28,9 +28,9 @@ namespace DB
namespace ErrorCodes
{
extern const int TYPE_MISMATCH;
extern const int LOGICAL_ERROR;
extern const int INCOMPATIBLE_COLUMNS;
extern const int NOT_IMPLEMENTED;
extern const int EXPERIMENTAL_FEATURE_ERROR;
}
size_t getNumberOfDimensions(const IDataType & type)
@ -92,7 +92,7 @@ ColumnPtr createArrayOfColumn(ColumnPtr column, size_t num_dimensions)
Array createEmptyArrayField(size_t num_dimensions)
{
if (num_dimensions == 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot create array field with 0 dimensions");
throw Exception(ErrorCodes::EXPERIMENTAL_FEATURE_ERROR, "Cannot create array field with 0 dimensions");
Array array;
Array * current_array = &array;
@ -231,7 +231,7 @@ static std::pair<ColumnPtr, DataTypePtr> recursivlyConvertDynamicColumnToTuple(
};
}
throw Exception(ErrorCodes::LOGICAL_ERROR, "Type {} unexpectedly has dynamic columns", type->getName());
throw Exception(ErrorCodes::EXPERIMENTAL_FEATURE_ERROR, "Type {} unexpectedly has dynamic columns", type->getName());
}
void convertDynamicColumnsToTuples(Block & block, const StorageSnapshotPtr & storage_snapshot)
@ -247,7 +247,7 @@ void convertDynamicColumnsToTuples(Block & block, const StorageSnapshotPtr & sto
GetColumnsOptions options(GetColumnsOptions::AllPhysical);
auto storage_column = storage_snapshot->tryGetColumn(options, column.name);
if (!storage_column)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Column '{}' not found in storage", column.name);
throw Exception(ErrorCodes::EXPERIMENTAL_FEATURE_ERROR, "Column '{}' not found in storage", column.name);
auto storage_column_concrete = storage_snapshot->getColumn(options.withExtendedObjects(), column.name);
@ -315,7 +315,7 @@ static DataTypePtr getLeastCommonTypeForObject(const DataTypes & types, bool che
{
const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type.get());
if (!type_tuple)
throw Exception(ErrorCodes::LOGICAL_ERROR,
throw Exception(ErrorCodes::EXPERIMENTAL_FEATURE_ERROR,
"Least common type for object can be deduced only from tuples, but {} given", type->getName());
auto [tuple_paths, tuple_types] = flattenTuple(type);
@ -427,7 +427,7 @@ static DataTypePtr getLeastCommonTypeForDynamicColumnsImpl(
if (const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type_in_storage.get()))
return getLeastCommonTypeForTuple(*type_tuple, concrete_types, check_ambiguos_paths);
throw Exception(ErrorCodes::LOGICAL_ERROR, "Type {} unexpectedly has dynamic columns", type_in_storage->getName());
throw Exception(ErrorCodes::EXPERIMENTAL_FEATURE_ERROR, "Type {} unexpectedly has dynamic columns", type_in_storage->getName());
}
DataTypePtr getLeastCommonTypeForDynamicColumns(
@ -481,7 +481,7 @@ DataTypePtr createConcreteEmptyDynamicColumn(const DataTypePtr & type_in_storage
return recreateTupleWithElements(*type_tuple, new_elements);
}
throw Exception(ErrorCodes::LOGICAL_ERROR, "Type {} unexpectedly has dynamic columns", type_in_storage->getName());
throw Exception(ErrorCodes::EXPERIMENTAL_FEATURE_ERROR, "Type {} unexpectedly has dynamic columns", type_in_storage->getName());
}
bool hasDynamicSubcolumns(const ColumnsDescription & columns)
@ -613,7 +613,7 @@ DataTypePtr reduceNumberOfDimensions(DataTypePtr type, size_t dimensions_to_redu
{
const auto * type_array = typeid_cast<const DataTypeArray *>(type.get());
if (!type_array)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Not enough dimensions to reduce");
throw Exception(ErrorCodes::EXPERIMENTAL_FEATURE_ERROR, "Not enough dimensions to reduce");
type = type_array->getNestedType();
}
@ -627,7 +627,7 @@ ColumnPtr reduceNumberOfDimensions(ColumnPtr column, size_t dimensions_to_reduce
{
const auto * column_array = typeid_cast<const ColumnArray *>(column.get());
if (!column_array)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Not enough dimensions to reduce");
throw Exception(ErrorCodes::EXPERIMENTAL_FEATURE_ERROR, "Not enough dimensions to reduce");
column = column_array->getDataPtr();
}
@ -653,7 +653,7 @@ ColumnWithTypeAndDimensions createTypeFromNode(const Node & node)
auto collect_tuple_elemets = [](const auto & children)
{
if (children.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot create type from empty Tuple or Nested node");
throw Exception(ErrorCodes::EXPERIMENTAL_FEATURE_ERROR, "Cannot create type from empty Tuple or Nested node");
std::vector<std::tuple<String, ColumnWithTypeAndDimensions>> tuple_elements;
tuple_elements.reserve(children.size());
@ -705,6 +705,7 @@ ColumnWithTypeAndDimensions createTypeFromNode(const Node & node)
size_t num_elements = tuple_columns.size();
Columns tuple_elements_columns(num_elements);
DataTypes tuple_elements_types(num_elements);
size_t last_offset = assert_cast<const ColumnArray::ColumnOffsets &>(*offsets_columns.back()).getData().back();
/// Reduce extra array dimensions to get columns and types of Nested elements.
for (size_t i = 0; i < num_elements; ++i)
@ -712,6 +713,14 @@ ColumnWithTypeAndDimensions createTypeFromNode(const Node & node)
assert(tuple_columns[i].array_dimensions == tuple_columns[0].array_dimensions);
tuple_elements_columns[i] = reduceNumberOfDimensions(tuple_columns[i].column, tuple_columns[i].array_dimensions);
tuple_elements_types[i] = reduceNumberOfDimensions(tuple_columns[i].type, tuple_columns[i].array_dimensions);
if (tuple_elements_columns[i]->size() != last_offset)
throw Exception(
ErrorCodes::EXPERIMENTAL_FEATURE_ERROR,
"Cannot create a type for subcolumn {} in Object data type: offsets_column has data inconsistent with nested_column. "
"Data size: {}, last offset: {}",
node.path.getPath(),
tuple_elements_columns[i]->size(),
last_offset);
}
auto result_column = ColumnArray::create(ColumnTuple::create(tuple_elements_columns), offsets_columns.back());
@ -720,6 +729,16 @@ ColumnWithTypeAndDimensions createTypeFromNode(const Node & node)
/// Recreate result Array type and Array column.
for (auto it = offsets_columns.rbegin() + 1; it != offsets_columns.rend(); ++it)
{
last_offset = assert_cast<const ColumnArray::ColumnOffsets &>((**it)).getData().back();
if (result_column->size() != last_offset)
throw Exception(
ErrorCodes::EXPERIMENTAL_FEATURE_ERROR,
"Cannot create a type for subcolumn {} in Object data type: offsets_column has data inconsistent with nested_column. "
"Data size: {}, last offset: {}",
node.path.getPath(),
result_column->size(),
last_offset);
result_column = ColumnArray::create(result_column, *it);
result_type = std::make_shared<DataTypeArray>(result_type);
}
@ -822,7 +841,7 @@ std::pair<ColumnPtr, DataTypePtr> unflattenTuple(
assert(paths.size() == tuple_columns.size());
if (paths.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot unflatten empty Tuple");
throw Exception(ErrorCodes::EXPERIMENTAL_FEATURE_ERROR, "Cannot unflatten empty Tuple");
/// We add all paths to the subcolumn tree and then create a type from it.
/// The tree stores column, type and number of array dimensions
@ -841,7 +860,7 @@ std::pair<ColumnPtr, DataTypePtr> unflattenTuple(
tree.add(paths[i], [&](Node::Kind kind, bool exists) -> std::shared_ptr<Node>
{
if (pos >= num_parts)
throw Exception(ErrorCodes::LOGICAL_ERROR,
throw Exception(ErrorCodes::EXPERIMENTAL_FEATURE_ERROR,
"Not enough name parts for path {}. Expected at least {}, got {}",
paths[i].getPath(), pos + 1, num_parts);
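
The two new checks in createTypeFromNode enforce the usual array invariant: the last value of the offsets column must equal the number of rows in the nested column, otherwise the Object subcolumn cannot be reassembled. A self-contained sketch of that invariant on plain std::vector data (illustrative only, not the ColumnArray API):

```cpp
#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>

/// For an array column stored as (nested data, offsets), the i-th offset is the end of
/// row i inside the nested data, so the last offset must equal the nested column size.
void checkArrayConsistency(const std::vector<int64_t> & nested, const std::vector<uint64_t> & offsets)
{
    uint64_t last_offset = offsets.empty() ? 0 : offsets.back();
    if (nested.size() != last_offset)
        throw std::runtime_error(
            "offsets column is inconsistent with nested column: data size "
            + std::to_string(nested.size()) + ", last offset " + std::to_string(last_offset));
}

int main()
{
    /// Two array rows [1, 2] and [3] -> nested = {1, 2, 3}, offsets = {2, 3}: consistent.
    checkArrayConsistency({1, 2, 3}, {2, 3});
    /// A truncated nested column ({1, 2} with offsets {2, 3}) would throw here.
    return 0;
}
```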

View File

@ -29,7 +29,7 @@ namespace ErrorCodes
extern const int INCORRECT_DATA;
extern const int CANNOT_READ_ALL_DATA;
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int LOGICAL_ERROR;
extern const int EXPERIMENTAL_FEATURE_ERROR;
}
template <typename Parser>
@ -177,7 +177,7 @@ void SerializationObject<Parser>::serializeBinaryBulkStatePrefix(
auto * stream = settings.getter(settings.path);
if (!stream)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Missing stream for kind of binary serialization");
throw Exception(ErrorCodes::EXPERIMENTAL_FEATURE_ERROR, "Missing stream for kind of binary serialization");
auto [tuple_column, tuple_type] = unflattenObjectToTuple(column_object);
@ -288,7 +288,7 @@ void SerializationObject<Parser>::serializeBinaryBulkWithMultipleStreams(
if (!state_object->nested_type->equals(*tuple_type))
{
throw Exception(ErrorCodes::LOGICAL_ERROR,
throw Exception(ErrorCodes::EXPERIMENTAL_FEATURE_ERROR,
"Types of internal column of Object mismatched. Expected: {}, Got: {}",
state_object->nested_type->getName(), tuple_type->getName());
}

View File

@ -99,7 +99,6 @@ struct ReadSettings
bool enable_filesystem_cache = true;
bool read_from_filesystem_cache_if_exists_otherwise_bypass_cache = false;
bool enable_filesystem_cache_log = false;
bool force_read_through_cache_merges = false;
size_t filesystem_cache_segments_batch_size = 20;
bool use_page_cache_for_disks_without_file_cache = false;

View File

@ -275,13 +275,7 @@ private:
if (only_replace_current_database_function)
return;
for (ASTRenameQuery::Element & elem : node.elements)
{
if (!elem.from.database)
elem.from.database = std::make_shared<ASTIdentifier>(database_name);
if (!elem.to.database)
elem.to.database = std::make_shared<ASTIdentifier>(database_name);
}
node.setDatabaseIfNotExists(database_name);
}
void visitDDL(ASTAlterQuery & node, ASTPtr &) const

View File

@ -1612,7 +1612,6 @@ BlockIO InterpreterCreateQuery::doCreateOrReplaceTable(ASTCreateQuery & create,
executeTrivialBlockIO(fill_io, getContext());
/// Replace target table with created one
auto ast_rename = std::make_shared<ASTRenameQuery>();
ASTRenameQuery::Element elem
{
ASTRenameQuery::Table
@ -1627,7 +1626,7 @@ BlockIO InterpreterCreateQuery::doCreateOrReplaceTable(ASTCreateQuery & create,
}
};
ast_rename->elements.push_back(std::move(elem));
auto ast_rename = std::make_shared<ASTRenameQuery>(ASTRenameQuery::Elements{std::move(elem)});
ast_rename->dictionary = create.is_dictionary;
if (create.create_or_replace)
{

View File

@ -47,12 +47,12 @@ BlockIO InterpreterRenameQuery::execute()
*/
RenameDescriptions descriptions;
descriptions.reserve(rename.elements.size());
descriptions.reserve(rename.getElements().size());
/// Don't allow to drop tables (that we are renaming); don't allow to create tables in places where tables will be renamed.
TableGuards table_guards;
for (const auto & elem : rename.elements)
for (const auto & elem : rename.getElements())
{
descriptions.emplace_back(elem, current_database);
const auto & description = descriptions.back();
@ -186,7 +186,7 @@ AccessRightsElements InterpreterRenameQuery::getRequiredAccess(InterpreterRename
{
AccessRightsElements required_access;
const auto & rename = query_ptr->as<const ASTRenameQuery &>();
for (const auto & elem : rename.elements)
for (const auto & elem : rename.getElements())
{
if (type == RenameType::RenameTable)
{
@ -214,7 +214,7 @@ AccessRightsElements InterpreterRenameQuery::getRequiredAccess(InterpreterRename
void InterpreterRenameQuery::extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr & ast, ContextPtr) const
{
const auto & rename = ast->as<const ASTRenameQuery &>();
for (const auto & element : rename.elements)
for (const auto & element : rename.getElements())
{
{
String database = backQuoteIfNeed(!element.from.database ? getContext()->getCurrentDatabase() : element.from.getDatabase());

View File

@ -947,7 +947,7 @@ bool InterpreterSelectQuery::adjustParallelReplicasAfterAnalysis()
if (number_of_replicas_to_use <= 1)
{
context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0));
context->setSetting("max_parallel_replicas", UInt64{1});
context->setSetting("max_parallel_replicas", UInt64{0});
LOG_DEBUG(log, "Disabling parallel replicas because there aren't enough rows to read");
return true;
}

View File

@ -579,7 +579,7 @@ ASTs InterpreterRenameImpl::getRewrittenQueries(
const InterpreterRenameImpl::TQuery & rename_query, ContextPtr context, const String & mapped_to_database, const String & mysql_database)
{
ASTRenameQuery::Elements elements;
for (const auto & rename_element : rename_query.elements)
for (const auto & rename_element : rename_query.getElements())
{
const auto & to_database = resolveDatabase(rename_element.to.getDatabase(), mysql_database, mapped_to_database, context);
const auto & from_database = resolveDatabase(rename_element.from.getDatabase(), mysql_database, mapped_to_database, context);
@ -600,8 +600,7 @@ ASTs InterpreterRenameImpl::getRewrittenQueries(
if (elements.empty())
return ASTs{};
auto rewritten_query = std::make_shared<ASTRenameQuery>();
rewritten_query->elements = elements;
auto rewritten_query = std::make_shared<ASTRenameQuery>(std::move(elements));
return ASTs{rewritten_query};
}
@ -616,7 +615,8 @@ ASTs InterpreterAlterImpl::getRewrittenQueries(
return {};
auto rewritten_alter_query = std::make_shared<ASTAlterQuery>();
auto rewritten_rename_query = std::make_shared<ASTRenameQuery>();
ASTRenameQuery::Elements rename_elements;
rewritten_alter_query->setDatabase(mapped_to_database);
rewritten_alter_query->setTable(alter_query.table);
rewritten_alter_query->alter_object = ASTAlterQuery::AlterObjectType::TABLE;
@ -749,13 +749,13 @@ ASTs InterpreterAlterImpl::getRewrittenQueries(
/// For ALTER TABLE table_name RENAME TO new_table_name_1, RENAME TO new_table_name_2;
/// We just need to generate RENAME TABLE table_name TO new_table_name_2;
if (rewritten_rename_query->elements.empty())
rewritten_rename_query->elements.push_back(ASTRenameQuery::Element());
if (rename_elements.empty())
rename_elements.push_back(ASTRenameQuery::Element());
rewritten_rename_query->elements.back().from.database = std::make_shared<ASTIdentifier>(mapped_to_database);
rewritten_rename_query->elements.back().from.table = std::make_shared<ASTIdentifier>(alter_query.table);
rewritten_rename_query->elements.back().to.database = std::make_shared<ASTIdentifier>(mapped_to_database);
rewritten_rename_query->elements.back().to.table = std::make_shared<ASTIdentifier>(alter_command->new_table_name);
rename_elements.back().from.database = std::make_shared<ASTIdentifier>(mapped_to_database);
rename_elements.back().from.table = std::make_shared<ASTIdentifier>(alter_query.table);
rename_elements.back().to.database = std::make_shared<ASTIdentifier>(mapped_to_database);
rename_elements.back().to.table = std::make_shared<ASTIdentifier>(alter_command->new_table_name);
}
}
@ -765,8 +765,11 @@ ASTs InterpreterAlterImpl::getRewrittenQueries(
if (!rewritten_alter_query->command_list->children.empty())
rewritten_queries.push_back(rewritten_alter_query);
if (!rewritten_rename_query->elements.empty())
if (!rename_elements.empty())
{
auto rewritten_rename_query = std::make_shared<ASTRenameQuery>(std::move(rename_elements));
rewritten_queries.push_back(rewritten_rename_query);
}
return rewritten_queries;
}

View File

@ -563,7 +563,6 @@ void SystemLog<LogElement>::prepareTable()
{table_id.database_name, table_id.table_name + "_" + toString(suffix)}, getContext()))
++suffix;
auto rename = std::make_shared<ASTRenameQuery>();
ASTRenameQuery::Element elem
{
ASTRenameQuery::Table
@ -586,7 +585,7 @@ void SystemLog<LogElement>::prepareTable()
old_create_query,
create_query);
rename->elements.emplace_back(std::move(elem));
auto rename = std::make_shared<ASTRenameQuery>(ASTRenameQuery::Elements{std::move(elem)});
ActionLock merges_lock;
if (DatabaseCatalog::instance().getDatabase(table_id.database_name)->getUUID() == UUIDHelpers::Nil)

View File

@ -45,7 +45,6 @@ public:
};
using Elements = std::vector<Element>;
Elements elements;
bool exchange{false}; /// For EXCHANGE TABLES
bool database{false}; /// For RENAME DATABASE
@ -54,12 +53,48 @@ public:
/// Special flag for CREATE OR REPLACE. Do not throw if the second table does not exist.
bool rename_if_cannot_exchange{false};
explicit ASTRenameQuery(Elements elements_ = {})
: elements(std::move(elements_))
{
for (const auto & elem : elements)
{
if (elem.from.database)
children.push_back(elem.from.database);
if (elem.from.table)
children.push_back(elem.from.table);
if (elem.to.database)
children.push_back(elem.to.database);
if (elem.to.table)
children.push_back(elem.to.table);
}
}
void setDatabaseIfNotExists(const String & database_name)
{
for (auto & elem : elements)
{
if (!elem.from.database)
{
elem.from.database = std::make_shared<ASTIdentifier>(database_name);
children.push_back(elem.from.database);
}
if (!elem.to.database)
{
elem.to.database = std::make_shared<ASTIdentifier>(database_name);
children.push_back(elem.to.database);
}
}
}
const Elements & getElements() const { return elements; }
/** Get the text that identifies this element. */
String getID(char) const override { return "Rename"; }
ASTPtr clone() const override
{
auto res = std::make_shared<ASTRenameQuery>(*this);
res->cloneChildren();
cloneOutputOptions(*res);
return res;
}
@ -145,6 +180,8 @@ protected:
formatOnCluster(settings);
}
Elements elements;
};
}
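
With this refactoring the elements are set once through the constructor (which also registers the identifier ASTs in `children`, so `clone()` really copies them), amended only via `setDatabaseIfNotExists()`, and read through `getElements()`. A hedged usage sketch built only from the signatures visible in this diff; it assumes the ClickHouse `Parsers/` headers and is not a standalone program:

```cpp
#include <Parsers/ASTRenameQuery.h>
#include <Parsers/ASTIdentifier.h>
#include <memory>

using namespace DB;

/// Build RENAME TABLE t1 TO t2 in the current database.
ASTPtr makeRenameQuery(const String & current_database)
{
    ASTRenameQuery::Elements elements;
    elements.emplace_back();
    elements.back().from.table = std::make_shared<ASTIdentifier>("t1");
    elements.back().to.table = std::make_shared<ASTIdentifier>("t2");

    /// The constructor registers the identifier ASTs in `children`, so clone() copies them.
    auto rename = std::make_shared<ASTRenameQuery>(std::move(elements));

    /// Fill in the database on both sides where it was left empty.
    rename->setDatabaseIfNotExists(current_database);
    return rename;
}
```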

View File

@ -44,15 +44,14 @@ bool ParserRenameQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
if (!ASTQueryWithOnCluster::parse(pos, cluster_str, expected))
return false;
}
ASTRenameQuery::Elements rename_elements;
rename_elements.emplace_back();
rename_elements.back().if_exists = if_exists;
rename_elements.back().from.database = from_db;
rename_elements.back().to.database = to_db;
auto query = std::make_shared<ASTRenameQuery>();
auto query = std::make_shared<ASTRenameQuery>(std::move(rename_elements));
query->database = true;
query->elements.emplace({});
query->elements.front().if_exists = if_exists;
query->elements.front().from.database = from_db;
query->elements.front().to.database = to_db;
query->children.push_back(std::move(from_db));
query->children.push_back(std::move(to_db));
query->cluster = cluster_str;
node = query;
return true;
@ -75,9 +74,8 @@ bool ParserRenameQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
const auto ignore_delim = [&] { return exchange ? s_and.ignore(pos) : s_to.ignore(pos); };
auto query = std::make_shared<ASTRenameQuery>();
ASTRenameQuery::Elements & elements = query->elements;
ASTRenameQuery::Elements elements;
while (true)
{
@ -93,15 +91,6 @@ bool ParserRenameQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
|| !ignore_delim()
|| !parseDatabaseAndTableAsAST(pos, expected, ref.to.database, ref.to.table))
return false;
if (ref.from.database)
query->children.push_back(ref.from.database);
if (ref.from.table)
query->children.push_back(ref.from.table);
if (ref.to.database)
query->children.push_back(ref.to.database);
if (ref.to.table)
query->children.push_back(ref.to.table);
}
String cluster_str;
@ -111,6 +100,7 @@ bool ParserRenameQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
return false;
}
auto query = std::make_shared<ASTRenameQuery>(std::move(elements));
query->cluster = cluster_str;
query->exchange = exchange;
query->dictionary = dictionary;

View File

@ -9,6 +9,7 @@
#include <Parsers/ParserAlterQuery.h>
#include <Parsers/ParserCreateQuery.h>
#include <Parsers/ParserOptimizeQuery.h>
#include <Parsers/ParserRenameQuery.h>
#include <Parsers/ParserQueryWithOutput.h>
#include <Parsers/ParserAttachAccessEntity.h>
#include <Parsers/formatAST.h>
@ -62,10 +63,29 @@ TEST_P(ParserTest, parseQuery)
if (std::string("CREATE USER or ALTER USER query") != parser->getName()
&& std::string("ATTACH access entity query") != parser->getName())
{
WriteBufferFromOwnString buf;
formatAST(*ast->clone(), buf, false, false);
String formatted_ast = buf.str();
EXPECT_EQ(expected_ast, formatted_ast);
ASTPtr ast_clone = ast->clone();
{
WriteBufferFromOwnString buf;
formatAST(*ast_clone, buf, false, false);
String formatted_ast = buf.str();
EXPECT_EQ(expected_ast, formatted_ast);
}
ASTPtr ast_clone2 = ast_clone->clone();
/// Break `ast_clone2`, it should not affect `ast_clone` if `clone()` implemented properly
for (auto & child : ast_clone2->children)
{
if (auto * identifier = dynamic_cast<ASTIdentifier *>(child.get()))
identifier->setShortName("new_name");
}
{
WriteBufferFromOwnString buf;
formatAST(*ast_clone, buf, false, false);
String formatted_ast = buf.str();
EXPECT_EQ(expected_ast, formatted_ast);
}
}
else
{
@ -299,6 +319,16 @@ INSTANTIATE_TEST_SUITE_P(ParserAttachUserQuery, ParserTest,
}
})));
INSTANTIATE_TEST_SUITE_P(ParserRenameQuery, ParserTest,
::testing::Combine(
::testing::Values(std::make_shared<ParserRenameQuery>()),
::testing::ValuesIn(std::initializer_list<ParserTestCase>{
{
"RENAME TABLE eligible_test TO eligible_test2",
"RENAME TABLE eligible_test TO eligible_test2"
}
})));
INSTANTIATE_TEST_SUITE_P(ParserKQLQuery, ParserKQLTest,
::testing::Combine(
::testing::Values(std::make_shared<ParserKQLQuery>()),
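
The extended test in this file clones the parsed AST twice, mutates the second clone's identifier children, and checks that the first clone still formats to the expected text, which is exactly what the new ASTRenameQuery constructor/clone() pair has to guarantee. A generic, self-contained sketch of that testing pattern (illustrative Tree type, not the ClickHouse AST classes):

```cpp
#include <cassert>
#include <memory>
#include <string>
#include <vector>

struct Tree
{
    std::string name;
    std::vector<std::shared_ptr<Tree>> children;

    std::shared_ptr<Tree> clone() const
    {
        auto res = std::make_shared<Tree>();
        res->name = name;
        for (const auto & child : children)
            res->children.push_back(child->clone()); /// deep copy, not shared pointers
        return res;
    }

    std::string format() const
    {
        std::string res = name;
        for (const auto & child : children)
            res += " " + child->format();
        return res;
    }
};

int main()
{
    auto original = std::make_shared<Tree>();
    original->name = "RENAME";
    original->children.push_back(std::make_shared<Tree>(Tree{"eligible_test", {}}));

    std::string expected = original->format();

    auto copy = original->clone();
    for (auto & child : copy->children)
        child->name = "new_name";            /// break the clone, as the test above does

    assert(original->format() == expected);  /// the original must be unaffected
    return 0;
}
```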

View File

@ -295,7 +295,7 @@ bool applyTrivialCountIfPossible(
/// The query could use trivial count if it didn't use parallel replicas, so let's disable it
query_context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0));
query_context->setSetting("max_parallel_replicas", UInt64{1});
query_context->setSetting("max_parallel_replicas", UInt64{0});
LOG_TRACE(getLogger("Planner"), "Disabling parallel replicas to be able to use a trivial count optimization");
}
@ -756,7 +756,7 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres
{
planner_context->getMutableQueryContext()->setSetting(
"allow_experimental_parallel_reading_from_replicas", Field(0));
planner_context->getMutableQueryContext()->setSetting("max_parallel_replicas", UInt64{1});
planner_context->getMutableQueryContext()->setSetting("max_parallel_replicas", UInt64{0});
LOG_DEBUG(getLogger("Planner"), "Disabling parallel replicas because there aren't enough rows to read");
}
else if (number_of_replicas_to_use < settings.max_parallel_replicas)

View File

@ -3,6 +3,7 @@
#include <Processors/Port.h>
#include <Processors/IProcessor.h>
#include <Common/SharedMutex.h>
#include <Common/AllocatorWithMemoryTracking.h>
#include <mutex>
#include <queue>
#include <stack>
@ -117,7 +118,11 @@ public:
}
};
using Queue = std::queue<Node *>;
/// This queue can grow a lot and lead to OOM. That is why we use non-default
/// allocator for container which throws exceptions in operator new
using DequeWithMemoryTracker = std::deque<ExecutingGraph::Node *, AllocatorWithMemoryTracking<ExecutingGraph::Node *>>;
using Queue = std::queue<ExecutingGraph::Node *, DequeWithMemoryTracker>;
using NodePtr = std::unique_ptr<Node>;
using Nodes = std::vector<NodePtr>;
Nodes nodes;
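
This hunk (and the matching ones in ExecutorTasks and PipelineExecutor below) swaps the std::queue's default std::deque for one parameterized with AllocatorWithMemoryTracking, so queue growth is charged to the query's memory accounting and throws instead of silently driving the server towards OOM. A simplified, self-contained sketch of the idea; the allocator below is illustrative and not ClickHouse's AllocatorWithMemoryTracking, and the 64 MiB budget is an assumed example value:

```cpp
#include <atomic>
#include <cstddef>
#include <deque>
#include <new>
#include <queue>

/// Shared byte counter standing in for a per-query memory tracker (assumed 64 MiB budget).
inline std::atomic<size_t> tracked_bytes{0};
constexpr size_t soft_limit = 64 << 20;

template <typename T>
struct TrackingAllocator
{
    using value_type = T;

    TrackingAllocator() = default;
    template <typename U>
    TrackingAllocator(const TrackingAllocator<U> &) {}

    T * allocate(size_t n)
    {
        size_t bytes = n * sizeof(T);
        if (tracked_bytes.fetch_add(bytes) + bytes > soft_limit)
        {
            tracked_bytes.fetch_sub(bytes);
            throw std::bad_alloc(); /// fail the query instead of exhausting the host
        }
        return static_cast<T *>(::operator new(bytes));
    }

    void deallocate(T * p, size_t n) noexcept
    {
        tracked_bytes.fetch_sub(n * sizeof(T));
        ::operator delete(p);
    }
};

template <typename T, typename U>
bool operator==(const TrackingAllocator<T> &, const TrackingAllocator<U> &) { return true; }
template <typename T, typename U>
bool operator!=(const TrackingAllocator<T> &, const TrackingAllocator<U> &) { return false; }

struct Node {};

/// The same queue-over-deque shape as in the hunks above, but with the tracking allocator
/// so growth is accounted for and can throw.
using DequeWithTracking = std::deque<Node *, TrackingAllocator<Node *>>;
using Queue = std::queue<Node *, DequeWithTracking>;

int main()
{
    Queue queue;
    Node node;
    queue.push(&node); /// the deque's block allocations go through TrackingAllocator
    queue.pop();
    return 0;
}
```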

View File

@ -47,7 +47,10 @@ class ExecutorTasks
public:
using Stack = std::stack<UInt64>;
using Queue = std::queue<ExecutingGraph::Node *>;
/// This queue can grow a lot and lead to OOM. That is why we use non-default
/// allocator for container which throws exceptions in operator new
using DequeWithMemoryTracker = std::deque<ExecutingGraph::Node *, AllocatorWithMemoryTracking<ExecutingGraph::Node *>>;
using Queue = std::queue<ExecutingGraph::Node *, DequeWithMemoryTracker>;
void finish();
bool isFinished() const { return finished; }

View File

@ -5,7 +5,9 @@
#include <Common/EventCounter.h>
#include <Common/ThreadPool_fwd.h>
#include <Common/ConcurrencyControl.h>
#include <Common/AllocatorWithMemoryTracking.h>
#include <deque>
#include <queue>
#include <mutex>
#include <memory>
@ -90,7 +92,10 @@ private:
ReadProgressCallbackPtr read_progress_callback;
using Queue = std::queue<ExecutingGraph::Node *>;
/// This queue can grow a lot and lead to OOM. That is why we use non-default
/// allocator for container which throws exceptions in operator new
using DequeWithMemoryTracker = std::deque<ExecutingGraph::Node *, AllocatorWithMemoryTracking<ExecutingGraph::Node *>>;
using Queue = std::queue<ExecutingGraph::Node *, DequeWithMemoryTracker>;
void initializeExecution(size_t num_threads, bool concurrency_control); /// Initialize executor contexts and task_queue.
void finalizeExecution(); /// Check all processors are finished.

View File

@ -101,9 +101,9 @@ bool isSafePrimaryKey(const KeyDescription & primary_key)
int compareValues(const Values & lhs, const Values & rhs)
{
chassert(lhs.size() == rhs.size());
size_t size = std::min(lhs.size(), rhs.size());
for (size_t i = 0; i < lhs.size(); ++i)
for (size_t i = 0; i < size; ++i)
{
if (applyVisitor(FieldVisitorAccurateLess(), lhs[i], rhs[i]))
return -1;
@ -124,8 +124,9 @@ public:
Values getValue(size_t part_idx, size_t mark) const
{
const auto & index = parts[part_idx].data_part->getIndex();
Values values(index.size());
for (size_t i = 0; i < values.size(); ++i)
size_t size = index.size();
Values values(size);
for (size_t i = 0; i < size; ++i)
{
index[i]->get(mark, values[i]);
if (values[i].isNull())

View File

@ -1,3 +0,0 @@
if(ENABLE_EXAMPLES)
add_subdirectory(examples)
endif()

View File

@ -1,7 +1,6 @@
#pragma once
#include <Storages/MergeTree/IMergeTreeDataPart.h>
#include <Storages/MergeTree/MergeTreeDataPartInMemory.h>
namespace DB
@ -10,7 +9,7 @@ namespace DB
/* Allow to compute more accurate progress statistics */
class ColumnSizeEstimator
{
using ColumnToSize = MergeTreeDataPartInMemory::ColumnToSize;
using ColumnToSize = std::map<String, UInt64>;
ColumnToSize map;
public:

View File

@ -10,7 +10,6 @@
#include <IO/S3Common.h>
#include <Server/HTTP/HTMLForm.h>
#include <Server/HTTP/HTTPServerResponse.h>
#include <Storages/MergeTree/MergeTreeDataPartInMemory.h>
#include <Storages/MergeTree/MergedBlockOutputStream.h>
#include <Storages/MergeTree/ReplicatedFetchList.h>
#include <Storages/StorageReplicatedMergeTree.h>
@ -44,10 +43,8 @@ namespace ErrorCodes
extern const int CANNOT_WRITE_TO_OSTREAM;
extern const int CHECKSUM_DOESNT_MATCH;
extern const int INSECURE_PATH;
extern const int CORRUPTED_DATA;
extern const int LOGICAL_ERROR;
extern const int S3_ERROR;
extern const int INCORRECT_PART_TYPE;
extern const int ZERO_COPY_REPLICATION_ERROR;
}
@ -191,8 +188,6 @@ void Service::processQuery(const HTMLForm & params, ReadBuffer & /*body*/, Write
}
if (data_settings->allow_remote_fs_zero_copy_replication &&
/// In memory data part does not have metadata yet.
!isInMemoryPart(part) &&
client_protocol_version >= REPLICATION_PROTOCOL_VERSION_WITH_PARTS_ZERO_COPY)
{
auto disk_type = part->getDataPartStorage().getDiskType();
@ -205,11 +200,7 @@ void Service::processQuery(const HTMLForm & params, ReadBuffer & /*body*/, Write
}
}
if (isInMemoryPart(part))
sendPartFromMemory(part, out, send_projections);
else
sendPartFromDisk(part, out, client_protocol_version, false, send_projections);
sendPartFromDisk(part, out, client_protocol_version, false, send_projections);
data.addLastSentPart(part->info);
}
catch (const NetException &)
@ -231,36 +222,6 @@ void Service::processQuery(const HTMLForm & params, ReadBuffer & /*body*/, Write
}
}
void Service::sendPartFromMemory(
const MergeTreeData::DataPartPtr & part, WriteBuffer & out, bool send_projections)
{
auto metadata_snapshot = data.getInMemoryMetadataPtr();
if (send_projections)
{
for (const auto & [name, projection] : part->getProjectionParts())
{
auto projection_sample_block = metadata_snapshot->projections.get(name).sample_block;
auto part_in_memory = asInMemoryPart(projection);
if (!part_in_memory)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Projection {} of part {} is not stored in memory", name, part->name);
writeStringBinary(name, out);
projection->checksums.write(out);
NativeWriter block_out(out, 0, projection_sample_block);
block_out.write(part_in_memory->block);
}
}
auto part_in_memory = asInMemoryPart(part);
if (!part_in_memory)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Part {} is not stored in memory", part->name);
NativeWriter block_out(out, 0, metadata_snapshot->getSampleBlock());
part->checksums.write(out);
block_out.write(part_in_memory->block);
data.getSendsThrottler()->add(part_in_memory->block.bytes());
}
MergeTreeData::DataPart::Checksums Service::sendPartFromDisk(
const MergeTreeData::DataPartPtr & part,
@ -642,8 +603,6 @@ std::pair<MergeTreeData::MutableDataPartPtr, scope_guard> Fetcher::fetchSelected
remote_fs_metadata, fmt::join(capability, ", "));
if (server_protocol_version < REPLICATION_PROTOCOL_VERSION_WITH_PARTS_ZERO_COPY)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Got 'remote_fs_metadata' cookie with old protocol version {}", server_protocol_version);
if (part_type == PartType::InMemory)
throw Exception(ErrorCodes::INCORRECT_PART_TYPE, "Got 'remote_fs_metadata' cookie for in-memory part");
try
{
@ -702,7 +661,7 @@ std::pair<MergeTreeData::MutableDataPartPtr, scope_guard> Fetcher::fetchSelected
}
auto storage_id = data.getStorageID();
String new_part_path = part_type == PartType::InMemory ? "memory" : fs::path(data.getFullPathOnDisk(disk)) / part_name / "";
String new_part_path = fs::path(data.getFullPathOnDisk(disk)) / part_name / "";
auto entry = data.getContext()->getReplicatedFetchList().insert(
storage_id.getDatabaseName(), storage_id.getTableName(),
part_info.partition_id, part_name, new_part_path,
@ -710,22 +669,6 @@ std::pair<MergeTreeData::MutableDataPartPtr, scope_guard> Fetcher::fetchSelected
in->setNextCallback(ReplicatedFetchReadCallback(*entry));
if (part_type == PartType::InMemory)
{
auto volume = std::make_shared<SingleDiskVolume>("volume_" + part_name, disk, 0);
auto data_part_storage = std::make_shared<DataPartStorageOnDiskFull>(
volume,
data.getRelativeDataPath(),
part_name);
return std::make_pair(downloadPartToMemory(
data_part_storage, part_name,
MergeTreePartInfo::fromPartName(part_name, data.format_version),
part_uuid, metadata_snapshot, context, *in,
projections, false, throttler), std::move(temporary_directory_lock));
}
auto output_buffer_getter = [](IDataPartStorage & part_storage, const String & file_name, size_t file_size)
{
return part_storage.writeFile(file_name, std::min<UInt64>(file_size, DBMS_DEFAULT_BUFFER_SIZE), {});
@ -737,65 +680,6 @@ std::pair<MergeTreeData::MutableDataPartPtr, scope_guard> Fetcher::fetchSelected
projections, throttler, sync),std::move(temporary_directory_lock));
}
MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory(
MutableDataPartStoragePtr data_part_storage,
const String & part_name,
const MergeTreePartInfo & part_info,
const UUID & part_uuid,
const StorageMetadataPtr & metadata_snapshot,
ContextPtr context,
ReadWriteBufferFromHTTP & in,
size_t projections,
bool is_projection,
ThrottlerPtr throttler)
{
auto new_data_part = std::make_shared<MergeTreeDataPartInMemory>(data, part_name, part_info, data_part_storage);
for (size_t i = 0; i < projections; ++i)
{
String projection_name;
readStringBinary(projection_name, in);
MergeTreePartInfo new_part_info("all", 0, 0, 0);
auto projection_part_storage = data_part_storage->getProjection(projection_name + ".proj");
auto new_projection_part = downloadPartToMemory(
projection_part_storage, projection_name,
new_part_info, part_uuid, metadata_snapshot,
context, in, 0, true, throttler);
new_data_part->addProjectionPart(projection_name, std::move(new_projection_part));
}
MergeTreeData::DataPart::Checksums checksums;
if (!checksums.read(in))
throw Exception(ErrorCodes::CORRUPTED_DATA, "Cannot deserialize checksums");
NativeReader block_in(in, 0);
auto block = block_in.read();
throttler->add(block.bytes());
new_data_part->setColumns(block.getNamesAndTypesList(), {}, metadata_snapshot->getMetadataVersion());
if (!is_projection)
{
new_data_part->version.setCreationTID(Tx::PrehistoricTID, nullptr);
new_data_part->uuid = part_uuid;
new_data_part->is_temp = true;
new_data_part->minmax_idx->update(block, data.getMinMaxColumnsNames(metadata_snapshot->getPartitionKey()));
new_data_part->partition.create(metadata_snapshot, block, 0, context);
}
MergedBlockOutputStream part_out(
new_data_part, metadata_snapshot, block.getNamesAndTypesList(), {}, {},
CompressionCodecFactory::instance().get("NONE", {}), NO_TRANSACTION_PTR);
part_out.write(block);
part_out.finalizePart(new_data_part, false);
new_data_part->checksums.checkEqual(checksums, /* have_uncompressed = */ true);
return new_data_part;
}
void Fetcher::downloadBaseOrProjectionPartToDisk(
const String & replica_path,

View File

@ -40,10 +40,6 @@ public:
private:
MergeTreeData::DataPartPtr findPart(const String & name);
void sendPartFromMemory(
const MergeTreeData::DataPartPtr & part,
WriteBuffer & out,
bool send_projections);
MergeTreeData::DataPart::Checksums sendPartFromDisk(
const MergeTreeData::DataPartPtr & part,
@ -113,18 +109,6 @@ private:
ThrottlerPtr throttler,
bool sync);
MergeTreeData::MutableDataPartPtr downloadPartToMemory(
MutableDataPartStoragePtr data_part_storage,
const String & part_name,
const MergeTreePartInfo & part_info,
const UUID & part_uuid,
const StorageMetadataPtr & metadata_snapshot,
ContextPtr context,
ReadWriteBufferFromHTTP & in,
size_t projections,
bool is_projection,
ThrottlerPtr throttler);
MergeTreeData::MutableDataPartPtr downloadPartToDiskRemoteMeta(
const String & part_name,
const String & replica_path,

View File

@ -51,7 +51,6 @@ namespace CurrentMetrics
extern const Metric PartsWide;
extern const Metric PartsCompact;
extern const Metric PartsInMemory;
}
namespace DB
@ -278,9 +277,6 @@ static void incrementTypeMetric(MergeTreeDataPartType type)
case MergeTreeDataPartType::Compact:
CurrentMetrics::add(CurrentMetrics::PartsCompact);
return;
case MergeTreeDataPartType::InMemory:
CurrentMetrics::add(CurrentMetrics::PartsInMemory);
return;
case MergeTreeDataPartType::Unknown:
return;
}
@ -296,9 +292,6 @@ static void decrementTypeMetric(MergeTreeDataPartType type)
case MergeTreeDataPartType::Compact:
CurrentMetrics::sub(CurrentMetrics::PartsCompact);
return;
case MergeTreeDataPartType::InMemory:
CurrentMetrics::sub(CurrentMetrics::PartsInMemory);
return;
case MergeTreeDataPartType::Unknown:
return;
}
@ -844,6 +837,27 @@ void IMergeTreeDataPart::loadIndex() const
for (size_t j = 0; j < key_size; ++j)
key_serializations[j]->deserializeBinary(*loaded_index[j], *index_file, {});
/// Cut useless suffix columns, if necessary.
Float64 ratio_to_drop_suffix_columns = storage.getSettings()->primary_key_ratio_of_unique_prefix_values_to_skip_suffix_columns;
if (key_size > 1 && ratio_to_drop_suffix_columns > 0 && ratio_to_drop_suffix_columns < 1)
{
chassert(marks_count > 0);
for (size_t j = 0; j < key_size - 1; ++j)
{
size_t num_changes = 0;
for (size_t i = 1; i < marks_count; ++i)
if (0 != loaded_index[j]->compareAt(i, i - 1, *loaded_index[j], 0))
++num_changes;
if (static_cast<Float64>(num_changes) / marks_count >= ratio_to_drop_suffix_columns)
{
key_size = j + 1;
loaded_index.resize(key_size);
break;
}
}
}
for (size_t i = 0; i < key_size; ++i)
{
loaded_index[i]->shrinkToFit();
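
The new block in loadIndex() drops suffix primary key columns from the in-memory index once a prefix column already changes its value at `primary_key_ratio_of_unique_prefix_values_to_skip_suffix_columns` of the marks, since the remaining columns then add little selectivity (the select-executor hunk further down treats the missing columns as unbounded). A toy, self-contained illustration of that decision with assumed numbers:

```cpp
#include <cstdio>
#include <vector>

int main()
{
    /// Setting value and mark statistics are assumed, purely for illustration.
    const double ratio_to_drop_suffix_columns = 0.9;
    const size_t marks_count = 1000;
    /// How many of the 1000 marks change value in key columns 0 and 1 (column 2 is never reached).
    const std::vector<size_t> num_changes_per_column = {950, 400};

    size_t key_size = 3;
    for (size_t j = 0; j + 1 < key_size && j < num_changes_per_column.size(); ++j)
    {
        double changes_ratio = static_cast<double>(num_changes_per_column[j]) / marks_count;
        if (changes_ratio >= ratio_to_drop_suffix_columns)
        {
            key_size = j + 1; /// column 0 alone distinguishes >= 90% of marks: drop columns 1 and 2
            break;
        }
    }
    std::printf("keeping %zu of 3 primary key columns in memory\n", key_size);
    return 0;
}
```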
@ -2207,11 +2221,6 @@ bool isWidePart(const MergeTreeDataPartPtr & data_part)
return (data_part && data_part->getType() == MergeTreeDataPartType::Wide);
}
bool isInMemoryPart(const MergeTreeDataPartPtr & data_part)
{
return (data_part && data_part->getType() == MergeTreeDataPartType::InMemory);
}
std::optional<std::string> getIndexExtensionFromFilesystem(const IDataPartStorage & data_part_storage)
{
if (data_part_storage.exists())

View File

@ -710,7 +710,6 @@ using MergeTreeMutableDataPartPtr = std::shared_ptr<IMergeTreeDataPart>;
bool isCompactPart(const MergeTreeDataPartPtr & data_part);
bool isWidePart(const MergeTreeDataPartPtr & data_part);
bool isInMemoryPart(const MergeTreeDataPartPtr & data_part);
inline String getIndexExtension(bool is_compressed_primary_key) { return is_compressed_primary_key ? ".cidx" : ".idx"; }
std::optional<String> getIndexExtensionFromFilesystem(const IDataPartStorage & data_part_storage);

View File

@ -34,8 +34,6 @@ public:
virtual bool isWidePart() const = 0;
virtual bool isInMemoryPart() const = 0;
virtual bool isProjectionPart() const = 0;
virtual DataPartStoragePtr getDataPartStorage() const = 0;

View File

@ -22,8 +22,6 @@ public:
bool isWidePart() const override { return DB::isWidePart(data_part); }
bool isInMemoryPart() const override { return DB::isInMemoryPart(data_part); }
bool isProjectionPart() const override { return data_part->isProjectionPart(); }
DataPartStoragePtr getDataPartStorage() const override { return data_part->getDataPartStoragePtr(); }

View File

@ -309,7 +309,7 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
ctx->rows_sources_uncompressed_write_buf = ctx->tmp_disk->createRawStream();
ctx->rows_sources_write_buf = std::make_unique<CompressedWriteBuffer>(*ctx->rows_sources_uncompressed_write_buf);
MergeTreeDataPartInMemory::ColumnToSize local_merged_column_to_size;
std::map<String, UInt64> local_merged_column_to_size;
for (const MergeTreeData::DataPartPtr & part : global_ctx->future_part->parts)
part->accumulateColumnSizes(local_merged_column_to_size);

View File

@ -72,7 +72,6 @@
#include <Storages/MergeTree/DataPartStorageOnDiskFull.h>
#include <Storages/MergeTree/MergeTreeDataPartBuilder.h>
#include <Storages/MergeTree/MergeTreeDataPartCompact.h>
#include <Storages/MergeTree/MergeTreeDataPartInMemory.h>
#include <Storages/Statistics/Estimator.h>
#include <Storages/MergeTree/MergeTreeSelectProcessor.h>
#include <Storages/MergeTree/checkDataPart.h>
@ -1707,8 +1706,7 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks, std::optional<std::un
{
/// Skip temporary directories, file 'format_version.txt' and directory 'detached'.
if (startsWith(it->name(), "tmp") || it->name() == MergeTreeData::FORMAT_VERSION_FILE_NAME
|| it->name() == MergeTreeData::DETACHED_DIR_NAME
|| startsWith(it->name(), MergeTreeWriteAheadLog::WAL_FILE_NAME))
|| it->name() == MergeTreeData::DETACHED_DIR_NAME)
continue;
if (auto part_info = MergeTreePartInfo::tryParsePartName(it->name(), format_version))
@ -2261,7 +2259,6 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
bool reached_removal_time = part_remove_time <= time_now && time_now - part_remove_time >= getSettings()->old_parts_lifetime.totalSeconds();
if ((reached_removal_time && !has_skipped_mutation_parent(part))
|| force
|| isInMemoryPart(part) /// Remove in-memory parts immediately to not store excessive data in RAM
|| (part->version.creation_csn == Tx::RolledBackCSN && getSettings()->remove_rolled_back_parts_immediately))
{
part->removal_state.store(DataPartRemovalState::REMOVED, std::memory_order_relaxed);
@ -5225,14 +5222,14 @@ Pipe MergeTreeData::alterPartition(
case PartitionCommand::FREEZE_PARTITION:
{
auto lock = lockForShare(query_context->getCurrentQueryId(), query_context->getSettingsRef().lock_acquire_timeout);
current_command_results = freezePartition(command.partition, metadata_snapshot, command.with_name, query_context, lock);
current_command_results = freezePartition(command.partition, command.with_name, query_context, lock);
}
break;
case PartitionCommand::FREEZE_ALL_PARTITIONS:
{
auto lock = lockForShare(query_context->getCurrentQueryId(), query_context->getSettingsRef().lock_acquire_timeout);
current_command_results = freezeAll(command.with_name, metadata_snapshot, query_context, lock);
current_command_results = freezeAll(command.with_name, query_context, lock);
}
break;
@ -7103,27 +7100,6 @@ std::pair<MergeTreeData::MutableDataPartPtr, scope_guard> MergeTreeData::cloneAn
scope_guard src_flushed_tmp_dir_lock;
MergeTreeData::MutableDataPartPtr src_flushed_tmp_part;
/// If source part is in memory, flush it to disk and clone it already in on-disk format
/// Protect tmp dir from removing by cleanup thread with src_flushed_tmp_dir_lock
/// Construct src_flushed_tmp_part in order to delete part with its directory at destructor
if (auto src_part_in_memory = asInMemoryPart(src_part))
{
auto flushed_part_path = *src_part_in_memory->getRelativePathForPrefix(tmp_part_prefix);
auto tmp_src_part_file_name = fs::path(tmp_dst_part_name).filename();
src_flushed_tmp_dir_lock = src_part->storage.getTemporaryPartDirectoryHolder(tmp_src_part_file_name);
auto flushed_part_storage = src_part_in_memory->flushToDisk(flushed_part_path, metadata_snapshot);
src_flushed_tmp_part = MergeTreeDataPartBuilder(*this, src_part->name, flushed_part_storage)
.withPartInfo(src_part->info)
.withPartFormatFromDisk()
.build();
src_flushed_tmp_part->is_temp = true;
src_part_storage = flushed_part_storage;
}
String with_copy;
if (params.copy_instead_of_hardlink)
with_copy = " (copying data)";
@ -7305,26 +7281,23 @@ MergeTreeData::MatcherFn MergeTreeData::getPartitionMatcher(const ASTPtr & parti
PartitionCommandsResultInfo MergeTreeData::freezePartition(
const ASTPtr & partition_ast,
const StorageMetadataPtr & metadata_snapshot,
const String & with_name,
ContextPtr local_context,
TableLockHolder &)
{
return freezePartitionsByMatcher(getPartitionMatcher(partition_ast, local_context), metadata_snapshot, with_name, local_context);
return freezePartitionsByMatcher(getPartitionMatcher(partition_ast, local_context), with_name, local_context);
}
PartitionCommandsResultInfo MergeTreeData::freezeAll(
const String & with_name,
const StorageMetadataPtr & metadata_snapshot,
ContextPtr local_context,
TableLockHolder &)
{
return freezePartitionsByMatcher([] (const String &) { return true; }, metadata_snapshot, with_name, local_context);
return freezePartitionsByMatcher([] (const String &) { return true; }, with_name, local_context);
}
PartitionCommandsResultInfo MergeTreeData::freezePartitionsByMatcher(
MatcherFn matcher,
const StorageMetadataPtr & metadata_snapshot,
const String & with_name,
ContextPtr local_context)
{
@ -7376,22 +7349,6 @@ PartitionCommandsResultInfo MergeTreeData::freezePartitionsByMatcher(
scope_guard src_flushed_tmp_dir_lock;
MergeTreeData::MutableDataPartPtr src_flushed_tmp_part;
if (auto part_in_memory = asInMemoryPart(part))
{
auto flushed_part_path = *part_in_memory->getRelativePathForPrefix("tmp_freeze");
src_flushed_tmp_dir_lock = part->storage.getTemporaryPartDirectoryHolder("tmp_freeze" + part->name);
auto flushed_part_storage = part_in_memory->flushToDisk(flushed_part_path, metadata_snapshot);
src_flushed_tmp_part = MergeTreeDataPartBuilder(*this, part->name, flushed_part_storage)
.withPartInfo(part->info)
.withPartFormatFromDisk()
.build();
src_flushed_tmp_part->is_temp = true;
data_part_storage = flushed_part_storage;
}
auto callback = [this, &part, &backup_part_path](const DiskPtr & disk)
{
// Store metadata for replicated table.

View File

@ -23,9 +23,7 @@
#include <Storages/MergeTree/MergeList.h>
#include <Storages/MergeTree/IMergeTreeDataPart.h>
#include <Storages/MergeTree/MergeTreeDataPartBuilder.h>
#include <Storages/MergeTree/MergeTreeDataPartInMemory.h>
#include <Storages/MergeTree/MergeTreePartsMover.h>
#include <Storages/MergeTree/MergeTreeWriteAheadLog.h>
#include <Storages/MergeTree/PinnedPartUUIDs.h>
#include <Storages/MergeTree/ZeroCopyLock.h>
#include <Storages/MergeTree/TemporaryParts.h>
@ -752,7 +750,6 @@ public:
*/
PartitionCommandsResultInfo freezePartition(
const ASTPtr & partition,
const StorageMetadataPtr & metadata_snapshot,
const String & with_name,
ContextPtr context,
TableLockHolder & table_lock_holder);
@ -760,7 +757,6 @@ public:
/// Freezes all parts.
PartitionCommandsResultInfo freezeAll(
const String & with_name,
const StorageMetadataPtr & metadata_snapshot,
ContextPtr context,
TableLockHolder & table_lock_holder);
@ -1307,7 +1303,7 @@ protected:
bool isPrimaryOrMinMaxKeyColumnPossiblyWrappedInFunctions(const ASTPtr & node, const StorageMetadataPtr & metadata_snapshot) const;
/// Common part for |freezePartition()| and |freezeAll()|.
PartitionCommandsResultInfo freezePartitionsByMatcher(MatcherFn matcher, const StorageMetadataPtr & metadata_snapshot, const String & with_name, ContextPtr context);
PartitionCommandsResultInfo freezePartitionsByMatcher(MatcherFn matcher, const String & with_name, ContextPtr context);
PartitionCommandsResultInfo unfreezePartitionsByMatcher(MatcherFn matcher, const String & backup_name, ContextPtr context);
// Partition helpers

View File

@ -1,5 +1,4 @@
#include <Storages/MergeTree/MergeTreeDataPartBuilder.h>
#include <Storages/MergeTree/MergeTreeDataPartInMemory.h>
#include <Storages/MergeTree/MergeTreeDataPartCompact.h>
#include <Storages/MergeTree/MergeTreeDataPartWide.h>
#include <Storages/MergeTree/DataPartStorageOnDiskFull.h>
@ -64,8 +63,6 @@ std::shared_ptr<IMergeTreeDataPart> MergeTreeDataPartBuilder::build()
return std::make_shared<MergeTreeDataPartWide>(data, name, *part_info, part_storage, parent_part);
case PartType::Compact:
return std::make_shared<MergeTreeDataPartCompact>(data, name, *part_info, part_storage, parent_part);
case PartType::InMemory:
return std::make_shared<MergeTreeDataPartInMemory>(data, name, *part_info, part_storage, parent_part);
default:
throw Exception(ErrorCodes::UNKNOWN_PART_TYPE,
"Unknown type of part {}", part_storage->getRelativePath());

View File

@ -1,115 +0,0 @@
#include <Storages/MergeTree/MergeTreeDataPartInMemory.h>
#include <Storages/MergeTree/MergeTreeReaderInMemory.h>
#include <Storages/MergeTree/MergedBlockOutputStream.h>
#include <Storages/MergeTree/IMergeTreeReader.h>
#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>
#include <Storages/MergeTree/DataPartStorageOnDiskFull.h>
#include <DataTypes/NestedUtils.h>
#include <Disks/createVolume.h>
#include <Interpreters/Context.h>
#include <Poco/Logger.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
MergeTreeDataPartInMemory::MergeTreeDataPartInMemory(
const MergeTreeData & storage_,
const String & name_,
const MergeTreePartInfo & info_,
const MutableDataPartStoragePtr & data_part_storage_,
const IMergeTreeDataPart * parent_part_)
: IMergeTreeDataPart(storage_, name_, info_, data_part_storage_, Type::InMemory, parent_part_)
{
default_codec = CompressionCodecFactory::instance().get("NONE", {});
}
IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartInMemory::getReader(
const NamesAndTypesList & columns_to_read,
const StorageSnapshotPtr & storage_snapshot,
const MarkRanges & mark_ranges,
const VirtualFields & virtual_fields,
UncompressedCache * /* uncompressed_cache */,
MarkCache * /* mark_cache */,
const AlterConversionsPtr & alter_conversions,
const MergeTreeReaderSettings & reader_settings,
const ValueSizeMap & /* avg_value_size_hints */,
const ReadBufferFromFileBase::ProfileCallback & /* profile_callback */) const
{
auto read_info = std::make_shared<LoadedMergeTreeDataPartInfoForReader>(shared_from_this(), alter_conversions);
auto ptr = std::static_pointer_cast<const MergeTreeDataPartInMemory>(shared_from_this());
return std::make_unique<MergeTreeReaderInMemory>(
read_info,
ptr,
columns_to_read,
virtual_fields,
storage_snapshot,
mark_ranges,
reader_settings);
}
IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartInMemory::getWriter(
const NamesAndTypesList &,
const StorageMetadataPtr &,
const std::vector<MergeTreeIndexPtr> &,
const Statistics &,
const CompressionCodecPtr &,
const MergeTreeWriterSettings &,
const MergeTreeIndexGranularity &)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "In-memory data parts are obsolete and no longer supported for writing");
}
MutableDataPartStoragePtr MergeTreeDataPartInMemory::flushToDisk(const String &, const StorageMetadataPtr &) const
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "In-memory data parts are obsolete and no longer supported for writing");
}
DataPartStoragePtr MergeTreeDataPartInMemory::makeCloneInDetached(const String & prefix,
const StorageMetadataPtr & metadata_snapshot,
const DiskTransactionPtr & disk_transaction) const
{
if (disk_transaction)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "InMemory parts are not compatible with disk transactions");
String detached_path = *getRelativePathForDetachedPart(prefix, /* broken */ false);
return flushToDisk(detached_path, metadata_snapshot);
}
void MergeTreeDataPartInMemory::renameTo(const String & new_relative_path, bool /* remove_new_dir_if_exists */)
{
getDataPartStorage().setRelativePath(new_relative_path);
}
void MergeTreeDataPartInMemory::calculateEachColumnSizes(ColumnSizeByName & each_columns_size, ColumnSize & total_size) const
{
auto it = checksums.files.find("data.bin");
if (it != checksums.files.end())
total_size.data_uncompressed += it->second.uncompressed_size;
for (const auto & column : columns)
each_columns_size[column.name].data_uncompressed += block.getByName(column.name).column->byteSize();
}
IMergeTreeDataPart::Checksum MergeTreeDataPartInMemory::calculateBlockChecksum() const
{
SipHash hash;
IMergeTreeDataPart::Checksum checksum;
for (const auto & column : block)
column.column->updateHashFast(hash);
checksum.uncompressed_size = block.bytes();
checksum.uncompressed_hash = getSipHash128AsPair(hash);
return checksum;
}
DataPartInMemoryPtr asInMemoryPart(const MergeTreeDataPartPtr & part)
{
return std::dynamic_pointer_cast<const MergeTreeDataPartInMemory>(part);
}
}

View File

@ -1,70 +0,0 @@
#pragma once
#include <Storages/MergeTree/IMergeTreeDataPart.h>
namespace DB
{
class UncompressedCache;
class MergeTreeDataPartInMemory : public IMergeTreeDataPart
{
public:
MergeTreeDataPartInMemory(
const MergeTreeData & storage_,
const String & name_,
const MergeTreePartInfo & info_,
const MutableDataPartStoragePtr & data_part_storage_,
const IMergeTreeDataPart * parent_part_ = nullptr);
MergeTreeReaderPtr getReader(
const NamesAndTypesList & columns,
const StorageSnapshotPtr & storage_snapshot,
const MarkRanges & mark_ranges,
const VirtualFields & virtual_fields,
UncompressedCache * uncompressed_cache,
MarkCache * mark_cache,
const AlterConversionsPtr & alter_conversions,
const MergeTreeReaderSettings & reader_settings_,
const ValueSizeMap & avg_value_size_hints,
const ReadBufferFromFileBase::ProfileCallback & profile_callback) const override;
MergeTreeWriterPtr getWriter(
const NamesAndTypesList & columns_list,
const StorageMetadataPtr & metadata_snapshot,
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
const Statistics & stats_to_recalc_,
const CompressionCodecPtr & default_codec_,
const MergeTreeWriterSettings & writer_settings,
const MergeTreeIndexGranularity & computed_index_granularity) override;
bool isStoredOnDisk() const override { return false; }
bool isStoredOnRemoteDisk() const override { return false; }
bool isStoredOnRemoteDiskWithZeroCopySupport() const override { return false; }
bool hasColumnFiles(const NameAndTypePair & column) const override { return !!getColumnPosition(column.getNameInStorage()); }
std::optional<String> getFileNameForColumn(const NameAndTypePair & /* column */) const override { return ""; }
void renameTo(const String & new_relative_path, bool remove_new_dir_if_exists) override;
DataPartStoragePtr makeCloneInDetached(const String & prefix, const StorageMetadataPtr & metadata_snapshot,
const DiskTransactionPtr & disk_transaction) const override;
std::optional<time_t> getColumnModificationTime(const String & /* column_name */) const override { return {}; }
MutableDataPartStoragePtr flushToDisk(const String & new_relative_path, const StorageMetadataPtr & metadata_snapshot) const;
/// Returns hash of parts's block
Checksum calculateBlockChecksum() const;
mutable Block block;
private:
mutable std::condition_variable is_merged;
/// Calculates uncompressed sizes in memory.
void calculateEachColumnSizes(ColumnSizeByName & each_columns_size, ColumnSize & total_size) const override;
};
using DataPartInMemoryPtr = std::shared_ptr<const MergeTreeDataPartInMemory>;
using MutableDataPartInMemoryPtr = std::shared_ptr<MergeTreeDataPartInMemory>;
DataPartInMemoryPtr asInMemoryPart(const MergeTreeDataPartPtr & part);
}

View File

@ -42,9 +42,6 @@ public:
/// Data of all columns is stored in one file. Marks are also stored in single file.
Compact,
/// Format with buffering data in RAM. Obsolete - new parts cannot be created in this format.
InMemory,
Unknown,
};

View File

@ -1,5 +1,8 @@
#pragma once
#include <Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h>
#include <Formats/MarkInCompressedFile.h>
namespace DB
{

View File

@ -1018,7 +1018,11 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
DataTypes key_types;
for (size_t i : key_indices)
{
index_columns->emplace_back(ColumnWithTypeAndName{index[i], primary_key.data_types[i], primary_key.column_names[i]});
if (i < index.size())
index_columns->emplace_back(index[i], primary_key.data_types[i], primary_key.column_names[i]);
else
index_columns->emplace_back(); /// The column of the primary key was not loaded in memory - we'll skip it.
key_types.emplace_back(primary_key.data_types[i]);
}
@ -1027,7 +1031,6 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
std::function<void(size_t, size_t, FieldRef &)> create_field_ref;
if (key_condition.hasMonotonicFunctionsChain())
{
create_field_ref = [index_columns](size_t row, size_t column, FieldRef & field)
{
field = {index_columns.get(), row, column};
@ -1067,7 +1070,11 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
{
for (size_t i = 0; i < used_key_size; ++i)
{
create_field_ref(range.begin, i, index_left[i]);
if ((*index_columns)[i].column)
create_field_ref(range.begin, i, index_left[i]);
else
index_left[i] = NEGATIVE_INFINITY;
index_right[i] = POSITIVE_INFINITY;
}
}
@ -1078,8 +1085,17 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
for (size_t i = 0; i < used_key_size; ++i)
{
create_field_ref(range.begin, i, index_left[i]);
create_field_ref(range.end, i, index_right[i]);
if ((*index_columns)[i].column)
{
create_field_ref(range.begin, i, index_left[i]);
create_field_ref(range.end, i, index_right[i]);
}
else
{
/// If the PK column was not loaded in memory - exclude it from the analysis.
index_left[i] = NEGATIVE_INFINITY;
index_right[i] = POSITIVE_INFINITY;
}
}
}
key_condition_maybe_true = key_condition.mayBeTrueInRange(used_key_size, index_left.data(), index_right.data(), key_types);
@ -1114,6 +1130,7 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
bool part_offset_condition_exact_range
= !part_offset_condition || part_offset_condition->alwaysUnknownOrTrue() || part_offset_condition->matchesExactContinuousRange();
const String & part_name = part->isProjectionPart() ? fmt::format("{}.{}", part->name, part->getParentPart()->name) : part->name;
if (!key_condition_exact_range || !part_offset_condition_exact_range)
{
// Do exclusion search, where we drop ranges that do not match
@ -1128,10 +1145,10 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
part->index_granularity_info.index_granularity_bytes);
/** There will always be disjoint suspicious segments on the stack, the leftmost one at the top (back).
* At each step, take the left segment and check if it fits.
* If fits, split it into smaller ones and put them on the stack. If not, discard it.
* If the segment is already of one mark length, add it to response and discard it.
*/
* At each step, take the left segment and check if it fits.
* If fits, split it into smaller ones and put them on the stack. If not, discard it.
* If the segment is already of one mark length, add it to response and discard it.
*/
std::vector<MarkRange> ranges_stack = { {0, marks_count} };
size_t steps = 0;
@ -1141,7 +1158,7 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
MarkRange range = ranges_stack.back();
ranges_stack.pop_back();
steps++;
++steps;
if (!may_be_true_in_range(range))
continue;
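
Besides substituting -inf/+inf bounds for primary key columns that were not loaded, this function still performs the exclusion search described in the comment above: keep suspicious mark ranges on a stack with the leftmost one on top, discard ranges the key condition rules out, emit ranges that are small enough, and split the rest. A simplified, self-contained sketch of that search with an assumed predicate in place of KeyCondition:

```cpp
#include <cstddef>
#include <functional>
#include <vector>

struct MarkRange { size_t begin; size_t end; };

/// Exclusion search over [0, marks_count).
std::vector<MarkRange> exclusionSearch(
    size_t marks_count,
    const std::function<bool(const MarkRange &)> & may_be_true_in_range,
    size_t min_marks_for_seek = 1)
{
    std::vector<MarkRange> result;
    std::vector<MarkRange> ranges_stack = {{0, marks_count}};

    while (!ranges_stack.empty())
    {
        MarkRange range = ranges_stack.back();
        ranges_stack.pop_back();

        if (!may_be_true_in_range(range))
            continue; /// the index proves nothing in this range can match

        if (range.end - range.begin <= min_marks_for_seek)
        {
            result.push_back(range); /// small enough to read as-is
        }
        else
        {
            size_t middle = (range.begin + range.end) / 2;
            ranges_stack.push_back({middle, range.end});   /// right half goes deeper in the stack
            ranges_stack.push_back({range.begin, middle}); /// left half stays on top
        }
    }
    return result;
}

int main()
{
    /// Toy predicate standing in for KeyCondition: only marks [100, 200) can match.
    auto may_be_true = [](const MarkRange & r) { return r.begin < 200 && r.end > 100; };
    auto ranges = exclusionSearch(1024, may_be_true);
    return ranges.empty() ? 1 : 0;
}
```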

View File

@ -648,18 +648,11 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl(
const auto & metadata_snapshot = projection.metadata;
MergeTreeDataPartType part_type;
if (parent_part->getType() == MergeTreeDataPartType::InMemory)
{
part_type = MergeTreeDataPartType::InMemory;
}
else
{
/// Size of part would not be greater than block.bytes() + epsilon
size_t expected_size = block.bytes();
// just check if there is enough space on parent volume
data.reserveSpace(expected_size, parent_part->getDataPartStorage());
part_type = data.choosePartFormatOnDisk(expected_size, block.rows()).part_type;
}
/// Size of part would not be greater than block.bytes() + epsilon
size_t expected_size = block.bytes();
// just check if there is enough space on parent volume
data.reserveSpace(expected_size, parent_part->getDataPartStorage());
part_type = data.choosePartFormatOnDisk(expected_size, block.rows()).part_type;
auto new_data_part = parent_part->getProjectionPartBuilder(part_name, is_temp).withPartType(part_type).build();
auto projection_part_storage = new_data_part->getDataPartStoragePtr();
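With the InMemory branch gone, projection parts always take the on-disk path: reserve roughly `block.bytes()` on the parent volume and let the data decide between Compact and Wide formats. A hypothetical, simplified version of such a format choice (the threshold names echo ClickHouse's `min_bytes_for_wide_part` / `min_rows_for_wide_part` settings, but the values and the function signature here are made up for illustration):

```cpp
#include <cstddef>
#include <iostream>

enum class PartType { Compact, Wide };

// Small parts become Compact, larger ones Wide; both thresholds must be
// crossed-under for a Compact part. Values are illustrative defaults only.
PartType choosePartFormatOnDisk(size_t expected_bytes, size_t rows,
                                size_t min_bytes_for_wide = 10 * 1024 * 1024,
                                size_t min_rows_for_wide = 100000)
{
    if (expected_bytes < min_bytes_for_wide && rows < min_rows_for_wide)
        return PartType::Compact;
    return PartType::Wide;
}

int main()
{
    size_t expected_size = 4096;  // stand-in for block.bytes()
    size_t rows = 100;            // stand-in for block.rows()
    std::cout << (choosePartFormatOnDisk(expected_size, rows) == PartType::Compact
                      ? "Compact" : "Wide")
              << '\n';
    return 0;
}
```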

View File

@ -81,8 +81,6 @@ std::string MarkType::getFileExtension() const
return res + "2";
case MergeTreeDataPartType::Compact:
return res + "3";
case MergeTreeDataPartType::InMemory:
return "";
case MergeTreeDataPartType::Unknown:
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown data part type");
}
@ -126,8 +124,6 @@ size_t MergeTreeIndexGranularityInfo::getMarkSizeInBytes(size_t columns_num) con
return mark_type.adaptive ? getAdaptiveMrkSizeWide() : getNonAdaptiveMrkSizeWide();
else if (mark_type.part_type == MergeTreeDataPartType::Compact)
return getAdaptiveMrkSizeCompact(columns_num);
else if (mark_type.part_type == MergeTreeDataPartType::InMemory)
return 0;
else
throw Exception(ErrorCodes::UNKNOWN_PART_TYPE, "Unknown part type");
}
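After removing the InMemory case, only Wide and Compact parts have mark files, and the file extension depends on the part type and whether adaptive granularity is used. A rough standalone sketch of that dispatch (the `.mrk`/`.cmrk` prefixes follow the usual naming convention, but this is an illustration, not the actual `MarkType` code):

```cpp
#include <iostream>
#include <stdexcept>
#include <string>

enum class PartType { Wide, Compact, Unknown };

std::string markFileExtension(PartType type, bool compressed, bool adaptive)
{
    std::string res = compressed ? ".cmrk" : ".mrk";
    switch (type)
    {
        case PartType::Wide:
            return adaptive ? res + "2" : res;  // e.g. ".mrk2" vs ".mrk"
        case PartType::Compact:
            return res + "3";                   // e.g. ".mrk3"
        case PartType::Unknown:
            throw std::logic_error("Unknown data part type");
    }
    throw std::logic_error("unreachable");
}

int main()
{
    std::cout << markFileExtension(PartType::Wide, false, true) << '\n';     // .mrk2
    std::cout << markFileExtension(PartType::Compact, false, true) << '\n';  // .mrk3
    return 0;
}
```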

View File

@ -1,119 +0,0 @@
#include <Storages/MergeTree/MergeTreeReaderInMemory.h>
#include <Storages/MergeTree/MergeTreeDataPartInMemory.h>
#include <Interpreters/getColumnFromBlock.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/NestedUtils.h>
#include <Columns/ColumnArray.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_READ_ALL_DATA;
extern const int ARGUMENT_OUT_OF_BOUND;
}
MergeTreeReaderInMemory::MergeTreeReaderInMemory(
MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_,
DataPartInMemoryPtr data_part_,
NamesAndTypesList columns_,
const VirtualFields & virtual_fields_,
const StorageSnapshotPtr & storage_snapshot_,
MarkRanges mark_ranges_,
MergeTreeReaderSettings settings_)
: IMergeTreeReader(
data_part_info_for_read_,
columns_,
virtual_fields_,
storage_snapshot_,
nullptr,
nullptr,
mark_ranges_,
settings_,
{})
, part_in_memory(std::move(data_part_))
{
for (const auto & column_to_read : columns_to_read)
{
/// If array of Nested column is missing in part,
/// we have to read its offsets if they exist.
if (typeid_cast<const DataTypeArray *>(column_to_read.type.get())
&& !tryGetColumnFromBlock(part_in_memory->block, column_to_read))
{
if (auto offsets_position = findColumnForOffsets(column_to_read))
{
positions_for_offsets[column_to_read.name] = *data_part_info_for_read->getColumnPosition(offsets_position->first);
partially_read_columns.insert(column_to_read.name);
}
}
}
}
size_t MergeTreeReaderInMemory::readRows(
size_t from_mark, size_t /* current_task_last_mark */, bool continue_reading, size_t max_rows_to_read, Columns & res_columns)
{
if (!continue_reading)
total_rows_read = 0;
size_t total_marks = data_part_info_for_read->getIndexGranularity().getMarksCount();
if (from_mark >= total_marks)
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Mark {} is out of bound. Max mark: {}",
toString(from_mark), toString(total_marks));
size_t num_columns = res_columns.size();
checkNumberOfColumns(num_columns);
size_t part_rows = part_in_memory->block.rows();
if (total_rows_read >= part_rows)
throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Cannot read data in MergeTreeReaderInMemory. "
"Rows already read: {}. Rows in part: {}", total_rows_read, part_rows);
size_t rows_to_read = std::min(max_rows_to_read, part_rows - total_rows_read);
for (size_t i = 0; i < num_columns; ++i)
{
const auto & column_to_read = columns_to_read[i];
/// Copy offsets, if array of Nested column is missing in part.
auto offsets_it = positions_for_offsets.find(column_to_read.name);
if (offsets_it != positions_for_offsets.end() && !column_to_read.isSubcolumn())
{
const auto & source_offsets = assert_cast<const ColumnArray &>(
*part_in_memory->block.getByPosition(offsets_it->second).column).getOffsets();
if (res_columns[i] == nullptr)
res_columns[i] = column_to_read.type->createColumn();
auto mutable_column = res_columns[i]->assumeMutable();
auto & res_offsets = assert_cast<ColumnArray &>(*mutable_column).getOffsets();
size_t start_offset = total_rows_read ? source_offsets[total_rows_read - 1] : 0;
for (size_t row = 0; row < rows_to_read; ++row)
res_offsets.push_back(source_offsets[total_rows_read + row] - start_offset);
res_columns[i] = std::move(mutable_column);
}
else if (part_in_memory->hasColumnFiles(column_to_read))
{
auto block_column = getColumnFromBlock(part_in_memory->block, column_to_read);
if (rows_to_read == part_rows)
{
res_columns[i] = block_column;
}
else
{
if (res_columns[i] == nullptr)
res_columns[i] = column_to_read.type->createColumn();
auto mutable_column = res_columns[i]->assumeMutable();
mutable_column->insertRangeFrom(*block_column, total_rows_read, rows_to_read);
res_columns[i] = std::move(mutable_column);
}
}
}
total_rows_read += rows_to_read;
return rows_to_read;
}
}
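The deleted `readRows()` above had one subtle step worth illustrating: when an array (Nested) column is missing from the in-memory block, its offsets are copied from another column and rebased so they start at zero for the rows being read. A self-contained sketch of that rebasing with made-up data:

```cpp
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    // Cumulative offsets for 5 array rows of sizes 2, 1, 3, 0, 2.
    std::vector<size_t> source_offsets = {2, 3, 6, 6, 8};

    size_t total_rows_read = 2;  // rows already consumed from the part
    size_t rows_to_read = 2;     // rows to copy now

    // Offset of the first row in the requested range; 0 if reading from the start.
    size_t start_offset = total_rows_read ? source_offsets[total_rows_read - 1] : 0;

    std::vector<size_t> res_offsets;
    for (size_t row = 0; row < rows_to_read; ++row)
        res_offsets.push_back(source_offsets[total_rows_read + row] - start_offset);

    // Rows 2 and 3 have sizes 3 and 0, so the rebased offsets are 3 and 3.
    assert((res_offsets == std::vector<size_t>{3, 3}));
    for (size_t o : res_offsets)
        std::cout << o << '\n';
    return 0;
}
```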

Some files were not shown because too many files have changed in this diff.