diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index be66f21b838..8b137891791 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,3 +1 @@ -docs/* @ClickHouse/docs -docs/zh/* @ClickHouse/docs-zh -website/* @ClickHouse/docs + diff --git a/SECURITY.md b/SECURITY.md index 1872d67a529..f002dd53ca9 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -21,9 +21,10 @@ The following versions of ClickHouse server are currently being supported with s | 21.6 | :x: | | 21.7 | :x: | | 21.8 | ✅ | -| 21.9 | ✅ | +| 21.9 | :x: | | 21.10 | ✅ | | 21.11 | ✅ | +| 21.12 | ✅ | ## Reporting a Vulnerability diff --git a/base/base/wide_integer_impl.h b/base/base/wide_integer_impl.h index b8de5efb859..cffffcc213f 100644 --- a/base/base/wide_integer_impl.h +++ b/base/base/wide_integer_impl.h @@ -827,7 +827,7 @@ public: CompilerUInt128 a = (CompilerUInt128(numerator.items[1]) << 64) + numerator.items[0]; CompilerUInt128 b = (CompilerUInt128(denominator.items[1]) << 64) + denominator.items[0]; - CompilerUInt128 c = a / b; + CompilerUInt128 c = a / b; // NOLINT integer res; res.items[0] = c; @@ -1020,8 +1020,15 @@ constexpr integer::integer(std::initializer_list il) noexcept { auto it = il.begin(); for (size_t i = 0; i < _impl::item_count; ++i) + { if (it < il.end()) + { items[i] = *it; + ++it; + } + else + items[i] = 0; + } } } diff --git a/cmake/find/blob_storage.cmake b/cmake/find/blob_storage.cmake index 74a907da7db..4ad7296e95e 100644 --- a/cmake/find/blob_storage.cmake +++ b/cmake/find/blob_storage.cmake @@ -1,30 +1,29 @@ option (ENABLE_AZURE_BLOB_STORAGE "Enable Azure blob storage" ${ENABLE_LIBRARIES}) -option(USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY - "Set to FALSE to use system Azure SDK instead of bundled (OFF currently not implemented)" - ON) - if (ENABLE_AZURE_BLOB_STORAGE) + option(USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY + "Set to FALSE to use system Azure SDK instead of bundled (OFF currently not implemented)" + ON) + set(USE_AZURE_BLOB_STORAGE 1) set(AZURE_BLOB_STORAGE_LIBRARY azure_sdk) + + if ((NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/azure/sdk" + OR NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/azure/cmake-modules") + AND USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY) + message (WARNING "submodule contrib/azure is missing. to fix try run: \n git submodule update --init") + set(USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY OFF) + set(USE_AZURE_BLOB_STORAGE 0) + endif () + + if (NOT USE_INTERNAL_SSL_LIBRARY AND USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY) + message (FATAL_ERROR "Currently Blob Storage support can be built only with internal SSL library") + endif() + + if (NOT USE_INTERNAL_CURL AND USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY) + message (FATAL_ERROR "Currently Blob Storage support can be built only with internal curl library") + endif() + endif() -if ((NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/azure/sdk" - OR NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/azure/cmake-modules") - AND USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY) - message (WARNING "submodule contrib/azure is missing. 
to fix try run: \n git submodule update --init") - set(USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY OFF) - set(USE_AZURE_BLOB_STORAGE 0) -endif () - -if (NOT USE_INTERNAL_SSL_LIBRARY AND USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY) - message (FATAL_ERROR "Currently Blob Storage support can be built only with internal SSL library") -endif() - -if (NOT USE_INTERNAL_CURL AND USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY) - message (FATAL_ERROR "Currently Blob Storage support can be built only with internal curl library") -endif() - -if (USE_AZURE_BLOB_STORAGE) - message (STATUS "Using Azure Blob Storage - ${USE_AZURE_BLOB_STORAGE}") -endif() +message (STATUS "Using Azure Blob Storage - ${USE_AZURE_BLOB_STORAGE}") diff --git a/cmake/find/ccache.cmake b/cmake/find/ccache.cmake index 95ec3d8a034..9acc0423f67 100644 --- a/cmake/find/ccache.cmake +++ b/cmake/find/ccache.cmake @@ -31,6 +31,7 @@ if (CCACHE_FOUND AND NOT COMPILER_MATCHES_CCACHE) if (CCACHE_VERSION VERSION_GREATER "3.2.0" OR NOT CMAKE_CXX_COMPILER_ID STREQUAL "Clang") message(STATUS "Using ${CCACHE_FOUND} ${CCACHE_VERSION}") + set(LAUNCHER ${CCACHE_FOUND}) # debian (debhelpers) set SOURCE_DATE_EPOCH environment variable, that is # filled from the debian/changelog or current time. @@ -39,13 +40,8 @@ if (CCACHE_FOUND AND NOT COMPILER_MATCHES_CCACHE) # of the manifest, which do not allow to use previous cache, # - 4.2+ ccache ignores SOURCE_DATE_EPOCH for every file w/o __DATE__/__TIME__ # - # So for: - # - 4.2+ does not require any sloppiness - # - 4.0+ will ignore SOURCE_DATE_EPOCH environment variable. - if (CCACHE_VERSION VERSION_GREATER_EQUAL "4.2") - message(STATUS "ccache is 4.2+ no quirks for SOURCE_DATE_EPOCH required") - set(LAUNCHER ${CCACHE_FOUND}) - elseif (CCACHE_VERSION VERSION_GREATER_EQUAL "4.0") + # Exclude SOURCE_DATE_EPOCH env for ccache versions between [4.0, 4.2). 
+ if (CCACHE_VERSION VERSION_GREATER_EQUAL "4.0" AND CCACHE_VERSION VERSION_LESS "4.2") message(STATUS "Ignore SOURCE_DATE_EPOCH for ccache") set(LAUNCHER env -u SOURCE_DATE_EPOCH ${CCACHE_FOUND}) endif() diff --git a/contrib/NuRaft b/contrib/NuRaft index ff100a87131..c2043aa250e 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit ff100a8713146e1ca4b4158dd6cc4eef9af47fc3 +Subproject commit c2043aa250e53ad5cf75e596e319d587af4dcb3c diff --git a/docker/builder/Dockerfile b/docker/builder/Dockerfile deleted file mode 100644 index 49c40d576e7..00000000000 --- a/docker/builder/Dockerfile +++ /dev/null @@ -1,46 +0,0 @@ -FROM ubuntu:20.04 - -# ARG for quick switch to a given ubuntu mirror -ARG apt_archive="http://archive.ubuntu.com" -RUN sed -i "s|http://archive.ubuntu.com|$apt_archive|g" /etc/apt/sources.list - -ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=13 - -RUN apt-get update \ - && apt-get install ca-certificates lsb-release wget gnupg apt-transport-https \ - --yes --no-install-recommends --verbose-versions \ - && export LLVM_PUBKEY_HASH="bda960a8da687a275a2078d43c111d66b1c6a893a3275271beedf266c1ff4a0cdecb429c7a5cccf9f486ea7aa43fd27f" \ - && wget -nv -O /tmp/llvm-snapshot.gpg.key https://apt.llvm.org/llvm-snapshot.gpg.key \ - && echo "${LLVM_PUBKEY_HASH} /tmp/llvm-snapshot.gpg.key" | sha384sum -c \ - && apt-key add /tmp/llvm-snapshot.gpg.key \ - && export CODENAME="$(lsb_release --codename --short | tr 'A-Z' 'a-z')" \ - && echo "deb [trusted=yes] http://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-${LLVM_VERSION} main" >> \ - /etc/apt/sources.list - -RUN apt-get update \ - && apt-get install \ - bash \ - ccache \ - cmake \ - curl \ - expect \ - g++ \ - gcc \ - ninja-build \ - perl \ - pkg-config \ - python3 \ - python3-lxml \ - python3-requests \ - python3-termcolor \ - tzdata \ - llvm-${LLVM_VERSION} \ - clang-${LLVM_VERSION} \ - clang-tidy-${LLVM_VERSION} \ - lld-${LLVM_VERSION} \ - lldb-${LLVM_VERSION} \ - --yes --no-install-recommends - -COPY build.sh / - -CMD ["/bin/bash", "/build.sh"] diff --git a/docker/builder/Makefile b/docker/builder/Makefile deleted file mode 100644 index a9a7cddf3f2..00000000000 --- a/docker/builder/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -build: image - mkdir -p $(HOME)/.ccache - docker run --network=host --rm --workdir /server --volume $(realpath ../..):/server --cap-add=SYS_PTRACE --mount=type=bind,source=$(HOME)/.ccache,destination=/ccache -e CCACHE_DIR=/ccache -it yandex/clickhouse-builder - -pull: - docker pull yandex/clickhouse-builder - -image: - docker build --network=host -t yandex/clickhouse-builder . - -image_push: - docker push yandex/clickhouse-builder diff --git a/docker/builder/README.md b/docker/builder/README.md deleted file mode 100644 index cb9fb7d1b77..00000000000 --- a/docker/builder/README.md +++ /dev/null @@ -1,33 +0,0 @@ -Allows to build ClickHouse in Docker. -This is useful if you have an old OS distribution and you don't want to build fresh gcc or clang from sources. - -Usage: - -Prepare image: -``` -make image -``` - -Run build: -``` -make build -``` - -Before run, ensure that your user has access to docker: -To check, that you have access to Docker, run `docker ps`. -If not, you must add this user to `docker` group: `sudo usermod -aG docker $USER` and relogin. -(You must close all your sessions. For example, restart your computer.) - -Build results are available in `build_docker` directory at top level of your working copy. -It builds only binaries, not packages. 
- -For example, run server: -``` -cd $(git rev-parse --show-toplevel)/src/Server -$(git rev-parse --show-toplevel)/docker/builder/programs/clickhouse server --config-file $(git rev-parse --show-toplevel)/programs/server/config.xml -``` - -Run client: -``` -$(git rev-parse --show-toplevel)/docker/builder/programs/clickhouse client -``` diff --git a/docker/builder/build.sh b/docker/builder/build.sh deleted file mode 100755 index 1025af3f96e..00000000000 --- a/docker/builder/build.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash -set -e - -#ccache -s # uncomment to display CCache statistics -mkdir -p /server/build_docker -cd /server/build_docker -cmake -G Ninja /server "-DCMAKE_C_COMPILER=$(command -v clang-13)" "-DCMAKE_CXX_COMPILER=$(command -v clang++-13)" - -# Set the number of build jobs to the half of number of virtual CPU cores (rounded up). -# By default, ninja use all virtual CPU cores, that leads to very high memory consumption without much improvement in build time. -# Note that modern x86_64 CPUs use two-way hyper-threading (as of 2018). -# Without this option my laptop with 16 GiB RAM failed to execute build due to full system freeze. -NUM_JOBS=$(( ($(nproc || grep -c ^processor /proc/cpuinfo) + 1) / 2 )) - -ninja -j $NUM_JOBS && env TEST_OPT="--skip long compile $TEST_OPT" ctest -V -j $NUM_JOBS diff --git a/docker/images.json b/docker/images.json index dc7126a3f5a..354bdaa8728 100644 --- a/docker/images.json +++ b/docker/images.json @@ -103,6 +103,10 @@ "name": "clickhouse/mysql-golang-client", "dependent": [] }, + "docker/test/integration/dotnet_client": { + "name": "clickhouse/dotnet-client", + "dependent": [] + }, "docker/test/integration/mysql_java_client": { "name": "clickhouse/mysql-java-client", "dependent": [] diff --git a/docker/server/README.md b/docker/server/README.md index c63bb980c13..5a96a63bb05 100644 --- a/docker/server/README.md +++ b/docker/server/README.md @@ -17,6 +17,8 @@ $ docker run -d --name some-clickhouse-server --ulimit nofile=262144:262144 clic By default ClickHouse will be accessible only via docker network. See the [networking section below](#networking). +By default, starting above server instance will be run as default user without password. + ### connect to it from a native client ```bash $ docker run -it --rm --link some-clickhouse-server:clickhouse-server clickhouse/clickhouse-client --host clickhouse-server diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh index 844a7396134..1ebaed752a6 100755 --- a/docker/test/fuzzer/run-fuzzer.sh +++ b/docker/test/fuzzer/run-fuzzer.sh @@ -52,9 +52,21 @@ function clone } +function wget_with_retry +{ + for _ in 1 2 3 4; do + if wget -nv -nd -c "$1";then + return 0 + else + sleep 0.5 + fi + done + return 1 +} + function download { - wget -nv -nd -c "$BINARY_URL_TO_DOWNLOAD" + wget_with_retry "$BINARY_URL_TO_DOWNLOAD" chmod +x clickhouse ln -s ./clickhouse ./clickhouse-server diff --git a/docker/test/integration/dotnet_client/.gitignore b/docker/test/integration/dotnet_client/.gitignore new file mode 100644 index 00000000000..cd42ee34e87 --- /dev/null +++ b/docker/test/integration/dotnet_client/.gitignore @@ -0,0 +1,2 @@ +bin/ +obj/ diff --git a/docker/test/integration/dotnet_client/Dockerfile b/docker/test/integration/dotnet_client/Dockerfile new file mode 100644 index 00000000000..f8d33415175 --- /dev/null +++ b/docker/test/integration/dotnet_client/Dockerfile @@ -0,0 +1,10 @@ +# docker build . 
+# docker run -it --rm --network=host 14f23e59669c dotnet run --host localhost --port 8123 --user default --database default + +FROM mcr.microsoft.com/dotnet/sdk:3.1 + +WORKDIR /client +COPY *.cs *.csproj /client/ + +ARG VERSION=4.1.0 +RUN dotnet add package ClickHouse.Client -v ${VERSION} diff --git a/docker/test/integration/dotnet_client/Program.cs b/docker/test/integration/dotnet_client/Program.cs new file mode 100644 index 00000000000..3f640d15e86 --- /dev/null +++ b/docker/test/integration/dotnet_client/Program.cs @@ -0,0 +1,90 @@ +using System; +using System.Threading.Tasks; +using ClickHouse.Client.ADO; +using ClickHouse.Client.Utility; + +namespace clickhouse.test +{ + class Program + { + static async Task Main(string[] args) + { + try + { + using var connection = new ClickHouseConnection(GetConnectionString(args)); + + await connection.ExecuteStatementAsync("CREATE DATABASE IF NOT EXISTS test"); + await connection.ExecuteStatementAsync("TRUNCATE TABLE IF EXISTS test.dotnet_test"); + await connection.ExecuteStatementAsync("CREATE TABLE IF NOT EXISTS test.dotnet_test (`age` Int32, `name` String) Engine = Memory"); + + using var command = connection.CreateCommand(); + command.AddParameter("name", "Linus Torvalds"); + command.AddParameter("age", 51); + command.CommandText = "INSERT INTO test.dotnet_test VALUES({age:Int32}, {name:String})"; + await command.ExecuteNonQueryAsync(); + + using var result1 = await connection.ExecuteReaderAsync("SELECT * FROM test.dotnet_test"); + while (result1.Read()) + { + var values = new object[result1.FieldCount]; + result1.GetValues(values); + + foreach (var row in values) + { + Console.WriteLine(row); + } + } + + using var result2 = await connection.ExecuteReaderAsync(selectSql); + while (result2.Read()) + { + var values = new object[result2.FieldCount]; + result2.GetValues(values); + + foreach (var row in values) + { + Console.WriteLine(row); + } + } + } + catch (Exception e) + { + Console.Error.WriteLine(e); + Environment.ExitCode = 1; + } + } + + private static string GetConnectionString(string[] args) + { + var builder = new ClickHouseConnectionStringBuilder(); + int i = 0; + while (i < args.Length) + { + switch (args[i]) + { + case "--host": + builder.Host = args[++i]; + break; + case "--port": + builder.Port = UInt16.Parse(args[++i]); + break; + case "--user": + builder.Username = args[++i]; + break; + case "--password": + builder.Password = args[++i]; + break; + case "--database": + builder.Database = args[++i]; + break; + default: + i++; + break; + } + } + return builder.ToString(); + } + + private static string selectSql = @"SELECT NULL, toInt8(-8), toUInt8(8), toInt16(-16), toUInt16(16), toInt16(-32), toUInt16(32), toInt64(-64), toUInt64(64), toFloat32(32e6), toFloat32(-32e6), toFloat64(64e6), toFloat64(-64e6), 'TestString', toFixedString('ASD',3), toFixedString('ASD',5), toUUID('00000000-0000-0000-0000-000000000000'), toUUID('61f0c404-5cb3-11e7-907b-a6006ad3dba0'), toIPv4('1.2.3.4'), toIPv4('255.255.255.255'), CAST('a', 'Enum(\'a\' = 1, \'b\' = 2)'), CAST('a', 'Enum8(\'a\' = -1, \'b\' = 127)'), CAST('a', 'Enum16(\'a\' = -32768, \'b\' = 32767)'), array(1, 2, 3), array('a', 'b', 'c'), array(1, 2, NULL), toInt32OrNull('123'), toInt32OrNull(NULL), CAST(NULL AS Nullable(DateTime)), CAST(NULL AS LowCardinality(Nullable(String))), toLowCardinality('lowcardinality'), tuple(1, 'a', 8), tuple(123, tuple(5, 'a', 7)), toDateOrNull('1999-11-12'), toDateTime('1988-08-28 11:22:33'), toDateTime64('2043-03-01 18:34:04.4444444', 9), toDecimal32(123.45, 3), 
toDecimal32(-123.45, 3), toDecimal64(1.2345, 7), toDecimal64(-1.2345, 7), toDecimal128(12.34, 9), toDecimal128(-12.34, 9), toIPv6('2001:0db8:85a3:0000:0000:8a2e:0370:7334')"; + } +} diff --git a/docker/test/integration/dotnet_client/clickhouse.test.csproj b/docker/test/integration/dotnet_client/clickhouse.test.csproj new file mode 100644 index 00000000000..11704487bf6 --- /dev/null +++ b/docker/test/integration/dotnet_client/clickhouse.test.csproj @@ -0,0 +1,13 @@ + + + + Exe + netcoreapp3.1 + + + + + + + + diff --git a/docker/test/integration/runner/compose/docker_compose_dotnet_client.yml b/docker/test/integration/runner/compose/docker_compose_dotnet_client.yml new file mode 100644 index 00000000000..b63dac51522 --- /dev/null +++ b/docker/test/integration/runner/compose/docker_compose_dotnet_client.yml @@ -0,0 +1,6 @@ +version: '2.3' +services: + dotnet1: + image: clickhouse/dotnet-client:${DOCKER_DOTNET_CLIENT_TAG:-latest} + # to keep container running + command: sleep infinity diff --git a/docker/test/integration/runner/dockerd-entrypoint.sh b/docker/test/integration/runner/dockerd-entrypoint.sh index ad8a8e4eb84..8109ef7ae64 100755 --- a/docker/test/integration/runner/dockerd-entrypoint.sh +++ b/docker/test/integration/runner/dockerd-entrypoint.sh @@ -39,6 +39,7 @@ export CLICKHOUSE_ODBC_BRIDGE_BINARY_PATH=/clickhouse-odbc-bridge export CLICKHOUSE_LIBRARY_BRIDGE_BINARY_PATH=/clickhouse-library-bridge export DOCKER_MYSQL_GOLANG_CLIENT_TAG=${DOCKER_MYSQL_GOLANG_CLIENT_TAG:=latest} +export DOCKER_DOTNET_CLIENT_TAG=${DOCKER_DOTNET_CLIENT_TAG:=latest} export DOCKER_MYSQL_JAVA_CLIENT_TAG=${DOCKER_MYSQL_JAVA_CLIENT_TAG:=latest} export DOCKER_MYSQL_JS_CLIENT_TAG=${DOCKER_MYSQL_JS_CLIENT_TAG:=latest} export DOCKER_MYSQL_PHP_CLIENT_TAG=${DOCKER_MYSQL_PHP_CLIENT_TAG:=latest} diff --git a/docs/_includes/install/freebsd.sh b/docs/_includes/install/freebsd.sh index 50e3bc02cb7..2a715a1795f 100644 --- a/docs/_includes/install/freebsd.sh +++ b/docs/_includes/install/freebsd.sh @@ -1,3 +1,3 @@ -wget 'https://builds.clickhouse.com/master/freebsd/clickhouse' +fetch 'https://builds.clickhouse.com/master/freebsd/clickhouse' chmod a+x ./clickhouse -sudo ./clickhouse install +su -m root -c './clickhouse install' diff --git a/docs/en/engines/database-engines/materialized-postgresql.md b/docs/en/engines/database-engines/materialized-postgresql.md index d2c4dbf1f3c..43f61201946 100644 --- a/docs/en/engines/database-engines/materialized-postgresql.md +++ b/docs/en/engines/database-engines/materialized-postgresql.md @@ -5,15 +5,15 @@ toc_title: MaterializedPostgreSQL # [experimental] MaterializedPostgreSQL {#materialize-postgresql} -Creates ClickHouse database with an initial data dump of PostgreSQL database tables and starts replication process, i.e. executes background job to apply new changes as they happen on PostgreSQL database tables in the remote PostgreSQL database. +Creates a ClickHouse database with tables from PostgreSQL database. Firstly, database with engine `MaterializedPostgreSQL` creates a snapshot of PostgreSQL database and loads required tables. Required tables can include any subset of tables from any subset of schemas from specified database. Along with the snapshot database engine acquires LSN and once initial dump of tables is performed - it starts pulling updates from WAL. After database is created, newly added tables to PostgreSQL database are not automatically added to replication. They have to be added manually with `ATTACH TABLE db.table` query. 
-ClickHouse server works as PostgreSQL replica. It reads WAL and performs DML queries. DDL is not replicated, but can be handled (described below). +Replication is implemented with PostgreSQL Logical Replication Protocol, which does not allow to replicate DDL, but allows to know whether replication breaking changes happened (column type changes, adding/removing columns). Such changes are detected and according tables stop receiving updates. Such tables can be automatically reloaded in the background in case required setting is turned on (can be used starting from 22.1). Safest way for now is to use `ATTACH`/ `DETACH` queries to reload table completely. If DDL does not break replication (for example, renaming a column) table will still receive updates (insertion is done by position). ## Creating a Database {#creating-a-database} ``` sql CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] -ENGINE = MaterializedPostgreSQL('host:port', ['database' | database], 'user', 'password') [SETTINGS ...] +ENGINE = MaterializedPostgreSQL('host:port', 'database', 'user', 'password') [SETTINGS ...] ``` **Engine Parameters** @@ -23,51 +23,39 @@ ENGINE = MaterializedPostgreSQL('host:port', ['database' | database], 'user', 'p - `user` — PostgreSQL user. - `password` — User password. +## Example of Use {#example-of-use} + +``` sql +CREATE DATABASE postgresql; +ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password'); + +SHOW TABLES FROM postgres_db; + +┌─name───┐ +│ table1 │ +└────────┘ + +SELECT * FROM postgresql_db.postgres_table; +``` + ## Dynamically adding new tables to replication {#dynamically-adding-table-to-replication} +After `MaterializedPostgreSQL` database is created, it does not automatically detect new tables in according PostgreSQL database. Such tables can be added manually: + ``` sql ATTACH TABLE postgres_database.new_table; ``` -When specifying a specific list of tables in the database using the setting [materialized_postgresql_tables_list](../../operations/settings/settings.md#materialized-postgresql-tables-list), it will be updated to the current state, taking into account the tables which were added by the `ATTACH TABLE` query. +Warning: before version 22.1 adding table to replication left unremoved temprorary replication slot (named `{db_name}_ch_replication_slot_tmp`). If attaching tables in clickhouse version before 22.1, make sure to delete it manually (`SELECT pg_drop_replication_slot('{db_name}_ch_replication_slot_tmp')`). Otherwise disk usage will grow. Issue is fixed in 22.1. 
## Dynamically removing tables from replication {#dynamically-removing-table-from-replication} +It is possible to remove specific tables from replication: + ``` sql DETACH TABLE postgres_database.table_to_remove; ``` -## Settings {#settings} - -- [materialized_postgresql_tables_list](../../operations/settings/settings.md#materialized-postgresql-tables-list) - -- [materialized_postgresql_schema](../../operations/settings/settings.md#materialized-postgresql-schema) - -- [materialized_postgresql_schema_list](../../operations/settings/settings.md#materialized-postgresql-schema-list) - -- [materialized_postgresql_allow_automatic_update](../../operations/settings/settings.md#materialized-postgresql-allow-automatic-update) - -- [materialized_postgresql_max_block_size](../../operations/settings/settings.md#materialized-postgresql-max-block-size) - -- [materialized_postgresql_replication_slot](../../operations/settings/settings.md#materialized-postgresql-replication-slot) - -- [materialized_postgresql_snapshot](../../operations/settings/settings.md#materialized-postgresql-snapshot) - -``` sql -CREATE DATABASE database1 -ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password') -SETTINGS materialized_postgresql_tables_list = 'table1,table2,table3'; - -SELECT * FROM database1.table1; -``` - -The settings can be changed, if necessary, using a DDL query. But it is impossible to change the setting `materialized_postgresql_tables_list`. To update the list of tables in this setting use the `ATTACH TABLE` query. - -``` sql -ALTER DATABASE postgres_database MODIFY SETTING materialized_postgresql_max_block_size = ; -``` - - ## PostgreSQL schema {#schema} PostgreSQL [schema](https://www.postgresql.org/docs/9.1/ddl-schemas.html) can be configured in 3 ways (starting from version 21.12). @@ -89,7 +77,7 @@ Tables are accessed via schema name and table name at the same time: ``` sql CREATE DATABASE database1 ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password') -SETTINGS materialized_postgresql_tables_list = 'schema1.table1,schema2.table2,schema1.table3'; +SETTINGS materialized_postgresql_tables_list = 'schema1.table1,schema2.table2,schema1.table3', materialized_postgresql_tables_list_with_schema = 1; SELECT * FROM database1.`schema1.table1`; @@ -150,13 +138,65 @@ WHERE oid = 'postgres_table'::regclass; !!! warning "Warning" Replication of [**TOAST**](https://www.postgresql.org/docs/9.5/storage-toast.html) values is not supported. The default value for the data type will be used. -## Example of Use {#example-of-use} +## Settings {#settings} + +1. materialized_postgresql_tables_list {#materialized-postgresql-tables-list} + +Sets a comma-separated list of PostgreSQL database tables, which will be replicated via [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) database engine. + +Default value: empty list — means whole PostgreSQL database will be replicated. + +2. materialized_postgresql_schema {#materialized-postgresql-schema} + +Default value: empty string. (Default schema is used) + +3. materialized_postgresql_schema_list {#materialized-postgresql-schema-list} + +Default value: empty list. (Default schema is used) + +4. materialized_postgresql_allow_automatic_update {#materialized-postgresql-allow-automatic-update} + +Do not use this setting before 22.1 version. + +Allows reloading table in the background, when schema changes are detected. 
DDL queries on the PostgreSQL side are not replicated via ClickHouse [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) engine, because it is not allowed with PostgreSQL logical replication protocol, but the fact of DDL changes is detected transactionally. In this case, the default behaviour is to stop replicating those tables once DDL is detected. However, if this setting is enabled, then, instead of stopping the replication of those tables, they will be reloaded in the background via database snapshot without data losses and replication will continue for them. + +Possible values: + +- 0 — The table is not automatically updated in the background, when schema changes are detected. +- 1 — The table is automatically updated in the background, when schema changes are detected. + +Default value: `0`. + +5. materialized_postgresql_max_block_size {#materialized-postgresql-max-block-size} + +Sets the number of rows collected in memory before flushing data into PostgreSQL database table. + +Possible values: + +- Positive integer. + +Default value: `65536`. + +6. materialized_postgresql_replication_slot {#materialized-postgresql-replication-slot} + +A user-created replication slot. Must be used together with `materialized_postgresql_snapshot`. + +7. materialized_postgresql_snapshot {#materialized-postgresql-snapshot} + +A text string identifying a snapshot, from which [initial dump of PostgreSQL tables](../../engines/database-engines/materialized-postgresql.md) will be performed. Must be used together with `materialized_postgresql_replication_slot`. ``` sql -CREATE DATABASE postgresql_db -ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password'); +CREATE DATABASE database1 +ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password') +SETTINGS materialized_postgresql_tables_list = 'table1,table2,table3'; -SELECT * FROM postgresql_db.postgres_table; +SELECT * FROM database1.table1; +``` + +The settings can be changed, if necessary, using a DDL query. But it is impossible to change the setting `materialized_postgresql_tables_list`. To update the list of tables in this setting use the `ATTACH TABLE` query. + +``` sql +ALTER DATABASE postgres_database MODIFY SETTING materialized_postgresql_max_block_size = ; ``` ## Notes {#notes} @@ -165,11 +205,11 @@ SELECT * FROM postgresql_db.postgres_table; Logical Replication Slots which exist on the primary are not available on standby replicas. So if there is a failover, new primary (the old physical standby) won’t be aware of any slots which were existing with old primary. This will lead to a broken replication from PostgreSQL. -A solution to this is to manage replication slots yourself and define a permanent replication slot (some information can be found [here](https://patroni.readthedocs.io/en/latest/SETTINGS.html)). You'll need to pass slot name via [materialized_postgresql_replication_slot](../../operations/settings/settings.md#materialized-postgresql-replication-slot) setting, and it has to be exported with `EXPORT SNAPSHOT` option. The snapshot identifier needs to be passed via [materialized_postgresql_snapshot](../../operations/settings/settings.md#materialized-postgresql-snapshot) setting. +A solution to this is to manage replication slots yourself and define a permanent replication slot (some information can be found [here](https://patroni.readthedocs.io/en/latest/SETTINGS.html)). 
You'll need to pass slot name via `materialized_postgresql_replication_slot` setting, and it has to be exported with `EXPORT SNAPSHOT` option. The snapshot identifier needs to be passed via `materialized_postgresql_snapshot` setting. Please note that this should be used only if it is actually needed. If there is no real need for that or full understanding why, then it is better to allow the table engine to create and manage its own replication slot. -**Example (from [@bchrobot](https://github.com/bchrobot))** +**Example (from [@bchrobot](https://github.com/bchrobot))** 1. Configure replication slot in PostgreSQL. @@ -214,3 +254,23 @@ SETTINGS ```bash kubectl exec acid-demo-cluster-0 -c postgres -- su postgres -c 'patronictl failover --candidate acid-demo-cluster-1 --force' ``` + +### Required permissions + +1. [CREATE PUBLICATION](https://postgrespro.ru/docs/postgresql/14/sql-createpublication) -- create query privilege. + +2. [CREATE_REPLICATION_SLOT](https://postgrespro.ru/docs/postgrespro/10/protocol-replication#PROTOCOL-REPLICATION-CREATE-SLOT) -- replication privelege. + +3. [pg_drop_replication_slot](https://postgrespro.ru/docs/postgrespro/9.5/functions-admin#functions-replication) -- replication privilege or superuser. + +4. [DROP PUBLICATION](https://postgrespro.ru/docs/postgresql/10/sql-droppublication) -- owner of publication (`username` in MaterializedPostgreSQL engine itself). + +It is possible to avoid executing `2` and `3` commands and having those permissions. Use settings `materialized_postgresql_replication_slot` and `materialized_postgresql_snapshot`. But with much care. + +Access to tables: + +1. pg_publication + +2. pg_replication_slots + +3. pg_publication_tables diff --git a/docs/en/engines/table-engines/integrations/hdfs.md b/docs/en/engines/table-engines/integrations/hdfs.md index 82227215da2..0d6d90f9d31 100644 --- a/docs/en/engines/table-engines/integrations/hdfs.md +++ b/docs/en/engines/table-engines/integrations/hdfs.md @@ -189,7 +189,7 @@ Similar to GraphiteMergeTree, the HDFS engine supports extended configuration us |libhdfs3\_conf | "" | ### Limitations {#limitations} - * `hadoop_security_kerberos_ticket_cache_path` and `libhdfs3_conf` can be global only, not user specific +* `hadoop_security_kerberos_ticket_cache_path` and `libhdfs3_conf` can be global only, not user specific ## Kerberos support {#kerberos-support} diff --git a/docs/en/engines/table-engines/integrations/materialized-postgresql.md b/docs/en/engines/table-engines/integrations/materialized-postgresql.md index d02a11257c2..fa349e49af5 100644 --- a/docs/en/engines/table-engines/integrations/materialized-postgresql.md +++ b/docs/en/engines/table-engines/integrations/materialized-postgresql.md @@ -7,7 +7,7 @@ toc_title: MaterializedPostgreSQL Creates ClickHouse table with an initial data dump of PostgreSQL table and starts replication process, i.e. executes background job to apply new changes as they happen on PostgreSQL table in the remote PostgreSQL database. -If more than one table is required, it is highly recommended to use the [MaterializedPostgreSQL](../../../engines/database-engines/materialized-postgresql.md) database engine instead of the table engine and use the [materialized_postgresql_tables_list](../../../operations/settings/settings.md#materialized-postgresql-tables-list) setting, which specifies the tables to be replicated. It will be much better in terms of CPU, fewer connections and fewer replication slots inside the remote PostgreSQL database. 
+If more than one table is required, it is highly recommended to use the [MaterializedPostgreSQL](../../../engines/database-engines/materialized-postgresql.md) database engine instead of the table engine and use the `materialized_postgresql_tables_list` setting, which specifies the tables to be replicated (will also be possible to add database `schema`). It will be much better in terms of CPU, fewer connections and fewer replication slots inside the remote PostgreSQL database. ## Creating a Table {#creating-a-table} @@ -38,7 +38,7 @@ PRIMARY KEY key; - `_version` — Transaction counter. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). - `_sign` — Deletion mark. Type: [Int8](../../../sql-reference/data-types/int-uint.md). Possible values: - - `1` — Row is not deleted, + - `1` — Row is not deleted, - `-1` — Row is deleted. These columns do not need to be added when a table is created. They are always accessible in `SELECT` query. diff --git a/docs/en/engines/table-engines/integrations/postgresql.md b/docs/en/engines/table-engines/integrations/postgresql.md index 0bdb54e0c16..789759ec521 100644 --- a/docs/en/engines/table-engines/integrations/postgresql.md +++ b/docs/en/engines/table-engines/integrations/postgresql.md @@ -36,6 +36,31 @@ The table structure can differ from the original PostgreSQL table structure: - `schema` — Non-default table schema. Optional. - `on conflict ...` — example: `ON CONFLICT DO NOTHING`. Optional. Note: adding this option will make insertion less efficient. +or via config (since version 21.11): + +``` + + + + + + +
+
+ + + + + + +
+``` + +Some parameters can be overriden by key value arguments: +``` sql +SELECT * FROM postgresql(postgres1, schema='schema1', table='table1'); +``` + ## Implementation Details {#implementation-details} `SELECT` queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query. diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 4b7473f76ad..6769f48a466 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -339,7 +339,7 @@ SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234 For `Map` data type client can specify if index should be created for keys or values using [mapKeys](../../../sql-reference/functions/tuple-map-functions.md#mapkeys) or [mapValues](../../../sql-reference/functions/tuple-map-functions.md#mapvalues) function. - The following functions can use the filter: [equals](../../../sql-reference/functions/comparison-functions.md), [notEquals](../../../sql-reference/functions/comparison-functions.md), [in](../../../sql-reference/functions/in-functions.md), [notIn](../../../sql-reference/functions/in-functions.md), [has](../../../sql-reference/functions/array-functions.md#hasarr-elem). + The following functions can use the filter: [equals](../../../sql-reference/functions/comparison-functions.md), [notEquals](../../../sql-reference/functions/comparison-functions.md), [in](../../../sql-reference/functions/in-functions.md), [notIn](../../../sql-reference/functions/in-functions.md), [has](../../../sql-reference/functions/array-functions.md#hasarr-elem), [hasAny](../../../sql-reference/functions/array-functions.md#hasany), [hasAll](../../../sql-reference/functions/array-functions.md#hasall). Example of index creation for `Map` data type diff --git a/docs/en/faq/general/how-do-i-contribute-code-to-clickhouse.md b/docs/en/faq/general/how-do-i-contribute-code-to-clickhouse.md new file mode 100644 index 00000000000..731dc9dface --- /dev/null +++ b/docs/en/faq/general/how-do-i-contribute-code-to-clickhouse.md @@ -0,0 +1,15 @@ +--- +title: How do I contribute code to ClickHouse? +toc_hidden: true +toc_priority: 120 +--- + +# How do I contribute code to ClickHouse? {#how-do-i-contribute-code-to-clickhouse} + +ClickHouse is an open-source project [developed on GitHub](https://github.com/ClickHouse/ClickHouse). + +As customary, contribution instructions are published in [CONTRIBUTING.md](https://github.com/ClickHouse/ClickHouse/blob/master/CONTRIBUTING.md) file in the root of the source code repository. + +If you want to suggest a substantial change to ClickHouse, consider [opening a GitHub issue](https://github.com/ClickHouse/ClickHouse/issues/new/choose) explaining what you want to do, to discuss it with maintainers and community first. [Examples of such RFC issues](https://github.com/ClickHouse/ClickHouse/issues?q=is%3Aissue+is%3Aopen+rfc). + +If your contributions are security related, please check out [our security policy](https://github.com/ClickHouse/ClickHouse/security/policy/) too. 
diff --git a/docs/en/faq/general/index.md b/docs/en/faq/general/index.md index cd2368be1cf..51fff9a53ae 100644 --- a/docs/en/faq/general/index.md +++ b/docs/en/faq/general/index.md @@ -17,6 +17,7 @@ Questions: - [What is OLAP?](../../faq/general/olap.md) - [What is a columnar database?](../../faq/general/columnar-database.md) - [Why not use something like MapReduce?](../../faq/general/mapreduce.md) +- [How do I contribute code to ClickHouse?](../../faq/general/how-do-i-contribute-code-to-clickhouse.md) !!! info "Don’t see what you were looking for?" Check out [other F.A.Q. categories](../../faq/index.md) or browse around main documentation articles found in the left sidebar. diff --git a/docs/en/faq/operations/index.md b/docs/en/faq/operations/index.md index c0a6d85b66d..81aec18b9cf 100644 --- a/docs/en/faq/operations/index.md +++ b/docs/en/faq/operations/index.md @@ -11,6 +11,7 @@ Questions: - [Which ClickHouse version to use in production?](../../faq/operations/production.md) - [Is it possible to delete old records from a ClickHouse table?](../../faq/operations/delete-old-data.md) +- [Does ClickHouse support multi-region replication?](../../faq/operations/multi-region-replication.md) !!! info "Don’t see what you were looking for?" Check out [other F.A.Q. categories](../../faq/index.md) or browse around main documentation articles found in the left sidebar. diff --git a/docs/en/faq/operations/multi-region-replication.md b/docs/en/faq/operations/multi-region-replication.md new file mode 100644 index 00000000000..7d78737544a --- /dev/null +++ b/docs/en/faq/operations/multi-region-replication.md @@ -0,0 +1,13 @@ +--- +title: Does ClickHouse support multi-region replication? +toc_hidden: true +toc_priority: 30 +--- + +# Does ClickHouse support multi-region replication? {#does-clickhouse-support-multi-region-replication} + +The short answer is "yes". However, we recommend keeping latency between all regions/datacenters in two-digit range, otherwise write performance will suffer as it goes through distributed consensus protocol. For example, replication between US coasts will likely work fine, but between the US and Europe won't. + +Configuration-wise there's no difference compared to single-region replication, simply use hosts that are located in different locations for replicas. + +For more information, see [full article on data replication](../../engines/table-engines/mergetree-family/replication.md). diff --git a/docs/en/getting-started/install.md b/docs/en/getting-started/install.md index 4a97ab6589d..70a1b8349ff 100644 --- a/docs/en/getting-started/install.md +++ b/docs/en/getting-started/install.md @@ -142,6 +142,12 @@ On Gentoo, you can just use `emerge clickhouse` to install ClickHouse from sourc To start the server as a daemon, run: +``` bash +$ sudo clickhouse start +``` + +There are also another ways to run ClickHouse: + ``` bash $ sudo service clickhouse-server start ``` @@ -152,6 +158,12 @@ If you do not have `service` command, run as $ sudo /etc/init.d/clickhouse-server start ``` +If you have `systemctl` command, run as + +``` bash +$ sudo systemctl start clickhouse-server.service +``` + See the logs in the `/var/log/clickhouse-server/` directory. If the server does not start, check the configurations in the file `/etc/clickhouse-server/config.xml`. 
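(Editor's illustration, not part of the patch above.) After starting the server by any of the methods shown in the install.md hunk, a quick sanity check can confirm it came up; this sketch assumes systemd is in use and the client connects on the default port:

``` bash
# Check the service status (systemd-based installs)
$ sudo systemctl status clickhouse-server.service

# Confirm the server answers queries
$ clickhouse-client --query "SELECT version()"
```

If the second command fails, the logs in `/var/log/clickhouse-server/` are the first place to look, as the doc notes.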
diff --git a/docs/en/interfaces/http.md b/docs/en/interfaces/http.md index 313c6508b55..f8f6f26d208 100644 --- a/docs/en/interfaces/http.md +++ b/docs/en/interfaces/http.md @@ -9,6 +9,8 @@ The HTTP interface lets you use ClickHouse on any platform from any programming By default, `clickhouse-server` listens for HTTP on port 8123 (this can be changed in the config). +Sometimes, `curl` command is not available on user operating systems. On Ubuntu or Debian, run `sudo apt install curl`. Please refer this [documentation](https://curl.se/download.html) to install it before running the examples. + If you make a `GET /` request without parameters, it returns 200 response code and the string which defined in [http_server_default_response](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-http_server_default_response) default value “Ok.” (with a line feed at the end) ``` bash @@ -186,7 +188,7 @@ $ echo "SELECT 1" | gzip -c | \ ``` ``` bash -# Receiving compressed data from the server +# Receiving compressed data archive from the server $ curl -vsS "http://localhost:8123/?enable_http_compression=1" \ -H 'Accept-Encoding: gzip' --output result.gz -d 'SELECT number FROM system.numbers LIMIT 3' $ zcat result.gz @@ -195,6 +197,15 @@ $ zcat result.gz 2 ``` +```bash +# Receiving compressed data from the server and using the gunzip to receive decompressed data +$ curl -sS "http://localhost:8123/?enable_http_compression=1" \ + -H 'Accept-Encoding: gzip' -d 'SELECT number FROM system.numbers LIMIT 3' | gunzip - +0 +1 +2 +``` + ## Default Database {#default-database} You can use the ‘database’ URL parameter or the ‘X-ClickHouse-Database’ header to specify the default database. @@ -424,10 +435,10 @@ Next are the configuration methods for different `type`. `query` value is a predefined query of `predefined_query_handler`, which is executed by ClickHouse when an HTTP request is matched and the result of the query is returned. It is a must configuration. -The following example defines the values of [max_threads](../operations/settings/settings.md#settings-max_threads) and `max_alter_threads` settings, then queries the system table to check whether these settings were set successfully. +The following example defines the values of [max_threads](../operations/settings/settings.md#settings-max_threads) and `max_final_threads` settings, then queries the system table to check whether these settings were set successfully. !!! note "Warning" - To keep the default `handlers` such as` query`, `play`,` ping`, use the `` rule. + To keep the default `handlers` such as` query`, `play`,` ping`, use the `` rule. Example: @@ -451,9 +462,9 @@ Example: ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_alter_threads?max_threads=1&max_alter_threads=2' +$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_final_threads?max_threads=1&max_final_threads=2' 1 -max_alter_threads 2 +max_final_threads 2 ``` !!! note "caution" @@ -465,7 +476,7 @@ In `dynamic_query_handler`, the query is written in the form of param of the HTT ClickHouse extracts and executes the value corresponding to the `query_param_name` value in the URL of the HTTP request. The default value of `query_param_name` is `/query` . It is an optional configuration. If there is no definition in the configuration file, the param is not passed in. 
-To experiment with this functionality, the example defines the values of [max_threads](../operations/settings/settings.md#settings-max_threads) and `max_alter_threads` and `queries` whether the settings were set successfully. +To experiment with this functionality, the example defines the values of [max_threads](../operations/settings/settings.md#settings-max_threads) and `max_final_threads` and `queries` whether the settings were set successfully. Example: @@ -484,9 +495,9 @@ Example: ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_alter_threads=2¶m_name_1=max_threads¶m_name_2=max_alter_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' +$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_final_threads=2¶m_name_1=max_threads¶m_name_2=max_final_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' max_threads 1 -max_alter_threads 2 +max_final_threads 2 ``` ### static {#static} diff --git a/docs/en/interfaces/mysql.md b/docs/en/interfaces/mysql.md index 38bcc2b68f8..9932e6b6cb3 100644 --- a/docs/en/interfaces/mysql.md +++ b/docs/en/interfaces/mysql.md @@ -36,7 +36,7 @@ mysql> ``` For compatibility with all MySQL clients, it is recommended to specify user password with [double SHA1](../operations/settings/settings-users.md#password_double_sha1_hex) in configuration file. -If user password is specified using [SHA256](../operations/settings/settings-users.md#password_sha256_hex), some clients won’t be able to authenticate (mysqljs and old versions of command-line tool mysql). +If user password is specified using [SHA256](../operations/settings/settings-users.md#password_sha256_hex), some clients won’t be able to authenticate (mysqljs and old versions of command-line tool MySQL and MariaDB). Restrictions: diff --git a/docs/en/interfaces/third-party/client-libraries.md b/docs/en/interfaces/third-party/client-libraries.md index 342b1c9a496..a116c8e2222 100644 --- a/docs/en/interfaces/third-party/client-libraries.md +++ b/docs/en/interfaces/third-party/client-libraries.md @@ -6,7 +6,7 @@ toc_title: Client Libraries # Client Libraries from Third-party Developers {#client-libraries-from-third-party-developers} !!! warning "Disclaimer" - Yandex does **not** maintain the libraries listed below and hasn’t done any extensive testing to ensure their quality. + ClickHouse Inc does **not** maintain the libraries listed below and hasn’t done any extensive testing to ensure their quality. 
- Python - [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm) diff --git a/docs/en/introduction/adopters.md b/docs/en/introduction/adopters.md index 87c5a6f7aec..c2660653907 100644 --- a/docs/en/introduction/adopters.md +++ b/docs/en/introduction/adopters.md @@ -60,8 +60,10 @@ toc_title: Adopters | Exness | Trading | Metrics, Logging | — | — | [Talk in Russian, May 2019](https://youtu.be/_rpU-TvSfZ8?t=3215) | | EventBunker.io | Serverless Data Processing | — | — | — | [Tweet, April 2021](https://twitter.com/Halil_D_/status/1379839133472985091) | | FastNetMon | DDoS Protection | Main Product | | — | [Official website](https://fastnetmon.com/docs-fnm-advanced/fastnetmon-advanced-traffic-persistency/) | +| Firebolt | Analytics | Main product | - | - | [YouTube Tech Talk](https://www.youtube.com/watch?v=9rW9uEJ15tU) | | Flipkart | e-Commerce | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=239) | | FunCorp | Games | | — | 14 bn records/day as of Jan 2021 | [Article](https://www.altinity.com/blog/migrating-from-redshift-to-clickhouse) | +| Futurra Group | Analytics | — | — | — | [Article in Russian, December 2021](https://dou.ua/forums/topic/35587/) | | Geniee | Ad network | Main product | — | — | [Blog post in Japanese, July 2017](https://tech.geniee.co.jp/entry/2017/07/20/160100) | | Genotek | Bioinformatics | Main product | — | — | [Video, August 2020](https://youtu.be/v3KyZbz9lEE) | | Gigapipe | Managed ClickHouse | Main product | — | — | [Official website](https://gigapipe.com/) | @@ -70,6 +72,7 @@ toc_title: Adopters | Grouparoo | Data Warehouse Integrations | Main product | — | — | [Official Website, November 2021](https://www.grouparoo.com/integrations) | | HUYA | Video Streaming | Analytics | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/7.%20ClickHouse万亿数据分析实践%20李本旺(sundy-li)%20虎牙.pdf) | | Hydrolix | Cloud data platform | Main product | — | — | [Documentation](https://docs.hydrolix.io/guide/query) | +| Hystax | Cloud Operations | Observability Analytics | - | - | [Blog](https://hystax.com/clickhouse-for-real-time-cost-saving-analytics-how-to-stop-hammering-screws-and-use-an-electric-screwdriver/) | | ICA | FinTech | Risk Management | — | — | [Blog Post in English, Sep 2020](https://altinity.com/blog/clickhouse-vs-redshift-performance-for-fintech-risk-management?utm_campaign=ClickHouse%20vs%20RedShift&utm_content=143520807&utm_medium=social&utm_source=twitter&hss_channel=tw-3894792263) | | Idealista | Real Estate | Analytics | — | — | [Blog Post in English, April 2019](https://clickhouse.com/blog/en/clickhouse-meetup-in-madrid-on-april-2-2019) | | Infobaleen | AI markting tool | Analytics | — | — | [Official site](https://infobaleen.com) | @@ -81,14 +84,18 @@ toc_title: Adopters | Ippon Technologies | Technology Consulting | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=205) | | Ivi | Online Cinema | Analytics, Monitoring | — | — | [Article in Russian, Jan 2018](https://habr.com/en/company/ivi/blog/347408/) | | Jinshuju 金数据 | BI Analytics | Main product | — | — | [Slides in Chinese, October 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup24/3.%20金数据数据架构调整方案Public.pdf) | -| Jitsu | Cloud Software | Data Pipeline | — | — | [Documentation](https://jitsu.com/docs/destinations-configuration/clickhouse-destination), [Hacker News](https://news.ycombinator.com/item?id=29106082) | +| Jitsu | Cloud Software | Data Pipeline 
| — | — | [Documentation](https://jitsu.com/docs/destinations-configuration/clickhouse-destination), [Hacker News post](https://news.ycombinator.com/item?id=29106082) | +| JuiceFS | Storage | Shopping Cart | - | - | [Blog](https://juicefs.com/blog/en/posts/shopee-clickhouse-with-juicefs/) | | kakaocorp | Internet company | — | — | — | [if(kakao)2020](https://tv.kakao.com/channel/3693125/cliplink/414129353), [if(kakao)2021](https://if.kakao.com/session/24) | | Kodiak Data | Clouds | Main product | — | — | [Slides in Engish, April 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup13/kodiak_data.pdf) | | Kontur | Software Development | Metrics | — | — | [Talk in Russian, November 2018](https://www.youtube.com/watch?v=U4u4Bd0FtrY) | | Kuaishou | Video | — | — | — | [ClickHouse Meetup, October 2018](https://clickhouse.com/blog/en/2018/clickhouse-community-meetup-in-beijing-on-october-28-2018/) | | KGK Global | Vehicle monitoring | — | — | — | [Press release, June 2021](https://zoom.cnews.ru/news/item/530921) | +| LANCOM Systems | Network Solutions | Traffic analysis | - | - | [ClickHouse Operator for Kubernetes](https://www.lancom-systems.com/), [Hacker News post] (https://news.ycombinator.com/item?id=29413660) | | Lawrence Berkeley National Laboratory | Research | Traffic analysis | 5 servers | 55 TiB | [Slides in English, April 2019](https://www.smitasin.com/presentations/2019-04-17_DOE-NSM.pdf) | +| Lever | Talent Management | Recruiting | - | - | [Hacker News post](https://news.ycombinator.com/item?id=29558544) | | LifeStreet | Ad network | Main product | 75 servers (3 replicas) | 5.27 PiB | [Blog post in Russian, February 2017](https://habr.com/en/post/322620/) | +| Lookforsale | E-Commerce | — | — | — | [Job Posting, December 2021](https://telegram.me/javascript_jobs/587318) | | Mail.ru Cloud Solutions | Cloud services | Main product | — | — | [Article in Russian](https://mcs.mail.ru/help/db-create/clickhouse#) | | MAXILECT | Ad Tech, Blockchain, ML, AI | — | — | — | [Job advertisement, 2021](https://www.linkedin.com/feed/update/urn:li:activity:6780842017229430784/) | | Marilyn | Advertising | Statistics | — | — | [Talk in Russian, June 2017](https://www.youtube.com/watch?v=iXlIgx2khwc) | @@ -106,6 +113,7 @@ toc_title: Adopters | Ok.ru | Social Network | — | 72 servers | 810 TB compressed, 50bn rows/day, 1.5 TB/day | [SmartData conference, October 2021](https://assets.ctfassets.net/oxjq45e8ilak/4JPHkbJenLgZhBGGyyonFP/57472ec6987003ec4078d0941740703b/____________________ClickHouse_______________________.pdf) | | Omnicomm | Transportation Monitoring | — | — | — | [Facebook post, October 2021](https://www.facebook.com/OmnicommTeam/posts/2824479777774500) | | OneAPM | Monitoring and Data Analysis | Main product | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/8.%20clickhouse在OneAPM的应用%20杜龙.pdf) | +| Opensee | Financial Analytics | Main product | - | - | [Blog](https://opensee.io/news/from-moscow-to-wall-street-the-remarkable-journey-of-clickhouse/) | | Open Targets | Genome Research | Genome Search | — | — | [Tweet, October 2021](https://twitter.com/OpenTargets/status/1452570865342758913?s=20), [Blog](https://blog.opentargets.org/graphql/) | | OZON | E-commerce | — | — | — | [Official website](https://job.ozon.ru/vacancy/razrabotchik-clickhouse-ekspluatatsiya-40991870/) | | Panelbear | Analytics | Monitoring and Analytics | — | — | [Tech Stack, November 
2020](https://panelbear.com/blog/tech-stack/) | @@ -118,6 +126,7 @@ toc_title: Adopters | PRANA | Industrial predictive analytics | Main product | — | — | [News (russian), Feb 2021](https://habr.com/en/news/t/541392/) | | QINGCLOUD | Cloud services | Main product | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/4.%20Cloud%20%2B%20TSDB%20for%20ClickHouse%20张健%20QingCloud.pdf) | | Qrator | DDoS protection | Main product | — | — | [Blog Post, March 2019](https://blog.qrator.net/en/clickhouse-ddos-mitigation_37/) | +| R-Vision | Information Security | — | — | — | [Article in Russian, December 2021](https://www.anti-malware.ru/reviews/R-Vision-SENSE-15) | | Raiffeisenbank | Banking | Analytics | — | — | [Lecture in Russian, December 2020](https://cs.hse.ru/announcements/421965599.html) | | Rambler | Internet services | Analytics | — | — | [Talk in Russian, April 2018](https://medium.com/@ramblertop/разработка-api-clickhouse-для-рамблер-топ-100-f4c7e56f3141) | | Replica | Urban Planning | Analytics | — | — | [Job advertisement](https://boards.greenhouse.io/replica/jobs/5547732002?gh_jid=5547732002) | @@ -153,6 +162,7 @@ toc_title: Adopters | Tinybird | Real-time Data Products | Data processing | — | — | [Official website](https://www.tinybird.co/) | | Traffic Stars | AD network | — | 300 servers in Europe/US | 1.8 PiB, 700 000 insert rps (as of 2021) | [Slides in Russian, May 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup15/lightning/ninja.pdf) | | Uber | Taxi | Logging | — | — | [Slides, February 2020](https://presentations.clickhouse.com/meetup40/uber.pdf) | +| UseTech | Software Development | — | — | — | [Job Posting, December 2021](https://vk.com/wall136266658_2418) | | UTMSTAT | Analytics | Main product | — | — | [Blog post, June 2020](https://vc.ru/tribuna/133956-striming-dannyh-iz-servisa-skvoznoy-analitiki-v-clickhouse) | | Vercel | Traffic and Performance Analytics | — | — | — | Direct reference, October 2021 | | VKontakte | Social Network | Statistics, Logging | — | — | [Slides in Russian, August 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/3_vk.pdf) | @@ -168,7 +178,8 @@ toc_title: Adopters | Yandex Cloud | Public Cloud | Main product | — | — | [Talk in Russian, December 2019](https://www.youtube.com/watch?v=pgnak9e_E0o) | | Yandex DataLens | Business Intelligence | Main product | — | — | [Slides in Russian, December 2019](https://presentations.clickhouse.com/meetup38/datalens.pdf) | | Yandex Market | e-Commerce | Metrics, Logging | — | — | [Talk in Russian, January 2019](https://youtu.be/_l1qP0DyBcA?t=478) | -| Yandex Metrica | Web analytics | Main product | 630 servers in one cluster, 360 servers in another cluster, 1862 servers in one department | 133 PiB / 8.31 PiB / 120 trillion records | [Slides, February 2020](https://presentations.clickhouse.com/meetup40/introduction/#13) | +| Yandex Metrica | Web analytics | Macin product | 630 servers in one cluster, 360 servers in another cluster, 1862 servers in one department | 133 PiB / 8.31 PiB / 120 trillion records | [Slides, February 2020](https://presentations.clickhouse.com/meetup40/introduction/#13) | +| | Analytics | Main product | - | - | [Integration](https://www.yellowfinbi.com/campaign/yellowfin-9-whats-new#el-30219e0e) | | Yotascale | Cloud | Data pipeline | — | 2 bn records/day | [LinkedIn (Accomplishments)](https://www.linkedin.com/in/adilsaleem/) | | Your Analytics | Product 
Analytics | Main Product | — | - | [Tweet, November 2021](https://twitter.com/mikenikles/status/1459737241165565953) | | Zagrava Trading | — | — | — | — | [Job offer, May 2021](https://twitter.com/datastackjobs/status/1394707267082063874) | @@ -178,9 +189,5 @@ toc_title: Adopters | Цифровой Рабочий | Industrial IoT, Analytics | — | — | — | [Blog post in Russian, March 2021](https://habr.com/en/company/croc/blog/548018/) | | ООО «МПЗ Богородский» | Agriculture | — | — | — | [Article in Russian, November 2020](https://cloud.yandex.ru/cases/okraina) | | ДомКлик | Real Estate | — | — | — | [Article in Russian, October 2021](https://habr.com/ru/company/domclick/blog/585936/) | -| Futurra Group | Analytics | — | — | — | [Article in Russian, December 2021](https://dou.ua/forums/topic/35587/) | -| UseTech | Software Development | — | — | — | [Job Posting, December 2021](https://vk.com/wall136266658_2418) | -| Lookforsale | E-Commerce | — | — | — | [Job Posting, December 2021](https://telegram.me/javascript_jobs/587318) | -| R-Vision | Information Security | — | — | — | [Article in Russian, December 2021](https://www.anti-malware.ru/reviews/R-Vision-SENSE-15) | [Original article](https://clickhouse.com/docs/en/introduction/adopters/) diff --git a/docs/en/operations/clickhouse-keeper.md b/docs/en/operations/clickhouse-keeper.md index 6738f77cff9..350ca835187 100644 --- a/docs/en/operations/clickhouse-keeper.md +++ b/docs/en/operations/clickhouse-keeper.md @@ -3,14 +3,14 @@ toc_priority: 66 toc_title: ClickHouse Keeper --- -# [pre-production] ClickHouse Keeper +# [pre-production] ClickHouse Keeper {#clickHouse-keeper} ClickHouse server uses [ZooKeeper](https://zookeeper.apache.org/) coordination system for data [replication](../engines/table-engines/mergetree-family/replication.md) and [distributed DDL](../sql-reference/distributed-ddl.md) queries execution. ClickHouse Keeper is an alternative coordination system compatible with ZooKeeper. !!! warning "Warning" This feature is currently in the pre-production stage. We test it in our CI and on small internal installations. -## Implementation details +## Implementation details {#implementation-details} ZooKeeper is one of the first well-known open-source coordination systems. It's implemented in Java, has quite a simple and powerful data model. ZooKeeper's coordination algorithm called ZAB (ZooKeeper Atomic Broadcast) doesn't provide linearizability guarantees for reads, because each ZooKeeper node serves reads locally. Unlike ZooKeeper ClickHouse Keeper is written in C++ and uses [RAFT algorithm](https://raft.github.io/) [implementation](https://github.com/eBay/NuRaft). This algorithm allows to have linearizability for reads and writes, has several open-source implementations in different languages. @@ -21,7 +21,7 @@ ClickHouse Keeper supports Access Control List (ACL) the same way as [ZooKeeper] !!! info "Note" External integrations are not supported. -## Configuration +## Configuration {#configuration} ClickHouse Keeper can be used as a standalone replacement for ZooKeeper or as an internal part of the ClickHouse server, but in both cases configuration is almost the same `.xml` file. The main ClickHouse Keeper configuration tag is ``. 
Keeper configuration has the following parameters: @@ -102,7 +102,7 @@ Examples of configuration for quorum with three nodes can be found in [integrati ``` -## How to run +## How to run {#how-to-run} ClickHouse Keeper is bundled into the ClickHouse server package, just add configuration of `` and start ClickHouse server as always. If you want to run standalone ClickHouse Keeper you can start it in a similar way with: @@ -110,13 +110,14 @@ ClickHouse Keeper is bundled into the ClickHouse server package, just add config clickhouse-keeper --config /etc/your_path_to_config/config.xml --daemon ``` -## Four Letter Word Commands +## Four Letter Word Commands {#four-letter-word-commands} ClickHouse Keeper also provides 4lw commands which are almost the same with Zookeeper. Each command is composed of four letters such as `mntr`, `stat` etc. There are some more interesting commands: `stat` gives some general information about the server and connected clients, while `srvr` and `cons` give extended details on server and connections respectively. The 4lw commands has a white list configuration `four_letter_word_white_list` which has default value "conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro". You can issue the commands to ClickHouse Keeper via telnet or nc, at the client port. + ``` echo mntr | nc localhost 9181 ``` @@ -296,7 +297,7 @@ Sessions with Ephemerals (1): /clickhouse/task_queue/ddl ``` -## [experimental] Migration from ZooKeeper +## [experimental] Migration from ZooKeeper {#migration-from-zookeeper} Seamlessly migration from ZooKeeper to ClickHouse Keeper is impossible you have to stop your ZooKeeper cluster, convert data and start ClickHouse Keeper. `clickhouse-keeper-converter` tool allows converting ZooKeeper logs and snapshots to ClickHouse Keeper snapshot. It works only with ZooKeeper > 3.4. Steps for migration: diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index e8099ef0ac6..78f6c71c65f 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -672,7 +672,8 @@ On hosts with low RAM and swap, you possibly need setting `max_server_memory_usa ## max_concurrent_queries {#max-concurrent-queries} -The maximum number of simultaneously processed queries related to MergeTree table. Queries may be limited by other settings: [max_concurrent_queries_for_user](#max-concurrent-queries-for-user), [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries). +The maximum number of simultaneously processed queries related to MergeTree table. +Queries may be limited by other settings: [max_concurrent_insert_queries](#max-concurrent-insert-queries), [max_concurrent_select_queries](#max-concurrent-select-queries), [max_concurrent_queries_for_user](#max-concurrent-queries-for-user), [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries). !!! info "Note" These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. @@ -680,7 +681,9 @@ The maximum number of simultaneously processed queries related to MergeTree tabl Possible values: - Positive integer. -- 0 — Disabled. +- 0 — No limit. + +Default value: `100`. 
**Example** @@ -688,6 +691,46 @@ Possible values: 100 ``` +## max_concurrent_insert_queries {#max-concurrent-insert-queries} + +The maximum number of simultaneously processed `INSERT` queries. + +!!! info "Note" + These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. + +Possible values: + +- Positive integer. +- 0 — No limit. + +Default value: `0`. + +**Example** + +``` xml +100 +``` + +## max_concurrent_select_queries {#max-concurrent-select-queries} + +The maximum number of simultaneously processed `SELECT` queries. + +!!! info "Note" + These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. + +Possible values: + +- Positive integer. +- 0 — No limit. + +Default value: `0`. + +**Example** + +``` xml +100 +``` + ## max_concurrent_queries_for_user {#max-concurrent-queries-for-user} The maximum number of simultaneously processed queries related to MergeTree table per user. @@ -695,7 +738,9 @@ The maximum number of simultaneously processed queries related to MergeTree tabl Possible values: - Positive integer. -- 0 — Disabled. +- 0 — No limit. + +Default value: `0`. **Example** @@ -711,7 +756,12 @@ Example: `max_concurrent_queries_for_all_users` can be set to 99 for all users a Modifying the setting for one query or user does not affect other queries. -Default value: `0` that means no limit. +Possible values: + +- Positive integer. +- 0 — No limit. + +Default value: `0`. **Example** @@ -1238,6 +1288,20 @@ Example 9004 ``` +## postgresql_port {#server_configuration_parameters-postgresql_port} + +Port for communicating with clients over PostgreSQL protocol. + +**Possible values** + +Positive integer. + +Example + +``` xml +9005 +``` + ## tmp_path {#tmp-path} Path to temporary data for processing large queries. diff --git a/docs/en/operations/settings/merge-tree-settings.md b/docs/en/operations/settings/merge-tree-settings.md index af75d130ed3..a7bba76a05a 100644 --- a/docs/en/operations/settings/merge-tree-settings.md +++ b/docs/en/operations/settings/merge-tree-settings.md @@ -27,6 +27,10 @@ An example of changing the settings for a specific table with the `ALTER TABLE . ``` sql ALTER TABLE foo MODIFY SETTING max_suspicious_broken_parts = 100; + +-- reset to default (use value from system.merge_tree_settings) +ALTER TABLE foo + RESET SETTING max_suspicious_broken_parts; ``` ## parts_to_throw_insert {#parts-to-throw-insert} diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index e56625fe948..8a0fd618d32 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -817,9 +817,19 @@ If the number of rows to be read from a file of a [MergeTree](../../engines/tabl Possible values: -- Any positive integer. +- Positive integer. -Default value: 163840. +Default value: `163840`. + +## merge_tree_min_rows_for_concurrent_read_for_remote_filesystem {#merge-tree-min-rows-for-concurrent-read-for-remote-filesystem} + +The minimum number of lines to read from one file before [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) engine can parallelize reading, when reading from remote filesystem. + +Possible values: + +- Positive integer. + +Default value: `163840`. 
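As a rough illustration of how this threshold can be tuned for a single query (the table name `remote_table` below is hypothetical and assumed to be stored on a remote filesystem such as S3):

``` sql
-- Illustrative sketch: lower the threshold so reads from remote storage
-- are parallelized sooner for this query only.
SELECT count()
FROM remote_table
SETTINGS merge_tree_min_rows_for_concurrent_read_for_remote_filesystem = 81920;
```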
## merge_tree_min_bytes_for_concurrent_read {#setting-merge-tree-min-bytes-for-concurrent-read} @@ -827,9 +837,19 @@ If the number of bytes to read from one file of a [MergeTree](../../engines/tabl Possible value: -- Any positive integer. +- Positive integer. -Default value: 251658240. +Default value: `251658240`. + +## merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem {#merge-tree-min-bytes-for-concurrent-read-for-remote-filesystem} + +The minimum number of bytes to read from one file before [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) engine can parallelize reading, when reading from remote filesystem. + +Possible values: + +- Positive integer. + +Default value: `251658240`. ## merge_tree_min_rows_for_seek {#setting-merge-tree-min-rows-for-seek} @@ -1469,7 +1489,7 @@ Possible values: Default value: `1`. -**See Also** +**See Also** - [min_count_to_compile_aggregate_expression](#min_count_to_compile_aggregate_expression) @@ -2095,7 +2115,7 @@ Possible values: - 0 — Optimization disabled. - 1 — Optimization enabled. - + Default value: `1`. See also: @@ -3134,6 +3154,12 @@ Possible values: Default value: `0`. +!!! warning "Warning" + Nullable primary key usually indicates bad design. It is forbidden in almost all main stream DBMS. The feature is mainly for [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md) and is not heavily tested. Use with care. + +!!! warning "Warning" + Do not enable this feature in version `<= 21.8`. It's not properly implemented and may lead to server crash. + ## aggregate_functions_null_for_empty {#aggregate_functions_null_for_empty} Enables or disables rewriting all aggregate functions in a query, adding [-OrNull](../../sql-reference/aggregate-functions/combinators.md#agg-functions-combinator-ornull) suffix to them. Enable it for SQL standard compatibility. @@ -3682,49 +3708,6 @@ Possible values: Default value: `0`. -## materialized_postgresql_max_block_size {#materialized-postgresql-max-block-size} - -Sets the number of rows collected in memory before flushing data into PostgreSQL database table. - -Possible values: - -- Positive integer. - -Default value: `65536`. - -## materialized_postgresql_tables_list {#materialized-postgresql-tables-list} - -Sets a comma-separated list of PostgreSQL database tables, which will be replicated via [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) database engine. - -Default value: empty list — means whole PostgreSQL database will be replicated. - -## materialized_postgresql_schema {#materialized-postgresql-schema} - -Default value: empty string. (Default schema is used) - -## materialized_postgresql_schema_list {#materialized-postgresql-schema-list} - -Default value: empty list. (Default schema is used) - -## materialized_postgresql_allow_automatic_update {#materialized-postgresql-allow-automatic-update} - -Allows reloading table in the background, when schema changes are detected. DDL queries on the PostgreSQL side are not replicated via ClickHouse [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) engine, because it is not allowed with PostgreSQL logical replication protocol, but the fact of DDL changes is detected transactionally. In this case, the default behaviour is to stop replicating those tables once DDL is detected. 
However, if this setting is enabled, then, instead of stopping the replication of those tables, they will be reloaded in the background via database snapshot without data losses and replication will continue for them.
-
-Possible values:
-
-- 0 — The table is not automatically updated in the background, when schema changes are detected.
-- 1 — The table is automatically updated in the background, when schema changes are detected.
-
-Default value: `0`.
-
-## materialized_postgresql_replication_slot {#materialized-postgresql-replication-slot}
-
-A user-created replication slot. Must be used together with [materialized_postgresql_snapshot](#materialized-postgresql-snapshot).
-
-## materialized_postgresql_snapshot {#materialized-postgresql-snapshot}
-
-A text string identifying a snapshot, from which [initial dump of PostgreSQL tables](../../engines/database-engines/materialized-postgresql.md) will be performed. Must be used together with [materialized_postgresql_replication_slot](#materialized-postgresql-replication-slot).
-
## allow_experimental_projection_optimization {#allow-experimental-projection-optimization}
Enables or disables [projection](../../engines/table-engines/mergetree-family/mergetree.md#projections) optimization when processing `SELECT` queries.
@@ -3993,8 +3976,8 @@ If [wait_for_async_insert](#wait-for-async-insert) is enabled, every client will
Possible values:
-- 0 — Insertions are made synchronously, one after another.
-- 1 — Multiple asynchronous insertions enabled.
+- 0 — Insertions are made synchronously, one after another.
+- 1 — Multiple asynchronous insertions enabled.
Default value: `0`.
@@ -4066,7 +4049,7 @@ Default value: `0`.
## alter_partition_verbose_result {#alter-partition-verbose-result}
-Enables or disables the display of information about the parts to which the manipulation operations with partitions and parts have been successfully applied.
+Enables or disables the display of information about the parts to which the manipulation operations with partitions and parts have been successfully applied. Applicable to [ATTACH PARTITION|PART](../../sql-reference/statements/alter/partition.md#alter_attach-partition) and to [FREEZE PARTITION](../../sql-reference/statements/alter/partition.md#alter_freeze-partition).
Possible values:
@@ -4172,3 +4155,20 @@ Default value: `''`.
Sets the character that is interpreted as a suffix after the result set for [CustomSeparated](../../interfaces/formats.md#format-customseparated) data format.
Default value: `''`.
+
+## shutdown_wait_unfinished_queries {#shutdown-wait-unfinished-queries}
+
+Enables or disables waiting for unfinished queries when the server shuts down.
+
+Possible values:
+
+- 0 — Disabled.
+- 1 — Enabled. The wait time equals the `shutdown_wait_unfinished` config value.
+
+Default value: 0.
+
+## shutdown_wait_unfinished {#shutdown-wait-unfinished}
+
+The waiting time in seconds for currently handled connections when the server shuts down.
+
+Default value: 5.
diff --git a/docs/en/operations/system-tables/columns.md b/docs/en/operations/system-tables/columns.md
index 5ba38ab3e67..55e4a8284a0 100644
--- a/docs/en/operations/system-tables/columns.md
+++ b/docs/en/operations/system-tables/columns.md
@@ -6,7 +6,7 @@ You can use this table to get information similar to the [DESCRIBE TABLE](../../
Columns from [temporary tables](../../sql-reference/statements/create/table.md#temporary-tables) are visible in the `system.columns` only in those session where they have been created. They are shown with the empty `database` field.
-Columns: +The `system.columns` table contains the following columns (the column type is shown in brackets): - `database` ([String](../../sql-reference/data-types/string.md)) — Database name. - `table` ([String](../../sql-reference/data-types/string.md)) — Table name. @@ -86,21 +86,4 @@ numeric_scale: ᴺᵁᴸᴸ datetime_precision: ᴺᵁᴸᴸ ``` -The `system.columns` table contains the following columns (the column type is shown in brackets): - -- `database` (String) — Database name. -- `table` (String) — Table name. -- `name` (String) — Column name. -- `type` (String) — Column type. -- `default_kind` (String) — Expression type (`DEFAULT`, `MATERIALIZED`, `ALIAS`) for the default value, or an empty string if it is not defined. -- `default_expression` (String) — Expression for the default value, or an empty string if it is not defined. -- `data_compressed_bytes` (UInt64) — The size of compressed data, in bytes. -- `data_uncompressed_bytes` (UInt64) — The size of decompressed data, in bytes. -- `marks_bytes` (UInt64) — The size of marks, in bytes. -- `comment` (String) — Comment on the column, or an empty string if it is not defined. -- `is_in_partition_key` (UInt8) — Flag that indicates whether the column is in the partition expression. -- `is_in_sorting_key` (UInt8) — Flag that indicates whether the column is in the sorting key expression. -- `is_in_primary_key` (UInt8) — Flag that indicates whether the column is in the primary key expression. -- `is_in_sampling_key` (UInt8) — Flag that indicates whether the column is in the sampling key expression. - [Original article](https://clickhouse.com/docs/en/operations/system-tables/columns) diff --git a/docs/en/operations/system-tables/metrics.md b/docs/en/operations/system-tables/metrics.md index 551c63d1aa3..21e5923e3a0 100644 --- a/docs/en/operations/system-tables/metrics.md +++ b/docs/en/operations/system-tables/metrics.md @@ -35,7 +35,7 @@ SELECT * FROM system.metrics LIMIT 10 - [system.asynchronous_metrics](../../operations/system-tables/asynchronous_metrics.md#system_tables-asynchronous_metrics) — Contains periodically calculated metrics. - [system.events](../../operations/system-tables/events.md#system_tables-events) — Contains a number of events that occurred. -- [system.metric_log](../../operations/system-tables/metric_log.md#system_tables-metric_log) — Contains a history of metrics values from tables `system.metrics` и `system.events`. +- [system.metric_log](../../operations/system-tables/metric_log.md#system_tables-metric_log) — Contains a history of metrics values from tables `system.metrics` and `system.events`. - [Monitoring](../../operations/monitoring.md) — Base concepts of ClickHouse monitoring. [Original article](https://clickhouse.com/docs/en/operations/system-tables/metrics) diff --git a/docs/en/sql-reference/aggregate-functions/reference/uniq.md b/docs/en/sql-reference/aggregate-functions/reference/uniq.md index 598af24c0de..33bfe72548b 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/uniq.md +++ b/docs/en/sql-reference/aggregate-functions/reference/uniq.md @@ -24,9 +24,7 @@ Function: - Calculates a hash for all parameters in the aggregate, then uses it in calculations. -- Uses an adaptive sampling algorithm. For the calculation state, the function uses a sample of element hash values up to 65536. - - This algorithm is very accurate and very efficient on the CPU. When the query contains several of these functions, using `uniq` is almost as fast as using other aggregate functions. +- Uses an adaptive sampling algorithm. 
For the calculation state, the function uses a sample of element hash values up to 65536. This algorithm is very accurate and very efficient on the CPU. When the query contains several of these functions, using `uniq` is almost as fast as using other aggregate functions. - Provides the result deterministically (it does not depend on the query processing order). diff --git a/docs/en/sql-reference/data-types/aggregatefunction.md b/docs/en/sql-reference/data-types/aggregatefunction.md index 81945eeece6..e483a20eed9 100644 --- a/docs/en/sql-reference/data-types/aggregatefunction.md +++ b/docs/en/sql-reference/data-types/aggregatefunction.md @@ -11,9 +11,7 @@ Aggregate functions can have an implementation-defined intermediate state that c **Parameters** -- Name of the aggregate function. - - If the function is parametric, specify its parameters too. +- Name of the aggregate function. If the function is parametric, specify its parameters too. - Types of the aggregate function arguments. diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md index 095f059513c..c3c4bbc6493 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md @@ -122,7 +122,12 @@ Setting fields: - `command` — The absolute path to the executable file, or the file name (if the program directory is written to `PATH`). - `format` — The file format. All the formats described in [Formats](../../../interfaces/formats.md#formats) are supported. +- `command_termination_timeout` — executable script should contain main read-write loop. After dictionary is destroyed, pipe is closed, and executable file will have `command_termination_timeout` seconds to shutdown, before ClickHouse will send SIGTERM signal to child process. Specified in seconds. Default value is 10. Optional parameter. +- `command_read_timeout` - timeout for reading data from command stdout in milliseconds. Default value 10000. Optional parameter. +- `command_write_timeout` - timeout for writing data to command stdin in milliseconds. Default value 10000. Optional parameter. - `implicit_key` — The executable source file can return only values, and the correspondence to the requested keys is determined implicitly — by the order of rows in the result. Default value is false. +- `execute_direct` - If `execute_direct` = `1`, then `command` will be searched inside user_scripts folder. Additional script arguments can be specified using whitespace separator. Example: `script_name arg1 arg2`. If `execute_direct` = `0`, `command` is passed as argument for `bin/sh -c`. Default value is `0`. Optional parameter. +- `send_chunk_header` - controls whether to send row count before sending a chunk of data to process. Optional. Default value is `false`. That dictionary source can be configured only via XML configuration. Creating dictionaries with executable source via DDL is disabled, otherwise, the DB user would be able to execute arbitrary binary on ClickHouse node. @@ -150,10 +155,14 @@ Setting fields: - `command` — The absolute path to the executable file, or the file name (if the program directory is written to `PATH`). - `format` — The file format. All the formats described in “[Formats](../../../interfaces/formats.md#formats)” are supported. -- `pool_size` — Size of pool. 
If 0 is specified as `pool_size` then there is no pool size restrictions. -- `command_termination_timeout` — Executable pool script should contain main read-write loop. After dictionary is destroyed, pipe is closed, and executable file will have `command_termination_timeout` seconds to shutdown, before ClickHouse will send SIGTERM signal to child process. Specified in seconds. Default value is 10. Optional parameter. +- `pool_size` — Size of pool. If 0 is specified as `pool_size` then there is no pool size restrictions. Default value is `16`. +- `command_termination_timeout` — executable script should contain main read-write loop. After dictionary is destroyed, pipe is closed, and executable file will have `command_termination_timeout` seconds to shutdown, before ClickHouse will send SIGTERM signal to child process. Specified in seconds. Default value is 10. Optional parameter. - `max_command_execution_time` — Maximum executable script command execution time for processing block of data. Specified in seconds. Default value is 10. Optional parameter. +- `command_read_timeout` - timeout for reading data from command stdout in milliseconds. Default value 10000. Optional parameter. +- `command_write_timeout` - timeout for writing data to command stdin in milliseconds. Default value 10000. Optional parameter. - `implicit_key` — The executable source file can return only values, and the correspondence to the requested keys is determined implicitly — by the order of rows in the result. Default value is false. Optional parameter. +- `execute_direct` - If `execute_direct` = `1`, then `command` will be searched inside user_scripts folder. Additional script arguments can be specified using whitespace separator. Example: `script_name arg1 arg2`. If `execute_direct` = `0`, `command` is passed as argument for `bin/sh -c`. Default value is `1`. Optional parameter. +- `send_chunk_header` - controls whether to send row count before sending a chunk of data to process. Optional. Default value is `false`. That dictionary source can be configured only via XML configuration. Creating dictionaries with executable source via DDL is disabled, otherwise, the DB user would be able to execute arbitrary binary on ClickHouse node. diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index d85092d683a..8231cda4b77 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1392,12 +1392,24 @@ Returns the first element in the `arr1` array for which `func` returns something Note that the `arrayFirst` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. +## arrayLast(func, arr1, …) {#array-last} + +Returns the last element in the `arr1` array for which `func` returns something other than 0. + +Note that the `arrayLast` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. + ## arrayFirstIndex(func, arr1, …) {#array-first-index} Returns the index of the first element in the `arr1` array for which `func` returns something other than 0. Note that the `arrayFirstIndex` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. 
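A small sketch of how these higher-order functions behave on literal arrays (the values below are purely illustrative):

``` sql
SELECT
    arrayFirst(x -> x > 2, [1, 2, 3, 5])      AS first_match,  -- returns 3
    arrayFirstIndex(x -> x > 2, [1, 2, 3, 5]) AS first_index,  -- returns 3 (indexes are 1-based)
    arrayLast(x -> x > 2, [1, 2, 3, 5])       AS last_match;   -- returns 5
```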
+## arrayLastIndex(func, arr1, …) {#array-last-index} + +Returns the index of the last element in the `arr1` array for which `func` returns something other than 0. + +Note that the `arrayLastIndex` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. + ## arrayMin {#array-min} Returns the minimum of elements in the source array. diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index b85f105758b..7ded7e72d8c 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -57,7 +57,7 @@ Alias: `toTimezone`. **Arguments** - `value` — Time or date and time. [DateTime64](../../sql-reference/data-types/datetime64.md). -- `timezone` — Timezone for the returned value. [String](../../sql-reference/data-types/string.md). +- `timezone` — Timezone for the returned value. [String](../../sql-reference/data-types/string.md). This argument is a constant, because `toTimezone` changes the timezone of a column (timezone is an attribute of `DateTime*` types). **Returned value** diff --git a/docs/en/sql-reference/functions/ext-dict-functions.md b/docs/en/sql-reference/functions/ext-dict-functions.md index 0e8352d2d1e..84e1e5eca3b 100644 --- a/docs/en/sql-reference/functions/ext-dict-functions.md +++ b/docs/en/sql-reference/functions/ext-dict-functions.md @@ -217,8 +217,8 @@ Result: ``` text (0,'2019-05-20') 0 \N \N (NULL,NULL) (1,'2019-05-20') 1 First First ('First','First') -(2,'2019-05-20') 0 \N \N (NULL,NULL) -(3,'2019-05-20') 0 \N \N (NULL,NULL) +(2,'2019-05-20') 1 Second \N ('Second',NULL) +(3,'2019-05-20') 1 Third Third ('Third','Third') (4,'2019-05-20') 0 \N \N (NULL,NULL) ``` diff --git a/docs/en/sql-reference/functions/index.md b/docs/en/sql-reference/functions/index.md index e86e6b37998..ddc113d31f9 100644 --- a/docs/en/sql-reference/functions/index.md +++ b/docs/en/sql-reference/functions/index.md @@ -73,26 +73,74 @@ User defined function configurations are searched relative to the path specified A function configuration contains the following settings: - `name` - a function name. -- `command` - a command or a script to execute. +- `command` - script name to execute or command if `execute_direct` is false. - `argument` - argument description with the `type` of an argument. Each argument is described in a separate setting. - `format` - a [format](../../interfaces/formats.md) in which arguments are passed to the command. - `return_type` - the type of a returned value. - `type` - an executable type. If `type` is set to `executable` then single command is started. If it is set to `executable_pool` then a pool of commands is created. - `max_command_execution_time` - maximum execution time in seconds for processing block of data. This setting is valid for `executable_pool` commands only. Optional. Default value is `10`. -- `command_termination_timeout` - time in seconds during which a command should finish after its pipe is closed. After that time `SIGTERM` is sent to the process executing the command. This setting is valid for `executable_pool` commands only. Optional. Default value is `10`. +- `command_termination_timeout` - time in seconds during which a command should finish after its pipe is closed. After that time `SIGTERM` is sent to the process executing the command. Optional. Default value is `10`. 
+- `command_read_timeout` - timeout for reading data from command stdout in milliseconds. Default value 10000. Optional parameter. +- `command_write_timeout` - timeout for writing data to command stdin in milliseconds. Default value 10000. Optional parameter. - `pool_size` - the size of a command pool. Optional. Default value is `16`. -- `lifetime` - the reload interval of a function in seconds. If it is set to `0` then the function is not reloaded. - `send_chunk_header` - controls whether to send row count before sending a chunk of data to process. Optional. Default value is `false`. +- `execute_direct` - If `execute_direct` = `1`, then `command` will be searched inside user_scripts folder. Additional script arguments can be specified using whitespace separator. Example: `script_name arg1 arg2`. If `execute_direct` = `0`, `command` is passed as argument for `bin/sh -c`. Default value is `1`. Optional parameter. +- `lifetime` - the reload interval of a function in seconds. If it is set to `0` then the function is not reloaded. Default value is `0`. Optional parameter. The command must read arguments from `STDIN` and must output the result to `STDOUT`. The command must process arguments iteratively. That is after processing a chunk of arguments it must wait for the next chunk. **Example** -Creating `test_function` using XML configuration: -``` +Creating `test_function` using XML configuration. +File test_function.xml. +```xml executable - test_function + test_function_python + String + + UInt64 + + TabSeparated + test_function.py + + +``` + +Script file inside `user_scripts` folder `test_function.py`. + +```python +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for line in sys.stdin: + print("Value " + line, end='') + sys.stdout.flush() +``` + +Query: + +``` sql +SELECT test_function_python(toUInt64(2)); +``` + +Result: + +``` text +┌─test_function_python(2)─┐ +│ Value 2 │ +└─────────────────────────┘ +``` + +Creating `test_function_sum` manually specifying `execute_direct` to `0` using XML configuration. +File test_function.xml. +```xml + + + executable + test_function_sum UInt64 UInt64 @@ -102,7 +150,7 @@ Creating `test_function` using XML configuration: TabSeparated cd /; clickhouse-local --input-format TabSeparated --output-format TabSeparated --structure 'x UInt64, y UInt64' --query "SELECT x + y FROM table" - 0 + 0 ``` @@ -110,15 +158,15 @@ Creating `test_function` using XML configuration: Query: ``` sql -SELECT test_function(toUInt64(2), toUInt64(2)); +SELECT test_function_sum(2, 2); ``` Result: ``` text -┌─test_function(toUInt64(2), toUInt64(2))─┐ -│ 4 │ -└─────────────────────────────────────────┘ +┌─test_function_sum(2, 2)─┐ +│ 4 │ +└─────────────────────────┘ ``` diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index c62603a50b9..a0c0116a058 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -351,8 +351,6 @@ Checks whether the string matches the `pattern` regular expression. A `re2` regu Returns 0 if it does not match, or 1 if it matches. -Note that the backslash symbol (`\`) is used for escaping in the regular expression. The same symbol is used for escaping in string literals. So in order to escape the symbol in a regular expression, you must write two backslashes (\\) in a string literal. - The regular expression works with the string as if it is a set of bytes. 
The regular expression can’t contain null bytes. For patterns to search for substrings in a string, it is better to use LIKE or ‘position’, since they work much faster. diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 88e1cf47592..160e7be156e 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -344,9 +344,9 @@ SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val); Result: ``` text -┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ -│ -1.11100 │ Nullable(Decimal(9, 5)) │ -└──────────┴────────────────────────────────────────────────────┘ +┌────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ +│ -1.111 │ Nullable(Decimal(9, 5)) │ +└────────┴────────────────────────────────────────────────────┘ ``` Query: @@ -451,9 +451,9 @@ SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val); Result: ``` text -┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ -│ -1.11100 │ Decimal(9, 5) │ -└──────────┴────────────────────────────────────────────────────┘ +┌────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ +│ -1.111 │ Decimal(9, 5) │ +└────────┴────────────────────────────────────────────────────┘ ``` Query: diff --git a/docs/en/sql-reference/functions/url-functions.md b/docs/en/sql-reference/functions/url-functions.md index ae2113a2b64..98c3135f2b4 100644 --- a/docs/en/sql-reference/functions/url-functions.md +++ b/docs/en/sql-reference/functions/url-functions.md @@ -360,6 +360,21 @@ SELECT decodeURLComponent('http://127.0.0.1:8123/?query=SELECT%201%3B') AS Decod └────────────────────────────────────────┘ ``` +### decodeURLFormComponent(URL) {#decodeurlformcomponenturl} + +Returns the decoded URL. Follows rfc-1866, plain plus(`+`) is decoded as space(` `). +Example: + +``` sql +SELECT decodeURLFormComponent('http://127.0.0.1:8123/?query=SELECT%201+2%2B3') AS DecodedURL; +``` + +``` text +┌─DecodedURL────────────────────────────────┐ +│ http://127.0.0.1:8123/?query=SELECT 1 2+3 │ +└───────────────────────────────────────────┘ +``` + ### netloc {#netloc} Extracts network locality (`username:password@host:port`) from a URL. diff --git a/docs/en/sql-reference/statements/alter/projection.md b/docs/en/sql-reference/statements/alter/projection.md index 96cd8f5d607..c7ebc83c496 100644 --- a/docs/en/sql-reference/statements/alter/projection.md +++ b/docs/en/sql-reference/statements/alter/projection.md @@ -9,11 +9,12 @@ The following operations with [projections](../../../engines/table-engines/merge - `ALTER TABLE [db].name ADD PROJECTION name ( SELECT [GROUP BY] [ORDER BY] )` - Adds projection description to tables metadata. -- `ALTER TABLE [db].name DROP PROJECTION name` - Removes projection description from tables metadata and deletes projection files from disk. +- `ALTER TABLE [db].name DROP PROJECTION name` - Removes projection description from tables metadata and deletes projection files from disk. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). - `ALTER TABLE [db.]table MATERIALIZE PROJECTION name IN PARTITION partition_name` - The query rebuilds the projection `name` in the partition `partition_name`. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). 
-- `ALTER TABLE [db.]table CLEAR PROJECTION name IN PARTITION partition_name` - Deletes projection files from disk without removing description. +- `ALTER TABLE [db.]table CLEAR PROJECTION name IN PARTITION partition_name` - Deletes projection files from disk without removing description. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). + The commands `ADD`, `DROP` and `CLEAR` are lightweight in a sense that they only change metadata or remove files. diff --git a/docs/en/sql-reference/statements/create/dictionary.md b/docs/en/sql-reference/statements/create/dictionary.md index 61428cce126..86ab8f977b0 100644 --- a/docs/en/sql-reference/statements/create/dictionary.md +++ b/docs/en/sql-reference/statements/create/dictionary.md @@ -10,7 +10,7 @@ Creates a new [external dictionary](../../../sql-reference/dictionaries/external **Syntax** ``` sql -CREATE DICTIONARY [OR REPLACE][IF NOT EXISTS] [db.]dictionary_name [ON CLUSTER cluster] +CREATE [OR REPLACE] DICTIONARY [IF NOT EXISTS] [db.]dictionary_name [ON CLUSTER cluster] ( key1 type1 [DEFAULT|EXPRESSION expr1] [IS_OBJECT_ID], key2 type2 [DEFAULT|EXPRESSION expr2], diff --git a/docs/en/sql-reference/statements/explain.md b/docs/en/sql-reference/statements/explain.md index 2d129f1bc60..9c74c069f02 100644 --- a/docs/en/sql-reference/statements/explain.md +++ b/docs/en/sql-reference/statements/explain.md @@ -10,7 +10,12 @@ Shows the execution plan of a statement. Syntax: ```sql -EXPLAIN [AST | SYNTAX | PLAN | PIPELINE] [setting = value, ...] SELECT ... [FORMAT ...] +EXPLAIN [AST | SYNTAX | PLAN | PIPELINE | TABLE OVERRIDE] [setting = value, ...] + [ + SELECT ... | + tableFunction(...) [COLUMNS (...)] [ORDER BY ...] [PARTITION BY ...] [PRIMARY KEY] [SAMPLE BY ...] [TTL ...] + ] + [FORMAT ...] ``` Example: @@ -412,4 +417,37 @@ Result: └──────────┴───────┴───────┴──────┴───────┘ ``` +### EXPLAIN TABLE OVERRIDE {#explain-table-override} + +Shows the result of a table override on a table schema accessed through a table function. +Also does some validation, throwing an exception if the override would have caused some kind of failure. + +**Example** + +Assume you have a remote MySQL table like this: + +```sql +CREATE TABLE db.tbl ( + id INT PRIMARY KEY, + created DATETIME DEFAULT now() +) +``` + +```sql +EXPLAIN TABLE OVERRIDE mysql('127.0.0.1:3306', 'db', 'tbl', 'root', 'clickhouse') +PARTITION BY toYYYYMM(assumeNotNull(created)) +``` + +Result: + +```text +┌─explain─────────────────────────────────────────────────┐ +│ PARTITION BY uses columns: `created` Nullable(DateTime) │ +└─────────────────────────────────────────────────────────┘ +``` + +!!! note "Note" + The validation is not complete, so a successfull query does not guarantee that the override would + not cause issues. + [Оriginal article](https://clickhouse.com/docs/en/sql-reference/statements/explain/) diff --git a/docs/en/sql-reference/statements/grant.md b/docs/en/sql-reference/statements/grant.md index f04952746a6..2b1262f7d3c 100644 --- a/docs/en/sql-reference/statements/grant.md +++ b/docs/en/sql-reference/statements/grant.md @@ -21,7 +21,7 @@ GRANT [ON CLUSTER cluster_name] privilege[(column_name [,...])] [,...] ON {db.ta - `user` — ClickHouse user account. The `WITH GRANT OPTION` clause grants `user` or `role` with permission to execute the `GRANT` query. Users can grant privileges of the same scope they have and less. 
-The `WITH REPLACE OPTION` clause replace old privileges by new privileges for the `user` or `role`, if not specified it is append privileges.
+The `WITH REPLACE OPTION` clause replaces old privileges with new privileges for the `user` or `role`; if it is not specified, privileges are appended.
## Assigning Role Syntax {#assign-role-syntax}
@@ -33,7 +33,7 @@ GRANT [ON CLUSTER cluster_name] role [,...] TO {user | another_role | CURRENT_US
- `user` — ClickHouse user account.
The `WITH ADMIN OPTION` clause grants [ADMIN OPTION](#admin-option-privilege) privilege to `user` or `role`.
-The `WITH REPLACE OPTION` clause replace old roles by new role for the `user` or `role`, if not specified it is append roles.
+The `WITH REPLACE OPTION` clause replaces old roles with new roles for the `user` or `role`; if it is not specified, roles are appended.
## Usage {#grant-usage}
diff --git a/docs/en/sql-reference/statements/select/join.md b/docs/en/sql-reference/statements/select/join.md
index aa61348d2a0..3d302be561a 100644
--- a/docs/en/sql-reference/statements/select/join.md
+++ b/docs/en/sql-reference/statements/select/join.md
@@ -55,13 +55,13 @@ The behavior of ClickHouse server for `ANY JOIN` operations depends on the [any_
- [join_on_disk_max_files_to_merge](../../../operations/settings/settings.md#join_on_disk_max_files_to_merge)
- [any_join_distinct_right_table_keys](../../../operations/settings/settings.md#any_join_distinct_right_table_keys)
-## ON Section Conditions {on-section-conditions}
+## ON Section Conditions {#on-section-conditions}
+
+An `ON` section can contain several conditions combined using the `AND` and `OR` operators. Conditions specifying join keys must refer to both left and right tables and must use the equality operator. Other conditions may use other logical operators but they must refer to either the left or the right table of a query.
-An `ON` section can contain several conditions combined using the `AND` operator. Conditions specifying join keys must refer both left and right tables and must use the equality operator. Other conditions may use other logical operators but they must refer either the left or the right table of a query.
Rows are joined if the whole complex condition is met. If the conditions are not met, still rows may be included in the result depending on the `JOIN` type. Note that if the same conditions are placed in a `WHERE` section and they are not met, then rows are always filtered out from the result.
-!!! note "Note"
- The `OR` operator inside an `ON` section is not supported yet.
+The `OR` operator inside the `ON` clause works using the hash join algorithm — for each `OR` argument with join keys for `JOIN`, a separate hash table is created, so memory consumption and query execution time grow linearly with the number of `OR` expressions in the `ON` clause.
!!! note "Note"
If a condition refers columns from different tables, then only the equality operator (`=`) is supported so far.
@@ -109,7 +109,47 @@ Result: │ B │ Text B │ 15 │ └──────┴────────┴────────┘ ``` +Query with `INNER` type of a join and condition with `OR`: +``` sql +CREATE TABLE t1 (`a` Int64, `b` Int64) ENGINE = MergeTree() ORDER BY a; + +CREATE TABLE t2 (`key` Int32, `val` Int64) ENGINE = MergeTree() ORDER BY key; + +INSERT INTO t1 SELECT number as a, -a as b from numbers(5); + +INSERT INTO t2 SELECT if(number % 2 == 0, toInt64(number), -number) as key, number as val from numbers(5); + +SELECT a, b, val FROM t1 INNER JOIN t2 ON t1.a = t2.key OR t1.b = t2.key; +``` + +Result: + +``` +┌─a─┬──b─┬─val─┐ +│ 0 │ 0 │ 0 │ +│ 1 │ -1 │ 1 │ +│ 2 │ -2 │ 2 │ +│ 3 │ -3 │ 3 │ +│ 4 │ -4 │ 4 │ +└───┴────┴─────┘ +``` + +Query with `INNER` type of a join and conditions with `OR` and `AND`: + +``` sql +SELECT a, b, val FROM t1 INNER JOIN t2 ON t1.a = t2.key OR t1.b = t2.key AND t2.val > 3; +``` + +Result: + +``` +┌─a─┬──b─┬─val─┐ +│ 0 │ 0 │ 0 │ +│ 2 │ -2 │ 2 │ +│ 4 │ -4 │ 4 │ +└───┴────┴─────┘ +``` ## ASOF JOIN Usage {#asof-join-usage} `ASOF JOIN` is useful when you need to join records that have no exact match. diff --git a/docs/ja/faq/general/how-do-i-contribute-code-to-clickhouse.md b/docs/ja/faq/general/how-do-i-contribute-code-to-clickhouse.md new file mode 120000 index 00000000000..5ac9a615386 --- /dev/null +++ b/docs/ja/faq/general/how-do-i-contribute-code-to-clickhouse.md @@ -0,0 +1 @@ +../../../en/faq/general/how-do-i-contribute-code-to-clickhouse.md \ No newline at end of file diff --git a/docs/ja/faq/operations/multi-region-replication.md b/docs/ja/faq/operations/multi-region-replication.md new file mode 120000 index 00000000000..dbc985ee1fb --- /dev/null +++ b/docs/ja/faq/operations/multi-region-replication.md @@ -0,0 +1 @@ +../../../en/faq/operations/multi-region-replication.md \ No newline at end of file diff --git a/docs/ja/interfaces/http.md b/docs/ja/interfaces/http.md index 4ac9cd9e472..210e3f46d24 100644 --- a/docs/ja/interfaces/http.md +++ b/docs/ja/interfaces/http.md @@ -397,7 +397,7 @@ $ curl -v 'http://localhost:8123/predefined_query' `` 値は以下の定義済みクエリです `` これは、Http要求が一致し、クエリの結果が返されたときにClickHouseによって実行されます。 これは必須構成です。 -次の例では、次の値を定義します `max_threads` と `max_alter_threads` 設定、そしてクエリのテーブルから設定設定します。 +次の例では、次の値を定義します `max_threads` と `max_final_threads` 設定、そしてクエリのテーブルから設定設定します。 例: @@ -420,9 +420,9 @@ $ curl -v 'http://localhost:8123/predefined_query' ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_alter_threads?max_threads=1&max_alter_threads=2' +$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_final_threads?max_threads=1&max_final_threads=2' 1 -max_alter_threads 2 +max_final_threads 2 ``` !!! note "注意" @@ -434,7 +434,7 @@ max_alter_threads 2 クリックハウスは、 `` HTTP要求のurlの値。 のデフォルト値 `` は `/query` . 
これはオプションの構成です。 設定ファイルに定義がない場合、paramは渡されません。 -この機能を試すために、この例ではmax_threadsとmax_alter_threadsの値を定義し、設定が正常に設定されたかどうかを照会します。 +この機能を試すために、この例ではmax_threadsとmax_final_threadsの値を定義し、設定が正常に設定されたかどうかを照会します。 例: @@ -452,9 +452,9 @@ max_alter_threads 2 ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_alter_threads=2¶m_name_1=max_threads¶m_name_2=max_alter_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' +$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_final_threads=2¶m_name_1=max_threads¶m_name_2=max_final_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' max_threads 1 -max_alter_threads 2 +max_final_threads 2 ``` ## 静的 {#static} diff --git a/docs/ja/sql-reference/functions/type-conversion-functions.md b/docs/ja/sql-reference/functions/type-conversion-functions.md index fd935c23d5f..a16bca0c1f9 100644 --- a/docs/ja/sql-reference/functions/type-conversion-functions.md +++ b/docs/ja/sql-reference/functions/type-conversion-functions.md @@ -170,9 +170,9 @@ SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) ``` ``` text -┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ -│ -1.11100 │ Nullable(Decimal(9, 5)) │ -└──────────┴────────────────────────────────────────────────────┘ +┌────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ +│ -1.111 │ Nullable(Decimal(9, 5)) │ +└────────┴────────────────────────────────────────────────────┘ ``` ``` sql @@ -214,9 +214,9 @@ SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val) ``` ``` text -┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ -│ -1.11100 │ Decimal(9, 5) │ -└──────────┴────────────────────────────────────────────────────┘ +┌────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ +│ -1.111 │ Decimal(9, 5) │ +└────────┴────────────────────────────────────────────────────┘ ``` ``` sql diff --git a/docs/ru/development/developer-instruction.md b/docs/ru/development/developer-instruction.md index 8466c709ad1..964d39163d8 100644 --- a/docs/ru/development/developer-instruction.md +++ b/docs/ru/development/developer-instruction.md @@ -40,10 +40,10 @@ ClickHouse не работает и не собирается на 32-битны Выполните в терминале: - git clone git@github.com:ClickHouse/ClickHouse.git + git clone git@github.com:your_github_username/ClickHouse.git --recursive cd ClickHouse -Замените первое вхождение слова `ClickHouse` в команде для git на имя вашего аккаунта на GitHub. +Замените слово `your_github_username` в команде для git на имя вашего аккаунта на GitHub. Эта команда создаст директорию ClickHouse, содержащую рабочую копию проекта. diff --git a/docs/ru/engines/table-engines/integrations/hdfs.md b/docs/ru/engines/table-engines/integrations/hdfs.md index 5949cc8a0d7..78a82955cd2 100644 --- a/docs/ru/engines/table-engines/integrations/hdfs.md +++ b/docs/ru/engines/table-engines/integrations/hdfs.md @@ -5,7 +5,7 @@ toc_title: HDFS # HDFS {#table_engines-hdfs} -Управляет данными в HDFS. Данный движок похож на движки [File](../special/file.md#table_engines-file) и [URL](../special/url.md#table_engines-url). +Этот движок обеспечивает интеграцию с экосистемой [Apache Hadoop](https://ru.wikipedia.org/wiki/Hadoop), позволяя управлять данными в HDFS посредством ClickHouse. 
Данный движок похож на движки [File](../special/file.md#table_engines-file) и [URL](../special/url.md#table_engines-url), но предоставляет возможности, характерные для Hadoop. ## Использование движка {#usage} @@ -13,9 +13,11 @@ toc_title: HDFS ENGINE = HDFS(URI, format) ``` -В параметр `URI` нужно передавать полный URI файла в HDFS. +**Параметры движка** + +В параметр `URI` нужно передавать полный URI файла в HDFS. Часть URI с путем файла может содержать шаблоны. В этом случае таблица может использоваться только для чтения. Параметр `format` должен быть таким, который ClickHouse может использовать и в запросах `INSERT`, и в запросах `SELECT`. Полный список поддерживаемых форматов смотрите в разделе [Форматы](../../../interfaces/formats.md#formats). -Часть URI с путем файла может содержать шаблоны. В этом случае таблица может использоваться только для чтения. + **Пример:** @@ -67,12 +69,12 @@ SELECT * FROM hdfs_engine_table LIMIT 2 1. Предположим, у нас есть несколько файлов со следующими URI в HDFS: -- 'hdfs://hdfs1:9000/some_dir/some_file_1' -- 'hdfs://hdfs1:9000/some_dir/some_file_2' -- 'hdfs://hdfs1:9000/some_dir/some_file_3' -- 'hdfs://hdfs1:9000/another_dir/some_file_1' -- 'hdfs://hdfs1:9000/another_dir/some_file_2' -- 'hdfs://hdfs1:9000/another_dir/some_file_3' + - 'hdfs://hdfs1:9000/some_dir/some_file_1' + - 'hdfs://hdfs1:9000/some_dir/some_file_2' + - 'hdfs://hdfs1:9000/some_dir/some_file_3' + - 'hdfs://hdfs1:9000/another_dir/some_file_1' + - 'hdfs://hdfs1:9000/another_dir/some_file_2' + - 'hdfs://hdfs1:9000/another_dir/some_file_3' 1. Есть несколько возможностей создать таблицу, состояющую из этих шести файлов: @@ -128,6 +130,7 @@ CREATE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9 | **параметр** | **по умолчанию** | +| - | - | | rpc\_client\_connect\_tcpnodelay | true | | dfs\_client\_read\_shortcircuit | true | | output\_replace-datanode-on-failure | true | @@ -177,22 +180,23 @@ CREATE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9 #### Расширенные параметры для ClickHouse {#clickhouse-extras} | **параметр** | **по умолчанию** | +| - | - | |hadoop\_kerberos\_keytab | "" | |hadoop\_kerberos\_principal | "" | |hadoop\_kerberos\_kinit\_command | kinit | ### Ограничения {#limitations} - * hadoop\_security\_kerberos\_ticket\_cache\_path могут быть определены только на глобальном уровне + * `hadoop_security_kerberos_ticket_cache_path` и `libhdfs3_conf` могут быть определены только на глобальном, а не на пользовательском уровне ## Поддержка Kerberos {#kerberos-support} -Если hadoop\_security\_authentication параметр имеет значение 'kerberos', ClickHouse аутентифицируется с помощью Kerberos. -[Расширенные параметры](#clickhouse-extras) и hadoop\_security\_kerberos\_ticket\_cache\_path помогают сделать это. +Если параметр `hadoop_security_authentication` имеет значение `kerberos`, ClickHouse аутентифицируется с помощью Kerberos. +[Расширенные параметры](#clickhouse-extras) и `hadoop_security_kerberos_ticket_cache_path` помогают сделать это. Обратите внимание что из-за ограничений libhdfs3 поддерживается только устаревший метод аутентификации, -коммуникация с узлами данных не защищена SASL (HADOOP\_SECURE\_DN\_USER надежный показатель такого -подхода к безопасности). Используйте tests/integration/test\_storage\_kerberized\_hdfs/hdfs_configs/bootstrap.sh для примера настроек. +коммуникация с узлами данных не защищена SASL (`HADOOP_SECURE_DN_USER` надежный показатель такого +подхода к безопасности). 
Используйте `tests/integration/test_storage_kerberized_hdfs/hdfs_configs/bootstrap.sh` для примера настроек. -Если hadoop\_kerberos\_keytab, hadoop\_kerberos\_principal или hadoop\_kerberos\_kinit\_command указаны в настройках, kinit будет вызван. hadoop\_kerberos\_keytab и hadoop\_kerberos\_principal обязательны в этом случае. Необходимо также будет установить kinit и файлы конфигурации krb5. +Если `hadoop_kerberos_keytab`, `hadoop_kerberos_principal` или `hadoop_kerberos_kinit_command` указаны в настройках, `kinit` будет вызван. `hadoop_kerberos_keytab` и `hadoop_kerberos_principal` обязательны в этом случае. Необходимо также будет установить `kinit` и файлы конфигурации krb5. ## Виртуальные столбцы {#virtual-columns} diff --git a/docs/ru/engines/table-engines/integrations/kafka.md b/docs/ru/engines/table-engines/integrations/kafka.md index 19e2850dd51..7ea3d124ab3 100644 --- a/docs/ru/engines/table-engines/integrations/kafka.md +++ b/docs/ru/engines/table-engines/integrations/kafka.md @@ -191,5 +191,5 @@ ClickHouse может поддерживать учетные данные Kerbe **Смотрите также** - [Виртуальные столбцы](index.md#table_engines-virtual_columns) -- [background_schedule_pool_size](../../../operations/settings/settings.md#background_schedule_pool_size) +- [background_message_broker_schedule_pool_size](../../../operations/settings/settings.md#background_message_broker_schedule_pool_size) diff --git a/docs/ru/faq/general/how-do-i-contribute-code-to-clickhouse.md b/docs/ru/faq/general/how-do-i-contribute-code-to-clickhouse.md new file mode 120000 index 00000000000..5ac9a615386 --- /dev/null +++ b/docs/ru/faq/general/how-do-i-contribute-code-to-clickhouse.md @@ -0,0 +1 @@ +../../../en/faq/general/how-do-i-contribute-code-to-clickhouse.md \ No newline at end of file diff --git a/docs/ru/faq/operations/multi-region-replication.md b/docs/ru/faq/operations/multi-region-replication.md new file mode 120000 index 00000000000..dbc985ee1fb --- /dev/null +++ b/docs/ru/faq/operations/multi-region-replication.md @@ -0,0 +1 @@ +../../../en/faq/operations/multi-region-replication.md \ No newline at end of file diff --git a/docs/ru/interfaces/http.md b/docs/ru/interfaces/http.md index 27a70a5c26d..8687201e1c9 100644 --- a/docs/ru/interfaces/http.md +++ b/docs/ru/interfaces/http.md @@ -422,7 +422,7 @@ $ curl -v 'http://localhost:8123/predefined_query' Значение `query` — это предопределенный запрос `predefined_query_handler`, который выполняется ClickHouse при совпадении HTTP-запроса и возврате результата запроса. Это обязательная настройка. -В следующем примере определяются настройки [max_threads](../operations/settings/settings.md#settings-max_threads) и `max_alter_threads`, а затем запрашивается системная таблица, чтобы проверить, были ли эти параметры успешно установлены. +В следующем примере определяются настройки [max_threads](../operations/settings/settings.md#settings-max_threads) и `max_final_threads`, а затем запрашивается системная таблица, чтобы проверить, были ли эти параметры успешно установлены. !!! note "Предупреждение" Чтобы сохранить стандартные `handlers` такие как `query`, `play`, `ping`, используйте правило ``. 
@@ -449,9 +449,9 @@ $ curl -v 'http://localhost:8123/predefined_query' ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_alter_threads?max_threads=1&max_alter_threads=2' +$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_final_threads?max_threads=1&max_final_threads=2' 1 -max_alter_threads 2 +max_final_threads 2 ``` !!! note "Предупреждение" @@ -463,7 +463,7 @@ max_alter_threads 2 ClickHouse извлекает и выполняет значение, соответствующее значению `query_param_name` URL-адресе HTTP-запроса. Значение по умолчанию `query_param_name` — это `/query` . Это необязательная настройка. Если в файле конфигурации нет определения, параметр не передается. -Чтобы поэкспериментировать с этой функциональностью, в примере определяются значения [max_threads](../operations/settings/settings.md#settings-max_threads) и `max_alter_threads` и запрашивается, успешно ли были установлены настройки. +Чтобы поэкспериментировать с этой функциональностью, в примере определяются значения [max_threads](../operations/settings/settings.md#settings-max_threads) и `max_final_threads` и запрашивается, успешно ли были установлены настройки. Пример: @@ -482,9 +482,9 @@ ClickHouse извлекает и выполняет значение, соотв ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_alter_threads=2¶m_name_1=max_threads¶m_name_2=max_alter_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' +$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_final_threads=2¶m_name_1=max_threads¶m_name_2=max_final_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' max_threads 1 -max_alter_threads 2 +max_final_threads 2 ``` ### static {#static} diff --git a/docs/ru/operations/clickhouse-keeper.md b/docs/ru/operations/clickhouse-keeper.md index 9d6c4799008..2f3f3c0f63c 100644 --- a/docs/ru/operations/clickhouse-keeper.md +++ b/docs/ru/operations/clickhouse-keeper.md @@ -3,14 +3,14 @@ toc_priority: 66 toc_title: ClickHouse Keeper --- -# [пре-продакшн] ClickHouse Keeper +# [пре-продакшн] ClickHouse Keeper {#clickHouse-keeper} Сервер ClickHouse использует сервис координации [ZooKeeper](https://zookeeper.apache.org/) для [репликации](../engines/table-engines/mergetree-family/replication.md) данных и выполнения [распределенных DDL запросов](../sql-reference/distributed-ddl.md). ClickHouse Keeper — это альтернативный сервис координации, совместимый с ZooKeeper. !!! warning "Предупреждение" ClickHouse Keeper находится в стадии пре-продакшн и тестируется в CI ClickHouse и на нескольких внутренних инсталляциях. -## Детали реализации +## Детали реализации {#implementation-details} ZooKeeper — один из первых широко известных сервисов координации с открытым исходным кодом. Он реализован на языке программирования Java, имеет достаточно простую и мощную модель данных. Алгоритм координации Zookeeper называется ZAB (ZooKeeper Atomic Broadcast). Он не гарантирует линеаризуемость операций чтения, поскольку каждый узел ZooKeeper обслуживает чтения локально. В отличие от ZooKeeper, ClickHouse Keeper реализован на C++ и использует алгоритм [RAFT](https://raft.github.io/), [реализация](https://github.com/eBay/NuRaft). 
Этот алгоритм позволяет достичь линеаризуемости чтения и записи, имеет несколько реализаций с открытым исходным кодом на разных языках. @@ -21,7 +21,7 @@ ZooKeeper — один из первых широко известных сер !!! info "Примечание" Внешние интеграции не поддерживаются. -## Конфигурация +## Конфигурация {#configuration} ClickHouse Keeper может использоваться как равноценная замена ZooKeeper или как внутренняя часть сервера ClickHouse, но в обоих случаях конфигурация представлена файлом `.xml`. Главный тег конфигурации ClickHouse Keeper — это `<keeper_server>`. Параметры конфигурации: @@ -54,6 +54,7 @@ ClickHouse Keeper может использоваться как равноце - `auto_forwarding` — разрешить пересылку запросов на запись от последователей лидеру (по умолчанию: true). - `shutdown_timeout` — время ожидания завершения внутренних подключений и выключения, в миллисекундах (по умолчанию: 5000). - `startup_timeout` — время отключения сервера, если он не подключается к другим участникам кворума, в миллисекундах (по умолчанию: 30000). +- `four_letter_word_white_list` — список разрешенных 4-х буквенных команд (по умолчанию: "conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro"). Конфигурация кворума находится в `<keeper_server>.<raft_configuration>` и содержит описание серверов. @@ -101,7 +102,7 @@ ClickHouse Keeper может использоваться как равноце ``` -## Как запустить +## Как запустить {#how-to-run} ClickHouse Keeper входит в пакет `clickhouse-server`, просто добавьте конфигурацию `<keeper_server>` и запустите сервер ClickHouse как обычно. Если вы хотите запустить ClickHouse Keeper автономно, сделайте это аналогичным способом: @@ -109,7 +110,195 @@ ClickHouse Keeper входит в пакет `clickhouse-server`, просто clickhouse-keeper --config /etc/your_path_to_config/config.xml --daemon ``` -## [экспериментально] Переход с ZooKeeper +## 4-х буквенные команды {#four-letter-word-commands} + +ClickHouse Keeper также поддерживает 4-х буквенные команды, почти такие же, как у ZooKeeper. Каждая команда состоит из 4-х символов, например, `mntr`, `stat` и т. д. Несколько интересных команд: `stat` предоставляет общую информацию о сервере и подключенных клиентах, а `srvr` и `cons` предоставляют расширенные сведения о сервере и подключениях соответственно. + +У 4-х буквенных команд есть параметр для настройки разрешенного списка `four_letter_word_white_list`, который имеет значение по умолчанию "conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro". + +Вы можете отправлять команды в ClickHouse Keeper через telnet или nc на порт для клиента. + +``` +echo mntr | nc localhost 9181 +``` + +Ниже приведен подробный список 4-х буквенных команд: + +- `ruok`: Проверяет, что сервер запущен без ошибок. В этом случае сервер ответит `imok`. В противном случае он не ответит. Ответ `imok` не обязательно означает, что сервер присоединился к кворуму, а указывает, что процесс сервера активен и привязан к указанному клиентскому порту. Используйте команду `stat` для получения подробной информации о состоянии кворума и клиентском подключении. + +``` +imok +``` + +- `mntr`: Выводит список переменных, которые используются для мониторинга работоспособности кластера.
+ +``` +zk_version v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 +zk_avg_latency 0 +zk_max_latency 0 +zk_min_latency 0 +zk_packets_received 68 +zk_packets_sent 68 +zk_num_alive_connections 1 +zk_outstanding_requests 0 +zk_server_state leader +zk_znode_count 4 +zk_watch_count 1 +zk_ephemerals_count 0 +zk_approximate_data_size 723 +zk_open_file_descriptor_count 310 +zk_max_file_descriptor_count 10240 +zk_followers 0 +zk_synced_followers 0 +``` + +- `srvr`: Выводит информацию о сервере: его версию, роль участника кворума и т.п. + +``` +ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 +Latency min/avg/max: 0/0/0 +Received: 2 +Sent : 2 +Connections: 1 +Outstanding: 0 +Zxid: 34 +Mode: leader +Node count: 4 +``` + +- `stat`: Выводит краткие сведения о сервере и подключенных клиентах. + +``` +ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 +Clients: + 192.168.1.1:52852(recved=0,sent=0) + 192.168.1.1:52042(recved=24,sent=48) +Latency min/avg/max: 0/0/0 +Received: 4 +Sent : 4 +Connections: 1 +Outstanding: 0 +Zxid: 36 +Mode: leader +Node count: 4 +``` + +- `srst`: Сбрасывает статистику сервера. Команда влияет на результат вывода `srvr`, `mntr` и `stat`. + +``` +Server stats reset. +``` + +- `conf`: Выводит подробную информацию о серверной конфигурации. + +``` +server_id=1 +tcp_port=2181 +four_letter_word_white_list=* +log_storage_path=./coordination/logs +snapshot_storage_path=./coordination/snapshots +max_requests_batch_size=100 +session_timeout_ms=30000 +operation_timeout_ms=10000 +dead_session_check_period_ms=500 +heart_beat_interval_ms=500 +election_timeout_lower_bound_ms=1000 +election_timeout_upper_bound_ms=2000 +reserved_log_items=1000000000000000 +snapshot_distance=10000 +auto_forwarding=true +shutdown_timeout=5000 +startup_timeout=240000 +raft_logs_level=information +snapshots_to_keep=3 +rotate_log_storage_interval=100000 +stale_log_gap=10000 +fresh_log_gap=200 +max_requests_batch_size=100 +quorum_reads=false +force_sync=false +compress_logs=true +compress_snapshots_with_zstd_format=true +configuration_change_tries_count=20 +``` + +- `cons`: Выводит полную информацию о подключениях/сессиях для всех клиентов, подключенных к этому серверу. Включает информацию о количестве принятых/отправленных пакетов, идентификаторе сессии, задержках операций, последней выполненной операции и т. д. + +``` + 192.168.1.1:52163(recved=0,sent=0,sid=0xffffffffffffffff,lop=NA,est=1636454787393,to=30000,lzxid=0xffffffffffffffff,lresp=0,llat=0,minlat=0,avglat=0,maxlat=0) + 192.168.1.1:52042(recved=9,sent=18,sid=0x0000000000000001,lop=List,est=1636454739887,to=30000,lcxid=0x0000000000000005,lzxid=0x0000000000000005,lresp=1636454739892,llat=0,minlat=0,avglat=0,maxlat=0) +``` + +- `crst`: Сбрасывает статистику подключений/сессий для всех подключений. + +``` +Connection stats reset. +``` + +- `envi`: Выводит подробную информацию о серверном окружении. + +``` +Environment: +clickhouse.keeper.version=v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 +host.name=ZBMAC-C02D4054M.local +os.name=Darwin +os.arch=x86_64 +os.version=19.6.0 +cpu.count=12 +user.name=root +user.home=/Users/JackyWoo/ +user.dir=/Users/JackyWoo/project/jd/clickhouse/cmake-build-debug/programs/ +user.tmp=/var/folders/b4/smbq5mfj7578f2jzwn602tt40000gn/T/ +``` + + +- `dirs`: Показывает общий размер файлов снэпшотов и журналов в байтах. 
+ +``` +snapshot_dir_size: 0 +log_dir_size: 3875 +``` + +- `isro`: Проверяет, что сервер работает в режиме только для чтения. Сервер ответит `ro`, если он находится в режиме только для чтения, или `rw`, если нет. + +``` +rw +``` + +- `wchs`: Показывает краткую информацию о количестве отслеживаемых путей (watches) на сервере. + +``` +1 connections watching 1 paths +Total watches:1 +``` + +- `wchc`: Показывает подробную информацию об отслеживаемых путях (watches) на сервере в разбивке по сессиям. При этом выводится список сессий (подключений) с соответствующими отслеживаемыми путями. Обратите внимание, что в зависимости от количества отслеживаемых путей эта операция может быть дорогостоящей (т. е. повлиять на производительность сервера), используйте ее осторожно. + +``` +0x0000000000000001 + /clickhouse/task_queue/ddl +``` + +- `wchp`: Показывает подробную информацию об отслеживаемых путях (watches) на сервере в разбивке по пути. При этом выводится список путей (узлов) с соответствующими сессиями. Обратите внимание, что в зависимости от количества отселживаемых путей (watches) эта операция может быть дорогостоящей (т. е. повлиять на производительность сервера), используйте ее осторожно. + +``` +/clickhouse/task_queue/ddl + 0x0000000000000001 +``` + +- `dump`: Выводит список незавершенных сеансов и эфемерных узлов. Команда работает только на лидере. + +``` +Sessions dump (2): +0x0000000000000001 +0x0000000000000002 +Sessions with Ephemerals (1): +0x0000000000000001 + /clickhouse/task_queue/ddl +``` + + +## [экспериментально] Переход с ZooKeeper {#migration-from-zookeeper} Плавный переход с ZooKeeper на ClickHouse Keeper невозможен, необходимо остановить кластер ZooKeeper, преобразовать данные и запустить ClickHouse Keeper. Утилита `clickhouse-keeper-converter` конвертирует журналы и снэпшоты ZooKeeper в снэпшот ClickHouse Keeper. Работа утилиты проверена только для версий ZooKeeper выше 3.4. Для миграции необходимо выполнить следующие шаги: diff --git a/docs/ru/operations/server-configuration-parameters/settings.md b/docs/ru/operations/server-configuration-parameters/settings.md index 4a2da778a06..d2cc133e0c9 100644 --- a/docs/ru/operations/server-configuration-parameters/settings.md +++ b/docs/ru/operations/server-configuration-parameters/settings.md @@ -673,7 +673,7 @@ ClickHouse поддерживает динамическое изменение ## max_concurrent_queries {#max-concurrent-queries} -Определяет максимальное количество одновременно обрабатываемых запросов, связанных с таблицей семейства `MergeTree`. Запросы также могут быть ограничены настройками: [max_concurrent_queries_for_user](#max-concurrent-queries-for-user), [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries). +Определяет максимальное количество одновременно обрабатываемых запросов, связанных с таблицей семейства `MergeTree`. Запросы также могут быть ограничены настройками: [max_concurrent_insert_queries](#max-concurrent-insert-queries), [max_concurrent_select_queries](#max-concurrent-select-queries), [max_concurrent_queries_for_user](#max-concurrent-queries-for-user), [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries). !!! info "Примечание" Параметры этих настроек могут быть изменены во время выполнения запросов и вступят в силу немедленно. Запросы, которые уже запущены, выполнятся без изменений. 
@@ -681,7 +681,9 @@ ClickHouse поддерживает динамическое изменение Возможные значения: - Положительное целое число. -- 0 — выключена. +- 0 — нет лимита. + +Значение по умолчанию: `100`. **Пример** @@ -689,6 +691,46 @@ ClickHouse поддерживает динамическое изменение <max_concurrent_queries>100</max_concurrent_queries> ``` +## max_concurrent_insert_queries {#max-concurrent-insert-queries} + +Определяет максимальное количество одновременных `INSERT` запросов. + +!!! info "Примечание" + Параметры этих настроек могут быть изменены во время выполнения запросов и вступят в силу немедленно. Запросы, которые уже запущены, выполнятся без изменений. + +Возможные значения: + +- Положительное целое число. +- 0 — нет лимита. + +Значение по умолчанию: `0`. + +**Пример** + +``` xml +<max_concurrent_insert_queries>100</max_concurrent_insert_queries> +``` + +## max_concurrent_select_queries {#max-concurrent-select-queries} + +Определяет максимальное количество одновременных `SELECT` запросов. + +!!! info "Примечание" + Параметры этих настроек могут быть изменены во время выполнения запросов и вступят в силу немедленно. Запросы, которые уже запущены, выполнятся без изменений. + +Возможные значения: + +- Положительное целое число. +- 0 — нет лимита. + +Значение по умолчанию: `0`. + +**Пример** + +``` xml +<max_concurrent_select_queries>100</max_concurrent_select_queries> +``` + ## max_concurrent_queries_for_user {#max-concurrent-queries-for-user} Определяет максимальное количество одновременно обрабатываемых запросов, связанных с таблицей семейства `MergeTree`, для пользователя. @@ -696,7 +738,9 @@ ClickHouse поддерживает динамическое изменение Возможные значения: - Положительное целое число. -- 0 — выключена. +- 0 — нет лимита. + +Значение по умолчанию: `0`. **Пример** @@ -712,7 +756,12 @@ ClickHouse поддерживает динамическое изменение Изменение настройки для одного запроса или пользователя не влияет на другие запросы. -Значение по умолчанию: `0` — отсутствие ограничений. +Возможные значения: + +- Положительное целое число. +- 0 — нет лимита. + +Значение по умолчанию: `0`. **Пример** diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index 94bd2078373..affa90d9840 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -761,9 +761,20 @@ ClickHouse может парсить только базовый формат `Y Возможные значения: -- Любое положительное целое число. +- Положительное целое число. -Значение по умолчанию: 163840. +Значение по умолчанию: `163840`. + + +## merge_tree_min_rows_for_concurrent_read_for_remote_filesystem {#merge-tree-min-rows-for-concurrent-read-for-remote-filesystem} + +Минимальное количество строк для чтения из одного файла, прежде чем движок [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) может выполнять параллельное чтение из удаленной файловой системы. + +Возможные значения: + +- Положительное целое число. + +Значение по умолчанию: `163840`. ## merge_tree_min_bytes_for_concurrent_read {#setting-merge-tree-min-bytes-for-concurrent-read} @@ -773,7 +784,17 @@ ClickHouse может парсить только базовый формат `Y - Положительное целое число. -Значение по умолчанию: 251658240. +Значение по умолчанию: `251658240`. + +## merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem {#merge-tree-min-bytes-for-concurrent-read-for-remote-filesystem} + +Минимальное количество байтов для чтения из одного файла, прежде чем движок [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) может выполнять параллельное чтение из удаленной файловой системы. + +Возможные значения: + +- Положительное целое число. + +Значение по умолчанию: `251658240`.
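Ниже приведен иллюстративный пример (его нет в исходном тексте), показывающий, как изменить эти настройки на уровне сеанса и убедиться, что они применились; конкретные значения выбраны условно и совпадают со значениями по умолчанию.

``` sql
-- Условный пример: задаем настройки для текущего сеанса.
SET merge_tree_min_rows_for_concurrent_read_for_remote_filesystem = 163840;
SET merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem = 251658240;

-- Проверяем, что настройки установлены.
SELECT name, value, changed
FROM system.settings
WHERE name LIKE 'merge_tree_min_%for_remote_filesystem';
```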
## merge_tree_min_rows_for_seek {#setting-merge-tree-min-rows-for-seek} @@ -1620,18 +1641,19 @@ SELECT * FROM table_with_enum_column_for_csv_insert; `INSERT` завершается успешно только в том случае, когда ClickHouse смог без ошибки записать данные в `insert_quorum` реплик за время `insert_quorum_timeout`. Если по любой причине количество реплик с успешной записью не достигнет `insert_quorum`, то запись считается не состоявшейся и ClickHouse удалит вставленный блок из всех реплик, куда уже успел записать данные. -Все реплики в кворуме консистентны, т.е. содержат данные всех более ранних запросов `INSERT`. Последовательность `INSERT` линеаризуется. +Когда `insert_quorum_parallel` выключена, все реплики кворума консистентны, то есть содержат данные всех предыдущих запросов `INSERT` (последовательность `INSERT` линеаризуется). При чтении с диска данных, записанных с помощью `insert_quorum` и при выключенной `insert_quorum_parallel`, можно включить последовательную консистентность для запросов `SELECT` с помощью [select_sequential_consistency](#settings-select_sequential_consistency). -При чтении данных, записанных с `insert_quorum` можно использовать настройку [select_sequential_consistency](#settings-select_sequential_consistency). - -ClickHouse генерирует исключение +ClickHouse генерирует исключение: - Если количество доступных реплик на момент запроса меньше `insert_quorum`. - При попытке записать данные в момент, когда предыдущий блок ещё не вставлен в `insert_quorum` реплик. Эта ситуация может возникнуть, если пользователь вызвал `INSERT` прежде, чем завершился предыдущий с `insert_quorum`. +- При выключенной `insert_quorum_parallel` и при попытке записать данные в момент, когда предыдущий блок еще не вставлен в `insert_quorum` реплик (несколько параллельных `INSERT`-запросов). Эта ситуация может возникнуть при попытке пользователя выполнить очередной запрос `INSERT` к той же таблице, прежде чем завершится предыдущий с `insert_quorum`. + См. также: - [insert_quorum_timeout](#settings-insert_quorum_timeout) +- [insert_quorum_parallel](#settings-insert_quorum_parallel) - [select_sequential_consistency](#settings-select_sequential_consistency) ## insert_quorum_timeout {#settings-insert_quorum_timeout} @@ -1643,11 +1665,29 @@ ClickHouse генерирует исключение См. также: - [insert_quorum](#settings-insert_quorum) +- [insert_quorum_parallel](#settings-insert_quorum_parallel) +- [select_sequential_consistency](#settings-select_sequential_consistency) + +## insert_quorum_parallel {#settings-insert_quorum_parallel} + +Включает и выключает параллелизм для кворумных вставок (`INSERT`-запросы). Когда опция включена, возможно выполнять несколько кворумных `INSERT`-запросов одновременно, при этом запросы не дожидаются окончания друг друга . Когда опция выключена, одновременные записи с кворумом в одну и ту же таблицу будут отклонены (будет выполнена только одна из них). + +Возможные значения: + +- 0 — Выключена. +- 1 — Включена. + +Значение по умолчанию: 1. + +См. также: + +- [insert_quorum](#settings-insert_quorum) +- [insert_quorum_timeout](#settings-insert_quorum_timeout) - [select_sequential_consistency](#settings-select_sequential_consistency) ## select_sequential_consistency {#settings-select_sequential_consistency} -Включает или выключает последовательную консистентность для запросов `SELECT`. +Включает или выключает последовательную консистентность для запросов `SELECT`. Необходимо, чтобы `insert_quorum_parallel` была выключена (по умолчанию включена), а опция `insert_quorum` включена. 
Возможные значения: @@ -1660,10 +1700,13 @@ ClickHouse генерирует исключение Когда последовательная консистентность включена, то ClickHouse позволит клиенту выполнить запрос `SELECT` только к тем репликам, которые содержат данные всех предыдущих запросов `INSERT`, выполненных с `insert_quorum`. Если клиент обратится к неполной реплике, то ClickHouse сгенерирует исключение. В запросе SELECT не будут участвовать данные, которые ещё не были записаны на кворум реплик. +Если `insert_quorum_parallel` включена (по умолчанию это так), тогда `select_sequential_consistency` не будет работать. Причина в том, что параллельные запросы `INSERT` можно записать в разные наборы реплик кворума, поэтому нет гарантии того, что в отдельно взятую реплику будут сделаны все записи. + См. также: - [insert_quorum](#settings-insert_quorum) - [insert_quorum_timeout](#settings-insert_quorum_timeout) +- [insert_quorum_parallel](#settings-insert_quorum_parallel) ## insert_deduplicate {#settings-insert-deduplicate} diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index fc5533e75b1..b41defdc92d 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -57,7 +57,7 @@ toTimezone(value, timezone) **Аргументы** - `value` — время или дата с временем. [DateTime64](../../sql-reference/data-types/datetime64.md). -- `timezone` — часовой пояс для возвращаемого значения. [String](../../sql-reference/data-types/string.md). +- `timezone` — часовой пояс для возвращаемого значения. [String](../../sql-reference/data-types/string.md). Этот аргумент является константой, потому что `toTimezone` изменяет часовой пояс столбца (часовой пояс является атрибутом типов `DateTime*`). **Возвращаемое значение** diff --git a/docs/ru/sql-reference/functions/nlp-functions.md b/docs/ru/sql-reference/functions/nlp-functions.md index 250403ab127..992a7d6ccf3 100644 --- a/docs/ru/sql-reference/functions/nlp-functions.md +++ b/docs/ru/sql-reference/functions/nlp-functions.md @@ -3,10 +3,10 @@ toc_priority: 67 toc_title: NLP --- -# [экспериментально] Функции для работы с ествественным языком {#nlp-functions} +# [экспериментально] Функции для работы с естественным языком {#nlp-functions} !!! warning "Предупреждение" - Сейчас использование функций для работы с ествественным языком является экспериментальной возможностью. Чтобы использовать данные функции, включите настройку `allow_experimental_nlp_functions = 1`. + Сейчас использование функций для работы с естественным языком является экспериментальной возможностью. Чтобы использовать данные функции, включите настройку `allow_experimental_nlp_functions = 1`. ## stem {#stem} @@ -84,7 +84,7 @@ SELECT lemmatize('en', 'wolves'); Находит синонимы к заданному слову. Представлены два типа расширений словарей: `plain` и `wordnet`. -Для работы расширения типа `plain` необходимо указать путь до простого текстового файла, где каждая строка соотвествует одному набору синонимов. Слова в данной строке должны быть разделены с помощью пробела или знака табуляции. +Для работы расширения типа `plain` необходимо указать путь до простого текстового файла, где каждая строка соответствует одному набору синонимов. Слова в данной строке должны быть разделены с помощью пробела или знака табуляции. Для работы расширения типа `plain` необходимо указать путь до WordNet тезауруса. Тезаурус должен содержать WordNet sense index. 
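Ниже — минимальный иллюстративный набросок (его нет в исходном тексте): предполагается, что расширение словаря с именем `en` уже описано в конфигурации сервера, а NLP-функции включены настройкой `allow_experimental_nlp_functions`.

``` sql
-- Включаем экспериментальные NLP-функции для текущего сеанса.
SET allow_experimental_nlp_functions = 1;

-- Имя расширения 'en' условное: оно должно соответствовать
-- расширению, объявленному в конфигурации сервера.
SELECT synonyms('en', 'important');
```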
diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 1b4ea4ef609..50a458bb453 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -343,9 +343,9 @@ SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val); Результат: ``` text -┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ -│ -1.11100 │ Nullable(Decimal(9, 5)) │ -└──────────┴────────────────────────────────────────────────────┘ +┌────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ +│ -1.111 │ Nullable(Decimal(9, 5)) │ +└────────┴────────────────────────────────────────────────────┘ ``` Запрос: @@ -449,9 +449,9 @@ SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val); Результат: ``` text -┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ -│ -1.11100 │ Decimal(9, 5) │ -└──────────┴────────────────────────────────────────────────────┘ +┌────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ +│ -1.111 │ Decimal(9, 5) │ +└────────┴────────────────────────────────────────────────────┘ ``` Запрос: diff --git a/docs/ru/sql-reference/statements/grant.md b/docs/ru/sql-reference/statements/grant.md index c970d4d24f3..06432193f9f 100644 --- a/docs/ru/sql-reference/statements/grant.md +++ b/docs/ru/sql-reference/statements/grant.md @@ -21,7 +21,7 @@ GRANT [ON CLUSTER cluster_name] privilege[(column_name [,...])] [,...] ON {db.ta - `user` — Пользователь ClickHouse. `WITH GRANT OPTION` разрешает пользователю или роли выполнять запрос `GRANT`. Пользователь может выдавать только те привилегии, которые есть у него, той же или меньшей области действий. -`WITH REPLACE OPTION` заменяет все старые привилегии новыми привилегиями для `user` или `role`, Если не указано, добавьте новые привилегии для старых. +`WITH REPLACE OPTION` заменяет все старые привилегии новыми привилегиями для `user` или `role`, если не указано, добавляет новые привилегии. ## Синтаксис назначения ролей {#assign-role-syntax} @@ -34,7 +34,7 @@ GRANT [ON CLUSTER cluster_name] role [,...] TO {user | another_role | CURRENT_US - `user` — Пользователь ClickHouse. `WITH ADMIN OPTION` присваивает привилегию [ADMIN OPTION](#admin-option-privilege) пользователю или роли. -`WITH REPLACE OPTION` заменяет все старые роли новыми ролями для пользователя `user` или `role`, Если не указано, добавьте новые роли в старые. +`WITH REPLACE OPTION` заменяет все старые роли новыми ролями для пользователя `user` или `role`, если не указано, добавляет новые новые роли. ## Использование {#grant-usage} diff --git a/docs/ru/sql-reference/statements/select/join.md b/docs/ru/sql-reference/statements/select/join.md index 9f6d38a024f..bb9b7445083 100644 --- a/docs/ru/sql-reference/statements/select/join.md +++ b/docs/ru/sql-reference/statements/select/join.md @@ -55,13 +55,13 @@ FROM - [join_on_disk_max_files_to_merge](../../../operations/settings/settings.md#join_on_disk_max_files_to_merge) - [any_join_distinct_right_table_keys](../../../operations/settings/settings.md#any_join_distinct_right_table_keys) -## Условия в секции ON {on-section-conditions} +## Условия в секции ON {#on-section-conditions} + +Секция `ON` может содержать несколько условий, связанных операторами `AND` и `OR`. Условия, задающие ключи соединения, должны содержать столбцы левой и правой таблицы и должны использовать оператор равенства. 
Прочие условия могут использовать другие логические операторы, но в отдельном условии могут использоваться столбцы либо только левой, либо только правой таблицы. -Секция `ON` может содержать несколько условий, связанных оператором `AND`. Условия, задающие ключи соединения, должны содержать столбцы левой и правой таблицы и должны использовать оператор равенства. Прочие условия могут использовать другие логические операторы, но в отдельном условии могут использоваться столбцы либо только левой, либо только правой таблицы. Строки объединяются только тогда, когда всё составное условие выполнено. Если оно не выполнено, то строки могут попасть в результат в зависимости от типа `JOIN`. Обратите внимание, что если то же самое условие поместить в секцию `WHERE`, то строки, для которых оно не выполняется, никогда не попадут в результат. -!!! note "Примечание" - Оператор `OR` внутри секции `ON` пока не поддерживается. +Оператор `OR` внутри секции `ON` работает, используя алгоритм хеш-соединения — на каждый аргумент `OR` с ключами соединений для `JOIN` создается отдельная хеш-таблица, поэтому потребление памяти и время выполнения запроса растут линейно при увеличении количества выражений `OR` секции `ON`. !!! note "Примечание" Если в условии использованы столбцы из разных таблиц, то пока поддерживается только оператор равенства (`=`). @@ -110,6 +110,47 @@ SELECT name, text, scores FROM table_1 INNER JOIN table_2 └──────┴────────┴────────┘ ``` +Запрос с типом соединения `INNER` и условием с оператором `OR`: + +``` sql +CREATE TABLE t1 (`a` Int64, `b` Int64) ENGINE = MergeTree() ORDER BY a; + +CREATE TABLE t2 (`key` Int32, `val` Int64) ENGINE = MergeTree() ORDER BY key; + +INSERT INTO t1 SELECT number as a, -a as b from numbers(5); + +INSERT INTO t2 SELECT if(number % 2 == 0, toInt64(number), -number) as key, number as val from numbers(5); + +SELECT a, b, val FROM t1 INNER JOIN t2 ON t1.a = t2.key OR t1.b = t2.key; +``` + +Результат: + +``` +┌─a─┬──b─┬─val─┐ +│ 0 │ 0 │ 0 │ +│ 1 │ -1 │ 1 │ +│ 2 │ -2 │ 2 │ +│ 3 │ -3 │ 3 │ +│ 4 │ -4 │ 4 │ +└───┴────┴─────┘ +``` + +Запрос с типом соединения `INNER` и условиями с операторами `OR` и `AND`: + +``` sql +SELECT a, b, val FROM t1 INNER JOIN t2 ON t1.a = t2.key OR t1.b = t2.key AND t2.val > 3; +``` + +Результат: + +``` +┌─a─┬──b─┬─val─┐ +│ 0 │ 0 │ 0 │ +│ 2 │ -2 │ 2 │ +│ 4 │ -4 │ 4 │ +└───┴────┴─────┘ +``` ## Использование ASOF JOIN {#asof-join-usage} `ASOF JOIN` применим в том случае, когда необходимо объединять записи, которые не имеют точного совпадения.
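Ниже приведен небольшой иллюстративный пример `ASOF JOIN` (его нет в исходном тексте); имена таблиц, столбцов и данные условные.

``` sql
CREATE TABLE events (symbol String, event_time DateTime) ENGINE = MergeTree() ORDER BY (symbol, event_time);

CREATE TABLE quotes (symbol String, quote_time DateTime, price Float64) ENGINE = MergeTree() ORDER BY (symbol, quote_time);

INSERT INTO events VALUES ('ABC', '2021-12-01 00:00:05'), ('ABC', '2021-12-01 00:00:15');

INSERT INTO quotes VALUES ('ABC', '2021-12-01 00:00:00', 100), ('ABC', '2021-12-01 00:00:10', 101);

-- Для каждого события берется последняя котировка, известная на момент события.
SELECT e.symbol, e.event_time, q.quote_time, q.price
FROM events AS e
ASOF JOIN quotes AS q ON e.symbol = q.symbol AND e.event_time >= q.quote_time;
```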
diff --git a/docs/tools/build.py b/docs/tools/build.py index 785928cf4ab..75278075996 100755 --- a/docs/tools/build.py +++ b/docs/tools/build.py @@ -95,7 +95,7 @@ def build_for_lang(lang, args): site_dir=site_dir, strict=True, theme=theme_cfg, - copyright='©2016–2021 ClickHouse, Inc.', + copyright='©2016–2022 ClickHouse, Inc.', use_directory_urls=True, repo_name='ClickHouse/ClickHouse', repo_url='https://github.com/ClickHouse/ClickHouse/', diff --git a/docs/zh/faq/general/how-do-i-contribute-code-to-clickhouse.md b/docs/zh/faq/general/how-do-i-contribute-code-to-clickhouse.md new file mode 120000 index 00000000000..5ac9a615386 --- /dev/null +++ b/docs/zh/faq/general/how-do-i-contribute-code-to-clickhouse.md @@ -0,0 +1 @@ +../../../en/faq/general/how-do-i-contribute-code-to-clickhouse.md \ No newline at end of file diff --git a/docs/zh/faq/operations/multi-region-replication.md b/docs/zh/faq/operations/multi-region-replication.md new file mode 120000 index 00000000000..dbc985ee1fb --- /dev/null +++ b/docs/zh/faq/operations/multi-region-replication.md @@ -0,0 +1 @@ +../../../en/faq/operations/multi-region-replication.md \ No newline at end of file diff --git a/docs/zh/interfaces/http.md b/docs/zh/interfaces/http.md index cdce4f2f2e7..738b0365f46 100644 --- a/docs/zh/interfaces/http.md +++ b/docs/zh/interfaces/http.md @@ -407,7 +407,7 @@ $ curl -v 'http://localhost:8123/predefined_query' `query` 是一个预定义的`predefined_query_handler`查询,它由ClickHouse在匹配HTTP请求并返回查询结果时执行。这是一个必须的配置。 -以下是定义的[max_threads](../operations/settings/settings.md#settings-max_threads)和`max_alter_threads`设置, 然后查询系统表以检查这些设置是否设置成功。 +以下是定义的[max_threads](../operations/settings/settings.md#settings-max_threads)和`max_final_threads`设置, 然后查询系统表以检查这些设置是否设置成功。 示例: @@ -430,9 +430,9 @@ $ curl -v 'http://localhost:8123/predefined_query' ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_alter_threads?max_threads=1&max_alter_threads=2' +$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_final_threads?max_threads=1&max_final_threads=2' 1 -max_alter_threads 2 +max_final_threads 2 ``` !!! 
note "警告" @@ -444,7 +444,7 @@ max_alter_threads 2 ClickHouse提取并执行与HTTP请求URL中的`query_param_name`值对应的值。`query_param_name`的默认值是`/query`。这是一个可选的配置。如果配置文件中没有定义,则不会传入参数。 -为了试验这个功能,示例定义了[max_threads](../operations/settings/settings.md#settings-max_threads)和`max_alter_threads`,`queries`设置是否成功的值。 +为了试验这个功能,示例定义了[max_threads](../operations/settings/settings.md#settings-max_threads)和`max_final_threads`,`queries`设置是否成功的值。 示例: @@ -462,9 +462,9 @@ ClickHouse提取并执行与HTTP请求URL中的`query_param_name`值对应的值 ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_alter_threads=2¶m_name_1=max_threads¶m_name_2=max_alter_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' +$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_final_threads=2¶m_name_1=max_threads¶m_name_2=max_final_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' max_threads 1 -max_alter_threads 2 +max_final_threads 2 ``` ### static {#static} diff --git a/docs/zh/operations/system-tables/columns.md b/docs/zh/operations/system-tables/columns.md index 9a90561a07b..6d4299a9056 100644 --- a/docs/zh/operations/system-tables/columns.md +++ b/docs/zh/operations/system-tables/columns.md @@ -1,29 +1,89 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- +# system.columns {#system-columns} -# 系统。列 {#system-columns} +此系统表包含所有表中列的信息。 -包含有关所有表中列的信息。 +你可以使用这个表来获得类似于 [DESCRIBE TABLE](../../sql-reference/statements/misc.md#misc-describe-table) 查询的信息,但是可以同时获得多个表的信息。 -您可以使用此表获取类似于以下内容的信息 [DESCRIBE TABLE](../../sql-reference/statements/misc.md#misc-describe-table) 查询,但对于多个表一次。 +[临时表](../../sql-reference/statements/create/table.md#temporary-tables)中的列只在创建它们的会话中的 `system.columns` 中才可见,并且它们的 `database` 字段显示为空。 -该 `system.columns` 表包含以下列(列类型显示在括号中): +`system.columns` 表包含以下列 (括号中显示的是列类型): -- `database` (String) — Database name. -- `table` (String) — Table name. -- `name` (String) — Column name. -- `type` (String) — Column type. -- `default_kind` (String) — Expression type (`DEFAULT`, `MATERIALIZED`, `ALIAS`)为默认值,如果没有定义,则为空字符串。 -- `default_expression` (String) — Expression for the default value, or an empty string if it is not defined. -- `data_compressed_bytes` (UInt64) — The size of compressed data, in bytes. -- `data_uncompressed_bytes` (UInt64) — The size of decompressed data, in bytes. -- `marks_bytes` (UInt64) — The size of marks, in bytes. -- `comment` (String) — Comment on the column, or an empty string if it is not defined. -- `is_in_partition_key` (UInt8) — Flag that indicates whether the column is in the partition expression. -- `is_in_sorting_key` (UInt8) — Flag that indicates whether the column is in the sorting key expression. -- `is_in_primary_key` (UInt8) — Flag that indicates whether the column is in the primary key expression. -- `is_in_sampling_key` (UInt8) — Flag that indicates whether the column is in the sampling key expression. 
+- `database` ([String](../../sql-reference/data-types/string.md)) — 数据库名称。 +- `table` ([String](../../sql-reference/data-types/string.md)) — 表名。 +- `name` ([String](../../sql-reference/data-types/string.md)) — 列名。 +- `type` ([String](../../sql-reference/data-types/string.md)) — 列类型。 +- `position` ([UInt64](../../sql-reference/data-types/int-uint.md)) — 列在表中的顺序位置,从1开始。 +- `default_kind` ([String](../../sql-reference/data-types/string.md)) — 默认值的表达式类型(`DEFAULT`, `MATERIALIZED`, `ALIAS`) ,如果没有定义,则为空字符串。 +- `default_expression` ([String](../../sql-reference/data-types/string.md)) — 默认值的表达式,如果未定义则为空字符串。 +- `data_compressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — 压缩数据的大小,以字节为单位。 +- `data_uncompressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — 解压后的数据的大小,以字节为单位。 +- `marks_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — 标记的大小,以字节为单位。 +- `comment` ([String](../../sql-reference/data-types/string.md)) — 列注释,如果没有定义,则为空字符串。 +- `is_in_partition_key` ([UInt8](../../sql-reference/data-types/int-uint.md)) — 列是否在分区表达式中的标志。 +- `is_in_sorting_key` ([UInt8](../../sql-reference/data-types/int-uint.md)) — 列是否在排序键表达式中的标志。 +- `is_in_primary_key` ([UInt8](../../sql-reference/data-types/int-uint.md)) — 列是否在主键表达式中的标志。 +- `is_in_sampling_key` ([UInt8](../../sql-reference/data-types/int-uint.md)) — 列是否在采样键表达式中的标志。 +- `compression_codec` ([String](../../sql-reference/data-types/string.md)) — 压缩编码的名称。 +- `character_octet_length` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — 二进制数据、字符数据或文本数据和图像的最大长度(以字节为单位)。在 ClickHouse 中只对 `FixedString` 数据类型有意义。否则,将返回 `NULL` 值。 +- `numeric_precision` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — 近似数字型数据、精确数字型数据、整数型数据或货币数据的精度。在 ClickHouse 中,对于整数类型是比特率(bitness),对于 `Decimal` 类型是十进制精度。否则,将返回 `NULL` 值。 +- `numeric_precision_radix` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — 数字系统的基数是近似数字型数据、精确数字型数据、整数型数据或货币数据的精度。在 ClickHouse 中,对于整数类型是2,对于 `Decimal` 类型是10。否则,将返回 `NULL` 值。 +- `numeric_scale` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — 近似数字型数据、精确数字型数据、整数型数据或货币数据的比例。在 ClickHouse 中只对 `Decimal` 类型有意义。否则,将返回 `NULL` 值。 +- `datetime_precision` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — `DateTime64` 数据类型的小数精度。对于其他数据类型,将返回 `NULL` 值。 + +**示例** + +```sql +SELECT * FROM system.columns LIMIT 2 FORMAT Vertical; +``` + +```text +Row 1: +────── +database: INFORMATION_SCHEMA +table: COLUMNS +name: table_catalog +type: String +position: 1 +default_kind: +default_expression: +data_compressed_bytes: 0 +data_uncompressed_bytes: 0 +marks_bytes: 0 +comment: +is_in_partition_key: 0 +is_in_sorting_key: 0 +is_in_primary_key: 0 +is_in_sampling_key: 0 +compression_codec: +character_octet_length: ᴺᵁᴸᴸ +numeric_precision: ᴺᵁᴸᴸ +numeric_precision_radix: ᴺᵁᴸᴸ +numeric_scale: ᴺᵁᴸᴸ +datetime_precision: ᴺᵁᴸᴸ + +Row 2: +────── +database: INFORMATION_SCHEMA +table: COLUMNS +name: table_schema +type: String +position: 2 +default_kind: +default_expression: +data_compressed_bytes: 0 +data_uncompressed_bytes: 0 +marks_bytes: 0 +comment: +is_in_partition_key: 0 +is_in_sorting_key: 0 +is_in_primary_key: 0 +is_in_sampling_key: 0 +compression_codec: +character_octet_length: ᴺᵁᴸᴸ +numeric_precision: ᴺᵁᴸᴸ +numeric_precision_radix: ᴺᵁᴸᴸ 
+numeric_scale: ᴺᵁᴸᴸ +datetime_precision: ᴺᵁᴸᴸ +``` [原文](https://clickhouse.com/docs/zh/operations/system-tables/columns) diff --git a/docs/zh/operations/system-tables/contributors.md b/docs/zh/operations/system-tables/contributors.md index e9374a7dc9c..fd876da3594 100644 --- a/docs/zh/operations/system-tables/contributors.md +++ b/docs/zh/operations/system-tables/contributors.md @@ -1,15 +1,10 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- +# system.contributors {#system-contributors} -# 系统。贡献者 {#system-contributors} - -包含有关贡献者的信息。 该顺序在查询执行时是随机的。 +此系统表包含有关贡献者的信息。排列顺序是在查询执行时随机生成的。 列: -- `name` (String) — Contributor (author) name from git log. +- `name` (String) — git 日志中的贡献者 (作者) 名字。 **示例** @@ -32,7 +27,7 @@ SELECT * FROM system.contributors LIMIT 10 └──────────────────┘ ``` -要在表中找出自己,请使用查询: +要在表中找到你自己,请这样查询: ``` sql SELECT * FROM system.contributors WHERE name = 'Olga Khvostikova' @@ -43,3 +38,5 @@ SELECT * FROM system.contributors WHERE name = 'Olga Khvostikova' │ Olga Khvostikova │ └──────────────────┘ ``` + +[原文](https://clickhouse.com/docs/zh/operations/system-tables/contributors) diff --git a/docs/zh/operations/system-tables/detached_parts.md b/docs/zh/operations/system-tables/detached_parts.md index ba35444c551..efcbb61d37e 100644 --- a/docs/zh/operations/system-tables/detached_parts.md +++ b/docs/zh/operations/system-tables/detached_parts.md @@ -1,14 +1,11 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- +# system.detached_parts {#system_tables-detached_parts} -# 系统。detached_parts {#system_tables-detached_parts} +包含关于 [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) 表的分离分区的信息。`reason` 列详细说明了该分区被分离的原因。 -包含有关分离部分的信息 [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) 桌子 该 `reason` 列指定分离部件的原因。 +对于用户分离的分区,原因是空的。你可以通过 [ALTER TABLE ATTACH PARTITION\|PART](../../sql-reference/statements/alter/partition.md#alter_attach-partition) 命令添加这些分区。 -对于用户分离的部件,原因是空的。 这些部件可以附加 [ALTER TABLE ATTACH PARTITION\|PART](../../sql-reference/statements/alter.md#alter_attach-partition) 指挥部 +关于其他列的描述,请参见 [system.parts](../../operations/system-tables/parts.md#system_tables-parts)。 -有关其他列的说明,请参阅 [系统。零件](../../operations/system-tables/parts.md#system_tables-parts). +如果分区名称无效,一些列的值可能是`NULL`。你可以通过[ALTER TABLE DROP DETACHED PART](../../sql-reference/statements/alter/partition.md#alter_drop-detached)来删除这些分区。 -如果部件名称无效,某些列的值可能为 `NULL`. 这些部分可以删除 [ALTER TABLE DROP DETACHED PART](../../sql-reference/statements/alter.md#alter_drop-detached). +[原文](https://clickhouse.com/docs/zh/operations/system-tables/detached_parts) diff --git a/docs/zh/operations/system-tables/metrics.md b/docs/zh/operations/system-tables/metrics.md index 34b7fa35681..5b5b4615f82 100644 --- a/docs/zh/operations/system-tables/metrics.md +++ b/docs/zh/operations/system-tables/metrics.md @@ -1,19 +1,14 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- +# system.metrics {#system_tables-metrics} -# 系统。指标 {#system_tables-metrics} - -包含可以立即计算或具有当前值的指标。 例如,同时处理的查询的数量或当前副本的延迟。 此表始终是最新的。 +此系统表包含可以即时计算或具有当前值的指标。例如,同时处理的查询数量或当前的复制延迟。这个表始终是最新的。 列: -- `metric` ([字符串](../../sql-reference/data-types/string.md)) — Metric name. -- `value` ([Int64](../../sql-reference/data-types/int-uint.md)) — Metric value. -- `description` ([字符串](../../sql-reference/data-types/string.md)) — Metric description. +- `metric` ([字符串](../../sql-reference/data-types/string.md)) — 指标名称. 
+- `value` ([Int64](../../sql-reference/data-types/int-uint.md)) — 指标的值. +- `description` ([字符串](../../sql-reference/data-types/string.md)) — 指标的描述. -支持的指标列表,您可以在 [src/Common/CurrentMetrics.cpp](https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/CurrentMetrics.cpp) ClickHouse的源文件。 +对于支持的指标列表,您可以查看 [src/Common/CurrentMetrics.cpp](https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/CurrentMetrics.cpp) ClickHouse 的源文件。 **示例** @@ -38,7 +33,7 @@ SELECT * FROM system.metrics LIMIT 10 **另请参阅** -- [系统。asynchronous_metrics](../../operations/system-tables/asynchronous_metrics.md#system_tables-asynchronous_metrics) — Contains periodically calculated metrics. -- [系统。活动](../../operations/system-tables/events.md#system_tables-events) — Contains a number of events that occurred. -- [系统。metric_log](../../operations/system-tables/metric_log.md#system_tables-metric_log) — Contains a history of metrics values from tables `system.metrics` и `system.events`. -- [监测](../../operations/monitoring.md) — Base concepts of ClickHouse monitoring. +- [system.asynchronous_metrics](../../operations/system-tables/asynchronous_metrics.md#system_tables-asynchronous_metrics) — 包含周期性的计算指标。 +- [system.events](../../operations/system-tables/events.md#system_tables-events) — 包含发生的一些事件。 +- [system.metric_log](../../operations/system-tables/metric_log.md#system_tables-metric_log) — 包含`system.metrics`表和`system.events`表的历史指标值。 +- [监控](../../operations/monitoring.md) — ClickHouse 监控的基本概念。 diff --git a/docs/zh/operations/system-tables/numbers.md b/docs/zh/operations/system-tables/numbers.md index c42c87053ca..fd67baa01a5 100644 --- a/docs/zh/operations/system-tables/numbers.md +++ b/docs/zh/operations/system-tables/numbers.md @@ -1,12 +1,32 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- +# system.numbers {#system-numbers} -# 系统。数字 {#system-numbers} +这个表有一个名为 `number` 的 UInt64 列,包含了几乎所有从 0 开始的自然数。 -此表包含一个名为UInt64的列 `number` 它包含几乎所有从零开始的自然数。 +你可以用这个表进行测试,或者如果你需要进行暴力搜索。 -您可以使用此表进行测试,或者如果您需要进行暴力搜索。 +从该表的读取是不并行的。 -从此表中读取的内容不是并行的。 +**示例** + +```sql +:) SELECT * FROM system.numbers LIMIT 10; +``` + +```text +┌─number─┐ +│ 0 │ +│ 1 │ +│ 2 │ +│ 3 │ +│ 4 │ +│ 5 │ +│ 6 │ +│ 7 │ +│ 8 │ +│ 9 │ +└────────┘ + +10 rows in set. Elapsed: 0.001 sec. +``` + +[原文](https://clickhouse.com/docs/zh/operations/system-tables/numbers) diff --git a/docs/zh/operations/system-tables/parts.md b/docs/zh/operations/system-tables/parts.md index e924ee27df3..dc98288305f 100644 --- a/docs/zh/operations/system-tables/parts.md +++ b/docs/zh/operations/system-tables/parts.md @@ -1,85 +1,167 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- +# system.parts {#system_tables-parts} -# 系统。零件 {#system_tables-parts} +此系统表包含 [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) 表分区的相关信息。 -包含有关的部分信息 [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) 桌子 - -每行描述一个数据部分。 +每一行描述一个数据分区。 列: -- `partition` (String) – The partition name. To learn what a partition is, see the description of the [ALTER](../../sql-reference/statements/alter.md#query_language_queries_alter) 查询。 +- `partition` ([String](../../sql-reference/data-types/string.md)) – 分区名称。请参阅 [ALTER](../../sql-reference/statements/alter/index.md#query_language_queries_alter) 查询的说明,来了解什么是分区。 格式: - `YYYYMM` 用于按月自动分区。 - - `any_string` 手动分区时。 + - `any_string` 手动分区时,是其他格式的字符串。 -- `name` (`String`) – Name of the data part. 
+- `name` ([String](../../sql-reference/data-types/string.md)) – 数据分区的名称。 -- `active` (`UInt8`) – Flag that indicates whether the data part is active. If a data part is active, it's used in a table. Otherwise, it's deleted. Inactive data parts remain after merging. +- `part_type` ([String](../../sql-reference/data-types/string.md)) — 数据分区的存储格式。 -- `marks` (`UInt64`) – The number of marks. To get the approximate number of rows in a data part, multiply `marks` 通过索引粒度(通常为8192)(此提示不适用于自适应粒度)。 + 可能的值: -- `rows` (`UInt64`) – The number of rows. + - `Wide` — 每一列在文件系统中的一个单独文件中存储。 + - `Compact` — 所有列在文件系统中的一个文件中存储。 -- `bytes_on_disk` (`UInt64`) – Total size of all the data part files in bytes. + 数据存储格式由 [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) 表的 `min_bytes_for_wide_part` 和 `min_rows_for_wide_part` 控制。 -- `data_compressed_bytes` (`UInt64`) – Total size of compressed data in the data part. All the auxiliary files (for example, files with marks) are not included. + - `active` ([UInt8](../../sql-reference/data-types/int-uint.md)) – 指示数据分区是否处于活动状态的标志。如果数据分区处于活动状态,则此数据正在被表使用。反之,则不活跃(deleted)。合并后仍会保留非活跃的数据分区。 -- `data_uncompressed_bytes` (`UInt64`) – Total size of uncompressed data in the data part. All the auxiliary files (for example, files with marks) are not included. +- `marks` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 标记数。要获得数据分区中的大致行数:使用`marks`(标记数)乘以索引粒度(通常为 8192)。不适用于自适应颗粒度。 -- `marks_bytes` (`UInt64`) – The size of the file with marks. +- `rows` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 行数. -- `modification_time` (`DateTime`) – The time the directory with the data part was modified. This usually corresponds to the time of data part creation.\| +- `bytes_on_disk` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 数据总大小(以字节为单位)。 -- `remove_time` (`DateTime`) – The time when the data part became inactive. +- `data_compressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 数据分区中压缩数据的总大小。不包括所有辅助文件(例如,带有标记的文件)。 -- `refcount` (`UInt32`) – The number of places where the data part is used. A value greater than 2 indicates that the data part is used in queries or merges. +- `data_uncompressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 数据分区中未压缩数据的总大小。不包括所有辅助文件(例如,带有标记的文件)。 -- `min_date` (`Date`) – The minimum value of the date key in the data part. +- `marks_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 带有标记的文件的大小。 -- `max_date` (`Date`) – The maximum value of the date key in the data part. +- `secondary_indices_compressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 数据分区中二级索引的压缩数据总大小。所有的辅助文件(例如,带有标记的文件)都不包括在内。 -- `min_time` (`DateTime`) – The minimum value of the date and time key in the data part. +- `secondary_indices_uncompressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 数据分区中二级索引的未压缩数据的总大小。所有的辅助文件(例如,带有标记的文件)都不包括在内。 -- `max_time`(`DateTime`) – The maximum value of the date and time key in the data part. +- `secondary_indices_marks_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 带标记的二级索引的文件大小。 -- `partition_id` (`String`) – ID of the partition. +- `modification_time` ([DateTime](../../sql-reference/data-types/datetime.md)) – 包含数据分区的目录被修改的时间。这通常对应于数据部分创建的时间。 -- `min_block_number` (`UInt64`) – The minimum number of data parts that make up the current part after merging. 
+- `remove_time` ([DateTime](../../sql-reference/data-types/datetime.md)) – 数据分区变为非活动状态的时间。 -- `max_block_number` (`UInt64`) – The maximum number of data parts that make up the current part after merging. +- `refcount` ([UInt32](../../sql-reference/data-types/int-uint.md)) – 使用数据部分的位置数。大于 2 的值表示数据部分用于查询或是用于合并。 -- `level` (`UInt32`) – Depth of the merge tree. Zero means that the current part was created by insert rather than by merging other parts. +- `min_date` ([Date](../../sql-reference/data-types/date.md)) – 数据部分中日期键的最小值。 -- `data_version` (`UInt64`) – Number that is used to determine which mutations should be applied to the data part (mutations with a version higher than `data_version`). +- `max_date` ([Date](../../sql-reference/data-types/date.md)) – 数据部分中日期键的最大值。 -- `primary_key_bytes_in_memory` (`UInt64`) – The amount of memory (in bytes) used by primary key values. +- `min_time` ([DateTime](../../sql-reference/data-types/datetime.md)) – 数据部分中日期和时间键的最小值。 -- `primary_key_bytes_in_memory_allocated` (`UInt64`) – The amount of memory (in bytes) reserved for primary key values. +- `max_time`([DateTime](../../sql-reference/data-types/datetime.md)) – 数据部分中日期和时间键的最大值。 -- `is_frozen` (`UInt8`) – Flag that shows that a partition data backup exists. 1, the backup exists. 0, the backup doesn't exist. For more details, see [FREEZE PARTITION](../../sql-reference/statements/alter.md#alter_freeze-partition) +- `partition_id` ([String](../../sql-reference/data-types/string.md)) – 分区的 ID。 -- `database` (`String`) – Name of the database. +- `min_block_number` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 合并后构成当前部分的最小数据部分数量。 -- `table` (`String`) – Name of the table. +- `max_block_number` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 合并后构成当前部分的最大数据部分数量。 -- `engine` (`String`) – Name of the table engine without parameters. +- `level` ([UInt32](../../sql-reference/data-types/int-uint.md)) – 合并树的深度。值为 0 表示该分区是通过插入创建的,而不是通过合并创建的。 -- `path` (`String`) – Absolute path to the folder with data part files. +- `data_version` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 用于确定应将哪些订正(mutations)应用于数据部分(版本高于 `data_version` 的订正(mutations))的数字。 -- `disk` (`String`) – Name of a disk that stores the data part. +- `primary_key_bytes_in_memory` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 主键值使用的内存量(以字节为单位)。 -- `hash_of_all_files` (`String`) – [sipHash128](../../sql-reference/functions/hash-functions.md#hash_functions-siphash128) 的压缩文件。 +- `primary_key_bytes_in_memory_allocated` ([UInt64](../../sql-reference/data-types/int-uint.md)) – 为主键值保留的内存量(以字节为单位)。 -- `hash_of_uncompressed_files` (`String`) – [sipHash128](../../sql-reference/functions/hash-functions.md#hash_functions-siphash128) 未压缩的文件(带标记的文件,索引文件等。). +- `is_frozen` ([UInt8](../../sql-reference/data-types/int-uint.md)) – 显示分区数据备份存在的标志。1,备份存在。0,备份不存在。更多细节,见 [FREEZE PARTITION](../../sql-reference/statements/alter/partition.md#alter_freeze-partition)。 -- `uncompressed_hash_of_compressed_files` (`String`) – [sipHash128](../../sql-reference/functions/hash-functions.md#hash_functions-siphash128) 压缩文件中的数据,就好像它们是未压缩的。 +- `database` ([String](../../sql-reference/data-types/string.md)) – 数据库的名称。 -- `bytes` (`UInt64`) – Alias for `bytes_on_disk`. +- `table` ([String](../../sql-reference/data-types/string.md)) – 表的名称。 -- `marks_size` (`UInt64`) – Alias for `marks_bytes`. 
+- `engine` ([String](../../sql-reference/data-types/string.md)) – 不带参数的表引擎名称。 + +- `path` ([String](../../sql-reference/data-types/string.md)) – 包含数据部分文件的文件夹的绝对路径。 + +- `disk` ([String](../../sql-reference/data-types/string.md)) – 存储数据部分的磁盘的名称。 + +- `hash_of_all_files` ([String](../../sql-reference/data-types/string.md)) – 压缩文件的 [sipHash128](../../sql-reference/functions/hash-functions.md#hash_functions-siphash128)。 + +- `hash_of_uncompressed_files` ([String](../../sql-reference/data-types/string.md)) – 未压缩文件(带有标记的文件、索引文件等)的 [sipHash128](../../sql-reference/functions/hash-functions.md#hash_functions-siphash128)。 + +- `uncompressed_hash_of_compressed_files` ([String](../../sql-reference/data-types/string.md)) – 压缩文件中的数据(没有压缩时)的 [sipHash128](../../sql-reference/functions/hash-functions.md#hash_functions-siphash128)。 + +- `delete_ttl_info_min` ([DateTime](../../sql-reference/data-types/datetime.md)) — [TTL DELETE 规则](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl) 的日期和时间键的最小值。 + +- `delete_ttl_info_max` ([DateTime](../../sql-reference/data-types/datetime.md)) — [TTL DELETE 规则](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl) 的日期和时间键的最大值。 + +- `move_ttl_info.expression` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — 表达式的数组。 每个表达式定义一个 [TTL MOVE 规则](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). + + !!! note "警告" + 保留 `move_ttl_info.expression` 数组主要是为了向后兼容,现在检查 `TTL MOVE` 规则最简单的方法是使用 `move_ttl_info.min` 和 `move_ttl_info.max` 字段。 + +- `move_ttl_info.min` ([Array](../../sql-reference/data-types/array.md)([DateTime](../../sql-reference/data-types/datetime.md))) — 日期值和时间值的数组。数组中的每个元素都描述了一个 [TTL MOVE rule](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl) 的最小键值。 + +- `move_ttl_info.max` ([Array](../../sql-reference/data-types/array.md)([DateTime](../../sql-reference/data-types/datetime.md))) — 日期值和时间值的数组。数组中的每个元素都描述了一个 [TTL MOVE rule](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl) 的最大键值。 + +- `bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – `bytes_on_disk`的别名。 + +- `marks_size` ([UInt64](../../sql-reference/data-types/int-uint.md)) – `marks_bytes`的别名。 + +**示例** + +``` sql +SELECT * FROM system.parts LIMIT 1 FORMAT Vertical; +``` + +``` text +Row 1: +────── +partition: tuple() +name: all_1_4_1_6 +part_type: Wide +active: 1 +marks: 2 +rows: 6 +bytes_on_disk: 310 +data_compressed_bytes: 157 +data_uncompressed_bytes: 91 +secondary_indices_compressed_bytes: 58 +secondary_indices_uncompressed_bytes: 6 +secondary_indices_marks_bytes: 48 +marks_bytes: 144 +modification_time: 2020-06-18 13:01:49 +remove_time: 1970-01-01 00:00:00 +refcount: 1 +min_date: 1970-01-01 +max_date: 1970-01-01 +min_time: 1970-01-01 00:00:00 +max_time: 1970-01-01 00:00:00 +partition_id: all +min_block_number: 1 +max_block_number: 4 +level: 1 +data_version: 6 +primary_key_bytes_in_memory: 8 +primary_key_bytes_in_memory_allocated: 64 +is_frozen: 0 +database: default +table: months +engine: MergeTree +disk_name: default +path: /var/lib/clickhouse/data/default/months/all_1_4_1_6/ +hash_of_all_files: 2d0657a16d9430824d35e327fcbd87bf +hash_of_uncompressed_files: 84950cc30ba867c77a408ae21332ba29 +uncompressed_hash_of_compressed_files: 1ad78f1c6843bbfb99a2c931abe7df7d +delete_ttl_info_min: 1970-01-01 00:00:00 +delete_ttl_info_max: 1970-01-01 00:00:00 +move_ttl_info.expression: [] 
+move_ttl_info.min: [] +move_ttl_info.max: [] +``` + +**另请参阅** + +- [MergeTree(合并树)家族](../../engines/table-engines/mergetree-family/mergetree.md) +- [列和表的 TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl) + +[原文](https://clickhouse.com/docs/zh/operations/system-tables/parts) diff --git a/docs/zh/operations/system-tables/settings.md b/docs/zh/operations/system-tables/settings.md index c717c8c9562..144eb0179c4 100644 --- a/docs/zh/operations/system-tables/settings.md +++ b/docs/zh/operations/system-tables/settings.md @@ -1,27 +1,22 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- +# system.settings {#system-tables-system-settings} -# 系统。设置 {#system-tables-system-settings} - -包含有关当前用户的会话设置的信息。 +包含当前用户会话设置的相关信息。 列: -- `name` ([字符串](../../sql-reference/data-types/string.md)) — Setting name. -- `value` ([字符串](../../sql-reference/data-types/string.md)) — Setting value. -- `changed` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Shows whether a setting is changed from its default value. -- `description` ([字符串](../../sql-reference/data-types/string.md)) — Short setting description. -- `min` ([可为空](../../sql-reference/data-types/nullable.md)([字符串](../../sql-reference/data-types/string.md))) — Minimum value of the setting, if any is set via [制约因素](../../operations/settings/constraints-on-settings.md#constraints-on-settings). 如果设置没有最小值,则包含 [NULL](../../sql-reference/syntax.md#null-literal). -- `max` ([可为空](../../sql-reference/data-types/nullable.md)([字符串](../../sql-reference/data-types/string.md))) — Maximum value of the setting, if any is set via [制约因素](../../operations/settings/constraints-on-settings.md#constraints-on-settings). 如果设置没有最大值,则包含 [NULL](../../sql-reference/syntax.md#null-literal). -- `readonly` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Shows whether the current user can change the setting: - - `0` — Current user can change the setting. - - `1` — Current user can't change the setting. +- `name` ([字符串](../../sql-reference/data-types/string.md)) — 设置名称。 +- `value` ([字符串](../../sql-reference/data-types/string.md)) — 设置的值。 +- `changed` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — 显示该设置是否从其默认值修改。 +- `description` ([字符串](../../sql-reference/data-types/string.md)) — 该设置的简要描述。 +- `min` ([可为空](../../sql-reference/data-types/nullable.md)([字符串](../../sql-reference/data-types/string.md))) — 该设置的最小值,如果有最小值,则是通过[约束](../../operations/settings/constraints-on-settings.md#constraints-on-settings)设置的。如果该设置没有最小值,则包含 [NULL](../../sql-reference/syntax.md#null-literal). +- `max` ([可为空](../../sql-reference/data-types/nullable.md)([字符串](../../sql-reference/data-types/string.md))) — 该设置的最大值, 如果有最大值,则是通过[约束](../../operations/settings/constraints-on-settings.md#constraints-on-settings)设置的。如果该设置没有最大值,则包含 [NULL](../../sql-reference/syntax.md#null-literal). +- `readonly` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — 当前用户是否可以修改该设置: + - `0` — 当前用户可以修改此设置. + - `1` — 当前用户不能修改此设置. **示例** -下面的示例演示如何获取有关名称包含的设置的信息 `min_i`. 
+下面的例子显示了如何获得设置名称中包含`min_i`的设置信息。 ``` sql SELECT * @@ -37,10 +32,10 @@ WHERE name LIKE '%min_i%' └─────────────────────────────────────────────┴───────────┴─────────┴───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────┴──────┴──────────┘ ``` -使用 `WHERE changed` 可以是有用的,例如,当你想检查: +比如,当你想要检查以下情况时,使用 `WHERE changed` 会很有用: -- 配置文件中的设置是否正确加载并正在使用。 -- 在当前会话中更改的设置。 +- 配置文件中的设置是否正确加载,并正在使用。 +- 在当前会话中更改过的设置。 @@ -52,4 +47,6 @@ SELECT * FROM system.settings WHERE changed AND name='load_balancing' - [设置](../../operations/settings/index.md#session-settings-intro) - [查询权限](../../operations/settings/permissions-for-queries.md#settings_readonly) -- [对设置的限制](../../operations/settings/constraints-on-settings.md) +- [对设置的约束](../../operations/settings/constraints-on-settings.md) + +[原文](https://clickhouse.com/docs/zh/operations/system-tables/settings) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/corr.md b/docs/zh/sql-reference/aggregate-functions/reference/corr.md index 5ab49f75023..5352aed5fc4 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/corr.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/corr.md @@ -12,4 +12,4 @@ toc_priority: 107 计算Pearson相关系数: `Σ((x - x̅)(y - y̅)) / sqrt(Σ((x - x̅)^2) * Σ((y - y̅)^2))`。 !!! note "注" -该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `corrStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 + 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `corrStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md b/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md index c6f43c6b9e9..4b961a22795 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md @@ -12,4 +12,4 @@ covarPop(x, y) 计算 `Σ((x - x̅)(y - y̅)) / n` 的值。 !!! note "注" -该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `covarPopStable` 函数。 它的工作速度较慢,但提供了较低的计算错误。 + 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `covarPopStable` 函数。 它的工作速度较慢,但提供了较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md index 5ef5104504b..bed522bbbfa 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md @@ -14,4 +14,4 @@ covarSamp(x, y) 返回Float64。 当 `n <= 1`, 返回 +∞。 !!! note "注" -该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `covarSampStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 + 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `covarSampStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md index a193b60338a..72bd797279f 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md @@ -46,7 +46,7 @@ quantileTiming(level)(expr) 类型: `Float32`。 !!! 
note "注" -如果没有值传递给函数(当使用 `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf)被返回。 这样做的目的是将这些案例与导致零的案例区分开来。 参见 [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) 对于 `NaN` 值排序注意事项。 + 如果没有值传递给函数(当使用 `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf)被返回。 这样做的目的是将这些案例与导致零的案例区分开来。 参见 [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) 对于 `NaN` 值排序注意事项。 **示例** diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md index 7b130dbddbd..3ae1124c9c0 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md @@ -48,7 +48,7 @@ quantileTimingWeighted(level)(expr, weight) 类型: `Float32`。 !!! note "注" -如果没有值传递给函数(当使用 `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf)被返回。 这样做的目的是将这些案例与导致零的案例区分开来。 参见 [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) 对于 `NaN` 值排序注意事项。 + 如果没有值传递给函数(当使用 `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf)被返回。 这样做的目的是将这些案例与导致零的案例区分开来。 参见 [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) 对于 `NaN` 值排序注意事项。 **示例** diff --git a/docs/zh/sql-reference/aggregate-functions/reference/stddevpop.md b/docs/zh/sql-reference/aggregate-functions/reference/stddevpop.md index 378ef4ae7e4..03478bae900 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/stddevpop.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/stddevpop.md @@ -4,7 +4,7 @@ toc_priority: 30 # stddevPop {#stddevpop} -结果等于 [varPop] (../../../sql-reference/aggregate-functions/reference/varpop.md)的平方根。 +结果等于 [varPop](../../../sql-reference/aggregate-functions/reference/varpop.md)的平方根。 !!! note "注" -该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `stddevPopStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 + 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `stddevPopStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/stddevsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/stddevsamp.md index 68a348146a9..d49b9d89fd9 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/stddevsamp.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/stddevsamp.md @@ -7,4 +7,4 @@ toc_priority: 31 结果等于 [varSamp] (../../../sql-reference/aggregate-functions/reference/varsamp.md)的平方根。 !!! note "注" -该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `stddevSampStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 + 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `stddevSampStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/varpop.md b/docs/zh/sql-reference/aggregate-functions/reference/varpop.md index 4dca8efde38..502c1887e38 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/varpop.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/varpop.md @@ -9,4 +9,4 @@ toc_priority: 32 换句话说,计算一组数据的离差。 返回 `Float64`。 !!! 
note "注" -该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `varPopStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 + 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `varPopStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/varsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/varsamp.md index c83ee7e24d2..73481c329e4 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/varsamp.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/varsamp.md @@ -11,5 +11,5 @@ toc_priority: 33 返回 `Float64`。 当 `n <= 1`,返回 `+∞`。 !!! note "注" -该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `varSampStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 + 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `varSampStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/functions/type-conversion-functions.md b/docs/zh/sql-reference/functions/type-conversion-functions.md index 6c8843fe2bd..c1d1e66664e 100644 --- a/docs/zh/sql-reference/functions/type-conversion-functions.md +++ b/docs/zh/sql-reference/functions/type-conversion-functions.md @@ -167,9 +167,9 @@ SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) ``` ``` text -┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ -│ -1.11100 │ Nullable(Decimal(9, 5)) │ -└──────────┴────────────────────────────────────────────────────┘ +┌────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ +│ -1.111 │ Nullable(Decimal(9, 5)) │ +└────────┴────────────────────────────────────────────────────┘ ``` ``` sql @@ -210,9 +210,9 @@ SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val) ``` ``` text -┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ -│ -1.11100 │ Decimal(9, 5) │ -└──────────┴────────────────────────────────────────────────────┘ +┌────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ +│ -1.111 │ Decimal(9, 5) │ +└────────┴────────────────────────────────────────────────────┘ ``` ``` sql diff --git a/docs/zh/sql-reference/statements/create/function.md b/docs/zh/sql-reference/statements/create/function.md deleted file mode 120000 index d41429cb260..00000000000 --- a/docs/zh/sql-reference/statements/create/function.md +++ /dev/null @@ -1 +0,0 @@ -../../../../en/sql-reference/statements/create/function.md \ No newline at end of file diff --git a/docs/zh/sql-reference/statements/create/function.md b/docs/zh/sql-reference/statements/create/function.md new file mode 100644 index 00000000000..d57810ac91b --- /dev/null +++ b/docs/zh/sql-reference/statements/create/function.md @@ -0,0 +1,60 @@ +--- +toc_priority: 38 +toc_title: FUNCTION +--- + +# CREATE FUNCTION {#create-function} + +用一个lambda表达式创建用户自定义函数。该表达式必须由函数参数、常数、运算符或其他函数调用组成。 + +**语法** + +```sql +CREATE FUNCTION name AS (parameter0, ...) 
-> expression +``` + +一个函数可以有任意数量的参数。 + +存在一些限制如下: + +- 函数名在用户自定义函数和系统函数中必须是唯一的。 +- 递归函数是不允许的。 +- 函数所使用的所有变量必须在其参数列表中指定。 + +如果违反了任何限制,就会产生异常。 + +**示例** + +查询: + +```sql +CREATE FUNCTION linear_equation AS (x, k, b) -> k*x + b; +SELECT number, linear_equation(number, 2, 1) FROM numbers(3); +``` + +结果: + +``` text +┌─number─┬─plus(multiply(2, number), 1)─┐ +│ 0 │ 1 │ +│ 1 │ 3 │ +│ 2 │ 5 │ +└────────┴──────────────────────────────┘ +``` + +在下面的查询中,[conditional function](../../../sql-reference/functions/conditional-functions.md)在用户自定义函数中被调用: + +```sql +CREATE FUNCTION parity_str AS (n) -> if(n % 2, 'odd', 'even'); +SELECT number, parity_str(number) FROM numbers(3); +``` + +结果: + +``` text +┌─number─┬─if(modulo(number, 2), 'odd', 'even')─┐ +│ 0 │ even │ +│ 1 │ odd │ +│ 2 │ even │ +└────────┴──────────────────────────────────────┘ +``` diff --git a/programs/benchmark/Benchmark.cpp b/programs/benchmark/Benchmark.cpp index 1c276a83768..35ffb97b8e2 100644 --- a/programs/benchmark/Benchmark.cpp +++ b/programs/benchmark/Benchmark.cpp @@ -342,6 +342,9 @@ private: } } + /// Now we don't block the Ctrl+C signal and second signal will terminate the program without waiting. + interrupt_listener.unblock(); + pool.wait(); total_watch.stop(); @@ -586,7 +589,6 @@ public: #ifndef __clang__ #pragma GCC optimize("-fno-var-tracking-assignments") #endif -#pragma GCC diagnostic ignored "-Wmissing-declarations" int mainEntryClickHouseBenchmark(int argc, char ** argv) { diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index e01677aaac6..b1e1345cf71 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -25,7 +25,6 @@ #include #include #include -#include "Common/MemoryTracker.h" #include #include @@ -56,11 +55,6 @@ #pragma GCC optimize("-fno-var-tracking-assignments") #endif -namespace CurrentMetrics -{ - extern const Metric MemoryTracking; -} - namespace fs = std::filesystem; @@ -410,16 +404,6 @@ try std::cout << std::fixed << std::setprecision(3); std::cerr << std::fixed << std::setprecision(3); - /// Limit on total memory usage - size_t max_client_memory_usage = config().getInt64("max_memory_usage_in_client", 0 /*default value*/); - - if (max_client_memory_usage != 0) - { - total_memory_tracker.setHardLimit(max_client_memory_usage); - total_memory_tracker.setDescription("(total)"); - total_memory_tracker.setMetric(CurrentMetrics::MemoryTracking); - } - registerFormats(); registerFunctions(); registerAggregateFunctions(); @@ -1014,7 +998,6 @@ void Client::addOptions(OptionsDescription & options_description) ("opentelemetry-tracestate", po::value(), "OpenTelemetry tracestate header as described by W3C Trace Context recommendation") ("no-warnings", "disable warnings when client connects to server") - ("max_memory_usage_in_client", po::value(), "sets memory limit in client") ; /// Commandline options related to external tables. 
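The `interrupt_listener.unblock()` call added to Benchmark.cpp above relies on a common POSIX pattern: SIGINT stays blocked while queries are in flight so the first Ctrl+C can be polled and handled gracefully, and it is unblocked just before waiting on the pool so that a second Ctrl+C falls through to the default disposition and terminates the process immediately. A minimal sketch of that idea with raw signal calls (not ClickHouse's InterruptListener API, and with the benchmark loop stubbed out) might look like this:

``` cpp
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int main()
{
    /// Block SIGINT: the first Ctrl+C is only queued, not delivered.
    sigset_t set;
    sigemptyset(&set);
    sigaddset(&set, SIGINT);
    sigprocmask(SIG_BLOCK, &set, nullptr);

    /// ... run queries here; periodically poll the pending SIGINT
    /// (e.g. with sigtimedwait) and stop scheduling new work when it arrives ...
    sleep(1);

    /// Unblock SIGINT before waiting for in-flight work: a second Ctrl+C
    /// now terminates the program without waiting.
    sigprocmask(SIG_UNBLOCK, &set, nullptr);

    /// ... the pool.wait() equivalent would go here ...
    printf("finished\n");
    return 0;
}
```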
diff --git a/programs/copier/ClusterCopier.cpp b/programs/copier/ClusterCopier.cpp index 536bb37199d..4d491a06795 100644 --- a/programs/copier/ClusterCopier.cpp +++ b/programs/copier/ClusterCopier.cpp @@ -46,7 +46,7 @@ void ClusterCopier::init() reloadTaskDescription(); task_cluster->loadTasks(*task_cluster_current_config); - getContext()->setClustersConfig(task_cluster_current_config, task_cluster->clusters_prefix); + getContext()->setClustersConfig(task_cluster_current_config, false, task_cluster->clusters_prefix); /// Set up shards and their priority task_cluster->random_engine.seed(task_cluster->random_device()); diff --git a/programs/install/Install.cpp b/programs/install/Install.cpp index 706e273e2b4..dd93e0b49ab 100644 --- a/programs/install/Install.cpp +++ b/programs/install/Install.cpp @@ -153,10 +153,12 @@ static void createGroup(const String & group_name) if (!group_name.empty()) { #if defined(OS_DARWIN) - // TODO: implement. - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unable to create a group in macOS"); +#elif defined(OS_FREEBSD) + std::string command = fmt::format("pw groupadd {}", group_name); + fmt::print(" {}\n", command); + executeScript(command); #else std::string command = fmt::format("groupadd -r {}", group_name); fmt::print(" {}\n", command); @@ -170,10 +172,14 @@ static void createUser(const String & user_name, [[maybe_unused]] const String & if (!user_name.empty()) { #if defined(OS_DARWIN) - // TODO: implement. - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unable to create a user in macOS"); +#elif defined(OS_FREEBSD) + std::string command = group_name.empty() + ? fmt::format("pw useradd -s /bin/false -d /nonexistent -n {}", user_name) + : fmt::format("pw useradd -s /bin/false -d /nonexistent -g {} -n {}", group_name, user_name); + fmt::print(" {}\n", command); + executeScript(command); #else std::string command = group_name.empty() ? fmt::format("useradd -r --shell /bin/false --home-dir /nonexistent --user-group {}", user_name) @@ -185,6 +191,20 @@ static void createUser(const String & user_name, [[maybe_unused]] const String & } +static std::string formatWithSudo(std::string command, bool needed = true) +{ + if (!needed) + return command; + +#if defined(OS_FREEBSD) + /// FreeBSD does not have 'sudo' installed. + return fmt::format("su -m root -c '{}'", command); +#else + return fmt::format("sudo {}", command); +#endif +} + + int mainEntryClickHouseInstall(int argc, char ** argv) { try @@ -207,10 +227,7 @@ int mainEntryClickHouseInstall(int argc, char ** argv) if (options.count("help")) { - std::cout << "Usage: " - << (getuid() == 0 ? "" : "sudo ") - << argv[0] - << " install [options]\n"; + std::cout << "Usage: " << formatWithSudo(std::string(argv[0]) + " install [options]", getuid() != 0) << '\n'; std::cout << desc << '\n'; return 1; } @@ -233,6 +250,9 @@ int mainEntryClickHouseInstall(int argc, char ** argv) path.pop_back(); fs::path binary_self_path(path); +#elif defined(OS_FREEBSD) + /// https://stackoverflow.com/questions/1023306/finding-current-executables-path-without-proc-self-exe + fs::path binary_self_path = argc >= 1 ? 
argv[0] : "/proc/curproc/file"; #else fs::path binary_self_path = "/proc/self/exe"; #endif @@ -314,7 +334,7 @@ int mainEntryClickHouseInstall(int argc, char ** argv) catch (const Exception & e) { if (e.code() == ErrorCodes::CANNOT_OPEN_FILE && geteuid() != 0) - std::cerr << "Install must be run as root: sudo ./clickhouse install\n"; + std::cerr << "Install must be run as root: " << formatWithSudo("./clickhouse install") << '\n'; throw; } @@ -824,9 +844,10 @@ int mainEntryClickHouseInstall(int argc, char ** argv) fmt::print( "\nClickHouse has been successfully installed.\n" "\nRestart clickhouse-server with:\n" - " sudo clickhouse restart\n" + " {}\n" "\nStart clickhouse-client with:\n" " clickhouse-client{}\n\n", + formatWithSudo("clickhouse restart"), maybe_password); } else @@ -834,9 +855,10 @@ int mainEntryClickHouseInstall(int argc, char ** argv) fmt::print( "\nClickHouse has been successfully installed.\n" "\nStart clickhouse-server with:\n" - " sudo clickhouse start\n" + " {}\n" "\nStart clickhouse-client with:\n" " clickhouse-client{}\n\n", + formatWithSudo("clickhouse start"), maybe_password); } } @@ -845,7 +867,7 @@ int mainEntryClickHouseInstall(int argc, char ** argv) std::cerr << getCurrentExceptionMessage(false) << '\n'; if (getuid() != 0) - std::cerr << "\nRun with sudo.\n"; + std::cerr << "\nRun with " << formatWithSudo("...") << "\n"; return getCurrentExceptionCode(); } @@ -901,6 +923,9 @@ namespace if (!user.empty()) { +#if defined(OS_FREEBSD) + command = fmt::format("su -m '{}' -c '{}'", user, command); +#else bool may_need_sudo = geteuid() != 0; if (may_need_sudo) { @@ -910,7 +935,10 @@ namespace command = fmt::format("sudo -u '{}' {}", user, command); } else + { command = fmt::format("su -s /bin/sh '{}' -c '{}'", user, command); + } +#endif } fmt::print("Will run {}\n", command); @@ -1114,10 +1142,7 @@ int mainEntryClickHouseStart(int argc, char ** argv) if (options.count("help")) { - std::cout << "Usage: " - << (getuid() == 0 ? "" : "sudo ") - << argv[0] - << " start\n"; + std::cout << "Usage: " << formatWithSudo(std::string(argv[0]) + " start", getuid() != 0) << '\n'; return 1; } @@ -1155,10 +1180,7 @@ int mainEntryClickHouseStop(int argc, char ** argv) if (options.count("help")) { - std::cout << "Usage: " - << (getuid() == 0 ? "" : "sudo ") - << argv[0] - << " stop\n"; + std::cout << "Usage: " << formatWithSudo(std::string(argv[0]) + " stop", getuid() != 0) << '\n'; return 1; } @@ -1191,10 +1213,7 @@ int mainEntryClickHouseStatus(int argc, char ** argv) if (options.count("help")) { - std::cout << "Usage: " - << (getuid() == 0 ? "" : "sudo ") - << argv[0] - << " status\n"; + std::cout << "Usage: " << formatWithSudo(std::string(argv[0]) + " status", getuid() != 0) << '\n'; return 1; } @@ -1233,10 +1252,7 @@ int mainEntryClickHouseRestart(int argc, char ** argv) if (options.count("help")) { - std::cout << "Usage: " - << (getuid() == 0 ? 
"" : "sudo ") - << argv[0] - << " restart\n"; + std::cout << "Usage: " << formatWithSudo(std::string(argv[0]) + " restart", getuid() != 0) << '\n'; return 1; } diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index afd6a36ea15..d144b4d332e 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -379,11 +380,11 @@ int Keeper::main(const std::vector & /*args*/) socket.setReceiveTimeout(settings.receive_timeout); socket.setSendTimeout(settings.send_timeout); servers->emplace_back( + listen_host, port_name, - std::make_unique( - new KeeperTCPHandlerFactory(*this, false), server_pool, socket, new Poco::Net::TCPServerParams)); - - LOG_INFO(log, "Listening for connections to Keeper (tcp): {}", address.toString()); + "Keeper (tcp): " + address.toString(), + std::make_unique( + new KeeperTCPHandlerFactory(*this, false), server_pool, socket)); }); const char * secure_port_name = "keeper_server.tcp_port_secure"; @@ -395,10 +396,11 @@ int Keeper::main(const std::vector & /*args*/) socket.setReceiveTimeout(settings.receive_timeout); socket.setSendTimeout(settings.send_timeout); servers->emplace_back( + listen_host, secure_port_name, - std::make_unique( - new KeeperTCPHandlerFactory(*this, true), server_pool, socket, new Poco::Net::TCPServerParams)); - LOG_INFO(log, "Listening for connections to Keeper with secure protocol (tcp_secure): {}", address.toString()); + "Keeper with secure protocol (tcp_secure): " + address.toString(), + std::make_unique( + new KeeperTCPHandlerFactory(*this, true), server_pool, socket)); #else UNUSED(port); throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.", @@ -408,7 +410,10 @@ int Keeper::main(const std::vector & /*args*/) } for (auto & server : *servers) + { server.start(); + LOG_INFO(log, "Listening for {}", server.getDescription()); + } zkutil::EventPtr unused_event = std::make_shared(); zkutil::ZooKeeperNodeCache unused_cache([] { return nullptr; }); diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 53e295b7fbb..aa4747636c9 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -313,11 +313,11 @@ void LocalServer::cleanup() std::string LocalServer::getInitialCreateTableQuery() { - if (!config().has("table-structure")) + if (!config().has("table-structure") && !config().has("table-file")) return {}; auto table_name = backQuoteIfNeed(config().getString("table-name", "table")); - auto table_structure = config().getString("table-structure"); + auto table_structure = config().getString("table-structure", "auto"); auto data_format = backQuoteIfNeed(config().getString("table-data-format", "TSV")); String table_file; @@ -332,7 +332,12 @@ std::string LocalServer::getInitialCreateTableQuery() table_file = quoteString(config().getString("table-file")); } - return fmt::format("CREATE TABLE {} ({}) ENGINE = File({}, {});", + if (table_structure == "auto") + table_structure = ""; + else + table_structure = "(" + table_structure + ")"; + + return fmt::format("CREATE TABLE {} {} ENGINE = File({}, {});", table_name, table_structure, data_format, table_file); } @@ -422,7 +427,7 @@ try #else is_interactive = stdin_is_a_tty && (config().hasOption("interactive") - || (!config().has("query") && !config().has("table-structure") && queries_files.empty())); + || (!config().has("query") && !config().has("table-structure") && queries_files.empty() && 
!config().has("table-file"))); #endif if (!is_interactive) { diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 14075f9fbf2..67c754495d1 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -70,6 +71,7 @@ #include "MetricsTransmitter.h" #include #include +#include #include #include #include @@ -127,6 +129,11 @@ namespace CurrentMetrics extern const Metric MaxPushedDDLEntryID; } +namespace ProfileEvents +{ + extern const Event MainConfigLoads; +} + namespace fs = std::filesystem; #if USE_JEMALLOC @@ -344,16 +351,53 @@ Poco::Net::SocketAddress Server::socketBindListen(Poco::Net::ServerSocket & sock return address; } -void Server::createServer(const std::string & listen_host, const char * port_name, bool listen_try, CreateServerFunc && func) const +std::vector getListenHosts(const Poco::Util::AbstractConfiguration & config) +{ + auto listen_hosts = DB::getMultipleValuesFromConfig(config, "", "listen_host"); + if (listen_hosts.empty()) + { + listen_hosts.emplace_back("::1"); + listen_hosts.emplace_back("127.0.0.1"); + } + return listen_hosts; +} + +bool getListenTry(const Poco::Util::AbstractConfiguration & config) +{ + bool listen_try = config.getBool("listen_try", false); + if (!listen_try) + listen_try = DB::getMultipleValuesFromConfig(config, "", "listen_host").empty(); + return listen_try; +} + + +void Server::createServer( + Poco::Util::AbstractConfiguration & config, + const std::string & listen_host, + const char * port_name, + bool listen_try, + bool start_server, + std::vector & servers, + CreateServerFunc && func) const { /// For testing purposes, user may omit tcp_port or http_port or https_port in configuration file. - if (!config().has(port_name)) + if (config.getString(port_name, "").empty()) return; - auto port = config().getInt(port_name); + /// If we already have an active server for this listen_host/port_name, don't create it again + for (const auto & server : servers) + if (!server.isStopping() && server.getListenHost() == listen_host && server.getPortName() == port_name) + return; + + auto port = config.getInt(port_name); try { - func(port); + servers.push_back(func(port)); + if (start_server) + { + servers.back().start(); + LOG_INFO(&logger(), "Listening for {}", servers.back().getDescription()); + } global_context->registerServerPort(port_name, port); } catch (const Poco::Exception &) @@ -515,6 +559,27 @@ if (ThreadFuzzer::instance().isEffective()) config().getUInt("thread_pool_queue_size", 10000) ); + Poco::ThreadPool server_pool(3, config().getUInt("max_connections", 1024)); + std::mutex servers_lock; + std::vector servers; + std::vector servers_to_start_before_tables; + /// This object will periodically calculate some metrics. 
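+    /// The callback passed to AsynchronousMetrics below reports, for every protocol server
+    /// (both the servers started before tables and the regular ones, the latter under servers_lock),
+    /// its port name and current thread count as ProtocolServerMetrics.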
+ AsynchronousMetrics async_metrics( + global_context, config().getUInt("asynchronous_metrics_update_period_s", 1), + [&]() -> std::vector + { + std::vector metrics; + metrics.reserve(servers_to_start_before_tables.size()); + for (const auto & server : servers_to_start_before_tables) + metrics.emplace_back(ProtocolServerMetrics{server.getPortName(), server.currentThreads()}); + + std::lock_guard lock(servers_lock); + for (const auto & server : servers) + metrics.emplace_back(ProtocolServerMetrics{server.getPortName(), server.currentThreads()}); + return metrics; + } + ); + ConnectionCollector::init(global_context, config().getUInt("max_threads_for_connection_collector", 10)); bool has_zookeeper = config().has("zookeeper"); @@ -841,7 +906,7 @@ if (ThreadFuzzer::instance().isEffective()) // in a lot of places. For now, disable updating log configuration without server restart. //setTextLog(global_context->getTextLog()); updateLevels(*config, logger()); - global_context->setClustersConfig(config); + global_context->setClustersConfig(config, has_zookeeper); global_context->setMacros(std::make_unique(*config, "macros", log)); global_context->setExternalAuthenticatorsConfig(*config); @@ -859,6 +924,12 @@ if (ThreadFuzzer::instance().isEffective()) if (config->has("max_concurrent_queries")) global_context->getProcessList().setMaxSize(config->getInt("max_concurrent_queries", 0)); + if (config->has("max_concurrent_insert_queries")) + global_context->getProcessList().setMaxInsertQueriesAmount(config->getInt("max_concurrent_insert_queries", 0)); + + if (config->has("max_concurrent_select_queries")) + global_context->getProcessList().setMaxSelectQueriesAmount(config->getInt("max_concurrent_select_queries", 0)); + if (config->has("keeper_server")) global_context->updateKeeperConfiguration(*config); @@ -870,12 +941,17 @@ if (ThreadFuzzer::instance().isEffective()) global_context->reloadZooKeeperIfChanged(config); global_context->reloadAuxiliaryZooKeepersConfigIfChanged(config); + + std::lock_guard lock(servers_lock); + updateServers(*config, server_pool, async_metrics, servers); } global_context->updateStorageConfiguration(*config); global_context->updateInterserverCredentials(*config); CompressionCodecEncrypted::Configuration::instance().tryLoad(*config, "encryption_codecs"); + + ProfileEvents::increment(ProfileEvents::MainConfigLoads); }, /* already_loaded = */ false); /// Reload it right now (initial loading) @@ -987,24 +1063,8 @@ if (ThreadFuzzer::instance().isEffective()) /// try set up encryption. There are some errors in config, error will be printed and server wouldn't start. 
CompressionCodecEncrypted::Configuration::instance().load(config(), "encryption_codecs"); - Poco::Timespan keep_alive_timeout(config().getUInt("keep_alive_timeout", 10), 0); - - Poco::ThreadPool server_pool(3, config().getUInt("max_connections", 1024)); - Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams; - http_params->setTimeout(settings.http_receive_timeout); - http_params->setKeepAliveTimeout(keep_alive_timeout); - - auto servers_to_start_before_tables = std::make_shared>(); - - std::vector listen_hosts = DB::getMultipleValuesFromConfig(config(), "", "listen_host"); - - bool listen_try = config().getBool("listen_try", false); - if (listen_hosts.empty()) - { - listen_hosts.emplace_back("::1"); - listen_hosts.emplace_back("127.0.0.1"); - listen_try = true; - } + const auto listen_hosts = getListenHosts(config()); + const auto listen_try = getListenTry(config()); if (config().has("keeper_server")) { @@ -1027,39 +1087,46 @@ if (ThreadFuzzer::instance().isEffective()) { /// TCP Keeper const char * port_name = "keeper_server.tcp_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - servers_to_start_before_tables->emplace_back( - port_name, - std::make_unique( - new KeeperTCPHandlerFactory(*this, false), server_pool, socket, new Poco::Net::TCPServerParams)); - - LOG_INFO(log, "Listening for connections to Keeper (tcp): {}", address.toString()); - }); + createServer( + config(), listen_host, port_name, listen_try, /* start_server: */ false, + servers_to_start_before_tables, + [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port); + socket.setReceiveTimeout(settings.receive_timeout); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "Keeper (tcp): " + address.toString(), + std::make_unique( + new KeeperTCPHandlerFactory(*this, false), server_pool, socket)); + }); const char * secure_port_name = "keeper_server.tcp_port_secure"; - createServer(listen_host, secure_port_name, listen_try, [&](UInt16 port) - { + createServer( + config(), listen_host, secure_port_name, listen_try, /* start_server: */ false, + servers_to_start_before_tables, + [&](UInt16 port) -> ProtocolServerAdapter + { #if USE_SSL - Poco::Net::SecureServerSocket socket; - auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - servers_to_start_before_tables->emplace_back( - secure_port_name, - std::make_unique( - new KeeperTCPHandlerFactory(*this, true), server_pool, socket, new Poco::Net::TCPServerParams)); - LOG_INFO(log, "Listening for connections to Keeper with secure protocol (tcp_secure): {}", address.toString()); + Poco::Net::SecureServerSocket socket; + auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(settings.receive_timeout); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + listen_host, + secure_port_name, + "Keeper with secure protocol (tcp_secure): " + address.toString(), + std::make_unique( + new KeeperTCPHandlerFactory(*this, true), server_pool, socket)); #else - UNUSED(port); - throw Exception{"SSL support 
for TCP protocol is disabled because Poco library was built without NetSSL support.", - ErrorCodes::SUPPORT_IS_DISABLED}; + UNUSED(port); + throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.", + ErrorCodes::SUPPORT_IS_DISABLED}; #endif - }); + }); } #else throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "ClickHouse server built without NuRaft library. Cannot use internal coordination."); @@ -1067,14 +1134,19 @@ if (ThreadFuzzer::instance().isEffective()) } - for (auto & server : *servers_to_start_before_tables) + for (auto & server : servers_to_start_before_tables) + { server.start(); + LOG_INFO(log, "Listening for {}", server.getDescription()); + } SCOPE_EXIT({ /// Stop reloading of the main config. This must be done before `global_context->shutdown()` because /// otherwise the reloading may pass a changed config to some destroyed parts of ContextSharedPart. main_config_reloader.reset(); + async_metrics.stop(); + /** Ask to cancel background jobs all table engines, * and also query_log. * It is important to do early, not in destructor of Context, because @@ -1086,11 +1158,11 @@ if (ThreadFuzzer::instance().isEffective()) LOG_DEBUG(log, "Shut down storages."); - if (!servers_to_start_before_tables->empty()) + if (!servers_to_start_before_tables.empty()) { LOG_DEBUG(log, "Waiting for current connections to servers for tables to finish."); int current_connections = 0; - for (auto & server : *servers_to_start_before_tables) + for (auto & server : servers_to_start_before_tables) { server.stop(); current_connections += server.currentConnections(); @@ -1102,7 +1174,7 @@ if (ThreadFuzzer::instance().isEffective()) LOG_INFO(log, "Closed all listening sockets."); if (current_connections > 0) - current_connections = waitServersToFinish(*servers_to_start_before_tables, config().getInt("shutdown_wait_unfinished", 5)); + current_connections = waitServersToFinish(servers_to_start_before_tables, config().getInt("shutdown_wait_unfinished", 5)); if (current_connections) LOG_INFO(log, "Closed connections to servers for tables. But {} remain. Probably some tables of other users cannot finish their connections after context shutdown.", current_connections); @@ -1256,223 +1328,18 @@ if (ThreadFuzzer::instance().isEffective()) LOG_INFO(log, "TaskStats is not implemented for this OS. IO accounting will be disabled."); #endif - auto servers = std::make_shared>(); { - /// This object will periodically calculate some metrics. 
- AsynchronousMetrics async_metrics( - global_context, config().getUInt("asynchronous_metrics_update_period_s", 1), servers_to_start_before_tables, servers); attachSystemTablesAsync(global_context, *DatabaseCatalog::instance().getSystemDatabase(), async_metrics); - for (const auto & listen_host : listen_hosts) { - /// HTTP - const char * port_name = "http_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(settings.http_receive_timeout); - socket.setSendTimeout(settings.http_send_timeout); - - servers->emplace_back( - port_name, - std::make_unique( - context(), createHandlerFactory(*this, async_metrics, "HTTPHandler-factory"), server_pool, socket, http_params)); - - LOG_INFO(log, "Listening for http://{}", address.toString()); - }); - - /// HTTPS - port_name = "https_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { -#if USE_SSL - Poco::Net::SecureServerSocket socket; - auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(settings.http_receive_timeout); - socket.setSendTimeout(settings.http_send_timeout); - servers->emplace_back( - port_name, - std::make_unique( - context(), createHandlerFactory(*this, async_metrics, "HTTPSHandler-factory"), server_pool, socket, http_params)); - - LOG_INFO(log, "Listening for https://{}", address.toString()); -#else - UNUSED(port); - throw Exception{"HTTPS protocol is disabled because Poco library was built without NetSSL support.", - ErrorCodes::SUPPORT_IS_DISABLED}; -#endif - }); - - /// TCP - port_name = "tcp_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - servers->emplace_back(port_name, std::make_unique( - new TCPHandlerFactory(*this, /* secure */ false, /* proxy protocol */ false), - server_pool, - socket, - new Poco::Net::TCPServerParams)); - - LOG_INFO(log, "Listening for connections with native protocol (tcp): {}", address.toString()); - }); - - /// TCP with PROXY protocol, see https://github.com/wolfeidau/proxyv2/blob/master/docs/proxy-protocol.txt - port_name = "tcp_with_proxy_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - servers->emplace_back(port_name, std::make_unique( - new TCPHandlerFactory(*this, /* secure */ false, /* proxy protocol */ true), - server_pool, - socket, - new Poco::Net::TCPServerParams)); - - LOG_INFO(log, "Listening for connections with native protocol (tcp) with PROXY: {}", address.toString()); - }); - - /// TCP with SSL - port_name = "tcp_port_secure"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { -#if USE_SSL - Poco::Net::SecureServerSocket socket; - auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - servers->emplace_back(port_name, std::make_unique( - new TCPHandlerFactory(*this, /* secure */ true, /* proxy protocol */ false), - server_pool, - socket, - new 
Poco::Net::TCPServerParams)); - LOG_INFO(log, "Listening for connections with secure native protocol (tcp_secure): {}", address.toString()); -#else - UNUSED(port); - throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.", - ErrorCodes::SUPPORT_IS_DISABLED}; -#endif - }); - - /// Interserver IO HTTP - port_name = "interserver_http_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(settings.http_receive_timeout); - socket.setSendTimeout(settings.http_send_timeout); - servers->emplace_back( - port_name, - std::make_unique( - context(), - createHandlerFactory(*this, async_metrics, "InterserverIOHTTPHandler-factory"), - server_pool, - socket, - http_params)); - - LOG_INFO(log, "Listening for replica communication (interserver): http://{}", address.toString()); - }); - - port_name = "interserver_https_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { -#if USE_SSL - Poco::Net::SecureServerSocket socket; - auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(settings.http_receive_timeout); - socket.setSendTimeout(settings.http_send_timeout); - servers->emplace_back( - port_name, - std::make_unique( - context(), - createHandlerFactory(*this, async_metrics, "InterserverIOHTTPSHandler-factory"), - server_pool, - socket, - http_params)); - - LOG_INFO(log, "Listening for secure replica communication (interserver): https://{}", address.toString()); -#else - UNUSED(port); - throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.", - ErrorCodes::SUPPORT_IS_DISABLED}; -#endif - }); - - port_name = "mysql_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(Poco::Timespan()); - socket.setSendTimeout(settings.send_timeout); - servers->emplace_back(port_name, std::make_unique( - new MySQLHandlerFactory(*this), - server_pool, - socket, - new Poco::Net::TCPServerParams)); - - LOG_INFO(log, "Listening for MySQL compatibility protocol: {}", address.toString()); - }); - - port_name = "postgresql_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); - socket.setReceiveTimeout(Poco::Timespan()); - socket.setSendTimeout(settings.send_timeout); - servers->emplace_back(port_name, std::make_unique( - new PostgreSQLHandlerFactory(*this), - server_pool, - socket, - new Poco::Net::TCPServerParams)); - - LOG_INFO(log, "Listening for PostgreSQL compatibility protocol: " + address.toString()); - }); - -#if USE_GRPC - port_name = "grpc_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { - Poco::Net::SocketAddress server_address(listen_host, port); - servers->emplace_back(port_name, std::make_unique(*this, makeSocketAddress(listen_host, port, log))); - LOG_INFO(log, "Listening for gRPC protocol: " + server_address.toString()); - }); -#endif - - /// Prometheus (if defined and not setup yet with http_port) - port_name = "prometheus.port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) - { - Poco::Net::ServerSocket socket; - auto 
address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(settings.http_receive_timeout); - socket.setSendTimeout(settings.http_send_timeout); - servers->emplace_back( - port_name, - std::make_unique( - context(), - createHandlerFactory(*this, async_metrics, "PrometheusHandler-factory"), - server_pool, - socket, - http_params)); - - LOG_INFO(log, "Listening for Prometheus: http://{}", address.toString()); - }); + std::lock_guard lock(servers_lock); + createServers(config(), listen_hosts, listen_try, server_pool, async_metrics, servers); + if (servers.empty()) + throw Exception( + "No servers started (add valid listen_host and 'tcp_port' or 'http_port' to configuration file.)", + ErrorCodes::NO_ELEMENTS_IN_CONFIG); } - if (servers->empty()) - throw Exception("No servers started (add valid listen_host and 'tcp_port' or 'http_port' to configuration file.)", - ErrorCodes::NO_ELEMENTS_IN_CONFIG); - - /// Must be done after initialization of `servers`, because async_metrics will access `servers` variable from its thread. async_metrics.start(); { @@ -1551,9 +1418,24 @@ if (ThreadFuzzer::instance().isEffective()) &CurrentMetrics::MaxDDLEntryID, &CurrentMetrics::MaxPushedDDLEntryID)); } - for (auto & server : *servers) - server.start(); - LOG_INFO(log, "Ready for connections."); + { + std::lock_guard lock(servers_lock); + for (auto & server : servers) + { + server.start(); + LOG_INFO(log, "Listening for {}", server.getDescription()); + } + LOG_INFO(log, "Ready for connections."); + } + + try + { + global_context->startClusterDiscovery(); + } + catch (...) + { + tryLogCurrentException(log, "Caught exception while starting cluster discovery"); + } SCOPE_EXIT_SAFE({ LOG_DEBUG(log, "Received termination signal."); @@ -1562,10 +1444,13 @@ if (ThreadFuzzer::instance().isEffective()) is_cancelled = true; int current_connections = 0; - for (auto & server : *servers) { - server.stop(); - current_connections += server.currentConnections(); + std::lock_guard lock(servers_lock); + for (auto & server : servers) + { + server.stop(); + current_connections += server.currentConnections(); + } } if (current_connections) @@ -1578,7 +1463,7 @@ if (ThreadFuzzer::instance().isEffective()) global_context->getProcessList().killAllQueries(); if (current_connections) - current_connections = waitServersToFinish(*servers, config().getInt("shutdown_wait_unfinished", 5)); + current_connections = waitServersToFinish(servers, config().getInt("shutdown_wait_unfinished", 5)); if (current_connections) LOG_INFO(log, "Closed connections. But {} remain." 
@@ -1614,4 +1499,273 @@ if (ThreadFuzzer::instance().isEffective()) return Application::EXIT_OK; } + +void Server::createServers( + Poco::Util::AbstractConfiguration & config, + const std::vector & listen_hosts, + bool listen_try, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + std::vector & servers, + bool start_servers) +{ + const Settings & settings = global_context->getSettingsRef(); + + Poco::Timespan keep_alive_timeout(config.getUInt("keep_alive_timeout", 10), 0); + Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams; + http_params->setTimeout(settings.http_receive_timeout); + http_params->setKeepAliveTimeout(keep_alive_timeout); + + for (const auto & listen_host : listen_hosts) + { + /// HTTP + const char * port_name = "http_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port); + socket.setReceiveTimeout(settings.http_receive_timeout); + socket.setSendTimeout(settings.http_send_timeout); + + return ProtocolServerAdapter( + listen_host, + port_name, + "http://" + address.toString(), + std::make_unique( + context(), createHandlerFactory(*this, async_metrics, "HTTPHandler-factory"), server_pool, socket, http_params)); + }); + + /// HTTPS + port_name = "https_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { +#if USE_SSL + Poco::Net::SecureServerSocket socket; + auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(settings.http_receive_timeout); + socket.setSendTimeout(settings.http_send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "https://" + address.toString(), + std::make_unique( + context(), createHandlerFactory(*this, async_metrics, "HTTPSHandler-factory"), server_pool, socket, http_params)); +#else + UNUSED(port); + throw Exception{"HTTPS protocol is disabled because Poco library was built without NetSSL support.", + ErrorCodes::SUPPORT_IS_DISABLED}; +#endif + }); + + /// TCP + port_name = "tcp_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port); + socket.setReceiveTimeout(settings.receive_timeout); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "native protocol (tcp): " + address.toString(), + std::make_unique( + new TCPHandlerFactory(*this, /* secure */ false, /* proxy protocol */ false), + server_pool, + socket, + new Poco::Net::TCPServerParams)); + }); + + /// TCP with PROXY protocol, see https://github.com/wolfeidau/proxyv2/blob/master/docs/proxy-protocol.txt + port_name = "tcp_with_proxy_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port); + socket.setReceiveTimeout(settings.receive_timeout); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "native protocol (tcp) with PROXY: " + address.toString(), + std::make_unique( + new TCPHandlerFactory(*this, /* secure */ false, /* proxy protocol */ true), + 
server_pool, + socket, + new Poco::Net::TCPServerParams)); + }); + + /// TCP with SSL + port_name = "tcp_port_secure"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { +#if USE_SSL + Poco::Net::SecureServerSocket socket; + auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(settings.receive_timeout); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "secure native protocol (tcp_secure): " + address.toString(), + std::make_unique( + new TCPHandlerFactory(*this, /* secure */ true, /* proxy protocol */ false), + server_pool, + socket, + new Poco::Net::TCPServerParams)); +#else + UNUSED(port); + throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.", + ErrorCodes::SUPPORT_IS_DISABLED}; +#endif + }); + + /// Interserver IO HTTP + port_name = "interserver_http_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port); + socket.setReceiveTimeout(settings.http_receive_timeout); + socket.setSendTimeout(settings.http_send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "replica communication (interserver): http://" + address.toString(), + std::make_unique( + context(), + createHandlerFactory(*this, async_metrics, "InterserverIOHTTPHandler-factory"), + server_pool, + socket, + http_params)); + }); + + port_name = "interserver_https_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { +#if USE_SSL + Poco::Net::SecureServerSocket socket; + auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(settings.http_receive_timeout); + socket.setSendTimeout(settings.http_send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "secure replica communication (interserver): https://" + address.toString(), + std::make_unique( + context(), + createHandlerFactory(*this, async_metrics, "InterserverIOHTTPSHandler-factory"), + server_pool, + socket, + http_params)); +#else + UNUSED(port); + throw Exception{"SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.", + ErrorCodes::SUPPORT_IS_DISABLED}; +#endif + }); + + port_name = "mysql_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(Poco::Timespan()); + socket.setSendTimeout(settings.send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "MySQL compatibility protocol: " + address.toString(), + std::make_unique(new MySQLHandlerFactory(*this), server_pool, socket, new Poco::Net::TCPServerParams)); + }); + + port_name = "postgresql_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); + socket.setReceiveTimeout(Poco::Timespan()); + socket.setSendTimeout(settings.send_timeout); + return 
ProtocolServerAdapter( + listen_host, + port_name, + "PostgreSQL compatibility protocol: " + address.toString(), + std::make_unique(new PostgreSQLHandlerFactory(*this), server_pool, socket, new Poco::Net::TCPServerParams)); + }); + +#if USE_GRPC + port_name = "grpc_port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::SocketAddress server_address(listen_host, port); + return ProtocolServerAdapter( + listen_host, + port_name, + "gRPC protocol: " + server_address.toString(), + std::make_unique(*this, makeSocketAddress(listen_host, port, &logger()))); + }); +#endif + + /// Prometheus (if defined and not setup yet with http_port) + port_name = "prometheus.port"; + createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port); + socket.setReceiveTimeout(settings.http_receive_timeout); + socket.setSendTimeout(settings.http_send_timeout); + return ProtocolServerAdapter( + listen_host, + port_name, + "Prometheus: http://" + address.toString(), + std::make_unique( + context(), createHandlerFactory(*this, async_metrics, "PrometheusHandler-factory"), server_pool, socket, http_params)); + }); + } + +} + +void Server::updateServers( + Poco::Util::AbstractConfiguration & config, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + std::vector & servers) +{ + Poco::Logger * log = &logger(); + /// Gracefully shutdown servers when their port is removed from config + const auto listen_hosts = getListenHosts(config); + const auto listen_try = getListenTry(config); + + for (auto & server : servers) + if (!server.isStopping()) + { + bool has_host = std::find(listen_hosts.begin(), listen_hosts.end(), server.getListenHost()) != listen_hosts.end(); + bool has_port = !config.getString(server.getPortName(), "").empty(); + if (!has_host || !has_port || config.getInt(server.getPortName()) != server.portNumber()) + { + server.stop(); + LOG_INFO(log, "Stopped listening for {}", server.getDescription()); + } + } + + createServers(config, listen_hosts, listen_try, server_pool, async_metrics, servers, /* start_servers: */ true); + + /// Remove servers once all their connections are closed + while (std::any_of(servers.begin(), servers.end(), [](const auto & server) { return server.isStopping(); })) + { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + std::erase_if(servers, [&log](auto & server) + { + if (!server.isStopping()) + return false; + auto is_finished = server.currentConnections() == 0; + if (is_finished) + LOG_DEBUG(log, "Server finished: {}", server.getDescription()); + else + LOG_TRACE(log, "Waiting server to finish: {}", server.getDescription()); + return is_finished; + }); + } +} + } diff --git a/programs/server/Server.h b/programs/server/Server.h index 45e5fccd51d..b4f2ea3bb79 100644 --- a/programs/server/Server.h +++ b/programs/server/Server.h @@ -24,6 +24,8 @@ namespace Poco namespace DB { +class AsynchronousMetrics; +class ProtocolServerAdapter; class Server : public BaseDaemon, public IServer { @@ -67,8 +69,30 @@ private: ContextMutablePtr global_context; Poco::Net::SocketAddress socketBindListen(Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port, [[maybe_unused]] bool secure = false) const; - using CreateServerFunc = std::function; - void createServer(const std::string & listen_host, const char 
* port_name, bool listen_try, CreateServerFunc && func) const; + using CreateServerFunc = std::function; + void createServer( + Poco::Util::AbstractConfiguration & config, + const std::string & listen_host, + const char * port_name, + bool listen_try, + bool start_server, + std::vector & servers, + CreateServerFunc && func) const; + + void createServers( + Poco::Util::AbstractConfiguration & config, + const std::vector & listen_hosts, + bool listen_try, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + std::vector & servers, + bool start_servers = false); + + void updateServers( + Poco::Util::AbstractConfiguration & config, + Poco::ThreadPool & server_pool, + AsynchronousMetrics & async_metrics, + std::vector & servers); }; } diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index cb6c326cb84..4472e975878 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -160,6 +160,7 @@ enum class AccessType M(SYSTEM_FLUSH_DISTRIBUTED, "FLUSH DISTRIBUTED", TABLE, SYSTEM_FLUSH) \ M(SYSTEM_FLUSH_LOGS, "FLUSH LOGS", GLOBAL, SYSTEM_FLUSH) \ M(SYSTEM_FLUSH, "", GROUP, SYSTEM) \ + M(SYSTEM_THREAD_FUZZER, "SYSTEM START THREAD FUZZER, SYSTEM STOP THREAD FUZZER, START THREAD FUZZER, STOP THREAD FUZZER", GLOBAL, SYSTEM) \ M(SYSTEM, "", GROUP, ALL) /* allows to execute SYSTEM {SHUTDOWN|RELOAD CONFIG|...} */ \ \ M(dictGet, "dictHas, dictGetHierarchy, dictIsIn", DICTIONARY, ALL) /* allows to execute functions dictGet(), dictHas(), dictGetHierarchy(), dictIsIn() */\ diff --git a/src/AggregateFunctions/AggregateFunctionContingencyCoefficient.cpp b/src/AggregateFunctions/AggregateFunctionContingencyCoefficient.cpp new file mode 100644 index 00000000000..619abbb8a61 --- /dev/null +++ b/src/AggregateFunctions/AggregateFunctionContingencyCoefficient.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace +{ + +struct ContingencyData : CrossTabData +{ + static const char * getName() + { + return "contingency"; + } + + Float64 getResult() const + { + if (count < 2) + return std::numeric_limits::quiet_NaN(); + + Float64 phi = getPhiSquared(); + return sqrt(phi / (phi + count)); + } +}; + +} + +void registerAggregateFunctionContingency(AggregateFunctionFactory & factory) +{ + factory.registerFunction(ContingencyData::getName(), + [](const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *) + { + assertBinary(name, argument_types); + assertNoParameters(name, parameters); + return std::make_shared>(argument_types); + }); +} + +} diff --git a/src/AggregateFunctions/AggregateFunctionCramersV.cpp b/src/AggregateFunctions/AggregateFunctionCramersV.cpp new file mode 100644 index 00000000000..07b691141bc --- /dev/null +++ b/src/AggregateFunctions/AggregateFunctionCramersV.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace +{ + +struct CramersVData : CrossTabData +{ + static const char * getName() + { + return "cramersV"; + } + + Float64 getResult() const + { + if (count < 2) + return std::numeric_limits::quiet_NaN(); + + UInt64 q = std::min(count_a.size(), count_b.size()); + return sqrt(getPhiSquared() / (q - 1)); + } +}; + +} + +void registerAggregateFunctionCramersV(AggregateFunctionFactory & factory) +{ + factory.registerFunction(CramersVData::getName(), + [](const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *) + { + assertBinary(name, argument_types); 
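+        /// cramersV takes exactly two columns (the two categorical arguments) and no parameters;
+        /// the result is sqrt(phi^2 / (q - 1)), where q is the smaller number of distinct values
+        /// observed in either argument.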
+ assertNoParameters(name, parameters); + return std::make_shared>(argument_types); + }); +} + +} diff --git a/src/AggregateFunctions/AggregateFunctionCramersVBiasCorrected.cpp b/src/AggregateFunctions/AggregateFunctionCramersVBiasCorrected.cpp new file mode 100644 index 00000000000..917869dcd9f --- /dev/null +++ b/src/AggregateFunctions/AggregateFunctionCramersVBiasCorrected.cpp @@ -0,0 +1,54 @@ +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace +{ + +struct CramersVBiasCorrectedData : CrossTabData +{ + static const char * getName() + { + return "cramersVBiasCorrected"; + } + + Float64 getResult() const + { + if (count < 2) + return std::numeric_limits::quiet_NaN(); + + Float64 phi = getPhiSquared(); + + Float64 a_size_adjusted = count_a.size() - 1; + Float64 b_size_adjusted = count_b.size() - 1; + Float64 count_adjusted = count - 1; + + Float64 res = std::max(0.0, phi - a_size_adjusted * b_size_adjusted / count_adjusted); + Float64 correction_a = count_a.size() - a_size_adjusted * a_size_adjusted / count_adjusted; + Float64 correction_b = count_b.size() - b_size_adjusted * b_size_adjusted / count_adjusted; + + res /= std::min(correction_a, correction_b) - 1; + return sqrt(res); + } +}; + +} + +void registerAggregateFunctionCramersVBiasCorrected(AggregateFunctionFactory & factory) +{ + factory.registerFunction(CramersVBiasCorrectedData::getName(), + [](const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *) + { + assertBinary(name, argument_types); + assertNoParameters(name, parameters); + return std::make_shared>(argument_types); + }); +} + +} diff --git a/src/AggregateFunctions/AggregateFunctionIntervalLengthSum.h b/src/AggregateFunctions/AggregateFunctionIntervalLengthSum.h index 5969fca9dcf..443d76f47cb 100644 --- a/src/AggregateFunctions/AggregateFunctionIntervalLengthSum.h +++ b/src/AggregateFunctions/AggregateFunctionIntervalLengthSum.h @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -15,6 +16,7 @@ #include + namespace DB { @@ -23,12 +25,11 @@ namespace ErrorCodes extern const int TOO_LARGE_ARRAY_SIZE; } -/** - * Calculate total length of intervals without intersections. Each interval is the pair of numbers [begin, end]; - * Return UInt64 for integral types (UInt/Int*, Date/DateTime) and return Float64 for Float*. - * - * Implementation simply stores intervals sorted by beginning and sums lengths at final. - */ +/** Calculate total length of intervals without intersections. Each interval is the pair of numbers [begin, end]; + * Returns UInt64 for integral types (UInt/Int*, Date/DateTime) and returns Float64 for Float*. + * + * Implementation simply stores intervals sorted by beginning and sums lengths at final. + */ template struct AggregateFunctionIntervalLengthSumData { @@ -43,10 +44,14 @@ struct AggregateFunctionIntervalLengthSumData void add(T begin, T end) { + /// Reversed intervals are counted by absolute value of their length. 
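        /// A worked illustration of the intent (example values, not from the patch): with this normalization,
        /// add(7, 2) is stored as [2, 7] and contributes length 5, while add(3, 3) contributes nothing.
        /// Later, getIntervalLengthSum() sweeps the segments sorted by begin, e.g. for [1, 4], [3, 6], [8, 9]
        /// it merges the first two into [1, 6] and returns (6 - 1) + (9 - 8) = 6, the length of the union.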
+ if (unlikely(end < begin)) + std::swap(begin, end); + else if (unlikely(begin == end)) + return; + if (sorted && !segments.empty()) - { sorted = segments.back().first <= begin; - } segments.emplace_back(begin, end); } @@ -130,6 +135,11 @@ template class AggregateFunctionIntervalLengthSum final : public IAggregateFunctionDataHelper> { private: + static auto NO_SANITIZE_UNDEFINED length(typename Data::Segment segment) + { + return segment.second - segment.first; + } + template TResult getIntervalLengthSum(Data & data) const { @@ -140,21 +150,24 @@ private: TResult res = 0; - typename Data::Segment cur_segment = data.segments[0]; + typename Data::Segment curr_segment = data.segments[0]; - for (size_t i = 1, sz = data.segments.size(); i < sz; ++i) + for (size_t i = 1, size = data.segments.size(); i < size; ++i) { - /// Check if current interval intersect with next one then add length, otherwise advance interval end - if (cur_segment.second < data.segments[i].first) - { - res += cur_segment.second - cur_segment.first; - cur_segment = data.segments[i]; - } - else - cur_segment.second = std::max(cur_segment.second, data.segments[i].second); - } + const typename Data::Segment & next_segment = data.segments[i]; - res += cur_segment.second - cur_segment.first; + /// Check if current interval intersects with next one then add length, otherwise advance interval end. + if (curr_segment.second < next_segment.first) + { + res += length(curr_segment); + curr_segment = next_segment; + } + else if (next_segment.second > curr_segment.second) + { + curr_segment.second = next_segment.second; + } + } + res += length(curr_segment); return res; } diff --git a/src/AggregateFunctions/AggregateFunctionNothing.cpp b/src/AggregateFunctions/AggregateFunctionNothing.cpp new file mode 100644 index 00000000000..b476806da08 --- /dev/null +++ b/src/AggregateFunctions/AggregateFunctionNothing.cpp @@ -0,0 +1,20 @@ +#include +#include +#include + + +namespace DB +{ + +struct Settings; + +void registerAggregateFunctionNothing(AggregateFunctionFactory & factory) +{ + factory.registerFunction("nothing", [](const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *) + { + assertNoParameters(name, parameters); + return std::make_shared(argument_types, parameters); + }); +} + +} diff --git a/src/AggregateFunctions/AggregateFunctionNothing.h b/src/AggregateFunctions/AggregateFunctionNothing.h index 4374ecf85c3..645ea7c3f8a 100644 --- a/src/AggregateFunctions/AggregateFunctionNothing.h +++ b/src/AggregateFunctions/AggregateFunctionNothing.h @@ -4,6 +4,8 @@ #include #include #include +#include +#include namespace DB @@ -26,7 +28,7 @@ public: DataTypePtr getReturnType() const override { - return argument_types.front(); + return argument_types.empty() ? 
std::make_shared(std::make_shared()) : argument_types.front(); } bool allocatesMemoryInArena() const override { return false; } @@ -62,12 +64,16 @@ public: { } - void serialize(ConstAggregateDataPtr __restrict, WriteBuffer &, std::optional) const override + void serialize(ConstAggregateDataPtr __restrict, WriteBuffer & buf, std::optional) const override { + writeChar('\0', buf); } - void deserialize(AggregateDataPtr, ReadBuffer &, std::optional, Arena *) const override + void deserialize(AggregateDataPtr, ReadBuffer & buf, std::optional, Arena *) const override { + [[maybe_unused]] char symbol; + readChar(symbol, buf); + assert(symbol == '\0'); } void insertResultInto(AggregateDataPtr, IColumn & to, Arena *) const override diff --git a/src/AggregateFunctions/AggregateFunctionTheilsU.cpp b/src/AggregateFunctions/AggregateFunctionTheilsU.cpp new file mode 100644 index 00000000000..96772a0daa8 --- /dev/null +++ b/src/AggregateFunctions/AggregateFunctionTheilsU.cpp @@ -0,0 +1,61 @@ +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace +{ + +struct TheilsUData : CrossTabData +{ + static const char * getName() + { + return "theilsU"; + } + + Float64 getResult() const + { + if (count < 2) + return std::numeric_limits::quiet_NaN(); + + Float64 h_a = 0.0; + for (const auto & [key, value] : count_a) + { + Float64 value_float = value; + h_a += (value_float / count) * log(value_float / count); + } + + Float64 dep = 0.0; + for (const auto & [key, value] : count_ab) + { + Float64 value_ab = value; + Float64 value_b = count_b.at(key.items[1]); + + dep += (value_ab / count) * log(value_ab / value_b); + } + + dep -= h_a; + dep /= h_a; + return dep; + } +}; + +} + +void registerAggregateFunctionTheilsU(AggregateFunctionFactory & factory) +{ + factory.registerFunction(TheilsUData::getName(), + [](const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *) + { + assertBinary(name, argument_types); + assertNoParameters(name, parameters); + return std::make_shared>(argument_types); + }); +} + +} diff --git a/src/AggregateFunctions/CrossTab.h b/src/AggregateFunctions/CrossTab.h new file mode 100644 index 00000000000..e01ebcf71ed --- /dev/null +++ b/src/AggregateFunctions/CrossTab.h @@ -0,0 +1,175 @@ +#pragma once + +#include +#include +#include +#include +#include + + +/** Aggregate function that calculates statistics on top of cross-tab: + * - histogram of every argument and every pair of elements. + * These statistics include: + * - Cramer's V; + * - Theil's U; + * - contingency coefficient; + * It can be interpreted as interdependency coefficient between arguments; + * or non-parametric correlation coefficient. + */ +namespace DB +{ + +struct CrossTabData +{ + /// Total count. + UInt64 count = 0; + + /// Count of every value of the first and second argument (values are pre-hashed). + /// Note: non-cryptographic 64bit hash is used, it means that the calculation is approximate. + HashMapWithStackMemory count_a; + HashMapWithStackMemory count_b; + + /// Count of every pair of values. We pack two hashes into UInt128. 
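    /// A small worked example (values are illustrative): after add(hash('x'), hash('u')) and add(hash('x'), hash('v')),
    /// count = 2, count_a[hash('x')] = 2, count_b[hash('u')] = count_b[hash('v')] = 1,
    /// and count_ab holds one entry for each distinct (hash1, hash2) pair packed into a UInt128 key.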
+ HashMapWithStackMemory count_ab; + + + void add(UInt64 hash1, UInt64 hash2) + { + ++count; + ++count_a[hash1]; + ++count_b[hash2]; + + UInt128 hash_pair{hash1, hash2}; + ++count_ab[hash_pair]; + } + + void merge(const CrossTabData & other) + { + count += other.count; + for (const auto & [key, value] : other.count_a) + count_a[key] += value; + for (const auto & [key, value] : other.count_b) + count_b[key] += value; + for (const auto & [key, value] : other.count_ab) + count_ab[key] += value; + } + + void serialize(WriteBuffer & buf) const + { + writeBinary(count, buf); + count_a.write(buf); + count_b.write(buf); + count_ab.write(buf); + } + + void deserialize(ReadBuffer & buf) + { + readBinary(count, buf); + count_a.read(buf); + count_b.read(buf); + count_ab.read(buf); + } + + /** See https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V + * + * φ² is χ² divided by the sample size (count). + * χ² is the sum of squares of the normalized differences between the "expected" and "observed" statistics. + * ("Expected" in the case when one of the hypotheses is true). + * Something resembling the L2 distance. + * + * Note: statisticians use the name χ² for every statistic that has χ² distribution in many various contexts. + * + * Let's suppose that there is no association between the values a and b. + * Then the frequency (e.g. probability) of (a, b) pair is equal to the multiplied frequencies of a and b: + * count_ab / count = (count_a / count) * (count_b / count) + * count_ab = count_a * count_b / count + * + * Let's calculate the difference between the values that are supposed to be equal if there is no association between a and b: + * count_ab - count_a * count_b / count + * + * Let's sum the squares of the differences across all (a, b) pairs. + * Then divide by the second term for normalization: (count_a * count_b / count) + * + * This will be the χ² statistics. + * This statistics is used as a base for many other statistics. 
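 *
 * A worked example with assumed counts (purely illustrative): for two binary columns with n = 100 rows and
 * observed pair counts (A,X) = 30, (A,Y) = 20, (B,X) = 20, (B,Y) = 30, every expected count is 50 * 50 / 100 = 25,
 * so χ² = 4 · (5² / 25) = 4, φ² = χ² / n = 0.04, and Cramer's V = sqrt(φ² / (min(r, c) - 1)) = sqrt(0.04 / 1) = 0.2.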
+ */ + Float64 getPhiSquared() const + { + Float64 chi_squared = 0; + for (const auto & [key, value_ab] : count_ab) + { + Float64 value_a = count_a.at(key.items[0]); + Float64 value_b = count_b.at(key.items[1]); + + Float64 expected_value_ab = (value_a * value_b) / count; + + Float64 chi_squared_elem = value_ab - expected_value_ab; + chi_squared_elem = chi_squared_elem * chi_squared_elem / expected_value_ab; + + chi_squared += chi_squared_elem; + } + return chi_squared / count; + } +}; + + +template +class AggregateFunctionCrossTab : public IAggregateFunctionDataHelper> +{ +public: + AggregateFunctionCrossTab(const DataTypes & arguments) + : IAggregateFunctionDataHelper>({arguments}, {}) + { + } + + String getName() const override + { + return Data::getName(); + } + + bool allocatesMemoryInArena() const override + { + return false; + } + + DataTypePtr getReturnType() const override + { + return std::make_shared>(); + } + + void add( + AggregateDataPtr __restrict place, + const IColumn ** columns, + size_t row_num, + Arena *) const override + { + UInt64 hash1 = UniqVariadicHash::apply(1, &columns[0], row_num); + UInt64 hash2 = UniqVariadicHash::apply(1, &columns[1], row_num); + + this->data(place).add(hash1, hash2); + } + + void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override + { + this->data(place).merge(this->data(rhs)); + } + + void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional) const override + { + this->data(place).serialize(buf); + } + + void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional, Arena *) const override + { + this->data(place).deserialize(buf); + } + + void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override + { + Float64 result = this->data(place).getResult(); + auto & column = static_cast &>(to); + column.getData().push_back(result); + } +}; + +} diff --git a/src/AggregateFunctions/registerAggregateFunctions.cpp b/src/AggregateFunctions/registerAggregateFunctions.cpp index 79a418ac69f..33f6a532224 100644 --- a/src/AggregateFunctions/registerAggregateFunctions.cpp +++ b/src/AggregateFunctions/registerAggregateFunctions.cpp @@ -48,8 +48,13 @@ void registerAggregateFunctionRankCorrelation(AggregateFunctionFactory &); void registerAggregateFunctionMannWhitney(AggregateFunctionFactory &); void registerAggregateFunctionWelchTTest(AggregateFunctionFactory &); void registerAggregateFunctionStudentTTest(AggregateFunctionFactory &); +void registerAggregateFunctionCramersV(AggregateFunctionFactory &); +void registerAggregateFunctionTheilsU(AggregateFunctionFactory &); +void registerAggregateFunctionContingency(AggregateFunctionFactory &); +void registerAggregateFunctionCramersVBiasCorrected(AggregateFunctionFactory &); void registerAggregateFunctionSingleValueOrNull(AggregateFunctionFactory &); void registerAggregateFunctionSequenceNextNode(AggregateFunctionFactory &); +void registerAggregateFunctionNothing(AggregateFunctionFactory &); void registerAggregateFunctionExponentialMovingAverage(AggregateFunctionFactory &); void registerAggregateFunctionSparkbar(AggregateFunctionFactory &); void registerAggregateFunctionIntervalLengthSum(AggregateFunctionFactory &); @@ -99,6 +104,10 @@ void registerAggregateFunctions() registerAggregateFunctionUniqUpTo(factory); registerAggregateFunctionTopK(factory); registerAggregateFunctionsBitwise(factory); + registerAggregateFunctionCramersV(factory); + registerAggregateFunctionTheilsU(factory); + 
registerAggregateFunctionContingency(factory); + registerAggregateFunctionCramersVBiasCorrected(factory); registerAggregateFunctionsBitmap(factory); registerAggregateFunctionsMaxIntersections(factory); registerAggregateFunctionHistogram(factory); @@ -114,6 +123,7 @@ void registerAggregateFunctions() registerAggregateFunctionSequenceNextNode(factory); registerAggregateFunctionWelchTTest(factory); registerAggregateFunctionStudentTTest(factory); + registerAggregateFunctionNothing(factory); registerAggregateFunctionSingleValueOrNull(factory); registerAggregateFunctionIntervalLengthSum(factory); registerAggregateFunctionExponentialMovingAverage(factory); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7124961821e..b51fad2ca25 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -526,6 +526,14 @@ if (USE_BZIP2) target_include_directories (clickhouse_common_io SYSTEM BEFORE PRIVATE ${BZIP2_INCLUDE_DIR}) endif() +if(USE_SIMDJSON) + dbms_target_link_libraries(PRIVATE simdjson) +endif() + +if(USE_RAPIDJSON) + dbms_target_include_directories(SYSTEM PRIVATE ${RAPIDJSON_INCLUDE_DIR}) +endif() + dbms_target_link_libraries(PUBLIC consistent-hashing) include ("${ClickHouse_SOURCE_DIR}/cmake/add_check.cmake") @@ -559,6 +567,7 @@ if (ENABLE_TESTS AND USE_GTEST) clickhouse_storages_system dbms clickhouse_common_zookeeper + clickhouse_common_config string_utils) add_check(unit_tests_dbms) diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 75e0588f786..0938a9cfee5 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include "Common/Exception.h" @@ -65,6 +66,11 @@ namespace fs = std::filesystem; using namespace std::literals; +namespace CurrentMetrics +{ + extern const Metric MemoryTracking; +} + namespace DB { @@ -800,7 +806,7 @@ void ClientBase::onProfileEvents(Block & block) if (rows == 0) return; - if (progress_indication.print_hardware_utilization) + if (server_revision >= DBMS_MIN_PROTOCOL_VERSION_WITH_INCREMENTAL_PROFILE_EVENTS) { const auto & array_thread_id = typeid_cast(*block.getByName("thread_id").column).getData(); const auto & names = typeid_cast(*block.getByName("name").column); @@ -834,25 +840,25 @@ void ClientBase::onProfileEvents(Block & block) } auto elapsed_time = profile_events.watch.elapsedMicroseconds(); progress_indication.updateThreadEventData(thread_times, elapsed_time); - } - if (profile_events.print) - { - if (profile_events.watch.elapsedMilliseconds() >= profile_events.delay_ms) + if (profile_events.print) { - initLogsOutputStream(); - progress_indication.clearProgressOutput(); - logs_out_stream->writeProfileEvents(block); - logs_out_stream->flush(); + if (profile_events.watch.elapsedMilliseconds() >= profile_events.delay_ms) + { + initLogsOutputStream(); + progress_indication.clearProgressOutput(); + logs_out_stream->writeProfileEvents(block); + logs_out_stream->flush(); - profile_events.last_block = {}; - } - else - { - incrementProfileEventsBlock(profile_events.last_block, block); + profile_events.last_block = {}; + } + else + { + incrementProfileEventsBlock(profile_events.last_block, block); + } } + profile_events.watch.restart(); } - profile_events.watch.restart(); } @@ -1812,6 +1818,7 @@ void ClientBase::init(int argc, char ** argv) ("interactive", "Process queries-file or --query query and start interactive mode") ("pager", po::value(), "Pipe all output into this command (less or similar)") + ("max_memory_usage_in_client", po::value(), "Set 
memory limit in client/local server") ; addOptions(options_description); @@ -1909,8 +1916,6 @@ void ClientBase::init(int argc, char ** argv) Poco::Logger::root().setLevel(options["log-level"].as()); if (options.count("server_logs_file")) server_logs_file = options["server_logs_file"].as(); - if (options.count("hardware-utilization")) - progress_indication.print_hardware_utilization = true; query_processing_stage = QueryProcessingStage::fromString(options["stage"].as()); profile_events.print = options.count("print-profile-events"); @@ -1919,6 +1924,15 @@ void ClientBase::init(int argc, char ** argv) processOptions(options_description, options, external_tables_arguments); argsToConfig(common_arguments, config(), 100); clearPasswordFromCommandLine(argc, argv); + + /// Limit on total memory usage + size_t max_client_memory_usage = config().getInt64("max_memory_usage_in_client", 0 /*default value*/); + if (max_client_memory_usage != 0) + { + total_memory_tracker.setHardLimit(max_client_memory_usage); + total_memory_tracker.setDescription("(total)"); + total_memory_tracker.setMetric(CurrentMetrics::MemoryTracking); + } } } diff --git a/src/Common/Config/CMakeLists.txt b/src/Common/Config/CMakeLists.txt index 3da44be2af6..4d72960f727 100644 --- a/src/Common/Config/CMakeLists.txt +++ b/src/Common/Config/CMakeLists.txt @@ -4,6 +4,7 @@ set (SRCS configReadClient.cpp ConfigReloader.cpp YAMLParser.cpp + ConfigHelper.cpp ) add_library(clickhouse_common_config ${SRCS}) diff --git a/src/Common/Config/ConfigHelper.cpp b/src/Common/Config/ConfigHelper.cpp new file mode 100644 index 00000000000..69fe42de86c --- /dev/null +++ b/src/Common/Config/ConfigHelper.cpp @@ -0,0 +1,23 @@ +#include +#include + +namespace DB +{ + +namespace ConfigHelper +{ + +bool getBool(const Poco::Util::AbstractConfiguration & config, const std::string & key, bool default_, bool empty_as) +{ + if (!config.has(key)) + return default_; + Poco::Util::AbstractConfiguration::Keys sub_keys; + config.keys(key, sub_keys); + if (sub_keys.empty() && config.getString(key).empty()) + return empty_as; + return config.getBool(key, default_); +} + +} + +} diff --git a/src/Common/Config/ConfigHelper.h b/src/Common/Config/ConfigHelper.h new file mode 100644 index 00000000000..62271bbaf0a --- /dev/null +++ b/src/Common/Config/ConfigHelper.h @@ -0,0 +1,18 @@ +#pragma once + +namespace Poco +{ + namespace Util + { + class AbstractConfiguration; + } +} + +namespace DB::ConfigHelper +{ + +/// The behavior is like `config.getBool(key, default_)`, +/// except when the tag is empty (aka. self-closing), `empty_as` will be used instead of throwing Poco::Exception. +bool getBool(const Poco::Util::AbstractConfiguration & config, const std::string & key, bool default_, bool empty_as); + +} diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 5c9ba177b78..896168253cf 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -63,8 +63,10 @@ M(MaxDDLEntryID, "Max processed DDL entry of DDLWorker.") \ M(MaxPushedDDLEntryID, "Max DDL entry of DDLWorker that pushed to zookeeper.") \ M(PartsTemporary, "The part is generating now, it is not in data_parts list.") \ - M(PartsPreCommitted, "The part is in data_parts, but not used for SELECTs.") \ - M(PartsCommitted, "Active data part, used by current and upcoming SELECTs.") \ + M(PartsPreCommitted, "Deprecated. See PartsPreActive.") \ + M(PartsCommitted, "Deprecated. 
See PartsActive.") \ + M(PartsPreActive, "The part is in data_parts, but not used for SELECTs.") \ + M(PartsActive, "Active data part, used by current and upcoming SELECTs.") \ M(PartsOutdated, "Not active data part, but could be used by only current SELECTs, could be deleted after SELECTs finishes.") \ M(PartsDeleting, "Not active data part with identity refcounter, it is deleting right now by a cleaner.") \ M(PartsDeleteOnDestroy, "Part was moved to another disk and should be deleted in own destructor.") \ diff --git a/src/Common/DateLUTImpl.cpp b/src/Common/DateLUTImpl.cpp index ebf32c4dbd9..869954bb2ae 100644 --- a/src/Common/DateLUTImpl.cpp +++ b/src/Common/DateLUTImpl.cpp @@ -174,6 +174,20 @@ DateLUTImpl::DateLUTImpl(const std::string & time_zone_) { years_months_lut[year_months_lut_index] = first_day_of_last_month; } + + /// Fill saturated LUT. + { + ssize_t day = DATE_LUT_SIZE - 1; + for (; day >= 0; --day) + { + if (lut[day].date >= 0) + lut_saturated[day] = lut[day]; + else + break; + } + for (; day >= 0; --day) + lut_saturated[day] = lut_saturated[day + 1]; + } } diff --git a/src/Common/DateLUTImpl.h b/src/Common/DateLUTImpl.h index e52e6547fa2..c178dc58854 100644 --- a/src/Common/DateLUTImpl.h +++ b/src/Common/DateLUTImpl.h @@ -61,6 +61,8 @@ private: // has to be a separate type to support overloading // TODO: make sure that any arithmetic on LUTIndex actually results in valid LUTIndex. STRONG_TYPEDEF(UInt32, LUTIndex) + // Same as above but select different function overloads for zero saturation. + STRONG_TYPEDEF(UInt32, LUTIndexWithSaturation) template friend inline LUTIndex operator+(const LUTIndex & index, const T v) @@ -182,6 +184,9 @@ private: /// In comparison to std::vector, plain array is cheaper by one indirection. Values lut[DATE_LUT_SIZE + 1]; + /// Same as above but with dates < 1970-01-01 saturated to 1970-01-01. + Values lut_saturated[DATE_LUT_SIZE + 1]; + /// Year number after DATE_LUT_MIN_YEAR -> LUTIndex in lut for start of year. LUTIndex years_lut[DATE_LUT_YEARS]; @@ -278,19 +283,39 @@ public: auto getOffsetAtStartOfEpoch() const { return offset_at_start_of_epoch; } auto getTimeOffsetAtStartOfLUT() const { return offset_at_start_of_lut; } - auto getDayNumOffsetEpoch() const { return daynum_offset_epoch; } + static auto getDayNumOffsetEpoch() { return daynum_offset_epoch; } /// All functions below are thread-safe; arguments are not checked. - inline ExtendedDayNum toDayNum(ExtendedDayNum d) const + static ExtendedDayNum toDayNum(ExtendedDayNum d) { return d; } - template - inline ExtendedDayNum toDayNum(DateOrTime v) const + static UInt32 saturateMinus(UInt32 x, UInt32 y) { - return ExtendedDayNum{static_cast(toLUTIndex(v).toUnderType() - daynum_offset_epoch)}; + UInt32 res = x - y; + res &= -Int32(res <= x); + return res; + } + + static ExtendedDayNum toDayNum(LUTIndex d) + { + return ExtendedDayNum{static_cast(d.toUnderType() - daynum_offset_epoch)}; + } + + static DayNum toDayNum(LUTIndexWithSaturation d) + { + return DayNum{static_cast(saturateMinus(d.toUnderType(), daynum_offset_epoch))}; + } + + template + inline auto toDayNum(DateOrTime v) const + { + if constexpr (std::is_unsigned_v || std::is_same_v) + return DayNum{static_cast(saturateMinus(toLUTIndex(v).toUnderType(), daynum_offset_epoch))}; + else + return ExtendedDayNum{static_cast(toLUTIndex(v).toUnderType() - daynum_offset_epoch)}; } /// Round down to start of monday. 
@@ -298,14 +323,20 @@ public: inline Time toFirstDayOfWeek(DateOrTime v) const { const LUTIndex i = toLUTIndex(v); - return lut[i - (lut[i].day_of_week - 1)].date; + if constexpr (std::is_unsigned_v || std::is_same_v) + return lut_saturated[i - (lut[i].day_of_week - 1)].date; + else + return lut[i - (lut[i].day_of_week - 1)].date; } template - inline ExtendedDayNum toFirstDayNumOfWeek(DateOrTime v) const + inline auto toFirstDayNumOfWeek(DateOrTime v) const { const LUTIndex i = toLUTIndex(v); - return toDayNum(i - (lut[i].day_of_week - 1)); + if constexpr (std::is_unsigned_v || std::is_same_v) + return toDayNum(LUTIndexWithSaturation(i - (lut[i].day_of_week - 1))); + else + return toDayNum(LUTIndex(i - (lut[i].day_of_week - 1))); } /// Round down to start of month. @@ -313,21 +344,30 @@ public: inline Time toFirstDayOfMonth(DateOrTime v) const { const LUTIndex i = toLUTIndex(v); - return lut[i - (lut[i].day_of_month - 1)].date; + if constexpr (std::is_unsigned_v || std::is_same_v) + return lut_saturated[i - (lut[i].day_of_month - 1)].date; + else + return lut[i - (lut[i].day_of_month - 1)].date; } template - inline ExtendedDayNum toFirstDayNumOfMonth(DateOrTime v) const + inline auto toFirstDayNumOfMonth(DateOrTime v) const { const LUTIndex i = toLUTIndex(v); - return toDayNum(i - (lut[i].day_of_month - 1)); + if constexpr (std::is_unsigned_v || std::is_same_v) + return toDayNum(LUTIndexWithSaturation(i - (lut[i].day_of_month - 1))); + else + return toDayNum(LUTIndex(i - (lut[i].day_of_month - 1))); } /// Round down to start of quarter. template - inline ExtendedDayNum toFirstDayNumOfQuarter(DateOrTime v) const + inline auto toFirstDayNumOfQuarter(DateOrTime v) const { - return toDayNum(toFirstDayOfQuarterIndex(v)); + if constexpr (std::is_unsigned_v || std::is_same_v) + return toDayNum(LUTIndexWithSaturation(toFirstDayOfQuarterIndex(v))); + else + return toDayNum(LUTIndex(toFirstDayOfQuarterIndex(v))); } template @@ -365,9 +405,12 @@ public: } template - inline ExtendedDayNum toFirstDayNumOfYear(DateOrTime v) const + inline auto toFirstDayNumOfYear(DateOrTime v) const { - return toDayNum(toFirstDayNumOfYearIndex(v)); + if constexpr (std::is_unsigned_v || std::is_same_v) + return toDayNum(LUTIndexWithSaturation(toFirstDayNumOfYearIndex(v))); + else + return toDayNum(LUTIndex(toFirstDayNumOfYearIndex(v))); } inline Time toFirstDayOfNextMonth(Time t) const @@ -514,11 +557,17 @@ public: * because the same calendar day starts/ends at different timestamps in different time zones) */ - inline Time fromDayNum(DayNum d) const { return lut[toLUTIndex(d)].date; } + inline Time fromDayNum(DayNum d) const { return lut_saturated[toLUTIndex(d)].date; } inline Time fromDayNum(ExtendedDayNum d) const { return lut[toLUTIndex(d)].date; } template - inline Time toDate(DateOrTime v) const { return lut[toLUTIndex(v)].date; } + inline Time toDate(DateOrTime v) const + { + if constexpr (std::is_unsigned_v || std::is_same_v) + return lut_saturated[toLUTIndex(v)].date; + else + return lut[toLUTIndex(v)].date; + } template inline unsigned toMonth(DateOrTime v) const { return lut[toLUTIndex(v)].month; } @@ -581,9 +630,12 @@ public: } template - inline ExtendedDayNum toFirstDayNumOfISOYear(DateOrTime v) const + inline auto toFirstDayNumOfISOYear(DateOrTime v) const { - return toDayNum(toFirstDayNumOfISOYearIndex(v)); + if constexpr (std::is_unsigned_v || std::is_same_v) + return toDayNum(LUTIndexWithSaturation(toFirstDayNumOfISOYearIndex(v))); + else + return toDayNum(LUTIndex(toFirstDayNumOfISOYearIndex(v))); } 
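    /// A concrete illustration of the saturating overloads above (dates chosen for the example): for a Date/DayNum
    /// argument equal to 1970-01-01, which was a Thursday, toFirstDayNumOfWeek() points at Monday 1969-12-29;
    /// the unsigned path clamps that to DayNum(0), i.e. 1970-01-01, while the ExtendedDayNum/Date32 path
    /// still returns the signed value -3 (three days before the epoch).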
inline Time toFirstDayOfISOYear(Time t) const @@ -596,7 +648,7 @@ public: template inline unsigned toISOWeek(DateOrTime v) const { - return 1 + (toFirstDayNumOfWeek(v) - toFirstDayNumOfISOYear(v)) / 7; + return 1 + (toFirstDayNumOfWeek(v) - toDayNum(toFirstDayNumOfISOYearIndex(v))) / 7; } /* @@ -662,7 +714,7 @@ public: { if (!week_year_mode && ((first_weekday_mode && weekday != 0) || (!first_weekday_mode && weekday >= 4))) return yw; - week_year_mode = 1; + week_year_mode = true; (yw.first)--; first_daynr -= (days = calc_days_in_year(yw.first)); weekday = (weekday + 53 * 7 - days) % 7; @@ -724,7 +776,7 @@ public: /// Get first day of week with week_mode, return Sunday or Monday template - inline ExtendedDayNum toFirstDayNumOfWeek(DateOrTime v, UInt8 week_mode) const + inline auto toFirstDayNumOfWeek(DateOrTime v, UInt8 week_mode) const { bool monday_first_mode = week_mode & static_cast(WeekModeFlag::MONDAY_FIRST); if (monday_first_mode) @@ -733,7 +785,10 @@ public: } else { - return (toDayOfWeek(v) != 7) ? ExtendedDayNum(v - toDayOfWeek(v)) : toDayNum(v); + if constexpr (std::is_unsigned_v || std::is_same_v) + return (toDayOfWeek(v) != 7) ? DayNum(saturateMinus(v, toDayOfWeek(v))) : toDayNum(v); + else + return (toDayOfWeek(v) != 7) ? ExtendedDayNum(v - toDayOfWeek(v)) : toDayNum(v); } } @@ -809,7 +864,7 @@ public: } template - inline ExtendedDayNum toStartOfYearInterval(DateOrTime v, UInt64 years) const + inline auto toStartOfYearInterval(DateOrTime v, UInt64 years) const { if (years == 1) return toFirstDayNumOfYear(v); @@ -822,39 +877,59 @@ public: if (unlikely(year < DATE_LUT_MIN_YEAR)) year = DATE_LUT_MIN_YEAR; - return toDayNum(years_lut[year - DATE_LUT_MIN_YEAR]); + if constexpr (std::is_unsigned_v || std::is_same_v) + return toDayNum(LUTIndexWithSaturation(years_lut[year - DATE_LUT_MIN_YEAR])); + else + return toDayNum(years_lut[year - DATE_LUT_MIN_YEAR]); } - inline ExtendedDayNum toStartOfQuarterInterval(ExtendedDayNum d, UInt64 quarters) const + template || std::is_same_v>> + inline auto toStartOfQuarterInterval(Date d, UInt64 quarters) const { if (quarters == 1) return toFirstDayNumOfQuarter(d); return toStartOfMonthInterval(d, quarters * 3); } - inline ExtendedDayNum toStartOfMonthInterval(ExtendedDayNum d, UInt64 months) const + template || std::is_same_v>> + inline auto toStartOfMonthInterval(Date d, UInt64 months) const { if (months == 1) return toFirstDayNumOfMonth(d); const Values & values = lut[toLUTIndex(d)]; UInt32 month_total_index = (values.year - DATE_LUT_MIN_YEAR) * 12 + values.month - 1; - return toDayNum(years_months_lut[month_total_index / months * months]); + if constexpr (std::is_same_v) + return toDayNum(LUTIndexWithSaturation(years_months_lut[month_total_index / months * months])); + else + return toDayNum(years_months_lut[month_total_index / months * months]); } - inline ExtendedDayNum toStartOfWeekInterval(ExtendedDayNum d, UInt64 weeks) const + template || std::is_same_v>> + inline auto toStartOfWeekInterval(Date d, UInt64 weeks) const { if (weeks == 1) return toFirstDayNumOfWeek(d); UInt64 days = weeks * 7; // January 1st 1970 was Thursday so we need this 4-days offset to make weeks start on Monday. 
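        // Worked example with assumed arguments: for weeks = 2 (days = 14) and d = 20 (1970-01-21, a Wednesday),
        // 4 + (20 - 4) / 14 * 14 = 18, i.e. Monday 1970-01-19; interval starts fall on days 4, 18, 32, ... - all Mondays.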
- return ExtendedDayNum(4 + (d - 4) / days * days); + if constexpr (std::is_same_v) + return DayNum(4 + (d - 4) / days * days); + else + return ExtendedDayNum(4 + (d - 4) / days * days); } - inline Time toStartOfDayInterval(ExtendedDayNum d, UInt64 days) const + template || std::is_same_v>> + inline Time toStartOfDayInterval(Date d, UInt64 days) const { if (days == 1) return toDate(d); - return lut[toLUTIndex(ExtendedDayNum(d / days * days))].date; + if constexpr (std::is_same_v) + return lut_saturated[toLUTIndex(ExtendedDayNum(d / days * days))].date; + else + return lut[toLUTIndex(ExtendedDayNum(d / days * days))].date; } inline Time toStartOfHourInterval(Time t, UInt64 hours) const @@ -1140,7 +1215,11 @@ public: /// If resulting month has less deys than source month, then saturation can happen. /// Example: 31 Aug + 1 month = 30 Sep. - inline Time NO_SANITIZE_UNDEFINED addMonths(Time t, Int64 delta) const + template < + typename DateTime, + typename + = std::enable_if_t || std::is_same_v || std::is_same_v>> + inline Time NO_SANITIZE_UNDEFINED addMonths(DateTime t, Int64 delta) const { const auto result_day = addMonthsIndex(t, delta); @@ -1154,20 +1233,28 @@ public: if (time >= lut[result_day].time_at_offset_change()) time -= lut[result_day].amount_of_offset_change(); - return lut[result_day].date + time; + auto res = lut[result_day].date + time; + if constexpr (std::is_same_v) + { + /// Common compiler should generate branchless code for this saturation operation. + return res <= 0 ? 0 : res; + } + else + return res; } - inline ExtendedDayNum NO_SANITIZE_UNDEFINED addMonths(ExtendedDayNum d, Int64 delta) const + template || std::is_same_v>> + inline auto NO_SANITIZE_UNDEFINED addMonths(Date d, Int64 delta) const { - return toDayNum(addMonthsIndex(d, delta)); + if constexpr (std::is_same_v) + return toDayNum(LUTIndexWithSaturation(addMonthsIndex(d, delta))); + else + return toDayNum(addMonthsIndex(d, delta)); } - inline Time NO_SANITIZE_UNDEFINED addQuarters(Time t, Int32 delta) const - { - return addMonths(t, static_cast(delta) * 3); - } - - inline ExtendedDayNum addQuarters(ExtendedDayNum d, Int32 delta) const + template + inline auto addQuarters(DateOrTime d, Int32 delta) const { return addMonths(d, static_cast(delta) * 3); } @@ -1189,7 +1276,11 @@ public: } /// Saturation can occur if 29 Feb is mapped to non-leap year. - inline Time addYears(Time t, Int64 delta) const + template < + typename DateTime, + typename + = std::enable_if_t || std::is_same_v || std::is_same_v>> + inline Time addYears(DateTime t, Int64 delta) const { auto result_day = addYearsIndex(t, delta); @@ -1203,12 +1294,24 @@ public: if (time >= lut[result_day].time_at_offset_change()) time -= lut[result_day].amount_of_offset_change(); - return lut[result_day].date + time; + auto res = lut[result_day].date + time; + if constexpr (std::is_same_v) + { + /// Common compiler should generate branchless code for this saturation operation. + return res <= 0 ? 
0 : res; + } + else + return res; } - inline ExtendedDayNum addYears(ExtendedDayNum d, Int64 delta) const + template || std::is_same_v>> + inline auto addYears(Date d, Int64 delta) const { - return toDayNum(addYearsIndex(d, delta)); + if constexpr (std::is_same_v) + return toDayNum(LUTIndexWithSaturation(addYearsIndex(d, delta))); + else + return toDayNum(addYearsIndex(d, delta)); } diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 70d85433513..ef2be3b2164 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -603,6 +603,9 @@ M(632, UNEXPECTED_DATA_AFTER_PARSED_VALUE) \ M(633, QUERY_IS_NOT_SUPPORTED_IN_WINDOW_VIEW) \ M(634, MONGODB_ERROR) \ + M(635, CANNOT_POLL) \ + M(636, CANNOT_EXTRACT_TABLE_STRUCTURE) \ + M(637, INVALID_TABLE_OVERRIDE) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Common/HashTable/HashMap.h b/src/Common/HashTable/HashMap.h index c5675d4d7c9..236a6d65707 100644 --- a/src/Common/HashTable/HashMap.h +++ b/src/Common/HashTable/HashMap.h @@ -10,6 +10,13 @@ * Also, key in hash table must be of type, that zero bytes is compared equals to zero key. */ +namespace DB +{ +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} +} struct NoInitTag { @@ -262,6 +269,13 @@ public: return it->getMapped(); } + + const typename Cell::Mapped & ALWAYS_INLINE at(const Key & x) const + { + if (auto it = this->find(x); it != this->end()) + return it->getMapped(); + throw DB::Exception("Cannot find element in HashMap::at method", DB::ErrorCodes::LOGICAL_ERROR); + } }; namespace std diff --git a/src/Common/LRUCache.h b/src/Common/LRUCache.h index bbc09fd3aff..480a03ab399 100644 --- a/src/Common/LRUCache.h +++ b/src/Common/LRUCache.h @@ -64,6 +64,18 @@ public: setImpl(key, mapped, lock); } + void remove(const Key & key) + { + std::lock_guard lock(mutex); + auto it = cells.find(key); + if (it == cells.end()) + return; + auto & cell = it->second; + current_size -= cell.size; + queue.erase(cell.queue_iterator); + cells.erase(it); + } + /// If the value for the key is in the cache, returns it. If it is not, calls load_func() to /// produce it, saves the result in the cache and returns it. /// Only one of several concurrent threads calling getOrSet() will call load_func(), diff --git a/src/Common/LRUResourceCache.h b/src/Common/LRUResourceCache.h new file mode 100644 index 00000000000..e1a28e7ab60 --- /dev/null +++ b/src/Common/LRUResourceCache.h @@ -0,0 +1,392 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +template +struct TrivailLRUResourceCacheWeightFunction +{ + size_t operator()(const T &) const { return 1; } +}; + +/** + * Similar to implementation in LRUCache.h, but with the difference that keys can + * only be evicted when they are releasable. Release state is controlled by this implementation. + * get() and getOrSet() methods return a Holder to actual value, which does release() in destructor. + * + * Warning (!): This implementation is in development, not to be used. 
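 *
 * A usage sketch under that caveat (type and variable names here are illustrative, not from the patch):
 *
 *     LRUResourceCache<String, Entry> cache(max_weight);
 *     if (auto holder = cache.getOrSet(key, [] { return std::make_shared<Entry>(); }))
 *         use(holder->value());   /// the entry cannot be evicted while a holder references it
 *     /// ~MappedHolder() calls release(); once the reference count drops to zero the entry is evictable again.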
+ */ +template , + typename HashFunction = std::hash> +class LRUResourceCache +{ +public: + using Key = TKey; + using Mapped = TMapped; + using MappedPtr = std::shared_ptr; + + class MappedHolder + { + public: + MappedHolder(LRUResourceCache * cache_, const Key & key_, MappedPtr value_) + : cache(cache_), key(key_), val(value_) {} + + ~MappedHolder() { cache->release(key); } + + Mapped & value() { return *val; } + + protected: + LRUResourceCache * cache; + Key key; + MappedPtr val; + }; + + using MappedHolderPtr = std::unique_ptr; + + explicit LRUResourceCache(size_t max_weight_, size_t max_element_size_ = 0) + : max_weight(max_weight_), max_element_size(max_element_size_) {} + + MappedHolderPtr get(const Key & key) + { + auto mapped_ptr = getImpl(key); + if (!mapped_ptr) + return nullptr; + return std::make_unique(this, key, mapped_ptr); + } + + template + MappedHolderPtr getOrSet(const Key & key, LoadFunc && load_func) + { + auto mapped_ptr = getImpl(key, load_func); + if (!mapped_ptr) + return nullptr; + return std::make_unique(this, key, mapped_ptr); + } + + // If the key's reference_count = 0, delete it immediately. + // Otherwise, mark it expired (not visible to get()), and delete when refcount is 0. + void tryRemove(const Key & key) + { + std::lock_guard lock(mutex); + auto it = cells.find(key); + if (it == cells.end()) + return; + auto & cell = it->second; + if (cell.reference_count == 0) + { + queue.erase(cell.queue_iterator); + current_weight -= cell.weight; + cells.erase(it); + } + else + cell.expired = true; + } + + size_t weight() + { + std::lock_guard lock(mutex); + return current_weight; + } + + size_t size() + { + std::lock_guard lock(mutex); + return cells.size(); + } + + void getStats(size_t & out_hits, size_t & out_misses, size_t & out_evict_count) const + { + out_hits = hits; + out_misses = misses; + out_evict_count = evict_count; + } + +private: + mutable std::mutex mutex; + + using LRUQueue = std::list; + using LRUQueueIterator = typename LRUQueue::iterator; + + struct Cell + { + MappedPtr value; + size_t weight = 0; + LRUQueueIterator queue_iterator; + size_t reference_count = 0; + bool expired = false; + }; + + using Cells = std::unordered_map; + Cells cells; + LRUQueue queue; + size_t current_weight = 0; + size_t max_weight = 0; + size_t max_element_size = 0; + + /// Represents pending insertion attempt. + struct InsertToken + { + explicit InsertToken(LRUResourceCache & cache_) : cache(cache_) { } + + std::mutex mutex; + bool cleaned_up = false; /// Protected by the token mutex + MappedPtr value; /// Protected by the token mutex + + LRUResourceCache & cache; + size_t refcount = 0; /// Protected by the cache mutex + }; + + using InsertTokenById = std::unordered_map, HashFunction>; + + /// This class is responsible for removing used insert tokens from the insert_tokens map. + /// Among several concurrent threads the first successful one is responsible for removal. But if they all + /// fail, then the last one is responsible. 
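    /// A sketch of the intended flow, assuming two threads call getOrSet() with the same missing key:
    /// both miss in `cells`, the first creates the shared InsertToken and each acquire() bumps its refcount;
    /// whichever thread takes token->mutex first runs load_func() and fills token->value, the other thread
    /// then finds the value already set and reuses it, so load_func() runs only once per missing key.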
+ struct InsertTokenHolder + { + const Key * key = nullptr; + std::shared_ptr token; + bool cleaned_up = false; + + InsertTokenHolder() = default; + + void + acquire(const Key * key_, const std::shared_ptr & token_, [[maybe_unused]] std::lock_guard & cache_lock) + { + key = key_; + token = token_; + ++token->refcount; + } + + void cleanup([[maybe_unused]] std::lock_guard & token_lock, [[maybe_unused]] std::lock_guard & cache_lock) + { + token->cache.insert_tokens.erase(*key); + token->cleaned_up = true; + cleaned_up = true; + } + + ~InsertTokenHolder() + { + if (!token) + return; + + if (cleaned_up) + return; + + std::lock_guard token_lock(token->mutex); + + if (token->cleaned_up) + return; + + std::lock_guard cache_lock(token->cache.mutex); + + --token->refcount; + if (token->refcount == 0) + cleanup(token_lock, cache_lock); + } + }; + + friend struct InsertTokenHolder; + InsertTokenById insert_tokens; + WeightFunction weight_function; + std::atomic hits{0}; + std::atomic misses{0}; + std::atomic evict_count{0}; + + /// Returns nullptr when there is no more space for the new value or the old value is in used. + template + MappedPtr getImpl(const Key & key, LoadFunc && load_func) + { + InsertTokenHolder token_holder; + { + std::lock_guard lock(mutex); + auto it = cells.find(key); + if (it != cells.end() && !it->second.expired) + { + if (!it->second.expired) + { + ++hits; + it->second.reference_count += 1; + queue.splice(queue.end(), queue, it->second.queue_iterator); + return it->second.value; + } + else if (it->second.reference_count > 0) + return nullptr; + else + { + // should not reach here + LOG_ERROR(&Poco::Logger::get("LRUResourceCache"), "element is in invalid status."); + abort(); + } + } + ++misses; + auto & token = insert_tokens[key]; + if (!token) + token = std::make_shared(*this); + token_holder.acquire(&key, token, lock); + } + + auto * token = token_holder.token.get(); + std::lock_guard token_lock(token->mutex); + token_holder.cleaned_up = token->cleaned_up; + + if (!token->value) + token->value = load_func(); + + std::lock_guard lock(mutex); + auto token_it = insert_tokens.find(key); + Cell * cell_ptr = nullptr; + if (token_it != insert_tokens.end() && token_it->second.get() == token) + { + cell_ptr = set(key, token->value); + } + else + { + auto cell_it = cells.find(key); + if (cell_it != cells.end() && !cell_it->second.expired) + { + cell_ptr = &cell_it->second; + } + } + + if (!token->cleaned_up) + token_holder.cleanup(token_lock, lock); + + if (cell_ptr) + { + queue.splice(queue.end(), queue, cell_ptr->queue_iterator); + cell_ptr->reference_count++; + return cell_ptr->value; + } + return nullptr; + } + + MappedPtr getImpl(const Key & key) + { + std::lock_guard lock(mutex); + + auto it = cells.find(key); + if (it == cells.end() || it->second.expired) + { + ++misses; + return nullptr; + } + + ++hits; + it->second.reference_count += 1; + queue.splice(queue.end(), queue, it->second.queue_iterator); + return it->second.value; + } + + // mark a reference is released + void release(const Key & key) + { + std::lock_guard lock(mutex); + + auto it = cells.find(key); + if (it == cells.end() || it->second.reference_count == 0) + { + LOG_ERROR(&Poco::Logger::get("LRUResourceCache"), "try to release an invalid element"); + abort(); + } + + auto & cell = it->second; + cell.reference_count -= 1; + if (cell.expired && cell.reference_count == 0) + { + queue.erase(cell.queue_iterator); + current_weight -= cell.weight; + cells.erase(it); + } + } + + InsertToken * acquireInsertToken(const 
Key & key) + { + auto & token = insert_tokens[key]; + token.reference_count += 1; + return &token; + } + + void releaseInsertToken(const Key & key) + { + auto it = insert_tokens.find(key); + if (it != insert_tokens.end()) + { + it->second.reference_count -= 1; + if (it->second.reference_count == 0) + insert_tokens.erase(it); + } + } + + // key mustn't be in the cache + Cell * set(const Key & insert_key, MappedPtr value) + { + auto weight = value ? weight_function(*value) : 0; + auto queue_size = cells.size() + 1; + auto loss_weight = 0; + + auto is_overflow = [&] { + return current_weight + weight - loss_weight > max_weight || (max_element_size != 0 && queue_size > max_element_size); + }; + + auto key_it = queue.begin(); + std::unordered_set to_release_keys; + + while (is_overflow() && queue_size > 1 && key_it != queue.end()) + { + const Key & key = *key_it; + + auto cell_it = cells.find(key); + if (cell_it == cells.end()) + { + LOG_ERROR(&Poco::Logger::get("LRUResourceCache"), "LRUResourceCache became inconsistent. There must be a bug in it."); + abort(); + } + + auto & cell = cell_it->second; + if (cell.reference_count == 0) + { + loss_weight += cell.weight; + queue_size -= 1; + to_release_keys.insert(key); + } + + ++key_it; + } + + if (is_overflow()) + return nullptr; + + if (loss_weight > current_weight + weight) + { + LOG_ERROR(&Poco::Logger::get("LRUResourceCache"), "LRUResourceCache became inconsistent. There must be a bug in it."); + abort(); + } + + for (auto & key : to_release_keys) + { + auto & cell = cells[key]; + queue.erase(cell.queue_iterator); + cells.erase(key); + ++evict_count; + } + + current_weight = current_weight + weight - loss_weight; + + auto & new_cell = cells[insert_key]; + new_cell.value = value; + new_cell.weight = weight; + new_cell.queue_iterator = queue.insert(queue.end(), insert_key); + return &new_cell; + } +}; +} diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 9c4f524a322..878930f58d9 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -276,7 +276,8 @@ M(ThreadPoolReaderPageCacheMissElapsedMicroseconds, "Time spent reading data inside the asynchronous job in ThreadPoolReader - when read was not done from page cache.") \ \ M(AsynchronousReadWaitMicroseconds, "Time spent in waiting for asynchronous reads.") \ - + \ + M(MainConfigLoads, "Number of times the main configuration was reloaded.") \ namespace ProfileEvents { diff --git a/src/Common/ProgressIndication.cpp b/src/Common/ProgressIndication.cpp index 33508f5ad5f..b9a8bc923f7 100644 --- a/src/Common/ProgressIndication.cpp +++ b/src/Common/ProgressIndication.cpp @@ -16,7 +16,7 @@ namespace { constexpr UInt64 ALL_THREADS = 0; - UInt64 calculateCoresNumber(DB::ThreadIdToTimeMap times, UInt64 elapsed) + double calculateCPUUsage(DB::ThreadIdToTimeMap times, UInt64 elapsed) { auto accumulated = std::accumulate(times.begin(), times.end(), 0, [](Int64 acc, const auto & elem) @@ -25,7 +25,7 @@ namespace return acc; return acc + elem.second.time(); }); - return (static_cast(accumulated) + elapsed - 1) / elapsed; + return static_cast(accumulated) / elapsed; } } @@ -53,7 +53,7 @@ void ProgressIndication::resetProgress() show_progress_bar = false; written_progress_chars = 0; write_progress_on_update = false; - host_active_cores.clear(); + host_cpu_usage.clear(); thread_data.clear(); } @@ -81,8 +81,7 @@ void ProgressIndication::updateThreadEventData(HostToThreadTimesMap & new_thread { for (auto & new_host_map : new_thread_data) { - auto new_cores = 
calculateCoresNumber(new_host_map.second, elapsed_time); - host_active_cores[new_host_map.first] = new_cores; + host_cpu_usage[new_host_map.first] = calculateCPUUsage(new_host_map.second, elapsed_time); thread_data[new_host_map.first] = std::move(new_host_map.second); } } @@ -96,13 +95,12 @@ size_t ProgressIndication::getUsedThreadsCount() const }); } -UInt64 ProgressIndication::getApproximateCoresNumber() const +double ProgressIndication::getCPUUsage() const { - return std::accumulate(host_active_cores.cbegin(), host_active_cores.cend(), 0, - [](UInt64 acc, auto const & elem) - { - return acc + elem.second; - }); + double res = 0; + for (const auto & elem : host_cpu_usage) + res += elem.second; + return res; } ProgressIndication::MemoryUsage ProgressIndication::getMemoryUsage() const @@ -116,6 +114,7 @@ ProgressIndication::MemoryUsage ProgressIndication::getMemoryUsage() const // memory consumption it's enough to look for data with thread id 0. if (auto it = host_data.second.find(ALL_THREADS); it != host_data.second.end()) host_usage = it->second.memory_usage; + return MemoryUsage{.total = acc.total + host_usage, .max = std::max(acc.max, host_usage)}; }); } @@ -183,27 +182,29 @@ void ProgressIndication::writeProgress() written_progress_chars = message.count() - prefix_size - (strlen(indicator) - 2); /// Don't count invisible output (escape sequences). - // If approximate cores number is known, display it. - auto cores_number = getApproximateCoresNumber(); + /// Display resource usage if possible. std::string profiling_msg; - if (cores_number != 0 && print_hardware_utilization) + + double cpu_usage = getCPUUsage(); + auto [memory_usage, max_host_usage] = getMemoryUsage(); + + if (cpu_usage > 0 || memory_usage > 0) { WriteBufferFromOwnString profiling_msg_builder; - // Calculated cores number may be not accurate - // so it's better to print min(threads, cores). - UInt64 threads_number = getUsedThreadsCount(); - profiling_msg_builder << " Running " << threads_number << " threads on " - << std::min(cores_number, threads_number) << " cores"; - auto [memory_usage, max_host_usage] = getMemoryUsage(); - if (memory_usage != 0) - profiling_msg_builder << " with " << formatReadableSizeWithDecimalSuffix(memory_usage) << " RAM used"; - if (thread_data.size() > 1 && max_host_usage) - profiling_msg_builder << " total (per host max: " << formatReadableSizeWithDecimalSuffix(max_host_usage) << ")"; - profiling_msg_builder << "."; + profiling_msg_builder << "(" << fmt::format("{:.1f}", cpu_usage) << " CPU"; + + if (memory_usage > 0) + profiling_msg_builder << ", " << formatReadableSizeWithDecimalSuffix(memory_usage) << " RAM"; + if (max_host_usage < memory_usage) + profiling_msg_builder << ", " << formatReadableSizeWithDecimalSuffix(max_host_usage) << " max/host"; + + profiling_msg_builder << ")"; profiling_msg = profiling_msg_builder.str(); } + int64_t remaining_space = static_cast(terminal_width) - written_progress_chars; + /// If the approximate number of rows to process is known, we can display a progress bar and percentage. if (progress.total_rows_to_read || progress.total_raw_bytes_to_read) { @@ -230,14 +231,35 @@ void ProgressIndication::writeProgress() if (show_progress_bar) { - ssize_t width_of_progress_bar = static_cast(terminal_width) - written_progress_chars - strlen(" 99%") - profiling_msg.length(); + /// We will display profiling info only if there is enough space for it. 
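            /// A worked example with assumed numbers: on a 120-column terminal with 40 characters already written,
            /// remaining_space = 80 and width_of_progress_bar = 80 - 4 = 76; a 20-character profiling message passes
            /// the check because 76 > 1 + 2 * 20, so it is kept and the bar shrinks to 76 - 20 = 56 columns.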
+ int64_t width_of_progress_bar = remaining_space - strlen(" 99%"); + + /// We need at least twice the space, because it will be displayed either + /// at right after progress bar or at left on top of the progress bar. + if (width_of_progress_bar <= 1 + 2 * static_cast(profiling_msg.size())) + profiling_msg.clear(); + else + width_of_progress_bar -= profiling_msg.size(); + if (width_of_progress_bar > 0) { - std::string bar - = UnicodeBar::render(UnicodeBar::getWidth(current_count, 0, max_count, width_of_progress_bar)); + size_t bar_width = UnicodeBar::getWidth(current_count, 0, max_count, width_of_progress_bar); + std::string bar = UnicodeBar::render(bar_width); + + /// Render profiling_msg at left on top of the progress bar. + bool render_profiling_msg_at_left = current_count * 2 >= max_count; + if (!profiling_msg.empty() && render_profiling_msg_at_left) + message << "\033[30;42m" << profiling_msg << "\033[0m"; + message << "\033[0;32m" << bar << "\033[0m"; - if (width_of_progress_bar > static_cast(bar.size() / UNICODE_BAR_CHAR_SIZE)) + + /// Whitespaces after the progress bar. + if (width_of_progress_bar > static_cast(bar.size() / UNICODE_BAR_CHAR_SIZE)) message << std::string(width_of_progress_bar - bar.size() / UNICODE_BAR_CHAR_SIZE, ' '); + + /// Render profiling_msg at right after the progress bar. + if (!profiling_msg.empty() && !render_profiling_msg_at_left) + message << "\033[2m" << profiling_msg << "\033[0m"; } } } @@ -245,8 +267,17 @@ void ProgressIndication::writeProgress() /// Underestimate percentage a bit to avoid displaying 100%. message << ' ' << (99 * current_count / max_count) << '%'; } + else + { + /// We can still display profiling info. + if (remaining_space >= static_cast(profiling_msg.size())) + { + if (remaining_space > static_cast(profiling_msg.size())) + message << std::string(remaining_space - profiling_msg.size(), ' '); + message << "\033[2m" << profiling_msg << "\033[0m"; + } + } - message << profiling_msg; message << CLEAR_TO_END_OF_LINE; ++increment; diff --git a/src/Common/ProgressIndication.h b/src/Common/ProgressIndication.h index b775279f73b..aad4a8c18e5 100644 --- a/src/Common/ProgressIndication.h +++ b/src/Common/ProgressIndication.h @@ -60,13 +60,10 @@ public: void updateThreadEventData(HostToThreadTimesMap & new_thread_data, UInt64 elapsed_time); - bool print_hardware_utilization = false; - private: - size_t getUsedThreadsCount() const; - UInt64 getApproximateCoresNumber() const; + double getCPUUsage() const; struct MemoryUsage { @@ -93,7 +90,7 @@ private: bool write_progress_on_update = false; - std::unordered_map host_active_cores; + std::unordered_map host_cpu_usage; HostToThreadTimesMap thread_data; }; diff --git a/src/Common/ShellCommand.cpp b/src/Common/ShellCommand.cpp index 99461862ef9..0093d72e766 100644 --- a/src/Common/ShellCommand.cpp +++ b/src/Common/ShellCommand.cpp @@ -65,14 +65,14 @@ ShellCommand::~ShellCommand() size_t try_wait_timeout = config.terminate_in_destructor_strategy.wait_for_normal_exit_before_termination_seconds; bool process_terminated_normally = tryWaitProcessWithTimeout(try_wait_timeout); - if (!process_terminated_normally) - { - LOG_TRACE(getLogger(), "Will kill shell command pid {} with SIGTERM", pid); + if (process_terminated_normally) + return; - int retcode = kill(pid, SIGTERM); - if (retcode != 0) - LOG_WARNING(getLogger(), "Cannot kill shell command pid {} errno '{}'", pid, errnoToString(retcode)); - } + LOG_TRACE(getLogger(), "Will kill shell command pid {} with SIGTERM", pid); + + int retcode = kill(pid, 
SIGTERM); + if (retcode != 0) + LOG_WARNING(getLogger(), "Cannot kill shell command pid {} errno '{}'", pid, errnoToString(retcode)); } else { @@ -91,7 +91,7 @@ bool ShellCommand::tryWaitProcessWithTimeout(size_t timeout_in_seconds) { int status = 0; - LOG_TRACE(getLogger(), "Try wait for shell command pid ({}) with timeout ({})", pid, timeout_in_seconds); + LOG_TRACE(getLogger(), "Try wait for shell command pid {} with timeout {}", pid, timeout_in_seconds); wait_called = true; struct timespec interval {.tv_sec = 1, .tv_nsec = 0}; @@ -119,7 +119,9 @@ bool ShellCommand::tryWaitProcessWithTimeout(size_t timeout_in_seconds) bool process_terminated_normally = (waitpid_res == pid); if (process_terminated_normally) + { return true; + } else if (waitpid_res == 0) { --timeout_in_seconds; @@ -128,7 +130,9 @@ bool ShellCommand::tryWaitProcessWithTimeout(size_t timeout_in_seconds) continue; } else if (waitpid_res == -1 && errno != EINTR) + { return false; + } } return false; diff --git a/src/Common/SymbolIndex.cpp b/src/Common/SymbolIndex.cpp index 568f633975b..32c1a15337c 100644 --- a/src/Common/SymbolIndex.cpp +++ b/src/Common/SymbolIndex.cpp @@ -86,7 +86,7 @@ namespace /// https://stackoverflow.com/questions/32088140/multiple-string-tables-in-elf-object -void updateResources(std::string_view name, const void * address, SymbolIndex::Resources & resources) +void updateResources(ElfW(Addr) base_address, std::string_view object_name, std::string_view name, const void * address, SymbolIndex::Resources & resources) { const char * char_address = static_cast(address); @@ -97,18 +97,23 @@ void updateResources(std::string_view name, const void * address, SymbolIndex::R name = name.substr((name[0] == '_') + strlen("binary_")); name = name.substr(0, name.size() - strlen("_start")); - resources.emplace(name, std::string_view{char_address, 0}); // NOLINT + resources.emplace(name, SymbolIndex::ResourcesBlob{ + base_address, + object_name, + std::string_view{char_address, 0}, // NOLINT + }); } else if (name.ends_with("_end")) { name = name.substr((name[0] == '_') + strlen("binary_")); name = name.substr(0, name.size() - strlen("_end")); - if (auto it = resources.find(name); it != resources.end() && it->second.empty()) + auto it = resources.find(name); + if (it != resources.end() && it->second.base_address == base_address && it->second.data.empty()) { - const char * start = it->second.data(); + const char * start = it->second.data.data(); assert(char_address >= start); - it->second = std::string_view{start, static_cast(char_address - start)}; + it->second.data = std::string_view{start, static_cast(char_address - start)}; } } } @@ -153,10 +158,12 @@ void collectSymbolsFromProgramHeaders( size_t sym_cnt = 0; for (const auto * it = dyn_begin; it->d_tag != DT_NULL; ++it) { + ElfW(Addr) base_address = correct_address(info->dlpi_addr, it->d_un.d_ptr); + // TODO: this branch leads to invalid address of the hash table. Need further investigation. 
// if (it->d_tag == DT_HASH) // { - // const ElfW(Word) * hash = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); + // const ElfW(Word) * hash = reinterpret_cast(base_address); // sym_cnt = hash[1]; // break; // } @@ -167,7 +174,7 @@ void collectSymbolsFromProgramHeaders( const uint32_t * buckets = nullptr; const uint32_t * hashval = nullptr; - const ElfW(Word) * hash = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); + const ElfW(Word) * hash = reinterpret_cast(base_address); buckets = hash + 4 + (hash[2] * sizeof(size_t) / 4); @@ -196,9 +203,11 @@ void collectSymbolsFromProgramHeaders( const char * strtab = nullptr; for (const auto * it = dyn_begin; it->d_tag != DT_NULL; ++it) { + ElfW(Addr) base_address = correct_address(info->dlpi_addr, it->d_un.d_ptr); + if (it->d_tag == DT_STRTAB) { - strtab = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); + strtab = reinterpret_cast(base_address); break; } } @@ -208,10 +217,12 @@ void collectSymbolsFromProgramHeaders( for (const auto * it = dyn_begin; it->d_tag != DT_NULL; ++it) { + ElfW(Addr) base_address = correct_address(info->dlpi_addr, it->d_un.d_ptr); + if (it->d_tag == DT_SYMTAB) { /* Get the pointer to the first entry of the symbol table */ - const ElfW(Sym) * elf_sym = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); + const ElfW(Sym) * elf_sym = reinterpret_cast(base_address); /* Iterate over the symbol table */ for (ElfW(Word) sym_index = 0; sym_index < ElfW(Word)(sym_cnt); ++sym_index) @@ -236,7 +247,7 @@ void collectSymbolsFromProgramHeaders( symbols.push_back(symbol); /// But resources can be represented by a pair of empty symbols (indicating their boundaries). - updateResources(symbol.name, symbol.address_begin, resources); + updateResources(base_address, info->dlpi_name, symbol.name, symbol.address_begin, resources); } break; @@ -299,7 +310,7 @@ void collectSymbolsFromELFSymbolTable( if (symbol_table_entry->st_size) symbols.push_back(symbol); - updateResources(symbol.name, symbol.address_begin, resources); + updateResources(info->dlpi_addr, info->dlpi_name, symbol.name, symbol.address_begin, resources); } } diff --git a/src/Common/SymbolIndex.h b/src/Common/SymbolIndex.h index 7c542980099..1331cf81cf7 100644 --- a/src/Common/SymbolIndex.h +++ b/src/Common/SymbolIndex.h @@ -51,7 +51,7 @@ public: std::string_view getResource(String name) const { if (auto it = data.resources.find(name); it != data.resources.end()) - return it->second; + return it->second.data; return {}; } @@ -59,7 +59,17 @@ public: String getBuildID() const { return data.build_id; } String getBuildIDHex() const; - using Resources = std::unordered_map; + struct ResourcesBlob + { + /// Symbol can be presented in multiple shared objects, + /// base_address will be used to compare only symbols from the same SO. + ElfW(Addr) base_address; + /// Just a human name of the SO. + std::string_view object_name; + /// Data blob. 
+ std::string_view data; + }; + using Resources = std::unordered_map; struct Data { diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index f05a10b8815..c8753c8edaf 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -26,6 +26,7 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int NOT_IMPLEMENTED; + extern const int BAD_ARGUMENTS; } } @@ -1133,4 +1134,54 @@ Coordination::RequestPtr makeCheckRequest(const std::string & path, int version) return request; } +std::string normalizeZooKeeperPath(std::string zookeeper_path, bool check_starts_with_slash, Poco::Logger * log) +{ + if (!zookeeper_path.empty() && zookeeper_path.back() == '/') + zookeeper_path.resize(zookeeper_path.size() - 1); + /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. + if (!zookeeper_path.empty() && zookeeper_path.front() != '/') + { + /// Do not allow this for new tables, print warning for tables created in old versions + if (check_starts_with_slash) + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "ZooKeeper path must starts with '/', got '{}'", zookeeper_path); + if (log) + LOG_WARNING(log, "ZooKeeper path ('{}') does not start with '/'. It will not be supported in future releases"); + zookeeper_path = "/" + zookeeper_path; + } + + return zookeeper_path; +} + +String extractZooKeeperName(const String & path) +{ + static constexpr auto default_zookeeper_name = "default"; + if (path.empty()) + throw DB::Exception("ZooKeeper path should not be empty", DB::ErrorCodes::BAD_ARGUMENTS); + if (path[0] == '/') + return default_zookeeper_name; + auto pos = path.find(":/"); + if (pos != String::npos && pos < path.find('/')) + { + auto zookeeper_name = path.substr(0, pos); + if (zookeeper_name.empty()) + throw DB::Exception("Zookeeper path should start with '/' or ':/'", DB::ErrorCodes::BAD_ARGUMENTS); + return zookeeper_name; + } + return default_zookeeper_name; +} + +String extractZooKeeperPath(const String & path, bool check_starts_with_slash, Poco::Logger * log) +{ + if (path.empty()) + throw DB::Exception("ZooKeeper path should not be empty", DB::ErrorCodes::BAD_ARGUMENTS); + if (path[0] == '/') + return normalizeZooKeeperPath(path, check_starts_with_slash, log); + auto pos = path.find(":/"); + if (pos != String::npos && pos < path.find('/')) + { + return normalizeZooKeeperPath(path.substr(pos + 1, String::npos), check_starts_with_slash, log); + } + return normalizeZooKeeperPath(path, check_starts_with_slash, log); +} + } diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index 8e015b1f331..371f93f6df3 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -379,4 +379,11 @@ private: }; using EphemeralNodeHolderPtr = EphemeralNodeHolder::Ptr; + +String normalizeZooKeeperPath(std::string zookeeper_path, bool check_starts_with_slash, Poco::Logger * log = nullptr); + +String extractZooKeeperName(const String & path); + +String extractZooKeeperPath(const String & path, bool check_starts_with_slash, Poco::Logger * log = nullptr); + } diff --git a/src/Common/tests/gtest_config_helper.cpp b/src/Common/tests/gtest_config_helper.cpp new file mode 100644 index 00000000000..59a6cfa0ae0 --- /dev/null +++ b/src/Common/tests/gtest_config_helper.cpp @@ -0,0 +1,37 @@ +#include +#include +#include +#include + +#include + + +using namespace DB; + +TEST(Common, ConfigHelperGetBool) +{ + std::string xml(R"CONFIG( + 0 + 1 + Yes + + 
+ + 1 + 1 + Yes1 +)CONFIG"); + + Poco::XML::DOMParser dom_parser; + Poco::AutoPtr document = dom_parser.parseString(xml); + Poco::AutoPtr config = new Poco::Util::XMLConfiguration(document); + EXPECT_EQ(ConfigHelper::getBool(*config, "zero_as_false", false, true), false); + EXPECT_EQ(ConfigHelper::getBool(*config, "one_as_true", false, true), true); + EXPECT_EQ(ConfigHelper::getBool(*config, "yes_as_true", false, true), true); + EXPECT_EQ(ConfigHelper::getBool(*config, "empty_as_true_1", false, true), true); + EXPECT_EQ(ConfigHelper::getBool(*config, "empty_as_true_2", false, true), true); + ASSERT_THROW(ConfigHelper::getBool(*config, "has_empty_child_1", false, true), Poco::Exception); + EXPECT_EQ(ConfigHelper::getBool(*config, "has_empty_child_2", false, true), true); + EXPECT_EQ(ConfigHelper::getBool(*config, "has_child_1", false, true), true); + ASSERT_THROW(ConfigHelper::getBool(*config, "has_child_2", false, true), Poco::Exception); +} diff --git a/src/Common/tests/gtest_lru_cache.cpp b/src/Common/tests/gtest_lru_cache.cpp new file mode 100644 index 00000000000..7694a76ea72 --- /dev/null +++ b/src/Common/tests/gtest_lru_cache.cpp @@ -0,0 +1,97 @@ +#include +#include +#include +#include + +TEST(LRUCache, set) +{ + using SimpleLRUCache = DB::LRUCache; + auto lru_cache = SimpleLRUCache(10, 10); + lru_cache.set(1, std::make_shared(2)); + lru_cache.set(2, std::make_shared(3)); + + auto w = lru_cache.weight(); + auto n = lru_cache.count(); + ASSERT_EQ(w, 2); + ASSERT_EQ(n, 2); +} + +TEST(LRUCache, update) +{ + using SimpleLRUCache = DB::LRUCache; + auto lru_cache = SimpleLRUCache(10, 10); + lru_cache.set(1, std::make_shared(2)); + lru_cache.set(1, std::make_shared(3)); + auto val = lru_cache.get(1); + ASSERT_TRUE(val != nullptr); + ASSERT_TRUE(*val == 3); +} + +TEST(LRUCache, get) +{ + using SimpleLRUCache = DB::LRUCache; + auto lru_cache = SimpleLRUCache(10, 10); + lru_cache.set(1, std::make_shared(2)); + lru_cache.set(2, std::make_shared(3)); + SimpleLRUCache::MappedPtr value = lru_cache.get(1); + ASSERT_TRUE(value != nullptr); + ASSERT_EQ(*value, 2); + + value = lru_cache.get(2); + ASSERT_TRUE(value != nullptr); + ASSERT_EQ(*value, 3); +} + +struct ValueWeight +{ + size_t operator()(const size_t & x) const { return x; } +}; + +TEST(LRUCache, evictOnSize) +{ + using SimpleLRUCache = DB::LRUCache; + auto lru_cache = SimpleLRUCache(20, 3); + lru_cache.set(1, std::make_shared(2)); + lru_cache.set(2, std::make_shared(3)); + lru_cache.set(3, std::make_shared(4)); + lru_cache.set(4, std::make_shared(5)); + + auto n = lru_cache.count(); + ASSERT_EQ(n, 3); + + auto value = lru_cache.get(1); + ASSERT_TRUE(value == nullptr); +} + +TEST(LRUCache, evictOnWeight) +{ + using SimpleLRUCache = DB::LRUCache, ValueWeight>; + auto lru_cache = SimpleLRUCache(10, 10); + lru_cache.set(1, std::make_shared(2)); + lru_cache.set(2, std::make_shared(3)); + lru_cache.set(3, std::make_shared(4)); + lru_cache.set(4, std::make_shared(5)); + + auto n = lru_cache.count(); + ASSERT_EQ(n, 2); + + auto w = lru_cache.weight(); + ASSERT_EQ(w, 9); + + auto value = lru_cache.get(1); + ASSERT_TRUE(value == nullptr); + value = lru_cache.get(2); + ASSERT_TRUE(value == nullptr); +} + +TEST(LRUCache, getOrSet) +{ + using SimpleLRUCache = DB::LRUCache, ValueWeight>; + auto lru_cache = SimpleLRUCache(10, 10); + size_t x = 10; + auto load_func = [&] { return std::make_shared(x); }; + auto [value, loaded] = lru_cache.getOrSet(1, load_func); + ASSERT_TRUE(value != nullptr); + ASSERT_TRUE(*value == 10); +} + diff --git 
a/src/Common/tests/gtest_lru_resource_cache.cpp b/src/Common/tests/gtest_lru_resource_cache.cpp new file mode 100644 index 00000000000..f88eded531e --- /dev/null +++ b/src/Common/tests/gtest_lru_resource_cache.cpp @@ -0,0 +1,270 @@ +#include +#include +#include +#include + +TEST(LRUResourceCache, get) +{ + using MyCache = DB::LRUResourceCache; + auto mcache = MyCache(10, 10); + int x = 10; + auto load_int = [&] { return std::make_shared(x); }; + auto holder1 = mcache.getOrSet(1, load_int); + x = 11; + auto holder2 = mcache.getOrSet(2, load_int); + ASSERT_TRUE(holder2 != nullptr); + ASSERT_TRUE(holder2->value() == 11); + + auto holder3 = mcache.get(1); + ASSERT_TRUE(holder3 != nullptr); + ASSERT_TRUE(holder3->value() == 10); +} + +TEST(LRUResourceCache, remove) +{ + using MyCache = DB::LRUResourceCache; + auto mcache = MyCache(10, 10); + int x = 10; + auto load_int = [&] { return std::make_shared(x); }; + auto holder0 = mcache.getOrSet(1, load_int); + auto holder1 = mcache.getOrSet(1, load_int); + + mcache.tryRemove(1); + holder0 = mcache.get(1); + ASSERT_TRUE(holder0 == nullptr); + auto n = mcache.size(); + ASSERT_TRUE(n == 1); + + holder0.reset(); + holder1.reset(); + n = mcache.size(); + ASSERT_TRUE(n == 0); +} + +struct MyWeight +{ + size_t operator()(const int & x) const { return static_cast(x); } +}; + +TEST(LRUResourceCache, evictOnWweight) +{ + using MyCache = DB::LRUResourceCache; + auto mcache = MyCache(5, 10); + int x = 2; + auto load_int = [&] { return std::make_shared(x); }; + auto holder1 = mcache.getOrSet(1, load_int); + holder1.reset(); + + auto holder2 = mcache.getOrSet(2, load_int); + holder2.reset(); + + x = 3; + auto holder3 = mcache.getOrSet(3, load_int); + ASSERT_TRUE(holder3 != nullptr); + + auto w = mcache.weight(); + ASSERT_EQ(w, 5); + auto n = mcache.size(); + ASSERT_EQ(n, 2); + + holder1 = mcache.get(1); + ASSERT_TRUE(holder1 == nullptr); + holder2 = mcache.get(2); + ASSERT_TRUE(holder2 != nullptr); + holder3 = mcache.get(3); + ASSERT_TRUE(holder3 != nullptr); +} + +TEST(LRUResourceCache, evictOnWeightV2) +{ + using MyCache = DB::LRUResourceCache; + auto mcache = MyCache(5, 10); + int x = 2; + auto load_int = [&] { return std::make_shared(x); }; + auto holder1 = mcache.getOrSet(1, load_int); + holder1.reset(); + + auto holder2 = mcache.getOrSet(2, load_int); + holder2.reset(); + + holder1 = mcache.get(1); + holder1.reset(); + + x = 3; + auto holder3 = mcache.getOrSet(3, load_int); + ASSERT_TRUE(holder3 != nullptr); + + auto w = mcache.weight(); + ASSERT_EQ(w, 5); + auto n = mcache.size(); + ASSERT_EQ(n, 2); + + holder1 = mcache.get(1); + ASSERT_TRUE(holder1 != nullptr); + holder2 = mcache.get(2); + ASSERT_TRUE(holder2 == nullptr); + holder3 = mcache.get(3); + ASSERT_TRUE(holder3 != nullptr); +} + +TEST(LRUResourceCache, evictOnWeightV3) +{ + using MyCache = DB::LRUResourceCache; + auto mcache = MyCache(5, 10); + int x = 2; + auto load_int = [&] { return std::make_shared(x); }; + auto holder1 = mcache.getOrSet(1, load_int); + holder1.reset(); + + auto holder2 = mcache.getOrSet(2, load_int); + holder2.reset(); + + holder1 = mcache.getOrSet(1, load_int); + holder1.reset(); + + x = 3; + auto holder3 = mcache.getOrSet(3, load_int); + ASSERT_TRUE(holder3 != nullptr); + + auto w = mcache.weight(); + ASSERT_EQ(w, 5); + auto n = mcache.size(); + ASSERT_EQ(n, 2); + + holder1 = mcache.get(1); + ASSERT_TRUE(holder1 != nullptr); + holder2 = mcache.get(2); + ASSERT_TRUE(holder2 == nullptr); + holder3 = mcache.get(3); + ASSERT_TRUE(holder3 != nullptr); +} + 
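The template arguments in these tests were lost when the diff was flattened, so the cache type is hard to read. A minimal sketch of the usage pattern the assertions above rely on, not part of the patch; the instantiation DB::LRUResourceCache<int, int, MyWeight> and the holder semantics are assumptions inferred from the tests themselves:

    // Sketch only. Assumes ClickHouse's Common/LRUResourceCache.h and the MyWeight
    // functor declared earlier in this test file.
    using MyCache = DB::LRUResourceCache<int, int, MyWeight>;   // assumed instantiation
    MyCache cache(/* max_weight = */ 5, /* max_element_size = */ 10);

    int x = 2;
    auto load_int = [&] { return std::make_shared<int>(x); };

    auto holder = cache.getOrSet(1, load_int);  // loads the value and pins the entry
    // holder->value() is usable here; a pinned entry is never evicted
    holder.reset();                             // unpin: the entry becomes evictable
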
+TEST(LRUResourceCache, evictOnSize) +{ + using MyCache = DB::LRUResourceCache; + auto mcache = MyCache(5, 2); + int x = 2; + auto load_int = [&] { return std::make_shared(x); }; + auto holder1 = mcache.getOrSet(1, load_int); + holder1.reset(); + + auto holder2 = mcache.getOrSet(2, load_int); + holder2.reset(); + + x = 3; + auto holder3 = mcache.getOrSet(3, load_int); + ASSERT_TRUE(holder3 != nullptr); + + auto n = mcache.size(); + ASSERT_EQ(n, 2); + auto w = mcache.weight(); + ASSERT_EQ(w, 2); + + holder1 = mcache.get(1); + ASSERT_TRUE(holder1 == nullptr); + holder2 = mcache.get(2); + ASSERT_TRUE(holder2 != nullptr); + holder3 = mcache.get(3); + ASSERT_TRUE(holder3 != nullptr); +} + +TEST(LRUResourceCache, notEvictUsedElement) +{ + using MyCache = DB::LRUResourceCache; + auto mcache = MyCache(7, 10); + int x = 2; + auto load_int = [&] { return std::make_shared(x); }; + auto holder1 = mcache.getOrSet(1, load_int); + + auto holder2 = mcache.getOrSet(2, load_int); + holder2.reset(); + + auto holder3 = mcache.getOrSet(3, load_int); + holder3.reset(); + + x = 3; + auto holder4 = mcache.getOrSet(4, load_int); + ASSERT_TRUE(holder4 != nullptr); + + auto n = mcache.size(); + ASSERT_EQ(n, 3); + auto w = mcache.weight(); + ASSERT_EQ(w, 7); + + holder1 = mcache.get(1); + ASSERT_TRUE(holder1 != nullptr); + holder2 = mcache.get(2); + ASSERT_TRUE(holder2 == nullptr); + holder3 = mcache.get(3); + ASSERT_TRUE(holder3 != nullptr); + holder4 = mcache.get(4); + ASSERT_TRUE(holder4 != nullptr); +} + +TEST(LRUResourceCache, getFail) +{ + using MyCache = DB::LRUResourceCache; + auto mcache = MyCache(5, 10); + int x = 2; + auto load_int = [&] { return std::make_shared(x); }; + auto holder1 = mcache.getOrSet(1, load_int); + auto holder2 = mcache.getOrSet(2, load_int); + auto holder3 = mcache.getOrSet(3, load_int); + ASSERT_TRUE(holder3 == nullptr); + + auto n = mcache.size(); + ASSERT_EQ(n, 2); + auto w = mcache.weight(); + ASSERT_EQ(w, 4); + holder1 = mcache.get(1); + ASSERT_TRUE(holder1 != nullptr); + holder2 = mcache.get(2); + ASSERT_TRUE(holder2 != nullptr); + holder3 = mcache.get(3); + ASSERT_TRUE(holder3 == nullptr); +} + +TEST(LRUResourceCache, dupGet) +{ + using MyCache = DB::LRUResourceCache; + auto mcache = MyCache(20, 10); + int x = 2; + auto load_int = [&] { return std::make_shared(x); }; + auto holder1 = mcache.getOrSet(1, load_int); + holder1.reset(); + x = 11; + holder1 = mcache.getOrSet(1, load_int); + ASSERT_TRUE(holder1 != nullptr); + + auto n = mcache.size(); + ASSERT_EQ(n, 1); + auto w = mcache.weight(); + ASSERT_EQ(w, 2); + holder1 = mcache.get(1); + ASSERT_TRUE(holder1 != nullptr); + ASSERT_TRUE(holder1->value() == 2); +} + +TEST(LRUResourceCache, reGet) +{ + using MyCache = DB::LRUResourceCache; + auto mcache = MyCache(20, 10); + int x = 2; + auto load_int = [&] { return std::make_shared(x); }; + auto holder1 = mcache.getOrSet(1, load_int); + mcache.tryRemove(1); + + x = 11; + holder1.reset(); + holder1 = mcache.getOrSet(1, load_int); + ASSERT_TRUE(holder1 != nullptr); + + auto n = mcache.size(); + ASSERT_EQ(n, 1); + auto w = mcache.weight(); + ASSERT_EQ(w, 11); + holder1 = mcache.get(1); + ASSERT_TRUE(holder1 != nullptr); + ASSERT_TRUE(holder1->value() == 11); +} + diff --git a/src/Coordination/ACLMap.cpp b/src/Coordination/ACLMap.cpp index 863dfdec281..41b759531cf 100644 --- a/src/Coordination/ACLMap.cpp +++ b/src/Coordination/ACLMap.cpp @@ -42,11 +42,14 @@ bool ACLMap::ACLsComparator::operator()(const Coordination::ACLs & left, const C uint64_t ACLMap::convertACLs(const 
Coordination::ACLs & acls) { + if (acls.empty()) + return 0; + if (acl_to_num.count(acls)) return acl_to_num[acls]; /// Start from one - auto index = acl_to_num.size() + 1; + auto index = max_acl_id++; acl_to_num[acls] = index; num_to_acl[index] = acls; @@ -69,6 +72,7 @@ void ACLMap::addMapping(uint64_t acls_id, const Coordination::ACLs & acls) { num_to_acl[acls_id] = acls; acl_to_num[acls] = acls_id; + max_acl_id = std::max(acls_id + 1, max_acl_id); /// max_acl_id pointer next slot } void ACLMap::addUsage(uint64_t acl_id) diff --git a/src/Coordination/ACLMap.h b/src/Coordination/ACLMap.h index 2313b3e7cd3..e1b2ce1eff6 100644 --- a/src/Coordination/ACLMap.h +++ b/src/Coordination/ACLMap.h @@ -31,6 +31,7 @@ private: ACLToNumMap acl_to_num; NumToACLMap num_to_acl; UsageCounter usage_counter; + uint64_t max_acl_id{1}; public: /// Convert ACL to number. If it's new ACL than adds it to map @@ -43,7 +44,7 @@ public: /// Mapping from numbers to ACLs vectors. Used during serialization. const NumToACLMap & getMapping() const { return num_to_acl; } - /// Add mapping to ACLMap. Used during deserialization. + /// Add mapping to ACLMap. Used during deserialization from snapshot. void addMapping(uint64_t acls_id, const Coordination::ACLs & acls); /// Add/remove usage of some id. Used to remove unused ACLs. diff --git a/src/Coordination/CoordinationSettings.cpp b/src/Coordination/CoordinationSettings.cpp index bb160c98402..22a6bd1d941 100644 --- a/src/Coordination/CoordinationSettings.cpp +++ b/src/Coordination/CoordinationSettings.cpp @@ -41,6 +41,7 @@ const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "con KeeperConfigurationAndSettings::KeeperConfigurationAndSettings() : server_id(NOT_EXIST) + , enable_ipv6(true) , tcp_port(NOT_EXIST) , tcp_port_secure(NOT_EXIST) , standalone_keeper(false) @@ -67,6 +68,9 @@ void KeeperConfigurationAndSettings::dump(WriteBufferFromOwnString & buf) const writeText("server_id=", buf); write_int(server_id); + writeText("enable_ipv6=", buf); + write_bool(enable_ipv6); + if (tcp_port != NOT_EXIST) { writeText("tcp_port=", buf); @@ -156,6 +160,8 @@ KeeperConfigurationAndSettings::loadFromConfig(const Poco::Util::AbstractConfigu ret->server_id = config.getInt("keeper_server.server_id"); ret->standalone_keeper = standalone_keeper_; + ret->enable_ipv6 = config.getBool("keeper_server.enable_ipv6", true); + if (config.has("keeper_server.tcp_port")) { ret->tcp_port = config.getInt("keeper_server.tcp_port"); diff --git a/src/Coordination/CoordinationSettings.h b/src/Coordination/CoordinationSettings.h index 9cf566f95df..5546551cf3a 100644 --- a/src/Coordination/CoordinationSettings.h +++ b/src/Coordination/CoordinationSettings.h @@ -63,6 +63,7 @@ struct KeeperConfigurationAndSettings KeeperConfigurationAndSettings(); int server_id; + bool enable_ipv6; int tcp_port; int tcp_port_secure; diff --git a/src/Coordination/KeeperDispatcher.cpp b/src/Coordination/KeeperDispatcher.cpp index 438e337b64f..8423f10f3a6 100644 --- a/src/Coordination/KeeperDispatcher.cpp +++ b/src/Coordination/KeeperDispatcher.cpp @@ -276,7 +276,7 @@ void KeeperDispatcher::initialize(const Poco::Util::AbstractConfiguration & conf try { LOG_DEBUG(log, "Waiting server to initialize"); - server->startup(); + server->startup(configuration_and_settings->enable_ipv6); LOG_DEBUG(log, "Server initialized, waiting for quorum"); if (!start_async) diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 171fa2986eb..25d57e64e0a 100644 --- 
a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -75,6 +75,17 @@ std::string checkAndGetSuperdigest(const String & user_and_digest) return user_and_digest; } +int32_t getValueOrMaxInt32AndLogWarning(uint64_t value, const std::string & name, Poco::Logger * log) +{ + if (value > std::numeric_limits::max()) + { + LOG_WARNING(log, "Got {} value for setting '{}' which is bigger than int32_t max value, lowering value to {}.", value, name, std::numeric_limits::max()); + return std::numeric_limits::max(); + } + + return static_cast(value); +} + } KeeperServer::KeeperServer( @@ -96,7 +107,7 @@ KeeperServer::KeeperServer( LOG_WARNING(log, "Quorum reads enabled, Keeper will work slower."); } -void KeeperServer::startup() +void KeeperServer::startup(bool enable_ipv6) { state_machine->init(); @@ -134,18 +145,18 @@ void KeeperServer::startup() } nuraft::raft_params params; - params.heart_beat_interval_ = coordination_settings->heart_beat_interval_ms.totalMilliseconds(); - params.election_timeout_lower_bound_ = coordination_settings->election_timeout_lower_bound_ms.totalMilliseconds(); - params.election_timeout_upper_bound_ = coordination_settings->election_timeout_upper_bound_ms.totalMilliseconds(); - - params.reserved_log_items_ = coordination_settings->reserved_log_items; - params.snapshot_distance_ = coordination_settings->snapshot_distance; - params.stale_log_gap_ = coordination_settings->stale_log_gap; - params.fresh_log_gap_ = coordination_settings->fresh_log_gap; - params.client_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds(); + params.heart_beat_interval_ = getValueOrMaxInt32AndLogWarning(coordination_settings->heart_beat_interval_ms.totalMilliseconds(), "heart_beat_interval_ms", log); + params.election_timeout_lower_bound_ = getValueOrMaxInt32AndLogWarning(coordination_settings->election_timeout_lower_bound_ms.totalMilliseconds(), "election_timeout_lower_bound_ms", log); + params.election_timeout_upper_bound_ = getValueOrMaxInt32AndLogWarning(coordination_settings->election_timeout_upper_bound_ms.totalMilliseconds(), "election_timeout_upper_bound_ms", log); + params.reserved_log_items_ = getValueOrMaxInt32AndLogWarning(coordination_settings->reserved_log_items, "reserved_log_items", log); + params.snapshot_distance_ = getValueOrMaxInt32AndLogWarning(coordination_settings->snapshot_distance, "snapshot_distance", log); + params.stale_log_gap_ = getValueOrMaxInt32AndLogWarning(coordination_settings->stale_log_gap, "stale_log_gap", log); + params.fresh_log_gap_ = getValueOrMaxInt32AndLogWarning(coordination_settings->fresh_log_gap, "fresh_log_gap", log); + params.client_req_timeout_ = getValueOrMaxInt32AndLogWarning(coordination_settings->operation_timeout_ms.totalMilliseconds(), "operation_timeout_ms", log); params.auto_forwarding_ = coordination_settings->auto_forwarding; - params.auto_forwarding_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds() * 2; - params.max_append_size_ = coordination_settings->max_requests_batch_size; + params.auto_forwarding_req_timeout_ = std::max(coordination_settings->operation_timeout_ms.totalMilliseconds() * 2, std::numeric_limits::max()); + params.auto_forwarding_req_timeout_ = getValueOrMaxInt32AndLogWarning(coordination_settings->operation_timeout_ms.totalMilliseconds() * 2, "operation_timeout_ms", log); + params.max_append_size_ = getValueOrMaxInt32AndLogWarning(coordination_settings->max_requests_batch_size, "max_requests_batch_size", log); params.return_method_ = 
nuraft::raft_params::async_handler; @@ -160,13 +171,14 @@ void KeeperServer::startup() #endif } - launchRaftServer(params, asio_opts); + launchRaftServer(enable_ipv6, params, asio_opts); if (!raft_instance) throw Exception(ErrorCodes::RAFT_ERROR, "Cannot allocate RAFT instance"); } void KeeperServer::launchRaftServer( + bool enable_ipv6, const nuraft::raft_params & params, const nuraft::asio_service::options & asio_opts) { @@ -181,7 +193,7 @@ void KeeperServer::launchRaftServer( nuraft::ptr logger = nuraft::cs_new("RaftInstance", coordination_settings->raft_logs_level); asio_service = nuraft::cs_new(asio_opts, logger); - asio_listener = asio_service->create_rpc_listener(state_manager->getPort(), logger); + asio_listener = asio_service->create_rpc_listener(state_manager->getPort(), logger, enable_ipv6); if (!asio_listener) return; diff --git a/src/Coordination/KeeperServer.h b/src/Coordination/KeeperServer.h index 376fe111f15..1fb02bb0987 100644 --- a/src/Coordination/KeeperServer.h +++ b/src/Coordination/KeeperServer.h @@ -44,6 +44,7 @@ private: /// Almost copy-paste from nuraft::launcher, but with separated server init and start /// Allows to avoid race conditions. void launchRaftServer( + bool enable_ipv6, const nuraft::raft_params & params, const nuraft::asio_service::options & asio_opts); @@ -57,7 +58,7 @@ public: SnapshotsQueue & snapshots_queue_); /// Load state machine from the latest snapshot and load log storage. Start NuRaft with required settings. - void startup(); + void startup(bool enable_ipv6 = true); /// Put local read request and execute in state machine directly and response into /// responses queue diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index a770451a733..a64a7d425f6 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -91,8 +91,7 @@ static bool checkACL(int32_t permission, const Coordination::ACLs & node_acls, c static bool fixupACL( const std::vector & request_acls, const std::vector & current_ids, - std::vector & result_acls, - bool hash_acls) + std::vector & result_acls) { if (request_acls.empty()) return true; @@ -125,8 +124,6 @@ static bool fixupACL( return false; valid_found = true; - if (hash_acls) - new_acl.id = generateDigest(new_acl.id); result_acls.push_back(new_acl); } } @@ -310,7 +307,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr KeeperStorage::Node created_node; Coordination::ACLs node_acls; - if (!fixupACL(request.acls, session_auth_ids, node_acls, !request.restored_from_zookeeper_log)) + if (!fixupACL(request.acls, session_auth_ids, node_acls)) { response.error = Coordination::Error::ZINVALIDACL; return {response_ptr, {}}; @@ -778,7 +775,7 @@ struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr auto & session_auth_ids = storage.session_and_auth[session_id]; Coordination::ACLs node_acls; - if (!fixupACL(request.acls, session_auth_ids, node_acls, !request.restored_from_zookeeper_log)) + if (!fixupACL(request.acls, session_auth_ids, node_acls)) { response.error = Coordination::Error::ZINVALIDACL; return {response_ptr, {}}; diff --git a/src/Core/ExternalTable.cpp b/src/Core/ExternalTable.cpp index b4adbcc0662..3b515fab5c9 100644 --- a/src/Core/ExternalTable.cpp +++ b/src/Core/ExternalTable.cpp @@ -169,7 +169,7 @@ void ExternalTablesHandler::handlePart(const Poco::Net::MessageHeader & header, processors.push_back(std::move(sink)); processors.push_back(std::move(exception_handling)); - auto executor = 
std::make_shared(processors); + auto executor = std::make_shared(processors, getContext()->getProcessListElement()); executor->execute(/*num_threads = */ 1); /// We are ready to receive the next file, for this we clear all the information received diff --git a/src/Core/PostgreSQL/Connection.cpp b/src/Core/PostgreSQL/Connection.cpp index 75786a51d92..f97a35a9e92 100644 --- a/src/Core/PostgreSQL/Connection.cpp +++ b/src/Core/PostgreSQL/Connection.cpp @@ -12,10 +12,7 @@ Connection::Connection(const ConnectionInfo & connection_info_, bool replication , log(&Poco::Logger::get("PostgreSQLReplicaConnection")) { if (replication) - { - connection_info = std::make_pair( - fmt::format("{} replication=database", connection_info.first), connection_info.second); - } + connection_info = {fmt::format("{} replication=database", connection_info.connection_string), connection_info.host_port}; } void Connection::execWithRetry(const std::function & exec) @@ -61,11 +58,14 @@ void Connection::updateConnection() { if (connection) connection->close(); + /// Always throws if there is no connection. - connection = std::make_unique(connection_info.first); + connection = std::make_unique(connection_info.connection_string); + if (replication) connection->set_variable("default_transaction_isolation", "'repeatable read'"); - LOG_DEBUG(&Poco::Logger::get("PostgreSQLConnection"), "New connection to {}", connection_info.second); + + LOG_DEBUG(&Poco::Logger::get("PostgreSQLConnection"), "New connection to {}", connection_info.host_port); } void Connection::connect() diff --git a/src/Core/PostgreSQL/Connection.h b/src/Core/PostgreSQL/Connection.h index d65c38643c1..8c5609dc66b 100644 --- a/src/Core/PostgreSQL/Connection.h +++ b/src/Core/PostgreSQL/Connection.h @@ -8,19 +8,26 @@ #include #include -/* Methods to work with PostgreSQL connection object. +/** Methods to work with PostgreSQL connection object. * Should only be used in case there has to be a single connection object, which * is long-lived and there are no concurrent connection queries. - * Now only use case - for replication handler for replication from PostgreSQL. - * In all other integration engine use pool with failover. - **/ + */ namespace Poco { class Logger; } +namespace pqxx +{ + using ConnectionPtr = std::unique_ptr; +} + namespace postgres { -using ConnectionInfo = std::pair; -using ConnectionPtr = std::unique_ptr; + +struct ConnectionInfo +{ + String connection_string; + String host_port; /// For logs. 
+}; class Connection : private boost::noncopyable { @@ -33,14 +40,17 @@ public: void connect(); + void updateConnection(); + void tryUpdateConnection(); const ConnectionInfo & getConnectionInfo() { return connection_info; } -private: - void updateConnection(); + String getInfoForLog() const { return connection_info.host_port; } - ConnectionPtr connection; +private: + + pqxx::ConnectionPtr connection; ConnectionInfo connection_info; bool replication; @@ -48,6 +58,9 @@ private: Poco::Logger * log; }; + +using ConnectionPtr = std::unique_ptr; + } #endif diff --git a/src/Core/PostgreSQL/ConnectionHolder.h b/src/Core/PostgreSQL/ConnectionHolder.h index d0d64935e91..38e321e222c 100644 --- a/src/Core/PostgreSQL/ConnectionHolder.h +++ b/src/Core/PostgreSQL/ConnectionHolder.h @@ -7,12 +7,12 @@ #include #include #include +#include "Connection.h" namespace postgres { -using ConnectionPtr = std::unique_ptr; using Pool = BorrowedObjectPool; using PoolPtr = std::shared_ptr; @@ -28,8 +28,12 @@ public: pqxx::connection & get() { - assert(connection != nullptr); - return *connection; + return connection->getRef(); + } + + void update() + { + connection->updateConnection(); } private: diff --git a/src/Core/PostgreSQL/PoolWithFailover.cpp b/src/Core/PostgreSQL/PoolWithFailover.cpp index 3addb511c3b..844c60087e0 100644 --- a/src/Core/PostgreSQL/PoolWithFailover.cpp +++ b/src/Core/PostgreSQL/PoolWithFailover.cpp @@ -32,9 +32,9 @@ PoolWithFailover::PoolWithFailover( { for (const auto & replica_configuration : configurations) { - auto connection_string = formatConnectionString(replica_configuration.database, - replica_configuration.host, replica_configuration.port, replica_configuration.username, replica_configuration.password).first; - replicas_with_priority[priority].emplace_back(connection_string, pool_size, getConnectionForLog(replica_configuration.host, replica_configuration.port)); + auto connection_info = formatConnectionString(replica_configuration.database, + replica_configuration.host, replica_configuration.port, replica_configuration.username, replica_configuration.password); + replicas_with_priority[priority].emplace_back(connection_info, pool_size); } } } @@ -52,8 +52,8 @@ PoolWithFailover::PoolWithFailover( for (const auto & [host, port] : configuration.addresses) { LOG_DEBUG(&Poco::Logger::get("PostgreSQLPoolWithFailover"), "Adding address host: {}, port: {} to connection pool", host, port); - auto connection_string = formatConnectionString(configuration.database, host, port, configuration.username, configuration.password).first; - replicas_with_priority[0].emplace_back(connection_string, pool_size, getConnectionForLog(host, port)); + auto connection_string = formatConnectionString(configuration.database, host, port, configuration.username, configuration.password); + replicas_with_priority[0].emplace_back(connection_string, pool_size); } } @@ -83,16 +83,18 @@ ConnectionHolderPtr PoolWithFailover::get() try { /// Create a new connection or reopen an old connection if it became invalid. 
- if (!connection || !connection->is_open()) + if (!connection) { - connection = std::make_unique(replica.connection_string); - LOG_DEBUG(log, "New connection to {}:{}", connection->hostname(), connection->port()); + connection = std::make_unique(replica.connection_info); + LOG_DEBUG(log, "New connection to {}", connection->getInfoForLog()); } + + connection->connect(); } catch (const pqxx::broken_connection & pqxx_error) { LOG_ERROR(log, "Connection error: {}", pqxx_error.what()); - error_message << "Try " << try_idx + 1 << ". Connection to `" << replica.name_for_log << "` failed: " << pqxx_error.what() << "\n"; + error_message << "Try " << try_idx + 1 << ". Connection to `" << replica.connection_info.host_port << "` failed: " << pqxx_error.what() << "\n"; replica.pool->returnObject(std::move(connection)); continue; diff --git a/src/Core/PostgreSQL/PoolWithFailover.h b/src/Core/PostgreSQL/PoolWithFailover.h index c59010a5d43..e6f691ed2dd 100644 --- a/src/Core/PostgreSQL/PoolWithFailover.h +++ b/src/Core/PostgreSQL/PoolWithFailover.h @@ -44,12 +44,11 @@ public: private: struct PoolHolder { - String connection_string; + ConnectionInfo connection_info; PoolPtr pool; - String name_for_log; - PoolHolder(const String & connection_string_, size_t pool_size, const String & name_for_log_) - : connection_string(connection_string_), pool(std::make_shared(pool_size)), name_for_log(name_for_log_) {} + PoolHolder(const ConnectionInfo & connection_info_, size_t pool_size) + : connection_info(connection_info_), pool(std::make_shared(pool_size)) {} }; /// Highest priority is 0, the bigger the number in map, the less the priority diff --git a/src/Core/PostgreSQL/Utils.cpp b/src/Core/PostgreSQL/Utils.cpp index 60b13218202..b4ad19c819a 100644 --- a/src/Core/PostgreSQL/Utils.cpp +++ b/src/Core/PostgreSQL/Utils.cpp @@ -17,7 +17,7 @@ ConnectionInfo formatConnectionString(String dbname, String host, UInt16 port, S << " user=" << DB::quote << user << " password=" << DB::quote << password << " connect_timeout=10"; - return std::make_pair(out.str(), host + ':' + DB::toString(port)); + return {out.str(), host + ':' + DB::toString(port)}; } String getConnectionForLog(const String & host, UInt16 port) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 70fb5604997..6e53fa4342c 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -46,7 +46,6 @@ class IColumn; M(UInt64, max_insert_threads, 0, "The maximum number of threads to execute the INSERT SELECT query. Values 0 or 1 means that INSERT SELECT is not run in parallel. Higher values will lead to higher memory usage. Parallel INSERT SELECT has effect only if the SELECT part is run on parallel, see 'max_threads' setting.", 0) \ M(UInt64, max_final_threads, 16, "The maximum number of threads to read from table with FINAL.", 0) \ M(MaxThreads, max_threads, 0, "The maximum number of threads to execute the request. By default, it is determined automatically.", 0) \ - M(MaxThreads, max_alter_threads, 0, "The maximum number of threads to execute the ALTER requests. 
By default, it is determined automatically.", 0) \ M(UInt64, max_read_buffer_size, DBMS_DEFAULT_BUFFER_SIZE, "The maximum size of the buffer to read from the filesystem.", 0) \ M(UInt64, max_distributed_connections, 1024, "The maximum number of connections for distributed processing of one query (should be greater than max_threads).", 0) \ M(UInt64, max_query_size, DBMS_DEFAULT_MAX_QUERY_SIZE, "Which part of the query can be read into RAM for parsing (the remaining data for INSERT, if any, is read later)", 0) \ @@ -572,7 +571,7 @@ class IColumn; MAKE_OBSOLETE(M, UInt64, merge_tree_clear_old_temporary_directories_interval_seconds, 60) \ MAKE_OBSOLETE(M, UInt64, merge_tree_clear_old_parts_interval_seconds, 1) \ MAKE_OBSOLETE(M, UInt64, partial_merge_join_optimizations, 0) \ - + MAKE_OBSOLETE(M, MaxThreads, max_alter_threads, 0) \ /** The section above is for obsolete settings. Do not add anything there. */ @@ -597,6 +596,8 @@ class IColumn; M(Int64, input_format_orc_row_batch_size, 100'000, "Batch size when reading ORC stripes.", 0) \ M(Bool, input_format_parquet_import_nested, false, "Allow to insert array of structs into Nested table in Parquet input format.", 0) \ M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \ + M(UInt64, input_format_msgpack_number_of_columns, 0, "The number of columns in inserted MsgPack data. Used for automatic schema inference from data.", 0) \ + M(UInt64, input_format_max_rows_to_read_for_schema_inference, 100, "The maximum rows of data to read for automatic schema inference", 0) \ \ M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic' and 'best_effort'.", 0) \ M(DateTimeOutputFormat, date_time_output_format, FormatSettings::DateTimeOutputFormat::Simple, "Method to write DateTime to text output. Possible values: 'simple', 'iso', 'unix_timestamp'.", 0) \ @@ -662,6 +663,7 @@ class IColumn; M(Bool, output_format_arrow_low_cardinality_as_dictionary, false, "Enable output LowCardinality type as Dictionary Arrow type", 0) \ \ M(EnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::EnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0)\ + // End of FORMAT_FACTORY_SETTINGS // Please add settings non-related to formats into the COMMON_SETTINGS above. 
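One detail the flattened diff obscures in the KeeperServer.cpp hunks above: the stripped '<...>' arguments belong to std::numeric_limits<int32_t>, i.e. the new helper clamps 64-bit coordination settings to the 32-bit fields of NuRaft's raft_params and warns instead of silently overflowing. A minimal sketch of that behaviour, not part of the patch (the int32_t/uint64_t types are assumptions recovered from the surrounding code):

    // Sketch mirroring getValueOrMaxInt32AndLogWarning() from the hunk above;
    // the logging of the setting name is elided here.
    #include <cstdint>
    #include <limits>

    int32_t clampToInt32(uint64_t value)
    {
        if (value > static_cast<uint64_t>(std::numeric_limits<int32_t>::max()))
            return std::numeric_limits<int32_t>::max();   // real helper logs a warning first
        return static_cast<int32_t>(value);
    }

    // e.g. params.snapshot_distance_ = clampToInt32(coordination_settings->snapshot_distance);
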
diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index e74df5c327a..85644b6f6ca 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -377,6 +377,8 @@ struct WhichDataType constexpr bool isNullable() const { return idx == TypeIndex::Nullable; } constexpr bool isFunction() const { return idx == TypeIndex::Function; } constexpr bool isAggregateFunction() const { return idx == TypeIndex::AggregateFunction; } + + constexpr bool isLowCarnality() const { return idx == TypeIndex::LowCardinality; } }; /// IDataType helpers (alternative for IDataType virtual methods with single point of truth) diff --git a/src/DataTypes/hasNullable.cpp b/src/DataTypes/hasNullable.cpp new file mode 100644 index 00000000000..2c699806874 --- /dev/null +++ b/src/DataTypes/hasNullable.cpp @@ -0,0 +1,33 @@ +#include +#include +#include +#include + +namespace DB +{ + +bool hasNullable(const DataTypePtr & type) +{ + if (type->isNullable() || type->isLowCardinalityNullable()) + return true; + + if (const DataTypeArray * type_array = typeid_cast(type.get())) + return hasNullable(type_array->getNestedType()); + else if (const DataTypeTuple * type_tuple = typeid_cast(type.get())) + { + for (const auto & subtype : type_tuple->getElements()) + { + if (hasNullable(subtype)) + return true; + } + return false; + } + else if (const DataTypeMap * type_map = typeid_cast(type.get())) + { + // Key type cannot be nullable. We only check value type. + return hasNullable(type_map->getValueType()); + } + return false; +} + +} diff --git a/src/DataTypes/hasNullable.h b/src/DataTypes/hasNullable.h new file mode 100644 index 00000000000..271803496f1 --- /dev/null +++ b/src/DataTypes/hasNullable.h @@ -0,0 +1,10 @@ +#pragma once + +#include + +namespace DB +{ + +bool hasNullable(const DataTypePtr & type); + +} diff --git a/src/Databases/DatabaseDictionary.cpp b/src/Databases/DatabaseDictionary.cpp index db7da95fb27..82766c1e384 100644 --- a/src/Databases/DatabaseDictionary.cpp +++ b/src/Databases/DatabaseDictionary.cpp @@ -29,10 +29,13 @@ namespace return nullptr; DictionaryStructure dictionary_structure = ExternalDictionariesLoader::getDictionaryStructure(*load_result.config); + auto comment = load_result.config->config->getString("dictionary.comment", ""); + return StorageDictionary::create( StorageID(database_name, load_result.name), load_result.name, dictionary_structure, + comment, StorageDictionary::Location::DictionaryDatabase, context); } diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index e9944b592ed..165bad950f5 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -76,10 +76,16 @@ std::pair createTableFromAST( /// - the database has not been loaded yet; /// - the code is simpler, since the query is already brought to a suitable form. if (!ast_create_query.columns_list || !ast_create_query.columns_list->columns) - throw Exception("Missing definition of columns.", ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED); - - columns = InterpreterCreateQuery::getColumnsDescription(*ast_create_query.columns_list->columns, context, true); - constraints = InterpreterCreateQuery::getConstraintsDescription(ast_create_query.columns_list->constraints); + { + if (!StorageFactory::instance().checkIfStorageSupportsSchemaInterface(ast_create_query.storage->engine->name)) + throw Exception("Missing definition of columns.", ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED); + /// Leave columns empty. 
+ } + else + { + columns = InterpreterCreateQuery::getColumnsDescription(*ast_create_query.columns_list->columns, context, true); + constraints = InterpreterCreateQuery::getConstraintsDescription(ast_create_query.columns_list->constraints); + } } return diff --git a/src/Databases/DatabasesCommon.cpp b/src/Databases/DatabasesCommon.cpp index ffb39f5b113..1c3f417b431 100644 --- a/src/Databases/DatabasesCommon.cpp +++ b/src/Databases/DatabasesCommon.cpp @@ -30,27 +30,33 @@ void applyMetadataChangesToCreateQuery(const ASTPtr & query, const StorageInMemo auto & ast_create_query = query->as(); bool has_structure = ast_create_query.columns_list && ast_create_query.columns_list->columns; + if (ast_create_query.as_table_function && !has_structure) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot alter table {} because it was created AS table function" " and doesn't have structure in metadata", backQuote(ast_create_query.getTable())); - assert(has_structure); - ASTPtr new_columns = InterpreterCreateQuery::formatColumns(metadata.columns); - ASTPtr new_indices = InterpreterCreateQuery::formatIndices(metadata.secondary_indices); - ASTPtr new_constraints = InterpreterCreateQuery::formatConstraints(metadata.constraints); - ASTPtr new_projections = InterpreterCreateQuery::formatProjections(metadata.projections); + if (!has_structure && !ast_create_query.is_dictionary) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot alter table {} metadata doesn't have structure", backQuote(ast_create_query.getTable())); - ast_create_query.columns_list->replace(ast_create_query.columns_list->columns, new_columns); - ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->indices, new_indices); - ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->constraints, new_constraints); - ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->projections, new_projections); + if (!ast_create_query.is_dictionary) + { + ASTPtr new_columns = InterpreterCreateQuery::formatColumns(metadata.columns); + ASTPtr new_indices = InterpreterCreateQuery::formatIndices(metadata.secondary_indices); + ASTPtr new_constraints = InterpreterCreateQuery::formatConstraints(metadata.constraints); + ASTPtr new_projections = InterpreterCreateQuery::formatProjections(metadata.projections); + + ast_create_query.columns_list->replace(ast_create_query.columns_list->columns, new_columns); + ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->indices, new_indices); + ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->constraints, new_constraints); + ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->projections, new_projections); + } if (metadata.select.select_query) { query->replace(ast_create_query.select, metadata.select.select_query); } - /// MaterializedView is one type of CREATE query without storage. + /// MaterializedView, Dictionary are types of CREATE query without storage. 
if (ast_create_query.storage) { ASTStorage & storage_ast = *ast_create_query.storage; diff --git a/src/Databases/PostgreSQL/DatabasePostgreSQL.cpp b/src/Databases/PostgreSQL/DatabasePostgreSQL.cpp index fede4319230..d43bde0b886 100644 --- a/src/Databases/PostgreSQL/DatabasePostgreSQL.cpp +++ b/src/Databases/PostgreSQL/DatabasePostgreSQL.cpp @@ -182,19 +182,19 @@ StoragePtr DatabasePostgreSQL::fetchTable(const String & table_name, ContextPtr, return StoragePtr{}; auto connection_holder = pool->get(); - auto columns = fetchPostgreSQLTableStructure(connection_holder->get(), table_name, configuration.schema).columns; + auto columns_info = fetchPostgreSQLTableStructure(connection_holder->get(), table_name, configuration.schema).physical_columns; - if (!columns) + if (!columns_info) return StoragePtr{}; auto storage = StoragePostgreSQL::create( StorageID(database_name, table_name), pool, table_name, - ColumnsDescription{*columns}, ConstraintsDescription{}, String{}, configuration.schema, configuration.on_conflict); + ColumnsDescription{columns_info->columns}, ConstraintsDescription{}, String{}, configuration.schema, configuration.on_conflict); if (cache_tables) cached_tables[table_name] = storage; - return storage; + return std::move(storage); } if (table_checked || checkPostgresTable(table_name)) @@ -414,7 +414,7 @@ ASTPtr DatabasePostgreSQL::getCreateTableQueryImpl(const String & table_name, Co assert(storage_engine_arguments->children.size() >= 2); storage_engine_arguments->children.insert(storage_engine_arguments->children.begin() + 2, std::make_shared(table_id.table_name)); - return create_table_query; + return std::move(create_table_query); } diff --git a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp index dd6d1dd2e52..67d328db00b 100644 --- a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp +++ b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp @@ -15,7 +15,7 @@ #include #include #include - +#include namespace DB { @@ -155,10 +155,11 @@ static DataTypePtr convertPostgreSQLDataType(String & type, Fn auto && r template -std::shared_ptr readNamesAndTypesList( - T & tx, const String & postgres_table, const String & query, bool use_nulls, bool only_names_and_types) +PostgreSQLTableStructure::ColumnsInfoPtr readNamesAndTypesList( + T & tx, const String & postgres_table, const String & query, bool use_nulls, bool only_names_and_types) { auto columns = NamesAndTypes(); + PostgreSQLTableStructure::Attributes attributes; try { @@ -180,14 +181,22 @@ std::shared_ptr readNamesAndTypesList( } else { - std::tuple row; + std::tuple row; while (stream >> row) { - auto data_type = convertPostgreSQLDataType(std::get<1>(row), - recheck_array, - use_nulls && (std::get<2>(row) == "f"), /// 'f' means that postgres `not_null` is false, i.e. value is nullable - std::get<3>(row)); + auto data_type = convertPostgreSQLDataType( + std::get<1>(row), recheck_array, + use_nulls && (std::get<2>(row) == /* not nullable */"f"), + std::get<3>(row)); + columns.push_back(NameAndTypePair(std::get<0>(row), data_type)); + + attributes.emplace_back( + PostgreSQLTableStructure::PGAttribute{ + .atttypid = parse(std::get<4>(row)), + .atttypmod = parse(std::get<5>(row)), + }); + ++i; } } @@ -226,7 +235,9 @@ std::shared_ptr readNamesAndTypesList( throw; } - return !columns.empty() ? std::make_shared(columns.begin(), columns.end()) : nullptr; + return !columns.empty() + ? 
std::make_shared(NamesAndTypesList(columns.begin(), columns.end()), std::move(attributes)) + : nullptr; } @@ -244,14 +255,14 @@ PostgreSQLTableStructure fetchPostgreSQLTableStructure( std::string query = fmt::format( "SELECT attname AS name, format_type(atttypid, atttypmod) AS type, " - "attnotnull AS not_null, attndims AS dims " + "attnotnull AS not_null, attndims AS dims, atttypid as type_id, atttypmod as type_modifier " "FROM pg_attribute " "WHERE attrelid = (SELECT oid FROM pg_class WHERE {}) " "AND NOT attisdropped AND attnum > 0", where); - table.columns = readNamesAndTypesList(tx, postgres_table, query, use_nulls, false); + table.physical_columns = readNamesAndTypesList(tx, postgres_table, query, use_nulls, false); - if (!table.columns) + if (!table.physical_columns) throw Exception(ErrorCodes::UNKNOWN_TABLE, "PostgreSQL table {} does not exist", postgres_table); if (with_primary_key) diff --git a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.h b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.h index 279c88c4571..3be3aa79078 100644 --- a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.h +++ b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.h @@ -12,9 +12,24 @@ namespace DB struct PostgreSQLTableStructure { - std::shared_ptr columns = nullptr; - std::shared_ptr primary_key_columns = nullptr; - std::shared_ptr replica_identity_columns = nullptr; + struct PGAttribute + { + Int32 atttypid; + Int32 atttypmod; + }; + using Attributes = std::vector; + + struct ColumnsInfo + { + NamesAndTypesList columns; + Attributes attributes; + ColumnsInfo(NamesAndTypesList && columns_, Attributes && attributes_) : columns(columns_), attributes(attributes_) {} + }; + using ColumnsInfoPtr = std::shared_ptr; + + ColumnsInfoPtr physical_columns; + ColumnsInfoPtr primary_key_columns; + ColumnsInfoPtr replica_identity_columns; }; using PostgreSQLTableStructurePtr = std::unique_ptr; diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index 2c7e9ad7092..5fd1bd420c6 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -13,6 +13,7 @@ #include #include + namespace DB { @@ -308,7 +309,7 @@ private: if (was_inserted) { if constexpr (std::is_same_v) - cell.key = copyStringInArena(key); + cell.key = copyStringInArena(arena, key); else cell.key = key; @@ -332,8 +333,7 @@ private: else if constexpr (std::is_same_v) { const String & string_value = column_value.get(); - StringRef string_value_ref = StringRef {string_value.data(), string_value.size()}; - StringRef inserted_value = copyStringInArena(string_value_ref); + StringRef inserted_value = copyStringInArena(arena, string_value); container.back() = inserted_value; } else @@ -353,7 +353,7 @@ private: { char * data = const_cast(cell.key.data); arena.free(data, cell.key.size); - cell.key = copyStringInArena(key); + cell.key = copyStringInArena(arena, key); } else cell.key = key; @@ -379,8 +379,7 @@ private: else if constexpr (std::is_same_v) { const String & string_value = column_value.get(); - StringRef string_ref_value = StringRef {string_value.data(), string_value.size()}; - StringRef inserted_value = copyStringInArena(string_ref_value); + StringRef inserted_value = copyStringInArena(arena, string_value); if (!cell_was_default) { @@ -423,7 +422,7 @@ private: if (was_inserted) { if constexpr (std::is_same_v) - cell.key = copyStringInArena(key); + cell.key = copyStringInArena(arena, key); else cell.key = key; @@ -463,7 +462,7 @@ private: { 
char * data = const_cast(cell.key.data); arena.free(data, cell.key.size); - cell.key = copyStringInArena(key); + cell.key = copyStringInArena(arena, key); } else cell.key = key; @@ -526,16 +525,6 @@ private: return const_cast *>(this)->template getAttributeContainer(attribute_index, std::forward(func)); } - StringRef copyStringInArena(StringRef value_to_copy) - { - size_t value_to_copy_size = value_to_copy.size; - char * place_for_key = arena.alloc(value_to_copy_size); - memcpy(reinterpret_cast(place_for_key), reinterpret_cast(value_to_copy.data), value_to_copy_size); - StringRef updated_value{place_for_key, value_to_copy_size}; - - return updated_value; - } - template using ContainerType = std::conditional_t< std::is_same_v || std::is_same_v, diff --git a/src/Dictionaries/ClickHouseDictionarySource.cpp b/src/Dictionaries/ClickHouseDictionarySource.cpp index 6abd5f317e2..a99bee60cfd 100644 --- a/src/Dictionaries/ClickHouseDictionarySource.cpp +++ b/src/Dictionaries/ClickHouseDictionarySource.cpp @@ -28,6 +28,10 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } +static const std::unordered_set dictionary_allowed_keys = { + "host", "port", "user", "password", "db", "database", "table", + "update_field", "update_tag", "invalidate_query", "query", "where", "name", "secure"}; + namespace { constexpr size_t MAX_CONNECTIONS = 16; @@ -235,9 +239,11 @@ void registerDictionarySourceClickHouse(DictionarySourceFactory & factory) std::string db = config.getString(settings_config_prefix + ".db", default_database); std::string table = config.getString(settings_config_prefix + ".table", ""); UInt16 port = static_cast(config.getUInt(settings_config_prefix + ".port", default_port)); + auto has_config_key = [](const String & key) { return dictionary_allowed_keys.contains(key); }; - auto named_collection = created_from_ddl ? - getExternalDataSourceConfiguration(config, settings_config_prefix, global_context) : std::nullopt; + auto named_collection = created_from_ddl + ? getExternalDataSourceConfiguration(config, settings_config_prefix, global_context, has_config_key) + : std::nullopt; if (named_collection) { diff --git a/src/Dictionaries/DictionaryHelpers.h b/src/Dictionaries/DictionaryHelpers.h index b59e29c327e..1e6a4a5fb44 100644 --- a/src/Dictionaries/DictionaryHelpers.h +++ b/src/Dictionaries/DictionaryHelpers.h @@ -623,6 +623,17 @@ void mergeBlockWithPipe( } } +template +static StringRef copyStringInArena(Arena & arena, StringRef value) +{ + size_t key_size = value.size; + char * place_for_key = arena.alloc(key_size); + memcpy(reinterpret_cast(place_for_key), reinterpret_cast(value.data), key_size); + StringRef result{place_for_key, key_size}; + + return result; +} + /** * Returns ColumnVector data as PaddedPodArray. 
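The CacheDictionaryStorage.h and DictionaryHelpers.h hunks above replace several per-storage copies of the same arena copy with one shared helper, but the flattened diff drops its casts and template arguments. A minimal sketch of what that helper does, not part of the patch (the include paths and the Arena/StringRef interfaces are assumptions based on the call sites shown above):

    // Sketch only: copy a key's bytes into an arena so the dictionary storage owns
    // them for as long as the arena lives, as the call sites above do.
    #include <cstring>
    #include <base/StringRef.h>   // assumed include path
    #include <Common/Arena.h>     // assumed include path

    StringRef copyIntoArena(DB::Arena & arena, StringRef value)
    {
        char * place = arena.alloc(value.size);   // arena-owned buffer
        std::memcpy(place, value.data, value.size);
        return {place, value.size};
    }
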
diff --git a/src/Dictionaries/DictionaryStructure.cpp b/src/Dictionaries/DictionaryStructure.cpp index 21d43031204..6955b3ddfdc 100644 --- a/src/Dictionaries/DictionaryStructure.cpp +++ b/src/Dictionaries/DictionaryStructure.cpp @@ -25,6 +25,7 @@ namespace ErrorCodes namespace { + DictionaryTypedSpecialAttribute makeDictionaryTypedSpecialAttribute( const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, const std::string & default_type) { @@ -38,7 +39,7 @@ DictionaryTypedSpecialAttribute makeDictionaryTypedSpecialAttribute( return DictionaryTypedSpecialAttribute{std::move(name), std::move(expression), DataTypeFactory::instance().get(type_name)}; } -std::optional maybeGetAttributeUnderlyingType(TypeIndex index) +std::optional tryGetAttributeUnderlyingType(TypeIndex index) { switch (index) /// Special cases which do not map TypeIndex::T -> AttributeUnderlyingType::T { @@ -65,14 +66,16 @@ DictionaryStructure::DictionaryStructure(const Poco::Util::AbstractConfiguration { std::string structure_prefix = config_prefix + ".structure"; - const auto has_id = config.has(structure_prefix + ".id"); - const auto has_key = config.has(structure_prefix + ".key"); + const bool has_id = config.has(structure_prefix + ".id"); + const bool has_key = config.has(structure_prefix + ".key"); if (has_key && has_id) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Only one of 'id' and 'key' should be specified"); if (has_id) + { id.emplace(config, structure_prefix + ".id"); + } else if (has_key) { key.emplace(getAttributes(config, structure_prefix + ".key", /*complex_key_attributes =*/ true)); @@ -80,7 +83,9 @@ DictionaryStructure::DictionaryStructure(const Poco::Util::AbstractConfiguration throw Exception(ErrorCodes::BAD_ARGUMENTS, "Empty 'key' supplied"); } else + { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Dictionary structure should specify either 'id' or 'key'"); + } if (id) { @@ -94,7 +99,8 @@ DictionaryStructure::DictionaryStructure(const Poco::Util::AbstractConfiguration parseRangeConfiguration(config, structure_prefix); attributes = getAttributes(config, structure_prefix, /*complex_key_attributes =*/ false); - for (size_t i = 0; i < attributes.size(); ++i) + size_t attributes_size = attributes.size(); + for (size_t i = 0; i < attributes_size; ++i) { const auto & attribute = attributes[i]; const auto & attribute_name = attribute.name; @@ -106,7 +112,6 @@ DictionaryStructure::DictionaryStructure(const Poco::Util::AbstractConfiguration throw Exception(ErrorCodes::TYPE_MISMATCH, "Hierarchical attribute type for dictionary with simple key must be UInt64. 
Actual {}", attribute.underlying_type);
-
         else if (key)
             throw Exception(ErrorCodes::BAD_ARGUMENTS, "Dictionary with complex key does not support hierarchy");
@@ -121,17 +126,26 @@ DictionaryStructure::DictionaryStructure(const Poco::Util::AbstractConfiguration

 void DictionaryStructure::validateKeyTypes(const DataTypes & key_types) const
 {
-    if (key_types.size() != key->size())
+    size_t key_types_size = key_types.size();
+    if (key_types_size != getKeysSize())
         throw Exception(ErrorCodes::TYPE_MISMATCH, "Key structure does not match, expected {}", getKeyDescription());

-    for (size_t i = 0; i < key_types.size(); ++i)
+    if (id && !isUInt64(key_types[0]))
+    {
+        throw Exception(ErrorCodes::TYPE_MISMATCH,
+            "Key type for simple key does not match, expected {}, found {}",
+            "UInt64",
+            key_types[0]->getName());
+    }
+
+    for (size_t i = 0; i < key_types_size; ++i)
     {
         const auto & expected_type = (*key)[i].type;
         const auto & actual_type = key_types[i];

         if (!areTypesEqual(expected_type, actual_type))
             throw Exception(ErrorCodes::TYPE_MISMATCH,
-                "Key type at position {} does not match, expected {}, found {}",
+                "Key type for complex key at position {} does not match, expected {}, found {}",
                 std::to_string(i),
                 expected_type->getName(),
                 actual_type->getName());
@@ -204,19 +219,6 @@ std::string DictionaryStructure::getKeyDescription() const
     return out.str();
 }

-
-bool DictionaryStructure::isKeySizeFixed() const
-{
-    if (!key)
-        return true;
-
-    for (const auto & key_i : *key)
-        if (key_i.underlying_type == AttributeUnderlyingType::String)
-            return false;
-
-    return true;
-}
-
 Strings DictionaryStructure::getKeysNames() const
 {
     if (id)
@@ -235,7 +237,7 @@ Strings DictionaryStructure::getKeysNames() const

 static void checkAttributeKeys(const Poco::Util::AbstractConfiguration::Keys & keys)
 {
-    static const std::unordered_set valid_keys
+    static const std::unordered_set valid_keys
         = {"name", "type", "expression", "null_value", "hierarchical", "injective", "is_object_id"};

     for (const auto & key : keys)
@@ -256,7 +258,7 @@ std::vector DictionaryStructure::getAttributes(
     Poco::Util::AbstractConfiguration::Keys config_elems;
     config.keys(config_prefix, config_elems);

-    auto has_hierarchy = false;
+    bool has_hierarchy = false;

     std::unordered_set attribute_names;
     std::vector res_attributes;
@@ -296,7 +298,7 @@ std::vector DictionaryStructure::getAttributes(
         auto non_nullable_type = removeNullable(initial_type);

-        const auto underlying_type_opt = maybeGetAttributeUnderlyingType(non_nullable_type->getTypeId());
+        const auto underlying_type_opt = tryGetAttributeUnderlyingType(non_nullable_type->getTypeId());

         if (!underlying_type_opt)
             throw Exception(ErrorCodes::UNKNOWN_TYPE,
@@ -336,6 +338,7 @@ std::vector DictionaryStructure::getAttributes(
         const auto hierarchical = config.getBool(prefix + "hierarchical", false);
         const auto injective = config.getBool(prefix + "injective", false);
         const auto is_object_id = config.getBool(prefix + "is_object_id", false);
+
         if (name.empty())
             throw Exception(ErrorCodes::BAD_ARGUMENTS, "Properties 'name' and 'type' of an attribute cannot be empty");
@@ -388,13 +391,12 @@ void DictionaryStructure::parseRangeConfiguration(const Poco::Util::AbstractConf
             range_max->type->getName());
     }

-    if (range_min)
+    if (range_min && !range_min->type->isValueRepresentedByInteger())
     {
-        if (!range_min->type->isValueRepresentedByInteger())
-            throw Exception(ErrorCodes::BAD_ARGUMENTS,
-                "Dictionary structure type of 'range_min' and 'range_max' should be an integer, Date, DateTime, or Enum."
- " Actual 'range_min' and 'range_max' type is {}", - range_min->type->getName()); + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Dictionary structure type of 'range_min' and 'range_max' should be an integer, Date, DateTime, or Enum." + " Actual 'range_min' and 'range_max' type is {}", + range_min->type->getName()); } if ((range_min && !range_min->expression.empty()) || (range_max && !range_max->expression.empty())) diff --git a/src/Dictionaries/DictionaryStructure.h b/src/Dictionaries/DictionaryStructure.h index 4de00ddd259..817bc8d7824 100644 --- a/src/Dictionaries/DictionaryStructure.h +++ b/src/Dictionaries/DictionaryStructure.h @@ -129,7 +129,6 @@ struct DictionaryStructure final size_t getKeysSize() const; std::string getKeyDescription() const; - bool isKeySizeFixed() const; private: /// range_min and range_max have to be parsed before this function call diff --git a/src/Dictionaries/DirectDictionary.cpp b/src/Dictionaries/DirectDictionary.cpp index 12c624a6859..19bbcb6ca98 100644 --- a/src/Dictionaries/DirectDictionary.cpp +++ b/src/Dictionaries/DirectDictionary.cpp @@ -2,7 +2,6 @@ #include #include -#include #include #include diff --git a/src/Dictionaries/DirectDictionary.h b/src/Dictionaries/DirectDictionary.h index 4bf24e6ae98..de18e9486e6 100644 --- a/src/Dictionaries/DirectDictionary.h +++ b/src/Dictionaries/DirectDictionary.h @@ -3,15 +3,12 @@ #include #include #include -#include -#include -#include -#include -#include -#include "DictionaryStructure.h" -#include "IDictionary.h" -#include "IDictionarySource.h" -#include "DictionaryHelpers.h" + +#include +#include +#include +#include + namespace DB { diff --git a/src/Dictionaries/ExecutableDictionarySource.cpp b/src/Dictionaries/ExecutableDictionarySource.cpp index 8d10a6665cf..7a3550e7284 100644 --- a/src/Dictionaries/ExecutableDictionarySource.cpp +++ b/src/Dictionaries/ExecutableDictionarySource.cpp @@ -1,10 +1,16 @@ #include "ExecutableDictionarySource.h" +#include + +#include + #include #include +#include #include #include +#include #include #include @@ -27,15 +33,46 @@ namespace ErrorCodes extern const int UNSUPPORTED_METHOD; } +namespace +{ + + void updateCommandIfNeeded(String & command, bool execute_direct, ContextPtr context) + { + if (!execute_direct) + return; + + auto global_context = context->getGlobalContext(); + auto user_scripts_path = global_context->getUserScriptsPath(); + auto script_path = user_scripts_path + '/' + command; + + if (!fileOrSymlinkPathStartsWith(script_path, user_scripts_path)) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Executable file {} must be inside user scripts folder {}", + command, + user_scripts_path); + + if (!std::filesystem::exists(std::filesystem::path(script_path))) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Executable file {} does not exist inside user scripts folder {}", + command, + user_scripts_path); + + command = std::move(script_path); + } + +} + ExecutableDictionarySource::ExecutableDictionarySource( const DictionaryStructure & dict_struct_, const Configuration & configuration_, Block & sample_block_, + std::shared_ptr coordinator_, ContextPtr context_) : log(&Poco::Logger::get("ExecutableDictionarySource")) , dict_struct(dict_struct_) , configuration(configuration_) - , sample_block{sample_block_} + , sample_block(sample_block_) + , coordinator(std::move(coordinator_)) , context(context_) { /// Remove keys from sample_block for implicit_key dictionary because @@ -58,6 +95,7 @@ ExecutableDictionarySource::ExecutableDictionarySource(const 
ExecutableDictionar , dict_struct(other.dict_struct) , configuration(other.configuration) , sample_block(other.sample_block) + , coordinator(other.coordinator) , context(Context::createCopy(other.context)) { } @@ -69,11 +107,11 @@ Pipe ExecutableDictionarySource::loadAll() LOG_TRACE(log, "loadAll {}", toString()); - ShellCommand::Config config(configuration.command); - auto process = ShellCommand::execute(config); + const auto & coordinator_configuration = coordinator->getConfiguration(); + auto command = configuration.command; + updateCommandIfNeeded(command, coordinator_configuration.execute_direct, context); - Pipe pipe(std::make_unique(context, configuration.format, sample_block, std::move(process))); - return pipe; + return coordinator->createPipe(command, configuration.command_arguments, sample_block, context); } Pipe ExecutableDictionarySource::loadUpdatedAll() @@ -82,17 +120,32 @@ Pipe ExecutableDictionarySource::loadUpdatedAll() throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "ExecutableDictionarySource with implicit_key does not support loadUpdatedAll method"); time_t new_update_time = time(nullptr); - SCOPE_EXIT(update_time = new_update_time); - std::string command_with_update_field = configuration.command; + const auto & coordinator_configuration = coordinator->getConfiguration(); + auto command = configuration.command; + updateCommandIfNeeded(command, coordinator_configuration.execute_direct, context); + + auto command_arguments = configuration.command_arguments; + if (update_time) - command_with_update_field += " " + configuration.update_field + " " + DB::toString(LocalDateTime(update_time - configuration.update_lag)); + { + auto update_difference = DB::toString(LocalDateTime(update_time - configuration.update_lag)); - LOG_TRACE(log, "loadUpdatedAll {}", command_with_update_field); - ShellCommand::Config config(command_with_update_field); - auto process = ShellCommand::execute(config); - Pipe pipe(std::make_unique(context, configuration.format, sample_block, std::move(process))); - return pipe; + if (coordinator_configuration.execute_direct) + { + command_arguments.emplace_back(configuration.update_field); + command_arguments.emplace_back(std::move(update_difference)); + } + else + { + command += ' ' + configuration.update_field + ' ' + update_difference; + } + } + + update_time = new_update_time; + + LOG_TRACE(log, "loadUpdatedAll {}", command); + return coordinator->createPipe(command, command_arguments, sample_block, context); } Pipe ExecutableDictionarySource::loadIds(const std::vector & ids) @@ -113,27 +166,17 @@ Pipe ExecutableDictionarySource::loadKeys(const Columns & key_columns, const std Pipe ExecutableDictionarySource::getStreamForBlock(const Block & block) { - ShellCommand::Config config(configuration.command); - auto process = ShellCommand::execute(config); - auto * process_in = &process->in; + const auto & coordinator_configuration = coordinator->getConfiguration(); + String command = configuration.command; + updateCommandIfNeeded(command, coordinator_configuration.execute_direct, context); - ShellCommandSource::SendDataTask task = {[process_in, block, this]() - { - auto & out = *process_in; + auto source = std::make_shared(block); + auto shell_input_pipe = Pipe(std::move(source)); - if (configuration.send_chunk_header) - { - writeText(block.rows(), out); - writeChar('\n', out); - } + Pipes shell_input_pipes; + shell_input_pipes.emplace_back(std::move(shell_input_pipe)); - auto output_format = context->getOutputFormat(configuration.format, out, 
block.cloneEmpty()); - formatBlock(output_format, block); - out.close(); - }}; - std::vector tasks = {std::move(task)}; - - Pipe pipe(std::make_unique(context, configuration.format, sample_block, std::move(process), std::move(tasks))); + auto pipe = coordinator->createPipe(command, configuration.command_arguments, std::move(shell_input_pipes), sample_block, context); if (configuration.implicit_key) pipe.addTransform(std::make_shared(block, pipe.getHeader())); @@ -189,17 +232,40 @@ void registerDictionarySourceExecutable(DictionarySourceFactory & factory) std::string settings_config_prefix = config_prefix + ".executable"; + bool execute_direct = config.getBool(settings_config_prefix + ".execute_direct", false); + std::string command_value = config.getString(settings_config_prefix + ".command"); + std::vector command_arguments; + + if (execute_direct) + { + boost::split(command_arguments, command_value, [](char c) { return c == ' '; }); + + command_value = std::move(command_arguments[0]); + command_arguments.erase(command_arguments.begin()); + } + ExecutableDictionarySource::Configuration configuration { - .command = config.getString(settings_config_prefix + ".command"), - .format = config.getString(settings_config_prefix + ".format"), + .command = std::move(command_value), + .command_arguments = std::move(command_arguments), .update_field = config.getString(settings_config_prefix + ".update_field", ""), .update_lag = config.getUInt64(settings_config_prefix + ".update_lag", 1), .implicit_key = config.getBool(settings_config_prefix + ".implicit_key", false), - .send_chunk_header = config.getBool(settings_config_prefix + ".send_chunk_header", false) }; - return std::make_unique(dict_struct, configuration, sample_block, context); + ShellCommandSourceCoordinator::Configuration shell_command_coordinator_configration + { + .format = config.getString(settings_config_prefix + ".format"), + .command_termination_timeout_seconds = config.getUInt64(settings_config_prefix + ".command_termination_timeout", 10), + .command_read_timeout_milliseconds = config.getUInt64(settings_config_prefix + ".command_read_timeout", 10000), + .command_write_timeout_milliseconds = config.getUInt64(settings_config_prefix + ".command_write_timeout", 10000), + .is_executable_pool = false, + .send_chunk_header = config.getBool(settings_config_prefix + ".send_chunk_header", false), + .execute_direct = config.getBool(settings_config_prefix + ".execute_direct", false) + }; + + auto coordinator = std::make_shared(shell_command_coordinator_configration); + return std::make_unique(dict_struct, configuration, sample_block, std::move(coordinator), context); }; factory.registerSource("executable", create_table_source); diff --git a/src/Dictionaries/ExecutableDictionarySource.h b/src/Dictionaries/ExecutableDictionarySource.h index a7ffc8bebcb..6c5d2de3714 100644 --- a/src/Dictionaries/ExecutableDictionarySource.h +++ b/src/Dictionaries/ExecutableDictionarySource.h @@ -7,6 +7,7 @@ #include #include +#include namespace DB @@ -20,20 +21,19 @@ public: struct Configuration { std::string command; - std::string format; + std::vector command_arguments; std::string update_field; UInt64 update_lag; /// Implicit key means that the source script will return only values, /// and the correspondence to the requested keys is determined implicitly - by the order of rows in the result. 
bool implicit_key; - /// Send number_of_rows\n before sending chunk to process - bool send_chunk_header; }; ExecutableDictionarySource( const DictionaryStructure & dict_struct_, const Configuration & configuration_, Block & sample_block_, + std::shared_ptr coordinator_, ContextPtr context_); ExecutableDictionarySource(const ExecutableDictionarySource & other); @@ -69,6 +69,7 @@ private: const DictionaryStructure dict_struct; const Configuration configuration; Block sample_block; + std::shared_ptr coordinator; ContextPtr context; }; diff --git a/src/Dictionaries/ExecutablePoolDictionarySource.cpp b/src/Dictionaries/ExecutablePoolDictionarySource.cpp index a0eb3435a11..48ddeed7fa6 100644 --- a/src/Dictionaries/ExecutablePoolDictionarySource.cpp +++ b/src/Dictionaries/ExecutablePoolDictionarySource.cpp @@ -1,14 +1,20 @@ #include "ExecutablePoolDictionarySource.h" +#include + +#include + #include #include +#include #include +#include +#include +#include #include #include -#include -#include #include #include @@ -23,20 +29,19 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; extern const int DICTIONARY_ACCESS_DENIED; extern const int UNSUPPORTED_METHOD; - extern const int TIMEOUT_EXCEEDED; } ExecutablePoolDictionarySource::ExecutablePoolDictionarySource( const DictionaryStructure & dict_struct_, const Configuration & configuration_, Block & sample_block_, + std::shared_ptr coordinator_, ContextPtr context_) : dict_struct(dict_struct_) , configuration(configuration_) , sample_block(sample_block_) + , coordinator(std::move(coordinator_)) , context(context_) - /// If pool size == 0 then there is no size restrictions. Poco max size of semaphore is integer type. - , process_pool(std::make_shared(configuration.pool_size == 0 ? std::numeric_limits::max() : configuration.pool_size)) , log(&Poco::Logger::get("ExecutablePoolDictionarySource")) { /// Remove keys from sample_block for implicit_key dictionary because @@ -59,8 +64,8 @@ ExecutablePoolDictionarySource::ExecutablePoolDictionarySource(const ExecutableP : dict_struct(other.dict_struct) , configuration(other.configuration) , sample_block(other.sample_block) + , coordinator(other.coordinator) , context(Context::createCopy(other.context)) - , process_pool(std::make_shared(configuration.pool_size)) , log(&Poco::Logger::get("ExecutablePoolDictionarySource")) { } @@ -93,41 +98,47 @@ Pipe ExecutablePoolDictionarySource::loadKeys(const Columns & key_columns, const Pipe ExecutablePoolDictionarySource::getStreamForBlock(const Block & block) { - std::unique_ptr process; - bool result = process_pool->tryBorrowObject(process, [this]() + String command = configuration.command; + const auto & coordinator_configuration = coordinator->getConfiguration(); + + if (coordinator_configuration.execute_direct) { - ShellCommand::Config config(configuration.command); - config.terminate_in_destructor_strategy = ShellCommand::DestructorStrategy{ true /*terminate_in_destructor*/, configuration.command_termination_timeout }; - auto shell_command = ShellCommand::execute(config); - return shell_command; - }, configuration.max_command_execution_time * 10000); + auto global_context = context->getGlobalContext(); + auto user_scripts_path = global_context->getUserScriptsPath(); + auto script_path = user_scripts_path + '/' + command; - if (!result) - throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, - "Could not get process from pool, max command execution timeout exceeded {} seconds", - configuration.max_command_execution_time); + if (!fileOrSymlinkPathStartsWith(script_path, 
user_scripts_path)) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Executable file {} must be inside user scripts folder {}", + command, + user_scripts_path); - size_t rows_to_read = block.rows(); - auto * process_in = &process->in; - ShellCommandSource::SendDataTask task = [process_in, block, this]() mutable - { - auto & out = *process_in; + if (!std::filesystem::exists(std::filesystem::path(script_path))) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Executable file {} does not exist inside user scripts folder {}", + command, + user_scripts_path); - if (configuration.send_chunk_header) - { - writeText(block.rows(), out); - writeChar('\n', out); - } + command = std::move(script_path); + } - auto output_format = context->getOutputFormat(configuration.format, out, block.cloneEmpty()); - formatBlock(output_format, block); - }; - std::vector tasks = {std::move(task)}; + auto source = std::make_shared(block); + auto shell_input_pipe = Pipe(std::move(source)); ShellCommandSourceConfiguration command_configuration; command_configuration.read_fixed_number_of_rows = true; - command_configuration.number_of_rows_to_read = rows_to_read; - Pipe pipe(std::make_unique(context, configuration.format, sample_block, std::move(process), std::move(tasks), command_configuration, process_pool)); + command_configuration.number_of_rows_to_read = block.rows(); + + Pipes shell_input_pipes; + shell_input_pipes.emplace_back(std::move(shell_input_pipe)); + + auto pipe = coordinator->createPipe( + command, + configuration.command_arguments, + std::move(shell_input_pipes), + sample_block, + context, + command_configuration); if (configuration.implicit_key) pipe.addTransform(std::make_shared(block, pipe.getHeader())); @@ -157,7 +168,8 @@ DictionarySourcePtr ExecutablePoolDictionarySource::clone() const std::string ExecutablePoolDictionarySource::toString() const { - return "ExecutablePool size: " + std::to_string(configuration.pool_size) + " command: " + configuration.command; + size_t pool_size = coordinator->getConfiguration().pool_size; + return "ExecutablePool size: " + std::to_string(pool_size) + " command: " + configuration.command; } void registerDictionarySourceExecutablePool(DictionarySourceFactory & factory) @@ -189,18 +201,40 @@ void registerDictionarySourceExecutablePool(DictionarySourceFactory & factory) if (max_execution_time_seconds != 0 && max_command_execution_time > max_execution_time_seconds) max_command_execution_time = max_execution_time_seconds; + bool execute_direct = config.getBool(settings_config_prefix + ".execute_direct", false); + std::string command_value = config.getString(settings_config_prefix + ".command"); + std::vector command_arguments; + + if (execute_direct) + { + boost::split(command_arguments, command_value, [](char c) { return c == ' '; }); + + command_value = std::move(command_arguments[0]); + command_arguments.erase(command_arguments.begin()); + } + ExecutablePoolDictionarySource::Configuration configuration { - .command = config.getString(settings_config_prefix + ".command"), - .format = config.getString(settings_config_prefix + ".format"), - .pool_size = config.getUInt64(settings_config_prefix + ".size"), - .command_termination_timeout = config.getUInt64(settings_config_prefix + ".command_termination_timeout", 10), - .max_command_execution_time = max_command_execution_time, + .command = std::move(command_value), + .command_arguments = std::move(command_arguments), .implicit_key = config.getBool(settings_config_prefix + ".implicit_key", false), - .send_chunk_header 
= config.getBool(settings_config_prefix + ".send_chunk_header", false) }; - return std::make_unique(dict_struct, configuration, sample_block, context); + ShellCommandSourceCoordinator::Configuration shell_command_coordinator_configration + { + .format = config.getString(settings_config_prefix + ".format"), + .command_termination_timeout_seconds = config.getUInt64(settings_config_prefix + ".command_termination_timeout", 10), + .command_read_timeout_milliseconds = config.getUInt64(settings_config_prefix + ".command_read_timeout", 10000), + .command_write_timeout_milliseconds = config.getUInt64(settings_config_prefix + ".command_write_timeout", 10000), + .pool_size = config.getUInt64(settings_config_prefix + ".pool_size", 16), + .max_command_execution_time_seconds = max_command_execution_time, + .is_executable_pool = true, + .send_chunk_header = config.getBool(settings_config_prefix + ".send_chunk_header", false), + .execute_direct = execute_direct + }; + + auto coordinator = std::make_shared(shell_command_coordinator_configration); + return std::make_unique(dict_struct, configuration, sample_block, std::move(coordinator), context); }; factory.registerSource("executable_pool", create_table_source); diff --git a/src/Dictionaries/ExecutablePoolDictionarySource.h b/src/Dictionaries/ExecutablePoolDictionarySource.h index 51215b6311b..b9b3b8efb1b 100644 --- a/src/Dictionaries/ExecutablePoolDictionarySource.h +++ b/src/Dictionaries/ExecutablePoolDictionarySource.h @@ -28,21 +28,15 @@ public: struct Configuration { String command; - String format; - size_t pool_size; - size_t command_termination_timeout; - size_t max_command_execution_time; - /// Implicit key means that the source script will return only values, - /// and the correspondence to the requested keys is determined implicitly - by the order of rows in the result. + std::vector command_arguments; bool implicit_key; - /// Send number_of_rows\n before sending chunk to process - bool send_chunk_header; }; ExecutablePoolDictionarySource( const DictionaryStructure & dict_struct_, const Configuration & configuration_, Block & sample_block_, + std::shared_ptr coordinator_, ContextPtr context_); ExecutablePoolDictionarySource(const ExecutablePoolDictionarySource & other); @@ -77,8 +71,8 @@ private: const Configuration configuration; Block sample_block; + std::shared_ptr coordinator; ContextPtr context; - std::shared_ptr process_pool; Poco::Logger * log; }; diff --git a/src/Dictionaries/FlatDictionary.cpp b/src/Dictionaries/FlatDictionary.cpp index de4ae66300a..c260924a82b 100644 --- a/src/Dictionaries/FlatDictionary.cpp +++ b/src/Dictionaries/FlatDictionary.cpp @@ -399,9 +399,6 @@ void FlatDictionary::calculateBytesAllocated() } bucket_count = container.capacity(); - - if constexpr (std::is_same_v) - bytes_allocated += sizeof(Arena) + attribute.string_arena->size(); }; callOnDictionaryAttributeType(attribute.type, type_call); @@ -414,12 +411,14 @@ void FlatDictionary::calculateBytesAllocated() if (update_field_loaded_block) bytes_allocated += update_field_loaded_block->allocatedBytes(); + + bytes_allocated += string_arena.size(); } FlatDictionary::Attribute FlatDictionary::createAttribute(const DictionaryAttribute & dictionary_attribute) { auto is_nullable_set = dictionary_attribute.is_nullable ? 
std::make_optional() : std::optional{}; - Attribute attribute{dictionary_attribute.underlying_type, std::move(is_nullable_set), {}, {}}; + Attribute attribute{dictionary_attribute.underlying_type, std::move(is_nullable_set), {}}; auto type_call = [&](const auto & dictionary_attribute_type) { @@ -427,9 +426,6 @@ FlatDictionary::Attribute FlatDictionary::createAttribute(const DictionaryAttrib using AttributeType = typename Type::AttributeType; using ValueType = DictionaryValueType; - if constexpr (std::is_same_v) - attribute.string_arena = std::make_unique(); - attribute.container.emplace>(configuration.initial_array_size, ValueType()); }; @@ -510,8 +506,8 @@ void FlatDictionary::setAttributeValueImpl(Attribute & attribute, UInt64 key, co template <> void FlatDictionary::setAttributeValueImpl(Attribute & attribute, UInt64 key, const String & value) { - const auto * string_in_arena = attribute.string_arena->insert(value.data(), value.size()); - setAttributeValueImpl(attribute, key, StringRef{string_in_arena, value.size()}); + auto arena_value = copyStringInArena(string_arena, value); + setAttributeValueImpl(attribute, key, arena_value); } void FlatDictionary::setAttributeValue(Attribute & attribute, const UInt64 key, const Field & value) diff --git a/src/Dictionaries/FlatDictionary.h b/src/Dictionaries/FlatDictionary.h index 308cd72d55b..e8f40ea1d66 100644 --- a/src/Dictionaries/FlatDictionary.h +++ b/src/Dictionaries/FlatDictionary.h @@ -133,8 +133,6 @@ private: ContainerType, ContainerType> container; - - std::unique_ptr string_arena; }; void createAttributes(); @@ -176,6 +174,7 @@ private: mutable std::atomic found_count{0}; BlockPtr update_field_loaded_block; + Arena string_arena; }; } diff --git a/src/Dictionaries/HashedArrayDictionary.cpp b/src/Dictionaries/HashedArrayDictionary.cpp index a92f8bc1191..062620fb25b 100644 --- a/src/Dictionaries/HashedArrayDictionary.cpp +++ b/src/Dictionaries/HashedArrayDictionary.cpp @@ -352,8 +352,7 @@ void HashedArrayDictionary::createAttributes() using ValueType = DictionaryValueType; auto is_index_null = dictionary_attribute.is_nullable ? std::make_optional>() : std::optional>{}; - std::unique_ptr string_arena = std::is_same_v ? 
std::make_unique() : nullptr; - Attribute attribute{dictionary_attribute.underlying_type, AttributeContainerType(), std::move(is_index_null), std::move(string_arena)}; + Attribute attribute{dictionary_attribute.underlying_type, AttributeContainerType(), std::move(is_index_null)}; attributes.emplace_back(std::move(attribute)); }; @@ -431,7 +430,7 @@ void HashedArrayDictionary::blockToAttributes(const Block & } if constexpr (std::is_same_v) - key = copyKeyInArena(key); + key = copyStringInArena(string_arena, key); key_attribute.container.insert({key, element_count}); @@ -466,11 +465,7 @@ void HashedArrayDictionary::blockToAttributes(const Block & if constexpr (std::is_same_v) { String & value_to_insert = column_value_to_insert.get(); - size_t value_to_insert_size = value_to_insert.size(); - - const char * string_in_arena = attribute.string_arena->insert(value_to_insert.data(), value_to_insert_size); - - StringRef string_in_arena_reference = StringRef{string_in_arena, value_to_insert_size}; + StringRef string_in_arena_reference = copyStringInArena(string_arena, value_to_insert); attribute_container.back() = string_in_arena_reference; } else @@ -676,16 +671,6 @@ void HashedArrayDictionary::getItemsImpl( } } -template -StringRef HashedArrayDictionary::copyKeyInArena(StringRef key) -{ - size_t key_size = key.size; - char * place_for_key = complex_key_arena.alloc(key_size); - memcpy(reinterpret_cast(place_for_key), reinterpret_cast(key.data), key_size); - StringRef updated_key{place_for_key, key_size}; - return updated_key; -} - template void HashedArrayDictionary::loadData() { @@ -742,21 +727,15 @@ void HashedArrayDictionary::calculateBytesAllocated() } bucket_count = container.capacity(); - - if constexpr (std::is_same_v) - bytes_allocated += sizeof(Arena) + attribute.string_arena->size(); }; callOnDictionaryAttributeType(attribute.type, type_call); - if (attribute.string_arena) - bytes_allocated += attribute.string_arena->size(); - if (attribute.is_index_null.has_value()) bytes_allocated += (*attribute.is_index_null).size(); } - bytes_allocated += complex_key_arena.size(); + bytes_allocated += string_arena.size(); if (update_field_loaded_block) bytes_allocated += update_field_loaded_block->allocatedBytes(); diff --git a/src/Dictionaries/HashedArrayDictionary.h b/src/Dictionaries/HashedArrayDictionary.h index 0d07c43477a..80436a3d044 100644 --- a/src/Dictionaries/HashedArrayDictionary.h +++ b/src/Dictionaries/HashedArrayDictionary.h @@ -155,7 +155,6 @@ private: container; std::optional> is_index_null; - std::unique_ptr string_arena; }; struct KeyAttribute final @@ -205,8 +204,6 @@ private: void resize(size_t added_rows); - StringRef copyKeyInArena(StringRef key); - const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; const HashedArrayDictionaryStorageConfiguration configuration; @@ -222,7 +219,7 @@ private: mutable std::atomic found_count{0}; BlockPtr update_field_loaded_block; - Arena complex_key_arena; + Arena string_arena; }; extern template class HashedArrayDictionary; diff --git a/src/Dictionaries/HashedDictionary.cpp b/src/Dictionaries/HashedDictionary.cpp index f7627d5817f..b4f3aece174 100644 --- a/src/Dictionaries/HashedDictionary.cpp +++ b/src/Dictionaries/HashedDictionary.cpp @@ -239,7 +239,7 @@ ColumnPtr HashedDictionary::getHierarchy(ColumnPtr if (it != parent_keys_map.end()) result = getValueFromCell(it); - keys_found +=result.has_value(); + keys_found += result.has_value(); return result; }; @@ -354,8 +354,7 @@ void HashedDictionary::createAttributes() 
using ValueType = DictionaryValueType; auto is_nullable_set = dictionary_attribute.is_nullable ? std::make_optional() : std::optional{}; - std::unique_ptr string_arena = std::is_same_v ? std::make_unique() : nullptr; - Attribute attribute{dictionary_attribute.underlying_type, std::move(is_nullable_set), CollectionType(), std::move(string_arena)}; + Attribute attribute{dictionary_attribute.underlying_type, std::move(is_nullable_set), CollectionType()}; attributes.emplace_back(std::move(attribute)); }; @@ -449,7 +448,7 @@ void HashedDictionary::blockToAttributes(const Bloc } if constexpr (std::is_same_v) - key = copyKeyInArena(key); + key = copyStringInArena(string_arena, key); attribute_column.get(key_index, column_value_to_insert); @@ -463,12 +462,8 @@ void HashedDictionary::blockToAttributes(const Bloc if constexpr (std::is_same_v) { String & value_to_insert = column_value_to_insert.get(); - size_t value_to_insert_size = value_to_insert.size(); - - const char * string_in_arena = attribute.string_arena->insert(value_to_insert.data(), value_to_insert_size); - - StringRef string_in_arena_reference = StringRef{string_in_arena, value_to_insert_size}; - container.insert({key, string_in_arena_reference}); + StringRef arena_value = copyStringInArena(string_arena, value_to_insert); + container.insert({key, arena_value}); } else { @@ -548,16 +543,6 @@ void HashedDictionary::getItemsImpl( found_count.fetch_add(keys_found, std::memory_order_relaxed); } -template -StringRef HashedDictionary::copyKeyInArena(StringRef key) -{ - size_t key_size = key.size; - char * place_for_key = complex_key_arena.alloc(key_size); - memcpy(reinterpret_cast(place_for_key), reinterpret_cast(key.data), key_size); - StringRef updated_key{place_for_key, key_size}; - return updated_key; -} - template void HashedDictionary::loadData() { @@ -591,7 +576,9 @@ void HashedDictionary::loadData() } } else + { updateData(); + } if (configuration.require_nonempty && 0 == element_count) throw Exception(ErrorCodes::DICTIONARY_IS_EMPTY, @@ -629,16 +616,13 @@ void HashedDictionary::calculateBytesAllocated() } }); - if (attributes[i].string_arena) - bytes_allocated += attributes[i].string_arena->size(); - bytes_allocated += sizeof(attributes[i].is_nullable_set); if (attributes[i].is_nullable_set.has_value()) bytes_allocated = attributes[i].is_nullable_set->getBufferSizeInBytes(); } - bytes_allocated += complex_key_arena.size(); + bytes_allocated += string_arena.size(); if (update_field_loaded_block) bytes_allocated += update_field_loaded_block->allocatedBytes(); diff --git a/src/Dictionaries/HashedDictionary.h b/src/Dictionaries/HashedDictionary.h index 6f63c5ec546..c1761944b14 100644 --- a/src/Dictionaries/HashedDictionary.h +++ b/src/Dictionaries/HashedDictionary.h @@ -173,8 +173,6 @@ private: CollectionType, CollectionType> container; - - std::unique_ptr string_arena; }; void createAttributes(); @@ -202,8 +200,6 @@ private: void resize(size_t added_rows); - StringRef copyKeyInArena(StringRef key); - const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; const HashedDictionaryStorageConfiguration configuration; @@ -217,7 +213,7 @@ private: mutable std::atomic found_count{0}; BlockPtr update_field_loaded_block; - Arena complex_key_arena; + Arena string_arena; }; extern template class HashedDictionary; diff --git a/src/Dictionaries/MongoDBDictionarySource.cpp b/src/Dictionaries/MongoDBDictionarySource.cpp index 1e8be726941..7c720691c3c 100644 --- a/src/Dictionaries/MongoDBDictionarySource.cpp +++ 
b/src/Dictionaries/MongoDBDictionarySource.cpp @@ -8,6 +8,9 @@ namespace DB { +static const std::unordered_set dictionary_allowed_keys = { + "host", "port", "user", "password", "db", "database", "uri", "collection", "name", "method"}; + void registerDictionarySourceMongoDB(DictionarySourceFactory & factory) { auto create_mongo_db_dictionary = []( @@ -21,7 +24,8 @@ void registerDictionarySourceMongoDB(DictionarySourceFactory & factory) { const auto config_prefix = root_config_prefix + ".mongodb"; ExternalDataSourceConfiguration configuration; - auto named_collection = getExternalDataSourceConfiguration(config, config_prefix, context); + auto has_config_key = [](const String & key) { return dictionary_allowed_keys.contains(key); }; + auto named_collection = getExternalDataSourceConfiguration(config, config_prefix, context, has_config_key); if (named_collection) { configuration = *named_collection; diff --git a/src/Dictionaries/MySQLDictionarySource.cpp b/src/Dictionaries/MySQLDictionarySource.cpp index 0bf5cc3cae0..5bfb6273e8d 100644 --- a/src/Dictionaries/MySQLDictionarySource.cpp +++ b/src/Dictionaries/MySQLDictionarySource.cpp @@ -30,6 +30,18 @@ namespace ErrorCodes extern const int UNSUPPORTED_METHOD; } +static const std::unordered_set dictionary_allowed_keys = { + "host", "port", "user", "password", + "db", "database", "table", "schema", + "update_field", "invalidate_query", "priority", + "update_tag", "dont_check_update_time", + "query", "where", "name" /* name_collection */, "socket", + "share_connection", "fail_on_connection_loss", "close_connection", + "ssl_ca", "ssl_cert", "ssl_key", + "enable_local_infile", "opt_reconnect", + "connect_timeout", "mysql_connect_timeout", + "mysql_rw_timeout", "rw_timeout"}; + void registerDictionarySourceMysql(DictionarySourceFactory & factory) { auto create_table_source = [=]([[maybe_unused]] const DictionaryStructure & dict_struct, @@ -48,8 +60,11 @@ void registerDictionarySourceMysql(DictionarySourceFactory & factory) auto settings_config_prefix = config_prefix + ".mysql"; std::shared_ptr pool; + auto has_config_key = [](const String & key) { return dictionary_allowed_keys.contains(key) || key.starts_with("replica"); }; StorageMySQLConfiguration configuration; - auto named_collection = created_from_ddl ? getExternalDataSourceConfiguration(config, settings_config_prefix, global_context) : std::nullopt; + auto named_collection = created_from_ddl + ? 
getExternalDataSourceConfiguration(config, settings_config_prefix, global_context, has_config_key) + : std::nullopt; if (named_collection) { configuration.set(*named_collection); diff --git a/src/Dictionaries/PolygonDictionary.h b/src/Dictionaries/PolygonDictionary.h index 762c136b8e0..50810e250cb 100644 --- a/src/Dictionaries/PolygonDictionary.h +++ b/src/Dictionaries/PolygonDictionary.h @@ -3,16 +3,14 @@ #include #include #include -#include -#include -#include #include #include -#include "DictionaryStructure.h" -#include "IDictionary.h" -#include "IDictionarySource.h" -#include "DictionaryHelpers.h" +#include +#include +#include +#include + namespace DB { diff --git a/src/Dictionaries/PostgreSQLDictionarySource.cpp b/src/Dictionaries/PostgreSQLDictionarySource.cpp index 0ac84b35048..9af3ea06838 100644 --- a/src/Dictionaries/PostgreSQLDictionarySource.cpp +++ b/src/Dictionaries/PostgreSQLDictionarySource.cpp @@ -28,6 +28,10 @@ namespace ErrorCodes static const UInt64 max_block_size = 8192; +static const std::unordered_set dictionary_allowed_keys = { + "host", "port", "user", "password", "db", "database", "table", "schema", + "update_field", "update_tag", "invalidate_query", "query", "where", "name", "priority"}; + namespace { ExternalQueryBuilder makeExternalQueryBuilder(const DictionaryStructure & dict_struct, const String & schema, const String & table, const String & query, const String & where) @@ -185,8 +189,8 @@ void registerDictionarySourcePostgreSQL(DictionarySourceFactory & factory) { #if USE_LIBPQXX const auto settings_config_prefix = config_prefix + ".postgresql"; - - auto configuration = getExternalDataSourceConfigurationByPriority(config, settings_config_prefix, context); + auto has_config_key = [](const String & key) { return dictionary_allowed_keys.contains(key) || key.starts_with("replica"); }; + auto configuration = getExternalDataSourceConfigurationByPriority(config, settings_config_prefix, context, has_config_key); auto pool = std::make_shared( configuration.replicas_configurations, context->getSettingsRef().postgresql_connection_pool_size, diff --git a/src/Dictionaries/RangeHashedDictionary.cpp b/src/Dictionaries/RangeHashedDictionary.cpp index 7dc955eb8f7..9dcc38dc4b2 100644 --- a/src/Dictionaries/RangeHashedDictionary.cpp +++ b/src/Dictionaries/RangeHashedDictionary.cpp @@ -345,9 +345,6 @@ void RangeHashedDictionary::calculateBytesAllocated() const auto & collection = std::get>(attribute.maps); bytes_allocated += sizeof(CollectionType) + collection.getBufferSizeInBytes(); bucket_count = collection.getBufferSizeInCells(); - - if constexpr (std::is_same_v) - bytes_allocated += sizeof(Arena) + attribute.string_arena->size(); }; callOnDictionaryAttributeType(attribute.type, type_call); @@ -358,12 +355,14 @@ void RangeHashedDictionary::calculateBytesAllocated() if (update_field_loaded_block) bytes_allocated += update_field_loaded_block->allocatedBytes(); + + bytes_allocated += string_arena.size(); } template typename RangeHashedDictionary::Attribute RangeHashedDictionary::createAttribute(const DictionaryAttribute & dictionary_attribute) { - Attribute attribute{dictionary_attribute.underlying_type, dictionary_attribute.is_nullable, {}, {}}; + Attribute attribute{dictionary_attribute.underlying_type, dictionary_attribute.is_nullable, {}}; auto type_call = [&](const auto &dictionary_attribute_type) { @@ -371,9 +370,6 @@ typename RangeHashedDictionary::Attribute RangeHashedDictio using AttributeType = typename Type::AttributeType; using ValueType = DictionaryValueType; - if 
constexpr (std::is_same_v) - attribute.string_arena = std::make_unique(); - attribute.maps = CollectionType(); }; @@ -544,7 +540,7 @@ void RangeHashedDictionary::blockToAttributes(const Block & } if constexpr (std::is_same_v) - key = copyKeyInArena(key); + key = copyStringInArena(string_arena, key); setAttributeValue(attribute, key, Range{lower_bound, upper_bound}, attribute_column[key_index]); keys_extractor.rollbackCurrentKey(); @@ -572,8 +568,7 @@ void RangeHashedDictionary::setAttributeValueImpl(Attribute if constexpr (std::is_same_v) { const auto & string = value.get(); - const auto * string_in_arena = attribute.string_arena->insert(string.data(), string.size()); - const StringRef string_ref{string_in_arena, string.size()}; + StringRef string_ref = copyStringInArena(string_arena, string); value_to_insert = Value{ range, { string_ref }}; } else @@ -671,16 +666,6 @@ void RangeHashedDictionary::getKeysAndDates( } } -template -StringRef RangeHashedDictionary::copyKeyInArena(StringRef key) -{ - size_t key_size = key.size; - char * place_for_key = complex_key_arena.alloc(key_size); - memcpy(reinterpret_cast(place_for_key), reinterpret_cast(key.data), key_size); - StringRef updated_key{place_for_key, key_size}; - return updated_key; -} - template template PaddedPODArray RangeHashedDictionary::makeDateKeys( diff --git a/src/Dictionaries/RangeHashedDictionary.h b/src/Dictionaries/RangeHashedDictionary.h index fca72d5d7cc..a9b41a4c4d0 100644 --- a/src/Dictionaries/RangeHashedDictionary.h +++ b/src/Dictionaries/RangeHashedDictionary.h @@ -139,7 +139,6 @@ private: CollectionType, CollectionType> maps; - std::unique_ptr string_arena; }; void createAttributes(); @@ -162,9 +161,9 @@ private: void blockToAttributes(const Block & block); template - static void setAttributeValueImpl(Attribute & attribute, KeyType key, const Range & range, const Field & value); + void setAttributeValueImpl(Attribute & attribute, KeyType key, const Range & range, const Field & value); - static void setAttributeValue(Attribute & attribute, KeyType key, const Range & range, const Field & value); + void setAttributeValue(Attribute & attribute, KeyType key, const Range & range, const Field & value); template void getKeysAndDates( @@ -184,8 +183,6 @@ private: const PaddedPODArray & block_start_dates, const PaddedPODArray & block_end_dates) const; - StringRef copyKeyInArena(StringRef key); - const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; const DictionaryLifetime dict_lifetime; @@ -200,6 +197,7 @@ private: size_t bucket_count = 0; mutable std::atomic query_count{0}; mutable std::atomic found_count{0}; + Arena string_arena; }; } diff --git a/src/Dictionaries/SSDCacheDictionaryStorage.h b/src/Dictionaries/SSDCacheDictionaryStorage.h index 7f0ecdb5cb8..e30b0a257d9 100644 --- a/src/Dictionaries/SSDCacheDictionaryStorage.h +++ b/src/Dictionaries/SSDCacheDictionaryStorage.h @@ -1148,10 +1148,7 @@ private: if constexpr (dictionary_key_type == DictionaryKeyType::Complex) { /// Copy complex key into arena and put in cache - size_t key_size = key.size; - char * place_for_key = complex_key_arena.alloc(key_size); - memcpy(reinterpret_cast(place_for_key), reinterpret_cast(key.data), key_size); - KeyType updated_key{place_for_key, key_size}; + KeyType updated_key = copyStringInArena(complex_key_arena, key); ssd_cache_key.key = updated_key; } diff --git a/src/Dictionaries/getDictionaryConfigurationFromAST.cpp b/src/Dictionaries/getDictionaryConfigurationFromAST.cpp index 36a462c533e..d1ce665d002 100644 --- 
a/src/Dictionaries/getDictionaryConfigurationFromAST.cpp +++ b/src/Dictionaries/getDictionaryConfigurationFromAST.cpp @@ -35,7 +35,13 @@ namespace ErrorCodes namespace { -using NamesToTypeNames = std::unordered_map; +struct AttributeConfiguration +{ + std::string type; + std::string expression; +}; + +using AttributeNameToConfiguration = std::unordered_map; /// Get value from field and convert it to string. /// Also remove quotes from strings. @@ -46,6 +52,21 @@ String getFieldAsString(const Field & field) return applyVisitor(FieldVisitorToString(), field); } +String getAttributeExpression(const ASTDictionaryAttributeDeclaration * dict_attr) +{ + if (!dict_attr->expression) + return {}; + + /// EXPRESSION PROPERTY should be expression or string + String expression_str; + if (const auto * literal = dict_attr->expression->as(); literal && literal->value.getType() == Field::Types::String) + expression_str = getFieldAsString(literal->value); + else + expression_str = queryToString(dict_attr->expression); + + return expression_str; +} + using namespace Poco; using namespace Poco::XML; @@ -63,20 +84,19 @@ void buildLifetimeConfiguration( AutoPtr root, const ASTDictionaryLifetime * lifetime) { + if (!lifetime) + return; - if (lifetime) - { - AutoPtr lifetime_element(doc->createElement("lifetime")); - AutoPtr min_element(doc->createElement("min")); - AutoPtr max_element(doc->createElement("max")); - AutoPtr min_sec(doc->createTextNode(toString(lifetime->min_sec))); - min_element->appendChild(min_sec); - AutoPtr max_sec(doc->createTextNode(toString(lifetime->max_sec))); - max_element->appendChild(max_sec); - lifetime_element->appendChild(min_element); - lifetime_element->appendChild(max_element); - root->appendChild(lifetime_element); - } + AutoPtr lifetime_element(doc->createElement("lifetime")); + AutoPtr min_element(doc->createElement("min")); + AutoPtr max_element(doc->createElement("max")); + AutoPtr min_sec(doc->createTextNode(toString(lifetime->min_sec))); + min_element->appendChild(min_sec); + AutoPtr max_sec(doc->createTextNode(toString(lifetime->max_sec))); + max_element->appendChild(max_sec); + lifetime_element->appendChild(min_element); + lifetime_element->appendChild(max_element); + root->appendChild(lifetime_element); } /* Transforms next definition @@ -105,40 +125,43 @@ void buildLayoutConfiguration( AutoPtr layout_type_element(doc->createElement(layout->layout_type)); layout_element->appendChild(layout_type_element); - if (layout->parameters) + if (!layout->parameters) + return; + + for (const auto & param : layout->parameters->children) { - for (const auto & param : layout->parameters->children) + const ASTPair * pair = param->as(); + if (!pair) { - const ASTPair * pair = param->as(); - if (!pair) - { - throw DB::Exception(ErrorCodes::BAD_ARGUMENTS, "Dictionary layout parameters must be key/value pairs, got '{}' instead", - param->formatForErrorMessage()); - } - - const ASTLiteral * value_literal = pair->second->as(); - if (!value_literal) - { - throw DB::Exception(ErrorCodes::BAD_ARGUMENTS, - "Dictionary layout parameter value must be a literal, got '{}' instead", - pair->second->formatForErrorMessage()); - } - - const auto value_field = value_literal->value; - - if (value_field.getType() != Field::Types::UInt64 - && value_field.getType() != Field::Types::String) - { - throw DB::Exception(ErrorCodes::BAD_ARGUMENTS, - "Dictionary layout parameter value must be an UInt64 or String, got '{}' instead", - value_field.getTypeName()); - } - - AutoPtr 
layout_type_parameter_element(doc->createElement(pair->first));
-            AutoPtr value_to_append(doc->createTextNode(toString(value_field)));
-            layout_type_parameter_element->appendChild(value_to_append);
-            layout_type_element->appendChild(layout_type_parameter_element);
+            throw DB::Exception(
+                ErrorCodes::BAD_ARGUMENTS,
+                "Dictionary layout parameters must be key/value pairs, got '{}' instead",
+                param->formatForErrorMessage());
         }
+
+        const ASTLiteral * value_literal = pair->second->as();
+        if (!value_literal)
+        {
+            throw DB::Exception(
+                ErrorCodes::BAD_ARGUMENTS,
+                "Dictionary layout parameter value must be a literal, got '{}' instead",
+                pair->second->formatForErrorMessage());
+        }
+
+        const auto value_field = value_literal->value;
+
+        if (value_field.getType() != Field::Types::UInt64 && value_field.getType() != Field::Types::String)
+        {
+            throw DB::Exception(
+                ErrorCodes::BAD_ARGUMENTS,
+                "Dictionary layout parameter value must be an UInt64 or String, got '{}' instead",
+                value_field.getTypeName());
+        }
+
+        AutoPtr layout_type_parameter_element(doc->createElement(pair->first));
+        AutoPtr value_to_append(doc->createTextNode(toString(value_field)));
+        layout_type_parameter_element->appendChild(value_to_append);
+        layout_type_element->appendChild(layout_type_parameter_element);
     }
 }
@@ -149,10 +172,10 @@ void buildLayoutConfiguration(
  * StartDate
  * EndDate
  */
-void buildRangeConfiguration(AutoPtr doc, AutoPtr root, const ASTDictionaryRange * range, const NamesToTypeNames & all_attrs)
+void buildRangeConfiguration(AutoPtr doc, AutoPtr root, const ASTDictionaryRange * range, const AttributeNameToConfiguration & all_attrs)
 {
     // appends value to root
-    auto append_element = [&doc, &root](const std::string & key, const std::string & name, const std::string & type)
+    auto append_element = [&doc, &root](const std::string & key, const std::string & name, const AttributeConfiguration & configuration)
     {
         AutoPtr element(doc->createElement(key));
         AutoPtr name_node(doc->createElement("name"));
@@ -161,22 +184,33 @@ void buildRangeConfiguration(AutoPtr doc, AutoPtr root, const
         element->appendChild(name_node);

         AutoPtr type_node(doc->createElement("type"));
-        AutoPtr type_text(doc->createTextNode(type));
+        AutoPtr type_text(doc->createTextNode(configuration.type));
         type_node->appendChild(type_text);
         element->appendChild(type_node);

+        if (!configuration.expression.empty())
+        {
+            AutoPtr expression_node(doc->createElement("expression"));
+            AutoPtr expression_text(doc->createTextNode(configuration.expression));
+            expression_node->appendChild(expression_text);
+            element->appendChild(expression_node);
+        }
+
         root->appendChild(element);
     };

-    if (!all_attrs.count(range->min_attr_name))
+    auto range_min_attribute_it = all_attrs.find(range->min_attr_name);
+    if (range_min_attribute_it == all_attrs.end())
         throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION,
-            "MIN ({}) attribute is not defined in the dictionary attributes", range->min_attr_name);
-    if (!all_attrs.count(range->max_attr_name))
-        throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION,
-            "MAX ({}) attribute is not defined in the dictionary attributes", range->max_attr_name);
+            "MIN {} attribute is not defined in the dictionary attributes", range->min_attr_name);

-    append_element("range_min", range->min_attr_name, all_attrs.at(range->min_attr_name));
-    append_element("range_max", range->max_attr_name, all_attrs.at(range->max_attr_name));
+    auto range_max_attribute_it = all_attrs.find(range->max_attr_name);
+    if (range_max_attribute_it ==
all_attrs.end()) + throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, + "MAX {} attribute is not defined in the dictionary attributes", range->max_attr_name); + + append_element("range_min", range->min_attr_name, range_min_attribute_it->second); + append_element("range_max", range->max_attr_name, range_max_attribute_it->second); } @@ -199,25 +233,14 @@ void buildAttributeExpressionIfNeeded( AutoPtr root, const ASTDictionaryAttributeDeclaration * dict_attr) { - if (dict_attr->expression != nullptr) - { - AutoPtr expression_element(doc->createElement("expression")); + if (!dict_attr->expression) + return; - /// EXPRESSION PROPERTY should be expression or string - String expression_str; - if (const auto * literal = dict_attr->expression->as(); - literal && literal->value.getType() == Field::Types::String) - { - expression_str = getFieldAsString(literal->value); - } - else - expression_str = queryToString(dict_attr->expression); - - - AutoPtr expression(doc->createTextNode(expression_str)); - expression_element->appendChild(expression); - root->appendChild(expression_element); - } + AutoPtr expression_element(doc->createElement("expression")); + String expression_str = getAttributeExpression(dict_attr); + AutoPtr expression(doc->createTextNode(expression_str)); + expression_element->appendChild(expression); + root->appendChild(expression_element); } /** Transofrms single dictionary attribute to configuration @@ -373,25 +396,28 @@ void buildPrimaryKeyConfiguration( /** Transforms list of ASTDictionaryAttributeDeclarations to list of dictionary attributes */ -NamesToTypeNames buildDictionaryAttributesConfiguration( +AttributeNameToConfiguration buildDictionaryAttributesConfiguration( AutoPtr doc, AutoPtr root, const ASTExpressionList * dictionary_attributes, const Names & key_columns) { const auto & children = dictionary_attributes->children; - NamesToTypeNames attributes_names_and_types; + AttributeNameToConfiguration attributes_name_to_configuration; + for (const auto & child : children) { const ASTDictionaryAttributeDeclaration * dict_attr = child->as(); if (!dict_attr->type) throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Dictionary attribute must has type"); - attributes_names_and_types.emplace(dict_attr->name, queryToString(dict_attr->type)); + AttributeConfiguration attribute_configuration {queryToString(dict_attr->type), getAttributeExpression(dict_attr)}; + attributes_name_to_configuration.emplace(dict_attr->name, std::move(attribute_configuration)); + if (std::find(key_columns.begin(), key_columns.end(), dict_attr->name) == key_columns.end()) buildSingleAttribute(doc, root, dict_attr); } - return attributes_names_and_types; + return attributes_name_to_configuration; } /** Transform function with key-value arguments to configuration @@ -513,10 +539,10 @@ void checkAST(const ASTCreateQuery & query) throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Cannot create dictionary with empty source"); } -void checkPrimaryKey(const NamesToTypeNames & all_attrs, const Names & key_attrs) +void checkPrimaryKey(const AttributeNameToConfiguration & all_attrs, const Names & key_attrs) { for (const auto & key_attr : key_attrs) - if (all_attrs.count(key_attr) == 0) + if (all_attrs.find(key_attr) == all_attrs.end()) throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Unknown key attribute '{}'", key_attr); } diff --git a/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.cpp b/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.cpp index e2ee6ee0153..31e85442c6a 
100644 --- a/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.cpp +++ b/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.cpp @@ -62,7 +62,8 @@ DiskAzureBlobStorage::DiskAzureBlobStorage( std::unique_ptr DiskAzureBlobStorage::readFile( const String & path, const ReadSettings & read_settings, - std::optional /*estimated_size*/) const + std::optional, + std::optional) const { auto settings = current_settings.get(); auto metadata = readMeta(path); diff --git a/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.h b/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.h index f90ede1add9..63c3c735812 100644 --- a/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.h +++ b/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.h @@ -50,7 +50,8 @@ public: std::unique_ptr readFile( const String & path, const ReadSettings & settings, - std::optional estimated_size) const override; + std::optional read_hint, + std::optional file_size) const override; std::unique_ptr writeFile( const String & path, diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp index b09487c17bc..f741b8242f5 100644 --- a/src/Disks/DiskCacheWrapper.cpp +++ b/src/Disks/DiskCacheWrapper.cpp @@ -86,15 +86,16 @@ std::unique_ptr DiskCacheWrapper::readFile( const String & path, const ReadSettings & settings, - std::optional size) const + std::optional read_hint, + std::optional file_size) const { if (!cache_file_predicate(path)) - return DiskDecorator::readFile(path, settings, size); + return DiskDecorator::readFile(path, settings, read_hint, file_size); LOG_TEST(log, "Read file {} from cache", backQuote(path)); if (cache_disk->exists(path)) - return cache_disk->readFile(path, settings, size); + return cache_disk->readFile(path, settings, read_hint, file_size); auto metadata = acquireDownloadMetadata(path); @@ -128,7 +129,7 @@ DiskCacheWrapper::readFile( auto tmp_path = path + ".tmp"; { - auto src_buffer = DiskDecorator::readFile(path, settings, size); + auto src_buffer = DiskDecorator::readFile(path, settings, read_hint, file_size); auto dst_buffer = cache_disk->writeFile(tmp_path, settings.local_fs_buffer_size, WriteMode::Rewrite); copyData(*src_buffer, *dst_buffer); } @@ -152,9 +153,9 @@ DiskCacheWrapper::readFile( } if (metadata->status == DOWNLOADED) - return cache_disk->readFile(path, settings, size); + return cache_disk->readFile(path, settings, read_hint, file_size); - return DiskDecorator::readFile(path, settings, size); + return DiskDecorator::readFile(path, settings, read_hint, file_size); } std::unique_ptr @@ -174,7 +175,7 @@ DiskCacheWrapper::writeFile(const String & path, size_t buf_size, WriteMode mode [this, path, buf_size, mode]() { /// Copy file from cache to actual disk when cached buffer is finalized. 
- auto src_buffer = cache_disk->readFile(path, ReadSettings(), /* size= */ {}); + auto src_buffer = cache_disk->readFile(path, ReadSettings(), /* read_hint= */ {}, /* file_size= */ {}); auto dst_buffer = DiskDecorator::writeFile(path, buf_size, mode); copyData(*src_buffer, *dst_buffer); dst_buffer->finalize(); diff --git a/src/Disks/DiskCacheWrapper.h b/src/Disks/DiskCacheWrapper.h index 8b15a8875be..6eb79114a54 100644 --- a/src/Disks/DiskCacheWrapper.h +++ b/src/Disks/DiskCacheWrapper.h @@ -37,7 +37,8 @@ public: std::unique_ptr readFile( const String & path, const ReadSettings & settings, - std::optional size) const override; + std::optional read_hint, + std::optional file_size) const override; std::unique_ptr writeFile(const String & path, size_t buf_size, WriteMode mode) override; diff --git a/src/Disks/DiskDecorator.cpp b/src/Disks/DiskDecorator.cpp index 263c6c9c0ff..d4acb6fab0d 100644 --- a/src/Disks/DiskDecorator.cpp +++ b/src/Disks/DiskDecorator.cpp @@ -115,9 +115,9 @@ void DiskDecorator::listFiles(const String & path, std::vector & file_na std::unique_ptr DiskDecorator::readFile( - const String & path, const ReadSettings & settings, std::optional size) const + const String & path, const ReadSettings & settings, std::optional read_hint, std::optional file_size) const { - return delegate->readFile(path, settings, size); + return delegate->readFile(path, settings, read_hint, file_size); } std::unique_ptr diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h index 5b88f4a36fa..ff4f16fdf3d 100644 --- a/src/Disks/DiskDecorator.h +++ b/src/Disks/DiskDecorator.h @@ -38,7 +38,8 @@ public: std::unique_ptr readFile( const String & path, const ReadSettings & settings, - std::optional size) const override; + std::optional read_hint, + std::optional file_size) const override; std::unique_ptr writeFile( const String & path, @@ -70,6 +71,20 @@ public: void startup() override; void applyNewSettings(const Poco::Util::AbstractConfiguration & config, ContextPtr context, const String & config_prefix, const DisksMap & map) override; + std::unique_ptr readMetaFile( + const String & path, + const ReadSettings & settings, + std::optional size) const override { return delegate->readMetaFile(path, settings, size); } + + std::unique_ptr writeMetaFile( + const String & path, + size_t buf_size, + WriteMode mode) override { return delegate->writeMetaFile(path, buf_size, mode); } + + void removeMetaFileIfExists(const String & path) override { delegate->removeMetaFileIfExists(path); } + + UInt32 getRefCount(const String & path) const override { return delegate->getRefCount(path); } + protected: Executor & getExecutor() override; diff --git a/src/Disks/DiskEncrypted.cpp b/src/Disks/DiskEncrypted.cpp index de569d82c60..714264b7720 100644 --- a/src/Disks/DiskEncrypted.cpp +++ b/src/Disks/DiskEncrypted.cpp @@ -252,10 +252,11 @@ void DiskEncrypted::copy(const String & from_path, const std::shared_ptr std::unique_ptr DiskEncrypted::readFile( const String & path, const ReadSettings & settings, - std::optional size) const + std::optional read_hint, + std::optional file_size) const { auto wrapped_path = wrappedPath(path); - auto buffer = delegate->readFile(wrapped_path, settings, size); + auto buffer = delegate->readFile(wrapped_path, settings, read_hint, file_size); if (buffer->eof()) { /// File is empty, that's a normal case, see DiskEncrypted::truncateFile(). 
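The Disks changes above and below all widen readFile() in the IDisk hierarchy: the old single estimated_size argument becomes two independent optional hints, read_hint and file_size. A caller-side sketch (illustrative only; openForRead is a hypothetical helper and the include paths are assumptions about the ClickHouse tree):

#include <Disks/IDisk.h>
#include <IO/ReadBufferFromFileBase.h>
#include <IO/ReadSettings.h>

#include <memory>
#include <optional>
#include <string>

/// Sketch: both hints may be left empty; for example DiskLocal (later in this
/// diff) computes file_size itself via fileSizeSafe() when it is not supplied.
std::unique_ptr<DB::ReadBufferFromFileBase> openForRead(DB::IDisk & disk, const std::string & path)
{
    DB::ReadSettings settings;
    std::optional<size_t> read_hint;   /// expected number of bytes the caller will read
    std::optional<size_t> file_size;   /// exact on-disk size, if already known

    return disk.readFile(path, settings, read_hint, file_size);
}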
diff --git a/src/Disks/DiskEncrypted.h b/src/Disks/DiskEncrypted.h index 5b1bd7c5c6d..d99fe17457d 100644 --- a/src/Disks/DiskEncrypted.h +++ b/src/Disks/DiskEncrypted.h @@ -120,7 +120,8 @@ public: std::unique_ptr readFile( const String & path, const ReadSettings & settings, - std::optional size) const override; + std::optional read_hint, + std::optional file_size) const override; std::unique_ptr writeFile( const String & path, diff --git a/src/Disks/DiskLocal.cpp b/src/Disks/DiskLocal.cpp index 0a0764d41b1..3428a9aef54 100644 --- a/src/Disks/DiskLocal.cpp +++ b/src/Disks/DiskLocal.cpp @@ -86,6 +86,22 @@ static void loadDiskLocalConfig(const String & name, } } +std::optional fileSizeSafe(const fs::path & path) +{ + std::error_code ec; + + size_t size = fs::file_size(path, ec); + if (!ec) + return size; + + if (ec == std::errc::no_such_file_or_directory) + return std::nullopt; + if (ec == std::errc::operation_not_supported) + return std::nullopt; + + throw fs::filesystem_error("DiskLocal", path, ec); +} + class DiskLocalReservation : public IReservation { public: @@ -269,9 +285,11 @@ void DiskLocal::replaceFile(const String & from_path, const String & to_path) fs::rename(from_file, to_file); } -std::unique_ptr DiskLocal::readFile(const String & path, const ReadSettings & settings, std::optional size) const +std::unique_ptr DiskLocal::readFile(const String & path, const ReadSettings & settings, std::optional read_hint, std::optional file_size) const { - return createReadBufferFromFileBase(fs::path(disk_path) / path, settings, size); + if (!file_size.has_value()) + file_size = fileSizeSafe(fs::path(disk_path) / path); + return createReadBufferFromFileBase(fs::path(disk_path) / path, settings, read_hint, file_size); } std::unique_ptr diff --git a/src/Disks/DiskLocal.h b/src/Disks/DiskLocal.h index 37855327578..f16497ae432 100644 --- a/src/Disks/DiskLocal.h +++ b/src/Disks/DiskLocal.h @@ -74,7 +74,8 @@ public: std::unique_ptr readFile( const String & path, const ReadSettings & settings, - std::optional size) const override; + std::optional read_hint, + std::optional file_size) const override; std::unique_ptr writeFile( const String & path, diff --git a/src/Disks/DiskMemory.cpp b/src/Disks/DiskMemory.cpp index 834ed3e0c65..abaea0846a5 100644 --- a/src/Disks/DiskMemory.cpp +++ b/src/Disks/DiskMemory.cpp @@ -315,7 +315,7 @@ void DiskMemory::replaceFileImpl(const String & from_path, const String & to_pat files.insert(std::move(node)); } -std::unique_ptr DiskMemory::readFile(const String & path, const ReadSettings &, std::optional) const +std::unique_ptr DiskMemory::readFile(const String & path, const ReadSettings &, std::optional, std::optional) const { std::lock_guard lock(mutex); diff --git a/src/Disks/DiskMemory.h b/src/Disks/DiskMemory.h index d77161d898e..eef7b78502d 100644 --- a/src/Disks/DiskMemory.h +++ b/src/Disks/DiskMemory.h @@ -65,7 +65,8 @@ public: std::unique_ptr readFile( const String & path, const ReadSettings & settings, - std::optional size) const override; + std::optional read_hint, + std::optional file_size) const override; std::unique_ptr writeFile( const String & path, diff --git a/src/Disks/DiskRestartProxy.cpp b/src/Disks/DiskRestartProxy.cpp index a8edd15ba79..9bd59513040 100644 --- a/src/Disks/DiskRestartProxy.cpp +++ b/src/Disks/DiskRestartProxy.cpp @@ -190,10 +190,10 @@ void DiskRestartProxy::listFiles(const String & path, std::vector & file } std::unique_ptr DiskRestartProxy::readFile( - const String & path, const ReadSettings & settings, std::optional size) const + 
const String & path, const ReadSettings & settings, std::optional read_hint, std::optional file_size) const { ReadLock lock (mutex); - auto impl = DiskDecorator::readFile(path, settings, size); + auto impl = DiskDecorator::readFile(path, settings, read_hint, file_size); return std::make_unique(*this, std::move(impl)); } diff --git a/src/Disks/DiskRestartProxy.h b/src/Disks/DiskRestartProxy.h index e8b5891947a..3644539e941 100644 --- a/src/Disks/DiskRestartProxy.h +++ b/src/Disks/DiskRestartProxy.h @@ -46,7 +46,8 @@ public: std::unique_ptr readFile( const String & path, const ReadSettings & settings, - std::optional size) const override; + std::optional read_hint, + std::optional file_size) const override; std::unique_ptr writeFile(const String & path, size_t buf_size, WriteMode mode) override; void removeFile(const String & path) override; void removeFileIfExists(const String & path) override; diff --git a/src/Disks/DiskWebServer.cpp b/src/Disks/DiskWebServer.cpp index 63e1cc0e6c5..7c94a5b98b1 100644 --- a/src/Disks/DiskWebServer.cpp +++ b/src/Disks/DiskWebServer.cpp @@ -154,7 +154,7 @@ bool DiskWebServer::exists(const String & path) const } -std::unique_ptr DiskWebServer::readFile(const String & path, const ReadSettings & read_settings, std::optional) const +std::unique_ptr DiskWebServer::readFile(const String & path, const ReadSettings & read_settings, std::optional, std::optional) const { LOG_TRACE(log, "Read from path: {}", path); auto iter = files.find(path); diff --git a/src/Disks/DiskWebServer.h b/src/Disks/DiskWebServer.h index 1a193d91adf..bda8c8adaad 100644 --- a/src/Disks/DiskWebServer.h +++ b/src/Disks/DiskWebServer.h @@ -63,7 +63,8 @@ public: std::unique_ptr readFile(const String & path, const ReadSettings & settings, - std::optional size) const override; + std::optional read_hint, + std::optional file_size) const override; /// Disk info diff --git a/src/Disks/HDFS/DiskHDFS.cpp b/src/Disks/HDFS/DiskHDFS.cpp index 41c407c10ee..572c908768b 100644 --- a/src/Disks/HDFS/DiskHDFS.cpp +++ b/src/Disks/HDFS/DiskHDFS.cpp @@ -71,7 +71,7 @@ DiskHDFS::DiskHDFS( } -std::unique_ptr DiskHDFS::readFile(const String & path, const ReadSettings & read_settings, std::optional) const +std::unique_ptr DiskHDFS::readFile(const String & path, const ReadSettings & read_settings, std::optional, std::optional) const { auto metadata = readMeta(path); diff --git a/src/Disks/HDFS/DiskHDFS.h b/src/Disks/HDFS/DiskHDFS.h index 47150f1cfd8..de373d8d6ee 100644 --- a/src/Disks/HDFS/DiskHDFS.h +++ b/src/Disks/HDFS/DiskHDFS.h @@ -53,7 +53,8 @@ public: std::unique_ptr readFile( const String & path, const ReadSettings & settings, - std::optional size) const override; + std::optional read_hint, + std::optional file_size) const override; std::unique_ptr writeFile(const String & path, size_t buf_size, WriteMode mode) override; diff --git a/src/Disks/IDisk.cpp b/src/Disks/IDisk.cpp index 42d5f5fce10..b1d7b33fec3 100644 --- a/src/Disks/IDisk.cpp +++ b/src/Disks/IDisk.cpp @@ -86,4 +86,28 @@ SyncGuardPtr IDisk::getDirectorySyncGuard(const String & /* path */) const return nullptr; } +std::unique_ptr IDisk::readMetaFile( + const String & path, + const ReadSettings & settings, + std::optional size) const +{ + LOG_TRACE(&Poco::Logger::get("IDisk"), "Read local metafile: {}", path); + return readFile(path, settings, size); +} + +std::unique_ptr IDisk::writeMetaFile( + const String & path, + size_t buf_size, + WriteMode mode) +{ + LOG_TRACE(&Poco::Logger::get("IDisk"), "Write local metafile: {}", path); + return writeFile(path, 
buf_size, mode); +} + +void IDisk::removeMetaFileIfExists(const String & path) +{ + LOG_TRACE(&Poco::Logger::get("IDisk"), "Remove local metafile: {}", path); + removeFileIfExists(path); +} + } diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 0a63421ae5c..665a35459c7 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -161,7 +161,8 @@ public: virtual std::unique_ptr readFile( const String & path, const ReadSettings & settings = ReadSettings{}, - std::optional size = {}) const = 0; + std::optional read_hint = {}, + std::optional file_size = {}) const = 0; /// Open the file for write and return WriteBufferFromFileBase object. virtual std::unique_ptr writeFile( @@ -247,6 +248,28 @@ public: /// Applies new settings for disk in runtime. virtual void applyNewSettings(const Poco::Util::AbstractConfiguration &, ContextPtr, const String &, const DisksMap &) {} + /// Open the local file for read and return ReadBufferFromFileBase object. + /// Overridden in IDiskRemote. + /// Used for work with custom metadata. + virtual std::unique_ptr readMetaFile( + const String & path, + const ReadSettings & settings, + std::optional size) const; + + /// Open the local file for write and return WriteBufferFromFileBase object. + /// Overridden in IDiskRemote. + /// Used for work with custom metadata. + virtual std::unique_ptr writeMetaFile( + const String & path, + size_t buf_size, + WriteMode mode); + + virtual void removeMetaFileIfExists(const String & path); + + /// Return reference count for remote FS. + /// Overridden in IDiskRemote. + virtual UInt32 getRefCount(const String &) const { return 0; } + protected: friend class DiskDecorator; diff --git a/src/Disks/IDiskRemote.cpp b/src/Disks/IDiskRemote.cpp index 848726f957d..706f0f84f32 100644 --- a/src/Disks/IDiskRemote.cpp +++ b/src/Disks/IDiskRemote.cpp @@ -484,6 +484,7 @@ bool IDiskRemote::tryReserve(UInt64 bytes) String IDiskRemote::getUniqueId(const String & path) const { + LOG_TRACE(log, "Remote path: {}, Path: {}", remote_fs_root_path, path); Metadata metadata(remote_fs_root_path, metadata_disk, path); String id; if (!metadata.remote_fs_objects.empty()) @@ -500,4 +501,34 @@ AsynchronousReaderPtr IDiskRemote::getThreadPoolReader() return reader; } +std::unique_ptr IDiskRemote::readMetaFile( + const String & path, + const ReadSettings & settings, + std::optional size) const +{ + LOG_TRACE(log, "Read metafile: {}", path); + return metadata_disk->readFile(path, settings, size); +} + +std::unique_ptr IDiskRemote::writeMetaFile( + const String & path, + size_t buf_size, + WriteMode mode) +{ + LOG_TRACE(log, "Write metafile: {}", path); + return metadata_disk->writeFile(path, buf_size, mode); +} + +void IDiskRemote::removeMetaFileIfExists(const String & path) +{ + LOG_TRACE(log, "Remove metafile: {}", path); + return metadata_disk->removeFileIfExists(path); +} + +UInt32 IDiskRemote::getRefCount(const String & path) const +{ + auto meta = readMeta(path); + return meta.ref_count; +} + } diff --git a/src/Disks/IDiskRemote.h b/src/Disks/IDiskRemote.h index c6a904020de..c4f475f5b3e 100644 --- a/src/Disks/IDiskRemote.h +++ b/src/Disks/IDiskRemote.h @@ -136,6 +136,21 @@ public: static AsynchronousReaderPtr getThreadPoolReader(); + virtual std::unique_ptr readMetaFile( + const String & path, + const ReadSettings & settings, + std::optional size) const override; + + virtual std::unique_ptr writeMetaFile( + const String & path, + size_t buf_size, + WriteMode mode) override; + + virtual void removeMetaFileIfExists( + const String & path) override; + + UInt32 
getRefCount(const String & path) const override; + protected: Poco::Logger * log; const String name; diff --git a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h index 1b0cc17cb41..c9b81c98e61 100644 --- a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h +++ b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h @@ -32,7 +32,7 @@ public: explicit AsynchronousReadIndirectBufferFromRemoteFS( AsynchronousReaderPtr reader_, const ReadSettings & settings_, std::shared_ptr impl_, - size_t min_bytes_for_seek = 1024 * 1024); + size_t min_bytes_for_seek = DBMS_DEFAULT_BUFFER_SIZE); ~AsynchronousReadIndirectBufferFromRemoteFS() override; diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index 201334cbd12..0bebf91df97 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -214,7 +214,7 @@ void DiskS3::moveFile(const String & from_path, const String & to_path, bool sen metadata_disk->moveFile(from_path, to_path); } -std::unique_ptr DiskS3::readFile(const String & path, const ReadSettings & read_settings, std::optional) const +std::unique_ptr DiskS3::readFile(const String & path, const ReadSettings & read_settings, std::optional, std::optional) const { auto settings = current_settings.get(); auto metadata = readMeta(path); diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index 18ed733ff01..c5d0722c6c2 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -76,7 +76,8 @@ public: std::unique_ptr readFile( const String & path, const ReadSettings & settings, - std::optional size) const override; + std::optional read_hint, + std::optional file_size) const override; std::unique_ptr writeFile( const String & path, diff --git a/src/Disks/tests/gtest_disk_encrypted.cpp b/src/Disks/tests/gtest_disk_encrypted.cpp index 85dd8eb78b8..d03128a6b33 100644 --- a/src/Disks/tests/gtest_disk_encrypted.cpp +++ b/src/Disks/tests/gtest_disk_encrypted.cpp @@ -57,7 +57,7 @@ protected: String getFileContents(const String & file_name) { - auto buf = encrypted_disk->readFile(file_name, /* settings= */ {}, /* size= */ {}); + auto buf = encrypted_disk->readFile(file_name, /* settings= */ {}, /* read_hint= */ {}, /* file_size= */ {}); String str; readStringUntilEOF(str, *buf); return str; diff --git a/src/Disks/tests/gtest_disk_hdfs.cpp b/src/Disks/tests/gtest_disk_hdfs.cpp index 2864797aae3..4b5ff182256 100644 --- a/src/Disks/tests/gtest_disk_hdfs.cpp +++ b/src/Disks/tests/gtest_disk_hdfs.cpp @@ -53,7 +53,7 @@ TEST(DiskTestHDFS, WriteReadHDFS) { DB::String result; - auto in = disk.readFile(file_name, {}, 1024); + auto in = disk.readFile(file_name, {}, 1024, 1024); readString(result, *in); EXPECT_EQ("Test write to file", result); } @@ -76,7 +76,7 @@ TEST(DiskTestHDFS, RewriteFileHDFS) { String result; - auto in = disk.readFile(file_name, {}, 1024); + auto in = disk.readFile(file_name, {}, 1024, 1024); readString(result, *in); EXPECT_EQ("Text10", result); readString(result, *in); @@ -104,7 +104,7 @@ TEST(DiskTestHDFS, AppendFileHDFS) { String result, expected; - auto in = disk.readFile(file_name, {}, 1024); + auto in = disk.readFile(file_name, {}, 1024, 1024); readString(result, *in); EXPECT_EQ("Text0123456789", result); @@ -131,7 +131,7 @@ TEST(DiskTestHDFS, SeekHDFS) /// Test SEEK_SET { String buf(4, '0'); - std::unique_ptr in = disk.readFile(file_name, {}, 1024); + std::unique_ptr in = disk.readFile(file_name, {}, 1024, 1024); in->seek(5, SEEK_SET); @@ -141,7 +141,7 @@ TEST(DiskTestHDFS, SeekHDFS) 
/// Test SEEK_CUR { - std::unique_ptr in = disk.readFile(file_name, {}, 1024); + std::unique_ptr in = disk.readFile(file_name, {}, 1024, 1024); String buf(4, '0'); in->readStrict(buf.data(), 4); diff --git a/src/Formats/CapnProtoUtils.cpp b/src/Formats/CapnProtoUtils.cpp index ecfa5df8351..bed46a97c1b 100644 --- a/src/Formats/CapnProtoUtils.cpp +++ b/src/Formats/CapnProtoUtils.cpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include #include @@ -26,6 +28,7 @@ namespace ErrorCodes extern const int FILE_DOESNT_EXIST; extern const int UNKNOWN_EXCEPTION; extern const int INCORRECT_DATA; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } capnp::StructSchema CapnProtoSchemaParser::getMessageSchema(const FormatSchemaInfo & schema_info) @@ -427,6 +430,113 @@ void checkCapnProtoSchemaStructure(const capnp::StructSchema & schema, const Blo } } +template +static DataTypePtr getEnumDataTypeFromEnumerants(const capnp::EnumSchema::EnumerantList & enumerants) +{ + std::vector> values; + for (auto enumerant : enumerants) + values.emplace_back(enumerant.getProto().getName(), ValueType(enumerant.getOrdinal())); + return std::make_shared>(std::move(values)); +} + +static DataTypePtr getEnumDataTypeFromEnumSchema(const capnp::EnumSchema & enum_schema) +{ + auto enumerants = enum_schema.getEnumerants(); + if (enumerants.size() < 128) + return getEnumDataTypeFromEnumerants(enumerants); + if (enumerants.size() < 32768) + return getEnumDataTypeFromEnumerants(enumerants); + + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "ClickHouse supports only 8 and 16-but Enums"); +} + +static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type) +{ + switch (capnp_type.which()) + { + case capnp::schema::Type::INT8: + return std::make_shared(); + case capnp::schema::Type::INT16: + return std::make_shared(); + case capnp::schema::Type::INT32: + return std::make_shared(); + case capnp::schema::Type::INT64: + return std::make_shared(); + case capnp::schema::Type::BOOL: [[fallthrough]]; + case capnp::schema::Type::UINT8: + return std::make_shared(); + case capnp::schema::Type::UINT16: + return std::make_shared(); + case capnp::schema::Type::UINT32: + return std::make_shared(); + case capnp::schema::Type::UINT64: + return std::make_shared(); + case capnp::schema::Type::FLOAT32: + return std::make_shared(); + case capnp::schema::Type::FLOAT64: + return std::make_shared(); + case capnp::schema::Type::DATA: [[fallthrough]]; + case capnp::schema::Type::TEXT: + return std::make_shared(); + case capnp::schema::Type::ENUM: + return getEnumDataTypeFromEnumSchema(capnp_type.asEnum()); + case capnp::schema::Type::LIST: + { + auto list_schema = capnp_type.asList(); + auto nested_type = getDataTypeFromCapnProtoType(list_schema.getElementType()); + return std::make_shared(nested_type); + } + case capnp::schema::Type::STRUCT: + { + auto struct_schema = capnp_type.asStruct(); + + /// Check if it can be Nullable. + if (checkIfStructIsNamedUnion(struct_schema)) + { + auto fields = struct_schema.getUnionFields(); + if (fields.size() != 2 || (!fields[0].getType().isVoid() && !fields[1].getType().isVoid())) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Unions are not supported"); + auto value_type = fields[0].getType().isVoid() ? 
fields[1].getType() : fields[0].getType(); + if (value_type.isStruct() || value_type.isList()) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Tuples and Lists cannot be inside Nullable"); + + auto nested_type = getDataTypeFromCapnProtoType(value_type); + return std::make_shared(nested_type); + } + + if (checkIfStructContainsUnnamedUnion(struct_schema)) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Unnamed union is not supported"); + + /// Treat Struct as Tuple. + DataTypes nested_types; + Names nested_names; + for (auto field : struct_schema.getNonUnionFields()) + { + nested_names.push_back(field.getProto().getName()); + nested_types.push_back(getDataTypeFromCapnProtoType(field.getType())); + } + return std::make_shared(std::move(nested_types), std::move(nested_names)); + } + default: + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Unsupported CapnProtoType: {}", getCapnProtoFullTypeName(capnp_type)); + } +} + +NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema) +{ + if (checkIfStructContainsUnnamedUnion(schema)) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Unnamed union is not supported"); + + NamesAndTypesList names_and_types; + for (auto field : schema.getNonUnionFields()) + { + auto name = field.getProto().getName(); + auto type = getDataTypeFromCapnProtoType(field.getType()); + names_and_types.emplace_back(name, type); + } + return names_and_types; +} + } #endif diff --git a/src/Formats/CapnProtoUtils.h b/src/Formats/CapnProtoUtils.h index 93ca0a5e616..51c152de17f 100644 --- a/src/Formats/CapnProtoUtils.h +++ b/src/Formats/CapnProtoUtils.h @@ -38,6 +38,7 @@ capnp::DynamicValue::Reader getReaderByColumnName(const capnp::DynamicStruct::Re void checkCapnProtoSchemaStructure(const capnp::StructSchema & schema, const Block & header, FormatSettings::EnumComparingMode mode); +NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema); } #endif diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index d956d9e6bfb..0a7747fc864 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -1,7 +1,16 @@ #include +#include +#include #include +#include +#include +#include #include #include +#include +#include +#include +#include namespace DB { @@ -9,6 +18,7 @@ namespace DB namespace ErrorCodes { extern const int BAD_ARGUMENTS; + extern const int LOGICAL_ERROR; } FormatSettings::EscapingRule stringToEscapingRule(const String & escaping_rule) @@ -193,30 +203,145 @@ void writeStringByEscapingRule(const String & value, WriteBuffer & out, FormatSe } } -String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) +template +String readByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { String result; switch (escaping_rule) { case FormatSettings::EscapingRule::Quoted: - readQuotedString(result, buf); + if constexpr (read_string) + readQuotedString(result, buf); + else + readQuotedFieldIntoString(result, buf); break; case FormatSettings::EscapingRule::JSON: - readJSONString(result, buf); + if constexpr (read_string) + readJSONString(result, buf); + else + readJSONFieldIntoString(result, buf); break; case FormatSettings::EscapingRule::Raw: readString(result, buf); break; case FormatSettings::EscapingRule::CSV: - readCSVString(result, buf, format_settings.csv); + if constexpr (read_string) + 
readCSVString(result, buf, format_settings.csv); + else + readCSVField(result, buf, format_settings.csv); break; case FormatSettings::EscapingRule::Escaped: readEscapedString(result, buf); break; default: - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot read string with {} escaping rule", escapingRuleToString(escaping_rule)); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot read value with {} escaping rule", escapingRuleToString(escaping_rule)); } return result; } +String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) +{ + return readByEscapingRule(buf, escaping_rule, format_settings); +} + +String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) +{ + return readByEscapingRule(buf, escaping_rule, format_settings); +} + +static bool evaluateConstantExpressionFromString(const StringRef & field, DataTypePtr & type, ContextPtr context) +{ + if (!context) + throw Exception(ErrorCodes::LOGICAL_ERROR, "You must provide context to evaluate constant expression"); + + ParserExpression parser; + Expected expected; + Tokens tokens(field.data, field.data + field.size); + IParser::Pos token_iterator(tokens, context->getSettingsRef().max_parser_depth); + ASTPtr ast; + + /// FIXME: Our parser cannot parse maps in the form of '{key : value}' that is used in text formats. + bool parsed = parser.parse(token_iterator, ast, expected); + if (!parsed) + return false; + + try + { + std::pair result = evaluateConstantExpression(ast, context); + type = generalizeDataType(result.second); + return true; + } + catch (...) + { + return false; + } +} + +DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context) +{ + switch (escaping_rule) + { + case FormatSettings::EscapingRule::Quoted: + { + DataTypePtr type; + bool parsed = evaluateConstantExpressionFromString(field, type, context); + return parsed ? type : nullptr; + } + case FormatSettings::EscapingRule::JSON: + return getDataTypeFromJSONField(field); + case FormatSettings::EscapingRule::CSV: + { + if (field.empty() || field == format_settings.csv.null_representation) + return nullptr; + + if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation) + return std::make_shared(); + + DataTypePtr type; + bool parsed; + if (field[0] == '\'' || field[0] == '"') + { + /// Try to evaluate expression inside quotes. + parsed = evaluateConstantExpressionFromString(StringRef(field.data() + 1, field.size() - 2), type, context); + /// If it's a number in quotes we determine it as a string. + if (parsed && type && isNumber(removeNullable(type))) + return makeNullable(std::make_shared()); + } + else + parsed = evaluateConstantExpressionFromString(field, type, context); + + /// If we couldn't parse an expression, determine it as a string. + return parsed ? type : makeNullable(std::make_shared()); + } + case FormatSettings::EscapingRule::Raw: [[fallthrough]]; + case FormatSettings::EscapingRule::Escaped: + /// TODO: Try to use some heuristics here to determine the type of data. + return field.empty() ? 
nullptr : makeNullable(std::make_shared()); + default: + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the type for value with {} escaping rule", escapingRuleToString(escaping_rule)); + } +} + +DataTypes determineDataTypesByEscapingRule(const std::vector & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context) +{ + DataTypes data_types; + data_types.reserve(fields.size()); + for (const auto & field : fields) + data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, escaping_rule, context)); + return data_types; +} + +DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule) +{ + switch (escaping_rule) + { + case FormatSettings::EscapingRule::CSV: [[fallthrough]]; + case FormatSettings::EscapingRule::Escaped: [[fallthrough]]; + case FormatSettings::EscapingRule::Raw: + return makeNullable(std::make_shared()); + default: + return nullptr; + } +} + } diff --git a/src/Formats/EscapingRuleUtils.h b/src/Formats/EscapingRuleUtils.h index 02f027db74d..10147b29ad6 100644 --- a/src/Formats/EscapingRuleUtils.h +++ b/src/Formats/EscapingRuleUtils.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB { @@ -33,5 +34,24 @@ void serializeFieldByEscapingRule( void writeStringByEscapingRule(const String & value, WriteBuffer & out, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings); String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings); +String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings); + +/// Try to determine the type of the field written by a specific escaping rule. +/// If cannot, return nullptr. +/// - For Quoted escaping rule we can interpret a single field as a constant +/// expression and get it's type by evaluation this expression. +/// - For JSON escaping rule we can use JSON parser to parse a single field +/// and then convert JSON type of this field to ClickHouse type. +/// - For CSV escaping rule we can do the next: +/// - If the field is an unquoted string, then we could try to evaluate it +/// as a constant expression, and if it fails, treat it as a String. +/// - If the field is a string in quotes, then we can try to evaluate +/// expression inside quotes as a constant expression, and if it fails or +/// the result is a number (we don't parse numbers in quotes) we treat it as a String. 
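// ---------------------------------------------------------------------------
// Editorial sketch, not part of the patch: expected behaviour of the CSV rules
// just listed, assuming a valid `format_settings` and `context` are in scope.
// Numeric fields come back generalized to Nullable(Float64), numbers inside
// quotes are kept as String, and an empty field yields nullptr (type unknown).

auto t1 = determineDataTypeByEscapingRule("42.5", format_settings, FormatSettings::EscapingRule::CSV, context);   /// Nullable(Float64)
auto t2 = determineDataTypeByEscapingRule("\"42\"", format_settings, FormatSettings::EscapingRule::CSV, context); /// Nullable(String)
auto t3 = determineDataTypeByEscapingRule("", format_settings, FormatSettings::EscapingRule::CSV, context);       /// nullptr
// ------------------------------ end of sketch ------------------------------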
+/// - For TSV and TSVRaw we treat each field as a String (TODO: try to use some tweaks and heuristics here) +DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context = nullptr); +DataTypes determineDataTypesByEscapingRule(const std::vector & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context = nullptr); + +DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule); } diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index d292bbf551c..2068de0d01c 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -1,20 +1,18 @@ #include #include -#include -#include #include #include +#include +#include #include #include -#include #include -#include #include +#include +#include #include - -#include -#include +#include namespace DB { @@ -119,6 +117,8 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields; format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode; format_settings.seekable_read = settings.input_format_allow_seeks; + format_settings.msgpack.number_of_columns = settings.input_format_msgpack_number_of_columns; + format_settings.max_rows_to_read_for_schema_inference = settings.input_format_max_rows_to_read_for_schema_inference; /// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context if (format_settings.schema.is_server) @@ -200,7 +200,6 @@ InputFormatPtr FormatFactory::getInput( return format; } - InputFormatPtr FormatFactory::getInputFormat( const String & name, ReadBuffer & buf, @@ -235,6 +234,18 @@ InputFormatPtr FormatFactory::getInputFormat( return format; } +static void addExistingProgressToOutputFormat(OutputFormatPtr format, ContextPtr context) +{ + auto * element_id = context->getProcessListElement(); + if (element_id) + { + /// While preparing the query there might have been progress (for example in subscalar subqueries) so add it here + auto current_progress = element_id->getProgressIn(); + Progress read_progress{current_progress.read_rows, current_progress.read_bytes, current_progress.total_rows_to_read}; + format->onProgress(read_progress); + } +} + OutputFormatPtr FormatFactory::getOutputFormatParallelIfPossible( const String & name, WriteBuffer & buf, @@ -263,7 +274,9 @@ OutputFormatPtr FormatFactory::getOutputFormatParallelIfPossible( if (context->hasQueryContext() && settings.log_queries) context->getQueryContext()->addQueryFactoriesInfo(Context::QueryLogFactories::Format, name); - return std::make_shared(builder); + auto format = std::make_shared(builder); + addExistingProgressToOutputFormat(format, context); + return format; } return getOutputFormat(name, buf, sample, context, callback, _format_settings); @@ -303,6 +316,8 @@ OutputFormatPtr FormatFactory::getOutputFormat( if (auto * mysql = typeid_cast(format.get())) mysql->setContext(context); + addExistingProgressToOutputFormat(format, context); + return format; } @@ -325,6 +340,32 @@ String FormatFactory::getContentType( return format->getContentType(); } +SchemaReaderPtr FormatFactory::getSchemaReader( + const String & name, + ReadBuffer & buf, + ContextPtr context, + const std::optional & _format_settings) const +{ + const auto & schema_reader_creator = 
dict.at(name).schema_reader_creator; + if (!schema_reader_creator) + throw Exception("FormatFactory: Format " + name + " doesn't support schema inference.", ErrorCodes::LOGICAL_ERROR); + + auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context); + return schema_reader_creator(buf, format_settings, context); +} + +ExternalSchemaReaderPtr FormatFactory::getExternalSchemaReader( + const String & name, + ContextPtr context, + const std::optional & _format_settings) const +{ + const auto & external_schema_reader_creator = dict.at(name).external_schema_reader_creator; + if (!external_schema_reader_creator) + throw Exception("FormatFactory: Format " + name + " doesn't support schema inference.", ErrorCodes::LOGICAL_ERROR); + + auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context); + return external_schema_reader_creator(format_settings); +} void FormatFactory::registerInputFormat(const String & name, InputCreator input_creator) { @@ -358,6 +399,21 @@ void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegm target = std::move(file_segmentation_engine); } +void FormatFactory::registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator) +{ + auto & target = dict[name].schema_reader_creator; + if (target) + throw Exception("FormatFactory: Schema reader " + name + " is already registered", ErrorCodes::LOGICAL_ERROR); + target = std::move(schema_reader_creator); +} + +void FormatFactory::registerExternalSchemaReader(const String & name, ExternalSchemaReaderCreator external_schema_reader_creator) +{ + auto & target = dict[name].external_schema_reader_creator; + if (target) + throw Exception("FormatFactory: Schema reader " + name + " is already registered", ErrorCodes::LOGICAL_ERROR); + target = std::move(external_schema_reader_creator); +} void FormatFactory::markOutputFormatSupportsParallelFormatting(const String & name) { @@ -395,6 +451,23 @@ bool FormatFactory::isOutputFormat(const String & name) const return it != dict.end() && it->second.output_creator; } +bool FormatFactory::checkIfFormatHasSchemaReader(const String & name) +{ + const auto & target = getCreators(name); + return bool(target.schema_reader_creator); +} + +bool FormatFactory::checkIfFormatHasExternalSchemaReader(const String & name) +{ + const auto & target = getCreators(name); + return bool(target.external_schema_reader_creator); +} + +bool FormatFactory::checkIfFormatHasAnySchemaReader(const String & name) +{ + return checkIfFormatHasSchemaReader(name) || checkIfFormatHasExternalSchemaReader(name); +} + FormatFactory & FormatFactory::instance() { static FormatFactory ret; diff --git a/src/Formats/FormatFactory.h b/src/Formats/FormatFactory.h index ea285c47996..a62b32da0cc 100644 --- a/src/Formats/FormatFactory.h +++ b/src/Formats/FormatFactory.h @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include #include @@ -31,6 +33,11 @@ class IOutputFormat; struct RowInputFormatParams; struct RowOutputFormatParams; +class ISchemaReader; +class IExternalSchemaReader; +using SchemaReaderPtr = std::shared_ptr; +using ExternalSchemaReaderPtr = std::shared_ptr; + using InputFormatPtr = std::shared_ptr; using OutputFormatPtr = std::shared_ptr; @@ -85,11 +92,16 @@ private: /// The checker should return true if parallel parsing should be disabled. 
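// ---------------------------------------------------------------------------
// Editorial sketch, not part of the patch: how a format plugs into the new
// schema-inference registry added to FormatFactory above. "MyFormat" and
// MySchemaReader are hypothetical; the reader only has to implement
// ISchemaReader::readSchema().

void registerMyFormatSchemaReader(FormatFactory & factory)
{
    factory.registerSchemaReader("MyFormat",
        [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr /* context */)
        {
            return std::make_shared<MySchemaReader>(buf, settings);
        });
}
// ------------------------------ end of sketch ------------------------------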
using NonTrivialPrefixAndSuffixChecker = std::function; + using SchemaReaderCreator = std::function; + using ExternalSchemaReaderCreator = std::function; + struct Creators { InputCreator input_creator; OutputCreator output_creator; FileSegmentationEngine file_segmentation_engine; + SchemaReaderCreator schema_reader_creator; + ExternalSchemaReaderCreator external_schema_reader_creator; bool supports_parallel_formatting{false}; bool is_column_oriented{false}; NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker; @@ -138,6 +150,17 @@ public: ContextPtr context, const std::optional & format_settings = std::nullopt) const; + SchemaReaderPtr getSchemaReader( + const String & name, + ReadBuffer & buf, + ContextPtr context, + const std::optional & format_settings = std::nullopt) const; + + ExternalSchemaReaderPtr getExternalSchemaReader( + const String & name, + ContextPtr context, + const std::optional & format_settings = std::nullopt) const; + void registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine); void registerNonTrivialPrefixAndSuffixChecker(const String & name, NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker); @@ -146,11 +169,19 @@ public: void registerInputFormat(const String & name, InputCreator input_creator); void registerOutputFormat(const String & name, OutputCreator output_creator); + /// Register schema readers for format its name. + void registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator); + void registerExternalSchemaReader(const String & name, ExternalSchemaReaderCreator external_schema_reader_creator); + void markOutputFormatSupportsParallelFormatting(const String & name); void markFormatAsColumnOriented(const String & name); bool checkIfFormatIsColumnOriented(const String & name); + bool checkIfFormatHasSchemaReader(const String & name); + bool checkIfFormatHasExternalSchemaReader(const String & name); + bool checkIfFormatHasAnySchemaReader(const String & name); + const FormatsDictionary & getAllFormats() const { return dict; @@ -163,6 +194,7 @@ private: FormatsDictionary dict; const Creators & getCreators(const String & name) const; + }; } diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index d9af07fdc9c..6298e959c3e 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -33,6 +33,7 @@ struct FormatSettings bool defaults_for_omitted_fields = true; bool seekable_read = true; + UInt64 max_rows_to_read_for_schema_inference = 100; enum class DateTimeInputFormat { @@ -217,6 +218,11 @@ struct FormatSettings { EnumComparingMode enum_comparing_mode = EnumComparingMode::BY_VALUES; } capn_proto; + + struct + { + UInt64 number_of_columns = 0; + } msgpack; }; } diff --git a/src/Formats/JSONEachRowUtils.cpp b/src/Formats/JSONEachRowUtils.cpp index b55e9f59cc7..c63b8453634 100644 --- a/src/Formats/JSONEachRowUtils.cpp +++ b/src/Formats/JSONEachRowUtils.cpp @@ -1,7 +1,17 @@ #include #include +#include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include @@ -26,7 +36,7 @@ static std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer while (loadAtPosition(in, memory, pos) && (balance || memory.size() + static_cast(pos - in.position()) < min_chunk_size || number_of_rows < min_rows)) { const auto current_object_size = memory.size() + static_cast(pos - in.position()); - if (current_object_size > 10 * min_chunk_size) + if (min_chunk_size != 0 && 
current_object_size > 10 * min_chunk_size) throw ParsingException("Size of JSON object is extremely large. Expected not greater than " + std::to_string(min_chunk_size) + " bytes, but current is " + std::to_string(current_object_size) + " bytes per row. Increase the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually, most likely JSON is malformed", ErrorCodes::INCORRECT_DATA); @@ -92,6 +102,122 @@ static std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer return {loadAtPosition(in, memory, pos), number_of_rows}; } +template +static String readJSONEachRowLineIntoStringImpl(ReadBuffer & in) +{ + Memory memory; + fileSegmentationEngineJSONEachRowImpl(in, memory, 0, 1); + return String(memory.data(), memory.size()); +} + +template +DataTypePtr getDataTypeFromJSONFieldImpl(const Element & field) +{ + if (field.isNull()) + return nullptr; + + if (field.isBool()) + return makeNullable(std::make_shared()); + + if (field.isInt64() || field.isUInt64() || field.isDouble()) + return makeNullable(std::make_shared()); + + if (field.isString()) + return makeNullable(std::make_shared()); + + if (field.isArray()) + { + auto array = field.getArray(); + + /// Return nullptr in case of empty array because we cannot determine nested type. + if (array.size() == 0) + return nullptr; + + DataTypes nested_data_types; + /// If this array contains fields with different types we will treat it as Tuple. + bool is_tuple = false; + for (const auto element : array) + { + auto type = getDataTypeFromJSONFieldImpl(element); + if (!type) + return nullptr; + + if (!nested_data_types.empty() && type->getName() != nested_data_types.back()->getName()) + is_tuple = true; + + nested_data_types.push_back(std::move(type)); + } + + if (is_tuple) + return std::make_shared(nested_data_types); + + return std::make_shared(nested_data_types.back()); + } + + if (field.isObject()) + { + auto object = field.getObject(); + DataTypePtr value_type; + for (const auto key_value_pair : object) + { + auto type = getDataTypeFromJSONFieldImpl(key_value_pair.second); + if (!type) + return nullptr; + + if (value_type && value_type->getName() != type->getName()) + return nullptr; + + value_type = type; + } + return std::make_shared(std::make_shared(), value_type); + } + + throw Exception{ErrorCodes::INCORRECT_DATA, "Unexpected JSON type"}; +} + +auto getJSONParserAndElement() +{ +#if USE_SIMDJSON + return std::pair(); +#elif USE_RAPIDJSON + return std::pair(); +#else + return std::pair(); +#endif +} + +DataTypePtr getDataTypeFromJSONField(const String & field) +{ + auto [parser, element] = getJSONParserAndElement(); + bool parsed = parser.parse(field, element); + if (!parsed) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object"); + + return getDataTypeFromJSONFieldImpl(element); +} + +template +static DataTypes determineColumnDataTypesFromJSONEachRowDataImpl(ReadBuffer & in, bool /*json_strings*/, Extractor & extractor) +{ + String line = readJSONEachRowLineIntoStringImpl(in); + auto [parser, element] = getJSONParserAndElement(); + bool parsed = parser.parse(line, element); + if (!parsed) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object"); + + auto fields = extractor.extract(element); + + DataTypes data_types; + data_types.reserve(fields.size()); + for (const auto & field : fields) + data_types.push_back(getDataTypeFromJSONFieldImpl(field)); + + /// TODO: For JSONStringsEachRow/JSONCompactStringsEach all types will be strings. 
+ /// Should we try to parse data inside strings somehow in this case? + + return data_types; +} + std::pair fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size) { return fileSegmentationEngineJSONEachRowImpl<'{', '}'>(in, memory, min_chunk_size, 1); @@ -102,6 +228,60 @@ std::pair fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in return fileSegmentationEngineJSONEachRowImpl<'[', ']'>(in, memory, min_chunk_size, min_rows); } +struct JSONEachRowFieldsExtractor +{ + template + std::vector extract(const Element & element) + { + /// {..., "" : , ...} + auto object = element.getObject(); + std::vector fields; + fields.reserve(object.size()); + column_names.reserve(object.size()); + for (const auto & key_value_pair : object) + { + column_names.emplace_back(key_value_pair.first); + fields.push_back(key_value_pair.second); + } + + return fields; + } + + std::vector column_names; +}; + +std::unordered_map readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings) +{ + JSONEachRowFieldsExtractor extractor; + auto data_types = determineColumnDataTypesFromJSONEachRowDataImpl(in, json_strings, extractor); + std::unordered_map result; + for (size_t i = 0; i != extractor.column_names.size(); ++i) + result[extractor.column_names[i]] = data_types[i]; + return result; +} + +struct JSONCompactEachRowFieldsExtractor +{ + template + std::vector extract(const Element & element) + { + /// [..., , ...] + auto array = element.getArray(); + std::vector fields; + fields.reserve(array.size()); + for (size_t i = 0; i != array.size(); ++i) + fields.push_back(array[i]); + return fields; + } +}; + +DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, bool json_strings) +{ + JSONCompactEachRowFieldsExtractor extractor; + return determineColumnDataTypesFromJSONEachRowDataImpl(in, json_strings, extractor); +} + + bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf) { /// For JSONEachRow we can safely skip whitespace characters diff --git a/src/Formats/JSONEachRowUtils.h b/src/Formats/JSONEachRowUtils.h index 4a049aa1abd..6f71baa8b40 100644 --- a/src/Formats/JSONEachRowUtils.h +++ b/src/Formats/JSONEachRowUtils.h @@ -11,6 +11,21 @@ namespace DB std::pair fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size); std::pair fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, size_t min_rows); + +/// Parse JSON from string and convert it's type to ClickHouse type. Make the result type always Nullable. +/// JSON array with different nested types is treated as Tuple. +/// If cannot convert (for example when field contains null), return nullptr. +DataTypePtr getDataTypeFromJSONField(const String & field); + +/// Read row in JSONEachRow format and try to determine type for each field. +/// Return map {column_name : type}. +/// If cannot determine the type of some field, return nullptr for it. +std::unordered_map readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings); + +/// Read row in JSONCompactEachRow format and try to determine type for each field. +/// If cannot determine the type of some field, return nullptr for it. 
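// ---------------------------------------------------------------------------
// Editorial sketch, not part of the patch: inferring per-column types from one
// line with the JSONEachRow helper declared just above (hypothetical sample
// data; results follow the Nullable/Array/Tuple rules described here).

ReadBufferFromString buf(R"({"id": 1, "name": "abc", "tags": ["x", "y"]})");
auto name_to_type = readRowAndGetNamesAndDataTypesForJSONEachRow(buf, /* json_strings = */ false);
/// name_to_type["id"]   -> Nullable(Float64)
/// name_to_type["name"] -> Nullable(String)
/// name_to_type["tags"] -> Array(Nullable(String))
// ------------------------------ end of sketch ------------------------------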
+DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, bool json_strings); + bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf); bool readFieldImpl(ReadBuffer & in, IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, const String & column_name, const FormatSettings & format_settings, bool yield_strings); diff --git a/src/Formats/ParsedTemplateFormatString.cpp b/src/Formats/ParsedTemplateFormatString.cpp index 4966420f05b..8d1b987d01a 100644 --- a/src/Formats/ParsedTemplateFormatString.cpp +++ b/src/Formats/ParsedTemplateFormatString.cpp @@ -14,14 +14,14 @@ namespace ErrorCodes extern const int INVALID_TEMPLATE_FORMAT; } -ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name) +ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name, bool allow_indexes) { ReadBufferFromFile schema_file(schema.absoluteSchemaPath(), 4096); String format_string; readStringUntilEOF(format_string, schema_file); try { - parse(format_string, idx_by_name); + parse(format_string, idx_by_name, allow_indexes); } catch (DB::Exception & e) { @@ -33,7 +33,7 @@ ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & } -void ParsedTemplateFormatString::parse(const String & format_string, const ColumnIdxGetter & idx_by_name) +void ParsedTemplateFormatString::parse(const String & format_string, const ColumnIdxGetter & idx_by_name, bool allow_indexes) { enum ParserState { @@ -100,6 +100,8 @@ void ParsedTemplateFormatString::parse(const String & format_string, const Colum column_idx = strtoull(column_names.back().c_str(), &col_idx_end, 10); if (col_idx_end != column_names.back().c_str() + column_names.back().size() || errno) column_idx = idx_by_name(column_names.back()); + else if (!allow_indexes) + throw Exception(ErrorCodes::INVALID_TEMPLATE_FORMAT, "Indexes instead of names are not allowed"); } format_idx_to_column_idx.emplace_back(column_idx); break; diff --git a/src/Formats/ParsedTemplateFormatString.h b/src/Formats/ParsedTemplateFormatString.h index ba0ebdf5aa8..c5617d0f0ef 100644 --- a/src/Formats/ParsedTemplateFormatString.h +++ b/src/Formats/ParsedTemplateFormatString.h @@ -31,9 +31,9 @@ struct ParsedTemplateFormatString typedef std::function(const String &)> ColumnIdxGetter; ParsedTemplateFormatString() = default; - ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name); + ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name, bool allow_indexes = true); - void parse(const String & format_string, const ColumnIdxGetter & idx_by_name); + void parse(const String & format_string, const ColumnIdxGetter & idx_by_name, bool allow_indexes = true); static const char * readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s); size_t columnsCount() const; diff --git a/src/Formats/ProtobufSerializer.cpp b/src/Formats/ProtobufSerializer.cpp index 5232b76b7fe..b59db12a16c 100644 --- a/src/Formats/ProtobufSerializer.cpp +++ b/src/Formats/ProtobufSerializer.cpp @@ -24,6 +24,7 @@ # include # include # include +# include # include # include # include @@ -56,6 +57,7 @@ namespace ErrorCodes extern const int PROTOBUF_FIELD_NOT_REPEATED; extern const int PROTOBUF_BAD_CAST; extern const int LOGICAL_ERROR; + extern const int BAD_ARGUMENTS; } namespace @@ -3017,10 +3019,8 @@ namespace { std::vector column_names_used; 
column_names_used.reserve(used_column_indices_in_nested.size()); - for (size_t i : used_column_indices_in_nested) column_names_used.emplace_back(nested_column_names[i]); - auto field_serializer = std::make_unique( std::move(column_names_used), field_descriptor, std::move(nested_message_serializer), get_root_desc_function); transformColumnIndices(used_column_indices_in_nested, nested_column_indices); @@ -3230,8 +3230,105 @@ namespace std::function get_root_desc_function; std::shared_ptr root_serializer_ptr; }; -} + template + DataTypePtr getEnumDataType(const google::protobuf::EnumDescriptor * enum_descriptor) + { + std::vector> values; + for (int i = 0; i != enum_descriptor->value_count(); ++i) + { + const auto * enum_value_descriptor = enum_descriptor->value(i); + values.emplace_back(enum_value_descriptor->name(), enum_value_descriptor->number()); + } + return std::make_shared>(std::move(values)); + } + + NameAndTypePair getNameAndDataTypeFromField(const google::protobuf::FieldDescriptor * field_descriptor, bool allow_repeat = true) + { + if (allow_repeat && field_descriptor->is_map()) + { + auto name_and_type = getNameAndDataTypeFromField(field_descriptor, false); + const auto * tuple_type = assert_cast(name_and_type.type.get()); + return {name_and_type.name, std::make_shared(tuple_type->getElements())}; + } + + if (allow_repeat && field_descriptor->is_repeated()) + { + auto name_and_type = getNameAndDataTypeFromField(field_descriptor, false); + return {name_and_type.name, std::make_shared(name_and_type.type)}; + } + + switch (field_descriptor->type()) + { + case FieldTypeId::TYPE_SFIXED32: [[fallthrough]]; + case FieldTypeId::TYPE_SINT32: [[fallthrough]]; + case FieldTypeId::TYPE_INT32: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_SFIXED64: [[fallthrough]]; + case FieldTypeId::TYPE_SINT64: [[fallthrough]]; + case FieldTypeId::TYPE_INT64: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_BOOL: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_FLOAT: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_DOUBLE: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_UINT32: [[fallthrough]]; + case FieldTypeId::TYPE_FIXED32: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_UINT64: [[fallthrough]]; + case FieldTypeId::TYPE_FIXED64: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_BYTES: [[fallthrough]]; + case FieldTypeId::TYPE_STRING: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_ENUM: + { + const auto * enum_descriptor = field_descriptor->enum_type(); + if (enum_descriptor->value_count() == 0) + throw Exception("Empty enum field", ErrorCodes::BAD_ARGUMENTS); + int max_abs = std::abs(enum_descriptor->value(0)->number()); + for (int i = 1; i != enum_descriptor->value_count(); ++i) + { + if (std::abs(enum_descriptor->value(i)->number()) > max_abs) + max_abs = std::abs(enum_descriptor->value(i)->number()); + } + if (max_abs < 128) + return {field_descriptor->name(), getEnumDataType(enum_descriptor)}; + else if (max_abs < 32768) + return {field_descriptor->name(), getEnumDataType(enum_descriptor)}; + else + throw Exception("ClickHouse supports only 8-bit and 16-bit enums", ErrorCodes::BAD_ARGUMENTS); + } + case FieldTypeId::TYPE_GROUP: [[fallthrough]]; + case FieldTypeId::TYPE_MESSAGE: + { + const auto * message_descriptor = 
field_descriptor->message_type(); + if (message_descriptor->field_count() == 1) + { + const auto * nested_field_descriptor = message_descriptor->field(0); + auto nested_name_and_type = getNameAndDataTypeFromField(nested_field_descriptor); + return {field_descriptor->name() + "_" + nested_name_and_type.name, nested_name_and_type.type}; + } + else + { + DataTypes nested_types; + Strings nested_names; + for (int i = 0; i != message_descriptor->field_count(); ++i) + { + auto nested_name_and_type = getNameAndDataTypeFromField(message_descriptor->field(i)); + nested_types.push_back(nested_name_and_type.type); + nested_names.push_back(nested_name_and_type.name); + } + return {field_descriptor->name(), std::make_shared(std::move(nested_types), std::move(nested_names))}; + } + } + } + + __builtin_unreachable(); + } +} std::unique_ptr ProtobufSerializer::create( const Strings & column_names, @@ -3254,5 +3351,14 @@ std::unique_ptr ProtobufSerializer::create( std::vector missing_column_indices; return ProtobufSerializerBuilder(writer).buildMessageSerializer(column_names, data_types, missing_column_indices, message_descriptor, with_length_delimiter); } + +NamesAndTypesList protobufSchemaToCHSchema(const google::protobuf::Descriptor * message_descriptor) +{ + NamesAndTypesList schema; + for (int i = 0; i != message_descriptor->field_count(); ++i) + schema.push_back(getNameAndDataTypeFromField(message_descriptor->field(i))); + return schema; +} + } #endif diff --git a/src/Formats/ProtobufSerializer.h b/src/Formats/ProtobufSerializer.h index 3eaca6a18d6..d9bed913517 100644 --- a/src/Formats/ProtobufSerializer.h +++ b/src/Formats/ProtobufSerializer.h @@ -4,6 +4,7 @@ #if USE_PROTOBUF # include +#include namespace google::protobuf { class Descriptor; } @@ -48,5 +49,7 @@ public: ProtobufWriter & writer); }; +NamesAndTypesList protobufSchemaToCHSchema(const google::protobuf::Descriptor * message_descriptor); + } #endif diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp new file mode 100644 index 00000000000..37067eae64f --- /dev/null +++ b/src/Formats/ReadSchemaUtils.cpp @@ -0,0 +1,112 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + extern const int BAD_ARGUMENTS; +} + +ColumnsDescription readSchemaFromFormat(const String & format_name, const std::optional & format_settings, ReadBufferCreator read_buffer_creator, ContextPtr context) +{ + NamesAndTypesList names_and_types; + if (FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format_name)) + { + auto external_schema_reader = FormatFactory::instance().getExternalSchemaReader(format_name, context, format_settings); + try + { + names_and_types = external_schema_reader->readSchema(); + } + catch (const DB::Exception & e) + { + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file. 
Error: {}", format_name, e.message()); + } + } + else if (FormatFactory::instance().checkIfFormatHasSchemaReader(format_name)) + { + auto read_buf = read_buffer_creator(); + if (read_buf->eof()) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file, file is empty", format_name); + + auto schema_reader = FormatFactory::instance().getSchemaReader(format_name, *read_buf, context, format_settings); + try + { + names_and_types = schema_reader->readSchema(); + } + catch (const DB::Exception & e) + { + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file. Error: {}", format_name, e.message()); + } + } + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "{} file format doesn't support schema inference", format_name); + + return ColumnsDescription(names_and_types); +} + +DataTypePtr generalizeDataType(DataTypePtr type) +{ + WhichDataType which(type); + + if (which.isNothing()) + return nullptr; + + if (which.isNullable()) + { + const auto * nullable_type = assert_cast(type.get()); + return generalizeDataType(nullable_type->getNestedType()); + } + + if (isNumber(type)) + return makeNullable(std::make_shared()); + + if (which.isArray()) + { + const auto * array_type = assert_cast(type.get()); + auto nested_type = generalizeDataType(array_type->getNestedType()); + return nested_type ? std::make_shared(nested_type) : nullptr; + } + + if (which.isTuple()) + { + const auto * tuple_type = assert_cast(type.get()); + DataTypes nested_types; + for (const auto & element : tuple_type->getElements()) + { + auto nested_type = generalizeDataType(element); + if (!nested_type) + return nullptr; + nested_types.push_back(nested_type); + } + return std::make_shared(std::move(nested_types)); + } + + if (which.isMap()) + { + const auto * map_type = assert_cast(type.get()); + auto key_type = removeNullable(generalizeDataType(map_type->getKeyType())); + auto value_type = generalizeDataType(map_type->getValueType()); + return key_type && value_type ? std::make_shared(key_type, value_type) : nullptr; + } + + if (which.isLowCarnality()) + { + const auto * lc_type = assert_cast(type.get()); + auto nested_type = generalizeDataType(lc_type->getDictionaryType()); + return nested_type ? std::make_shared(nested_type) : nullptr; + } + + return makeNullable(type); +} + +} diff --git a/src/Formats/ReadSchemaUtils.h b/src/Formats/ReadSchemaUtils.h new file mode 100644 index 00000000000..fb43acc3cd6 --- /dev/null +++ b/src/Formats/ReadSchemaUtils.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include + +namespace DB +{ + +/// Try to determine the schema of the data in specifying format. +/// For formats that have an external schema reader, it will +/// use it and won't create a read buffer. +/// For formats that have a schema reader from the data, +/// read buffer will be created by the provided creator and +/// the schema will be extracted from the data. +/// If format doesn't have any schema reader or a schema reader +/// couldn't determine the schema, an exception will be thrown. 
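// ---------------------------------------------------------------------------
// Editorial sketch, not part of the patch: using readSchemaFromFormat() as
// described above, with a hypothetical local sample file. The buffer creator
// is only invoked for formats whose schema reader has to look at the data;
// formats that take their schema from an external schema file (such as
// Protobuf) never open the data at all.

ColumnsDescription inferStructure(ContextPtr context)
{
    auto creator = []() -> std::unique_ptr<ReadBuffer>
    {
        return std::make_unique<ReadBufferFromFile>("/tmp/sample.jsonl");   /// hypothetical path
    };
    return readSchemaFromFormat("JSONEachRow", /* format_settings = */ std::nullopt, creator, context);
}
// ------------------------------ end of sketch ------------------------------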
+using ReadBufferCreator = std::function()>; +ColumnsDescription readSchemaFromFormat(const String & format_name, const std::optional & format_settings, ReadBufferCreator read_buffer_creator, ContextPtr context); + +/// Convert type to the most general type: +/// - IntN, UIntN, FloatN, Decimal -> Float64 +/// - Type -> Nullable(type) +/// - Array(Type) -> Array(Nullable(Type)) +/// - Tuple(Type1, ..., TypeN) -> Tuple(Nullable(Type1), ..., Nullable(TypeN)) +/// - Map(KeyType, ValueType) -> Map(KeyType, Nullable(ValueType)) +/// - LowCardinality(Type) -> LowCardinality(Nullable(Type)) +/// If type is Nothing or one of the nested types is Nothing, return nullptr. +DataTypePtr generalizeDataType(DataTypePtr type); + +} diff --git a/src/Formats/config_formats.h.in b/src/Formats/config_formats.h.in index f6497b4830b..427abc7d1ce 100644 --- a/src/Formats/config_formats.h.in +++ b/src/Formats/config_formats.h.in @@ -10,4 +10,3 @@ #cmakedefine01 USE_ARROW #cmakedefine01 USE_PROTOBUF #cmakedefine01 USE_MSGPACK - diff --git a/src/Formats/registerFormats.cpp b/src/Formats/registerFormats.cpp index 7425c6898de..1349c9e3323 100644 --- a/src/Formats/registerFormats.cpp +++ b/src/Formats/registerFormats.cpp @@ -81,6 +81,28 @@ void registerInputFormatCapnProto(FormatFactory & factory); void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory); void registerNonTrivialPrefixAndSuffixCheckerJSONAsString(FormatFactory & factory); +void registerArrowSchemaReader(FormatFactory & factory); +void registerParquetSchemaReader(FormatFactory & factory); +void registerORCSchemaReader(FormatFactory & factory); +void registerTSVSchemaReader(FormatFactory & factory); +void registerCSVSchemaReader(FormatFactory & factory); +void registerJSONCompactEachRowSchemaReader(FormatFactory & factory); +void registerJSONEachRowSchemaReader(FormatFactory & factory); +void registerNativeSchemaReader(FormatFactory & factory); +void registerRowBinaryWithNamesAndTypesSchemaReader(FormatFactory & factory); +void registerAvroSchemaReader(FormatFactory & factory); +void registerProtobufSchemaReader(FormatFactory & factory); +void registerLineAsStringSchemaReader(FormatFactory & factory); +void registerJSONAsStringSchemaReader(FormatFactory & factory); +void registerRawBLOBSchemaReader(FormatFactory & factory); +void registerMsgPackSchemaReader(FormatFactory & factory); +void registerCapnProtoSchemaReader(FormatFactory & factory); +void registerCustomSeparatedSchemaReader(FormatFactory & factory); +void registerRegexpSchemaReader(FormatFactory & factory); +void registerTSKVSchemaReader(FormatFactory & factory); +void registerValuesSchemaReader(FormatFactory & factory); +void registerTemplateSchemaReader(FormatFactory & factory); + void registerFormats() { auto & factory = FormatFactory::instance(); @@ -152,6 +174,28 @@ void registerFormats() registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(factory); registerNonTrivialPrefixAndSuffixCheckerJSONAsString(factory); + + registerArrowSchemaReader(factory); + registerParquetSchemaReader(factory); + registerORCSchemaReader(factory); + registerTSVSchemaReader(factory); + registerCSVSchemaReader(factory); + registerJSONCompactEachRowSchemaReader(factory); + registerJSONEachRowSchemaReader(factory); + registerNativeSchemaReader(factory); + registerRowBinaryWithNamesAndTypesSchemaReader(factory); + registerAvroSchemaReader(factory); + registerProtobufSchemaReader(factory); + registerLineAsStringSchemaReader(factory); + registerJSONAsStringSchemaReader(factory); + 
registerRawBLOBSchemaReader(factory); + registerMsgPackSchemaReader(factory); + registerCapnProtoSchemaReader(factory); + registerCustomSeparatedSchemaReader(factory); + registerRegexpSchemaReader(factory); + registerTSKVSchemaReader(factory); + registerValuesSchemaReader(factory); + registerTemplateSchemaReader(factory); } } diff --git a/src/Functions/CustomWeekTransforms.h b/src/Functions/CustomWeekTransforms.h index 5ccb2e06c44..8656f9da927 100644 --- a/src/Functions/CustomWeekTransforms.h +++ b/src/Functions/CustomWeekTransforms.h @@ -76,7 +76,7 @@ struct ToStartOfWeekImpl } static inline UInt16 execute(UInt16 d, UInt8 week_mode, const DateLUTImpl & time_zone) { - return time_zone.toFirstDayNumOfWeek(ExtendedDayNum(d), week_mode); + return time_zone.toFirstDayNumOfWeek(DayNum(d), week_mode); } using FactorTransform = ZeroTransform; diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h index 08dac9c2ba0..a7f06689820 100644 --- a/src/Functions/DateTimeTransforms.h +++ b/src/Functions/DateTimeTransforms.h @@ -84,7 +84,8 @@ struct ToDate32Impl } static inline Int32 execute(UInt32 t, const DateLUTImpl & time_zone) { - return Int32(time_zone.toDayNum(t)); + /// Don't saturate. + return Int32(time_zone.toDayNum(t)); } static inline Int32 execute(Int32 d, const DateLUTImpl &) { @@ -117,7 +118,7 @@ struct ToStartOfDayImpl } static inline UInt32 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toDate(ExtendedDayNum(d)); + return time_zone.toDate(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -143,7 +144,7 @@ struct ToMondayImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toFirstDayNumOfWeek(ExtendedDayNum(d)); + return time_zone.toFirstDayNumOfWeek(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -167,7 +168,7 @@ struct ToStartOfMonthImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toFirstDayNumOfMonth(ExtendedDayNum(d)); + return time_zone.toFirstDayNumOfMonth(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -191,7 +192,7 @@ struct ToStartOfQuarterImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toFirstDayNumOfQuarter(ExtendedDayNum(d)); + return time_zone.toFirstDayNumOfQuarter(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -215,7 +216,7 @@ struct ToStartOfYearImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toFirstDayNumOfYear(ExtendedDayNum(d)); + return time_zone.toFirstDayNumOfYear(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -224,7 +225,7 @@ struct ToStartOfYearImpl struct ToTimeImpl { - /// When transforming to time, the date will be equated to 1970-01-01. + /// When transforming to time, the date will be equated to 1970-01-02. 
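The pattern behind the long run of ExtendedDayNum -> DayNum edits in these date/time transform headers, pulled out for readability (ToYearSketch is a made-up name, the real structs are ToYearImpl, ToMonthImpl, and so on): a UInt16 argument is a plain Date and always fits DayNum, so ExtendedDayNum stays only on the Int32/Date32 overloads.

    struct ToYearSketch
    {
        /// Date32 values can lie outside the DayNum range, so keep ExtendedDayNum here.
        static UInt16 execute(Int32 d, const DateLUTImpl & time_zone)
        {
            return time_zone.toYear(ExtendedDayNum(d));
        }
        /// A plain Date always fits DayNum, so the narrower type is enough.
        static UInt16 execute(UInt16 d, const DateLUTImpl & time_zone)
        {
            return time_zone.toYear(DayNum(d));
        }
    };
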
static constexpr auto name = "toTime"; static UInt32 execute(const DecimalUtils::DecimalComponents & t, const DateLUTImpl & time_zone) @@ -456,7 +457,7 @@ struct ToYearImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toYear(ExtendedDayNum(d)); + return time_zone.toYear(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -480,7 +481,7 @@ struct ToQuarterImpl } static inline UInt8 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toQuarter(ExtendedDayNum(d)); + return time_zone.toQuarter(DayNum(d)); } using FactorTransform = ToStartOfYearImpl; @@ -504,7 +505,7 @@ struct ToMonthImpl } static inline UInt8 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toMonth(ExtendedDayNum(d)); + return time_zone.toMonth(DayNum(d)); } using FactorTransform = ToStartOfYearImpl; @@ -528,7 +529,7 @@ struct ToDayOfMonthImpl } static inline UInt8 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toDayOfMonth(ExtendedDayNum(d)); + return time_zone.toDayOfMonth(DayNum(d)); } using FactorTransform = ToStartOfMonthImpl; @@ -552,7 +553,7 @@ struct ToDayOfWeekImpl } static inline UInt8 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toDayOfWeek(ExtendedDayNum(d)); + return time_zone.toDayOfWeek(DayNum(d)); } using FactorTransform = ToMondayImpl; @@ -576,7 +577,7 @@ struct ToDayOfYearImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toDayOfYear(ExtendedDayNum(d)); + return time_zone.toDayOfYear(DayNum(d)); } using FactorTransform = ToStartOfYearImpl; @@ -699,7 +700,7 @@ struct ToISOYearImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toISOYear(ExtendedDayNum(d)); + return time_zone.toISOYear(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -723,7 +724,7 @@ struct ToStartOfISOYearImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toFirstDayNumOfISOYear(ExtendedDayNum(d)); + return time_zone.toFirstDayNumOfISOYear(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -747,7 +748,7 @@ struct ToISOWeekImpl } static inline UInt8 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toISOWeek(ExtendedDayNum(d)); + return time_zone.toISOWeek(DayNum(d)); } using FactorTransform = ToISOYearImpl; @@ -771,7 +772,7 @@ struct ToRelativeYearNumImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toYear(ExtendedDayNum(d)); + return time_zone.toYear(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -795,7 +796,7 @@ struct ToRelativeQuarterNumImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toRelativeQuarterNum(ExtendedDayNum(d)); + return time_zone.toRelativeQuarterNum(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -819,7 +820,7 @@ struct ToRelativeMonthNumImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toRelativeMonthNum(ExtendedDayNum(d)); + return time_zone.toRelativeMonthNum(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -843,7 +844,7 @@ struct ToRelativeWeekNumImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toRelativeWeekNum(ExtendedDayNum(d)); + return time_zone.toRelativeWeekNum(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -892,7 +893,7 @@ struct ToRelativeHourNumImpl } static inline UInt32 
execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toRelativeHourNum(ExtendedDayNum(d)); + return time_zone.toRelativeHourNum(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -916,7 +917,7 @@ struct ToRelativeMinuteNumImpl } static inline UInt32 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toRelativeMinuteNum(ExtendedDayNum(d)); + return time_zone.toRelativeMinuteNum(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -940,7 +941,7 @@ struct ToRelativeSecondNumImpl } static inline UInt32 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.fromDayNum(ExtendedDayNum(d)); + return time_zone.fromDayNum(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -960,11 +961,11 @@ struct ToYYYYMMImpl } static inline UInt32 execute(Int32 d, const DateLUTImpl & time_zone) { - return time_zone.toNumYYYYMM(static_cast(d)); + return time_zone.toNumYYYYMM(ExtendedDayNum(d)); } static inline UInt32 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toNumYYYYMM(static_cast(d)); + return time_zone.toNumYYYYMM(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -984,11 +985,11 @@ struct ToYYYYMMDDImpl } static inline UInt32 execute(Int32 d, const DateLUTImpl & time_zone) { - return time_zone.toNumYYYYMMDD(static_cast(d)); + return time_zone.toNumYYYYMMDD(ExtendedDayNum(d)); } static inline UInt32 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toNumYYYYMMDD(static_cast(d)); + return time_zone.toNumYYYYMMDD(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -1008,11 +1009,11 @@ struct ToYYYYMMDDhhmmssImpl } static inline UInt64 execute(Int32 d, const DateLUTImpl & time_zone) { - return time_zone.toNumYYYYMMDDhhmmss(time_zone.toDate(static_cast(d))); + return time_zone.toNumYYYYMMDDhhmmss(time_zone.toDate(ExtendedDayNum(d))); } static inline UInt64 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toNumYYYYMMDDhhmmss(time_zone.toDate(static_cast(d))); + return time_zone.toNumYYYYMMDDhhmmss(time_zone.toDate(DayNum(d))); } using FactorTransform = ZeroTransform; diff --git a/src/Functions/FunctionDateOrDateTimeAddInterval.h b/src/Functions/FunctionDateOrDateTimeAddInterval.h index 4224a74ae8e..8f6b1370935 100644 --- a/src/Functions/FunctionDateOrDateTimeAddInterval.h +++ b/src/Functions/FunctionDateOrDateTimeAddInterval.h @@ -58,7 +58,7 @@ struct AddSecondsImpl } static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt16 d, Int64 delta, const DateLUTImpl & time_zone) { - return time_zone.fromDayNum(ExtendedDayNum(d)) + delta; + return time_zone.fromDayNum(DayNum(d)) + delta; } }; @@ -83,7 +83,7 @@ struct AddMinutesImpl } static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt16 d, Int64 delta, const DateLUTImpl & time_zone) { - return time_zone.fromDayNum(ExtendedDayNum(d)) + delta * 60; + return time_zone.fromDayNum(DayNum(d)) + delta * 60; } }; @@ -107,7 +107,7 @@ struct AddHoursImpl } static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt16 d, Int64 delta, const DateLUTImpl & time_zone) { - return time_zone.fromDayNum(ExtendedDayNum(d)) + delta * 3600; + return time_zone.fromDayNum(DayNum(d)) + delta * 3600; } }; @@ -180,7 +180,7 @@ struct AddMonthsImpl static inline UInt16 execute(UInt16 d, Int64 delta, const DateLUTImpl & time_zone) { - return time_zone.addMonths(ExtendedDayNum(d), delta); + return time_zone.addMonths(DayNum(d), delta); } static inline Int32 execute(Int32 d, Int64 delta, const DateLUTImpl & time_zone) @@ -206,7 +206,7 @@ struct AddQuartersImpl static 
inline UInt16 execute(UInt16 d, Int32 delta, const DateLUTImpl & time_zone) { - return time_zone.addQuarters(ExtendedDayNum(d), delta); + return time_zone.addQuarters(DayNum(d), delta); } static inline Int32 execute(Int32 d, Int32 delta, const DateLUTImpl & time_zone) @@ -232,7 +232,7 @@ struct AddYearsImpl static inline UInt16 execute(UInt16 d, Int64 delta, const DateLUTImpl & time_zone) { - return time_zone.addYears(ExtendedDayNum(d), delta); + return time_zone.addYears(DayNum(d), delta); } static inline Int32 execute(Int32 d, Int64 delta, const DateLUTImpl & time_zone) diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 8018fa8e726..62e62b5f5dc 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -1835,6 +1835,8 @@ public: size_t getNumberOfArguments() const override { return 0; } bool useDefaultImplementationForConstants() const override { return true; } + bool canBeExecutedOnDefaultArguments() const override { return false; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override diff --git a/src/Functions/FunctionsTimeWindow.h b/src/Functions/FunctionsTimeWindow.h index 6e5d79fd062..313de10702d 100644 --- a/src/Functions/FunctionsTimeWindow.h +++ b/src/Functions/FunctionsTimeWindow.h @@ -48,7 +48,7 @@ struct ToStartOfTransform; template <> \ struct ToStartOfTransform \ { \ - static ExtendedDayNum execute(UInt32 t, UInt64 delta, const DateLUTImpl & time_zone) \ + static auto execute(UInt32 t, UInt64 delta, const DateLUTImpl & time_zone) \ { \ return time_zone.toStartOf##INTERVAL_KIND##Interval(time_zone.toDayNum(t), delta); \ } \ @@ -89,7 +89,7 @@ struct ToStartOfTransform; template <> \ struct AddTime \ { \ - static inline ExtendedDayNum execute(UInt16 d, UInt64 delta, const DateLUTImpl & time_zone) \ + static inline auto execute(UInt16 d, UInt64 delta, const DateLUTImpl & time_zone) \ { \ return time_zone.add##INTERVAL_KIND##s(ExtendedDayNum(d), delta); \ } \ diff --git a/src/Functions/GatherUtils/Algorithms.h b/src/Functions/GatherUtils/Algorithms.h index fc54eaf88ab..046e2dcf70f 100644 --- a/src/Functions/GatherUtils/Algorithms.h +++ b/src/Functions/GatherUtils/Algorithms.h @@ -347,18 +347,31 @@ void NO_INLINE sliceDynamicOffsetUnbounded(Source && src, Sink && sink, const IC } } -template -void NO_INLINE sliceDynamicOffsetBounded(Source && src, Sink && sink, const IColumn & offset_column, const IColumn & length_column) -{ - const bool is_offset_null = offset_column.onlyNull(); - const auto * offset_nullable = typeid_cast(&offset_column); - const ColumnUInt8::Container * offset_null_map = offset_nullable ? &offset_nullable->getNullMapData() : nullptr; - const IColumn * offset_nested_column = offset_nullable ? &offset_nullable->getNestedColumn() : &offset_column; - const bool is_length_null = length_column.onlyNull(); - const auto * length_nullable = typeid_cast(&length_column); - const ColumnUInt8::Container * length_null_map = length_nullable ? &length_nullable->getNullMapData() : nullptr; - const IColumn * length_nested_column = length_nullable ? 
&length_nullable->getNestedColumn() : &length_column; +template +static void sliceDynamicOffsetBoundedImpl(Source && src, Sink && sink, const IColumn * offset_column, const IColumn * length_column) +{ + const bool is_offset_null = !offset_column || offset_column->onlyNull(); + const ColumnUInt8::Container * offset_null_map = nullptr; + const IColumn * offset_nested_column = nullptr; + + if (!is_offset_null) + { + const auto * offset_nullable = typeid_cast(offset_column); + offset_null_map = offset_nullable ? &offset_nullable->getNullMapData() : nullptr; + offset_nested_column = offset_nullable ? &offset_nullable->getNestedColumn() : offset_column; + } + + const bool is_length_null = !length_column || length_column->onlyNull(); + const ColumnUInt8::Container * length_null_map = nullptr; + const IColumn * length_nested_column = nullptr; + + if (!is_length_null) + { + const auto * length_nullable = typeid_cast(length_column); + length_null_map = length_nullable ? &length_nullable->getNullMapData() : nullptr; + length_nested_column = length_nullable ? &length_nullable->getNestedColumn() : length_column; + } while (!src.isEnd()) { @@ -376,9 +389,19 @@ void NO_INLINE sliceDynamicOffsetBounded(Source && src, Sink && sink, const ICol typename std::decay_t::Slice slice; if (offset > 0) - slice = src.getSliceFromLeft(offset - 1, size); + { + if constexpr (inverse) + slice = src.getSliceFromRight(UInt64(size) + UInt64(offset) - 1, size); + else + slice = src.getSliceFromLeft(UInt64(offset) - 1, size); + } else - slice = src.getSliceFromRight(-UInt64(offset), size); + { + if constexpr (inverse) + slice = src.getSliceFromLeft(-UInt64(offset), size); + else + slice = src.getSliceFromRight(-UInt64(offset), size); + } writeSlice(slice, sink); } @@ -389,6 +412,26 @@ void NO_INLINE sliceDynamicOffsetBounded(Source && src, Sink && sink, const ICol } +template +void NO_INLINE sliceDynamicOffsetBounded(Source && src, Sink && sink, const IColumn & offset_column, const IColumn & length_column) +{ + sliceDynamicOffsetBoundedImpl(std::forward(src), std::forward(sink), &offset_column, &length_column); +} + +/// Similar to above, but with no offset. 
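For intuition, this is roughly what the inverse flag of sliceDynamicOffsetBoundedImpl amounts to for the two no-offset wrappers introduced next, ignoring NULL handling and negative lengths; a standalone sketch, not the GatherUtils API:

    #include <algorithm>
    #include <vector>

    /// from_right == false mirrors sliceFromLeftDynamicLength: keep the first `length` elements.
    /// from_right == true mirrors sliceFromRightDynamicLength: keep the last `length` elements.
    std::vector<int> sliceSketch(const std::vector<int> & src, size_t length, bool from_right)
    {
        const size_t n = std::min(length, src.size());
        if (!from_right)
            return std::vector<int>(src.begin(), src.begin() + n);
        return std::vector<int>(src.end() - n, src.end());
    }
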
+template +void NO_INLINE sliceFromLeftDynamicLength(Source && src, Sink && sink, const IColumn & length_column) +{ + sliceDynamicOffsetBoundedImpl(std::forward(src), std::forward(sink), nullptr, &length_column); +} + +template +void NO_INLINE sliceFromRightDynamicLength(Source && src, Sink && sink, const IColumn & length_column) +{ + sliceDynamicOffsetBoundedImpl(std::forward(src), std::forward(sink), nullptr, &length_column); +} + + template void NO_INLINE conditional(SourceA && src_a, SourceB && src_b, Sink && sink, const PaddedPODArray & condition) { @@ -593,6 +636,7 @@ bool insliceEqualElements(const NumericArraySlice & first [[maybe_unused]], else return accurate::equalsOp(first.data[first_ind], first.data[second_ind]); } + inline ALWAYS_INLINE bool insliceEqualElements(const GenericArraySlice & first, size_t first_ind, size_t second_ind) { return first.elements->compareAt(first_ind + first.begin, second_ind + first.begin, *first.elements, -1) == 0; diff --git a/src/Functions/GatherUtils/GatherUtils.h b/src/Functions/GatherUtils/GatherUtils.h index c2513214a79..8a623caa297 100644 --- a/src/Functions/GatherUtils/GatherUtils.h +++ b/src/Functions/GatherUtils/GatherUtils.h @@ -32,9 +32,9 @@ namespace DB::GatherUtils enum class ArraySearchType { - Any, // Corresponds to the hasAny array function - All, // Corresponds to the hasAll array function - Substr // Corresponds to the hasSubstr array function + Any, // Corresponds to the hasAny array function + All, // Corresponds to the hasAll array function + Substr // Corresponds to the hasSubstr array function }; std::unique_ptr createArraySource(const ColumnArray & col, bool is_const, size_t total_rows); @@ -52,6 +52,9 @@ ColumnArray::MutablePtr sliceFromRightConstantOffsetBounded(IArraySource & src, ColumnArray::MutablePtr sliceDynamicOffsetUnbounded(IArraySource & src, const IColumn & offset_column); ColumnArray::MutablePtr sliceDynamicOffsetBounded(IArraySource & src, const IColumn & offset_column, const IColumn & length_column); +ColumnArray::MutablePtr sliceFromLeftDynamicLength(IArraySource & src, const IColumn & length_column); +ColumnArray::MutablePtr sliceFromRightDynamicLength(IArraySource & src, const IColumn & length_column); + void sliceHasAny(IArraySource & first, IArraySource & second, ColumnUInt8 & result); void sliceHasAll(IArraySource & first, IArraySource & second, ColumnUInt8 & result); void sliceHasSubstr(IArraySource & first, IArraySource & second, ColumnUInt8 & result); diff --git a/src/Functions/GatherUtils/Sources.h b/src/Functions/GatherUtils/Sources.h index c8014d3e855..7d1241be7d1 100644 --- a/src/Functions/GatherUtils/Sources.h +++ b/src/Functions/GatherUtils/Sources.h @@ -358,6 +358,11 @@ struct UTF8StringSource : public StringSource return pos; } + size_t getElementSize() const + { + return UTF8::countCodePoints(&elements[prev_offset], StringSource::getElementSize()); + } + Slice getSliceFromLeft(size_t offset) const { const auto * begin = &elements[prev_offset]; diff --git a/src/Functions/GatherUtils/sliceFromLeftDynamicLength.cpp b/src/Functions/GatherUtils/sliceFromLeftDynamicLength.cpp new file mode 100644 index 00000000000..b704f7ada7d --- /dev/null +++ b/src/Functions/GatherUtils/sliceFromLeftDynamicLength.cpp @@ -0,0 +1,60 @@ +#ifndef __clang_analyzer__ // It's too hard to analyze. 
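The new UTF8StringSource::getElementSize above measures the current row in code points rather than bytes, which is what allows the length argument of the UTF8-aware string functions added further down (LeftRight.h) to be interpreted in characters. A self-contained illustration of the same counting helper; the sample string and function name are assumptions:

    #include <Common/UTF8Helpers.h>
    #include <base/types.h>
    #include <string_view>

    /// "привет" occupies 12 bytes in UTF-8 but contains 6 code points,
    /// so a UTF8-aware source reports an element size of 6, not 12.
    size_t utf8Length(std::string_view s)
    {
        return DB::UTF8::countCodePoints(reinterpret_cast<const UInt8 *>(s.data()), s.size());
    }
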
+ +#include "GatherUtils.h" +#include "Selectors.h" +#include "Algorithms.h" + +namespace DB::GatherUtils +{ + +namespace +{ + +struct Selector : public ArraySourceSelector +{ + template + static void selectSource(bool is_const, bool is_nullable, Source && source, + const IColumn & length_column, ColumnArray::MutablePtr & result) + { + using SourceType = typename std::decay::type; + using Sink = typename SourceType::SinkType; + + if (is_nullable) + { + using NullableSource = NullableArraySource; + using NullableSink = typename NullableSource::SinkType; + + auto & nullable_source = static_cast(source); + + result = ColumnArray::create(nullable_source.createValuesColumn()); + NullableSink sink(result->getData(), result->getOffsets(), source.getColumnSize()); + + if (is_const) + sliceFromLeftDynamicLength(static_cast &>(source), sink, length_column); + else + sliceFromLeftDynamicLength(static_cast(source), sink, length_column); + } + else + { + result = ColumnArray::create(source.createValuesColumn()); + Sink sink(result->getData(), result->getOffsets(), source.getColumnSize()); + + if (is_const) + sliceFromLeftDynamicLength(static_cast &>(source), sink, length_column); + else + sliceFromLeftDynamicLength(source, sink, length_column); + } + } +}; + +} + +ColumnArray::MutablePtr sliceFromLeftDynamicLength(IArraySource & src, const IColumn & length_column) +{ + ColumnArray::MutablePtr res; + Selector::select(src, length_column, res); + return res; +} +} + +#endif diff --git a/src/Functions/GatherUtils/sliceFromRightDynamicLength.cpp b/src/Functions/GatherUtils/sliceFromRightDynamicLength.cpp new file mode 100644 index 00000000000..1db86b4fda9 --- /dev/null +++ b/src/Functions/GatherUtils/sliceFromRightDynamicLength.cpp @@ -0,0 +1,60 @@ +#ifndef __clang_analyzer__ // It's too hard to analyze. 
+ +#include "GatherUtils.h" +#include "Selectors.h" +#include "Algorithms.h" + +namespace DB::GatherUtils +{ + +namespace +{ + +struct Selector : public ArraySourceSelector +{ + template + static void selectSource(bool is_const, bool is_nullable, Source && source, + const IColumn & length_column, ColumnArray::MutablePtr & result) + { + using SourceType = typename std::decay::type; + using Sink = typename SourceType::SinkType; + + if (is_nullable) + { + using NullableSource = NullableArraySource; + using NullableSink = typename NullableSource::SinkType; + + auto & nullable_source = static_cast(source); + + result = ColumnArray::create(nullable_source.createValuesColumn()); + NullableSink sink(result->getData(), result->getOffsets(), source.getColumnSize()); + + if (is_const) + sliceFromRightDynamicLength(static_cast &>(source), sink, length_column); + else + sliceFromRightDynamicLength(static_cast(source), sink, length_column); + } + else + { + result = ColumnArray::create(source.createValuesColumn()); + Sink sink(result->getData(), result->getOffsets(), source.getColumnSize()); + + if (is_const) + sliceFromRightDynamicLength(static_cast &>(source), sink, length_column); + else + sliceFromRightDynamicLength(source, sink, length_column); + } + } +}; + +} + +ColumnArray::MutablePtr sliceFromRightDynamicLength(IArraySource & src, const IColumn & length_column) +{ + ColumnArray::MutablePtr res; + Selector::select(src, length_column, res); + return res; +} +} + +#endif diff --git a/src/Functions/LeftRight.h b/src/Functions/LeftRight.h new file mode 100644 index 00000000000..054e76b7792 --- /dev/null +++ b/src/Functions/LeftRight.h @@ -0,0 +1,145 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +using namespace GatherUtils; + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +enum class SubstringDirection +{ + Left, + Right +}; + +template +class FunctionLeftRight : public IFunction +{ +public: + static constexpr auto name = direction == SubstringDirection::Left + ? (is_utf8 ? "leftUTF8" : "left") + : (is_utf8 ? 
"rightUTF8" : "right"); + + static FunctionPtr create(ContextPtr) + { + return std::make_shared(); + } + + String getName() const override + { + return name; + } + + bool isVariadic() const override { return false; } + size_t getNumberOfArguments() const override { return 2; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + bool useDefaultImplementationForConstants() const override { return true; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if ((is_utf8 && !isString(arguments[0])) || !isStringOrFixedString(arguments[0])) + throw Exception("Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + if (!isNativeNumber(arguments[1])) + throw Exception("Illegal type " + arguments[1]->getName() + + " of second argument of function " + + getName(), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + return std::make_shared(); + } + + template + ColumnPtr executeForSource(const ColumnPtr & column_length, + const ColumnConst * column_length_const, + Int64 length_value, Source && source, + size_t input_rows_count) const + { + auto col_res = ColumnString::create(); + + if constexpr (direction == SubstringDirection::Left) + { + if (column_length_const) + sliceFromLeftConstantOffsetBounded(source, StringSink(*col_res, input_rows_count), 0, length_value); + else + sliceFromLeftDynamicLength(source, StringSink(*col_res, input_rows_count), *column_length); + } + else + { + if (column_length_const) + sliceFromRightConstantOffsetUnbounded(source, StringSink(*col_res, input_rows_count), length_value); + else + sliceFromRightDynamicLength(source, StringSink(*col_res, input_rows_count), *column_length); + } + + return col_res; + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + ColumnPtr column_string = arguments[0].column; + ColumnPtr column_length = arguments[1].column; + + const ColumnConst * column_length_const = checkAndGetColumn(column_length.get()); + + Int64 length_value = 0; + + if (column_length_const) + length_value = column_length_const->getInt(0); + + if constexpr (is_utf8) + { + if (const ColumnString * col = checkAndGetColumn(column_string.get())) + return executeForSource(column_length, column_length_const, + length_value, UTF8StringSource(*col), input_rows_count); + else if (const ColumnConst * col_const = checkAndGetColumnConst(column_string.get())) + return executeForSource(column_length, column_length_const, + length_value, ConstSource(*col_const), input_rows_count); + else + throw Exception( + "Illegal column " + arguments[0].column->getName() + " of first argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + else + { + if (const ColumnString * col = checkAndGetColumn(column_string.get())) + return executeForSource(column_length, column_length_const, + length_value, StringSource(*col), input_rows_count); + else if (const ColumnFixedString * col_fixed = checkAndGetColumn(column_string.get())) + return executeForSource(column_length, column_length_const, + length_value, FixedStringSource(*col_fixed), input_rows_count); + else if (const ColumnConst * col_const = checkAndGetColumnConst(column_string.get())) + return executeForSource(column_length, column_length_const, + length_value, ConstSource(*col_const), input_rows_count); + else if (const ColumnConst * col_const_fixed = 
checkAndGetColumnConst(column_string.get())) + return executeForSource(column_length, column_length_const, + length_value, ConstSource(*col_const_fixed), input_rows_count); + else + throw Exception( + "Illegal column " + arguments[0].column->getName() + " of first argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + } +}; + +} diff --git a/src/Functions/URL/decodeURLComponent.cpp b/src/Functions/URL/decodeURLComponent.cpp index b6abaab515e..9ed290b1832 100644 --- a/src/Functions/URL/decodeURLComponent.cpp +++ b/src/Functions/URL/decodeURLComponent.cpp @@ -12,7 +12,7 @@ namespace ErrorCodes } /// We assume that size of the dst buf isn't less than src_size. -static size_t decodeURL(const char * src, size_t src_size, char * dst) +static size_t decodeURL(const char * src, size_t src_size, char * dst, bool plus_as_space) { const char * src_prev_pos = src; const char * src_curr_pos = src; @@ -21,12 +21,28 @@ static size_t decodeURL(const char * src, size_t src_size, char * dst) while (true) { - src_curr_pos = find_first_symbols<'%'>(src_curr_pos, src_end); + src_curr_pos = find_first_symbols<'%', '+'>(src_curr_pos, src_end); if (src_curr_pos == src_end) { break; } + else if (*src_curr_pos == '+') + { + if (!plus_as_space) + { + ++src_curr_pos; + continue; + } + size_t bytes_to_copy = src_curr_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + + ++src_curr_pos; + src_prev_pos = src_curr_pos; + *dst_pos = ' '; + ++dst_pos; + } else if (src_end - src_curr_pos < 3) { src_curr_pos = src_end; @@ -67,6 +83,7 @@ static size_t decodeURL(const char * src, size_t src_size, char * dst) /// Percent decode of URL data. +template struct DecodeURLComponentImpl { static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, @@ -83,7 +100,7 @@ struct DecodeURLComponentImpl { const char * src_data = reinterpret_cast(&data[prev_offset]); size_t src_size = offsets[i] - prev_offset; - size_t dst_size = decodeURL(src_data, src_size, reinterpret_cast(res_data.data() + res_offset)); + size_t dst_size = decodeURL(src_data, src_size, reinterpret_cast(res_data.data() + res_offset), plus_as_space); res_offset += dst_size; res_offsets[i] = res_offset; @@ -101,11 +118,14 @@ struct DecodeURLComponentImpl struct NameDecodeURLComponent { static constexpr auto name = "decodeURLComponent"; }; -using FunctionDecodeURLComponent = FunctionStringToString; +struct NameDecodeURLFormComponent { static constexpr auto name = "decodeURLFormComponent"; }; +using FunctionDecodeURLComponent = FunctionStringToString, NameDecodeURLComponent>; +using FunctionDecodeURLFormComponent = FunctionStringToString, NameDecodeURLFormComponent>; void registerFunctionDecodeURLComponent(FunctionFactory & factory) { factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/Functions/array/arrayFirst.cpp b/src/Functions/array/arrayFirst.cpp index dbe545ea387..edbf7ef6269 100644 --- a/src/Functions/array/arrayFirst.cpp +++ b/src/Functions/array/arrayFirst.cpp @@ -11,7 +11,14 @@ namespace ErrorCodes extern const int ILLEGAL_COLUMN; } -struct ArrayFirstImpl +enum class ArrayFirstLastStrategy +{ + First, + Last +}; + +template +struct ArrayFirstLastImpl { static bool needBoolean() { return false; } static bool needExpression() { return true; } @@ -40,15 +47,23 @@ struct ArrayFirstImpl auto out = data.cloneEmpty(); out->reserve(data.size()); - size_t pos{}; - for (auto offset : offsets) + size_t offsets_size = 
offsets.size(); + for (size_t offset_index = 0; offset_index < offsets_size; ++offset_index) { - if (offset - pos > 0) - out->insert(data[pos]); - else - out->insertDefault(); + size_t start_offset = offsets[offset_index - 1]; + size_t end_offset = offsets[offset_index]; - pos = offset; + if (end_offset > start_offset) + { + if constexpr (strategy == ArrayFirstLastStrategy::First) + out->insert(data[start_offset]); + else + out->insert(data[end_offset - 1]); + } + else + { + out->insertDefault(); + } } return out; @@ -67,18 +82,36 @@ struct ArrayFirstImpl auto out = data.cloneEmpty(); out->reserve(data.size()); - size_t pos{}; - for (auto offset : offsets) + size_t offsets_size = offsets.size(); + for (size_t offset_index = 0; offset_index < offsets_size; ++offset_index) { - auto exists = false; - for (; pos < offset; ++pos) + size_t start_offset = offsets[offset_index - 1]; + size_t end_offset = offsets[offset_index]; + + bool exists = false; + + if constexpr (strategy == ArrayFirstLastStrategy::First) { - if (filter[pos]) + for (; start_offset != end_offset; ++start_offset) { - out->insert(data[pos]); - exists = true; - pos = offset; - break; + if (filter[start_offset]) + { + out->insert(data[start_offset]); + exists = true; + break; + } + } + } + else + { + for (; end_offset != start_offset; --end_offset) + { + if (filter[end_offset - 1]) + { + out->insert(data[end_offset - 1]); + exists = true; + break; + } } } @@ -91,11 +124,17 @@ struct ArrayFirstImpl }; struct NameArrayFirst { static constexpr auto name = "arrayFirst"; }; +using ArrayFirstImpl = ArrayFirstLastImpl; using FunctionArrayFirst = FunctionArrayMapped; +struct NameArrayLast { static constexpr auto name = "arrayLast"; }; +using ArrayLastImpl = ArrayFirstLastImpl; +using FunctionArrayLast = FunctionArrayMapped; + void registerFunctionArrayFirst(FunctionFactory & factory) { factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/Functions/array/arrayFirstIndex.cpp b/src/Functions/array/arrayFirstIndex.cpp deleted file mode 100644 index d229687774e..00000000000 --- a/src/Functions/array/arrayFirstIndex.cpp +++ /dev/null @@ -1,90 +0,0 @@ -#include -#include -#include "FunctionArrayMapped.h" -#include - - -namespace DB -{ -namespace ErrorCodes -{ - extern const int ILLEGAL_COLUMN; -} - -struct ArrayFirstIndexImpl -{ - static bool needBoolean() { return false; } - static bool needExpression() { return true; } - static bool needOneArray() { return false; } - - static DataTypePtr getReturnType(const DataTypePtr & /*expression_return*/, const DataTypePtr & /*array_element*/) - { - return std::make_shared(); - } - - static ColumnPtr execute(const ColumnArray & array, ColumnPtr mapped) - { - const auto * column_filter = typeid_cast(&*mapped); - - if (!column_filter) - { - const auto * column_filter_const = checkAndGetColumnConst(&*mapped); - - if (!column_filter_const) - throw Exception("Unexpected type of filter column", ErrorCodes::ILLEGAL_COLUMN); - - if (column_filter_const->getValue()) - { - const auto & offsets = array.getOffsets(); - auto out_column = ColumnUInt32::create(offsets.size()); - auto & out_index = out_column->getData(); - - size_t pos{}; - for (size_t i = 0; i < offsets.size(); ++i) - { - out_index[i] = offsets[i] - pos > 0; - pos = offsets[i]; - } - - return out_column; - } - else - return DataTypeUInt32().createColumnConst(array.size(), 0u); - } - - const auto & filter = column_filter->getData(); - const auto & offsets = array.getOffsets(); - auto out_column = 
ColumnUInt32::create(offsets.size()); - auto & out_index = out_column->getData(); - - size_t pos{}; - for (size_t i = 0; i < offsets.size(); ++i) - { - UInt32 index{}; - for (size_t idx{1}; pos < offsets[i]; ++pos, ++idx) - { - if (filter[pos]) - { - index = idx; - pos = offsets[i]; - break; - } - } - - out_index[i] = index; - } - - return out_column; - } -}; - -struct NameArrayFirstIndex { static constexpr auto name = "arrayFirstIndex"; }; -using FunctionArrayFirstIndex = FunctionArrayMapped; - -void registerFunctionArrayFirstIndex(FunctionFactory & factory) -{ - factory.registerFunction(); -} - -} - diff --git a/src/Functions/array/arrayFirstLastIndex.cpp b/src/Functions/array/arrayFirstLastIndex.cpp new file mode 100644 index 00000000000..467678f3faa --- /dev/null +++ b/src/Functions/array/arrayFirstLastIndex.cpp @@ -0,0 +1,134 @@ +#include +#include +#include "FunctionArrayMapped.h" +#include + + +namespace DB +{ +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; +} + +enum class ArrayFirstLastIndexStrategy +{ + First, + Last +}; + +template +struct ArrayFirstLastIndexImpl +{ + static bool needBoolean() { return false; } + static bool needExpression() { return true; } + static bool needOneArray() { return false; } + + static DataTypePtr getReturnType(const DataTypePtr & /*expression_return*/, const DataTypePtr & /*array_element*/) + { + return std::make_shared(); + } + + static ColumnPtr execute(const ColumnArray & array, ColumnPtr mapped) + { + const auto * column_filter = typeid_cast(&*mapped); + + if (!column_filter) + { + const auto * column_filter_const = checkAndGetColumnConst(&*mapped); + + if (!column_filter_const) + throw Exception("Unexpected type of filter column", ErrorCodes::ILLEGAL_COLUMN); + + if (column_filter_const->getValue()) + { + const auto & offsets = array.getOffsets(); + auto out_column = ColumnUInt32::create(offsets.size()); + auto & out_index = out_column->getData(); + + size_t offsets_size = offsets.size(); + for (size_t offset_index = 0; offset_index < offsets_size; ++offset_index) + { + size_t start_offset = offsets[offset_index - 1]; + size_t end_offset = offsets[offset_index]; + + if (end_offset > start_offset) + { + if constexpr (strategy == ArrayFirstLastIndexStrategy::First) + out_index[offset_index] = 1; + else + out_index[offset_index] = end_offset - start_offset; + } + else + { + out_index[offset_index] = 0; + } + } + + return out_column; + } + else + { + return DataTypeUInt32().createColumnConst(array.size(), 0u); + } + } + + const auto & filter = column_filter->getData(); + const auto & offsets = array.getOffsets(); + + size_t offsets_size = offsets.size(); + auto out_column = ColumnUInt32::create(offsets_size); + auto & out_index = out_column->getData(); + + for (size_t offset_index = 0; offset_index < offsets_size; ++offset_index) + { + size_t start_offset = offsets[offset_index - 1]; + size_t end_offset = offsets[offset_index]; + size_t result_index = 0; + + if constexpr (strategy == ArrayFirstLastIndexStrategy::First) + { + for (size_t index = 1; start_offset != end_offset; ++start_offset, ++index) + { + if (filter[start_offset]) + { + result_index = index; + break; + } + } + } + else + { + for (size_t index = end_offset - start_offset; end_offset != start_offset; --end_offset, --index) + { + if (filter[end_offset - 1]) + { + result_index = index; + break; + } + } + } + + out_index[offset_index] = result_index; + } + + return out_column; + } +}; + +struct NameArrayFirstIndex { static constexpr auto name = "arrayFirstIndex"; }; +using 
ArrayFirstIndexImpl = ArrayFirstLastIndexImpl; +using FunctionArrayFirstIndex = FunctionArrayMapped; + +struct NameArrayLastIndex { static constexpr auto name = "arrayLastIndex"; }; +using ArrayLastIndexImpl = ArrayFirstLastIndexImpl; +using FunctionArrayLastIndex = FunctionArrayMapped; + +void registerFunctionArrayFirstIndex(FunctionFactory & factory) +{ + factory.registerFunction(); + factory.registerFunction(); +} + +} + diff --git a/src/Functions/left.cpp b/src/Functions/left.cpp new file mode 100644 index 00000000000..aa7a2cdd5a8 --- /dev/null +++ b/src/Functions/left.cpp @@ -0,0 +1,13 @@ +#include +#include + +namespace DB +{ + +void registerFunctionLeft(FunctionFactory & factory) +{ + factory.registerFunction>(FunctionFactory::CaseInsensitive); + factory.registerFunction>(FunctionFactory::CaseSensitive); +} + +} diff --git a/src/Functions/monthName.cpp b/src/Functions/monthName.cpp new file mode 100644 index 00000000000..c397fdffaa5 --- /dev/null +++ b/src/Functions/monthName.cpp @@ -0,0 +1,80 @@ +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +class FunctionMonthName : public IFunction +{ +public: + static constexpr auto name = "monthName"; + + static constexpr auto month_str = "month"; + + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + + explicit FunctionMonthName(ContextPtr context_) + : function_resolver(FunctionFactory::instance().get("dateName", std::move(context_))) + {} + + String getName() const override { return name; } + + bool useDefaultImplementationForConstants() const override { return true; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + + size_t getNumberOfArguments() const override { return 1; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + if (arguments.size() != 1) + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be 1", + getName(), + toString(arguments.size())); + + WhichDataType argument_type(arguments[0].type); + if (!argument_type.isDate() && !argument_type.isDateTime() && !argument_type.isDateTime64()) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of argument of function {}, should be Date, DateTime or DateTime64", + getName()); + + return std::make_shared(); + } + + ColumnPtr executeImpl( + const ColumnsWithTypeAndName & arguments, + const DataTypePtr & result_type, + size_t input_rows_count) const override + { + auto month_column = DataTypeString().createColumnConst(arguments[0].column->size(), month_str); + ColumnsWithTypeAndName temporary_columns + { + ColumnWithTypeAndName(month_column, std::make_shared(), ""), + arguments[0] + }; + + auto date_name_func = function_resolver->build(temporary_columns); + return date_name_func->execute(temporary_columns, result_type, input_rows_count); + } + +private: + FunctionOverloadResolverPtr function_resolver; +}; + +void registerFunctionMonthName(FunctionFactory & factory) +{ + factory.registerFunction(FunctionFactory::CaseInsensitive); +} + +} diff --git a/src/Functions/registerFunctionsDateTime.cpp b/src/Functions/registerFunctionsDateTime.cpp index abbc52c8360..5211a62ff1e 100644 --- a/src/Functions/registerFunctionsDateTime.cpp +++ b/src/Functions/registerFunctionsDateTime.cpp @@ 
-65,6 +65,7 @@ void registerFunctionSubtractQuarters(FunctionFactory &); void registerFunctionSubtractYears(FunctionFactory &); void registerFunctionDateDiff(FunctionFactory &); void registerFunctionDateName(FunctionFactory &); +void registerFunctionMonthName(FunctionFactory &); void registerFunctionToTimeZone(FunctionFactory &); void registerFunctionFormatDateTime(FunctionFactory &); void registerFunctionFromModifiedJulianDay(FunctionFactory &); @@ -136,6 +137,7 @@ void registerFunctionsDateTime(FunctionFactory & factory) registerFunctionSubtractYears(factory); registerFunctionDateDiff(factory); registerFunctionDateName(factory); + registerFunctionMonthName(factory); registerFunctionToTimeZone(factory); registerFunctionFormatDateTime(factory); registerFunctionFromModifiedJulianDay(factory); diff --git a/src/Functions/registerFunctionsString.cpp b/src/Functions/registerFunctionsString.cpp index 79002f0a97d..7d1673aff7c 100644 --- a/src/Functions/registerFunctionsString.cpp +++ b/src/Functions/registerFunctionsString.cpp @@ -23,6 +23,8 @@ void registerFunctionsConcat(FunctionFactory &); void registerFunctionFormat(FunctionFactory &); void registerFunctionFormatRow(FunctionFactory &); void registerFunctionSubstring(FunctionFactory &); +void registerFunctionLeft(FunctionFactory &); +void registerFunctionRight(FunctionFactory &); void registerFunctionCRC(FunctionFactory &); void registerFunctionAppendTrailingCharIfAbsent(FunctionFactory &); void registerFunctionStartsWith(FunctionFactory &); @@ -74,6 +76,8 @@ void registerFunctionsString(FunctionFactory & factory) registerFunctionFormat(factory); registerFunctionFormatRow(factory); registerFunctionSubstring(factory); + registerFunctionLeft(factory); + registerFunctionRight(factory); registerFunctionAppendTrailingCharIfAbsent(factory); registerFunctionStartsWith(factory); registerFunctionEndsWith(factory); diff --git a/src/Functions/right.cpp b/src/Functions/right.cpp new file mode 100644 index 00000000000..ca0df8b2d98 --- /dev/null +++ b/src/Functions/right.cpp @@ -0,0 +1,13 @@ +#include +#include + +namespace DB +{ + +void registerFunctionRight(FunctionFactory & factory) +{ + factory.registerFunction>(FunctionFactory::CaseInsensitive); + factory.registerFunction>(FunctionFactory::CaseSensitive); +} + +} diff --git a/src/Functions/toStartOfInterval.cpp b/src/Functions/toStartOfInterval.cpp index f8ea44851b6..09b7931de8d 100644 --- a/src/Functions/toStartOfInterval.cpp +++ b/src/Functions/toStartOfInterval.cpp @@ -37,7 +37,7 @@ namespace static UInt16 execute(UInt16 d, UInt64 years, const DateLUTImpl & time_zone) { - return time_zone.toStartOfYearInterval(ExtendedDayNum(d), years); + return time_zone.toStartOfYearInterval(DayNum(d), years); } static UInt16 execute(Int32 d, UInt64 years, const DateLUTImpl & time_zone) @@ -63,7 +63,7 @@ namespace static UInt16 execute(UInt16 d, UInt64 quarters, const DateLUTImpl & time_zone) { - return time_zone.toStartOfQuarterInterval(ExtendedDayNum(d), quarters); + return time_zone.toStartOfQuarterInterval(DayNum(d), quarters); } static UInt16 execute(Int32 d, UInt64 quarters, const DateLUTImpl & time_zone) @@ -89,7 +89,7 @@ namespace static UInt16 execute(UInt16 d, UInt64 months, const DateLUTImpl & time_zone) { - return time_zone.toStartOfMonthInterval(ExtendedDayNum(d), months); + return time_zone.toStartOfMonthInterval(DayNum(d), months); } static UInt16 execute(Int32 d, UInt64 months, const DateLUTImpl & time_zone) @@ -115,7 +115,7 @@ namespace static UInt16 execute(UInt16 d, UInt64 weeks, const 
DateLUTImpl & time_zone) { - return time_zone.toStartOfWeekInterval(ExtendedDayNum(d), weeks); + return time_zone.toStartOfWeekInterval(DayNum(d), weeks); } static UInt16 execute(Int32 d, UInt64 weeks, const DateLUTImpl & time_zone) diff --git a/src/IO/AsynchronousReadBufferFromFile.cpp b/src/IO/AsynchronousReadBufferFromFile.cpp index 9327b80738d..969384cd91c 100644 --- a/src/IO/AsynchronousReadBufferFromFile.cpp +++ b/src/IO/AsynchronousReadBufferFromFile.cpp @@ -30,8 +30,10 @@ AsynchronousReadBufferFromFile::AsynchronousReadBufferFromFile( size_t buf_size, int flags, char * existing_memory, - size_t alignment) - : AsynchronousReadBufferFromFileDescriptor(std::move(reader_), priority_, -1, buf_size, existing_memory, alignment), file_name(file_name_) + size_t alignment, + std::optional file_size_) + : AsynchronousReadBufferFromFileDescriptor(std::move(reader_), priority_, -1, buf_size, existing_memory, alignment, file_size_) + , file_name(file_name_) { ProfileEvents::increment(ProfileEvents::FileOpen); @@ -62,10 +64,10 @@ AsynchronousReadBufferFromFile::AsynchronousReadBufferFromFile( const std::string & original_file_name, size_t buf_size, char * existing_memory, - size_t alignment) - : - AsynchronousReadBufferFromFileDescriptor(std::move(reader_), priority_, fd_, buf_size, existing_memory, alignment), - file_name(original_file_name.empty() ? "(fd = " + toString(fd_) + ")" : original_file_name) + size_t alignment, + std::optional file_size_) + : AsynchronousReadBufferFromFileDescriptor(std::move(reader_), priority_, fd_, buf_size, existing_memory, alignment, file_size_) + , file_name(original_file_name.empty() ? "(fd = " + toString(fd_) + ")" : original_file_name) { fd_ = -1; } diff --git a/src/IO/AsynchronousReadBufferFromFile.h b/src/IO/AsynchronousReadBufferFromFile.h index d9d5e43e0d4..96834350bab 100644 --- a/src/IO/AsynchronousReadBufferFromFile.h +++ b/src/IO/AsynchronousReadBufferFromFile.h @@ -14,17 +14,25 @@ protected: public: explicit AsynchronousReadBufferFromFile( - AsynchronousReaderPtr reader_, Int32 priority_, - const std::string & file_name_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, int flags = -1, - char * existing_memory = nullptr, size_t alignment = 0); + AsynchronousReaderPtr reader_, + Int32 priority_, + const std::string & file_name_, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + int flags = -1, + char * existing_memory = nullptr, + size_t alignment = 0, + std::optional file_size_ = std::nullopt); /// Use pre-opened file descriptor. explicit AsynchronousReadBufferFromFile( - AsynchronousReaderPtr reader_, Int32 priority_, + AsynchronousReaderPtr reader_, + Int32 priority_, int & fd, /// Will be set to -1 if constructor didn't throw and ownership of file descriptor is passed to the object. 
const std::string & original_file_name = {}, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, - char * existing_memory = nullptr, size_t alignment = 0); + char * existing_memory = nullptr, + size_t alignment = 0, + std::optional file_size_ = std::nullopt); ~AsynchronousReadBufferFromFile() override; @@ -48,11 +56,16 @@ private: public: AsynchronousReadBufferFromFileWithDescriptorsCache( - AsynchronousReaderPtr reader_, Int32 priority_, - const std::string & file_name_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, int flags = -1, - char * existing_memory = nullptr, size_t alignment = 0) - : AsynchronousReadBufferFromFileDescriptor(std::move(reader_), priority_, -1, buf_size, existing_memory, alignment), - file_name(file_name_) + AsynchronousReaderPtr reader_, + Int32 priority_, + const std::string & file_name_, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + int flags = -1, + char * existing_memory = nullptr, + size_t alignment = 0, + std::optional file_size_ = std::nullopt) + : AsynchronousReadBufferFromFileDescriptor(std::move(reader_), priority_, -1, buf_size, existing_memory, alignment, file_size_) + , file_name(file_name_) { file = OpenedFileCache::instance().get(file_name, flags); fd = file->getFD(); diff --git a/src/IO/AsynchronousReadBufferFromFileDescriptor.cpp b/src/IO/AsynchronousReadBufferFromFileDescriptor.cpp index a27c9035c61..9c92201b3a1 100644 --- a/src/IO/AsynchronousReadBufferFromFileDescriptor.cpp +++ b/src/IO/AsynchronousReadBufferFromFileDescriptor.cpp @@ -44,6 +44,15 @@ std::future AsynchronousReadBufferFromFileDescripto request.offset = file_offset_of_buffer_end; request.priority = priority; + /// This is a workaround of a read pass EOF bug in linux kernel with pread() + if (file_size.has_value() && file_offset_of_buffer_end >= *file_size) + { + return std::async(std::launch::deferred, [] + { + return IAsynchronousReader::Result{ .size = 0, .offset = 0 }; + }); + } + return reader->submit(request); } diff --git a/src/IO/AsynchronousReadBufferFromFileDescriptor.h b/src/IO/AsynchronousReadBufferFromFileDescriptor.h index 50d8f5819fe..2a16148812e 100644 --- a/src/IO/AsynchronousReadBufferFromFileDescriptor.h +++ b/src/IO/AsynchronousReadBufferFromFileDescriptor.h @@ -35,10 +35,18 @@ protected: public: AsynchronousReadBufferFromFileDescriptor( - AsynchronousReaderPtr reader_, Int32 priority_, - int fd_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0) - : ReadBufferFromFileBase(buf_size, existing_memory, alignment), - reader(std::move(reader_)), priority(priority_), required_alignment(alignment), fd(fd_) + AsynchronousReaderPtr reader_, + Int32 priority_, + int fd_, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + char * existing_memory = nullptr, + size_t alignment = 0, + std::optional file_size_ = std::nullopt) + : ReadBufferFromFileBase(buf_size, existing_memory, alignment, file_size_) + , reader(std::move(reader_)) + , priority(priority_) + , required_alignment(alignment) + , fd(fd_) { prefetch_buffer.alignment = alignment; } diff --git a/src/IO/BrotliReadBuffer.cpp b/src/IO/BrotliReadBuffer.cpp index b66bbf45054..77069746153 100644 --- a/src/IO/BrotliReadBuffer.cpp +++ b/src/IO/BrotliReadBuffer.cpp @@ -39,7 +39,7 @@ BrotliReadBuffer::BrotliReadBuffer(std::unique_ptr in_, size_t buf_s , in_data(nullptr) , out_capacity(0) , out_data(nullptr) - , eof(false) + , eof_flag(false) { } @@ -47,7 +47,7 @@ BrotliReadBuffer::~BrotliReadBuffer() = default; bool BrotliReadBuffer::nextImpl() { - if (eof) + if (eof_flag) return false; if 
(!in_available) @@ -74,7 +74,7 @@ bool BrotliReadBuffer::nextImpl() { if (in->eof()) { - eof = true; + eof_flag = true; return !working_buffer.empty(); } else diff --git a/src/IO/BrotliReadBuffer.h b/src/IO/BrotliReadBuffer.h index 0fa999d1de5..44a7dc7ddbd 100644 --- a/src/IO/BrotliReadBuffer.h +++ b/src/IO/BrotliReadBuffer.h @@ -32,7 +32,7 @@ private: size_t out_capacity; uint8_t * out_data; - bool eof; + bool eof_flag; }; } diff --git a/src/IO/Bzip2ReadBuffer.cpp b/src/IO/Bzip2ReadBuffer.cpp index df9a8d5b369..c2060612757 100644 --- a/src/IO/Bzip2ReadBuffer.cpp +++ b/src/IO/Bzip2ReadBuffer.cpp @@ -42,7 +42,7 @@ Bzip2ReadBuffer::Bzip2ReadBuffer(std::unique_ptr in_, size_t buf_siz : BufferWithOwnMemory(buf_size, existing_memory, alignment) , in(std::move(in_)) , bz(std::make_unique()) - , eof(false) + , eof_flag(false) { } @@ -50,7 +50,7 @@ Bzip2ReadBuffer::~Bzip2ReadBuffer() = default; bool Bzip2ReadBuffer::nextImpl() { - if (eof) + if (eof_flag) return false; if (!bz->stream.avail_in) @@ -72,7 +72,7 @@ bool Bzip2ReadBuffer::nextImpl() { if (in->eof()) { - eof = true; + eof_flag = true; return !working_buffer.empty(); } else @@ -91,7 +91,7 @@ bool Bzip2ReadBuffer::nextImpl() if (in->eof()) { - eof = true; + eof_flag = true; throw Exception(ErrorCodes::UNEXPECTED_END_OF_FILE, "Unexpected end of bzip2 archive"); } diff --git a/src/IO/Bzip2ReadBuffer.h b/src/IO/Bzip2ReadBuffer.h index dc113800683..de1e61ee388 100644 --- a/src/IO/Bzip2ReadBuffer.h +++ b/src/IO/Bzip2ReadBuffer.h @@ -26,7 +26,7 @@ private: class Bzip2StateWrapper; std::unique_ptr bz; - bool eof; + bool eof_flag; }; } diff --git a/src/IO/LZMAInflatingReadBuffer.cpp b/src/IO/LZMAInflatingReadBuffer.cpp index f2df6bdca6a..80da7421fc3 100644 --- a/src/IO/LZMAInflatingReadBuffer.cpp +++ b/src/IO/LZMAInflatingReadBuffer.cpp @@ -7,7 +7,7 @@ namespace ErrorCodes extern const int LZMA_STREAM_DECODER_FAILED; } LZMAInflatingReadBuffer::LZMAInflatingReadBuffer(std::unique_ptr in_, size_t buf_size, char * existing_memory, size_t alignment) - : BufferWithOwnMemory(buf_size, existing_memory, alignment), in(std::move(in_)), eof(false) + : BufferWithOwnMemory(buf_size, existing_memory, alignment), in(std::move(in_)), eof_flag(false) { lstr = LZMA_STREAM_INIT; lstr.allocator = nullptr; @@ -36,7 +36,7 @@ LZMAInflatingReadBuffer::~LZMAInflatingReadBuffer() bool LZMAInflatingReadBuffer::nextImpl() { - if (eof) + if (eof_flag) return false; lzma_action action = LZMA_RUN; @@ -64,7 +64,7 @@ bool LZMAInflatingReadBuffer::nextImpl() { if (in->eof()) { - eof = true; + eof_flag = true; return !working_buffer.empty(); } else diff --git a/src/IO/LZMAInflatingReadBuffer.h b/src/IO/LZMAInflatingReadBuffer.h index 18922f64516..2d676eeeeb3 100644 --- a/src/IO/LZMAInflatingReadBuffer.h +++ b/src/IO/LZMAInflatingReadBuffer.h @@ -25,7 +25,7 @@ private: std::unique_ptr in; lzma_stream lstr; - bool eof; + bool eof_flag; }; } diff --git a/src/IO/Lz4InflatingReadBuffer.cpp b/src/IO/Lz4InflatingReadBuffer.cpp index 22bce94cad2..61e912d440c 100644 --- a/src/IO/Lz4InflatingReadBuffer.cpp +++ b/src/IO/Lz4InflatingReadBuffer.cpp @@ -32,7 +32,7 @@ Lz4InflatingReadBuffer::~Lz4InflatingReadBuffer() bool Lz4InflatingReadBuffer::nextImpl() { - if (eof) + if (eof_flag) return false; if (!in_available) @@ -66,7 +66,7 @@ bool Lz4InflatingReadBuffer::nextImpl() if (in->eof()) { - eof = true; + eof_flag = true; return !working_buffer.empty(); } diff --git a/src/IO/Lz4InflatingReadBuffer.h b/src/IO/Lz4InflatingReadBuffer.h index 0462d85adf7..d4d81f8765c 100644 --- 
a/src/IO/Lz4InflatingReadBuffer.h +++ b/src/IO/Lz4InflatingReadBuffer.h @@ -35,7 +35,7 @@ private: size_t in_available; size_t out_available; - bool eof = false; + bool eof_flag = false; }; } diff --git a/src/IO/ReadBufferFromFile.cpp b/src/IO/ReadBufferFromFile.cpp index d0f94441622..4f601301686 100644 --- a/src/IO/ReadBufferFromFile.cpp +++ b/src/IO/ReadBufferFromFile.cpp @@ -28,8 +28,9 @@ ReadBufferFromFile::ReadBufferFromFile( size_t buf_size, int flags, char * existing_memory, - size_t alignment) - : ReadBufferFromFileDescriptor(-1, buf_size, existing_memory, alignment), file_name(file_name_) + size_t alignment, + std::optional file_size_) + : ReadBufferFromFileDescriptor(-1, buf_size, existing_memory, alignment, file_size_), file_name(file_name_) { ProfileEvents::increment(ProfileEvents::FileOpen); @@ -58,10 +59,10 @@ ReadBufferFromFile::ReadBufferFromFile( const std::string & original_file_name, size_t buf_size, char * existing_memory, - size_t alignment) - : - ReadBufferFromFileDescriptor(fd_, buf_size, existing_memory, alignment), - file_name(original_file_name.empty() ? "(fd = " + toString(fd_) + ")" : original_file_name) + size_t alignment, + std::optional file_size_) + : ReadBufferFromFileDescriptor(fd_, buf_size, existing_memory, alignment, file_size_) + , file_name(original_file_name.empty() ? "(fd = " + toString(fd_) + ")" : original_file_name) { fd_ = -1; } diff --git a/src/IO/ReadBufferFromFile.h b/src/IO/ReadBufferFromFile.h index 1a45e4c1829..ff19fa40fdf 100644 --- a/src/IO/ReadBufferFromFile.h +++ b/src/IO/ReadBufferFromFile.h @@ -23,15 +23,22 @@ protected: CurrentMetrics::Increment metric_increment{CurrentMetrics::OpenFileForRead}; public: - explicit ReadBufferFromFile(const std::string & file_name_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, int flags = -1, - char * existing_memory = nullptr, size_t alignment = 0); + explicit ReadBufferFromFile( + const std::string & file_name_, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + int flags = -1, + char * existing_memory = nullptr, + size_t alignment = 0, + std::optional file_size_ = std::nullopt); /// Use pre-opened file descriptor. explicit ReadBufferFromFile( int & fd, /// Will be set to -1 if constructor didn't throw and ownership of file descriptor is passed to the object. 
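Editor's note: the constructors above and below thread an optional file_size_ down to ReadBufferFromFileBase so that nextImpl() can refuse to issue a pread() at or past the known end of file (the kernel read-past-EOF workaround added in this patch). Below is a minimal standalone sketch of that guard; it is not the actual ClickHouse classes, and all names in it are illustrative only.

#include <fcntl.h>
#include <unistd.h>
#include <cstddef>
#include <cstdio>
#include <optional>
#include <vector>

/// Illustrative only: read the next chunk with pread(), but never issue a read
/// at or past the known file size, mirroring the guard added in nextImpl().
bool readNextChunk(int fd, size_t & offset, std::optional<size_t> file_size, std::vector<char> & buf)
{
    if (file_size.has_value() && offset >= *file_size)
        return false; /// treat as EOF without touching the kernel

    ssize_t bytes = ::pread(fd, buf.data(), buf.size(), static_cast<off_t>(offset));
    if (bytes <= 0)
        return false;

    offset += static_cast<size_t>(bytes);
    return true;
}

int main()
{
    int fd = ::open("/etc/hostname", O_RDONLY);
    if (fd < 0)
        return 1;

    size_t offset = 0;
    std::vector<char> buf(4096);
    while (readNextChunk(fd, offset, /* file_size = */ std::nullopt, buf))
        ;
    std::printf("read %zu bytes\n", offset);
    ::close(fd);
}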
const std::string & original_file_name = {}, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, - char * existing_memory = nullptr, size_t alignment = 0); + char * existing_memory = nullptr, + size_t alignment = 0, + std::optional file_size_ = std::nullopt); ~ReadBufferFromFile() override; @@ -50,9 +57,14 @@ public: class ReadBufferFromFilePRead : public ReadBufferFromFile { public: - ReadBufferFromFilePRead(const std::string & file_name_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, int flags = -1, - char * existing_memory = nullptr, size_t alignment = 0) - : ReadBufferFromFile(file_name_, buf_size, flags, existing_memory, alignment) + ReadBufferFromFilePRead( + const std::string & file_name_, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + int flags = -1, + char * existing_memory = nullptr, + size_t alignment = 0, + std::optional file_size_ = std::nullopt) + : ReadBufferFromFile(file_name_, buf_size, flags, existing_memory, alignment, file_size_) { use_pread = true; } @@ -68,10 +80,15 @@ private: OpenedFileCache::OpenedFilePtr file; public: - ReadBufferFromFilePReadWithDescriptorsCache(const std::string & file_name_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, int flags = -1, - char * existing_memory = nullptr, size_t alignment = 0) - : ReadBufferFromFileDescriptorPRead(-1, buf_size, existing_memory, alignment), - file_name(file_name_) + ReadBufferFromFilePReadWithDescriptorsCache( + const std::string & file_name_, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + int flags = -1, + char * existing_memory = nullptr, + size_t alignment = 0, + std::optional file_size_ = std::nullopt) + : ReadBufferFromFileDescriptorPRead(-1, buf_size, existing_memory, alignment, file_size_) + , file_name(file_name_) { file = OpenedFileCache::instance().get(file_name, flags); fd = file->getFD(); diff --git a/src/IO/ReadBufferFromFileBase.cpp b/src/IO/ReadBufferFromFileBase.cpp index b598501a608..4db64755abf 100644 --- a/src/IO/ReadBufferFromFileBase.cpp +++ b/src/IO/ReadBufferFromFileBase.cpp @@ -7,8 +7,13 @@ ReadBufferFromFileBase::ReadBufferFromFileBase() : BufferWithOwnMemory file_size_) : BufferWithOwnMemory(buf_size, existing_memory, alignment) + , file_size(file_size_) { } diff --git a/src/IO/ReadBufferFromFileBase.h b/src/IO/ReadBufferFromFileBase.h index 731fd373e24..a051283b2bb 100644 --- a/src/IO/ReadBufferFromFileBase.h +++ b/src/IO/ReadBufferFromFileBase.h @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -22,7 +23,11 @@ class ReadBufferFromFileBase : public BufferWithOwnMemory { public: ReadBufferFromFileBase(); - ReadBufferFromFileBase(size_t buf_size, char * existing_memory, size_t alignment); + ReadBufferFromFileBase( + size_t buf_size, + char * existing_memory, + size_t alignment, + std::optional file_size_ = std::nullopt); ~ReadBufferFromFileBase() override; virtual std::string getFileName() const = 0; @@ -44,6 +49,7 @@ public: } protected: + std::optional file_size; ProfileCallback profile_callback; clockid_t clock_type{}; }; diff --git a/src/IO/ReadBufferFromFileDescriptor.cpp b/src/IO/ReadBufferFromFileDescriptor.cpp index ed8eba62f04..ed6b1a60181 100644 --- a/src/IO/ReadBufferFromFileDescriptor.cpp +++ b/src/IO/ReadBufferFromFileDescriptor.cpp @@ -54,6 +54,10 @@ bool ReadBufferFromFileDescriptor::nextImpl() /// If internal_buffer size is empty, then read() cannot be distinguished from EOF assert(!internal_buffer.empty()); + /// This is a workaround of a read pass EOF bug in linux kernel with pread() + if (file_size.has_value() && file_offset_of_buffer_end >= *file_size) + return 
false; + size_t bytes_read = 0; while (!bytes_read) { diff --git a/src/IO/ReadBufferFromFileDescriptor.h b/src/IO/ReadBufferFromFileDescriptor.h index 8dbe8707bdb..188cdd709b5 100644 --- a/src/IO/ReadBufferFromFileDescriptor.h +++ b/src/IO/ReadBufferFromFileDescriptor.h @@ -27,8 +27,15 @@ protected: std::string getFileName() const override; public: - ReadBufferFromFileDescriptor(int fd_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0) - : ReadBufferFromFileBase(buf_size, existing_memory, alignment), required_alignment(alignment), fd(fd_) + ReadBufferFromFileDescriptor( + int fd_, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + char * existing_memory = nullptr, + size_t alignment = 0, + std::optional file_size_ = std::nullopt) + : ReadBufferFromFileBase(buf_size, existing_memory, alignment, file_size_) + , required_alignment(alignment) + , fd(fd_) { } @@ -63,8 +70,13 @@ private: class ReadBufferFromFileDescriptorPRead : public ReadBufferFromFileDescriptor { public: - ReadBufferFromFileDescriptorPRead(int fd_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0) - : ReadBufferFromFileDescriptor(fd_, buf_size, existing_memory, alignment) + ReadBufferFromFileDescriptorPRead( + int fd_, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + char * existing_memory = nullptr, + size_t alignment = 0, + std::optional file_size_ = std::nullopt) + : ReadBufferFromFileDescriptor(fd_, buf_size, existing_memory, alignment, file_size_) { use_pread = true; } diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index b0a6838b81e..48811a41edd 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -702,6 +702,25 @@ void readCSVString(String & s, ReadBuffer & buf, const FormatSettings::CSV & set readCSVStringInto(s, buf, settings); } +void readCSVField(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings) +{ + s.clear(); + bool add_quote = false; + char quote = '\''; + + if (!buf.eof() && (*buf.position() == '\'' || *buf.position() == '"')) + { + quote = *buf.position(); + s.push_back(quote); + add_quote = true; + } + + readCSVStringInto(s, buf, settings); + + if (add_quote) + s.push_back(quote); +} + template void readCSVStringInto>(PaddedPODArray & s, ReadBuffer & buf, const FormatSettings::CSV & settings); @@ -1212,6 +1231,19 @@ void skipToNextRowOrEof(PeekableReadBuffer & buf, const String & row_after_delim } } +// Use PeekableReadBuffer to copy field to string after parsing. +template +static void readParsedValueIntoString(String & s, ReadBuffer & buf, ParseFunc parse_func) +{ + PeekableReadBuffer peekable_buf(buf); + peekable_buf.setCheckpoint(); + parse_func(peekable_buf); + peekable_buf.makeContinuousMemoryFromCheckpointToPos(); + auto * end = peekable_buf.position(); + peekable_buf.rollbackToCheckpoint(); + s.append(peekable_buf.position(), end); + peekable_buf.position() = end; +} template static void readQuotedFieldInBrackets(String & s, ReadBuffer & buf) @@ -1266,7 +1298,11 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf) /// - Number: integer, float, decimal. if (*buf.position() == '\'') - readQuotedString(s, buf); + { + s.push_back('\''); + readQuotedStringInto(s, buf); + s.push_back('\''); + } else if (*buf.position() == '[') readQuotedFieldInBrackets<'[', ']'>(s, buf); else if (*buf.position() == '(') @@ -1290,18 +1326,19 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf) else { /// It's an integer, float or decimal. 
They all can be parsed as float. - /// Use PeekableReadBuffer to copy field to string after parsing. - PeekableReadBuffer peekable_buf(buf); - peekable_buf.setCheckpoint(); - Float64 tmp; - readFloatText(tmp, peekable_buf); - peekable_buf.makeContinuousMemoryFromCheckpointToPos(); - auto * end = peekable_buf.position(); - peekable_buf.rollbackToCheckpoint(); - s.append(peekable_buf.position(), end); - peekable_buf.position() = end; + auto parse_func = [](ReadBuffer & in) + { + Float64 tmp; + readFloatText(tmp, in); + }; + readParsedValueIntoString(s, buf, parse_func); } } +void readJSONFieldIntoString(String & s, ReadBuffer & buf) +{ + auto parse_func = [](ReadBuffer & in) { skipJSONField(in, "json_field"); }; + readParsedValueIntoString(s, buf, parse_func); +} } diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index b2ad4035cdc..6d1023947a5 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -563,6 +563,8 @@ void readStringUntilWhitespace(String & s, ReadBuffer & buf); */ void readCSVString(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings); +/// Differ from readCSVString in that it doesn't remove quotes around field if any. +void readCSVField(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings); /// Read and append result to array of characters. template @@ -1381,4 +1383,7 @@ struct PcgDeserializer void readQuotedFieldIntoString(String & s, ReadBuffer & buf); +void readJSONFieldIntoString(String & s, ReadBuffer & buf); + } + diff --git a/src/IO/ZlibInflatingReadBuffer.cpp b/src/IO/ZlibInflatingReadBuffer.cpp index 472399dea3d..28426e920ef 100644 --- a/src/IO/ZlibInflatingReadBuffer.cpp +++ b/src/IO/ZlibInflatingReadBuffer.cpp @@ -16,7 +16,7 @@ ZlibInflatingReadBuffer::ZlibInflatingReadBuffer( size_t alignment) : BufferWithOwnMemory(buf_size, existing_memory, alignment) , in(std::move(in_)) - , eof(false) + , eof_flag(false) { zstr.zalloc = nullptr; zstr.zfree = nullptr; @@ -54,7 +54,7 @@ bool ZlibInflatingReadBuffer::nextImpl() do { /// if we already found eof, we shouldn't do anything - if (eof) + if (eof_flag) return false; /// if there is no available bytes in zstr, move ptr to next available data @@ -83,7 +83,7 @@ bool ZlibInflatingReadBuffer::nextImpl() /// * false if there is no data in working buffer if (in->eof()) { - eof = true; + eof_flag = true; return !working_buffer.empty(); } /// If it is not end of file, we need to reset zstr and return true, because we still have some data to read diff --git a/src/IO/ZlibInflatingReadBuffer.h b/src/IO/ZlibInflatingReadBuffer.h index b8c141e9b9b..905ab0cd3fc 100644 --- a/src/IO/ZlibInflatingReadBuffer.h +++ b/src/IO/ZlibInflatingReadBuffer.h @@ -33,7 +33,7 @@ private: std::unique_ptr in; z_stream zstr; - bool eof; + bool eof_flag; }; } diff --git a/src/IO/ZstdInflatingReadBuffer.cpp b/src/IO/ZstdInflatingReadBuffer.cpp index ce89f09f955..6f244dc5a75 100644 --- a/src/IO/ZstdInflatingReadBuffer.cpp +++ b/src/IO/ZstdInflatingReadBuffer.cpp @@ -31,7 +31,7 @@ bool ZstdInflatingReadBuffer::nextImpl() do { // If it is known that end of file was reached, return false - if (eof) + if (eof_flag) return false; /// If end was reached, get next part @@ -64,7 +64,7 @@ bool ZstdInflatingReadBuffer::nextImpl() /// If end of file is reached, fill eof variable and return true if there is some data in buffer, otherwise return false if (in->eof()) { - eof = true; + eof_flag = true; return !working_buffer.empty(); } /// It is possible, that input buffer is not at eof yet, but nothing was decompressed in 
current iteration. diff --git a/src/IO/ZstdInflatingReadBuffer.h b/src/IO/ZstdInflatingReadBuffer.h index e6e2dad0ad5..ec80b860e0e 100644 --- a/src/IO/ZstdInflatingReadBuffer.h +++ b/src/IO/ZstdInflatingReadBuffer.h @@ -31,7 +31,7 @@ private: ZSTD_DCtx * dctx; ZSTD_inBuffer input; ZSTD_outBuffer output; - bool eof = false; + bool eof_flag = false; }; } diff --git a/src/IO/createReadBufferFromFileBase.cpp b/src/IO/createReadBufferFromFileBase.cpp index bed97d54ab0..b83bfdbf3a8 100644 --- a/src/IO/createReadBufferFromFileBase.cpp +++ b/src/IO/createReadBufferFromFileBase.cpp @@ -29,14 +29,20 @@ namespace ErrorCodes std::unique_ptr createReadBufferFromFileBase( const std::string & filename, const ReadSettings & settings, - std::optional size, + std::optional read_hint, + std::optional file_size, int flags, char * existing_memory, size_t alignment) { - if (size.has_value() && !*size) + if (file_size.has_value() && !*file_size) return std::make_unique(); - size_t estimated_size = size.has_value() ? *size : 0; + + size_t estimated_size = 0; + if (read_hint.has_value()) + estimated_size = *read_hint; + else if (file_size.has_value()) + estimated_size = file_size.has_value() ? *file_size : 0; if (!existing_memory && settings.local_fs_method == LocalFSReadMethod::mmap @@ -63,23 +69,23 @@ std::unique_ptr createReadBufferFromFileBase( if (settings.local_fs_method == LocalFSReadMethod::read) { - res = std::make_unique(filename, buffer_size, actual_flags, existing_memory, alignment); + res = std::make_unique(filename, buffer_size, actual_flags, existing_memory, alignment, file_size); } else if (settings.local_fs_method == LocalFSReadMethod::pread || settings.local_fs_method == LocalFSReadMethod::mmap) { - res = std::make_unique(filename, buffer_size, actual_flags, existing_memory, alignment); + res = std::make_unique(filename, buffer_size, actual_flags, existing_memory, alignment, file_size); } else if (settings.local_fs_method == LocalFSReadMethod::pread_fake_async) { static AsynchronousReaderPtr reader = std::make_shared(); res = std::make_unique( - reader, settings.priority, filename, buffer_size, actual_flags, existing_memory, alignment); + reader, settings.priority, filename, buffer_size, actual_flags, existing_memory, alignment, file_size); } else if (settings.local_fs_method == LocalFSReadMethod::pread_threadpool) { static AsynchronousReaderPtr reader = std::make_shared(16, 1000000); res = std::make_unique( - reader, settings.priority, filename, buffer_size, actual_flags, existing_memory, alignment); + reader, settings.priority, filename, buffer_size, actual_flags, existing_memory, alignment, file_size); } else throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown read method"); diff --git a/src/IO/createReadBufferFromFileBase.h b/src/IO/createReadBufferFromFileBase.h index 86da469b55d..c2e2040587b 100644 --- a/src/IO/createReadBufferFromFileBase.h +++ b/src/IO/createReadBufferFromFileBase.h @@ -11,12 +11,14 @@ namespace DB /** Create an object to read data from a file. 
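Editor's note: createReadBufferFromFileBase() now distinguishes read_hint (how many bytes the caller expects to read) from file_size (the actual size on disk) and prefers the hint when estimating the buffer size. A rough standalone sketch of that selection, with an illustrative helper name that is not part of the real API:

#include <cassert>
#include <cstddef>
#include <optional>

/// Prefer the caller's read hint, fall back to the real file size, otherwise 0,
/// as the estimated_size computation in createReadBufferFromFileBase() now does.
size_t estimateReadSize(std::optional<size_t> read_hint, std::optional<size_t> file_size)
{
    if (read_hint.has_value())
        return *read_hint;
    if (file_size.has_value())
        return *file_size;
    return 0;
}

int main()
{
    assert(estimateReadSize(128, 4096) == 128);            /// hint wins
    assert(estimateReadSize(std::nullopt, 4096) == 4096);  /// fall back to file size
    assert(estimateReadSize(std::nullopt, std::nullopt) == 0);
}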
* - * @param size - the number of bytes to read + * @param read_hint - the number of bytes to read hint + * @param file_size - size of file */ std::unique_ptr createReadBufferFromFileBase( const std::string & filename, const ReadSettings & settings, - std::optional size = {}, + std::optional read_hint = {}, + std::optional file_size = {}, int flags_ = -1, char * existing_memory = nullptr, size_t alignment = 0); diff --git a/src/IO/parseDateTimeBestEffort.cpp b/src/IO/parseDateTimeBestEffort.cpp index 4f00ec5f96d..3b05d8c76b6 100644 --- a/src/IO/parseDateTimeBestEffort.cpp +++ b/src/IO/parseDateTimeBestEffort.cpp @@ -151,7 +151,18 @@ ReturnType parseDateTimeBestEffortImpl( { num_digits = readDigits(digits, sizeof(digits), in); - if (num_digits == 10 && !year && !has_time) + if (num_digits == 13 && !year && !has_time) + { + /// This is unix timestamp with millisecond. + readDecimalNumber<10>(res, digits); + if (fractional) + { + fractional->digits = 3; + readDecimalNumber<3>(fractional->value, digits + 10); + } + return ReturnType(true); + } + else if (num_digits == 10 && !year && !has_time) { /// This is unix timestamp. readDecimalNumber<10>(res, digits); diff --git a/src/Interpreters/ActionsVisitor.cpp b/src/Interpreters/ActionsVisitor.cpp index d5eb701e2aa..bc937755618 100644 --- a/src/Interpreters/ActionsVisitor.cpp +++ b/src/Interpreters/ActionsVisitor.cpp @@ -697,6 +697,10 @@ ASTs ActionsMatcher::doUntuple(const ASTFunction * function, ActionsMatcher::Dat for (const auto & name [[maybe_unused]] : tuple_type->getElementNames()) { auto tuple_ast = function->arguments->children[0]; + + /// This transformation can lead to exponential growth of AST size, let's check it. + tuple_ast->checkSize(data.getContext()->getSettingsRef().max_ast_elements); + if (tid != 0) tuple_ast = tuple_ast->clone(); diff --git a/src/Interpreters/Aggregator.cpp b/src/Interpreters/Aggregator.cpp index ae5ce117c61..5c9d94d7c45 100644 --- a/src/Interpreters/Aggregator.cpp +++ b/src/Interpreters/Aggregator.cpp @@ -361,7 +361,6 @@ void Aggregator::compileAggregateFunctionsIfNeeded() auto compiled_aggregate_functions = compileAggregateFunctions(getJITInstance(), functions_to_compile, functions_description); return std::make_shared(std::move(compiled_aggregate_functions)); }); - compiled_aggregate_functions_holder = std::static_pointer_cast(compiled_function_cache_entry); } else diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index 121f7c4153f..d1c5fbebbc7 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -69,12 +69,10 @@ static std::unique_ptr openFileIfExists(const std::stri AsynchronousMetrics::AsynchronousMetrics( ContextPtr global_context_, int update_period_seconds, - std::shared_ptr> servers_to_start_before_tables_, - std::shared_ptr> servers_) + const ProtocolServerMetricsFunc & protocol_server_metrics_func_) : WithContext(global_context_) , update_period(update_period_seconds) - , servers_to_start_before_tables(servers_to_start_before_tables_) - , servers(servers_) + , protocol_server_metrics_func(protocol_server_metrics_func_) , log(&Poco::Logger::get("AsynchronousMetrics")) { #if defined(OS_LINUX) @@ -238,7 +236,7 @@ void AsynchronousMetrics::start() thread = std::make_unique([this] { run(); }); } -AsynchronousMetrics::~AsynchronousMetrics() +void AsynchronousMetrics::stop() { try { @@ -249,7 +247,10 @@ AsynchronousMetrics::~AsynchronousMetrics() wait_cond.notify_one(); if (thread) + { thread->join(); + 
thread.reset(); + } } catch (...) { @@ -257,6 +258,11 @@ AsynchronousMetrics::~AsynchronousMetrics() } } +AsynchronousMetrics::~AsynchronousMetrics() +{ + stop(); +} + AsynchronousMetricValues AsynchronousMetrics::getValues() const { @@ -1381,22 +1387,11 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti return it->second; }; - if (servers_to_start_before_tables) + const auto server_metrics = protocol_server_metrics_func(); + for (const auto & server_metric : server_metrics) { - for (const auto & server : *servers_to_start_before_tables) - { - if (const auto * name = get_metric_name(server.getPortName())) - new_values[name] = server.currentThreads(); - } - } - - if (servers) - { - for (const auto & server : *servers) - { - if (const auto * name = get_metric_name(server.getPortName())) - new_values[name] = server.currentThreads(); - } + if (const auto * name = get_metric_name(server_metric.port_name)) + new_values[name] = server_metric.current_threads; } } diff --git a/src/Interpreters/AsynchronousMetrics.h b/src/Interpreters/AsynchronousMetrics.h index 7a5c2d638d7..3c7581ce1a3 100644 --- a/src/Interpreters/AsynchronousMetrics.h +++ b/src/Interpreters/AsynchronousMetrics.h @@ -30,6 +30,11 @@ class ReadBuffer; using AsynchronousMetricValue = double; using AsynchronousMetricValues = std::unordered_map; +struct ProtocolServerMetrics +{ + String port_name; + size_t current_threads; +}; /** Periodically (by default, each minute, starting at 30 seconds offset) * calculates and updates some metrics, @@ -41,24 +46,25 @@ using AsynchronousMetricValues = std::unordered_map()>; AsynchronousMetrics( ContextPtr global_context_, int update_period_seconds, - std::shared_ptr> servers_to_start_before_tables_, - std::shared_ptr> servers_); + const ProtocolServerMetricsFunc & protocol_server_metrics_func_); ~AsynchronousMetrics(); /// Separate method allows to initialize the `servers` variable beforehand. void start(); + void stop(); + /// Returns copy of all values. AsynchronousMetricValues getValues() const; private: const std::chrono::seconds update_period; - std::shared_ptr> servers_to_start_before_tables{nullptr}; - std::shared_ptr> servers{nullptr}; + ProtocolServerMetricsFunc protocol_server_metrics_func; mutable std::mutex mutex; std::condition_variable wait_cond; diff --git a/src/Interpreters/Cluster.cpp b/src/Interpreters/Cluster.cpp index 30d0dd4cece..05972f2ee50 100644 --- a/src/Interpreters/Cluster.cpp +++ b/src/Interpreters/Cluster.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -101,7 +102,7 @@ Cluster::Address::Address( user = config.getString(config_prefix + ".user", "default"); password = config.getString(config_prefix + ".password", ""); default_database = config.getString(config_prefix + ".default_database", ""); - secure = config.getBool(config_prefix + ".secure", false) ? Protocol::Secure::Enable : Protocol::Secure::Disable; + secure = ConfigHelper::getBool(config, config_prefix + ".secure", false, /* empty_as */true) ? Protocol::Secure::Enable : Protocol::Secure::Disable; priority = config.getInt(config_prefix + ".priority", 1); const char * port_type = secure == Protocol::Secure::Enable ? 
"tcp_port_secure" : "tcp_port"; is_local = isLocal(config.getInt(port_type, 0)); @@ -320,13 +321,29 @@ void Clusters::updateClusters(const Poco::Util::AbstractConfiguration & new_conf if (old_config) { for (const auto & key : deleted_keys) - impl.erase(key); + { + if (!automatic_clusters.contains(key)) + impl.erase(key); + } } else - impl.clear(); + { + if (!automatic_clusters.empty()) + std::erase_if(impl, [this](const auto & e) { return automatic_clusters.contains(e.first); }); + else + impl.clear(); + } + for (const auto & key : new_config_keys) { + if (new_config.has(config_prefix + "." + key + ".discovery")) + { + /// Handled in ClusterDiscovery + automatic_clusters.insert(key); + continue; + } + if (key.find('.') != String::npos) throw Exception("Cluster names with dots are not supported: '" + key + "'", ErrorCodes::SYNTAX_ERROR); diff --git a/src/Interpreters/Cluster.h b/src/Interpreters/Cluster.h index a64e17264b1..3773dadaf13 100644 --- a/src/Interpreters/Cluster.h +++ b/src/Interpreters/Cluster.h @@ -6,6 +6,8 @@ #include #include +#include +#include namespace Poco { @@ -295,12 +297,15 @@ public: void updateClusters(const Poco::Util::AbstractConfiguration & new_config, const Settings & settings, const String & config_prefix, Poco::Util::AbstractConfiguration * old_config = nullptr); -public: using Impl = std::map; Impl getContainer() const; protected: + + /// setup outside of this class, stored to prevent deleting from impl on config update + std::unordered_set automatic_clusters; + Impl impl; mutable std::mutex mutex; }; diff --git a/src/Interpreters/ClusterDiscovery.cpp b/src/Interpreters/ClusterDiscovery.cpp new file mode 100644 index 00000000000..8b68ba02504 --- /dev/null +++ b/src/Interpreters/ClusterDiscovery.cpp @@ -0,0 +1,479 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +namespace +{ + +fs::path getShardsListPath(const String & zk_root) +{ + return fs::path(zk_root + "/shards"); +} + +} + +/* + * Holds boolean flags for fixed set of keys. + * Flags can be concurrently set from different threads, and consumer can wait for it. + */ +template +class ClusterDiscovery::ConcurrentFlags +{ +public: + template + ConcurrentFlags(It begin, It end) + { + for (auto it = begin; it != end; ++it) + flags.emplace(*it, false); + } + + void set(const T & key) + { + auto it = flags.find(key); + if (it == flags.end()) + throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Unknown value '{}'", key); + it->second = true; + any_need_update = true; + cv.notify_one(); + } + + /// waits unit at least one flag is set + /// caller should handle all set flags (or set it again manually) + /// note: keys of returen map should not be changed! 
+ /// @param finished - output parameter indicates that stop() was called + std::unordered_map & wait(std::chrono::milliseconds timeout, bool & finished) + { + std::unique_lock lk(mu); + cv.wait_for(lk, timeout, [this]() -> bool { return any_need_update || stop_flag; }); + finished = stop_flag; + + /// all set flags expected to be handled by caller + any_need_update = false; + return flags; + } + + void stop() + { + std::unique_lock lk(mu); + stop_flag = true; + cv.notify_one(); + } + +private: + std::condition_variable cv; + std::mutex mu; + + /// flag indicates that update is required + std::unordered_map flags; + std::atomic_bool any_need_update = true; + bool stop_flag = false; +}; + +ClusterDiscovery::ClusterDiscovery( + const Poco::Util::AbstractConfiguration & config, + ContextPtr context_, + const String & config_prefix) + : context(Context::createCopy(context_)) + , current_node_name(toString(ServerUUID::get())) + , log(&Poco::Logger::get("ClusterDiscovery")) +{ + LOG_DEBUG(log, "Cluster discovery is enabled"); + + Poco::Util::AbstractConfiguration::Keys config_keys; + config.keys(config_prefix, config_keys); + + for (const auto & key : config_keys) + { + String prefix = config_prefix + "." + key + ".discovery"; + if (!config.has(prefix)) + continue; + + clusters_info.emplace( + key, + ClusterInfo( + /* name_= */ key, + /* zk_root_= */ config.getString(prefix + ".path"), + /* port= */ context->getTCPPort(), + /* secure= */ config.getBool(prefix + ".secure", false), + /* shard_id= */ config.getUInt(prefix + ".shard", 0) + ) + ); + } + clusters_to_update = std::make_shared(config_keys.begin(), config_keys.end()); +} + +/// List node in zookeper for cluster +Strings ClusterDiscovery::getNodeNames(zkutil::ZooKeeperPtr & zk, + const String & zk_root, + const String & cluster_name, + int * version, + bool set_callback) +{ + auto watch_callback = [cluster_name, clusters_to_update=clusters_to_update](auto) { clusters_to_update->set(cluster_name); }; + + Coordination::Stat stat; + Strings nodes = zk->getChildrenWatch(getShardsListPath(zk_root), &stat, set_callback ? watch_callback : Coordination::WatchCallback{}); + if (version) + *version = stat.cversion; + return nodes; +} + +/// Reads node information from specified zookeeper nodes +/// On error returns empty result +ClusterDiscovery::NodesInfo ClusterDiscovery::getNodes(zkutil::ZooKeeperPtr & zk, const String & zk_root, const Strings & node_uuids) +{ + NodesInfo result; + for (const auto & node_uuid : node_uuids) + { + String payload; + bool ok = zk->tryGet(getShardsListPath(zk_root) / node_uuid, payload) && + NodeInfo::parse(payload, result[node_uuid]); + if (!ok) + { + LOG_WARNING(log, "Can't get data from node '{}' in '{}'", node_uuid, zk_root); + return {}; + } + } + return result; +} + +/// Checks if cluster nodes set is changed. +/// Returns true if update required. +/// It performs only shallow check (set of nodes' uuids). +/// So, if node's hostname are changed, then cluster won't be updated. 
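Editor's note: the ConcurrentFlags helper defined above keeps a fixed set of per-cluster "needs update" flags that ZooKeeper watch callbacks can set from arbitrary threads while the discovery worker waits on them with a timeout. The following is a simplified, self-contained approximation of that idea (not the class itself); the class and member names are illustrative.

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <initializer_list>
#include <iostream>
#include <mutex>
#include <string>
#include <unordered_map>

class UpdateFlags
{
public:
    explicit UpdateFlags(std::initializer_list<std::string> keys)
    {
        for (const auto & key : keys)
            flags[key] = false;
    }

    /// Called from watch callbacks: mark one cluster as dirty and wake the worker.
    /// at() throws for unknown keys, similar to the LOGICAL_ERROR check above.
    void set(const std::string & key)
    {
        flags.at(key) = true;
        any_need_update = true;
        cv.notify_one();
    }

    /// Worker thread: block until something is dirty, stop() is called, or the timeout expires.
    /// The caller is expected to clear the flags it handles.
    std::unordered_map<std::string, std::atomic_bool> & wait(std::chrono::milliseconds timeout, bool & finished)
    {
        std::unique_lock lock(mu);
        cv.wait_for(lock, timeout, [this] { return any_need_update.load() || stop_flag; });
        finished = stop_flag;
        any_need_update = false;
        return flags;
    }

    void stop()
    {
        std::unique_lock lock(mu);
        stop_flag = true;
        cv.notify_one();
    }

private:
    std::mutex mu;
    std::condition_variable cv;
    std::unordered_map<std::string, std::atomic_bool> flags;
    std::atomic_bool any_need_update{true};
    bool stop_flag = false;
};

int main()
{
    UpdateFlags flags{"cluster_a", "cluster_b"};
    flags.set("cluster_a");

    bool finished = false;
    for (auto & [name, dirty] : flags.wait(std::chrono::milliseconds(10), finished))
        if (dirty.exchange(false))
            std::cout << name << " needs update\n";
}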
+bool ClusterDiscovery::needUpdate(const Strings & node_uuids, const NodesInfo & nodes) +{ + bool has_difference = node_uuids.size() != nodes.size() || + std::any_of(node_uuids.begin(), node_uuids.end(), [&nodes] (auto u) { return !nodes.contains(u); }); + { + /// Just to log updated nodes, suboptimal, but should be ok for expected update sizes + std::set new_names(node_uuids.begin(), node_uuids.end()); + std::set old_names; + for (const auto & [name, _] : nodes) + old_names.emplace(name); + + auto format_cluster_update = [](const std::set & s1, const std::set & s2) + { + std::vector diff; + std::set_difference(s1.begin(), s1.end(), s2.begin(), s2.end(), std::back_inserter(diff)); + + constexpr size_t max_to_show = 3; + size_t sz = diff.size(); + bool need_crop = sz > max_to_show; + if (need_crop) + diff.resize(max_to_show); + + if (sz == 0) + return fmt::format("{} nodes", sz); + return fmt::format("{} node{} [{}{}]", sz, sz != 1 ? "s" : "", fmt::join(diff, ", "), need_crop ? ",..." : ""); + }; + + LOG_DEBUG(log, "Cluster update: added {}, removed {}", + format_cluster_update(new_names, old_names), + format_cluster_update(old_names, new_names)); + } + return has_difference; +} + +ClusterPtr ClusterDiscovery::makeCluster(const ClusterInfo & cluster_info) +{ + std::vector> shards; + { + std::map replica_adresses; + + for (const auto & [_, node] : cluster_info.nodes_info) + { + if (cluster_info.current_node.secure != node.secure) + { + LOG_WARNING(log, "Node '{}' in cluster '{}' has different 'secure' value, skipping it", node.address, cluster_info.name); + continue; + } + replica_adresses[node.shard_id].emplace_back(node.address); + } + + shards.reserve(replica_adresses.size()); + for (auto & [_, replicas] : replica_adresses) + shards.emplace_back(std::move(replicas)); + } + + bool secure = cluster_info.current_node.secure; + auto cluster = std::make_shared( + context->getSettings(), + shards, + /* username= */ context->getUserName(), + /* password= */ "", + /* clickhouse_port= */ secure ? context->getTCPPortSecure().value_or(DBMS_DEFAULT_SECURE_PORT) : context->getTCPPort(), + /* treat_local_as_remote= */ false, + /* treat_local_port_as_remote= */ context->getApplicationType() == Context::ApplicationType::LOCAL, + /* secure= */ secure); + return cluster; +} + +/// Reads data from zookeeper and tries to update cluster. +/// Returns true on success (or no update required). 
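Editor's note: needUpdate() above performs only a shallow comparison of node UUID sets and logs a compact "added {}, removed {}" summary, showing at most a few names from each set difference. A standalone sketch of that formatting (the real code builds the string with fmt::join; the helper name here is illustrative):

#include <algorithm>
#include <iostream>
#include <set>
#include <sstream>
#include <string>
#include <vector>

/// Format the names present in lhs but not in rhs, capped at a few entries,
/// plus the total count, as the cluster-update log message does.
std::string formatClusterUpdate(const std::set<std::string> & lhs, const std::set<std::string> & rhs)
{
    std::vector<std::string> diff;
    std::set_difference(lhs.begin(), lhs.end(), rhs.begin(), rhs.end(), std::back_inserter(diff));

    constexpr size_t max_to_show = 3;
    size_t total = diff.size();
    if (total == 0)
        return "0 nodes";

    std::ostringstream out;
    out << total << (total == 1 ? " node [" : " nodes [");
    for (size_t i = 0; i < std::min(total, max_to_show); ++i)
        out << (i ? ", " : "") << diff[i];
    if (total > max_to_show)
        out << ",...";
    out << "]";
    return out.str();
}

int main()
{
    std::set<std::string> new_names{"a", "b", "c", "d"};
    std::set<std::string> old_names{"b", "e"};
    std::cout << "Cluster update: added " << formatClusterUpdate(new_names, old_names)
              << ", removed " << formatClusterUpdate(old_names, new_names) << "\n";
}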
+bool ClusterDiscovery::updateCluster(ClusterInfo & cluster_info) +{ + LOG_DEBUG(log, "Updating cluster '{}'", cluster_info.name); + + auto zk = context->getZooKeeper(); + + int start_version; + Strings node_uuids = getNodeNames(zk, cluster_info.zk_root, cluster_info.name, &start_version, false); + auto & nodes_info = cluster_info.nodes_info; + + if (std::find(node_uuids.begin(), node_uuids.end(), current_node_name) == node_uuids.end()) + { + LOG_ERROR(log, "Can't find current node in cluster '{}', will register again", cluster_info.name); + registerInZk(zk, cluster_info); + nodes_info.clear(); + return false; + } + + if (!needUpdate(node_uuids, nodes_info)) + { + LOG_DEBUG(log, "No update required for cluster '{}'", cluster_info.name); + return true; + } + + nodes_info = getNodes(zk, cluster_info.zk_root, node_uuids); + if (nodes_info.empty()) + { + LOG_WARNING(log, "Can't get nodes info for '{}'", cluster_info.name); + return false; + } + + int current_version; + getNodeNames(zk, cluster_info.zk_root, cluster_info.name, ¤t_version, true); + + if (current_version != start_version) + { + LOG_DEBUG(log, "Cluster '{}' configuration changed during update", cluster_info.name); + nodes_info.clear(); + return false; + } + + LOG_DEBUG(log, "Updating system.clusters record for '{}' with {} nodes", cluster_info.name, cluster_info.nodes_info.size()); + + auto cluster = makeCluster(cluster_info); + context->setCluster(cluster_info.name, cluster); + return true; +} + +void ClusterDiscovery::registerInZk(zkutil::ZooKeeperPtr & zk, ClusterInfo & info) +{ + LOG_DEBUG(log, "Registering current node {} in cluster {}", current_node_name, info.name); + + String node_path = getShardsListPath(info.zk_root) / current_node_name; + zk->createAncestors(node_path); + + zk->createOrUpdate(node_path, info.current_node.serialize(), zkutil::CreateMode::Ephemeral); + LOG_DEBUG(log, "Current node {} registered in cluster {}", current_node_name, info.name); +} + +void ClusterDiscovery::initialUpdate() +{ + auto zk = context->getZooKeeper(); + for (auto & [_, info] : clusters_info) + { + registerInZk(zk, info); + if (!updateCluster(info)) + { + LOG_WARNING(log, "Error on initial cluster '{}' update, will retry in background", info.name); + clusters_to_update->set(info.name); + } + } +} + +void ClusterDiscovery::start() +{ + if (clusters_info.empty()) + { + LOG_DEBUG(log, "No defined clusters for discovery"); + return; + } + + try + { + initialUpdate(); + } + catch (...) + { + tryLogCurrentException(log, "Caught exception in cluster discovery initialization"); + } + + using namespace std::chrono_literals; + constexpr static std::chrono::milliseconds DEFAULT_BACKOFF_TIMEOUT = 10ms; + + LOG_DEBUG(log, "Starting working thread"); + main_thread = ThreadFromGlobalPool([this] + { + std::chrono::milliseconds backoff_timeout = DEFAULT_BACKOFF_TIMEOUT; + + bool finish = false; + while (!finish) + { + try + { + finish = runMainThread([&backoff_timeout] { backoff_timeout = DEFAULT_BACKOFF_TIMEOUT; }); + } + catch (...) 
+ { + /* + * it can be zk error (will take new session) or other retriable error, + * should not stop discovery forever + */ + tryLogCurrentException(log, "Caught exception in cluster discovery runMainThread"); + } + std::this_thread::sleep_for(backoff_timeout); + backoff_timeout = std::min(backoff_timeout * 2, std::chrono::milliseconds(3min)); + } + }); +} + +/// Returns `true` on graceful shutdown (no restart required) +bool ClusterDiscovery::runMainThread(std::function up_to_date_callback) +{ + setThreadName("ClusterDiscover"); + LOG_DEBUG(log, "Worker thread started"); + + using namespace std::chrono_literals; + + constexpr auto force_update_interval = 2min; + bool finished = false; + while (!finished) + { + bool all_up_to_date = true; + auto & clusters = clusters_to_update->wait(5s, finished); + for (auto & [cluster_name, need_update] : clusters) + { + auto cluster_info_it = clusters_info.find(cluster_name); + if (cluster_info_it == clusters_info.end()) + { + LOG_ERROR(log, "Unknown cluster '{}'", cluster_name); + continue; + } + auto & cluster_info = cluster_info_it->second; + + if (!need_update.exchange(false)) + { + /// force updating periodically + bool force_update = cluster_info.watch.elapsedSeconds() > std::chrono::seconds(force_update_interval).count(); + if (!force_update) + continue; + } + + if (updateCluster(cluster_info)) + { + cluster_info.watch.restart(); + LOG_DEBUG(log, "Cluster '{}' updated successfully", cluster_name); + } + else + { + all_up_to_date = false; + /// no need to trigger convar, will retry after timeout in `wait` + need_update = true; + LOG_WARNING(log, "Cluster '{}' wasn't updated, will retry", cluster_name); + } + } + + if (all_up_to_date) + { + up_to_date_callback(); + } + } + LOG_DEBUG(log, "Worker thread stopped"); + return finished; +} + +void ClusterDiscovery::shutdown() +{ + LOG_DEBUG(log, "Shutting down"); + clusters_to_update->stop(); + + if (main_thread.joinable()) + main_thread.join(); +} + +ClusterDiscovery::~ClusterDiscovery() +{ + ClusterDiscovery::shutdown(); +} + +bool ClusterDiscovery::NodeInfo::parse(const String & data, NodeInfo & result) +{ + try + { + Poco::JSON::Parser parser; + auto json = parser.parse(data).extract(); + + size_t ver = json->optValue("version", data_ver); + if (ver == data_ver) + { + result.address = json->getValue("address"); + result.secure = json->optValue("secure", false); + result.shard_id = json->optValue("shard_id", 0); + } + else + { + LOG_ERROR( + &Poco::Logger::get("ClusterDiscovery"), + "Unsupported version '{}' of data in zk node '{}'", + ver, data.size() < 1024 ? data : "[data too long]"); + } + } + catch (Poco::Exception & e) + { + LOG_WARNING( + &Poco::Logger::get("ClusterDiscovery"), + "Can't parse '{}' from node: {}", + data.size() < 1024 ? 
data : "[data too long]", e.displayText()); + return false; + } + return true; +} + +String ClusterDiscovery::NodeInfo::serialize() const +{ + Poco::JSON::Object json; + json.set("version", data_ver); + json.set("address", address); + json.set("shard_id", shard_id); + + std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM + oss.exceptions(std::ios::failbit); + Poco::JSON::Stringifier::stringify(json, oss); + return oss.str(); +} + +} diff --git a/src/Interpreters/ClusterDiscovery.h b/src/Interpreters/ClusterDiscovery.h new file mode 100644 index 00000000000..2098652c069 --- /dev/null +++ b/src/Interpreters/ClusterDiscovery.h @@ -0,0 +1,124 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include + +#include + +namespace DB +{ + +/* + * Discover cluster nodes. + * + * Each node adds ephemernal node into specified path in zookeeper (each cluster have own path). + * Also node subscribed for updates for these paths, and at each child node chanhe cluster updated. + * When node goes down ephemernal node are destroyed, cluster configuration is updated on other node and gone node is removed from cluster. + */ +class ClusterDiscovery +{ + +public: + ClusterDiscovery( + const Poco::Util::AbstractConfiguration & config, + ContextPtr context_, + const String & config_prefix = "remote_servers"); + + void start(); + + ~ClusterDiscovery(); + +private: + struct NodeInfo + { + /// versioning for format of data stored in zk + static constexpr size_t data_ver = 1; + + /// host:port + String address; + /// is secure tcp port user + bool secure = false; + /// shard number + size_t shard_id = 0; + + NodeInfo() = default; + explicit NodeInfo(const String & address_, bool secure_, size_t shard_id_) + : address(address_) + , secure(secure_) + , shard_id(shard_id_) + {} + + static bool parse(const String & data, NodeInfo & result); + String serialize() const; + }; + + // node uuid -> address ("host:port") + using NodesInfo = std::unordered_map; + + struct ClusterInfo + { + const String name; + const String zk_root; + NodesInfo nodes_info; + + /// Track last update time + Stopwatch watch; + + NodeInfo current_node; + + explicit ClusterInfo(const String & name_, const String & zk_root_, UInt16 port, bool secure, size_t shard_id) + : name(name_) + , zk_root(zk_root_) + , current_node(getFQDNOrHostName() + ":" + toString(port), secure, shard_id) + { + } + }; + + void initialUpdate(); + + void registerInZk(zkutil::ZooKeeperPtr & zk, ClusterInfo & info); + + Strings getNodeNames(zkutil::ZooKeeperPtr & zk, + const String & zk_root, + const String & cluster_name, + int * version = nullptr, + bool set_callback = true); + + NodesInfo getNodes(zkutil::ZooKeeperPtr & zk, const String & zk_root, const Strings & node_uuids); + + ClusterPtr makeCluster(const ClusterInfo & cluster_info); + + bool needUpdate(const Strings & node_uuids, const NodesInfo & nodes); + bool updateCluster(ClusterInfo & cluster_info); + + bool runMainThread(std::function up_to_date_callback); + void shutdown(); + + /// cluster name -> cluster info (zk root, set of nodes) + std::unordered_map clusters_info; + + ContextMutablePtr context; + + String current_node_name; + + template class ConcurrentFlags; + using UpdateFlags = ConcurrentFlags; + + /// Cluster names to update. + /// The `shared_ptr` is used because it's passed to watch callback. + /// It prevents accessing to invalid object after ClusterDiscovery is destroyed. 
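Editor's note: NodeInfo::serialize()/parse() above exchange a small versioned JSON payload ("version", "address", "shard_id", optional "secure") through ZooKeeper using Poco::JSON. The flattened text dropped the template arguments from the extract/getValue/optValue calls, so here is a standalone approximation with them spelled out; the struct and function names are illustrative, not the exact NodeInfo code.

#include <Poco/Exception.h>
#include <Poco/JSON/Object.h>
#include <Poco/JSON/Parser.h>
#include <Poco/JSON/Stringifier.h>
#include <iostream>
#include <sstream>
#include <string>

struct NodePayload
{
    std::string address;   /// "host:port"
    bool secure = false;
    size_t shard_id = 0;
};

/// Serialize a small JSON object carrying a format version so that nodes
/// running an incompatible format can detect and report it.
std::string serializePayload(const NodePayload & node)
{
    Poco::JSON::Object json;
    json.set("version", 1);
    json.set("address", node.address);
    json.set("shard_id", node.shard_id);

    std::ostringstream oss;
    Poco::JSON::Stringifier::stringify(json, oss);
    return oss.str();
}

bool parsePayload(const std::string & data, NodePayload & result)
{
    try
    {
        Poco::JSON::Parser parser;
        auto json = parser.parse(data).extract<Poco::JSON::Object::Ptr>();

        if (json->optValue<size_t>("version", 0) != 1)
            return false; /// unsupported format version

        result.address = json->getValue<std::string>("address");
        result.secure = json->optValue("secure", false);
        result.shard_id = json->optValue<size_t>("shard_id", 0);
        return true;
    }
    catch (const Poco::Exception &)
    {
        return false;
    }
}

int main()
{
    NodePayload node{"host-1.example.com:9000", false, 2};
    NodePayload parsed;
    if (parsePayload(serializePayload(node), parsed))
        std::cout << parsed.address << " shard " << parsed.shard_id << "\n";
}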
+ std::shared_ptr clusters_to_update; + + ThreadFromGlobalPool main_thread; + + Poco::Logger * log; +}; + +} diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index db1d6a37877..14b0f65072a 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -86,6 +86,7 @@ #include #include #include +#include #include @@ -254,6 +255,7 @@ struct ContextSharedPart std::shared_ptr clusters; ConfigurationPtr clusters_config; /// Stores updated configs mutable std::mutex clusters_mutex; /// Guards clusters and clusters_config + std::unique_ptr cluster_discovery; std::shared_ptr async_insert_queue; std::map server_ports; @@ -2195,11 +2197,22 @@ std::shared_ptr Context::getClusters() const return shared->clusters; } +void Context::startClusterDiscovery() +{ + if (!shared->cluster_discovery) + return; + shared->cluster_discovery->start(); +} + /// On repeating calls updates existing clusters and adds new clusters, doesn't delete old clusters -void Context::setClustersConfig(const ConfigurationPtr & config, const String & config_name) +void Context::setClustersConfig(const ConfigurationPtr & config, bool enable_discovery, const String & config_name) { std::lock_guard lock(shared->clusters_mutex); + if (config->getBool("allow_experimental_cluster_discovery", false) && enable_discovery && !shared->cluster_discovery) + { + shared->cluster_discovery = std::make_unique(*config, getGlobalContext()); + } /// Do not update clusters if this part of config wasn't changed. if (shared->clusters && isSameConfiguration(*config, *shared->clusters_config, config_name)) @@ -2209,7 +2222,7 @@ void Context::setClustersConfig(const ConfigurationPtr & config, const String & shared->clusters_config = config; if (!shared->clusters) - shared->clusters = std::make_unique(*shared->clusters_config, settings, config_name); + shared->clusters = std::make_shared(*shared->clusters_config, settings, config_name); else shared->clusters->updateClusters(*shared->clusters_config, settings, config_name, old_clusters_config); } diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 823bc028c15..6b0a4671efb 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -750,7 +750,10 @@ public: std::shared_ptr getClusters() const; std::shared_ptr getCluster(const std::string & cluster_name) const; std::shared_ptr tryGetCluster(const std::string & cluster_name) const; - void setClustersConfig(const ConfigurationPtr & config, const String & config_name = "remote_servers"); + void setClustersConfig(const ConfigurationPtr & config, bool enable_discovery = false, const String & config_name = "remote_servers"); + + void startClusterDiscovery(); + /// Sets custom cluster, but doesn't update configuration void setCluster(const String & cluster_name, const std::shared_ptr & cluster); void reloadClusterConfig() const; diff --git a/src/Interpreters/ExternalUserDefinedExecutableFunctionsLoader.cpp b/src/Interpreters/ExternalUserDefinedExecutableFunctionsLoader.cpp index 2de7b4b7846..b266746642f 100644 --- a/src/Interpreters/ExternalUserDefinedExecutableFunctionsLoader.cpp +++ b/src/Interpreters/ExternalUserDefinedExecutableFunctionsLoader.cpp @@ -1,5 +1,7 @@ #include "ExternalUserDefinedExecutableFunctionsLoader.h" +#include + #include #include @@ -54,29 +56,44 @@ ExternalLoader::LoadablePtr ExternalUserDefinedExecutableFunctionsLoader::create throw Exception(ErrorCodes::FUNCTION_ALREADY_EXISTS, "The aggregate function '{}' already exists", name); String type = 
config.getString(key_in_config + ".type"); - UserDefinedExecutableFunctionType function_type; + + bool is_executable_pool = false; if (type == "executable") - function_type = UserDefinedExecutableFunctionType::executable; + is_executable_pool = false; else if (type == "executable_pool") - function_type = UserDefinedExecutableFunctionType::executable_pool; + is_executable_pool = true; else throw Exception(ErrorCodes::BAD_ARGUMENTS, "Wrong user defined function type expected 'executable' or 'executable_pool' actual {}", - function_type); + type); + + bool execute_direct = config.getBool(key_in_config + ".execute_direct", true); + + String command_value = config.getString(key_in_config + ".command"); + std::vector command_arguments; + + if (execute_direct) + { + boost::split(command_arguments, command_value, [](char c) { return c == ' '; }); + + command_value = std::move(command_arguments[0]); + command_arguments.erase(command_arguments.begin()); + } - String command = config.getString(key_in_config + ".command"); String format = config.getString(key_in_config + ".format"); DataTypePtr result_type = DataTypeFactory::instance().get(config.getString(key_in_config + ".return_type")); bool send_chunk_header = config.getBool(key_in_config + ".send_chunk_header", false); + size_t command_termination_timeout_seconds = config.getUInt64(key_in_config + ".command_termination_timeout", 10); + size_t command_read_timeout_milliseconds = config.getUInt64(key_in_config + ".command_read_timeout", 10000); + size_t command_write_timeout_milliseconds = config.getUInt64(key_in_config + ".command_write_timeout", 10000); size_t pool_size = 0; - size_t command_termination_timeout = 0; size_t max_command_execution_time = 0; - if (function_type == UserDefinedExecutableFunctionType::executable_pool) + + if (is_executable_pool) { pool_size = config.getUInt64(key_in_config + ".pool_size", 16); - command_termination_timeout = config.getUInt64(key_in_config + ".command_termination_timeout", 10); max_command_execution_time = config.getUInt64(key_in_config + ".max_command_execution_time", 10); size_t max_execution_time_seconds = static_cast(getContext()->getSettings().max_execution_time.totalSeconds()); @@ -106,19 +123,28 @@ ExternalLoader::LoadablePtr ExternalUserDefinedExecutableFunctionsLoader::create UserDefinedExecutableFunctionConfiguration function_configuration { - .type = function_type, .name = std::move(name), //-V1030 - .script_path = std::move(command), //-V1030 - .format = std::move(format), //-V1030 + .command = std::move(command_value), //-V1030 + .command_arguments = std::move(command_arguments), //-V1030 .argument_types = std::move(argument_types), //-V1030 .result_type = std::move(result_type), //-V1030 - .pool_size = pool_size, - .command_termination_timeout = command_termination_timeout, - .max_command_execution_time = max_command_execution_time, - .send_chunk_header = send_chunk_header }; - return std::make_shared(function_configuration, lifetime); + ShellCommandSourceCoordinator::Configuration shell_command_coordinator_configration + { + .format = std::move(format), //-V1030 + .command_termination_timeout_seconds = command_termination_timeout_seconds, + .command_read_timeout_milliseconds = command_read_timeout_milliseconds, + .command_write_timeout_milliseconds = command_write_timeout_milliseconds, + .pool_size = pool_size, + .max_command_execution_time_seconds = max_command_execution_time, + .is_executable_pool = is_executable_pool, + .send_chunk_header = send_chunk_header, + .execute_direct = 
execute_direct + }; + + auto coordinator = std::make_shared(shell_command_coordinator_configration); + return std::make_shared(function_configuration, std::move(coordinator), lifetime); } } diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 5f7c54e427f..2475d437acb 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -48,10 +48,15 @@ BlockIO InterpreterAlterQuery::execute() FunctionNameNormalizer().visit(query_ptr.get()); const auto & alter = query_ptr->as(); if (alter.alter_object == ASTAlterQuery::AlterObjectType::DATABASE) + { return executeToDatabase(alter); + } else if (alter.alter_object == ASTAlterQuery::AlterObjectType::TABLE || alter.alter_object == ASTAlterQuery::AlterObjectType::LIVE_VIEW) + { return executeToTable(alter); + } + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown alter object type"); } diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 8f003e75a07..7ddb0c8c26e 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -637,13 +637,14 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti /// Table function without columns list. auto table_function = TableFunctionFactory::instance().get(create.as_table_function, getContext()); properties.columns = table_function->getActualTableStructure(getContext()); - assert(!properties.columns.empty()); } else if (create.is_dictionary) { return {}; } - else + /// We can have queries like "CREATE TABLE ENGINE=" if + /// supports schema inference (will determine table structure in it's constructor). + else if (!StorageFactory::instance().checkIfStorageSupportsSchemaInterface(create.storage->engine->name)) throw Exception("Incorrect CREATE query: required list of column descriptions or AS section or SELECT.", ErrorCodes::INCORRECT_QUERY); /// Even if query has list of columns, canonicalize it (unfold Nested columns). @@ -1083,7 +1084,10 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, { const auto & factory = TableFunctionFactory::instance(); auto table_func = factory.get(create.as_table_function, getContext()); - res = table_func->execute(create.as_table_function, getContext(), create.getTable(), properties.columns); + /// In case of CREATE AS table_function() query we should use global context + /// in storage creation because there will be no query context on server startup + /// and because storage lifetime is bigger than query context lifetime. 
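Editor's note: the execute_direct branch earlier in this hunk splits the configured command string into the executable name and its argument list (the loader uses boost::split on spaces). A standalone sketch of that split under the same space-separated assumption; unlike boost::split with an empty-token policy, this version simply skips repeated spaces, and the function name is illustrative.

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

/// Split "binary arg1 arg2" into the command and its arguments before handing
/// them to the shell command machinery.
void splitCommand(const std::string & command_value, std::string & command, std::vector<std::string> & arguments)
{
    std::istringstream stream(command_value);
    std::string token;
    while (stream >> token)
        arguments.push_back(token);

    if (!arguments.empty())
    {
        command = arguments.front();
        arguments.erase(arguments.begin());
    }
}

int main()
{
    std::string command;
    std::vector<std::string> arguments;
    splitCommand("my_udf_script --mode fast", command, arguments);

    std::cout << command << "\n";     /// my_udf_script
    for (const auto & arg : arguments)
        std::cout << arg << "\n";     /// --mode, fast
}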
+ res = table_func->execute(create.as_table_function, getContext(), create.getTable(), properties.columns, /*use_global_context=*/true); res->renameInMemory({create.getDatabase(), create.getTable(), create.uuid}); } else diff --git a/src/Interpreters/InterpreterExplainQuery.cpp b/src/Interpreters/InterpreterExplainQuery.cpp index e3a6812124f..fdb35637a9a 100644 --- a/src/Interpreters/InterpreterExplainQuery.cpp +++ b/src/Interpreters/InterpreterExplainQuery.cpp @@ -7,10 +7,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include @@ -250,117 +252,141 @@ QueryPipeline InterpreterExplainQuery::executeImpl() WriteBufferFromOwnString buf; bool single_line = false; + bool insert_buf = true; - if (ast.getKind() == ASTExplainQuery::ParsedAST) + switch (ast.getKind()) { - if (ast.getSettings()) - throw Exception("Settings are not supported for EXPLAIN AST query.", ErrorCodes::UNKNOWN_SETTING); - - dumpAST(*ast.getExplainedQuery(), buf); - } - else if (ast.getKind() == ASTExplainQuery::AnalyzedSyntax) - { - if (ast.getSettings()) - throw Exception("Settings are not supported for EXPLAIN SYNTAX query.", ErrorCodes::UNKNOWN_SETTING); - - ExplainAnalyzedSyntaxVisitor::Data data(getContext()); - ExplainAnalyzedSyntaxVisitor(data).visit(query); - - ast.getExplainedQuery()->format(IAST::FormatSettings(buf, false)); - } - else if (ast.getKind() == ASTExplainQuery::QueryPlan) - { - if (!dynamic_cast(ast.getExplainedQuery().get())) - throw Exception("Only SELECT is supported for EXPLAIN query", ErrorCodes::INCORRECT_QUERY); - - auto settings = checkAndGetSettings(ast.getSettings()); - QueryPlan plan; - - InterpreterSelectWithUnionQuery interpreter(ast.getExplainedQuery(), getContext(), SelectQueryOptions()); - interpreter.buildQueryPlan(plan); - - if (settings.optimize) - plan.optimize(QueryPlanOptimizationSettings::fromContext(getContext())); - - if (settings.json) + case ASTExplainQuery::ParsedAST: { - /// Add extra layers to make plan look more like from postgres. 
- auto plan_map = std::make_unique(); - plan_map->add("Plan", plan.explainPlan(settings.query_plan_options)); - auto plan_array = std::make_unique(); - plan_array->add(std::move(plan_map)); + if (ast.getSettings()) + throw Exception("Settings are not supported for EXPLAIN AST query.", ErrorCodes::UNKNOWN_SETTING); - auto format_settings = getFormatSettings(getContext()); - format_settings.json.quote_64bit_integers = false; - - JSONBuilder::FormatSettings json_format_settings{.settings = format_settings}; - JSONBuilder::FormatContext format_context{.out = buf}; - - plan_array->format(json_format_settings, format_context); - - single_line = true; + dumpAST(*ast.getExplainedQuery(), buf); + break; } - else - plan.explainPlan(buf, settings.query_plan_options); - } - else if (ast.getKind() == ASTExplainQuery::QueryPipeline) - { - if (dynamic_cast(ast.getExplainedQuery().get())) + case ASTExplainQuery::AnalyzedSyntax: { - auto settings = checkAndGetSettings(ast.getSettings()); + if (ast.getSettings()) + throw Exception("Settings are not supported for EXPLAIN SYNTAX query.", ErrorCodes::UNKNOWN_SETTING); + + ExplainAnalyzedSyntaxVisitor::Data data(getContext()); + ExplainAnalyzedSyntaxVisitor(data).visit(query); + + ast.getExplainedQuery()->format(IAST::FormatSettings(buf, false)); + break; + } + case ASTExplainQuery::QueryPlan: + { + if (!dynamic_cast(ast.getExplainedQuery().get())) + throw Exception("Only SELECT is supported for EXPLAIN query", ErrorCodes::INCORRECT_QUERY); + + auto settings = checkAndGetSettings(ast.getSettings()); QueryPlan plan; InterpreterSelectWithUnionQuery interpreter(ast.getExplainedQuery(), getContext(), SelectQueryOptions()); interpreter.buildQueryPlan(plan); - auto pipeline = plan.buildQueryPipeline( + + if (settings.optimize) + plan.optimize(QueryPlanOptimizationSettings::fromContext(getContext())); + + if (settings.json) + { + /// Add extra layers to make plan look more like from postgres. 
+ auto plan_map = std::make_unique(); + plan_map->add("Plan", plan.explainPlan(settings.query_plan_options)); + auto plan_array = std::make_unique(); + plan_array->add(std::move(plan_map)); + + auto format_settings = getFormatSettings(getContext()); + format_settings.json.quote_64bit_integers = false; + + JSONBuilder::FormatSettings json_format_settings{.settings = format_settings}; + JSONBuilder::FormatContext format_context{.out = buf}; + + plan_array->format(json_format_settings, format_context); + + single_line = true; + } + else + plan.explainPlan(buf, settings.query_plan_options); + break; + } + case ASTExplainQuery::QueryPipeline: + { + if (dynamic_cast(ast.getExplainedQuery().get())) + { + auto settings = checkAndGetSettings(ast.getSettings()); + QueryPlan plan; + + InterpreterSelectWithUnionQuery interpreter(ast.getExplainedQuery(), getContext(), SelectQueryOptions()); + interpreter.buildQueryPlan(plan); + auto pipeline = plan.buildQueryPipeline( + QueryPlanOptimizationSettings::fromContext(getContext()), + BuildQueryPipelineSettings::fromContext(getContext())); + + if (settings.graph) + { + /// Pipe holds QueryPlan, should not go out-of-scope + auto pipe = QueryPipelineBuilder::getPipe(std::move(*pipeline)); + const auto & processors = pipe.getProcessors(); + + if (settings.compact) + printPipelineCompact(processors, buf, settings.query_pipeline_options.header); + else + printPipeline(processors, buf); + } + else + { + plan.explainPipeline(buf, settings.query_pipeline_options); + } + } + else if (dynamic_cast(ast.getExplainedQuery().get())) + { + InterpreterInsertQuery insert(ast.getExplainedQuery(), getContext()); + auto io = insert.execute(); + printPipeline(io.pipeline.getProcessors(), buf); + } + else + throw Exception("Only SELECT and INSERT is supported for EXPLAIN PIPELINE query", ErrorCodes::INCORRECT_QUERY); + break; + } + case ASTExplainQuery::QueryEstimates: + { + if (!dynamic_cast(ast.getExplainedQuery().get())) + throw Exception("Only SELECT is supported for EXPLAIN ESTIMATE query", ErrorCodes::INCORRECT_QUERY); + + auto settings = checkAndGetSettings(ast.getSettings()); + QueryPlan plan; + + InterpreterSelectWithUnionQuery interpreter(ast.getExplainedQuery(), getContext(), SelectQueryOptions()); + interpreter.buildQueryPlan(plan); + // collect the selected marks, rows, parts during build query pipeline. 
+ plan.buildQueryPipeline( QueryPlanOptimizationSettings::fromContext(getContext()), BuildQueryPipelineSettings::fromContext(getContext())); - if (settings.graph) - { - /// Pipe holds QueryPlan, should not go out-of-scope - auto pipe = QueryPipelineBuilder::getPipe(std::move(*pipeline)); - const auto & processors = pipe.getProcessors(); - - if (settings.compact) - printPipelineCompact(processors, buf, settings.query_pipeline_options.header); - else - printPipeline(processors, buf); - } - else - { - plan.explainPipeline(buf, settings.query_pipeline_options); - } + if (settings.optimize) + plan.optimize(QueryPlanOptimizationSettings::fromContext(getContext())); + plan.explainEstimate(res_columns); + insert_buf = false; + break; } - else if (dynamic_cast(ast.getExplainedQuery().get())) + case ASTExplainQuery::TableOverride: { - InterpreterInsertQuery insert(ast.getExplainedQuery(), getContext()); - auto io = insert.execute(); - printPipeline(io.pipeline.getProcessors(), buf); + if (auto * table_function = ast.getTableFunction()->as(); !table_function || table_function->name != "mysql") + { + throw Exception(ErrorCodes::INCORRECT_QUERY, "EXPLAIN TABLE OVERRIDE is not supported for the {}() table function", table_function->name); + } + auto storage = getContext()->getQueryContext()->executeTableFunction(ast.getTableFunction()); + auto metadata_snapshot = storage->getInMemoryMetadata(); + TableOverrideAnalyzer::Result override_info; + TableOverrideAnalyzer override_analyzer(ast.getTableOverride()); + override_analyzer.analyze(metadata_snapshot, override_info); + override_info.appendTo(buf); + break; } - else - throw Exception("Only SELECT and INSERT is supported for EXPLAIN PIPELINE query", ErrorCodes::INCORRECT_QUERY); } - else if (ast.getKind() == ASTExplainQuery::QueryEstimates) - { - if (!dynamic_cast(ast.getExplainedQuery().get())) - throw Exception("Only SELECT is supported for EXPLAIN ESTIMATE query", ErrorCodes::INCORRECT_QUERY); - - auto settings = checkAndGetSettings(ast.getSettings()); - QueryPlan plan; - - InterpreterSelectWithUnionQuery interpreter(ast.getExplainedQuery(), getContext(), SelectQueryOptions()); - interpreter.buildQueryPlan(plan); - // collect the selected marks, rows, parts during build query pipeline. 
- plan.buildQueryPipeline( - QueryPlanOptimizationSettings::fromContext(getContext()), - BuildQueryPipelineSettings::fromContext(getContext())); - - if (settings.optimize) - plan.optimize(QueryPlanOptimizationSettings::fromContext(getContext())); - plan.explainEstimate(res_columns); - } - if (ast.getKind() != ASTExplainQuery::QueryEstimates) + if (insert_buf) { if (single_line) res_columns[0]->insertData(buf.str().data(), buf.str().size()); diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index 8677cf59d79..d340308122f 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -263,6 +263,10 @@ BlockIO InterpreterInsertQuery::execute() QueryPipelineBuilder pipeline; StoragePtr table = getTable(query); + StoragePtr inner_table; + if (const auto * mv = dynamic_cast(table.get())) + inner_table = mv->getTargetTable(); + if (query.partition_by && !table->supportsPartitionBy()) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "PARTITION BY clause is not supported by storage"); @@ -450,11 +454,8 @@ BlockIO InterpreterInsertQuery::execute() } res.pipeline.addStorageHolder(table); - if (const auto * mv = dynamic_cast(table.get())) - { - if (auto inner_table = mv->tryGetTargetTable()) - res.pipeline.addStorageHolder(inner_table); - } + if (inner_table) + res.pipeline.addStorageHolder(inner_table); return res; } @@ -466,7 +467,7 @@ StorageID InterpreterInsertQuery::getDatabaseTable() const } -void InterpreterInsertQuery::extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr &, ContextPtr context_) const +void InterpreterInsertQuery::extendQueryLogElemImpl(QueryLogElement & elem, ContextPtr context_) { elem.query_kind = "Insert"; const auto & insert_table = context_->getInsertionTable(); @@ -477,4 +478,9 @@ void InterpreterInsertQuery::extendQueryLogElemImpl(QueryLogElement & elem, cons } } +void InterpreterInsertQuery::extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr &, ContextPtr context_) const +{ + extendQueryLogElemImpl(elem, context_); +} + } diff --git a/src/Interpreters/InterpreterInsertQuery.h b/src/Interpreters/InterpreterInsertQuery.h index e5733a8c28b..93de92a0680 100644 --- a/src/Interpreters/InterpreterInsertQuery.h +++ b/src/Interpreters/InterpreterInsertQuery.h @@ -40,6 +40,7 @@ public: ThreadStatus * thread_status = nullptr, std::atomic_uint64_t * elapsed_counter_ms = nullptr); + static void extendQueryLogElemImpl(QueryLogElement & elem, ContextPtr context_); void extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr & ast, ContextPtr context_) const override; StoragePtr getTable(ASTInsertQuery & query); diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 960fddccb8c..123ff6ba2ca 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -250,6 +250,7 @@ BlockIO InterpreterSystemQuery::execute() } case Type::SUSPEND: { + getContext()->checkAccess(AccessType::SYSTEM_SHUTDOWN); auto command = fmt::format("kill -STOP {0} && sleep {1} && kill -CONT {0}", getpid(), query.seconds); LOG_DEBUG(log, "Will run {}", command); auto res = ShellCommand::execute(command); @@ -453,9 +454,11 @@ BlockIO InterpreterSystemQuery::execute() case Type::START_LISTEN_QUERIES: throw Exception(ErrorCodes::NOT_IMPLEMENTED, "{} is not supported yet", query.type); case Type::STOP_THREAD_FUZZER: + getContext()->checkAccess(AccessType::SYSTEM_THREAD_FUZZER); ThreadFuzzer::stop(); break; 
case Type::START_THREAD_FUZZER: + getContext()->checkAccess(AccessType::SYSTEM_THREAD_FUZZER); ThreadFuzzer::start(); break; default: @@ -469,7 +472,7 @@ void InterpreterSystemQuery::restoreReplica() { getContext()->checkAccess(AccessType::SYSTEM_RESTORE_REPLICA, table_id); - const zkutil::ZooKeeperPtr& zookeeper = getContext()->getZooKeeper(); + const zkutil::ZooKeeperPtr & zookeeper = getContext()->getZooKeeper(); if (zookeeper->expired()) throw Exception(ErrorCodes::NO_ZOOKEEPER, diff --git a/src/Interpreters/ProcessList.cpp b/src/Interpreters/ProcessList.cpp index a4583685a90..e7e52142fc8 100644 --- a/src/Interpreters/ProcessList.cpp +++ b/src/Interpreters/ProcessList.cpp @@ -86,6 +86,20 @@ ProcessList::EntryPtr ProcessList::insert(const String & query_, const IAST * as throw Exception("Too many simultaneous queries. Maximum: " + toString(max_size), ErrorCodes::TOO_MANY_SIMULTANEOUS_QUERIES); } + String query_kind{ast->getQueryKindString()}; + if (!is_unlimited_query) + { + auto amount = getQueryKindAmount(query_kind); + if (max_insert_queries_amount && query_kind == "Insert" && amount >= max_insert_queries_amount) + throw Exception(ErrorCodes::TOO_MANY_SIMULTANEOUS_QUERIES, + "Too many simultaneous insert queries. Maximum: {}, current: {}", + max_insert_queries_amount, amount); + if (max_select_queries_amount && query_kind == "Select" && amount >= max_select_queries_amount) + throw Exception(ErrorCodes::TOO_MANY_SIMULTANEOUS_QUERIES, + "Too many simultaneous select queries. Maximum: {}, current: {}", + max_select_queries_amount, amount); + } + { /** * `max_size` check above is controlled by `max_concurrent_queries` server setting and is a "hard" limit for how many @@ -176,7 +190,9 @@ ProcessList::EntryPtr ProcessList::insert(const String & query_, const IAST * as } auto process_it = processes.emplace(processes.end(), - query_context, query_, client_info, priorities.insert(settings.priority)); + query_context, query_, client_info, priorities.insert(settings.priority), query_kind); + + increaseQueryKindAmount(query_kind); res = std::make_shared(*this, process_it); @@ -242,6 +258,7 @@ ProcessListEntry::~ProcessListEntry() String user = it->getClientInfo().current_user; String query_id = it->getClientInfo().current_query_id; + String query_kind = it->query_kind; const QueryStatus * process_list_element_ptr = &*it; @@ -273,6 +290,9 @@ ProcessListEntry::~ProcessListEntry() LOG_ERROR(&Poco::Logger::get("ProcessList"), "Logical error: cannot find query by query_id and pointer to ProcessListElement in ProcessListForUser"); std::terminate(); } + + parent.decreaseQueryKindAmount(query_kind); + parent.have_space.notify_all(); /// If there are no more queries for the user, then we will reset memory tracker and network throttler. 
@@ -286,11 +306,12 @@ ProcessListEntry::~ProcessListEntry() QueryStatus::QueryStatus( - ContextPtr context_, const String & query_, const ClientInfo & client_info_, QueryPriorities::Handle && priority_handle_) + ContextPtr context_, const String & query_, const ClientInfo & client_info_, QueryPriorities::Handle && priority_handle_, const String & query_kind_) : WithContext(context_) , query(query_) , client_info(client_info_) , priority_handle(std::move(priority_handle_)) + , query_kind(query_kind_) { auto settings = getContext()->getSettings(); limits.max_execution_time = settings.max_execution_time; @@ -411,9 +432,8 @@ QueryStatusInfo QueryStatus::getInfo(bool get_thread_list, bool get_profile_even res.read_bytes = progress_in.read_bytes; res.total_rows = progress_in.total_rows_to_read; - /// TODO: Use written_rows and written_bytes when real time progress is implemented - res.written_rows = progress_out.read_rows; - res.written_bytes = progress_out.read_bytes; + res.written_rows = progress_out.written_rows; + res.written_bytes = progress_out.written_bytes; if (thread_group) { @@ -485,4 +505,33 @@ ProcessList::UserInfo ProcessList::getUserInfo(bool get_profile_events) const return per_user_infos; } +void ProcessList::increaseQueryKindAmount(const String & query_kind) +{ + auto found = query_kind_amounts.find(query_kind); + if (found == query_kind_amounts.end()) + query_kind_amounts[query_kind] = 1; + else + found->second += 1; +} + +void ProcessList::decreaseQueryKindAmount(const String & query_kind) +{ + auto found = query_kind_amounts.find(query_kind); + /// TODO: we could just rebuild the map, as we have saved all query_kind. + if (found == query_kind_amounts.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong query kind amount: decrease before increase on '{}'", query_kind); + else if (found->second == 0) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong query kind amount: decrease to negative on '{}'", query_kind, found->second); + else + found->second -= 1; + +} +ProcessList::QueryAmount ProcessList::getQueryKindAmount(const String & query_kind) +{ + auto found = query_kind_amounts.find(query_kind); + if (found == query_kind_amounts.end()) + return 0; + return found->second; +} + } diff --git a/src/Interpreters/ProcessList.h b/src/Interpreters/ProcessList.h index 9c826bde061..ada24c03275 100644 --- a/src/Interpreters/ProcessList.h +++ b/src/Interpreters/ProcessList.h @@ -94,7 +94,7 @@ protected: ExecutionSpeedLimits limits; OverflowMode overflow_mode; - QueryPriorities::Handle priority_handle; + QueryPriorities::Handle priority_handle = nullptr; std::atomic is_killed { false }; @@ -118,13 +118,17 @@ protected: ProcessListForUser * user_process_list = nullptr; + String query_kind; + public: QueryStatus( ContextPtr context_, const String & query_, const ClientInfo & client_info_, - QueryPriorities::Handle && priority_handle_); + QueryPriorities::Handle && priority_handle_, + const String & query_kind_ + ); ~QueryStatus(); @@ -256,6 +260,7 @@ class ProcessList public: using Element = QueryStatus; using Entry = ProcessListEntry; + using QueryAmount = UInt64; /// list, for iterators not to invalidate. NOTE: could replace with cyclic buffer, but not worth. using Container = std::list; @@ -265,6 +270,8 @@ public: /// User -> queries using UserToQueries = std::unordered_map; + using QueryKindToAmount = std::unordered_map; + protected: friend class ProcessListEntry; @@ -287,6 +294,19 @@ protected: /// Call under lock. Finds process with specified current_user and current_query_id. 
QueryStatus * tryGetProcessListElement(const String & current_query_id, const String & current_user); + /// limit for insert. 0 means no limit. Otherwise, when limit exceeded, an exception is thrown. + size_t max_insert_queries_amount = 0; + + /// limit for select. 0 means no limit. Otherwise, when limit exceeded, an exception is thrown. + size_t max_select_queries_amount = 0; + + /// amount of queries by query kind. + QueryKindToAmount query_kind_amounts; + + void increaseQueryKindAmount(const String & query_kind); + void decreaseQueryKindAmount(const String & query_kind); + QueryAmount getQueryKindAmount(const String & query_kind); + public: using EntryPtr = std::shared_ptr; @@ -312,6 +332,18 @@ public: max_size = max_size_; } + void setMaxInsertQueriesAmount(size_t max_insert_queries_amount_) + { + std::lock_guard lock(mutex); + max_insert_queries_amount = max_insert_queries_amount_; + } + + void setMaxSelectQueriesAmount(size_t max_select_queries_amount_) + { + std::lock_guard lock(mutex); + max_select_queries_amount = max_select_queries_amount_; + } + /// Try call cancel() for input and output streams of query with specified id and user CancellationCode sendCancelToQuery(const String & current_query_id, const String & current_user, bool kill = false); diff --git a/src/Interpreters/RowRefs.h b/src/Interpreters/RowRefs.h index 047146d569c..987fd197d9d 100644 --- a/src/Interpreters/RowRefs.h +++ b/src/Interpreters/RowRefs.h @@ -103,7 +103,7 @@ struct RowRefList : RowRef } } - bool ok() const { return first || (batch && position < batch->size); } + bool ok() const { return first || batch; } private: const RowRefList * root; diff --git a/src/Interpreters/TableOverrideUtils.cpp b/src/Interpreters/TableOverrideUtils.cpp new file mode 100644 index 00000000000..922dd6af25b --- /dev/null +++ b/src/Interpreters/TableOverrideUtils.cpp @@ -0,0 +1,174 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INVALID_TABLE_OVERRIDE; +} + +namespace +{ + +class MaybeNullableColumnsMatcher +{ +public: + using Visitor = ConstInDepthNodeVisitor; + using Data = RequiredSourceColumnsData; + + static bool needChildVisit(const ASTPtr & node, const ASTPtr & child) + { + if (const auto * f = node->as(); f && f->name == "assumeNotNull") + return false; + return RequiredSourceColumnsMatcher::needChildVisit(node, child); + } + + static void visit(const ASTPtr & ast, Data & data) + { + RequiredSourceColumnsMatcher::visit(ast, data); + } +}; + +using MaybeNullableColumnsVisitor = MaybeNullableColumnsMatcher::Visitor; + +} + +static void checkRequiredColumns(const IAST * ast, const NameToTypeMap & existing_types, NamesAndTypes & used_columns, const String & what, bool allow_nulls = false) +{ + if (!ast) + return; + RequiredSourceColumnsData columns_data; + RequiredSourceColumnsVisitor(columns_data).visit(ast->clone()); + auto required_columns = columns_data.requiredColumns(); + for (const auto & column : required_columns) + { + auto type = existing_types.find(column); + if (type == existing_types.end()) + throw Exception(ErrorCodes::INVALID_TABLE_OVERRIDE, "{} override refers to unknown column {}", what, backQuote(column)); + } + if (!allow_nulls) + { + RequiredSourceColumnsData nullable_data; + MaybeNullableColumnsVisitor(nullable_data).visit(ast->clone()); + for (const auto & column : nullable_data.requiredColumns()) + { + if 
(existing_types.find(column)->second->isNullable()) + throw Exception( + ErrorCodes::INVALID_TABLE_OVERRIDE, + "{} override refers to nullable column {} (use assumeNotNull() if the column does not in fact contain NULL values)", + what, + backQuote(column)); + } + } + for (const auto & col : required_columns) + { + used_columns.push_back({col, existing_types.find(col)->second}); + } +} + +void TableOverrideAnalyzer::analyze(const StorageInMemoryMetadata & metadata, Result & result) const +{ + for (const auto & column : metadata.columns) + result.existing_types[column.name] = column.type; + checkRequiredColumns(override->storage->order_by, result.existing_types, result.order_by_columns, "ORDER BY"); + checkRequiredColumns(override->storage->primary_key, result.existing_types, result.primary_key_columns, "PRIMARY KEY"); + checkRequiredColumns(override->storage->partition_by, result.existing_types, result.partition_by_columns, "PARTITION BY"); + checkRequiredColumns(override->storage->sample_by, result.existing_types, result.sample_by_columns, "SAMPLE BY"); + checkRequiredColumns(override->storage->ttl_table, result.existing_types, result.ttl_columns, "TTL"); + if (override->columns && override->columns->columns) + { + for (const auto & column_ast : override->columns->columns->children) + { + auto * override_column = column_ast->as(); + auto override_type = DataTypeFactory::instance().get(override_column->type); + auto found = metadata.columns.tryGetColumnOrSubcolumn(ColumnsDescription::GetFlags::All, override_column->name); + std::optional override_default_kind; + if (!override_column->default_specifier.empty()) + override_default_kind = columnDefaultKindFromString(override_column->default_specifier); + if (found) + { + std::optional existing_default_kind; + if (auto col_default = metadata.columns.getDefault(found->name)) + existing_default_kind = col_default->kind; + if (existing_default_kind != override_default_kind) + throw Exception(ErrorCodes::INVALID_TABLE_OVERRIDE, "column {}: modifying default specifier is not allowed", backQuote(override_column->name)); + result.modified_columns.push_back({found->name, override_type}); + /// TODO: validate that the original type can be converted to the overridden type + } + else + { + if (override_default_kind && *override_default_kind == ColumnDefaultKind::Alias) + result.added_columns.push_back({override_column->name, override_type}); + else + throw Exception(ErrorCodes::INVALID_TABLE_OVERRIDE, "column {}: can only add ALIAS columns", backQuote(override_column->name)); + } + /// TODO: validate default and materialized expressions (use checkRequiredColumns, allowing nulls) + } + } +} + +void TableOverrideAnalyzer::Result::appendTo(WriteBuffer & ostr) +{ + const auto & format_names = [&](const NamesAndTypes & names) -> String + { + WriteBufferFromOwnString buf; + bool first = true; + for (const auto & name : names) + { + if (!first) + buf << ", "; + first = false; + buf << backQuote(name.name) << " "; + auto old_type = existing_types.find(name.name); + if (old_type != existing_types.end() && old_type->second != name.type) + buf << old_type->second->getName() << " -> "; + buf << name.type->getName(); + } + return buf.str(); + }; + if (!modified_columns.empty()) + { + ostr << "Modified columns: " << format_names(modified_columns) << "\n"; + } + if (!added_columns.empty()) + { + ostr << "Added columns: " << format_names(added_columns) << "\n"; + } + if (!order_by_columns.empty()) + { + ostr << "ORDER BY uses columns: " << format_names(order_by_columns) 
<< "\n"; + } + if (!primary_key_columns.empty()) + { + ostr << "PRIMARY KEY uses columns: " << format_names(primary_key_columns) << "\n"; + } + if (!partition_by_columns.empty()) + { + ostr << "PARTITION BY uses columns: " << format_names(partition_by_columns) << "\n"; + } + if (!sample_by_columns.empty()) + { + ostr << "SAMPLE BY uses columns: " << format_names(sample_by_columns) << "\n"; + } + if (!ttl_columns.empty()) + { + ostr << "TTL uses columns: " << format_names(ttl_columns) << "\n"; + } +} + +} diff --git a/src/Interpreters/TableOverrideUtils.h b/src/Interpreters/TableOverrideUtils.h new file mode 100644 index 00000000000..810ffecd573 --- /dev/null +++ b/src/Interpreters/TableOverrideUtils.h @@ -0,0 +1,38 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +struct StorageInMemoryMetadata; + +using NameToTypeMap = std::map; + +struct TableOverrideAnalyzer +{ + struct Result + { + NameToTypeMap existing_types; + NamesAndTypes order_by_columns; + NamesAndTypes primary_key_columns; + NamesAndTypes partition_by_columns; + NamesAndTypes sample_by_columns; + NamesAndTypes ttl_columns; + NamesAndTypes added_columns; + NamesAndTypes modified_columns; + + void appendTo(WriteBuffer &); + }; + + ASTTableOverride * override; + + explicit TableOverrideAnalyzer(ASTPtr ast) : override(assert_cast(ast.get())) { } + + void analyze(const StorageInMemoryMetadata & metadata, Result & result) const; +}; + +} diff --git a/src/Interpreters/ThreadStatusExt.cpp b/src/Interpreters/ThreadStatusExt.cpp index fc6aa15a1e8..b3720b89eaa 100644 --- a/src/Interpreters/ThreadStatusExt.cpp +++ b/src/Interpreters/ThreadStatusExt.cpp @@ -24,12 +24,6 @@ # include #endif -namespace ProfileEvents -{ - extern const Event InsertedRows; - extern const Event InsertedBytes; -} - /// Implement some methods of ThreadStatus and CurrentThread here to avoid extra linking dependencies in clickhouse_common_io /// TODO It doesn't make sense. @@ -447,9 +441,8 @@ void ThreadStatus::logToQueryThreadLog(QueryThreadLog & thread_log, const String elem.read_rows = progress_in.read_rows.load(std::memory_order_relaxed); elem.read_bytes = progress_in.read_bytes.load(std::memory_order_relaxed); - /// TODO: Use written_rows and written_bytes when run time progress is implemented - elem.written_rows = progress_out.read_rows.load(std::memory_order_relaxed); - elem.written_bytes = progress_out.read_bytes.load(std::memory_order_relaxed); + elem.written_rows = progress_out.written_rows.load(std::memory_order_relaxed); + elem.written_bytes = progress_out.written_bytes.load(std::memory_order_relaxed); elem.memory_usage = memory_tracker.get(); elem.peak_memory_usage = memory_tracker.getPeak(); @@ -520,8 +513,8 @@ void ThreadStatus::logToQueryViewsLog(const ViewRuntimeData & vinfo) auto events = std::make_shared(performance_counters.getPartiallyAtomicSnapshot()); element.read_rows = progress_in.read_rows.load(std::memory_order_relaxed); element.read_bytes = progress_in.read_bytes.load(std::memory_order_relaxed); - element.written_rows = (*events)[ProfileEvents::InsertedRows]; - element.written_bytes = (*events)[ProfileEvents::InsertedBytes]; + element.written_rows = progress_out.written_rows.load(std::memory_order_relaxed); + element.written_bytes = progress_out.written_bytes.load(std::memory_order_relaxed); element.peak_memory_usage = memory_tracker.getPeak() > 0 ? 
memory_tracker.getPeak() : 0; if (query_context_ptr->getSettingsRef().log_profile_events != 0) { diff --git a/src/Interpreters/UserDefinedExecutableFunction.cpp b/src/Interpreters/UserDefinedExecutableFunction.cpp index d57978d0fd6..e5a852b0e75 100644 --- a/src/Interpreters/UserDefinedExecutableFunction.cpp +++ b/src/Interpreters/UserDefinedExecutableFunction.cpp @@ -13,14 +13,12 @@ namespace DB UserDefinedExecutableFunction::UserDefinedExecutableFunction( const UserDefinedExecutableFunctionConfiguration & configuration_, - const ExternalLoadableLifetime & lifetime_, - std::shared_ptr process_pool_) + std::shared_ptr coordinator_, + const ExternalLoadableLifetime & lifetime_) : configuration(configuration_) + , coordinator(std::move(coordinator_)) , lifetime(lifetime_) - , process_pool(process_pool_) { - if (!process_pool && configuration.type == UserDefinedExecutableFunctionType::executable_pool) - process_pool = std::make_shared(configuration.pool_size == 0 ? std::numeric_limits::max() : configuration.pool_size); } }; diff --git a/src/Interpreters/UserDefinedExecutableFunction.h b/src/Interpreters/UserDefinedExecutableFunction.h index 1cb1de47578..a4fad8ceb7b 100644 --- a/src/Interpreters/UserDefinedExecutableFunction.h +++ b/src/Interpreters/UserDefinedExecutableFunction.h @@ -10,26 +10,13 @@ namespace DB { -enum class UserDefinedExecutableFunctionType -{ - executable, - executable_pool -}; - struct UserDefinedExecutableFunctionConfiguration { - UserDefinedExecutableFunctionType type = UserDefinedExecutableFunctionType::executable; std::string name; - std::string script_path; - std::string format; + std::string command; + std::vector command_arguments; std::vector argument_types; DataTypePtr result_type; - /// Pool settings - size_t pool_size = 0; - size_t command_termination_timeout = 0; - size_t max_command_execution_time = 0; - /// Send number_of_rows\n before sending chunk to process - bool send_chunk_header = false; }; class UserDefinedExecutableFunction final : public IExternalLoadable @@ -38,8 +25,8 @@ public: UserDefinedExecutableFunction( const UserDefinedExecutableFunctionConfiguration & configuration_, - const ExternalLoadableLifetime & lifetime_, - std::shared_ptr process_pool_ = nullptr); + std::shared_ptr coordinator_, + const ExternalLoadableLifetime & lifetime_); const ExternalLoadableLifetime & getLifetime() const override { @@ -63,7 +50,7 @@ public: std::shared_ptr clone() const override { - return std::make_shared(configuration, lifetime, process_pool); + return std::make_shared(configuration, coordinator, lifetime); } const UserDefinedExecutableFunctionConfiguration & getConfiguration() const @@ -71,9 +58,9 @@ public: return configuration; } - std::shared_ptr getProcessPool() const + std::shared_ptr getCoordinator() const { - return process_pool; + return coordinator; } std::shared_ptr shared_from_this() @@ -87,13 +74,9 @@ public: } private: - UserDefinedExecutableFunction(const UserDefinedExecutableFunctionConfiguration & configuration_, - std::shared_ptr process_pool_, - const ExternalLoadableLifetime & lifetime_); - UserDefinedExecutableFunctionConfiguration configuration; + std::shared_ptr coordinator; ExternalLoadableLifetime lifetime; - std::shared_ptr process_pool; }; } diff --git a/src/Interpreters/UserDefinedExecutableFunctionFactory.cpp b/src/Interpreters/UserDefinedExecutableFunctionFactory.cpp index 4cb3e034b01..10cb806028e 100644 --- a/src/Interpreters/UserDefinedExecutableFunctionFactory.cpp +++ 
b/src/Interpreters/UserDefinedExecutableFunctionFactory.cpp @@ -1,8 +1,13 @@ #include "UserDefinedExecutableFunctionFactory.h" +#include + +#include + #include #include +#include #include #include @@ -19,7 +24,6 @@ namespace DB namespace ErrorCodes { extern const int UNSUPPORTED_METHOD; - extern const int TIMEOUT_EXCEEDED; } class UserDefinedFunction final : public IFunction @@ -52,10 +56,36 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override { + auto coordinator = executable_function->getCoordinator(); + const auto & coordinator_configuration = coordinator->getConfiguration(); const auto & configuration = executable_function->getConfiguration(); + + String command = configuration.command; + + if (coordinator_configuration.execute_direct) + { + auto user_scripts_path = context->getUserScriptsPath(); + auto script_path = user_scripts_path + '/' + command; + + if (!fileOrSymlinkPathStartsWith(script_path, user_scripts_path)) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Executable file {} must be inside user scripts folder {}", + command, + user_scripts_path); + + if (!std::filesystem::exists(std::filesystem::path(script_path))) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Executable file {} does not exist inside user scripts folder {}", + command, + user_scripts_path); + + command = std::move(script_path); + } + + size_t argument_size = arguments.size(); auto arguments_copy = arguments; - for (size_t i = 0; i < arguments.size(); ++i) + for (size_t i = 0; i < argument_size; ++i) { auto & column_with_type = arguments_copy[i]; column_with_type.column = column_with_type.column->convertToFullColumnIfConst(); @@ -68,56 +98,36 @@ public: column_with_type.column = castColumnAccurate(column_to_cast, argument_type); column_with_type.type = argument_type; - column_with_type = column_to_cast; + column_with_type = std::move(column_to_cast); } - std::unique_ptr process = getProcess(); - ColumnWithTypeAndName result(result_type, "result"); Block result_block({result}); Block arguments_block(arguments_copy); - auto * process_in = &process->in; - - auto process_pool = executable_function->getProcessPool(); - bool is_executable_pool_function = (process_pool != nullptr); + auto source = std::make_shared(std::move(arguments_block)); + auto shell_input_pipe = Pipe(std::move(source)); ShellCommandSourceConfiguration shell_command_source_configuration; - if (is_executable_pool_function) + if (coordinator_configuration.is_executable_pool) { shell_command_source_configuration.read_fixed_number_of_rows = true; shell_command_source_configuration.number_of_rows_to_read = input_rows_count; } - ShellCommandSource::SendDataTask task = {[process_in, arguments_block, &configuration, is_executable_pool_function, this]() - { - auto & out = *process_in; + Pipes shell_input_pipes; + shell_input_pipes.emplace_back(std::move(shell_input_pipe)); - if (configuration.send_chunk_header) - { - writeText(arguments_block.rows(), out); - writeChar('\n', out); - } - - auto output_format = context->getOutputFormat(configuration.format, out, arguments_block.cloneEmpty()); - formatBlock(output_format, arguments_block); - if (!is_executable_pool_function) - out.close(); - }}; - std::vector tasks = {std::move(task)}; - - Pipe pipe(std::make_unique( + Pipe pipe = coordinator->createPipe( + command, + configuration.command_arguments, + std::move(shell_input_pipes), + result_block, context, - configuration.format, - 
result_block.cloneEmpty(), - std::move(process), - std::move(tasks), - shell_command_source_configuration, - process_pool)); + shell_command_source_configuration); QueryPipeline pipeline(std::move(pipe)); - PullingPipelineExecutor executor(pipeline); auto result_column = result_type->createColumn(); @@ -133,8 +143,8 @@ public: size_t result_column_size = result_column->size(); if (result_column_size != input_rows_count) throw Exception(ErrorCodes::UNSUPPORTED_METHOD, - "Function {} wrong result rows count expected {} actual {}", - getName(), + "Function {}: wrong result, expected {} row(s), actual {}", + quoteString(getName()), input_rows_count, result_column_size); @@ -143,36 +153,6 @@ public: private: - std::unique_ptr getProcess() const - { - auto process_pool = executable_function->getProcessPool(); - auto executable_function_configuration = executable_function->getConfiguration(); - - std::unique_ptr process; - bool is_executable_pool_function = (process_pool != nullptr); - if (is_executable_pool_function) - { - bool result = process_pool->tryBorrowObject(process, [&]() - { - ShellCommand::Config process_config(executable_function_configuration.script_path); - process_config.terminate_in_destructor_strategy = ShellCommand::DestructorStrategy{ true /*terminate_in_destructor*/, executable_function_configuration.command_termination_timeout }; - auto shell_command = ShellCommand::execute(process_config); - return shell_command; - }, executable_function_configuration.max_command_execution_time * 1000); - - if (!result) - throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, - "Could not get process from pool, max command execution timeout exceeded {} seconds", - executable_function_configuration.max_command_execution_time); - } - else - { - process = ShellCommand::execute(executable_function_configuration.script_path); - } - - return process; - } - ExternalUserDefinedExecutableFunctionsLoader::UserDefinedExecutableFunctionPtr executable_function; ContextPtr context; }; diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index e177fd8e6b3..7dcfc4b95b3 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -556,9 +556,14 @@ static std::tuple executeQueryImpl( auto * insert_query = ast->as(); - if (insert_query && insert_query->table_id) - /// Resolve database before trying to use async insert feature - to properly hash the query. - insert_query->table_id = context->resolveStorageID(insert_query->table_id); + /// Resolve database before trying to use async insert feature - to properly hash the query. 
+ if (insert_query)
+ {
+ if (insert_query->table_id)
+ insert_query->table_id = context->resolveStorageID(insert_query->table_id);
+ else if (auto table = insert_query->getTable(); !table.empty())
+ insert_query->table_id = context->resolveStorageID(StorageID{insert_query->getDatabase(), table});
+ }
if (insert_query && insert_query->select)
{
@@ -579,8 +584,14 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
}
}
else
+ {
/// reset Input callbacks if query is not INSERT SELECT
context->resetInputCallbacks();
+ }
+
+ StreamLocalLimits limits;
+ std::shared_ptr<const EnabledQuota> quota;
+ std::unique_ptr<IInterpreter> interpreter;
auto * queue = context->getAsynchronousInsertQueue();
const bool async_insert = queue
@@ -591,65 +602,71 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
{
queue->push(ast, context);
- BlockIO io;
if (settings.wait_for_async_insert)
{
auto timeout = settings.wait_for_async_insert_timeout.totalMilliseconds();
auto query_id = context->getCurrentQueryId();
auto source = std::make_shared<WaitForAsyncInsertSource>(query_id, timeout, *queue);
- io.pipeline = QueryPipeline(Pipe(std::move(source)));
+ res.pipeline = QueryPipeline(Pipe(std::move(source)));
}
- return std::make_tuple(ast, std::move(io));
- }
-
- auto interpreter = InterpreterFactory::get(ast, context, SelectQueryOptions(stage).setInternal(internal));
-
- std::shared_ptr<const EnabledQuota> quota;
- if (!interpreter->ignoreQuota())
- {
quota = context->getQuota();
if (quota)
{
- if (ast->as<ASTSelectQuery>() || ast->as<ASTSelectWithUnionQuery>())
- {
- quota->used(QuotaType::QUERY_SELECTS, 1);
- }
- else if (ast->as<ASTInsertQuery>())
- {
- quota->used(QuotaType::QUERY_INSERTS, 1);
- }
+ quota->used(QuotaType::QUERY_INSERTS, 1);
quota->used(QuotaType::QUERIES, 1);
- quota->checkExceeded(QuotaType::ERRORS);
}
- }
- StreamLocalLimits limits;
- if (!interpreter->ignoreLimits())
- {
- limits.mode = LimitsMode::LIMITS_CURRENT; //-V1048
- limits.size_limits = SizeLimits(settings.max_result_rows, settings.max_result_bytes, settings.result_overflow_mode);
- }
-
- {
- std::unique_ptr<OpenTelemetrySpanHolder> span;
- if (context->query_trace_context.trace_id != UUID())
- {
- auto * raw_interpreter_ptr = interpreter.get();
- std::string class_name(abi::__cxa_demangle(typeid(*raw_interpreter_ptr).name(), nullptr, nullptr, nullptr));
- span = std::make_unique<OpenTelemetrySpanHolder>(class_name + "::execute()");
- }
- res = interpreter->execute();
- }
-
- QueryPipeline & pipeline = res.pipeline;
-
- if (const auto * insert_interpreter = typeid_cast<const InterpreterInsertQuery *>(&*interpreter))
- {
- /// Save insertion table (not table function). TODO: support remote() table function.
- auto table_id = insert_interpreter->getDatabaseTable();
+ const auto & table_id = insert_query->table_id;
if (!table_id.empty())
- context->setInsertionTable(std::move(table_id));
+ context->setInsertionTable(table_id);
+ }
+ else
+ {
+ interpreter = InterpreterFactory::get(ast, context, SelectQueryOptions(stage).setInternal(internal));
+
+ if (!interpreter->ignoreQuota())
+ {
+ quota = context->getQuota();
+ if (quota)
+ {
+ if (ast->as<ASTSelectQuery>() || ast->as<ASTSelectWithUnionQuery>())
+ {
+ quota->used(QuotaType::QUERY_SELECTS, 1);
+ }
+ else if (ast->as<ASTInsertQuery>())
+ {
+ quota->used(QuotaType::QUERY_INSERTS, 1);
+ }
+ quota->used(QuotaType::QUERIES, 1);
+ quota->checkExceeded(QuotaType::ERRORS);
+ }
+ }
+
+ if (!interpreter->ignoreLimits())
+ {
+ limits.mode = LimitsMode::LIMITS_CURRENT; //-V1048
+ limits.size_limits = SizeLimits(settings.max_result_rows, settings.max_result_bytes, settings.result_overflow_mode);
+ }
+
+ {
+ std::unique_ptr<OpenTelemetrySpanHolder> span;
+ if (context->query_trace_context.trace_id != UUID())
+ {
+ auto * raw_interpreter_ptr = interpreter.get();
+ std::string class_name(abi::__cxa_demangle(typeid(*raw_interpreter_ptr).name(), nullptr, nullptr, nullptr));
+ span = std::make_unique<OpenTelemetrySpanHolder>(class_name + "::execute()");
+ }
+ res = interpreter->execute();
+ }
+
+ if (const auto * insert_interpreter = typeid_cast<const InterpreterInsertQuery *>(&*interpreter))
+ {
+ /// Save insertion table (not table function). TODO: support remote() table function.
+ auto table_id = insert_interpreter->getDatabaseTable();
+ if (!table_id.empty())
+ context->setInsertionTable(std::move(table_id));
+ }
}
if (process_list_entry)
@@ -663,6 +680,8 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
/// Hold element of process list till end of query execution.
res.process_list_entry = process_list_entry;
+ auto & pipeline = res.pipeline;
+
if (pipeline.pulling() || pipeline.completed())
{
/// Limits on the result, the quota on the result, and also callback for progress.
@@ -712,7 +731,10 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
elem.query_views = info.views;
}
- interpreter->extendQueryLogElem(elem, ast, context, query_database, query_table);
+ if (async_insert)
+ InterpreterInsertQuery::extendQueryLogElemImpl(elem, context);
+ else if (interpreter)
+ interpreter->extendQueryLogElem(elem, ast, context, query_database, query_table);
if (settings.log_query_settings)
elem.query_settings = std::make_shared<Settings>(context->getSettingsRef());
@@ -819,8 +841,8 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
else /// will be used only for ordinary INSERT queries
{
auto progress_out = process_list_elem->getProgressOut();
- elem.result_rows = progress_out.read_rows;
- elem.result_bytes = progress_out.read_bytes;
+ elem.result_rows = progress_out.written_rows;
+ elem.result_bytes = progress_out.written_bytes;
}
if (elem.read_rows != 0)
diff --git a/src/Parsers/ASTCreateQuery.cpp b/src/Parsers/ASTCreateQuery.cpp
index 3e77bee19a9..e61a0f55142 100644
--- a/src/Parsers/ASTCreateQuery.cpp
+++ b/src/Parsers/ASTCreateQuery.cpp
@@ -359,7 +359,7 @@ void ASTCreateQuery::formatQueryImpl(const FormatSettings & settings, FormatStat
if (as_table_function)
{
- if (columns_list)
+ if (columns_list && !columns_list->empty())
{
frame.expression_list_always_start_on_new_line = true;
settings.ostr << (settings.one_line ? " (" : "\n(");
@@ -375,7 +375,7 @@ void ASTCreateQuery::formatQueryImpl(const FormatSettings & settings, FormatStat
frame.expression_list_always_start_on_new_line = true;
- if (columns_list && !as_table_function)
+ if (columns_list && !columns_list->empty() && !as_table_function)
{
settings.ostr << (settings.one_line ?
" (" : "\n("); FormatStateStacked frame_nested = frame; diff --git a/src/Parsers/ASTCreateQuery.h b/src/Parsers/ASTCreateQuery.h index 93fced7dba5..2e35731acad 100644 --- a/src/Parsers/ASTCreateQuery.h +++ b/src/Parsers/ASTCreateQuery.h @@ -50,6 +50,12 @@ public: ASTPtr clone() const override; void formatImpl(const FormatSettings & s, FormatState & state, FormatStateStacked frame) const override; + + bool empty() + { + return (!columns || columns->children.empty()) && (!indices || indices->children.empty()) && (!constraints || constraints->children.empty()) + && (!projections || projections->children.empty()); + } }; diff --git a/src/Parsers/ASTExplainQuery.h b/src/Parsers/ASTExplainQuery.h index 5c50a8cd82e..abed9803a7b 100644 --- a/src/Parsers/ASTExplainQuery.h +++ b/src/Parsers/ASTExplainQuery.h @@ -18,6 +18,7 @@ public: QueryPlan, /// 'EXPLAIN SELECT ...' QueryPipeline, /// 'EXPLAIN PIPELINE ...' QueryEstimates, /// 'EXPLAIN ESTIMATE ...' + TableOverride, /// 'EXPLAIN TABLE OVERRIDE ...' }; explicit ASTExplainQuery(ExplainKind kind_) : kind(kind_) {} @@ -45,8 +46,22 @@ public: ast_settings = std::move(settings_); } + void setTableFunction(ASTPtr table_function_) + { + children.emplace_back(table_function_); + table_function = std::move(table_function_); + } + + void setTableOverride(ASTPtr table_override_) + { + children.emplace_back(table_override_); + table_override = std::move(table_override_); + } + const ASTPtr & getExplainedQuery() const { return query; } const ASTPtr & getSettings() const { return ast_settings; } + const ASTPtr & getTableFunction() const { return table_function; } + const ASTPtr & getTableOverride() const { return table_override; } protected: void formatQueryImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override @@ -59,8 +74,21 @@ protected: ast_settings->formatImpl(settings, state, frame); } - settings.ostr << settings.nl_or_ws; - query->formatImpl(settings, state, frame); + if (query) + { + settings.ostr << settings.nl_or_ws; + query->formatImpl(settings, state, frame); + } + if (table_function) + { + settings.ostr << settings.nl_or_ws; + table_function->formatImpl(settings, state, frame); + } + if (table_override) + { + settings.ostr << settings.nl_or_ws; + table_override->formatImpl(settings, state, frame); + } } private: @@ -69,6 +97,10 @@ private: ASTPtr query; ASTPtr ast_settings; + /// Used by EXPLAIN TABLE OVERRIDE + ASTPtr table_function; + ASTPtr table_override; + static String toString(ExplainKind kind) { switch (kind) @@ -78,6 +110,7 @@ private: case QueryPlan: return "EXPLAIN"; case QueryPipeline: return "EXPLAIN PIPELINE"; case QueryEstimates: return "EXPLAIN ESTIMATE"; + case TableOverride: return "EXPLAIN TABLE OVERRIDE"; } __builtin_unreachable(); diff --git a/src/Parsers/ASTTableOverrides.cpp b/src/Parsers/ASTTableOverrides.cpp index d2625bf19b4..8fc21db218f 100644 --- a/src/Parsers/ASTTableOverrides.cpp +++ b/src/Parsers/ASTTableOverrides.cpp @@ -31,13 +31,19 @@ void ASTTableOverride::formatImpl(const FormatSettings & settings_, FormatState String hl_keyword = settings.hilite ? hilite_keyword : ""; String hl_none = settings.hilite ? 
hilite_none : ""; - settings.ostr << hl_keyword << "TABLE OVERRIDE " << hl_none; - ASTIdentifier(table_name).formatImpl(settings, state, frame); + if (is_standalone) + { + settings.ostr << hl_keyword << "TABLE OVERRIDE " << hl_none; + ASTIdentifier(table_name).formatImpl(settings, state, frame); + } if (!columns && (!storage || storage->children.empty())) return; auto override_frame = frame; - ++override_frame.indent; - settings.ostr << nl_or_ws << '(' << nl_or_nothing; + if (is_standalone) + { + ++override_frame.indent; + settings.ostr << nl_or_ws << '(' << nl_or_nothing; + } String indent_str = settings.one_line ? "" : String(4 * override_frame.indent, ' '); size_t override_elems = 0; if (columns) @@ -68,7 +74,8 @@ void ASTTableOverride::formatImpl(const FormatSettings & settings_, FormatState format_storage_elem(storage->ttl_table, "TTL"); } - settings.ostr << nl_or_nothing << ')'; + if (is_standalone) + settings.ostr << nl_or_nothing << ')'; } ASTPtr ASTTableOverrideList::clone() const diff --git a/src/Parsers/ASTTableOverrides.h b/src/Parsers/ASTTableOverrides.h index 62e96b16b01..c0603f7a8e0 100644 --- a/src/Parsers/ASTTableOverrides.h +++ b/src/Parsers/ASTTableOverrides.h @@ -15,7 +15,7 @@ class ASTStorage; /// Storage and column overrides for a single table, for example: /// -/// TABLE OVERRIDE `foo` PARTITION BY toYYYYMM(`createtime`) +/// TABLE OVERRIDE `foo` (PARTITION BY toYYYYMM(`createtime`)) /// class ASTTableOverride : public IAST { @@ -23,6 +23,7 @@ public: String table_name; ASTColumns * columns = nullptr; ASTStorage * storage = nullptr; + bool is_standalone = true; String getID(char) const override { return "TableOverride " + table_name; } ASTPtr clone() const override; void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; diff --git a/src/Parsers/CommonParsers.cpp b/src/Parsers/CommonParsers.cpp index bffba4aa773..275679d61f0 100644 --- a/src/Parsers/CommonParsers.cpp +++ b/src/Parsers/CommonParsers.cpp @@ -3,6 +3,7 @@ namespace DB { + bool ParserKeyword::parseImpl(Pos & pos, [[maybe_unused]] ASTPtr & node, Expected & expected) { if (pos->type != TokenType::BareWord) @@ -36,4 +37,5 @@ bool ParserKeyword::parseImpl(Pos & pos, [[maybe_unused]] ASTPtr & node, Expecte return true; } + } diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index 584c2a32afd..526b3aeb2bd 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -34,10 +35,10 @@ #include #include -#include "ASTColumnsMatcher.h" #include + namespace DB { @@ -273,15 +274,398 @@ bool ParserCompoundIdentifier::parseImpl(Pos & pos, ASTPtr & node, Expected & ex return true; } + +ASTPtr createFunctionCast(const ASTPtr & expr_ast, const ASTPtr & type_ast) +{ + /// Convert to canonical representation in functional form: CAST(expr, 'type') + auto type_literal = std::make_shared(queryToString(type_ast)); + + auto expr_list_args = std::make_shared(); + expr_list_args->children.push_back(expr_ast); + expr_list_args->children.push_back(std::move(type_literal)); + + auto func_node = std::make_shared(); + func_node->name = "CAST"; + func_node->arguments = std::move(expr_list_args); + func_node->children.push_back(func_node->arguments); + + return func_node; +} + + +namespace +{ + bool parseCastAs(IParser::Pos & pos, ASTPtr & node, Expected & expected) + { 
+ /// expr AS type + + ASTPtr expr_node; + ASTPtr type_node; + + if (ParserExpression().parse(pos, expr_node, expected)) + { + if (ParserKeyword("AS").ignore(pos, expected)) + { + if (ParserDataType().parse(pos, type_node, expected)) + { + node = createFunctionCast(expr_node, type_node); + return true; + } + } + else if (ParserToken(TokenType::Comma).ignore(pos, expected)) + { + if (ParserExpression().parse(pos, type_node, expected)) + { + node = makeASTFunction("CAST", expr_node, type_node); + return true; + } + } + } + + return false; + } + + bool parseSubstring(IParser::Pos & pos, ASTPtr & node, Expected & expected) + { + /// Either SUBSTRING(expr FROM start) or SUBSTRING(expr FROM start FOR length) or SUBSTRING(expr, start, length) + /// The latter will be parsed normally as a function later. + + ASTPtr expr_node; + ASTPtr start_node; + ASTPtr length_node; + + if (!ParserExpression().parse(pos, expr_node, expected)) + return false; + + if (pos->type != TokenType::Comma) + { + if (!ParserKeyword("FROM").ignore(pos, expected)) + return false; + } + else + { + ++pos; + } + + if (!ParserExpression().parse(pos, start_node, expected)) + return false; + + if (pos->type != TokenType::ClosingRoundBracket) + { + if (pos->type != TokenType::Comma) + { + if (!ParserKeyword("FOR").ignore(pos, expected)) + return false; + } + else + { + ++pos; + } + + if (!ParserExpression().parse(pos, length_node, expected)) + return false; + } + + /// Convert to canonical representation in functional form: SUBSTRING(expr, start, length) + if (length_node) + node = makeASTFunction("substring", expr_node, start_node, length_node); + else + node = makeASTFunction("substring", expr_node, start_node); + + return true; + } + + bool parseTrim(bool trim_left, bool trim_right, IParser::Pos & pos, ASTPtr & node, Expected & expected) + { + /// Handles all possible TRIM/LTRIM/RTRIM call variants + + std::string func_name; + bool char_override = false; + ASTPtr expr_node; + ASTPtr pattern_node; + ASTPtr to_remove; + + if (!trim_left && !trim_right) + { + if (ParserKeyword("BOTH").ignore(pos, expected)) + { + trim_left = true; + trim_right = true; + char_override = true; + } + else if (ParserKeyword("LEADING").ignore(pos, expected)) + { + trim_left = true; + char_override = true; + } + else if (ParserKeyword("TRAILING").ignore(pos, expected)) + { + trim_right = true; + char_override = true; + } + else + { + trim_left = true; + trim_right = true; + } + + if (char_override) + { + if (!ParserExpression().parse(pos, to_remove, expected)) + return false; + if (!ParserKeyword("FROM").ignore(pos, expected)) + return false; + + auto quote_meta_func_node = std::make_shared(); + auto quote_meta_list_args = std::make_shared(); + quote_meta_list_args->children = {to_remove}; + + quote_meta_func_node->name = "regexpQuoteMeta"; + quote_meta_func_node->arguments = std::move(quote_meta_list_args); + quote_meta_func_node->children.push_back(quote_meta_func_node->arguments); + + to_remove = std::move(quote_meta_func_node); + } + } + + if (!ParserExpression().parse(pos, expr_node, expected)) + return false; + + /// Convert to regexp replace function call + + if (char_override) + { + auto pattern_func_node = std::make_shared(); + auto pattern_list_args = std::make_shared(); + if (trim_left && trim_right) + { + pattern_list_args->children = { + std::make_shared("^["), + to_remove, + std::make_shared("]*|["), + to_remove, + std::make_shared("]*$") + }; + func_name = "replaceRegexpAll"; + } + else + { + if (trim_left) + { + 
pattern_list_args->children = { + std::make_shared("^["), + to_remove, + std::make_shared("]*") + }; + } + else + { + /// trim_right == false not possible + pattern_list_args->children = { + std::make_shared("["), + to_remove, + std::make_shared("]*$") + }; + } + func_name = "replaceRegexpOne"; + } + + pattern_func_node->name = "concat"; + pattern_func_node->arguments = std::move(pattern_list_args); + pattern_func_node->children.push_back(pattern_func_node->arguments); + + pattern_node = std::move(pattern_func_node); + } + else + { + if (trim_left && trim_right) + { + func_name = "trimBoth"; + } + else + { + if (trim_left) + { + func_name = "trimLeft"; + } + else + { + /// trim_right == false not possible + func_name = "trimRight"; + } + } + } + + if (char_override) + node = makeASTFunction(func_name, expr_node, pattern_node, std::make_shared("")); + else + node = makeASTFunction(func_name, expr_node); + return true; + } + + bool parseExtract(IParser::Pos & pos, ASTPtr & node, Expected & expected) + { + ASTPtr expr; + + IntervalKind interval_kind; + if (!parseIntervalKind(pos, expected, interval_kind)) + { + ASTPtr expr_list; + if (!ParserExpressionList(false, false).parse(pos, expr_list, expected)) + return false; + + auto res = std::make_shared(); + res->name = "extract"; + res->arguments = expr_list; + res->children.push_back(res->arguments); + node = std::move(res); + return true; + } + + ParserKeyword s_from("FROM"); + if (!s_from.ignore(pos, expected)) + return false; + + ParserExpression elem_parser; + if (!elem_parser.parse(pos, expr, expected)) + return false; + + node = makeASTFunction(interval_kind.toNameOfFunctionExtractTimePart(), expr); + return true; + } + + bool parsePosition(IParser::Pos & pos, ASTPtr & node, Expected & expected) + { + ASTPtr expr_list_node; + if (!ParserExpressionList(false, false).parse(pos, expr_list_node, expected)) + return false; + + ASTExpressionList * expr_list = typeid_cast(expr_list_node.get()); + if (expr_list && expr_list->children.size() == 1) + { + ASTFunction * func_in = typeid_cast(expr_list->children[0].get()); + if (func_in && func_in->name == "in") + { + ASTExpressionList * in_args = typeid_cast(func_in->arguments.get()); + if (in_args && in_args->children.size() == 2) + { + node = makeASTFunction("position", in_args->children[1], in_args->children[0]); + return true; + } + } + } + + auto res = std::make_shared(); + res->name = "position"; + res->arguments = expr_list_node; + res->children.push_back(res->arguments); + node = std::move(res); + return true; + } + + bool parseDateAdd(const char * function_name, IParser::Pos & pos, ASTPtr & node, Expected & expected) + { + ASTPtr timestamp_node; + ASTPtr offset_node; + + IntervalKind interval_kind; + ASTPtr interval_func_node; + if (parseIntervalKind(pos, expected, interval_kind)) + { + /// function(unit, offset, timestamp) + if (pos->type != TokenType::Comma) + return false; + ++pos; + + if (!ParserExpression().parse(pos, offset_node, expected)) + return false; + + if (pos->type != TokenType::Comma) + return false; + ++pos; + + if (!ParserExpression().parse(pos, timestamp_node, expected)) + return false; + auto interval_expr_list_args = std::make_shared(); + interval_expr_list_args->children = {offset_node}; + + interval_func_node = std::make_shared(); + interval_func_node->as().name = interval_kind.toNameOfFunctionToIntervalDataType(); + interval_func_node->as().arguments = std::move(interval_expr_list_args); + 
interval_func_node->as().children.push_back(interval_func_node->as().arguments); + } + else + { + ASTPtr expr_list; + if (!ParserExpressionList(false, false).parse(pos, expr_list, expected)) + return false; + + auto res = std::make_shared(); + res->name = function_name; + res->arguments = expr_list; + res->children.push_back(res->arguments); + node = std::move(res); + return true; + } + + node = makeASTFunction(function_name, timestamp_node, interval_func_node); + return true; + } + + bool parseDateDiff(IParser::Pos & pos, ASTPtr & node, Expected & expected) + { + ASTPtr left_node; + ASTPtr right_node; + + IntervalKind interval_kind; + if (!parseIntervalKind(pos, expected, interval_kind)) + { + ASTPtr expr_list; + if (!ParserExpressionList(false, false).parse(pos, expr_list, expected)) + return false; + + auto res = std::make_shared(); + res->name = "dateDiff"; + res->arguments = expr_list; + res->children.push_back(res->arguments); + node = std::move(res); + return true; + } + + if (pos->type != TokenType::Comma) + return false; + ++pos; + + if (!ParserExpression().parse(pos, left_node, expected)) + return false; + + if (pos->type != TokenType::Comma) + return false; + ++pos; + + if (!ParserExpression().parse(pos, right_node, expected)) + return false; + + node = makeASTFunction("dateDiff", std::make_shared(interval_kind.toDateDiffUnit()), left_node, right_node); + return true; + } + + bool parseExists(IParser::Pos & pos, ASTPtr & node, Expected & expected) + { + if (!ParserSelectWithUnionQuery().parse(pos, node, expected)) + return false; + + auto subquery = std::make_shared(); + subquery->children.push_back(node); + node = makeASTFunction("exists", subquery); + return true; + } +} + + bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { ParserIdentifier id_parser; - ParserKeyword distinct("DISTINCT"); - ParserKeyword all("ALL"); - ParserExpressionList contents(false, is_table_function); - ParserSelectWithUnionQuery select; - ParserKeyword filter("FILTER"); - ParserKeyword over("OVER"); bool has_all = false; bool has_distinct = false; @@ -304,9 +688,73 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) return false; ++pos; + /// Avoid excessive backtracking. + //pos.putBarrier(); + + /// Special cases for expressions that look like functions but contain some syntax sugar: + + /// CAST, EXTRACT, POSITION, EXISTS + /// DATE_ADD, DATEADD, TIMESTAMPADD, DATE_SUB, DATESUB, TIMESTAMPSUB, + /// DATE_DIFF, DATEDIFF, TIMESTAMPDIFF, TIMESTAMP_DIFF, + /// SUBSTRING, TRIM, LTRIM, RTRIM, POSITION + + /// Can be parsed as a composition of functions, but the contents must be unwrapped: + /// POSITION(x IN y) -> POSITION(in(x, y)) -> POSITION(y, x) + + /// Can be parsed as a function, but not always: + /// CAST(x AS type) - alias has to be unwrapped + /// CAST(x AS type(params)) + + /// Can be parsed as a function, but some identifier arguments have special meanings. 
+ /// DATE_ADD(MINUTE, x, y) -> addMinutes(x, y) + /// DATE_DIFF(MINUTE, x, y) + + /// Have keywords that have to processed explicitly: + /// EXTRACT(x FROM y) + /// TRIM(BOTH|LEADING|TRAILING x FROM y) + /// SUBSTRING(x FROM a) + /// SUBSTRING(x FROM a FOR b) + + String function_name = getIdentifierName(identifier); + String function_name_lowercase = Poco::toLower(function_name); + + std::optional parsed_special_function; + + if (function_name_lowercase == "cast") + parsed_special_function = parseCastAs(pos, node, expected); + else if (function_name_lowercase == "extract") + parsed_special_function = parseExtract(pos, node, expected); + else if (function_name_lowercase == "substring") + parsed_special_function = parseSubstring(pos, node, expected); + else if (function_name_lowercase == "position") + parsed_special_function = parsePosition(pos, node, expected); + else if (function_name_lowercase == "exists") + parsed_special_function = parseExists(pos, node, expected); + else if (function_name_lowercase == "trim") + parsed_special_function = parseTrim(false, false, pos, node, expected); + else if (function_name_lowercase == "ltrim") + parsed_special_function = parseTrim(true, false, pos, node, expected); + else if (function_name_lowercase == "rtrim") + parsed_special_function = parseTrim(false, true, pos, node, expected); + else if (function_name_lowercase == "dateadd" || function_name_lowercase == "date_add" + || function_name_lowercase == "timestampadd" || function_name_lowercase == "timestamp_add") + parsed_special_function = parseDateAdd("plus", pos, node, expected); + else if (function_name_lowercase == "datesub" || function_name_lowercase == "date_sub" + || function_name_lowercase == "timestampsub" || function_name_lowercase == "timestamp_sub") + parsed_special_function = parseDateAdd("minus", pos, node, expected); + else if (function_name_lowercase == "datediff" || function_name_lowercase == "date_diff" + || function_name_lowercase == "timestampdiff" || function_name_lowercase == "timestamp_diff") + parsed_special_function = parseDateDiff(pos, node, expected); + + if (parsed_special_function.has_value()) + return parsed_special_function.value() && ParserToken(TokenType::ClosingRoundBracket).ignore(pos); + auto pos_after_bracket = pos; auto old_expected = expected; + ParserKeyword all("ALL"); + ParserKeyword distinct("DISTINCT"); + if (all.ignore(pos, expected)) has_all = true; @@ -331,6 +779,8 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) } } + ParserExpressionList contents(false, is_table_function); + const char * contents_begin = pos->begin; if (!contents.parse(pos, expr_list_args, expected)) return false; @@ -345,7 +795,7 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) * If you do not report that the first option is an error, then the argument will be interpreted as 2014 - 01 - 01 - some number, * and the query silently returns an unexpected result. 
*/ - if (getIdentifierName(identifier) == "toDate" + if (function_name == "toDate" && contents_end - contents_begin == strlen("2014-01-01") && contents_begin[0] >= '2' && contents_begin[0] <= '3' && contents_begin[1] >= '0' && contents_begin[1] <= '9' @@ -362,26 +812,6 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) throw Exception("Argument of function toDate is unquoted: toDate(" + contents_str + "), must be: toDate('" + contents_str + "')" , ErrorCodes::SYNTAX_ERROR); } - else if (Poco::toLower(getIdentifierName(identifier)) == "position") - { - /// POSITION(needle IN haystack) is equivalent to function position(haystack, needle) - if (const auto * list = expr_list_args->as()) - { - if (list->children.size() == 1) - { - if (const auto * in_func = list->children[0]->as()) - { - if (in_func->name == "in") - { - // switch the two arguments - const auto & arg_list = in_func->arguments->as(); - if (arg_list.children.size() == 2) - expr_list_args->children = {arg_list.children[1], arg_list.children[0]}; - } - } - } - } - } /// The parametric aggregate function has two lists (parameters and arguments) in parentheses. Example: quantile(0.9)(x). if (allow_function_parameters && pos->type == TokenType::OpeningRoundBracket) @@ -445,6 +875,9 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) function_node->children.push_back(function_node->parameters); } + ParserKeyword filter("FILTER"); + ParserKeyword over("OVER"); + if (filter.ignore(pos, expected)) { // We are slightly breaking the parser interface by parsing the window @@ -455,9 +888,7 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) ParserFilterClause filter_parser; if (!filter_parser.parse(pos, function_node_as_iast, expected)) - { return false; - } } if (over.ignore(pos, expected)) @@ -468,9 +899,7 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) ParserWindowReference window_reference; if (!window_reference.parse(pos, function_node_as_iast, expected)) - { return false; - } } node = function_node; @@ -877,22 +1306,6 @@ bool ParserCodec::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) return true; } -ASTPtr createFunctionCast(const ASTPtr & expr_ast, const ASTPtr & type_ast) -{ - /// Convert to canonical representation in functional form: CAST(expr, 'type') - auto type_literal = std::make_shared(queryToString(type_ast)); - - auto expr_list_args = std::make_shared(); - expr_list_args->children.push_back(expr_ast); - expr_list_args->children.push_back(std::move(type_literal)); - - auto func_node = std::make_shared(); - func_node->name = "CAST"; - func_node->arguments = std::move(expr_list_args); - func_node->children.push_back(func_node->arguments); - - return func_node; -} template static bool isOneOf(TokenType token) @@ -1005,509 +1418,6 @@ bool ParserCastOperator::parseImpl(Pos & pos, ASTPtr & node, Expected & expected return false; } -bool ParserCastAsExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) -{ - /// Either CAST(expr AS type) or CAST(expr, 'type') - /// The latter will be parsed normally as a function later. 
- - ASTPtr expr_node; - ASTPtr type_node; - - if (ParserKeyword("CAST").ignore(pos, expected) - && ParserToken(TokenType::OpeningRoundBracket).ignore(pos, expected) - && ParserExpression().parse(pos, expr_node, expected) - && ParserKeyword("AS").ignore(pos, expected) - && ParserDataType().parse(pos, type_node, expected) - && ParserToken(TokenType::ClosingRoundBracket).ignore(pos, expected)) - { - node = createFunctionCast(expr_node, type_node); - return true; - } - - return false; -} - -bool ParserSubstringExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) -{ - /// Either SUBSTRING(expr FROM start) or SUBSTRING(expr FROM start FOR length) or SUBSTRING(expr, start, length) - /// The latter will be parsed normally as a function later. - - ASTPtr expr_node; - ASTPtr start_node; - ASTPtr length_node; - - if (!ParserKeyword("SUBSTRING").ignore(pos, expected)) - return false; - - if (pos->type != TokenType::OpeningRoundBracket) - return false; - ++pos; - - if (!ParserExpression().parse(pos, expr_node, expected)) - return false; - - if (pos->type != TokenType::Comma) - { - if (!ParserKeyword("FROM").ignore(pos, expected)) - return false; - } - else - { - ++pos; - } - - if (!ParserExpression().parse(pos, start_node, expected)) - return false; - - if (pos->type == TokenType::ClosingRoundBracket) - { - ++pos; - } - else - { - if (pos->type != TokenType::Comma) - { - if (!ParserKeyword("FOR").ignore(pos, expected)) - return false; - } - else - { - ++pos; - } - - if (!ParserExpression().parse(pos, length_node, expected)) - return false; - - ParserToken(TokenType::ClosingRoundBracket).ignore(pos, expected); - } - - /// Convert to canonical representation in functional form: SUBSTRING(expr, start, length) - - auto expr_list_args = std::make_shared(); - expr_list_args->children = {expr_node, start_node}; - - if (length_node) - expr_list_args->children.push_back(length_node); - - auto func_node = std::make_shared(); - func_node->name = "substring"; - func_node->arguments = std::move(expr_list_args); - func_node->children.push_back(func_node->arguments); - - node = std::move(func_node); - return true; -} - -bool ParserTrimExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) -{ - /// Handles all possible TRIM/LTRIM/RTRIM call variants - - std::string func_name; - bool trim_left = false; - bool trim_right = false; - bool char_override = false; - ASTPtr expr_node; - ASTPtr pattern_node; - ASTPtr to_remove; - - if (ParserKeyword("LTRIM").ignore(pos, expected)) - { - if (pos->type != TokenType::OpeningRoundBracket) - return false; - ++pos; - trim_left = true; - } - else if (ParserKeyword("RTRIM").ignore(pos, expected)) - { - if (pos->type != TokenType::OpeningRoundBracket) - return false; - ++pos; - trim_right = true; - } - else if (ParserKeyword("TRIM").ignore(pos, expected)) - { - if (pos->type != TokenType::OpeningRoundBracket) - return false; - ++pos; - - if (ParserKeyword("BOTH").ignore(pos, expected)) - { - trim_left = true; - trim_right = true; - char_override = true; - } - else if (ParserKeyword("LEADING").ignore(pos, expected)) - { - trim_left = true; - char_override = true; - } - else if (ParserKeyword("TRAILING").ignore(pos, expected)) - { - trim_right = true; - char_override = true; - } - else - { - trim_left = true; - trim_right = true; - } - - if (char_override) - { - if (!ParserExpression().parse(pos, to_remove, expected)) - return false; - if (!ParserKeyword("FROM").ignore(pos, expected)) - return false; - - auto quote_meta_func_node = std::make_shared(); - auto 
quote_meta_list_args = std::make_shared(); - quote_meta_list_args->children = {to_remove}; - - quote_meta_func_node->name = "regexpQuoteMeta"; - quote_meta_func_node->arguments = std::move(quote_meta_list_args); - quote_meta_func_node->children.push_back(quote_meta_func_node->arguments); - - to_remove = std::move(quote_meta_func_node); - } - } - - if (!(trim_left || trim_right)) - return false; - - if (!ParserExpression().parse(pos, expr_node, expected)) - return false; - - if (pos->type != TokenType::ClosingRoundBracket) - return false; - ++pos; - - /// Convert to regexp replace function call - - if (char_override) - { - auto pattern_func_node = std::make_shared(); - auto pattern_list_args = std::make_shared(); - if (trim_left && trim_right) - { - pattern_list_args->children = { - std::make_shared("^["), - to_remove, - std::make_shared("]*|["), - to_remove, - std::make_shared("]*$") - }; - func_name = "replaceRegexpAll"; - } - else - { - if (trim_left) - { - pattern_list_args->children = { - std::make_shared("^["), - to_remove, - std::make_shared("]*") - }; - } - else - { - /// trim_right == false not possible - pattern_list_args->children = { - std::make_shared("["), - to_remove, - std::make_shared("]*$") - }; - } - func_name = "replaceRegexpOne"; - } - - pattern_func_node->name = "concat"; - pattern_func_node->arguments = std::move(pattern_list_args); - pattern_func_node->children.push_back(pattern_func_node->arguments); - - pattern_node = std::move(pattern_func_node); - } - else - { - if (trim_left && trim_right) - { - func_name = "trimBoth"; - } - else - { - if (trim_left) - { - func_name = "trimLeft"; - } - else - { - /// trim_right == false not possible - func_name = "trimRight"; - } - } - } - - auto expr_list_args = std::make_shared(); - if (char_override) - expr_list_args->children = {expr_node, pattern_node, std::make_shared("")}; - else - expr_list_args->children = {expr_node}; - - auto func_node = std::make_shared(); - func_node->name = func_name; - func_node->arguments = std::move(expr_list_args); - func_node->children.push_back(func_node->arguments); - - node = std::move(func_node); - return true; -} - -bool ParserLeftExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) -{ - /// Rewrites left(expr, length) to SUBSTRING(expr, 1, length) - - ASTPtr expr_node; - ASTPtr start_node; - ASTPtr length_node; - - if (!ParserKeyword("LEFT").ignore(pos, expected)) - return false; - - if (pos->type != TokenType::OpeningRoundBracket) - return false; - ++pos; - - if (!ParserExpression().parse(pos, expr_node, expected)) - return false; - - ParserToken(TokenType::Comma).ignore(pos, expected); - - if (!ParserExpression().parse(pos, length_node, expected)) - return false; - - if (pos->type != TokenType::ClosingRoundBracket) - return false; - ++pos; - - auto expr_list_args = std::make_shared(); - start_node = std::make_shared(1); - expr_list_args->children = {expr_node, start_node, length_node}; - - auto func_node = std::make_shared(); - func_node->name = "substring"; - func_node->arguments = std::move(expr_list_args); - func_node->children.push_back(func_node->arguments); - - node = std::move(func_node); - return true; -} - -bool ParserRightExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) -{ - /// Rewrites RIGHT(expr, length) to substring(expr, -length) - - ASTPtr expr_node; - ASTPtr length_node; - - if (!ParserKeyword("RIGHT").ignore(pos, expected)) - return false; - - if (pos->type != TokenType::OpeningRoundBracket) - return false; - ++pos; - - if 
(!ParserExpression().parse(pos, expr_node, expected)) - return false; - - ParserToken(TokenType::Comma).ignore(pos, expected); - - if (!ParserExpression().parse(pos, length_node, expected)) - return false; - - if (pos->type != TokenType::ClosingRoundBracket) - return false; - ++pos; - - auto start_expr_list_args = std::make_shared(); - start_expr_list_args->children = {length_node}; - - auto start_node = std::make_shared(); - start_node->name = "negate"; - start_node->arguments = std::move(start_expr_list_args); - start_node->children.push_back(start_node->arguments); - - auto expr_list_args = std::make_shared(); - expr_list_args->children = {expr_node, start_node}; - - auto func_node = std::make_shared(); - func_node->name = "substring"; - func_node->arguments = std::move(expr_list_args); - func_node->children.push_back(func_node->arguments); - - node = std::move(func_node); - return true; -} - -bool ParserExtractExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) -{ - if (!ParserKeyword("EXTRACT").ignore(pos, expected)) - return false; - - if (pos->type != TokenType::OpeningRoundBracket) - return false; - ++pos; - - ASTPtr expr; - - IntervalKind interval_kind; - if (!parseIntervalKind(pos, expected, interval_kind)) - return false; - - ParserKeyword s_from("FROM"); - if (!s_from.ignore(pos, expected)) - return false; - - ParserExpression elem_parser; - if (!elem_parser.parse(pos, expr, expected)) - return false; - - if (pos->type != TokenType::ClosingRoundBracket) - return false; - ++pos; - - auto function = std::make_shared(); - auto exp_list = std::make_shared(); - function->name = interval_kind.toNameOfFunctionExtractTimePart(); - function->arguments = exp_list; - function->children.push_back(exp_list); - exp_list->children.push_back(expr); - node = function; - - return true; -} - -bool ParserDateAddExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) -{ - const char * function_name = nullptr; - ASTPtr timestamp_node; - ASTPtr offset_node; - - if (ParserKeyword("DATEADD").ignore(pos, expected) || ParserKeyword("DATE_ADD").ignore(pos, expected) - || ParserKeyword("TIMESTAMPADD").ignore(pos, expected) || ParserKeyword("TIMESTAMP_ADD").ignore(pos, expected)) - function_name = "plus"; - else if (ParserKeyword("DATESUB").ignore(pos, expected) || ParserKeyword("DATE_SUB").ignore(pos, expected) - || ParserKeyword("TIMESTAMPSUB").ignore(pos, expected) || ParserKeyword("TIMESTAMP_SUB").ignore(pos, expected)) - function_name = "minus"; - else - return false; - - if (pos->type != TokenType::OpeningRoundBracket) - return false; - ++pos; - - IntervalKind interval_kind; - ASTPtr interval_func_node; - if (parseIntervalKind(pos, expected, interval_kind)) - { - /// function(unit, offset, timestamp) - if (pos->type != TokenType::Comma) - return false; - ++pos; - - if (!ParserExpression().parse(pos, offset_node, expected)) - return false; - - if (pos->type != TokenType::Comma) - return false; - ++pos; - - if (!ParserExpression().parse(pos, timestamp_node, expected)) - return false; - auto interval_expr_list_args = std::make_shared(); - interval_expr_list_args->children = {offset_node}; - - interval_func_node = std::make_shared(); - interval_func_node->as().name = interval_kind.toNameOfFunctionToIntervalDataType(); - interval_func_node->as().arguments = std::move(interval_expr_list_args); - interval_func_node->as().children.push_back(interval_func_node->as().arguments); - } - else - { - /// function(timestamp, INTERVAL offset unit) - if (!ParserExpression().parse(pos, 
timestamp_node, expected)) - return false; - - if (pos->type != TokenType::Comma) - return false; - ++pos; - - if (!ParserIntervalOperatorExpression{}.parse(pos, interval_func_node, expected)) - return false; - } - if (pos->type != TokenType::ClosingRoundBracket) - return false; - ++pos; - - auto expr_list_args = std::make_shared(); - expr_list_args->children = {timestamp_node, interval_func_node}; - - auto func_node = std::make_shared(); - func_node->name = function_name; - func_node->arguments = std::move(expr_list_args); - func_node->children.push_back(func_node->arguments); - - node = std::move(func_node); - - return true; -} - -bool ParserDateDiffExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) -{ - ASTPtr left_node; - ASTPtr right_node; - - if (!(ParserKeyword("DATEDIFF").ignore(pos, expected) || ParserKeyword("DATE_DIFF").ignore(pos, expected) - || ParserKeyword("TIMESTAMPDIFF").ignore(pos, expected) || ParserKeyword("TIMESTAMP_DIFF").ignore(pos, expected))) - return false; - - if (pos->type != TokenType::OpeningRoundBracket) - return false; - ++pos; - - IntervalKind interval_kind; - if (!parseIntervalKind(pos, expected, interval_kind)) - return false; - - if (pos->type != TokenType::Comma) - return false; - ++pos; - - if (!ParserExpression().parse(pos, left_node, expected)) - return false; - - if (pos->type != TokenType::Comma) - return false; - ++pos; - - if (!ParserExpression().parse(pos, right_node, expected)) - return false; - - if (pos->type != TokenType::ClosingRoundBracket) - return false; - ++pos; - - auto expr_list_args = std::make_shared(); - expr_list_args->children = {std::make_shared(interval_kind.toDateDiffUnit()), left_node, right_node}; - - auto func_node = std::make_shared(); - func_node->name = "dateDiff"; - func_node->arguments = std::move(expr_list_args); - func_node->children.push_back(func_node->arguments); - - node = std::move(func_node); - - return true; -} - bool ParserNull::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { @@ -2246,16 +2156,6 @@ bool ParserMySQLGlobalVariable::parseImpl(Pos & pos, ASTPtr & node, Expected & e return true; } -bool ParserExistsExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) -{ - if (ParserKeyword("EXISTS").ignore(pos, expected) && ParserSubquery().parse(pos, node, expected)) - { - node = makeASTFunction("exists", node); - return true; - } - return false; -} - bool ParserExpressionElement::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { @@ -2266,20 +2166,11 @@ bool ParserExpressionElement::parseImpl(Pos & pos, ASTPtr & node, Expected & exp || ParserArrayOfLiterals().parse(pos, node, expected) || ParserArray().parse(pos, node, expected) || ParserLiteral().parse(pos, node, expected) - || ParserCastAsExpression().parse(pos, node, expected) - || ParserExtractExpression().parse(pos, node, expected) - || ParserDateAddExpression().parse(pos, node, expected) - || ParserDateDiffExpression().parse(pos, node, expected) - || ParserSubstringExpression().parse(pos, node, expected) - || ParserTrimExpression().parse(pos, node, expected) - || ParserLeftExpression().parse(pos, node, expected) - || ParserRightExpression().parse(pos, node, expected) || ParserCase().parse(pos, node, expected) || ParserColumnsMatcher().parse(pos, node, expected) /// before ParserFunction because it can be also parsed as a function. 
|| ParserFunction().parse(pos, node, expected) || ParserQualifiedAsterisk().parse(pos, node, expected) || ParserAsterisk().parse(pos, node, expected) - || ParserExistsExpression().parse(pos, node, expected) || ParserCompoundIdentifier(false, true).parse(pos, node, expected) || ParserSubstitution().parse(pos, node, expected) || ParserMySQLGlobalVariable().parse(pos, node, expected); diff --git a/src/Parsers/ExpressionElementParsers.h b/src/Parsers/ExpressionElementParsers.h index 5dff2e026be..c86721dca18 100644 --- a/src/Parsers/ExpressionElementParsers.h +++ b/src/Parsers/ExpressionElementParsers.h @@ -228,63 +228,6 @@ protected: bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; -ASTPtr createFunctionCast(const ASTPtr & expr_ast, const ASTPtr & type_ast); -class ParserCastAsExpression : public IParserBase -{ -protected: - const char * getName() const override { return "CAST AS expression"; } - bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; -}; - -class ParserSubstringExpression : public IParserBase -{ -protected: - const char * getName() const override { return "SUBSTRING expression"; } - bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; -}; - -class ParserTrimExpression : public IParserBase -{ -protected: - const char * getName() const override { return "TRIM expression"; } - bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; -}; - -class ParserLeftExpression : public IParserBase -{ -protected: - const char * getName() const override { return "LEFT expression"; } - bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; -}; - -class ParserRightExpression : public IParserBase -{ -protected: - const char * getName() const override { return "RIGHT expression"; } - bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; -}; - -class ParserExtractExpression : public IParserBase -{ -protected: - const char * getName() const override { return "EXTRACT expression"; } - bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; -}; - -class ParserDateAddExpression : public IParserBase -{ -protected: - const char * getName() const override { return "DATE_ADD expression"; } - bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; -}; - -class ParserDateDiffExpression : public IParserBase -{ -protected: - const char * getName() const override { return "DATE_DIFF expression"; } - bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; -}; - /** NULL literal. */ class ParserNull : public IParserBase @@ -333,17 +276,6 @@ protected: }; -/** - * Parse query with EXISTS expression. - */ -class ParserExistsExpression : public IParserBase -{ -protected: - const char * getName() const override { return "exists expression"; } - bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; -}; - - /** An array or tuple of literals. * Arrays can also be parsed as an application of [] operator and tuples as an application of 'tuple' function. * But parsing the whole array/tuple as a whole constant seriously speeds up the analysis of expressions in the case of very large collection. 
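Note on the parser consolidation above: the standalone Parser*Expression classes removed from this header are superseded by the name-based dispatch at the top of ParserFunction::parseImpl. The queries below are illustrative ClickHouse SQL exercising those special syntaxes; the rewrites shown in the trailing comments are approximate, based on the conversion comments and removed code in this patch, and the table/column names are made up:

    SELECT CAST(1 AS UInt8);                      -- canonical functional form: CAST(1, 'UInt8')
    SELECT EXTRACT(MINUTE FROM now());            -- extract-time-part function for the interval kind
    SELECT SUBSTRING('clickhouse' FROM 2 FOR 4);  -- substring('clickhouse', 2, 4)
    SELECT POSITION('house' IN 'clickhouse');     -- position('clickhouse', 'house'), arguments swapped
    SELECT TRIM(BOTH 'x' FROM 'xxabcxx');         -- replaceRegexpAll with a regexpQuoteMeta-escaped pattern
    SELECT DATE_ADD(MINUTE, 10, now());           -- plus(now(), toIntervalMinute(10))
    SELECT DATE_DIFF(MINUTE, started_at, finished_at) FROM runs;  -- dateDiff('minute', started_at, finished_at)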
@@ -535,4 +467,6 @@ protected: bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; +ASTPtr createFunctionCast(const ASTPtr & expr_ast, const ASTPtr & type_ast); + } diff --git a/src/Parsers/ExpressionListParsers.cpp b/src/Parsers/ExpressionListParsers.cpp index 680d3f6031b..96c1bad75c2 100644 --- a/src/Parsers/ExpressionListParsers.cpp +++ b/src/Parsers/ExpressionListParsers.cpp @@ -689,7 +689,7 @@ bool ParserUnaryExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expec bool ParserCastExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { ASTPtr expr_ast; - if (!elem_parser.parse(pos, expr_ast, expected)) + if (!ParserExpressionElement().parse(pos, expr_ast, expected)) return false; ASTPtr type_ast; diff --git a/src/Parsers/ExpressionListParsers.h b/src/Parsers/ExpressionListParsers.h index e9389f15bbb..a035d4a2ef0 100644 --- a/src/Parsers/ExpressionListParsers.h +++ b/src/Parsers/ExpressionListParsers.h @@ -203,9 +203,6 @@ protected: /// Example: "[1, 1 + 1, 1 + 2]::Array(UInt8)" class ParserCastExpression : public IParserBase { -private: - ParserExpressionElement elem_parser; - protected: const char * getName() const override { return "CAST expression"; } diff --git a/src/Parsers/IParser.h b/src/Parsers/IParser.h index 64f117c707f..4e6dbca15a6 100644 --- a/src/Parsers/IParser.h +++ b/src/Parsers/IParser.h @@ -60,7 +60,9 @@ public: uint32_t depth = 0; uint32_t max_depth = 0; - Pos(Tokens & tokens_, uint32_t max_depth_) : TokenIterator(tokens_), max_depth(max_depth_) {} + Pos(Tokens & tokens_, uint32_t max_depth_) : TokenIterator(tokens_), max_depth(max_depth_) + { + } ALWAYS_INLINE void increaseDepth() { diff --git a/src/Parsers/IParserBase.h b/src/Parsers/IParserBase.h index ce08bdef790..6fd195da40d 100644 --- a/src/Parsers/IParserBase.h +++ b/src/Parsers/IParserBase.h @@ -17,7 +17,7 @@ public: Pos begin = pos; bool res = func(); if (!res) - pos = begin; + pos = begin; return res; } @@ -31,7 +31,7 @@ public: bool res = func(); pos.decreaseDepth(); if (!res) - pos = begin; + pos = begin; return res; } diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp index dbbea986404..6d295a0d516 100644 --- a/src/Parsers/ParserCreateQuery.cpp +++ b/src/Parsers/ParserCreateQuery.cpp @@ -557,34 +557,43 @@ bool ParserCreateTableQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expe } } } + /** Create queries without list of columns: + * - CREATE|ATTACH TABLE ... AS ... + * - CREATE|ATTACH TABLE ... ENGINE = engine + */ else { storage_p.parse(pos, storage, expected); - if (!s_as.ignore(pos, expected)) - return false; - - if (!select_p.parse(pos, select, expected)) /// AS SELECT ... + /// CREATE|ATTACH TABLE ... AS ... + if (s_as.ignore(pos, expected)) { - /// ENGINE can not be specified for table functions. - if (storage || !table_function_p.parse(pos, as_table_function, expected)) + if (!select_p.parse(pos, select, expected)) /// AS SELECT ... { - /// AS [db.]table - if (!name_p.parse(pos, as_table, expected)) - return false; - - if (s_dot.ignore(pos, expected)) + /// ENGINE can not be specified for table functions. + if (storage || !table_function_p.parse(pos, as_table_function, expected)) { - as_database = as_table; + /// AS [db.]table if (!name_p.parse(pos, as_table, expected)) return false; - } - /// Optional - ENGINE can be specified. 
- if (!storage) - storage_p.parse(pos, storage, expected); + if (s_dot.ignore(pos, expected)) + { + as_database = as_table; + if (!name_p.parse(pos, as_table, expected)) + return false; + } + + /// Optional - ENGINE can be specified. + if (!storage) + storage_p.parse(pos, storage, expected); + } } } + else if (!storage) + { + return false; + } } auto comment = parseComment(pos, expected); @@ -960,14 +969,15 @@ bool ParserTableOverrideDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expecte ASTPtr sample_by; ASTPtr ttl_table; - if (!s_table_override.ignore(pos, expected)) - return false; - - if (!table_name_p.parse(pos, table_name, expected)) - return false; - - if (!lparen_p.ignore(pos, expected)) - return false; + if (is_standalone) + { + if (!s_table_override.ignore(pos, expected)) + return false; + if (!table_name_p.parse(pos, table_name, expected)) + return false; + if (!lparen_p.ignore(pos, expected)) + return false; + } while (true) { @@ -1025,7 +1035,7 @@ bool ParserTableOverrideDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expecte break; } - if (!rparen_p.ignore(pos, expected)) + if (is_standalone && !rparen_p.ignore(pos, expected)) return false; auto storage = std::make_shared(); @@ -1036,7 +1046,9 @@ bool ParserTableOverrideDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expecte storage->set(storage->ttl_table, ttl_table); auto res = std::make_shared(); - res->table_name = table_name->as()->name(); + if (table_name) + res->table_name = table_name->as()->name(); + res->is_standalone = is_standalone; res->set(res->storage, storage); if (columns) res->set(res->columns, columns); diff --git a/src/Parsers/ParserCreateQuery.h b/src/Parsers/ParserCreateQuery.h index bc1ebd65639..615121eae58 100644 --- a/src/Parsers/ParserCreateQuery.h +++ b/src/Parsers/ParserCreateQuery.h @@ -361,6 +361,8 @@ protected: * Or: * CREATE|ATTACH TABLE [IF NOT EXISTS] [db.]name [UUID 'uuid'] [ON CLUSTER cluster] AS ENGINE = engine SELECT ... 
* + * Or (for engines that supports schema inference): + * CREATE|ATTACH TABLE [IF NOT EXISTS] [db.]name [UUID 'uuid'] [ON CLUSTER cluster] ENGINE = engine */ class ParserCreateTableQuery : public IParserBase { @@ -387,6 +389,10 @@ protected: class ParserTableOverrideDeclaration : public IParserBase { +public: + const bool is_standalone; + ParserTableOverrideDeclaration(bool is_standalone_ = true) : is_standalone(is_standalone_) { } + protected: const char * getName() const override { return "table override declaration"; } bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; diff --git a/src/Parsers/ParserExplainQuery.cpp b/src/Parsers/ParserExplainQuery.cpp index ffaab0f2b6d..e072f6a14d7 100644 --- a/src/Parsers/ParserExplainQuery.cpp +++ b/src/Parsers/ParserExplainQuery.cpp @@ -21,6 +21,7 @@ bool ParserExplainQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected ParserKeyword s_pipeline("PIPELINE"); ParserKeyword s_plan("PLAN"); ParserKeyword s_estimates("ESTIMATE"); + ParserKeyword s_table_override("TABLE OVERRIDE"); if (s_explain.ignore(pos, expected)) { @@ -36,6 +37,8 @@ bool ParserExplainQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected kind = ASTExplainQuery::ExplainKind::QueryPlan; //-V1048 else if (s_estimates.ignore(pos, expected)) kind = ASTExplainQuery::ExplainKind::QueryEstimates; //-V1048 + else if (s_table_override.ignore(pos, expected)) + kind = ASTExplainQuery::ExplainKind::TableOverride; } else return false; @@ -65,6 +68,17 @@ bool ParserExplainQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected else return false; } + else if (kind == ASTExplainQuery::ExplainKind::TableOverride) + { + ASTPtr table_function; + if (!ParserFunction(true, true).parse(pos, table_function, expected)) + return false; + ASTPtr table_override; + if (!ParserTableOverrideDeclaration(false).parse(pos, table_override, expected)) + return false; + explain_query->setTableFunction(table_function); + explain_query->setTableOverride(table_override); + } else if (select_p.parse(pos, query, expected) || create_p.parse(pos, query, expected) || insert_p.parse(pos, query, expected)) diff --git a/src/Processors/Executors/PipelineExecutor.h b/src/Processors/Executors/PipelineExecutor.h index 12f2bd8b75b..0b1fe5dedf6 100644 --- a/src/Processors/Executors/PipelineExecutor.h +++ b/src/Processors/Executors/PipelineExecutor.h @@ -26,7 +26,7 @@ public: /// During pipeline execution new processors can appear. They will be added to existing set. /// /// Explicit graph representation is built in constructor. Throws if graph is not correct. - explicit PipelineExecutor(Processors & processors, QueryStatus * elem = nullptr); + explicit PipelineExecutor(Processors & processors, QueryStatus * elem); ~PipelineExecutor(); /// Execute pipeline in multiple threads. Must be called once. 
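The ParserCreateTableQuery and ParserExplainQuery changes above enable two new query shapes. A hedged sketch of both (the engine, connection parameters and expressions below are placeholders, assuming an engine/table function that supports schema inference and overrides):

    -- Columnless CREATE: the table structure is inferred from the data
    -- instead of being spelled out in the query:
    CREATE TABLE inferred ENGINE = URL('https://example.com/data.json', JSONEachRow);

    -- Preview how a table override would be applied on top of a table function:
    EXPLAIN TABLE OVERRIDE mysql('127.0.0.1:3306', 'db', 'tbl', 'user', 'password')
        PARTITION BY toYYYYMM(created_at);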
diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp new file mode 100644 index 00000000000..096e39a2893 --- /dev/null +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -0,0 +1,160 @@ +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; +} + +IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_) + : ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_) +{ +} + +NamesAndTypesList IRowSchemaReader::readSchema() +{ + DataTypes data_types = readRowAndGetDataTypes(); + for (size_t row = 1; row < max_rows_to_read; ++row) + { + DataTypes new_data_types = readRowAndGetDataTypes(); + if (new_data_types.empty()) + /// We reached eof. + break; + + if (new_data_types.size() != data_types.size()) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Rows have different amount of values"); + + for (size_t i = 0; i != data_types.size(); ++i) + { + /// We couldn't determine the type of this column in a new row, just skip it. + if (!new_data_types[i]) + continue; + + /// If we couldn't determine the type of column yet, just set the new type. + if (!data_types[i]) + data_types[i] = new_data_types[i]; + /// If the new type and the previous type for this column are different, + /// we will use default type if we have it or throw an exception. + else if (data_types[i]->getName() != new_data_types[i]->getName()) + { + if (default_type) + data_types[i] = default_type; + else + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}", new_data_types[i]->getName(), i + 1, row, data_types[i]->getName()); + } + } + } + + /// Check that we read at list one column. + if (data_types.empty()) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot read rows from the data"); + + /// If column names weren't set, use default names 'c1', 'c2', ... + if (column_names.empty()) + { + column_names.reserve(data_types.size()); + for (size_t i = 0; i != data_types.size(); ++i) + column_names.push_back("c" + std::to_string(i + 1)); + } + /// If column names were set, check that the number of names match the number of types. + else if (column_names.size() != data_types.size()) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The number of column names {} differs with the number of types {}", column_names.size(), data_types.size()); + + NamesAndTypesList result; + for (size_t i = 0; i != data_types.size(); ++i) + { + /// Check that we could determine the type of this column. + if (!data_types[i]) + { + if (!default_type) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot determine table structure by first {} rows of data, because some columns contain only Nulls. 
To increase the maximum " + "number of rows to read for structure determination, use setting input_format_max_rows_to_read_for_schema_inference", + max_rows_to_read); + + data_types[i] = default_type; + } + result.emplace_back(column_names[i], data_types[i]); + } + + return result; +} + +IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_) + : ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_) +{ +} + +NamesAndTypesList IRowWithNamesSchemaReader::readSchema() +{ + auto names_and_types = readRowAndGetNamesAndDataTypes(); + for (size_t row = 1; row < max_rows_to_read; ++row) + { + auto new_names_and_types = readRowAndGetNamesAndDataTypes(); + if (new_names_and_types.empty()) + /// We reached eof. + break; + + for (const auto & [name, new_type] : new_names_and_types) + { + auto it = names_and_types.find(name); + /// If we didn't see this column before, just add it. + if (it == names_and_types.end()) + { + names_and_types[name] = new_type; + continue; + } + + auto & type = it->second; + /// If we couldn't determine the type of column yet, just set the new type. + if (!type) + type = new_type; + /// If the new type and the previous type for this column are different, + /// we will use default type if we have it or throw an exception. + else if (new_type && type->getName() != new_type->getName()) + { + if (default_type) + type = default_type; + else + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}", type->getName(), name, row, new_type->getName()); + } + } + } + + /// Check that we read at list one column. + if (names_and_types.empty()) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot read rows from the data"); + + NamesAndTypesList result; + for (auto & [name, type] : names_and_types) + { + /// Check that we could determine the type of this column. + if (!type) + { + if (!default_type) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot determine table structure by first {} rows of data, because some columns contain only Nulls. To increase the maximum " + "number of rows to read for structure determination, use setting input_format_max_rows_to_read_for_schema_inference", + max_rows_to_read); + + type = default_type; + } + result.emplace_back(name, type); + } + + return result; +} + +} diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h new file mode 100644 index 00000000000..67a8eb88d61 --- /dev/null +++ b/src/Processors/Formats/ISchemaReader.h @@ -0,0 +1,87 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +/// Base class for schema inference for the data in some specific format. +/// It reads some data from read buffer and try to determine the schema +/// from read data. +class ISchemaReader +{ +public: + ISchemaReader(ReadBuffer & in_) : in(in_) {} + + virtual NamesAndTypesList readSchema() = 0; + + virtual ~ISchemaReader() = default; + +protected: + ReadBuffer & in; +}; + +/// Base class for schema inference for formats that read data row by row. +/// It reads data row by row (up to max_rows_to_read), determines types of columns +/// for each row and compare them with types from the previous rows. 
If some column +/// contains values with different types in different rows, the default type will be +/// used for this column or the exception will be thrown (if default type is not set). +class IRowSchemaReader : public ISchemaReader +{ +public: + IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr); + NamesAndTypesList readSchema() override; + +protected: + /// Read one row and determine types of columns in it. + /// Return types in the same order in which the values were in the row. + /// If it's impossible to determine the type for some column, return nullptr for it. + /// Return empty list if can't read more data. + virtual DataTypes readRowAndGetDataTypes() = 0; + + void setColumnNames(const std::vector & names) { column_names = names; } + +private: + size_t max_rows_to_read; + DataTypePtr default_type; + std::vector column_names; +}; + +/// Base class for schema inference for formats that read data row by row and each +/// row contains column names and values (ex: JSONEachRow, TSKV). +/// Differ from IRowSchemaReader in that after reading a row we get +/// a map {column_name : type} and some columns may be missed in a single row +/// (in this case we will use types from the previous rows for missed columns). +class IRowWithNamesSchemaReader : public ISchemaReader +{ +public: + IRowWithNamesSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr); + NamesAndTypesList readSchema() override; + +protected: + /// Read one row and determine types of columns in it. + /// Return map {column_name : type}. + /// If it's impossible to determine the type for some column, return nullptr for it. + /// Return empty map is can't read more data. + virtual std::unordered_map readRowAndGetNamesAndDataTypes() = 0; + +private: + size_t max_rows_to_read; + DataTypePtr default_type; +}; + +/// Base class for schema inference for formats that don't need any data to +/// determine the schema: formats with constant schema (ex: JSONAsString, LineAsString) +/// and formats that use external format schema (ex: Protobuf, CapnProto). 
+class IExternalSchemaReader +{ +public: + virtual NamesAndTypesList readSchema() = 0; + + virtual ~IExternalSchemaReader() = default; +}; + +} diff --git a/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp b/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp index b1f9eaa59a1..4af2c651c39 100644 --- a/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp @@ -37,6 +37,9 @@ Chunk ArrowBlockInputFormat::generate() if (!stream_reader) prepareReader(); + if (is_stopped) + return {}; + batch_result = stream_reader->Next(); if (batch_result.ok() && !(*batch_result)) return res; @@ -46,6 +49,9 @@ Chunk ArrowBlockInputFormat::generate() if (!file_reader) prepareReader(); + if (is_stopped) + return {}; + if (record_batch_current >= record_batch_total) return res; @@ -79,27 +85,38 @@ void ArrowBlockInputFormat::resetParser() record_batch_current = 0; } +static std::shared_ptr createStreamReader(ReadBuffer & in) +{ + auto stream_reader_status = arrow::ipc::RecordBatchStreamReader::Open(std::make_unique(in)); + if (!stream_reader_status.ok()) + throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, + "Error while opening a table: {}", stream_reader_status.status().ToString()); + return *stream_reader_status; +} + +static std::shared_ptr createFileReader(ReadBuffer & in, const FormatSettings & format_settings, std::atomic & is_stopped) +{ + auto arrow_file = asArrowFile(in, format_settings, is_stopped); + if (is_stopped) + return nullptr; + + auto file_reader_status = arrow::ipc::RecordBatchFileReader::Open(std::move(arrow_file)); + if (!file_reader_status.ok()) + throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, + "Error while opening a table: {}", file_reader_status.status().ToString()); + return *file_reader_status; +} + + void ArrowBlockInputFormat::prepareReader() { - std::shared_ptr schema; - if (stream) - { - auto stream_reader_status = arrow::ipc::RecordBatchStreamReader::Open(std::make_unique(*in)); - if (!stream_reader_status.ok()) - throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, - "Error while opening a table: {}", stream_reader_status.status().ToString()); - stream_reader = *stream_reader_status; - schema = stream_reader->schema(); - } + stream_reader = createStreamReader(*in); else { - auto file_reader_status = arrow::ipc::RecordBatchFileReader::Open(asArrowFile(*in, format_settings)); - if (!file_reader_status.ok()) - throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, - "Error while opening a table: {}", file_reader_status.status().ToString()); - file_reader = *file_reader_status; - schema = file_reader->schema(); + file_reader = createFileReader(*in, format_settings, is_stopped); + if (!file_reader) + return; } arrow_column_to_ch_column = std::make_unique(getPort().getHeader(), "Arrow", format_settings.arrow.import_nested); @@ -112,6 +129,27 @@ void ArrowBlockInputFormat::prepareReader() record_batch_current = 0; } +ArrowSchemaReader::ArrowSchemaReader(ReadBuffer & in_, bool stream_, const FormatSettings & format_settings_) + : ISchemaReader(in_), stream(stream_), format_settings(format_settings_) +{ +} + +NamesAndTypesList ArrowSchemaReader::readSchema() +{ + std::shared_ptr schema; + + if (stream) + schema = createStreamReader(in)->schema(); + else + { + std::atomic is_stopped = 0; + schema = createFileReader(in, format_settings, is_stopped)->schema(); + } + + auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, stream ? 
"ArrowStream" : "Arrow"); + return header.getNamesAndTypesList(); +} + void registerInputFormatArrow(FormatFactory & factory) { factory.registerInputFormat( @@ -135,6 +173,20 @@ void registerInputFormatArrow(FormatFactory & factory) }); } +void registerArrowSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader( + "Arrow", + [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, false, settings); + }); + factory.registerSchemaReader( + "ArrowStream", + [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, true, settings); + });} } #else @@ -144,6 +196,8 @@ class FormatFactory; void registerInputFormatArrow(FormatFactory &) { } + +void registerArrowSchemaReader(FormatFactory &) {} } #endif diff --git a/src/Processors/Formats/Impl/ArrowBlockInputFormat.h b/src/Processors/Formats/Impl/ArrowBlockInputFormat.h index 44e18e3f852..62cbf949fc2 100644 --- a/src/Processors/Formats/Impl/ArrowBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ArrowBlockInputFormat.h @@ -4,6 +4,7 @@ #if USE_ARROW #include +#include #include namespace arrow { class RecordBatchReader; } @@ -27,6 +28,11 @@ public: private: Chunk generate() override; + void onCancel() override + { + is_stopped = 1; + } + // Whether to use ArrowStream format bool stream; // This field is only used for ArrowStream format @@ -42,6 +48,20 @@ private: const FormatSettings format_settings; void prepareReader(); + + std::atomic is_stopped{0}; +}; + +class ArrowSchemaReader : public ISchemaReader +{ +public: + ArrowSchemaReader(ReadBuffer & in_, bool stream_, const FormatSettings & format_settings_); + + NamesAndTypesList readSchema() override; + +private: + bool stream; + const FormatSettings format_settings; }; } diff --git a/src/Processors/Formats/Impl/ArrowBufferedStreams.cpp b/src/Processors/Formats/Impl/ArrowBufferedStreams.cpp index 148faabf352..86d278397c2 100644 --- a/src/Processors/Formats/Impl/ArrowBufferedStreams.cpp +++ b/src/Processors/Formats/Impl/ArrowBufferedStreams.cpp @@ -140,7 +140,7 @@ arrow::Status ArrowInputStreamFromReadBuffer::Close() return arrow::Status(); } -std::shared_ptr asArrowFile(ReadBuffer & in, const FormatSettings & settings) +std::shared_ptr asArrowFile(ReadBuffer & in, const FormatSettings & settings, std::atomic & is_cancelled) { if (auto * fd_in = dynamic_cast(&in)) { @@ -160,7 +160,7 @@ std::shared_ptr asArrowFile(ReadBuffer & in, const std::string file_data; { WriteBufferFromString file_buffer(file_data); - copyData(in, file_buffer); + copyData(in, file_buffer, is_cancelled); } return std::make_shared(arrow::Buffer::FromString(std::move(file_data))); diff --git a/src/Processors/Formats/Impl/ArrowBufferedStreams.h b/src/Processors/Formats/Impl/ArrowBufferedStreams.h index 29c869e4152..d649c52557f 100644 --- a/src/Processors/Formats/Impl/ArrowBufferedStreams.h +++ b/src/Processors/Formats/Impl/ArrowBufferedStreams.h @@ -86,7 +86,7 @@ private: ARROW_DISALLOW_COPY_AND_ASSIGN(ArrowInputStreamFromReadBuffer); }; -std::shared_ptr asArrowFile(ReadBuffer & in, const FormatSettings & settings); +std::shared_ptr asArrowFile(ReadBuffer & in, const FormatSettings & settings, std::atomic & is_cancelled); } diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index 272907022a1..aa181ea0b8b 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -239,10 +239,8 @@ static 
ColumnWithTypeAndName readColumnWithTimestampData(std::shared_ptr -static ColumnWithTypeAndName readColumnWithDecimalData(std::shared_ptr & arrow_column, const String & column_name) +static ColumnWithTypeAndName readColumnWithDecimalDataImpl(std::shared_ptr & arrow_column, const String & column_name, DataTypePtr internal_type) { - const auto * arrow_decimal_type = static_cast(arrow_column->type().get()); - auto internal_type = std::make_shared>(arrow_decimal_type->precision(), arrow_decimal_type->scale()); auto internal_column = internal_type->createColumn(); auto & column = assert_cast &>(*internal_column); auto & column_data = column.getData(); @@ -259,6 +257,21 @@ static ColumnWithTypeAndName readColumnWithDecimalData(std::shared_ptr +static ColumnWithTypeAndName readColumnWithDecimalData(std::shared_ptr & arrow_column, const String & column_name) +{ + const auto * arrow_decimal_type = static_cast(arrow_column->type().get()); + size_t precision = arrow_decimal_type->precision(); + auto internal_type = createDecimal(precision, arrow_decimal_type->scale()); + if (precision <= DecimalUtils::max_precision) + return readColumnWithDecimalDataImpl(arrow_column, column_name, internal_type); + else if (precision <= DecimalUtils::max_precision) + return readColumnWithDecimalDataImpl(arrow_column, column_name, internal_type); + else if (precision <= DecimalUtils::max_precision) + return readColumnWithDecimalDataImpl(arrow_column, column_name, internal_type); + return readColumnWithDecimalDataImpl(arrow_column, column_name, internal_type); +} + /// Creates a null bytemap from arrow's null bitmap static ColumnPtr readByteMapFromArrowColumn(std::shared_ptr & arrow_column) { @@ -328,12 +341,13 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( const std::string & column_name, const std::string & format_name, bool is_nullable, - std::unordered_map> & dictionary_values) + std::unordered_map> & dictionary_values, + bool read_ints_as_dates) { if (!is_nullable && arrow_column->null_count() && arrow_column->type()->id() != arrow::Type::LIST && arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT) { - auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values); + auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values, read_ints_as_dates); auto nullmap_column = readByteMapFromArrowColumn(arrow_column); auto nullable_type = std::make_shared(std::move(nested_column.type)); auto nullable_column = ColumnNullable::create(std::move(nested_column.column), std::move(nullmap_column)); @@ -358,25 +372,27 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( case arrow::Type::UINT16: { auto column = readColumnWithNumericData(arrow_column, column_name); - column.type = std::make_shared(); + if (read_ints_as_dates) + column.type = std::make_shared(); return column; } case arrow::Type::UINT32: { auto column = readColumnWithNumericData(arrow_column, column_name); - column.type = std::make_shared(); + if (read_ints_as_dates) + column.type = std::make_shared(); return column; } case arrow::Type::TIMESTAMP: return readColumnWithTimestampData(arrow_column, column_name); case arrow::Type::DECIMAL128: - return readColumnWithDecimalData(arrow_column, column_name); + return readColumnWithDecimalData(arrow_column, column_name); case arrow::Type::DECIMAL256: - return readColumnWithDecimalData(arrow_column, column_name); + return readColumnWithDecimalData(arrow_column, 
column_name); case arrow::Type::MAP: { auto arrow_nested_column = getNestedArrowColumn(arrow_column); - auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values); + auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); const auto * tuple_column = assert_cast(nested_column.column.get()); @@ -388,7 +404,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( case arrow::Type::LIST: { auto arrow_nested_column = getNestedArrowColumn(arrow_column); - auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values); + auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); auto array_column = ColumnArray::create(std::move(nested_column.column), std::move(offsets_column)); auto array_type = std::make_shared(nested_column.type); @@ -413,7 +429,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( for (int i = 0; i != arrow_struct_type->num_fields(); ++i) { auto nested_arrow_column = std::make_shared(nested_arrow_columns[i]); - auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values); + auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values, read_ints_as_dates); tuple_elements.emplace_back(std::move(element.column)); tuple_types.emplace_back(std::move(element.type)); tuple_names.emplace_back(std::move(element.name)); @@ -436,7 +452,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( dict_array.emplace_back(dict_chunk.dictionary()); } auto arrow_dict_column = std::make_shared(dict_array); - auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values); + auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); /// We should convert read column to ColumnUnique. 
auto tmp_lc_column = DataTypeLowCardinality(dict_column.type).createColumn(); @@ -483,7 +499,7 @@ static void checkStatus(const arrow::Status & status, const String & column_name throw Exception{ErrorCodes::UNKNOWN_EXCEPTION, "Error with a {} column '{}': {}.", format_name, column_name, status.ToString()}; } -static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name) +Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name) { ColumnsWithTypeAndName sample_columns; for (const auto & field : schema.fields()) @@ -493,24 +509,21 @@ static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::stri std::unique_ptr array_builder; arrow::Status status = MakeBuilder(pool, field->type(), &array_builder); checkStatus(status, field->name(), format_name); + std::shared_ptr arrow_array; status = array_builder->Finish(&arrow_array); checkStatus(status, field->name(), format_name); + arrow::ArrayVector array_vector = {arrow_array}; auto arrow_column = std::make_shared(array_vector); std::unordered_map> dict_values; - ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values); + ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values, false); + sample_columns.emplace_back(std::move(sample_column)); } return Block(std::move(sample_columns)); } -ArrowColumnToCHColumn::ArrowColumnToCHColumn( - const arrow::Schema & schema, const std::string & format_name_, bool import_nested_) - : header(arrowSchemaToCHHeader(schema, format_name_)), format_name(format_name_), import_nested(import_nested_) -{ -} - ArrowColumnToCHColumn::ArrowColumnToCHColumn( const Block & header_, const std::string & format_name_, bool import_nested_) : header(header_), format_name(format_name_), import_nested(import_nested_) @@ -553,7 +566,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & if (!nested_tables.contains(nested_table_name)) { std::shared_ptr arrow_column = name_to_column_ptr[nested_table_name]; - ColumnsWithTypeAndName cols = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values)}; + ColumnsWithTypeAndName cols = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true)}; Block block(cols); nested_tables[nested_table_name] = std::make_shared(Nested::flatten(block)); } @@ -573,7 +586,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & if (read_from_nested) column = nested_tables[nested_table_name]->getByName(header_column.name); else - column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values); + column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values, true); try { diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h index 46976093f0b..58f8f1536b5 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h @@ -23,16 +23,14 @@ public: ArrowColumnToCHColumn(const Block & header_, const std::string & format_name_, bool import_nested_); - /// Constructor that create header by arrow schema. It will be useful for inserting - /// data from file without knowing table structure. 
- ArrowColumnToCHColumn(const arrow::Schema & schema, const std::string & format_name, bool import_nested_); - void arrowTableToCHChunk(Chunk & res, std::shared_ptr & table); void arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & name_to_column_ptr); + static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name); + private: - const Block header; + const Block & header; const std::string format_name; bool import_nested; diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 11e56ecbe0c..a372df41344 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -815,6 +815,92 @@ const AvroDeserializer & AvroConfluentRowInputFormat::getOrCreateDeserializer(Sc return it->second; } +AvroSchemaReader::AvroSchemaReader(ReadBuffer & in_, bool confluent_, const FormatSettings & format_settings_) + : ISchemaReader(in_), confluent(confluent_), format_settings(format_settings_) +{ +} + +NamesAndTypesList AvroSchemaReader::readSchema() +{ + avro::NodePtr root_node; + if (confluent) + { + UInt32 schema_id = readConfluentSchemaId(in); + root_node = getConfluentSchemaRegistry(format_settings)->getSchema(schema_id).root(); + } + else + { + auto file_reader_ptr = std::make_unique(std::make_unique(in)); + root_node = file_reader_ptr->dataSchema().root(); + } + + if (root_node->type() != avro::Type::AVRO_RECORD) + throw Exception("Root schema must be a record", ErrorCodes::TYPE_MISMATCH); + + NamesAndTypesList names_and_types; + for (size_t i = 0; i != root_node->leaves(); ++i) + names_and_types.emplace_back(root_node->nameAt(i), avroNodeToDataType(root_node->leafAt(i))); + + return names_and_types; +} + +DataTypePtr AvroSchemaReader::avroNodeToDataType(avro::NodePtr node) +{ + switch (node->type()) + { + case avro::Type::AVRO_INT: + return {std::make_shared()}; + case avro::Type::AVRO_LONG: + return std::make_shared(); + case avro::Type::AVRO_BOOL: + return std::make_shared(); + case avro::Type::AVRO_FLOAT: + return std::make_shared(); + case avro::Type::AVRO_DOUBLE: + return std::make_shared(); + case avro::Type::AVRO_STRING: + return std::make_shared(); + case avro::Type::AVRO_BYTES: + return std::make_shared(); + case avro::Type::AVRO_ENUM: + { + if (node->names() < 128) + { + EnumValues::Values values; + for (size_t i = 0; i != node->names(); ++i) + values.emplace_back(node->nameAt(i), i); + return std::make_shared(std::move(values)); + } + else if (node->names() < 32768) + { + EnumValues::Values values; + for (size_t i = 0; i != node->names(); ++i) + values.emplace_back(node->nameAt(i), i); + return std::make_shared(std::move(values)); + } + + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ClickHouse supports only 8 and 16-bit Enum."); + } + case avro::Type::AVRO_FIXED: + return std::make_shared(node->fixedSize()); + case avro::Type::AVRO_ARRAY: + return std::make_shared(avroNodeToDataType(node->leafAt(0))); + case avro::Type::AVRO_NULL: + return std::make_shared(); + case avro::Type::AVRO_UNION: + if (node->leaves() == 2 && (node->leafAt(0)->type() == avro::Type::AVRO_NULL || node->leafAt(1)->type() == avro::Type::AVRO_NULL)) + { + size_t nested_leaf_index = node->leafAt(0)->type() == avro::Type::AVRO_NULL ? 
1 : 0; + return makeNullable(avroNodeToDataType(node->leafAt(nested_leaf_index))); + } + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Avro type UNION is not supported for inserting."); + case avro::Type::AVRO_SYMBOLIC: + return avroNodeToDataType(avro::resolveSymbol(node)); + default: + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Avro column {} is not supported for inserting."); + } +} + void registerInputFormatAvro(FormatFactory & factory) { factory.registerInputFormat("Avro", []( @@ -836,6 +922,21 @@ void registerInputFormatAvro(FormatFactory & factory) }); } +void registerAvroSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("Avro", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, false, settings); + }); + + factory.registerSchemaReader("AvroConfluent", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, true, settings); + }); + +} + + } #else @@ -846,6 +947,8 @@ class FormatFactory; void registerInputFormatAvro(FormatFactory &) { } + +void registerAvroSchemaReader(FormatFactory &) {} } #endif diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.h b/src/Processors/Formats/Impl/AvroRowInputFormat.h index 73237369e56..46e571d87ec 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.h +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -160,6 +161,20 @@ private: FormatSettings format_settings; }; +class AvroSchemaReader : public ISchemaReader +{ +public: + AvroSchemaReader(ReadBuffer & in_, bool confluent_, const FormatSettings & format_settings_); + + NamesAndTypesList readSchema() override; + +private: + DataTypePtr avroNodeToDataType(avro::NodePtr node); + + bool confluent; + const FormatSettings format_settings; +}; + } #endif diff --git a/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp b/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp index 0506c539c0f..b356967a544 100644 --- a/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp @@ -5,7 +5,6 @@ #include #include - namespace DB { @@ -15,11 +14,23 @@ namespace ErrorCodes } BinaryRowInputFormat::BinaryRowInputFormat(ReadBuffer & in_, Block header, Params params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_) - : RowInputFormatWithNamesAndTypes(std::move(header), in_, std::move(params_), with_names_, with_types_, format_settings_) + : RowInputFormatWithNamesAndTypes( + std::move(header), + in_, + std::move(params_), + with_names_, + with_types_, + format_settings_, + std::make_unique(in_, format_settings_)) { } -std::vector BinaryRowInputFormat::readHeaderRow() + +BinaryFormatReader::BinaryFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesReader(in_, format_settings_) +{ +} + +std::vector BinaryFormatReader::readHeaderRow() { std::vector fields; String field; @@ -31,13 +42,13 @@ std::vector BinaryRowInputFormat::readHeaderRow() return fields; } -std::vector BinaryRowInputFormat::readNames() +std::vector BinaryFormatReader::readNames() { readVarUInt(read_columns, *in); return readHeaderRow(); } -std::vector BinaryRowInputFormat::readTypes() +std::vector BinaryFormatReader::readTypes() { auto types = readHeaderRow(); for (const auto & type_name : types) @@ -45,31 +56,37 @@ std::vector BinaryRowInputFormat::readTypes() return types; } -bool BinaryRowInputFormat::readField(IColumn & 
column, const DataTypePtr & /*type*/, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & /*column_name*/) +bool BinaryFormatReader::readField(IColumn & column, const DataTypePtr & /*type*/, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & /*column_name*/) { serialization->deserializeBinary(column, *in); return true; } -void BinaryRowInputFormat::skipHeaderRow() +void BinaryFormatReader::skipHeaderRow() { String tmp; for (size_t i = 0; i < read_columns; ++i) readStringBinary(tmp, *in); } -void BinaryRowInputFormat::skipNames() +void BinaryFormatReader::skipNames() { readVarUInt(read_columns, *in); skipHeaderRow(); } -void BinaryRowInputFormat::skipTypes() +void BinaryFormatReader::skipTypes() { + if (read_columns == 0) + { + /// It's possible only when with_names = false and with_types = true + readVarUInt(read_columns, *in); + } + skipHeaderRow(); } -void BinaryRowInputFormat::skipField(size_t file_column) +void BinaryFormatReader::skipField(size_t file_column) { if (file_column >= read_data_types.size()) throw Exception(ErrorCodes::CANNOT_SKIP_UNKNOWN_FIELD, "Cannot skip unknown field in RowBinaryWithNames format, because it's type is unknown"); @@ -77,6 +94,11 @@ void BinaryRowInputFormat::skipField(size_t file_column) read_data_types[file_column]->getDefaultSerialization()->deserializeBinary(field, *in); } +BinaryWithNamesAndTypesSchemaReader::BinaryWithNamesAndTypesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) + : FormatWithNamesAndTypesSchemaReader(in_, 0, true, true, &reader), reader(in_, format_settings_) +{ +} + void registerInputFormatRowBinary(FormatFactory & factory) { auto register_func = [&](const String & format_name, bool with_names, bool with_types) @@ -94,4 +116,13 @@ void registerInputFormatRowBinary(FormatFactory & factory) registerWithNamesAndTypes("RowBinary", register_func); } +void registerRowBinaryWithNamesAndTypesSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("RowBinaryWithNamesAndTypes", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, settings); + }); +} + + } diff --git a/src/Processors/Formats/Impl/BinaryRowInputFormat.h b/src/Processors/Formats/Impl/BinaryRowInputFormat.h index 61d6df77522..d98e75bf621 100644 --- a/src/Processors/Formats/Impl/BinaryRowInputFormat.h +++ b/src/Processors/Formats/Impl/BinaryRowInputFormat.h @@ -1,15 +1,19 @@ #pragma once #include -#include #include +#include namespace DB { -class ReadBuffer; +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} +class ReadBuffer; /** A stream for inputting data in a binary line-by-line format. */ @@ -24,9 +28,15 @@ public: /// in this format we cannot provide any DiagnosticInfo, because here we have /// just binary data. std::string getDiagnosticInfo() override { return {}; } +}; + +class BinaryFormatReader : public FormatWithNamesAndTypesReader +{ +public: + BinaryFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_); -private: bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override; + void skipField(size_t file_column) override; void skipNames() override; @@ -37,9 +47,24 @@ private: std::vector readTypes() override; std::vector readHeaderRow(); +private: /// Data types read from input data. 
DataTypes read_data_types; - UInt64 read_columns = 0; + UInt64 read_columns; +}; + +class BinaryWithNamesAndTypesSchemaReader : public FormatWithNamesAndTypesSchemaReader +{ +public: + BinaryWithNamesAndTypesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + +private: + DataTypes readRowAndGetDataTypes() override + { + throw Exception{ErrorCodes::NOT_IMPLEMENTED, "Method readRowAndGetDataTypes is not implemented"}; + } + + BinaryFormatReader reader; }; } diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 9de2b908b1e..735a549d0a6 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -5,13 +5,16 @@ #include #include #include +#include +#include #include #include -#include +#include +#include + namespace DB { - namespace ErrorCodes { extern const int BAD_ARGUMENTS; @@ -26,7 +29,14 @@ CSVRowInputFormat::CSVRowInputFormat( bool with_names_, bool with_types_, const FormatSettings & format_settings_) - : RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_) + : RowInputFormatWithNamesAndTypes( + header_, + in_, + params_, + with_names_, + with_types_, + format_settings_, + std::make_unique(in_, format_settings_)) { const String bad_delimiters = " \t\"'.UL"; if (bad_delimiters.find(format_settings.csv.delimiter) != String::npos) @@ -36,6 +46,11 @@ CSVRowInputFormat::CSVRowInputFormat( ErrorCodes::BAD_ARGUMENTS); } +void CSVRowInputFormat::syncAfterError() +{ + skipToNextLineOrEOF(*in); +} + static void skipEndOfLine(ReadBuffer & in) { /// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic) @@ -52,8 +67,10 @@ static void skipEndOfLine(ReadBuffer & in) if (!in.eof() && *in.position() == '\n') ++in.position(); else - throw Exception("Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)." - " Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.", ErrorCodes::INCORRECT_DATA); + throw Exception( + "Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)." + " Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.", + ErrorCodes::INCORRECT_DATA); } else if (!in.eof()) throw Exception("Expected end of line", ErrorCodes::INCORRECT_DATA); @@ -62,32 +79,38 @@ static void skipEndOfLine(ReadBuffer & in) /// Skip `whitespace` symbols allowed in CSV. 
static inline void skipWhitespacesAndTabs(ReadBuffer & in) { - while (!in.eof() - && (*in.position() == ' ' - || *in.position() == '\t')) + while (!in.eof() && (*in.position() == ' ' || *in.position() == '\t')) ++in.position(); } -void CSVRowInputFormat::skipFieldDelimiter() +CSVFormatReader::CSVFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesReader(in_, format_settings_) +{ +} + +void CSVFormatReader::skipFieldDelimiter() { skipWhitespacesAndTabs(*in); assertChar(format_settings.csv.delimiter, *in); } -String CSVRowInputFormat::readFieldIntoString() +template +String CSVFormatReader::readCSVFieldIntoString() { skipWhitespacesAndTabs(*in); String field; - readCSVString(field, *in, format_settings.csv); + if constexpr (read_string) + readCSVString(field, *in, format_settings.csv); + else + readCSVField(field, *in, format_settings.csv); return field; } -void CSVRowInputFormat::skipField() +void CSVFormatReader::skipField() { - readFieldIntoString(); + readCSVFieldIntoString(); } -void CSVRowInputFormat::skipRowEndDelimiter() +void CSVFormatReader::skipRowEndDelimiter() { skipWhitespacesAndTabs(*in); @@ -105,33 +128,32 @@ void CSVRowInputFormat::skipRowEndDelimiter() skipEndOfLine(*in); } -void CSVRowInputFormat::skipHeaderRow() +void CSVFormatReader::skipHeaderRow() { do { skipField(); skipWhitespacesAndTabs(*in); - } - while (checkChar(format_settings.csv.delimiter, *in)); + } while (checkChar(format_settings.csv.delimiter, *in)); skipRowEndDelimiter(); } -std::vector CSVRowInputFormat::readHeaderRow() +template +std::vector CSVFormatReader::readRowImpl() { std::vector fields; do { - fields.push_back(readFieldIntoString()); + fields.push_back(readCSVFieldIntoString()); skipWhitespacesAndTabs(*in); - } - while (checkChar(format_settings.csv.delimiter, *in)); + } while (checkChar(format_settings.csv.delimiter, *in)); skipRowEndDelimiter(); return fields; } -bool CSVRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) +bool CSVFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) { const char delimiter = format_settings.csv.delimiter; @@ -144,7 +166,8 @@ bool CSVRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) { if (*in->position() == '\n' || *in->position() == '\r') { - out << "ERROR: Line feed found where delimiter (" << delimiter << ") is expected." + out << "ERROR: Line feed found where delimiter (" << delimiter + << ") is expected." 
" It's like your file has less columns than expected.\n" "And if your file has the right number of columns, maybe it has unescaped quotes in values.\n"; } @@ -160,7 +183,7 @@ bool CSVRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) return true; } -bool CSVRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) +bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) { skipWhitespacesAndTabs(*in); @@ -191,23 +214,21 @@ bool CSVRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) return true; } -void CSVRowInputFormat::syncAfterError() -{ - skipToNextLineOrEOF(*in); -} - -bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & /*column_name*/) +bool CSVFormatReader::readField( + IColumn & column, + const DataTypePtr & type, + const SerializationPtr & serialization, + bool is_last_file_column, + const String & /*column_name*/) { skipWhitespacesAndTabs(*in); const bool at_delimiter = !in->eof() && *in->position() == format_settings.csv.delimiter; - const bool at_last_column_line_end = is_last_file_column - && (in->eof() || *in->position() == '\n' || *in->position() == '\r'); + const bool at_last_column_line_end = is_last_file_column && (in->eof() || *in->position() == '\n' || *in->position() == '\r'); /// Note: Tuples are serialized in CSV as separate columns, but with empty_as_default or null_as_default /// only one empty or NULL column will be expected - if (format_settings.csv.empty_as_default - && (at_delimiter || at_last_column_line_end)) + if (format_settings.csv.empty_as_default && (at_delimiter || at_last_column_line_end)) { /// Treat empty unquoted column value as default value, if /// specified in the settings. 
Tuple columns might seem @@ -231,6 +252,31 @@ bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, co } } + +CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_, ContextPtr context_) + : FormatWithNamesAndTypesSchemaReader( + in_, + format_setting_.max_rows_to_read_for_schema_inference, + with_names_, + with_types_, + &reader, + getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::CSV)) + , reader(in_, format_setting_) + , context(context_) +{ +} + + +DataTypes CSVSchemaReader::readRowAndGetDataTypes() +{ + if (in.eof()) + return {}; + + auto fields = reader.readRow(); + return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), FormatSettings::EscapingRule::CSV, context); +} + + void registerInputFormatCSV(FormatFactory & factory) { auto register_func = [&](const String & format_name, bool with_names, bool with_types) @@ -326,4 +372,17 @@ void registerFileSegmentationEngineCSV(FormatFactory & factory) registerWithNamesAndTypes("CSV", register_func); } +void registerCSVSchemaReader(FormatFactory & factory) +{ + auto register_func = [&](const String & format_name, bool with_names, bool with_types) + { + factory.registerSchemaReader(format_name, [with_names, with_types](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + { + return std::make_shared(buf, with_names, with_types, settings, context); + }); + }; + + registerWithNamesAndTypes("CSV", register_func); +} + } diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.h b/src/Processors/Formats/Impl/CSVRowInputFormat.h index d7c557b58d8..d723647595e 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -5,6 +5,7 @@ #include #include +#include #include @@ -28,6 +29,12 @@ public: private: bool allowSyncAfterError() const override { return true; } void syncAfterError() override; +}; + +class CSVFormatReader : public FormatWithNamesAndTypesReader +{ +public: + CSVFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_); bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override; bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override; @@ -42,17 +49,34 @@ private: void skipField(size_t /*file_column*/) override { skipField(); } void skipField(); - void skipHeaderRow() ; + void skipHeaderRow(); void skipNames() override { skipHeaderRow(); } void skipTypes() override { skipHeaderRow(); } void skipFieldDelimiter() override; void skipRowEndDelimiter() override; - std::vector readHeaderRow(); std::vector readNames() override { return readHeaderRow(); } std::vector readTypes() override { return readHeaderRow(); } + std::vector readHeaderRow() { return readRowImpl(); } + std::vector readRow() { return readRowImpl(); } - String readFieldIntoString(); + template + std::vector readRowImpl(); + + template + String readCSVFieldIntoString(); +}; + +class CSVSchemaReader : public FormatWithNamesAndTypesSchemaReader +{ +public: + CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_, ContextPtr context_); + +private: + DataTypes readRowAndGetDataTypes() override; + + CSVFormatReader reader; + ContextPtr context; }; } diff --git a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp index 4d000bb1f35..311f4742335 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp +++ 
b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp @@ -273,6 +273,7 @@ bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension #endif auto root_reader = msg.getRoot(root); + for (size_t i = 0; i != columns.size(); ++i) { auto value = getReaderByColumnName(root_reader, column_names[i]); @@ -282,6 +283,24 @@ bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension return true; } +CapnProtoSchemaReader::CapnProtoSchemaReader(const FormatSettings & format_settings_) : format_settings(format_settings_) +{ +} + +NamesAndTypesList CapnProtoSchemaReader::readSchema() +{ + auto schema_info = FormatSchemaInfo( + format_settings.schema.format_schema, + "CapnProto", + true, + format_settings.schema.is_server, + format_settings.schema.format_schema_path); + + auto schema_parser = CapnProtoSchemaParser(); + auto schema = schema_parser.getMessageSchema(schema_info); + return capnProtoSchemaToCHSchema(schema); +} + void registerInputFormatCapnProto(FormatFactory & factory) { factory.registerInputFormat( @@ -293,6 +312,14 @@ void registerInputFormatCapnProto(FormatFactory & factory) }); } +void registerCapnProtoSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("CapnProto", [](const FormatSettings & settings) + { + return std::make_shared(settings); + }); +} + } #else @@ -301,6 +328,7 @@ namespace DB { class FormatFactory; void registerInputFormatCapnProto(FormatFactory &) {} + void registerCapnProtoSchemaReader(FormatFactory &) {} } #endif // USE_CAPNP diff --git a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h index 4c0f34d70a3..053de14d1a4 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h +++ b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB { @@ -38,6 +39,17 @@ private: Names column_names; }; +class CapnProtoSchemaReader : public IExternalSchemaReader +{ +public: + explicit CapnProtoSchemaReader(const FormatSettings & format_settings_); + + NamesAndTypesList readSchema() override; + +private: + const FormatSettings format_settings; +}; + } #endif // USE_CAPNP diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp index 56aa4345777..d2e0d6e21a9 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp @@ -31,7 +31,7 @@ CustomSeparatedRowInputFormat::CustomSeparatedRowInputFormat( bool ignore_spaces_, const FormatSettings & format_settings_) : CustomSeparatedRowInputFormat( - header_, std::make_unique(in_buf_), params_, with_names_, with_types_, ignore_spaces_, format_settings_) + header_, std::make_unique(in_buf_), params_, with_names_, with_types_, ignore_spaces_, updateFormatSettings(format_settings_)) { } @@ -43,10 +43,15 @@ CustomSeparatedRowInputFormat::CustomSeparatedRowInputFormat( bool with_types_, bool ignore_spaces_, const FormatSettings & format_settings_) - : RowInputFormatWithNamesAndTypes(header_, *buf_, params_, with_names_, with_types_, updateFormatSettings(format_settings_)) + : RowInputFormatWithNamesAndTypes( + header_, + *buf_, + params_, + with_names_, + with_types_, + format_settings_, + std::make_unique(*buf_, ignore_spaces_, format_settings_)) , buf(std::move(buf_)) - , ignore_spaces(ignore_spaces_) - , escaping_rule(format_settings_.custom.escaping_rule) { /// In case of 
CustomSeparatedWithNames(AndTypes) formats and enabled setting input_format_with_names_use_header we don't know /// the exact number of columns in data (because it can contain unknown columns). So, if field_delimiter and row_after_delimiter are @@ -61,43 +66,76 @@ CustomSeparatedRowInputFormat::CustomSeparatedRowInputFormat( } } -void CustomSeparatedRowInputFormat::skipPrefixBeforeHeader() + +bool CustomSeparatedRowInputFormat::allowSyncAfterError() const +{ + return !format_settings.custom.row_after_delimiter.empty() || !format_settings.custom.row_between_delimiter.empty(); +} + +void CustomSeparatedRowInputFormat::syncAfterError() +{ + skipToNextRowOrEof(*buf, format_settings.custom.row_after_delimiter, format_settings.custom.row_between_delimiter, ignore_spaces); + end_of_stream = buf->eof(); + /// It can happen that buf->position() is not at the beginning of row + /// if some delimiters is similar to row_format.delimiters.back() and row_between_delimiter. + /// It will cause another parsing error. +} + +void CustomSeparatedRowInputFormat::setReadBuffer(ReadBuffer & in_) +{ + buf = std::make_unique(in_); + RowInputFormatWithNamesAndTypes::setReadBuffer(*buf); +} + +CustomSeparatedFormatReader::CustomSeparatedFormatReader( + PeekableReadBuffer & buf_, bool ignore_spaces_, const FormatSettings & format_settings_) + : FormatWithNamesAndTypesReader(buf_, format_settings_), buf(&buf_), ignore_spaces(ignore_spaces_) +{ +} + +void CustomSeparatedRowInputFormat::resetParser() +{ + RowInputFormatWithNamesAndTypes::resetParser(); + buf->reset(); +} + +void CustomSeparatedFormatReader::skipPrefixBeforeHeader() { skipSpaces(); assertString(format_settings.custom.result_before_delimiter, *buf); } -void CustomSeparatedRowInputFormat::skipRowStartDelimiter() +void CustomSeparatedFormatReader::skipRowStartDelimiter() { skipSpaces(); assertString(format_settings.custom.row_before_delimiter, *buf); } -void CustomSeparatedRowInputFormat::skipFieldDelimiter() +void CustomSeparatedFormatReader::skipFieldDelimiter() { skipSpaces(); assertString(format_settings.custom.field_delimiter, *buf); } -void CustomSeparatedRowInputFormat::skipRowEndDelimiter() +void CustomSeparatedFormatReader::skipRowEndDelimiter() { skipSpaces(); assertString(format_settings.custom.row_after_delimiter, *buf); } -void CustomSeparatedRowInputFormat::skipRowBetweenDelimiter() +void CustomSeparatedFormatReader::skipRowBetweenDelimiter() { skipSpaces(); assertString(format_settings.custom.row_between_delimiter, *buf); } -void CustomSeparatedRowInputFormat::skipField() +void CustomSeparatedFormatReader::skipField() { skipSpaces(); - skipFieldByEscapingRule(*buf, escaping_rule, format_settings); + skipFieldByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings); } -bool CustomSeparatedRowInputFormat::checkEndOfRow() +bool CustomSeparatedFormatReader::checkEndOfRow() { PeekableReadBufferCheckpoint checkpoint{*buf, true}; @@ -118,43 +156,66 @@ bool CustomSeparatedRowInputFormat::checkEndOfRow() return checkForSuffixImpl(true); } -std::vector CustomSeparatedRowInputFormat::readHeaderRow() +template +String CustomSeparatedFormatReader::readFieldIntoString(bool is_first) +{ + if (!is_first) + skipFieldDelimiter(); + skipSpaces(); + if constexpr (is_header) + return readStringByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings); + else + return readFieldByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings); +} + +template +std::vector CustomSeparatedFormatReader::readRowImpl() { 
std::vector values; skipRowStartDelimiter(); - do + + if (columns == 0) { - if (!values.empty()) - skipFieldDelimiter(); - skipSpaces(); - values.push_back(readStringByEscapingRule(*buf, escaping_rule, format_settings)); + do + { + values.push_back(readFieldIntoString(values.empty())); + } while (!checkEndOfRow()); + columns = values.size(); + } + else + { + for (size_t i = 0; i != columns; ++i) + values.push_back(readFieldIntoString(i == 0)); } - while (!checkEndOfRow()); skipRowEndDelimiter(); return values; } -void CustomSeparatedRowInputFormat::skipHeaderRow() +void CustomSeparatedFormatReader::skipHeaderRow() { - size_t columns = getPort().getHeader().columns(); skipRowStartDelimiter(); - for (size_t i = 0; i != columns; ++i) + bool first = true; + do { - skipField(); - if (i + 1 != columns) + if (!first) skipFieldDelimiter(); + first = false; + + skipField(); } + while (!checkEndOfRow()); + skipRowEndDelimiter(); } -bool CustomSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool, const String &) +bool CustomSeparatedFormatReader::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool, const String &) { skipSpaces(); - return deserializeFieldByEscapingRule(type, serialization, column, *buf, escaping_rule, format_settings); + return deserializeFieldByEscapingRule(type, serialization, column, *buf, format_settings.custom.escaping_rule, format_settings); } -bool CustomSeparatedRowInputFormat::checkForSuffixImpl(bool check_eof) +bool CustomSeparatedFormatReader::checkForSuffixImpl(bool check_eof) { skipSpaces(); if (format_settings.custom.result_after_delimiter.empty()) @@ -177,7 +238,7 @@ bool CustomSeparatedRowInputFormat::checkForSuffixImpl(bool check_eof) return false; } -bool CustomSeparatedRowInputFormat::tryParseSuffixWithDiagnosticInfo(WriteBuffer & out) +bool CustomSeparatedFormatReader::tryParseSuffixWithDiagnosticInfo(WriteBuffer & out) { PeekableReadBufferCheckpoint checkpoint{*buf}; if (checkForSuffixImpl(false)) @@ -192,7 +253,7 @@ bool CustomSeparatedRowInputFormat::tryParseSuffixWithDiagnosticInfo(WriteBuffer return true; } -bool CustomSeparatedRowInputFormat::checkForSuffix() +bool CustomSeparatedFormatReader::checkForSuffix() { PeekableReadBufferCheckpoint checkpoint{*buf}; if (checkForSuffixImpl(true)) @@ -201,51 +262,60 @@ bool CustomSeparatedRowInputFormat::checkForSuffix() return false; } - -bool CustomSeparatedRowInputFormat::allowSyncAfterError() const -{ - return !format_settings.custom.row_after_delimiter.empty() || !format_settings.custom.row_between_delimiter.empty(); -} - -void CustomSeparatedRowInputFormat::syncAfterError() -{ - skipToNextRowOrEof(*buf, format_settings.custom.row_after_delimiter, format_settings.custom.row_between_delimiter, ignore_spaces); - end_of_stream = buf->eof(); - /// It can happen that buf->position() is not at the beginning of row - /// if some delimiters is similar to row_format.delimiters.back() and row_between_delimiter. - /// It will cause another parsing error. 
-} - -bool CustomSeparatedRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuffer & out) +bool CustomSeparatedFormatReader::parseRowStartWithDiagnosticInfo(WriteBuffer & out) { return parseDelimiterWithDiagnosticInfo(out, *buf, format_settings.custom.row_before_delimiter, "delimiter before first field", ignore_spaces); } -bool CustomSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) +bool CustomSeparatedFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) { return parseDelimiterWithDiagnosticInfo(out, *buf, format_settings.custom.field_delimiter, "delimiter between fields", ignore_spaces); } -bool CustomSeparatedRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) +bool CustomSeparatedFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) { return parseDelimiterWithDiagnosticInfo(out, *buf, format_settings.custom.row_after_delimiter, "delimiter after last field", ignore_spaces); } -bool CustomSeparatedRowInputFormat::parseRowBetweenDelimiterWithDiagnosticInfo(WriteBuffer & out) +bool CustomSeparatedFormatReader::parseRowBetweenDelimiterWithDiagnosticInfo(WriteBuffer & out) { return parseDelimiterWithDiagnosticInfo(out, *buf, format_settings.custom.row_between_delimiter, "delimiter between rows", ignore_spaces); } -void CustomSeparatedRowInputFormat::resetParser() +void CustomSeparatedFormatReader::setReadBuffer(ReadBuffer & in_) { - RowInputFormatWithNamesAndTypes::resetParser(); - buf->reset(); + buf = assert_cast(&in_); + FormatWithNamesAndTypesReader::setReadBuffer(in_); } -void CustomSeparatedRowInputFormat::setReadBuffer(ReadBuffer & in_) +CustomSeparatedSchemaReader::CustomSeparatedSchemaReader( + ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_, ContextPtr context_) + : FormatWithNamesAndTypesSchemaReader( + buf, + format_setting_.max_rows_to_read_for_schema_inference, + with_names_, + with_types_, + &reader, + getDefaultDataTypeForEscapingRule(format_setting_.custom.escaping_rule)) + , buf(in_) + , reader(buf, ignore_spaces_, updateFormatSettings(format_setting_)) + , context(context_) { - buf = std::make_unique(in_); - IInputFormat::setReadBuffer(*buf); +} + +DataTypes CustomSeparatedSchemaReader::readRowAndGetDataTypes() +{ + if (reader.checkForSuffix()) + return {}; + + if (!first_row || with_names || with_types) + reader.skipRowBetweenDelimiter(); + + if (first_row) + first_row = false; + + auto fields = reader.readRow(); + return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule(), context); } void registerInputFormatCustomSeparated(FormatFactory & factory) @@ -267,4 +337,20 @@ void registerInputFormatCustomSeparated(FormatFactory & factory) } } +void registerCustomSeparatedSchemaReader(FormatFactory & factory) +{ + for (bool ignore_spaces : {false, true}) + { + auto register_func = [&](const String & format_name, bool with_names, bool with_types) + { + factory.registerSchemaReader(format_name, [with_names, with_types, ignore_spaces](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + { + return std::make_shared(buf, with_names, with_types, ignore_spaces, settings, context); + }); + }; + + registerWithNamesAndTypes(ignore_spaces ? 
"CustomSeparatedIgnoreSpaces" : "CustomSeparated", register_func); + } +} + } diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h index 6b572ca1417..d38d5bf0da4 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h @@ -19,7 +19,6 @@ public: void resetParser() override; String getName() const override { return "CustomSeparatedRowInputFormat"; } - void setReadBuffer(ReadBuffer & in_) override; private: @@ -28,6 +27,19 @@ private: std::unique_ptr in_buf_, const Params & params_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_settings_); + + bool allowSyncAfterError() const override; + void syncAfterError() override; + + std::unique_ptr buf; + bool ignore_spaces; +}; + +class CustomSeparatedFormatReader : public FormatWithNamesAndTypesReader +{ +public: + CustomSeparatedFormatReader(PeekableReadBuffer & buf_, bool ignore_spaces_, const FormatSettings & format_settings_); + using EscapingRule = FormatSettings::EscapingRule; bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override; @@ -46,9 +58,6 @@ private: bool checkForSuffix() override; - bool allowSyncAfterError() const override; - void syncAfterError() override; - bool parseRowStartWithDiagnosticInfo(WriteBuffer & out) override; bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override; bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override; @@ -57,15 +66,41 @@ private: std::vector readNames() override { return readHeaderRow(); } std::vector readTypes() override { return readHeaderRow(); } - std::vector readHeaderRow(); + std::vector readHeaderRow() {return readRowImpl(); } + + std::vector readRow() { return readRowImpl(); } bool checkEndOfRow(); bool checkForSuffixImpl(bool check_eof); inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf); } - std::unique_ptr buf; + EscapingRule getEscapingRule() { return format_settings.custom.escaping_rule; } + + void setReadBuffer(ReadBuffer & in_) override; +private: + template + std::vector readRowImpl(); + + template + String readFieldIntoString(bool is_first); + + PeekableReadBuffer * buf; bool ignore_spaces; - EscapingRule escaping_rule; + size_t columns = 0; +}; + +class CustomSeparatedSchemaReader : public FormatWithNamesAndTypesSchemaReader +{ +public: + CustomSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_, ContextPtr context_); + +private: + DataTypes readRowAndGetDataTypes() override; + + PeekableReadBuffer buf; + CustomSeparatedFormatReader reader; + ContextPtr context; + bool first_row = true; }; } diff --git a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp index 476985c2509..56ba975dea1 100644 --- a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp @@ -202,4 +202,12 @@ void registerNonTrivialPrefixAndSuffixCheckerJSONAsString(FormatFactory & factor factory.registerNonTrivialPrefixAndSuffixChecker("JSONAsString", nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); } +void registerJSONAsStringSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("JSONAsString", [](const FormatSettings &) + { 
+ return std::make_shared(); + }); +} + } diff --git a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h index d86142af795..ea6e9a1ed2f 100644 --- a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h @@ -1,8 +1,10 @@ #pragma once #include +#include #include #include +#include namespace DB { @@ -39,4 +41,13 @@ private: bool allow_new_rows = true; }; +class JSONAsStringExternalSchemaReader : public IExternalSchemaReader +{ +public: + NamesAndTypesList readSchema() override + { + return {{"json", std::make_shared()}}; + } +}; + } diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp index 88fb411ffbd..263702ad20f 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -8,16 +9,13 @@ #include #include #include +#include +#include +#include namespace DB { -namespace ErrorCodes -{ - extern const int INCORRECT_DATA; -} - - JSONCompactEachRowRowInputFormat::JSONCompactEachRowRowInputFormat( const Block & header_, ReadBuffer & in_, @@ -26,24 +24,40 @@ JSONCompactEachRowRowInputFormat::JSONCompactEachRowRowInputFormat( bool with_types_, bool yield_strings_, const FormatSettings & format_settings_) - : RowInputFormatWithNamesAndTypes(header_, in_, std::move(params_), with_names_, with_types_, format_settings_) - , yield_strings(yield_strings_) + : RowInputFormatWithNamesAndTypes( + header_, + in_, + std::move(params_), + with_names_, + with_types_, + format_settings_, + std::make_unique(in_, yield_strings_, format_settings_)) { } -void JSONCompactEachRowRowInputFormat::skipRowStartDelimiter() +void JSONCompactEachRowRowInputFormat::syncAfterError() +{ + skipToUnescapedNextLineOrEOF(*in); +} + +JSONCompactEachRowFormatReader::JSONCompactEachRowFormatReader(ReadBuffer & in_, bool yield_strings_, const FormatSettings & format_settings_) + : FormatWithNamesAndTypesReader(in_, format_settings_), yield_strings(yield_strings_) +{ +} + +void JSONCompactEachRowFormatReader::skipRowStartDelimiter() { skipWhitespaceIfAny(*in); assertChar('[', *in); } -void JSONCompactEachRowRowInputFormat::skipFieldDelimiter() +void JSONCompactEachRowFormatReader::skipFieldDelimiter() { skipWhitespaceIfAny(*in); assertChar(',', *in); } -void JSONCompactEachRowRowInputFormat::skipRowEndDelimiter() +void JSONCompactEachRowFormatReader::skipRowEndDelimiter() { skipWhitespaceIfAny(*in); assertChar(']', *in); @@ -55,29 +69,18 @@ void JSONCompactEachRowRowInputFormat::skipRowEndDelimiter() skipWhitespaceIfAny(*in); } -String JSONCompactEachRowRowInputFormat::readFieldIntoString() +void JSONCompactEachRowFormatReader::skipField() { skipWhitespaceIfAny(*in); - String field; - readJSONString(field, *in); - return field; + skipJSONField(*in, "skipped_field"); } -void JSONCompactEachRowRowInputFormat::skipField(size_t file_column) -{ - skipWhitespaceIfAny(*in); - skipJSONField(*in, column_mapping->names_of_columns[file_column]); -} - -void JSONCompactEachRowRowInputFormat::skipHeaderRow() +void JSONCompactEachRowFormatReader::skipHeaderRow() { skipRowStartDelimiter(); - size_t i = 0; do { - if (i >= column_mapping->names_of_columns.size()) - throw Exception(ErrorCodes::INCORRECT_DATA, "The number of columns in a row differs from the number of column 
names"); - skipField(i++); + skipField(); skipWhitespaceIfAny(*in); } while (checkChar(',', *in)); @@ -85,13 +88,16 @@ void JSONCompactEachRowRowInputFormat::skipHeaderRow() skipRowEndDelimiter(); } -std::vector JSONCompactEachRowRowInputFormat::readHeaderRow() +std::vector JSONCompactEachRowFormatReader::readHeaderRow() { skipRowStartDelimiter(); std::vector fields; + String field; do { - fields.push_back(readFieldIntoString()); + skipWhitespaceIfAny(*in); + readJSONString(field, *in); + fields.push_back(field); skipWhitespaceIfAny(*in); } while (checkChar(',', *in)); @@ -100,18 +106,13 @@ std::vector JSONCompactEachRowRowInputFormat::readHeaderRow() return fields; } -bool JSONCompactEachRowRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & column_name) +bool JSONCompactEachRowFormatReader::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & column_name) { skipWhitespaceIfAny(*in); return readFieldImpl(*in, column, type, serialization, column_name, format_settings, yield_strings); } -void JSONCompactEachRowRowInputFormat::syncAfterError() -{ - skipToUnescapedNextLineOrEOF(*in); -} - -bool JSONCompactEachRowRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuffer & out) +bool JSONCompactEachRowFormatReader::parseRowStartWithDiagnosticInfo(WriteBuffer & out) { skipWhitespaceIfAny(*in); if (!checkChar('[', *in)) @@ -123,7 +124,7 @@ bool JSONCompactEachRowRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuff return true; } -bool JSONCompactEachRowRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) +bool JSONCompactEachRowFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) { try { @@ -150,7 +151,7 @@ bool JSONCompactEachRowRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(Wri return true; } -bool JSONCompactEachRowRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) +bool JSONCompactEachRowFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) { skipWhitespaceIfAny(*in); @@ -180,6 +181,20 @@ bool JSONCompactEachRowRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer return true; } +JSONCompactEachRowRowSchemaReader::JSONCompactEachRowRowSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_) + : FormatWithNamesAndTypesSchemaReader(in_, format_settings_.max_rows_to_read_for_schema_inference, with_names_, with_types_, &reader), reader(in_, yield_strings_, format_settings_) +{ +} + +DataTypes JSONCompactEachRowRowSchemaReader::readRowAndGetDataTypes() +{ + skipWhitespaceIfAny(in); + if (in.eof()) + return {}; + + return readRowAndGetDataTypesForJSONCompactEachRow(in, reader.yieldStrings()); +} + void registerInputFormatJSONCompactEachRow(FormatFactory & factory) { for (bool yield_strings : {true, false}) @@ -200,6 +215,21 @@ void registerInputFormatJSONCompactEachRow(FormatFactory & factory) } } +void registerJSONCompactEachRowSchemaReader(FormatFactory & factory) +{ + for (bool json_strings : {false, true}) + { + auto register_func = [&](const String & format_name, bool with_names, bool with_types) + { + factory.registerSchemaReader(format_name, [=](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, with_names, with_types, json_strings, settings); + }); + }; + registerWithNamesAndTypes(json_strings ? 
"JSONCompactStringsEachRow" : "JSONCompactEachRow", register_func); + } +} + void registerFileSegmentationEngineJSONCompactEachRow(FormatFactory & factory) { auto register_func = [&](const String & format_name, bool with_names, bool with_types) diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h index e01a4f49b30..0551aa8b64e 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -10,6 +11,7 @@ namespace DB class ReadBuffer; + /** A stream for reading data in a bunch of formats: * - JSONCompactEachRow * - JSONCompactEachRowWithNamesAndTypes @@ -34,6 +36,13 @@ public: private: bool allowSyncAfterError() const override { return true; } void syncAfterError() override; +}; + +class JSONCompactEachRowFormatReader : public FormatWithNamesAndTypesReader +{ +public: + JSONCompactEachRowFormatReader(ReadBuffer & in_, bool yield_strings_, const FormatSettings & format_settings_); + bool parseRowStartWithDiagnosticInfo(WriteBuffer & out) override; bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override; @@ -45,7 +54,8 @@ private: bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override; - void skipField(size_t file_column) override; + void skipField(size_t /*column_index*/) override { skipField(); } + void skipField(); void skipHeaderRow(); void skipNames() override { skipHeaderRow(); } void skipTypes() override { skipHeaderRow(); } @@ -56,9 +66,21 @@ private: std::vector readHeaderRow(); std::vector readNames() override { return readHeaderRow(); } std::vector readTypes() override { return readHeaderRow(); } - String readFieldIntoString(); + bool yieldStrings() const { return yield_strings; } +private: bool yield_strings; }; +class JSONCompactEachRowRowSchemaReader : public FormatWithNamesAndTypesSchemaReader +{ +public: + JSONCompactEachRowRowSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_); + +private: + DataTypes readRowAndGetDataTypes() override; + + JSONCompactEachRowFormatReader reader; +}; + } diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp index 28481313974..75beca955b9 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB { @@ -286,11 +287,7 @@ void JSONEachRowRowInputFormat::readPrefix() skipBOMIfExists(*in); skipWhitespaceIfAny(*in); - if (!in->eof() && *in->position() == '[') - { - ++in->position(); - data_in_square_brackets = true; - } + data_in_square_brackets = checkChar('[', *in); } void JSONEachRowRowInputFormat::readSuffix() @@ -309,6 +306,28 @@ void JSONEachRowRowInputFormat::readSuffix() assertEOF(*in); } +JSONEachRowSchemaReader::JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings_, const FormatSettings & format_settings) + : IRowWithNamesSchemaReader(in_, format_settings.max_rows_to_read_for_schema_inference), json_strings(json_strings_) +{ +} + + +std::unordered_map JSONEachRowSchemaReader::readRowAndGetNamesAndDataTypes() +{ + if (first_row) + { + skipBOMIfExists(in); + skipWhitespaceIfAny(in); + 
checkChar('[', in); + first_row = false; + } + + skipWhitespaceIfAny(in); + if (in.eof()) + return {}; + + return readRowAndGetNamesAndDataTypesForJSONEachRow(in, json_strings); +} void registerInputFormatJSONEachRow(FormatFactory & factory) { @@ -343,4 +362,17 @@ void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory factory.registerNonTrivialPrefixAndSuffixChecker("JSONStringsEachRow", nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); } +void registerJSONEachRowSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("JSONEachRow", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_unique(buf, false, settings); + }); + + factory.registerSchemaReader("JSONStringsEachRow", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_unique(buf, true, settings); + }); +} + } diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h index 9810f2dc765..323909a7730 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -84,4 +85,16 @@ private: bool yield_strings; }; +class JSONEachRowSchemaReader : public IRowWithNamesSchemaReader +{ +public: + JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings, const FormatSettings & format_settings); + +private: + std::unordered_map readRowAndGetNamesAndDataTypes() override; + + bool json_strings; + bool first_row = true; +}; + } diff --git a/src/Processors/Formats/Impl/LineAsStringRowInputFormat.cpp b/src/Processors/Formats/Impl/LineAsStringRowInputFormat.cpp index 1a05f61d36b..5983f3170e5 100644 --- a/src/Processors/Formats/Impl/LineAsStringRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/LineAsStringRowInputFormat.cpp @@ -72,4 +72,13 @@ void registerInputFormatLineAsString(FormatFactory & factory) return std::make_shared(sample, buf, params); }); } + +void registerLineAsStringSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("LineAsString", []( + const FormatSettings &) + { + return std::make_shared(); + }); +} } diff --git a/src/Processors/Formats/Impl/LineAsStringRowInputFormat.h b/src/Processors/Formats/Impl/LineAsStringRowInputFormat.h index 1a6c6247558..c4c17c47dbe 100644 --- a/src/Processors/Formats/Impl/LineAsStringRowInputFormat.h +++ b/src/Processors/Formats/Impl/LineAsStringRowInputFormat.h @@ -1,7 +1,9 @@ #pragma once #include +#include #include +#include namespace DB { @@ -26,4 +28,13 @@ private: void readLineObject(IColumn & column); }; +class LinaAsStringSchemaReader : public IExternalSchemaReader +{ +public: + NamesAndTypesList readSchema() override + { + return {{"line", std::make_shared()}}; + } +}; + } diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp index 60db32d879a..c56af536e15 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -26,6 +27,8 @@ namespace ErrorCodes { extern const int ILLEGAL_COLUMN; extern const int INCORRECT_DATA; + extern const int BAD_ARGUMENTS; + extern const int UNEXPECTED_END_OF_FILE; } MsgPackRowInputFormat::MsgPackRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_) @@ -369,7 +372,108 @@ bool 
MsgPackRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & void MsgPackRowInputFormat::setReadBuffer(ReadBuffer & in_) { buf = std::make_unique(in_); - IInputFormat::setReadBuffer(*buf); + IInputFormat::setReadBuffer(in_); +} + +MsgPackSchemaReader::MsgPackSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) + : IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference), buf(in_), number_of_columns(format_settings_.msgpack.number_of_columns) +{ + if (!number_of_columns) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "You must specify setting input_format_msgpack_number_of_columns to extract table schema from MsgPack data"); +} + + +msgpack::object_handle MsgPackSchemaReader::readObject() +{ + if (buf.eof()) + throw Exception(ErrorCodes::UNEXPECTED_END_OF_FILE, "Unexpected eof while parsing msgpack object"); + + PeekableReadBufferCheckpoint checkpoint{buf}; + size_t offset = 0; + bool need_more_data = true; + msgpack::object_handle object_handle; + while (need_more_data) + { + offset = 0; + try + { + object_handle = msgpack::unpack(buf.position(), buf.buffer().end() - buf.position(), offset); + need_more_data = false; + } + catch (msgpack::insufficient_bytes &) + { + buf.position() = buf.buffer().end(); + if (buf.eof()) + throw Exception("Unexpected end of file while parsing msgpack object", ErrorCodes::UNEXPECTED_END_OF_FILE); + buf.position() = buf.buffer().end(); + buf.makeContinuousMemoryFromCheckpointToPos(); + buf.rollbackToCheckpoint(); + } + } + buf.position() += offset; + return object_handle; +} + +DataTypePtr MsgPackSchemaReader::getDataType(const msgpack::object & object) +{ + switch (object.type) + { + case msgpack::type::object_type::POSITIVE_INTEGER: [[fallthrough]]; + case msgpack::type::object_type::NEGATIVE_INTEGER: + return makeNullable(std::make_shared()); + case msgpack::type::object_type::FLOAT32: + return makeNullable(std::make_shared()); + case msgpack::type::object_type::FLOAT64: + return makeNullable(std::make_shared()); + case msgpack::type::object_type::BOOLEAN: + return makeNullable(std::make_shared()); + case msgpack::type::object_type::BIN: [[fallthrough]]; + case msgpack::type::object_type::STR: + return makeNullable(std::make_shared()); + case msgpack::type::object_type::ARRAY: + { + msgpack::object_array object_array = object.via.array; + if (object_array.size) + { + auto nested_type = getDataType(object_array.ptr[0]); + if (nested_type) + return std::make_shared(getDataType(object_array.ptr[0])); + } + return nullptr; + } + case msgpack::type::object_type::MAP: + { + msgpack::object_map object_map = object.via.map; + if (object_map.size) + { + auto key_type = removeNullable(getDataType(object_map.ptr[0].key)); + auto value_type = getDataType(object_map.ptr[0].val); + if (key_type && value_type) + return std::make_shared(key_type, value_type); + } + return nullptr; + } + case msgpack::type::object_type::NIL: + return nullptr; + default: + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Msgpack type is not supported"); + } +} + +DataTypes MsgPackSchemaReader::readRowAndGetDataTypes() +{ + if (buf.eof()) + return {}; + + DataTypes data_types; + data_types.reserve(number_of_columns); + for (size_t i = 0; i != number_of_columns; ++i) + { + auto object_handle = readObject(); + data_types.push_back(getDataType(object_handle.get())); + } + + return data_types; } void registerInputFormatMsgPack(FormatFactory & factory) @@ -384,6 +488,14 @@ void registerInputFormatMsgPack(FormatFactory & factory) }); } +void 
registerMsgPackSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("MsgPack", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, settings); + }); +} + } #else @@ -394,6 +506,10 @@ class FormatFactory; void registerInputFormatMsgPack(FormatFactory &) { } + +void registerMsgPackSchemaReader(FormatFactory &) +{ +} } #endif diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.h b/src/Processors/Formats/Impl/MsgPackRowInputFormat.h index bb3887695eb..dd5655c80fc 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.h +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.h @@ -6,6 +6,7 @@ #if USE_MSGPACK #include +#include #include #include #include @@ -76,6 +77,20 @@ private: const DataTypes data_types; }; +class MsgPackSchemaReader : public IRowSchemaReader +{ +public: + MsgPackSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + +private: + msgpack::object_handle readObject(); + DataTypePtr getDataType(const msgpack::object & object); + DataTypes readRowAndGetDataTypes() override; + + PeekableReadBuffer buf; + UInt64 number_of_columns; +}; + } #endif diff --git a/src/Processors/Formats/Impl/NativeFormat.cpp b/src/Processors/Formats/Impl/NativeFormat.cpp index 07cf4670981..19e2ede6b65 100644 --- a/src/Processors/Formats/Impl/NativeFormat.cpp +++ b/src/Processors/Formats/Impl/NativeFormat.cpp @@ -1,8 +1,10 @@ #include #include + #include #include #include +#include #include @@ -82,6 +84,20 @@ private: NativeWriter writer; }; +class NativeSchemaReader : public ISchemaReader +{ +public: + explicit NativeSchemaReader(ReadBuffer & in_) : ISchemaReader(in_) {} + + NamesAndTypesList readSchema() override + { + auto reader = NativeReader(in, 0); + auto block = reader.read(); + return block.getNamesAndTypesList(); + } +}; + + void registerInputFormatNative(FormatFactory & factory) { factory.registerInputFormat("Native", []( @@ -106,4 +122,14 @@ void registerOutputFormatNative(FormatFactory & factory) }); } + +void registerNativeSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("Native", [](ReadBuffer & buf, const FormatSettings &, ContextPtr) + { + return std::make_shared(buf); + }); +} + + } diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp index bac7f25a8e9..9a787e5a614 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp @@ -30,6 +30,9 @@ Chunk ORCBlockInputFormat::generate() if (!file_reader) prepareReader(); + if (is_stopped) + return {}; + std::shared_ptr batch_reader; auto result = file_reader->NextStripeReader(format_settings.orc.row_batch_size, include_indices); if (!result.ok()) @@ -84,9 +87,18 @@ static size_t countIndicesForType(std::shared_ptr type) return 1; } -void ORCBlockInputFormat::prepareReader() +static void getFileReaderAndSchema( + ReadBuffer & in, + std::unique_ptr & file_reader, + std::shared_ptr & schema, + const FormatSettings & format_settings, + std::atomic & is_stopped) { - auto result = arrow::adapters::orc::ORCFileReader::Open(asArrowFile(*in, format_settings), arrow::default_memory_pool()); + auto arrow_file = asArrowFile(in, format_settings, is_stopped); + if (is_stopped) + return; + + auto result = arrow::adapters::orc::ORCFileReader::Open(std::move(arrow_file), arrow::default_memory_pool()); if (!result.ok()) throw Exception(result.status().ToString(), ErrorCodes::BAD_ARGUMENTS); file_reader = 
std::move(result).ValueOrDie(); @@ -94,7 +106,15 @@ void ORCBlockInputFormat::prepareReader() auto read_schema_result = file_reader->ReadSchema(); if (!read_schema_result.ok()) throw Exception(read_schema_result.status().ToString(), ErrorCodes::BAD_ARGUMENTS); - std::shared_ptr schema = std::move(read_schema_result).ValueOrDie(); + schema = std::move(read_schema_result).ValueOrDie(); +} + +void ORCBlockInputFormat::prepareReader() +{ + std::shared_ptr schema; + getFileReaderAndSchema(*in, file_reader, schema, format_settings, is_stopped); + if (is_stopped) + return; arrow_column_to_ch_column = std::make_unique(getPort().getHeader(), "ORC", format_settings.orc.import_nested); @@ -121,7 +141,21 @@ void ORCBlockInputFormat::prepareReader() } } -void registerInputFormatORC(FormatFactory &factory) +ORCSchemaReader::ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : ISchemaReader(in_), format_settings(format_settings_) +{ +} + +NamesAndTypesList ORCSchemaReader::readSchema() +{ + std::unique_ptr file_reader; + std::shared_ptr schema; + std::atomic is_stopped = 0; + getFileReaderAndSchema(in, file_reader, schema, format_settings, is_stopped); + auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, "ORC"); + return header.getNamesAndTypesList(); +} + +void registerInputFormatORC(FormatFactory & factory) { factory.registerInputFormat( "ORC", @@ -135,6 +169,17 @@ void registerInputFormatORC(FormatFactory &factory) factory.markFormatAsColumnOriented("ORC"); } +void registerORCSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader( + "ORC", + [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, settings); + } + ); +} + } #else @@ -144,6 +189,10 @@ namespace DB void registerInputFormatORC(FormatFactory &) { } + + void registerORCSchemaReader(FormatFactory &) + { + } } #endif diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.h b/src/Processors/Formats/Impl/ORCBlockInputFormat.h index 639aaee73bb..9b55747f552 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.h @@ -3,6 +3,7 @@ #if USE_ORC #include +#include #include #include @@ -29,6 +30,11 @@ public: protected: Chunk generate() override; + void onCancel() override + { + is_stopped = 1; + } + private: // TODO: check that this class implements every part of its parent @@ -45,6 +51,19 @@ private: const FormatSettings format_settings; void prepareReader(); + + std::atomic is_stopped{0}; +}; + +class ORCSchemaReader : public ISchemaReader +{ +public: + ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + + NamesAndTypesList readSchema() override; + +private: + const FormatSettings format_settings; }; } diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp index 4c8f6ab2c54..651b9545c81 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp @@ -87,6 +87,7 @@ ORC_UNIQUE_PTR ORCBlockOutputFormat::getORCType(const DataTypePtr & t { return orc::createPrimitiveType(orc::TypeKind::DOUBLE); } + case TypeIndex::Date32: [[fallthrough]]; case TypeIndex::Date: { return orc::createPrimitiveType(orc::TypeKind::DATE); @@ -292,6 +293,7 @@ void ORCBlockOutputFormat::writeColumn( writeNumbers(orc_column, column, null_bytemap, [](const UInt16 & value){ return value; }); break; } + case TypeIndex::Date32: [[fallthrough]]; case 
TypeIndex::Int32: { writeNumbers(orc_column, column, null_bytemap, [](const Int32 & value){ return value; }); diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 901531d81cf..1d303014d31 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -44,6 +44,9 @@ Chunk ParquetBlockInputFormat::generate() if (!file_reader) prepareReader(); + if (is_stopped) + return {}; + if (row_group_current >= row_group_total) return res; @@ -91,15 +94,30 @@ static size_t countIndicesForType(std::shared_ptr type) return 1; } +static void getFileReaderAndSchema( + ReadBuffer & in, + std::unique_ptr & file_reader, + std::shared_ptr & schema, + const FormatSettings & format_settings, + std::atomic & is_stopped) +{ + auto arrow_file = asArrowFile(in, format_settings, is_stopped); + if (is_stopped) + return; + THROW_ARROW_NOT_OK(parquet::arrow::OpenFile(std::move(arrow_file), arrow::default_memory_pool(), &file_reader)); + THROW_ARROW_NOT_OK(file_reader->GetSchema(&schema)); +} + void ParquetBlockInputFormat::prepareReader() { - THROW_ARROW_NOT_OK(parquet::arrow::OpenFile(asArrowFile(*in, format_settings), arrow::default_memory_pool(), &file_reader)); + std::shared_ptr schema; + getFileReaderAndSchema(*in, file_reader, schema, format_settings, is_stopped); + if (is_stopped) + return; + row_group_total = file_reader->num_row_groups(); row_group_current = 0; - std::shared_ptr schema; - THROW_ARROW_NOT_OK(file_reader->GetSchema(&schema)); - arrow_column_to_ch_column = std::make_unique(getPort().getHeader(), "Parquet", format_settings.parquet.import_nested); std::unordered_set nested_table_names; @@ -123,7 +141,21 @@ void ParquetBlockInputFormat::prepareReader() } } -void registerInputFormatParquet(FormatFactory &factory) +ParquetSchemaReader::ParquetSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : ISchemaReader(in_), format_settings(format_settings_) +{ +} + +NamesAndTypesList ParquetSchemaReader::readSchema() +{ + std::unique_ptr file_reader; + std::shared_ptr schema; + std::atomic is_stopped = 0; + getFileReaderAndSchema(in, file_reader, schema, format_settings, is_stopped); + auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, "Parquet"); + return header.getNamesAndTypesList(); +} + +void registerInputFormatParquet(FormatFactory & factory) { factory.registerInputFormat( "Parquet", @@ -137,6 +169,17 @@ void registerInputFormatParquet(FormatFactory &factory) factory.markFormatAsColumnOriented("Parquet"); } +void registerParquetSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader( + "Parquet", + [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, settings); + } + ); +} + } #else @@ -147,6 +190,8 @@ class FormatFactory; void registerInputFormatParquet(FormatFactory &) { } + +void registerParquetSchemaReader(FormatFactory &) {} } #endif diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h index 472aec66da3..dbc99c08a35 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h @@ -3,6 +3,7 @@ #if USE_PARQUET #include +#include #include namespace parquet::arrow { class FileReader; } @@ -28,6 +29,11 @@ private: void prepareReader(); + void onCancel() override + { + is_stopped = 1; + } + std::unique_ptr file_reader; int 
row_group_total = 0; // indices of columns to read from Parquet file @@ -35,6 +41,19 @@ private: std::unique_ptr arrow_column_to_ch_column; int row_group_current = 0; const FormatSettings format_settings; + + std::atomic is_stopped{0}; +}; + +class ParquetSchemaReader : public ISchemaReader +{ +public: + ParquetSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + + NamesAndTypesList readSchema() override; + +private: + const FormatSettings format_settings; }; } diff --git a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp index df7b7102739..66da27e8829 100644 --- a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp @@ -73,6 +73,34 @@ void registerInputFormatProtobuf(FormatFactory & factory) } } +ProtobufSchemaReader::ProtobufSchemaReader(const FormatSettings & format_settings) + : schema_info( + format_settings.schema.format_schema, + "Protobuf", + true, + format_settings.schema.is_server, + format_settings.schema.format_schema_path) +{ +} + +NamesAndTypesList ProtobufSchemaReader::readSchema() +{ + const auto * message_descriptor = ProtobufSchemas::instance().getMessageTypeForFormatSchema(schema_info); + return protobufSchemaToCHSchema(message_descriptor); +} + +void registerProtobufSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("Protobuf", [](const FormatSettings & settings) + { + return std::make_shared(settings); + }); + factory.registerExternalSchemaReader("ProtobufSingle", [](const FormatSettings & settings) + { + return std::make_shared(settings); + }); +} + } #else @@ -81,6 +109,8 @@ namespace DB { class FormatFactory; void registerInputFormatProtobuf(FormatFactory &) {} + +void registerProtobufSchemaReader(FormatFactory &) {} } #endif diff --git a/src/Processors/Formats/Impl/ProtobufRowInputFormat.h b/src/Processors/Formats/Impl/ProtobufRowInputFormat.h index 6f465e3f0b8..d7d16d36ddf 100644 --- a/src/Processors/Formats/Impl/ProtobufRowInputFormat.h +++ b/src/Processors/Formats/Impl/ProtobufRowInputFormat.h @@ -3,7 +3,9 @@ #include "config_formats.h" #if USE_PROTOBUF +# include # include +# include namespace DB { @@ -42,5 +44,16 @@ private: std::unique_ptr serializer; }; +class ProtobufSchemaReader : public IExternalSchemaReader +{ +public: + explicit ProtobufSchemaReader(const FormatSettings & format_settings); + + NamesAndTypesList readSchema() override; + +private: + FormatSchemaInfo schema_info; +}; + } #endif diff --git a/src/Processors/Formats/Impl/RawBLOBRowInputFormat.cpp b/src/Processors/Formats/Impl/RawBLOBRowInputFormat.cpp index 34424fffd34..91b1cc60fae 100644 --- a/src/Processors/Formats/Impl/RawBLOBRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/RawBLOBRowInputFormat.cpp @@ -51,5 +51,14 @@ void registerInputFormatRawBLOB(FormatFactory & factory) }); } +void registerRawBLOBSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("RawBLOB", []( + const FormatSettings &) + { + return std::make_shared(); + }); +} + } diff --git a/src/Processors/Formats/Impl/RawBLOBRowInputFormat.h b/src/Processors/Formats/Impl/RawBLOBRowInputFormat.h index 343af9f4068..367ca04f9d8 100644 --- a/src/Processors/Formats/Impl/RawBLOBRowInputFormat.h +++ b/src/Processors/Formats/Impl/RawBLOBRowInputFormat.h @@ -1,6 +1,8 @@ #pragma once #include +#include +#include namespace DB @@ -22,5 +24,14 @@ private: bool readRow(MutableColumns & columns, RowReadExtension &) override; }; +class 
RawBLOBSchemaReader: public IExternalSchemaReader +{ +public: + NamesAndTypesList readSchema() override + { + return {{"raw_blob", std::make_shared()}}; + } +}; + } diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp index 279ae89aba5..90db6f6f0ec 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp @@ -14,18 +14,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -RegexpRowInputFormat::RegexpRowInputFormat(ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_) - : RegexpRowInputFormat(std::make_unique(in_), header_, params_, format_settings_) -{ -} - -RegexpRowInputFormat::RegexpRowInputFormat( - std::unique_ptr buf_, const Block & header_, Params params_, const FormatSettings & format_settings_) - : IRowInputFormat(header_, *buf_, std::move(params_)) - , buf(std::move(buf_)) - , format_settings(format_settings_) - , escaping_rule(format_settings_.regexp.escaping_rule) - , regexp(format_settings_.regexp.regexp) +RegexpFieldExtractor::RegexpFieldExtractor(const FormatSettings & format_settings) : regexp(format_settings.regexp.regexp), skip_unmatched(format_settings.regexp.skip_unmatched) { size_t fields_count = regexp.NumberOfCapturingGroups(); matched_fields.resize(fields_count); @@ -40,6 +29,50 @@ RegexpRowInputFormat::RegexpRowInputFormat( } } +bool RegexpFieldExtractor::parseRow(PeekableReadBuffer & buf) +{ + PeekableReadBufferCheckpoint checkpoint{buf}; + + size_t line_size = 0; + + do + { + char * pos = find_first_symbols<'\n', '\r'>(buf.position(), buf.buffer().end()); + line_size += pos - buf.position(); + buf.position() = pos; + } while (buf.position() == buf.buffer().end() && !buf.eof()); + + buf.makeContinuousMemoryFromCheckpointToPos(); + buf.rollbackToCheckpoint(); + + bool match = RE2::FullMatchN(re2::StringPiece(buf.position(), line_size), regexp, re2_arguments_ptrs.data(), re2_arguments_ptrs.size()); + + if (!match && !skip_unmatched) + throw Exception("Line \"" + std::string(buf.position(), line_size) + "\" doesn't match the regexp.", ErrorCodes::INCORRECT_DATA); + + buf.position() += line_size; + checkChar('\r', buf); + if (!buf.eof() && !checkChar('\n', buf)) + throw Exception("No \\n after \\r at the end of line.", ErrorCodes::INCORRECT_DATA); + + return match; +} + +RegexpRowInputFormat::RegexpRowInputFormat( + ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_) + : RegexpRowInputFormat(std::make_unique(in_), header_, params_, format_settings_) +{ +} + +RegexpRowInputFormat::RegexpRowInputFormat( + std::unique_ptr buf_, const Block & header_, Params params_, const FormatSettings & format_settings_) + : IRowInputFormat(header_, *buf_, std::move(params_)) + , buf(std::move(buf_)) + , format_settings(format_settings_) + , escaping_rule(format_settings_.regexp.escaping_rule) + , field_extractor(RegexpFieldExtractor(format_settings_)) +{ +} void RegexpRowInputFormat::resetParser() { @@ -50,7 +83,8 @@ void RegexpRowInputFormat::resetParser() bool RegexpRowInputFormat::readField(size_t index, MutableColumns & columns) { const auto & type = getPort().getHeader().getByPosition(index).type; - ReadBuffer field_buf(const_cast(matched_fields[index].data()), matched_fields[index].size(), 0); + auto matched_field = field_extractor.getField(index); + ReadBuffer field_buf(const_cast(matched_field.data()), matched_field.size(), 0); try { return 
deserializeFieldByEscapingRule(type, serializations[index], *columns[index], field_buf, escaping_rule, format_settings); @@ -64,7 +98,7 @@ bool RegexpRowInputFormat::readField(size_t index, MutableColumns & columns) void RegexpRowInputFormat::readFieldsFromMatch(MutableColumns & columns, RowReadExtension & ext) { - if (matched_fields.size() != columns.size()) + if (field_extractor.getMatchedFieldsSize() != columns.size()) throw Exception("The number of matched fields in line doesn't match the number of columns.", ErrorCodes::INCORRECT_DATA); ext.read_columns.assign(columns.size(), false); @@ -79,39 +113,8 @@ bool RegexpRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & if (buf->eof()) return false; - PeekableReadBufferCheckpoint checkpoint{*buf}; - - size_t line_size = 0; - - do - { - char * pos = find_first_symbols<'\n', '\r'>(buf->position(), buf->buffer().end()); - line_size += pos - buf->position(); - buf->position() = pos; - } while (buf->position() == buf->buffer().end() && !buf->eof()); - - buf->makeContinuousMemoryFromCheckpointToPos(); - buf->rollbackToCheckpoint(); - - bool match = RE2::FullMatchN(re2::StringPiece(buf->position(), line_size), regexp, re2_arguments_ptrs.data(), re2_arguments_ptrs.size()); - bool read_line = true; - - if (!match) - { - if (!format_settings.regexp.skip_unmatched) - throw Exception("Line \"" + std::string(buf->position(), line_size) + "\" doesn't match the regexp.", ErrorCodes::INCORRECT_DATA); - read_line = false; - } - - if (read_line) + if (field_extractor.parseRow(*buf)) readFieldsFromMatch(columns, ext); - - buf->position() += line_size; - - checkChar('\r', *buf); - if (!buf->eof() && !checkChar('\n', *buf)) - throw Exception("No \\n after \\r at the end of line.", ErrorCodes::INCORRECT_DATA); - return true; } @@ -121,6 +124,36 @@ void RegexpRowInputFormat::setReadBuffer(ReadBuffer & in_) IInputFormat::setReadBuffer(*buf); } +RegexpSchemaReader::RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, ContextPtr context_) + : IRowSchemaReader( + buf, + format_settings_.max_rows_to_read_for_schema_inference, + getDefaultDataTypeForEscapingRule(format_settings_.regexp.escaping_rule)) + , format_settings(format_settings_) + , field_extractor(format_settings) + , buf(in_) + , context(context_) +{ +} + +DataTypes RegexpSchemaReader::readRowAndGetDataTypes() +{ + if (buf.eof()) + return {}; + + field_extractor.parseRow(buf); + + DataTypes data_types; + data_types.reserve(field_extractor.getMatchedFieldsSize()); + for (size_t i = 0; i != field_extractor.getMatchedFieldsSize(); ++i) + { + String field(field_extractor.getField(i)); + data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, format_settings.regexp.escaping_rule, context)); + } + + return data_types; +} + void registerInputFormatRegexp(FormatFactory & factory) { factory.registerInputFormat("Regexp", []( @@ -172,4 +205,12 @@ void registerFileSegmentationEngineRegexp(FormatFactory & factory) factory.registerFileSegmentationEngine("Regexp", &fileSegmentationEngineRegexpImpl); } +void registerRegexpSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("Regexp", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + { + return std::make_shared(buf, settings, context); + }); +} + } diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.h b/src/Processors/Formats/Impl/RegexpRowInputFormat.h index dbce31a9b49..dffd2f82e02 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.h +++ 
b/src/Processors/Formats/Impl/RegexpRowInputFormat.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,29 @@ namespace DB class ReadBuffer; +/// Class for extracting row fields from data by regexp. +class RegexpFieldExtractor +{ +public: + RegexpFieldExtractor(const FormatSettings & format_settings); + + /// Return true if row was successfully parsed and row fields were extracted. + bool parseRow(PeekableReadBuffer & buf); + + re2::StringPiece getField(size_t index) { return matched_fields[index]; } + size_t getMatchedFieldsSize() const { return matched_fields.size(); } + size_t getNumberOfGroups() const { return regexp.NumberOfCapturingGroups(); } + +private: + const RE2 regexp; + // The vector of fields extracted from line using regexp. + std::vector matched_fields; + // These two vectors are needed to use RE2::FullMatchN (function for extracting fields). + std::vector re2_arguments; + std::vector re2_arguments_ptrs; + bool skip_unmatched; +}; + /// Regexp input format. /// This format applies regular expression from format_regexp setting for every line of file /// (the lines must be separated by newline character ('\n') or DOS-style newline ("\r\n")). @@ -25,7 +49,6 @@ class ReadBuffer; class RegexpRowInputFormat : public IRowInputFormat { - using EscapingRule = FormatSettings::EscapingRule; public: RegexpRowInputFormat(ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_); @@ -36,6 +59,8 @@ public: private: RegexpRowInputFormat(std::unique_ptr buf_, const Block & header_, Params params_, const FormatSettings & format_settings_); + using EscapingRule = FormatSettings::EscapingRule; + bool readRow(MutableColumns & columns, RowReadExtension & ext) override; bool readField(size_t index, MutableColumns & columns); @@ -44,13 +69,22 @@ private: std::unique_ptr buf; const FormatSettings format_settings; const EscapingRule escaping_rule; + RegexpFieldExtractor field_extractor; +}; - const RE2 regexp; - // The vector of fields extracted from line using regexp. - std::vector matched_fields; - // These two vectors are needed to use RE2::FullMatchN (function for extracting fields). 
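
A short aside on the RE2 plumbing above: RE2::FullMatchN needs one RE2::Arg per capturing group, which is why RegexpFieldExtractor keeps the matched_fields / re2_arguments / re2_arguments_ptrs triple pre-allocated. A minimal standalone sketch of that wiring follows; the function and variable names here are illustrative and not part of the patch.

#include <re2/re2.h>
#include <re2/stringpiece.h>
#include <vector>

/// Extract every capturing group of `regexp` from `line` into `fields`.
/// Each Arg points at one StringPiece slot; FullMatchN fills the slots on a successful match.
static bool extractFields(const RE2 & regexp, re2::StringPiece line, std::vector<re2::StringPiece> & fields)
{
    size_t fields_count = regexp.NumberOfCapturingGroups();
    fields.resize(fields_count);
    std::vector<RE2::Arg> args(fields_count);
    std::vector<RE2::Arg *> args_ptrs(fields_count);
    for (size_t i = 0; i != fields_count; ++i)
    {
        args[i] = &fields[i];
        args_ptrs[i] = &args[i];
    }
    return RE2::FullMatchN(line, regexp, args_ptrs.data(), args_ptrs.size());
}
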
- std::vector re2_arguments; - std::vector re2_arguments_ptrs; +class RegexpSchemaReader : public IRowSchemaReader +{ +public: + RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, ContextPtr context_); + +private: + DataTypes readRowAndGetDataTypes() override; + + using EscapingRule = FormatSettings::EscapingRule; + const FormatSettings format_settings; + RegexpFieldExtractor field_extractor; + PeekableReadBuffer buf; + ContextPtr context; }; } diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp index eef97e15dd5..8a56c2ed5c7 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp @@ -1,7 +1,10 @@ #include #include #include +#include #include +#include +#include namespace DB @@ -211,6 +214,59 @@ void TSKVRowInputFormat::resetParser() name_buf.clear(); } +TSKVSchemaReader::TSKVSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) + : IRowWithNamesSchemaReader( + in_, + format_settings_.max_rows_to_read_for_schema_inference, + getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::Escaped)) + , format_settings(format_settings_) +{ +} + +std::unordered_map TSKVSchemaReader::readRowAndGetNamesAndDataTypes() +{ + if (first_row) + { + skipBOMIfExists(in); + first_row = false; + } + + if (in.eof()) + return {}; + + if (*in.position() == '\n') + { + ++in.position(); + return {}; + } + + std::unordered_map names_and_types; + StringRef name_ref; + String name_tmp; + String value; + do + { + bool has_value = readName(in, name_ref, name_tmp); + if (has_value) + { + readEscapedString(value, in); + names_and_types[String(name_ref)] = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Escaped); + } + else + { + /// The only thing that can go without value is `tskv` fragment that is ignored. + if (!(name_ref.size == 4 && 0 == memcmp(name_ref.data, "tskv", 4))) + throw Exception("Found field without value while parsing TSKV format: " + name_ref.toString(), ErrorCodes::INCORRECT_DATA); + } + + } + while (checkChar('\t', in)); + + assertChar('\n', in); + + return names_and_types; +} + void registerInputFormatTSKV(FormatFactory & factory) { factory.registerInputFormat("TSKV", []( @@ -222,5 +278,12 @@ void registerInputFormatTSKV(FormatFactory & factory) return std::make_shared(buf, sample, std::move(params), settings); }); } +void registerTSKVSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("TSKV", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, settings); + }); +} } diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.h b/src/Processors/Formats/Impl/TSKVRowInputFormat.h index 7d732bae691..6aef50a0f84 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.h +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -52,4 +53,16 @@ private: /// for row like ..., non-nullable column name=\N, ... 
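
To make the TSKV schema inference above concrete, a worked example in comment form; the sample row and the inferred numeric type are assumptions for illustration only.

/// Illustrative TSKV row (name=value pairs separated by '\t'; the bare "tskv" fragment is ignored):
///     id=42\tmsg=hello\ttskv\n
/// For such a row readRowAndGetNamesAndDataTypes() produces a name -> type map along the lines of
///     {"id", <integer type inferred from "42", assumed Int64 here>}, {"msg", String}
/// and IRowWithNamesSchemaReader then merges the per-row results over up to
/// max_rows_to_read_for_schema_inference rows.
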
}; +class TSKVSchemaReader : public IRowWithNamesSchemaReader +{ +public: + TSKVSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + +private: + std::unordered_map readRowAndGetNamesAndDataTypes() override; + + const FormatSettings format_settings; + bool first_row = true; +}; + } diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index 1e6d238b202..bb844ec68ea 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -1,13 +1,15 @@ #include #include -#include -#include -#include -#include -#include -#include +#include +#include #include +#include +#include +#include +#include +#include +#include namespace DB { @@ -38,40 +40,50 @@ TabSeparatedRowInputFormat::TabSeparatedRowInputFormat( bool with_types_, bool is_raw_, const FormatSettings & format_settings_) - : RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_), is_raw(is_raw_) + : RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_, std::make_unique(in_, format_settings_, is_raw_)) { } -void TabSeparatedRowInputFormat::skipFieldDelimiter() +TabSeparatedFormatReader::TabSeparatedFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_, bool is_raw_) + : FormatWithNamesAndTypesReader(in_, format_settings_), is_raw(is_raw_) +{ +} + +void TabSeparatedFormatReader::skipFieldDelimiter() { assertChar('\t', *in); } -void TabSeparatedRowInputFormat::skipRowEndDelimiter() +void TabSeparatedFormatReader::skipRowEndDelimiter() { if (in->eof()) return; - if (unlikely(row_num <= 1)) + if (unlikely(first_row)) + { checkForCarriageReturn(*in); + first_row = false; + } assertChar('\n', *in); } -String TabSeparatedRowInputFormat::readFieldIntoString() +String TabSeparatedFormatReader::readFieldIntoString() { String field; - readEscapedString(field, *in); + if (is_raw) + readString(field, *in); + else + readEscapedString(field, *in); return field; } -void TabSeparatedRowInputFormat::skipField() +void TabSeparatedFormatReader::skipField() { - NullOutput null_sink; - readEscapedStringInto(null_sink, *in); + readFieldIntoString(); } -void TabSeparatedRowInputFormat::skipHeaderRow() +void TabSeparatedFormatReader::skipHeaderRow() { do { @@ -82,7 +94,7 @@ void TabSeparatedRowInputFormat::skipHeaderRow() skipRowEndDelimiter(); } -std::vector TabSeparatedRowInputFormat::readHeaderRow() +std::vector TabSeparatedFormatReader::readRow() { std::vector fields; do @@ -95,7 +107,7 @@ std::vector TabSeparatedRowInputFormat::readHeaderRow() return fields; } -bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & type, +bool TabSeparatedFormatReader::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & /*column_name*/) { const bool at_delimiter = !is_last_file_column && !in->eof() && *in->position() == '\t'; @@ -118,6 +130,7 @@ bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & return true; } + if (as_nullable) return SerializationNullable::deserializeTextEscapedImpl(column, *in, format_settings, serialization); @@ -125,7 +138,7 @@ bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & return true; } -bool TabSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) +bool 
TabSeparatedFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) { try { @@ -156,7 +169,7 @@ bool TabSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuff return true; } -bool TabSeparatedRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) +bool TabSeparatedFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) { if (in->eof()) return true; @@ -190,7 +203,7 @@ bool TabSeparatedRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out return true; } -void TabSeparatedRowInputFormat::checkNullValueForNonNullable(DataTypePtr type) +void TabSeparatedFormatReader::checkNullValueForNonNullable(DataTypePtr type) { bool can_be_parsed_as_null = type->isNullable() || type->isLowCardinalityNullable() || format_settings.null_as_default; @@ -218,6 +231,28 @@ void TabSeparatedRowInputFormat::syncAfterError() skipToUnescapedNextLineOrEOF(*in); } +TabSeparatedSchemaReader::TabSeparatedSchemaReader( + ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings_) + : FormatWithNamesAndTypesSchemaReader( + in_, + format_settings_.max_rows_to_read_for_schema_inference, + with_names_, + with_types_, + &reader, + getDefaultDataTypeForEscapingRule(is_raw_ ? FormatSettings::EscapingRule::Raw : FormatSettings::EscapingRule::Escaped)) + , reader(in_, format_settings_, is_raw_) +{ +} + +DataTypes TabSeparatedSchemaReader::readRowAndGetDataTypes() +{ + if (in.eof()) + return {}; + + auto fields = reader.readRow(); + return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule()); +} + void registerInputFormatTabSeparated(FormatFactory & factory) { for (bool is_raw : {false, true}) @@ -239,6 +274,23 @@ void registerInputFormatTabSeparated(FormatFactory & factory) } } +void registerTSVSchemaReader(FormatFactory & factory) +{ + for (bool is_raw : {false, true}) + { + auto register_func = [&](const String & format_name, bool with_names, bool with_types) + { + factory.registerSchemaReader(format_name, [with_names, with_types, is_raw](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, with_names, with_types, is_raw, settings); + }); + }; + + registerWithNamesAndTypes(is_raw ? "TabSeparatedRaw" : "TabSeparated", register_func); + registerWithNamesAndTypes(is_raw ? 
"TSVRaw" : "TSV", register_func); + } +} + static std::pair fileSegmentationEngineTabSeparatedImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, bool is_raw, size_t min_rows) { bool need_more_data = true; diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h index 6e2e283e792..1f2bfc255b8 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB @@ -24,6 +25,13 @@ public: private: bool allowSyncAfterError() const override { return true; } void syncAfterError() override; + bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; } +}; + +class TabSeparatedFormatReader : public FormatWithNamesAndTypesReader +{ +public: + TabSeparatedFormatReader(ReadBuffer & in_, const FormatSettings & format_settings, bool is_raw_); bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override; @@ -36,18 +44,34 @@ private: void skipFieldDelimiter() override; void skipRowEndDelimiter() override; - std::vector readHeaderRow(); - std::vector readNames() override { return readHeaderRow(); } - std::vector readTypes() override { return readHeaderRow(); } + std::vector readRow(); + std::vector readNames() override { return readRow(); } + std::vector readTypes() override { return readRow(); } String readFieldIntoString(); void checkNullValueForNonNullable(DataTypePtr type) override; bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override; bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override; - bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; } + FormatSettings::EscapingRule getEscapingRule() + { + return is_raw ? FormatSettings::EscapingRule::Raw : FormatSettings::EscapingRule::Escaped; + } +private: bool is_raw; + bool first_row = true; +}; + +class TabSeparatedSchemaReader : public FormatWithNamesAndTypesSchemaReader +{ +public: + TabSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings); + +private: + DataTypes readRowAndGetDataTypes() override; + + TabSeparatedFormatReader reader; }; } diff --git a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp index 5d87f5a0b14..03a3ea99b28 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp @@ -22,7 +22,10 @@ void TabSeparatedRowOutputFormat::writeLine(const std::vector & values) { for (size_t i = 0; i < values.size(); ++i) { - writeEscapedString(values[i], out); + if (is_raw) + writeString(values[i], out); + else + writeEscapedString(values[i], out); if (i + 1 == values.size()) writeRowEndDelimiter(); else @@ -95,6 +98,8 @@ void registerOutputFormatTabSeparated(FormatFactory & factory) registerWithNamesAndTypes(is_raw ? "TSVRaw" : "TSV", register_func); registerWithNamesAndTypes(is_raw ? 
"TabSeparatedRaw" : "TabSeparated", register_func); + if (is_raw) + registerWithNamesAndTypes("LineAsString", register_func); } } diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index fccf6eb10df..06d6ba06bcc 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include namespace DB @@ -12,13 +11,19 @@ namespace DB namespace ErrorCodes { -extern const int ATTEMPT_TO_READ_AFTER_EOF; -extern const int CANNOT_READ_ALL_DATA; -extern const int CANNOT_PARSE_ESCAPE_SEQUENCE; -extern const int CANNOT_PARSE_QUOTED_STRING; -extern const int SYNTAX_ERROR; + extern const int ATTEMPT_TO_READ_AFTER_EOF; + extern const int CANNOT_READ_ALL_DATA; + extern const int CANNOT_PARSE_ESCAPE_SEQUENCE; + extern const int CANNOT_PARSE_QUOTED_STRING; + extern const int SYNTAX_ERROR; } +[[noreturn]] static void throwUnexpectedEof(size_t row_num) +{ + throw ParsingException("Unexpected EOF while parsing row " + std::to_string(row_num) + ". " + "Maybe last row has wrong format or input doesn't contain specified suffix before EOF.", + ErrorCodes::CANNOT_READ_ALL_DATA); +} TemplateRowInputFormat::TemplateRowInputFormat( const Block & header_, @@ -41,37 +46,13 @@ TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, std::uniqu : RowInputFormatWithDiagnosticInfo(header_, *buf_, params_), buf(std::move(buf_)), data_types(header_.getDataTypes()), settings(std::move(settings_)), ignore_spaces(ignore_spaces_), format(std::move(format_)), row_format(std::move(row_format_)), - default_csv_delimiter(settings.csv.delimiter), row_between_delimiter(std::move(row_between_delimiter_)) + default_csv_delimiter(settings.csv.delimiter), row_between_delimiter(row_between_delimiter_), + format_reader(std::make_unique(*buf, ignore_spaces_, format, row_format, row_between_delimiter, settings)) { - /// Validate format string for result set - bool has_data = false; - for (size_t i = 0; i < format.columnsCount(); ++i) - { - if (format.format_idx_to_column_idx[i]) - { - if (*format.format_idx_to_column_idx[i] != 0) - format.throwInvalidFormat("Invalid input part", i); - if (has_data) - format.throwInvalidFormat("${data} can occur only once", i); - if (format.escaping_rules[i] != EscapingRule::None) - format.throwInvalidFormat("${data} must have empty or None deserialization type", i); - has_data = true; - format_data_idx = i; - } - else - { - if (format.escaping_rules[i] == EscapingRule::XML) - format.throwInvalidFormat("XML deserialization is not supported", i); - } - } - /// Validate format string for rows std::vector column_in_format(header_.columns(), false); for (size_t i = 0; i < row_format.columnsCount(); ++i) { - if (row_format.escaping_rules[i] == EscapingRule::XML) - row_format.throwInvalidFormat("XML deserialization is not supported", i); - if (row_format.format_idx_to_column_idx[i]) { if (header_.columns() <= *row_format.format_idx_to_column_idx[i]) @@ -94,69 +75,7 @@ TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, std::uniqu void TemplateRowInputFormat::readPrefix() { - size_t last_successfully_parsed_idx = 0; - try - { - tryReadPrefixOrSuffix(last_successfully_parsed_idx, format_data_idx); - } - catch (Exception & e) - { - format.throwInvalidFormat(e.message() + " While parsing prefix", last_successfully_parsed_idx); - } -} - -/// Asserts delimiters and skips fields in prefix or suffix. 
-/// tryReadPrefixOrSuffix(...) is used in checkForSuffix() to avoid throwing an exception after read of each row -/// (most likely false will be returned on first call of checkString(...)) -template -ReturnType TemplateRowInputFormat::tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end) -{ - static constexpr bool throw_exception = std::is_same_v; - - skipSpaces(); - if constexpr (throw_exception) - assertString(format.delimiters[input_part_beg], *buf); - else - { - if (likely(!checkString(format.delimiters[input_part_beg], *buf))) - return ReturnType(false); - } - - while (input_part_beg < input_part_end) - { - skipSpaces(); - if constexpr (throw_exception) - skipField(format.escaping_rules[input_part_beg]); - else - { - try - { - skipField(format.escaping_rules[input_part_beg]); - } - catch (const Exception & e) - { - if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF && - e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE && - e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING) - throw; - /// If it's parsing error, then suffix is not found - return ReturnType(false); - } - } - ++input_part_beg; - - skipSpaces(); - if constexpr (throw_exception) - assertString(format.delimiters[input_part_beg], *buf); - else - { - if (likely(!checkString(format.delimiters[input_part_beg], *buf))) - return ReturnType(false); - } - } - - if constexpr (!throw_exception) - return ReturnType(true); + format_reader->readPrefix(); } bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & extra) @@ -165,9 +84,7 @@ bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension if (unlikely(end_of_stream)) return false; - skipSpaces(); - - if (unlikely(checkForSuffix())) + if (unlikely(format_reader->checkForSuffix())) { end_of_stream = true; return false; @@ -176,27 +93,24 @@ bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension updateDiagnosticInfo(); if (likely(row_num != 1)) - assertString(row_between_delimiter, *buf); + format_reader->skipRowBetweenDelimiter(); extra.read_columns.assign(columns.size(), false); for (size_t i = 0; i < row_format.columnsCount(); ++i) { - skipSpaces(); - assertString(row_format.delimiters[i], *buf); - skipSpaces(); + format_reader->skipDelimiter(i); + if (row_format.format_idx_to_column_idx[i]) { size_t col_idx = *row_format.format_idx_to_column_idx[i]; extra.read_columns[col_idx] = deserializeField(data_types[col_idx], serializations[col_idx], *columns[col_idx], i); } else - skipField(row_format.escaping_rules[i]); - + format_reader->skipField(row_format.escaping_rules[i]); } - skipSpaces(); - assertString(row_format.delimiters.back(), *buf); + format_reader->skipRowEndDelimiter(); for (const auto & idx : always_default_columns) data_types[idx]->insertDefaultInto(*columns[idx]); @@ -219,65 +133,21 @@ bool TemplateRowInputFormat::deserializeField(const DataTypePtr & type, catch (Exception & e) { if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) - throwUnexpectedEof(); + throwUnexpectedEof(row_num); throw; } } -void TemplateRowInputFormat::skipField(TemplateRowInputFormat::EscapingRule escaping_rule) -{ - try - { - skipFieldByEscapingRule(*buf, escaping_rule, settings); - } - catch (Exception & e) - { - if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) - throwUnexpectedEof(); - throw; - } -} - -/// Returns true if all rows have been read i.e. there are only suffix and spaces (if ignore_spaces == true) before EOF. 
-/// Otherwise returns false -bool TemplateRowInputFormat::checkForSuffix() -{ - PeekableReadBufferCheckpoint checkpoint{*buf}; - bool suffix_found = false; - size_t last_successfully_parsed_idx = format_data_idx + 1; - try - { - suffix_found = tryReadPrefixOrSuffix(last_successfully_parsed_idx, format.columnsCount()); - } - catch (const Exception & e) - { - if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF && - e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE && - e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING) - throw; - } - - if (unlikely(suffix_found)) - { - skipSpaces(); - if (buf->eof()) - return true; - } - - buf->rollbackToCheckpoint(); - return false; -} - bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) { out << "Suffix does not match: "; - size_t last_successfully_parsed_idx = format_data_idx + 1; + size_t last_successfully_parsed_idx = format_reader->getFormatDataIdx() + 1; const ReadBuffer::Position row_begin_pos = buf->position(); bool caught = false; try { PeekableReadBufferCheckpoint checkpoint{*buf, true}; - tryReadPrefixOrSuffix(last_successfully_parsed_idx, format.columnsCount()); + format_reader->tryReadPrefixOrSuffix(last_successfully_parsed_idx, format.columnsCount()); } catch (Exception & e) { @@ -309,7 +179,7 @@ bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & col if (!parseDelimiterWithDiagnosticInfo(out, *buf, row_format.delimiters[i], "delimiter before field " + std::to_string(i), ignore_spaces)) return false; - skipSpaces(); + format_reader->skipSpaces(); if (row_format.format_idx_to_column_idx[i]) { const auto & header = getPort().getHeader(); @@ -364,7 +234,7 @@ void TemplateRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColu if (index) deserializeField(type, serializations[*index], column, file_column); else - skipField(row_format.escaping_rules[file_column]); + format_reader->skipField(row_format.escaping_rules[file_column]); } bool TemplateRowInputFormat::isGarbageAfterField(size_t, ReadBuffer::Position) @@ -387,13 +257,6 @@ void TemplateRowInputFormat::syncAfterError() /// It will cause another parsing error. } -void TemplateRowInputFormat::throwUnexpectedEof() -{ - throw ParsingException("Unexpected EOF while parsing row " + std::to_string(row_num) + ". 
" - "Maybe last row has wrong format or input doesn't contain specified suffix before EOF.", - ErrorCodes::CANNOT_READ_ALL_DATA); -} - void TemplateRowInputFormat::resetParser() { RowInputFormatWithDiagnosticInfo::resetParser(); @@ -407,6 +270,268 @@ void TemplateRowInputFormat::setReadBuffer(ReadBuffer & in_) IInputFormat::setReadBuffer(*buf); } +TemplateFormatReader::TemplateFormatReader( + PeekableReadBuffer & buf_, + bool ignore_spaces_, + const ParsedTemplateFormatString & format_, + const ParsedTemplateFormatString & row_format_, + std::string row_between_delimiter_, + const FormatSettings & format_settings_) + : buf(&buf_) + , ignore_spaces(ignore_spaces_) + , format(format_) + , row_format(row_format_) + , row_between_delimiter(row_between_delimiter_) + , format_settings(format_settings_) +{ + /// Validate format string for result set + bool has_data = false; + for (size_t i = 0; i < format.columnsCount(); ++i) + { + if (format.format_idx_to_column_idx[i]) + { + if (*format.format_idx_to_column_idx[i] != 0) + format.throwInvalidFormat("Invalid input part", i); + if (has_data) + format.throwInvalidFormat("${data} can occur only once", i); + if (format.escaping_rules[i] != EscapingRule::None) + format.throwInvalidFormat("${data} must have empty or None deserialization type", i); + has_data = true; + format_data_idx = i; + } + else + { + if (format.escaping_rules[i] == EscapingRule::XML) + format.throwInvalidFormat("XML deserialization is not supported", i); + } + } + + /// Validate format string for rows + for (size_t i = 0; i < row_format.columnsCount(); ++i) + { + if (row_format.escaping_rules[i] == EscapingRule::XML) + row_format.throwInvalidFormat("XML deserialization is not supported", i); + } +} + +void TemplateFormatReader::readPrefix() +{ + size_t last_successfully_parsed_idx = 0; + try + { + tryReadPrefixOrSuffix(last_successfully_parsed_idx, format_data_idx); + } + catch (Exception & e) + { + format.throwInvalidFormat(e.message() + " While parsing prefix", last_successfully_parsed_idx); + } +} + +void TemplateFormatReader::skipField(EscapingRule escaping_rule) +{ + try + { + skipFieldByEscapingRule(*buf, escaping_rule, format_settings); + } + catch (Exception & e) + { + if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) + throwUnexpectedEof(row_num); + throw; + } +} + +/// Asserts delimiters and skips fields in prefix or suffix. +/// tryReadPrefixOrSuffix(...) 
is used in checkForSuffix() to avoid throwing an exception after read of each row +/// (most likely false will be returned on first call of checkString(...)) +template +ReturnType TemplateFormatReader::tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end) +{ + static constexpr bool throw_exception = std::is_same_v; + + skipSpaces(); + if constexpr (throw_exception) + assertString(format.delimiters[input_part_beg], *buf); + else + { + if (likely(!checkString(format.delimiters[input_part_beg], *buf))) + return ReturnType(false); + } + + while (input_part_beg < input_part_end) + { + skipSpaces(); + if constexpr (throw_exception) + skipField(format.escaping_rules[input_part_beg]); + else + { + try + { + skipField(format.escaping_rules[input_part_beg]); + } + catch (const Exception & e) + { + if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF && + e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE && + e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING) + throw; + /// If it's parsing error, then suffix is not found + return ReturnType(false); + } + } + ++input_part_beg; + + skipSpaces(); + if constexpr (throw_exception) + assertString(format.delimiters[input_part_beg], *buf); + else + { + if (likely(!checkString(format.delimiters[input_part_beg], *buf))) + return ReturnType(false); + } + } + + if constexpr (!throw_exception) + return ReturnType(true); +} + +/// Returns true if all rows have been read i.e. there are only suffix and spaces (if ignore_spaces == true) before EOF. +/// Otherwise returns false +bool TemplateFormatReader::checkForSuffix() +{ + PeekableReadBufferCheckpoint checkpoint{*buf}; + bool suffix_found = false; + size_t last_successfully_parsed_idx = format_data_idx + 1; + try + { + suffix_found = tryReadPrefixOrSuffix(last_successfully_parsed_idx, format.columnsCount()); + } + catch (const Exception & e) + { + if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF && + e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE && + e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING) + throw; + } + + if (unlikely(suffix_found)) + { + skipSpaces(); + if (buf->eof()) + return true; + } + + buf->rollbackToCheckpoint(); + return false; +} + +void TemplateFormatReader::skipDelimiter(size_t index) +{ + skipSpaces(); + assertString(row_format.delimiters[index], *buf); + skipSpaces(); +} + +void TemplateFormatReader::skipRowEndDelimiter() +{ + ++row_num; + skipSpaces(); + assertString(row_format.delimiters.back(), *buf); + skipSpaces(); +} + +void TemplateFormatReader::skipRowBetweenDelimiter() +{ + skipSpaces(); + assertString(row_between_delimiter, *buf); + skipSpaces(); +} + +TemplateSchemaReader::TemplateSchemaReader( + ReadBuffer & in_, + bool ignore_spaces_, + const ParsedTemplateFormatString & format_, + const ParsedTemplateFormatString & row_format_, + std::string row_between_delimiter, + const FormatSettings & format_settings_, + ContextPtr context_) + : IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference) + , buf(in_) + , format(format_) + , row_format(row_format_) + , format_settings(format_settings_) + , context(context_) + , format_reader(buf, ignore_spaces_, format, row_format, row_between_delimiter, format_settings) +{ + setColumnNames(row_format.column_names); +} + +DataTypes TemplateSchemaReader::readRowAndGetDataTypes() +{ + if (first_row) + format_reader.readPrefix(); + + if (format_reader.checkForSuffix()) + return {}; + + if (first_row) + first_row = false; + else + format_reader.skipRowBetweenDelimiter(); + + DataTypes 
data_types; + data_types.reserve(row_format.columnsCount()); + String field; + for (size_t i = 0; i != row_format.columnsCount(); ++i) + { + format_reader.skipDelimiter(i); + if (row_format.escaping_rules[i] == FormatSettings::EscapingRule::CSV) + format_settings.csv.delimiter = row_format.delimiters[i + 1].empty() ? format_settings.csv.delimiter : row_format.delimiters[i + 1].front(); + + field = readFieldByEscapingRule(buf, row_format.escaping_rules[i], format_settings); + data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, row_format.escaping_rules[i], context)); + } + + format_reader.skipRowEndDelimiter(); + return data_types; +} + +static ParsedTemplateFormatString fillResultSetFormat(const FormatSettings & settings) +{ + ParsedTemplateFormatString resultset_format; + if (settings.template_settings.resultset_format.empty()) + { + /// Default format string: "${data}" + resultset_format.delimiters.resize(2); + resultset_format.escaping_rules.emplace_back(ParsedTemplateFormatString::EscapingRule::None); + resultset_format.format_idx_to_column_idx.emplace_back(0); + resultset_format.column_names.emplace_back("data"); + } + else + { + /// Read format string from file + resultset_format = ParsedTemplateFormatString( + FormatSchemaInfo(settings.template_settings.resultset_format, "Template", false, + settings.schema.is_server, settings.schema.format_schema_path), + [&](const String & partName) -> std::optional + { + if (partName == "data") + return 0; + throw Exception("Unknown input part " + partName, + ErrorCodes::SYNTAX_ERROR); + }); + } + return resultset_format; +} + +static ParsedTemplateFormatString fillRowFormat(const FormatSettings & settings, ParsedTemplateFormatString::ColumnIdxGetter idx_getter, bool allow_indexes) +{ + return ParsedTemplateFormatString( + FormatSchemaInfo( + settings.template_settings.row_format, "Template", false, settings.schema.is_server, settings.schema.format_schema_path), + idx_getter, allow_indexes); +} + void registerInputFormatTemplate(FormatFactory & factory) { for (bool ignore_spaces : {false, true}) @@ -417,39 +542,34 @@ void registerInputFormatTemplate(FormatFactory & factory) IRowInputFormat::Params params, const FormatSettings & settings) { - ParsedTemplateFormatString resultset_format; - if (settings.template_settings.resultset_format.empty()) + auto idx_getter = [&](const String & colName) -> std::optional { - /// Default format string: "${data}" - resultset_format.delimiters.resize(2); - resultset_format.escaping_rules.emplace_back(ParsedTemplateFormatString::EscapingRule::None); - resultset_format.format_idx_to_column_idx.emplace_back(0); - resultset_format.column_names.emplace_back("data"); - } - else - { - /// Read format string from file - resultset_format = ParsedTemplateFormatString( - FormatSchemaInfo(settings.template_settings.resultset_format, "Template", false, - settings.schema.is_server, settings.schema.format_schema_path), - [&](const String & partName) -> std::optional - { - if (partName == "data") - return 0; - throw Exception("Unknown input part " + partName, - ErrorCodes::SYNTAX_ERROR); - }); - } + return sample.getPositionByName(colName); + }; - ParsedTemplateFormatString row_format = ParsedTemplateFormatString( - FormatSchemaInfo(settings.template_settings.row_format, "Template", false, - settings.schema.is_server, settings.schema.format_schema_path), - [&](const String & colName) -> std::optional - { - return sample.getPositionByName(colName); - }); + return std::make_shared( + sample, + buf, + 
params, + settings, + ignore_spaces, + fillResultSetFormat(settings), + fillRowFormat(settings, idx_getter, true), + settings.template_settings.row_between_delimiter); + }); + } +} - return std::make_shared(sample, buf, params, settings, ignore_spaces, resultset_format, row_format, settings.template_settings.row_between_delimiter); +void registerTemplateSchemaReader(FormatFactory & factory) +{ + for (bool ignore_spaces : {false, true}) + { + factory.registerSchemaReader(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", [ignore_spaces](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + { + size_t index = 0; + auto idx_getter = [&](const String &) -> std::optional { return index++; }; + auto row_format = fillRowFormat(settings, idx_getter, false); + return std::make_shared(buf, ignore_spaces, fillResultSetFormat(settings), row_format, settings.template_settings.row_between_delimiter, settings, context); }); } } diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.h b/src/Processors/Formats/Impl/TemplateRowInputFormat.h index 61cd97413bf..755ad6cb39b 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.h +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.h @@ -2,15 +2,19 @@ #include #include +#include #include #include #include #include +#include namespace DB { +class TemplateFormatReader; + class TemplateRowInputFormat : public RowInputFormatWithDiagnosticInfo { using EscapingRule = FormatSettings::EscapingRule; @@ -40,14 +44,6 @@ private: bool deserializeField(const DataTypePtr & type, const SerializationPtr & serialization, IColumn & column, size_t file_column); - void skipField(EscapingRule escaping_rule); - inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf); } - - template - ReturnType tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end); - bool checkForSuffix(); - [[noreturn]] void throwUnexpectedEof(); - bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override; @@ -63,12 +59,76 @@ private: const ParsedTemplateFormatString format; const ParsedTemplateFormatString row_format; - size_t format_data_idx; bool end_of_stream = false; std::vector always_default_columns; const char default_csv_delimiter; const std::string row_between_delimiter; + + std::unique_ptr format_reader; +}; + +class TemplateFormatReader +{ + using EscapingRule = FormatSettings::EscapingRule; + +public: + TemplateFormatReader( + PeekableReadBuffer & buf_, + bool ignore_spaces_, + const ParsedTemplateFormatString & format_, + const ParsedTemplateFormatString & row_format_, + std::string row_between_delimiter, + const FormatSettings & format_settings_); + + void readPrefix(); + void skipField(EscapingRule escaping_rule); + inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf); } + + template + ReturnType tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end); + bool checkForSuffix(); + + void setReadBuffer(PeekableReadBuffer & buf_) { buf = &buf_; } + + void skipDelimiter(size_t index); + void skipRowEndDelimiter(); + void skipRowBetweenDelimiter(); + + size_t getFormatDataIdx() const { return format_data_idx; } + +private: + PeekableReadBuffer * buf; + bool ignore_spaces; + const ParsedTemplateFormatString & format; + const ParsedTemplateFormatString & row_format; + const std::string row_between_delimiter; + const FormatSettings & format_settings; + size_t 
format_data_idx; + size_t row_num; +}; + +class TemplateSchemaReader : public IRowSchemaReader +{ +public: + TemplateSchemaReader(ReadBuffer & in_, + bool ignore_spaces_, + const ParsedTemplateFormatString & format_, + const ParsedTemplateFormatString & row_format_, + std::string row_between_delimiter, + const FormatSettings & format_settings_, + ContextPtr context_); + + DataTypes readRowAndGetDataTypes() override; + +private: + PeekableReadBuffer buf; + const ParsedTemplateFormatString format; + const ParsedTemplateFormatString row_format; + FormatSettings format_settings; + ContextPtr context; + TemplateFormatReader format_reader; + bool first_row = true; }; bool parseDelimiterWithDiagnosticInfo(WriteBuffer & out, ReadBuffer & buf, const String & delimiter, const String & description, bool skip_spaces); diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp index adf6d2e8a25..b58be3f5526 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -15,6 +16,7 @@ #include #include +#include namespace DB { @@ -286,6 +288,50 @@ namespace } } +/// Can be used in fileSegmentationEngine for parallel parsing of Values +static bool skipToNextRow(PeekableReadBuffer * buf, size_t min_chunk_bytes, int balance) +{ + skipWhitespaceIfAny(*buf); + if (buf->eof() || *buf->position() == ';') + return false; + bool quoted = false; + + size_t chunk_begin_buf_count = buf->count(); + while (!buf->eof() && (balance || buf->count() - chunk_begin_buf_count < min_chunk_bytes)) + { + buf->position() = find_first_symbols<'\\', '\'', ')', '('>(buf->position(), buf->buffer().end()); + if (buf->position() == buf->buffer().end()) + continue; + if (*buf->position() == '\\') + { + ++buf->position(); + if (!buf->eof()) + ++buf->position(); + } + else if (*buf->position() == '\'') + { + quoted ^= true; + ++buf->position(); + } + else if (*buf->position() == ')') + { + ++buf->position(); + if (!quoted) + --balance; + } + else if (*buf->position() == '(') + { + ++buf->position(); + if (!quoted) + ++balance; + } + } + + if (!buf->eof() && *buf->position() == ',') + ++buf->position(); + return true; +} + bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx) { const Block & header = getPort().getHeader(); @@ -293,7 +339,7 @@ bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx auto settings = context->getSettingsRef(); /// We need continuous memory containing the expression to use Lexer - skipToNextRow(0, 1); + skipToNextRow(buf.get(), 0, 1); buf->makeContinuousMemoryFromCheckpointToPos(); buf->rollbackToCheckpoint(); @@ -437,50 +483,6 @@ bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx return true; } -/// Can be used in fileSegmentationEngine for parallel parsing of Values -bool ValuesBlockInputFormat::skipToNextRow(size_t min_chunk_bytes, int balance) -{ - skipWhitespaceIfAny(*buf); - if (buf->eof() || *buf->position() == ';') - return false; - bool quoted = false; - - size_t chunk_begin_buf_count = buf->count(); - while (!buf->eof() && (balance || buf->count() - chunk_begin_buf_count < min_chunk_bytes)) - { - buf->position() = find_first_symbols<'\\', '\'', ')', '('>(buf->position(), buf->buffer().end()); - if (buf->position() == buf->buffer().end()) - continue; - if (*buf->position() == '\\') - { - ++buf->position(); - 
if (!buf->eof()) - ++buf->position(); - } - else if (*buf->position() == '\'') - { - quoted ^= true; - ++buf->position(); - } - else if (*buf->position() == ')') - { - ++buf->position(); - if (!quoted) - --balance; - } - else if (*buf->position() == '(') - { - ++buf->position(); - if (!quoted) - ++balance; - } - } - - if (!buf->eof() && *buf->position() == ',') - ++buf->position(); - return true; -} - void ValuesBlockInputFormat::assertDelimiterAfterValue(size_t column_idx) { if (unlikely(!checkDelimiterAfterValue(column_idx))) @@ -559,6 +561,63 @@ void ValuesBlockInputFormat::setReadBuffer(ReadBuffer & in_) IInputFormat::setReadBuffer(*buf); } +ValuesSchemaReader::ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, ContextPtr context_) + : IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference), buf(in_), context(context_) +{ +} + +DataTypes ValuesSchemaReader::readRowAndGetDataTypes() +{ + if (first_row) + { + skipBOMIfExists(buf); + first_row = false; + } + + skipWhitespaceIfAny(buf); + if (buf.eof()) + return {}; + + assertChar('(', buf); + PeekableReadBufferCheckpoint checkpoint(buf); + skipToNextRow(&buf, 0, 1); + buf.makeContinuousMemoryFromCheckpointToPos(); + buf.rollbackToCheckpoint(); + + Tokens tokens(buf.position(), buf.buffer().end()); + IParser::Pos token_iterator(tokens, context->getSettingsRef().max_parser_depth); + + DataTypes data_types; + bool finish = false; + while (!finish) + { + Expected expected; + ASTPtr ast; + + bool parsed = parser.parse(token_iterator, ast, expected); + /// Consider delimiter after value (',' or ')') as part of expression + parsed &= token_iterator->type == TokenType::Comma || token_iterator->type == TokenType::ClosingRoundBracket; + + if (!parsed) + throw Exception(ErrorCodes::SYNTAX_ERROR, "Cannot parse expression here: {}, token: {}", + String(buf.position(), std::min(SHOW_CHARS_ON_SYNTAX_ERROR, buf.buffer().end() - buf.position())), String(token_iterator.get().begin, token_iterator.get().end)); + + std::pair result = evaluateConstantExpression(ast, context); + data_types.push_back(generalizeDataType(result.second)); + + if (token_iterator->type == TokenType::ClosingRoundBracket) + finish = true; + ++token_iterator; + buf.position() = const_cast(token_iterator->begin); + } + + skipWhitespaceIfAny(buf); + if (!buf.eof() && *buf.position() == ',') + ++buf.position(); + + return data_types; +} + void registerInputFormatValues(FormatFactory & factory) { factory.registerInputFormat("Values", []( @@ -571,4 +630,12 @@ void registerInputFormatValues(FormatFactory & factory) }); } +void registerValuesSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("Values", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + { + return std::make_shared(buf, settings, context); + }); +} + } diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h index 5bbd4bea5ba..e1521955472 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h @@ -7,6 +7,7 @@ #include #include #include +#include #include namespace DB @@ -68,8 +69,6 @@ private: void readPrefix(); void readSuffix(); - bool skipToNextRow(size_t min_chunk_bytes = 0, int balance = 0); - std::unique_ptr buf; const RowInputFormatParams params; @@ -95,4 +94,18 @@ private: BlockMissingValues block_missing_values; }; +class ValuesSchemaReader : public IRowSchemaReader +{ +public: + 
ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, ContextPtr context_); + +private: + DataTypes readRowAndGetDataTypes() override; + + PeekableReadBuffer buf; + ContextPtr context; + ParserExpression parser; + bool first_row = true; +}; + } diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp index 87fa5ec1c4a..7720b01dc74 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -1,5 +1,7 @@ #include +#include #include +#include #include #include @@ -9,6 +11,7 @@ namespace DB namespace ErrorCodes { extern const int INCORRECT_DATA; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } RowInputFormatWithNamesAndTypes::RowInputFormatWithNamesAndTypes( @@ -17,8 +20,13 @@ RowInputFormatWithNamesAndTypes::RowInputFormatWithNamesAndTypes( const Params & params_, bool with_names_, bool with_types_, - const FormatSettings & format_settings_) - : RowInputFormatWithDiagnosticInfo(header_, in_, params_), format_settings(format_settings_), with_names(with_names_), with_types(with_types_) + const FormatSettings & format_settings_, + std::unique_ptr format_reader_) + : RowInputFormatWithDiagnosticInfo(header_, in_, params_) + , format_settings(format_settings_) + , with_names(with_names_) + , with_types(with_types_) + , format_reader(std::move(format_reader_)) { const auto & sample = getPort().getHeader(); size_t num_columns = sample.columns(); @@ -88,7 +96,7 @@ void RowInputFormatWithNamesAndTypes::readPrefix() } /// Skip prefix before names and types. - skipPrefixBeforeHeader(); + format_reader->skipPrefixBeforeHeader(); /// This is a bit of abstraction leakage, but we need it in parallel parsing: /// we check if this InputFormat is working with the "real" beginning of the data. @@ -97,7 +105,7 @@ void RowInputFormatWithNamesAndTypes::readPrefix() if (format_settings.with_names_use_header) { std::vector read_columns(data_types.size(), false); - auto column_names = readNames(); + auto column_names = format_reader->readNames(); for (const auto & name : column_names) addInputColumn(name, read_columns); @@ -110,7 +118,7 @@ void RowInputFormatWithNamesAndTypes::readPrefix() else { setupAllColumnsByTableSchema(); - skipNames(); + format_reader->skipNames(); } } else if (!column_mapping->is_set) @@ -119,10 +127,10 @@ void RowInputFormatWithNamesAndTypes::readPrefix() if (with_types) { /// Skip delimiter between names and types. 
- skipRowBetweenDelimiter(); + format_reader->skipRowBetweenDelimiter(); if (format_settings.with_types_use_header) { - auto types = readTypes(); + auto types = format_reader->readTypes(); if (types.size() != column_mapping->column_indexes_for_input_fields.size()) throw Exception( ErrorCodes::INCORRECT_DATA, @@ -143,7 +151,7 @@ void RowInputFormatWithNamesAndTypes::readPrefix() } } else - skipTypes(); + format_reader->skipTypes(); } } @@ -161,7 +169,7 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE if (unlikely(end_of_stream)) return false; - if (unlikely(checkForSuffix())) + if (unlikely(format_reader->checkForSuffix())) { end_of_stream = true; return false; @@ -170,9 +178,9 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE updateDiagnosticInfo(); if (likely(row_num != 1 || (getCurrentUnitNumber() == 0 && (with_names || with_types)))) - skipRowBetweenDelimiter(); + format_reader->skipRowBetweenDelimiter(); - skipRowStartDelimiter(); + format_reader->skipRowStartDelimiter(); ext.read_columns.resize(data_types.size()); for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) @@ -180,20 +188,20 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE const auto & column_index = column_mapping->column_indexes_for_input_fields[file_column]; const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); if (column_index) - ext.read_columns[*column_index] = readField( + ext.read_columns[*column_index] = format_reader->readField( *columns[*column_index], data_types[*column_index], serializations[*column_index], is_last_file_column, column_mapping->names_of_columns[file_column]); else - skipField(file_column); + format_reader->skipField(file_column); if (!is_last_file_column) - skipFieldDelimiter(); + format_reader->skipFieldDelimiter(); } - skipRowEndDelimiter(); + format_reader->skipRowEndDelimiter(); insertDefaultsForNotSeenColumns(columns, ext); @@ -218,13 +226,13 @@ void RowInputFormatWithNamesAndTypes::tryDeserializeField(const DataTypePtr & ty const auto & index = column_mapping->column_indexes_for_input_fields[file_column]; if (index) { - checkNullValueForNonNullable(type); + format_reader->checkNullValueForNonNullable(type); const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); - readField(column, type, serializations[*index], is_last_file_column, column_mapping->names_of_columns[file_column]); + format_reader->readField(column, type, serializations[*index], is_last_file_column, column_mapping->names_of_columns[file_column]); } else { - skipField(file_column); + format_reader->skipField(file_column); } } @@ -236,13 +244,13 @@ bool RowInputFormatWithNamesAndTypes::parseRowAndPrintDiagnosticInfo(MutableColu return false; } - if (!tryParseSuffixWithDiagnosticInfo(out)) + if (!format_reader->tryParseSuffixWithDiagnosticInfo(out)) return false; - if (likely(row_num != 1) && !parseRowBetweenDelimiterWithDiagnosticInfo(out)) + if (likely(row_num != 1) && !format_reader->parseRowBetweenDelimiterWithDiagnosticInfo(out)) return false; - if (!parseRowStartWithDiagnosticInfo(out)) + if (!format_reader->parseRowStartWithDiagnosticInfo(out)) return false; for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) @@ -266,22 +274,68 @@ bool 
RowInputFormatWithNamesAndTypes::parseRowAndPrintDiagnosticInfo(MutableColu /// Delimiters if (file_column + 1 != column_mapping->column_indexes_for_input_fields.size()) { - if (!parseFieldDelimiterWithDiagnosticInfo(out)) + if (!format_reader->parseFieldDelimiterWithDiagnosticInfo(out)) return false; } } - return parseRowEndWithDiagnosticInfo(out); + return format_reader->parseRowEndWithDiagnosticInfo(out); } - -void registerFileSegmentationEngineForFormatWithNamesAndTypes( - FormatFactory & factory, const String & base_format_name, FormatFactory::FileSegmentationEngine segmentation_engine) +bool RowInputFormatWithNamesAndTypes::isGarbageAfterField(size_t index, ReadBuffer::Position pos) { - factory.registerFileSegmentationEngine(base_format_name, segmentation_engine); - factory.registerFileSegmentationEngine(base_format_name + "WithNames", segmentation_engine); - factory.registerFileSegmentationEngine(base_format_name + "WithNamesAndTypes", segmentation_engine); + return format_reader->isGarbageAfterField(index, pos); } +void RowInputFormatWithNamesAndTypes::setReadBuffer(ReadBuffer & in_) +{ + format_reader->setReadBuffer(in_); + IInputFormat::setReadBuffer(in_); +} + +FormatWithNamesAndTypesSchemaReader::FormatWithNamesAndTypesSchemaReader( + ReadBuffer & in_, + size_t max_rows_to_read_, + bool with_names_, + bool with_types_, + FormatWithNamesAndTypesReader * format_reader_, + DataTypePtr default_type_) + : IRowSchemaReader(in_, max_rows_to_read_, default_type_), with_names(with_names_), with_types(with_types_), format_reader(format_reader_) +{ +} + +NamesAndTypesList FormatWithNamesAndTypesSchemaReader::readSchema() +{ + if (with_names || with_types) + skipBOMIfExists(in); + + format_reader->skipPrefixBeforeHeader(); + + Names names; + if (with_names) + names = format_reader->readNames(); + + if (with_types) + { + format_reader->skipRowBetweenDelimiter(); + std::vector data_type_names = format_reader->readTypes(); + if (data_type_names.size() != names.size()) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The number of column names {} differs with the number of types {}", names.size(), data_type_names.size()); + + NamesAndTypesList result; + for (size_t i = 0; i != data_type_names.size(); ++i) + result.emplace_back(names[i], DataTypeFactory::instance().get(data_type_names[i])); + return result; + } + + if (!names.empty()) + setColumnNames(names); + + /// We should determine types by reading rows with data. Use the implementation from IRowSchemaReader. + return IRowSchemaReader::readSchema(); +} } + diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h index cd7cd34d7e6..25ffc8d6de2 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h @@ -1,12 +1,15 @@ #pragma once #include +#include #include #include namespace DB { +class FormatWithNamesAndTypesReader; + /// Base class for input formats with -WithNames and -WithNamesAndTypes suffixes. /// It accepts 2 parameters in constructor - with_names and with_types and implements /// input format depending on them: @@ -20,7 +23,7 @@ namespace DB /// then reads/skips types. So you can this invariant. 
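The refactor above routes all per-format parsing through a format_reader object, and the header changes that follow extract those hooks (readNames, readTypes, the skip* methods) into a separate FormatWithNamesAndTypesReader, so the same code serves both row parsing and schema inference. A minimal sketch of that delegation pattern; the classes below are mock stand-ins, not the real CSV/TSV readers.

    // Standalone sketch: one reader object shared by the input format and the schema reader.
    #include <iostream>
    #include <memory>
    #include <string>
    #include <vector>

    class FormatReader
    {
    public:
        virtual ~FormatReader() = default;
        virtual std::vector<std::string> readNames() = 0;
        virtual std::vector<std::string> readTypes() = 0;
        virtual void skipRowBetweenDelimiter() {}
    };

    class MockCSVReader : public FormatReader
    {
    public:
        std::vector<std::string> readNames() override { return {"id", "name"}; }
        std::vector<std::string> readTypes() override { return {"UInt64", "String"}; }
    };

    class InputFormat
    {
    public:
        explicit InputFormat(std::unique_ptr<FormatReader> reader_) : reader(std::move(reader_)) {}
        void readPrefix() { names = reader->readNames(); }   // delegates instead of overriding a virtual
        std::vector<std::string> names;
    private:
        std::unique_ptr<FormatReader> reader;
    };

    class SchemaReader
    {
    public:
        explicit SchemaReader(FormatReader * reader_) : reader(reader_) {}
        void readSchema()
        {
            auto names = reader->readNames();
            reader->skipRowBetweenDelimiter();
            auto types = reader->readTypes();
            for (size_t i = 0; i < names.size(); ++i)
                std::cout << names[i] << " : " << types[i] << '\n';
        }
    private:
        FormatReader * reader;
    };

    int main()
    {
        MockCSVReader reader;
        SchemaReader schema_reader(&reader);     // schema inference reuses the same parsing code
        schema_reader.readSchema();

        InputFormat format(std::make_unique<MockCSVReader>());
        format.readPrefix();
        std::cout << "columns read by the input format: " << format.names.size() << '\n';
    }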
class RowInputFormatWithNamesAndTypes : public RowInputFormatWithDiagnosticInfo { -public: +protected: /** with_names - in the first line the header with column names * with_types - in the second line the header with column names */ @@ -28,44 +31,14 @@ public: const Block & header_, ReadBuffer & in_, const Params & params_, - bool with_names_, bool with_types_, const FormatSettings & format_settings_); + bool with_names_, + bool with_types_, + const FormatSettings & format_settings_, + std::unique_ptr format_reader_); void resetParser() override; - -protected: - /// Read single field from input. Return false if there was no real value and we inserted default value. - virtual bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) = 0; - - /// Skip single field, it's used to skip unknown columns. - virtual void skipField(size_t file_column) = 0; - /// Skip the whole row with names. - virtual void skipNames() = 0; - /// Skip the whole row with types. - virtual void skipTypes() = 0; - - /// Skip delimiters, if any. - virtual void skipPrefixBeforeHeader() {} - virtual void skipRowStartDelimiter() {} - virtual void skipFieldDelimiter() {} - virtual void skipRowEndDelimiter() {} - virtual void skipRowBetweenDelimiter() {} - - /// Check suffix. - virtual bool checkForSuffix() { return in->eof(); } - - /// Methods for parsing with diagnostic info. - virtual void checkNullValueForNonNullable(DataTypePtr) {} - virtual bool parseRowStartWithDiagnosticInfo(WriteBuffer &) { return true; } - virtual bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer &) { return true; } - virtual bool parseRowEndWithDiagnosticInfo(WriteBuffer &) { return true;} - virtual bool parseRowBetweenDelimiterWithDiagnosticInfo(WriteBuffer &) { return true;} - virtual bool tryParseSuffixWithDiagnosticInfo(WriteBuffer &) { return true; } - bool isGarbageAfterField(size_t, ReadBuffer::Position) override {return false; } - - /// Read row with names and return the list of them. - virtual std::vector readNames() = 0; - /// Read row with types and return the list of them. - virtual std::vector readTypes() = 0; + bool isGarbageAfterField(size_t index, ReadBuffer::Position pos) override; + void setReadBuffer(ReadBuffer & in_) override; const FormatSettings format_settings; DataTypes data_types; @@ -84,10 +57,90 @@ private: bool with_names; bool with_types; + std::unique_ptr format_reader; std::unordered_map column_indexes_by_names; }; -void registerFileSegmentationEngineForFormatWithNamesAndTypes( - FormatFactory & factory, const String & base_format_name, FormatFactory::FileSegmentationEngine segmentation_engine); +/// Base class for parsing data in input formats with -WithNames and -WithNamesAndTypes suffixes. +/// Used for reading/skipping names/types/delimiters in specific format. +class FormatWithNamesAndTypesReader +{ +public: + explicit FormatWithNamesAndTypesReader(ReadBuffer & in_, const FormatSettings & format_settings_) : in(&in_), format_settings(format_settings_) {} + + /// Read single field from input. Return false if there was no real value and we inserted default value. + virtual bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) = 0; + + /// Methods for parsing with diagnostic info. 
+ virtual void checkNullValueForNonNullable(DataTypePtr) {} + virtual bool parseRowStartWithDiagnosticInfo(WriteBuffer &) { return true; } + virtual bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer &) { return true; } + virtual bool parseRowEndWithDiagnosticInfo(WriteBuffer &) { return true;} + virtual bool parseRowBetweenDelimiterWithDiagnosticInfo(WriteBuffer &) { return true;} + virtual bool tryParseSuffixWithDiagnosticInfo(WriteBuffer &) { return true; } + virtual bool isGarbageAfterField(size_t, ReadBuffer::Position) { return false; } + + /// Read row with names and return the list of them. + virtual std::vector readNames() = 0; + /// Read row with types and return the list of them. + virtual std::vector readTypes() = 0; + + /// Skip single field, it's used to skip unknown columns. + virtual void skipField(size_t file_column) = 0; + /// Skip the whole row with names. + virtual void skipNames() = 0; + /// Skip the whole row with types. + virtual void skipTypes() = 0; + + /// Skip delimiters, if any. + virtual void skipPrefixBeforeHeader() {} + virtual void skipRowStartDelimiter() {} + virtual void skipFieldDelimiter() {} + virtual void skipRowEndDelimiter() {} + virtual void skipRowBetweenDelimiter() {} + + /// Check suffix. + virtual bool checkForSuffix() { return in->eof(); } + + const FormatSettings & getFormatSettings() const { return format_settings; } + + virtual void setReadBuffer(ReadBuffer & in_) { in = &in_; } + + virtual ~FormatWithNamesAndTypesReader() = default; + +protected: + ReadBuffer * in; + const FormatSettings format_settings; +}; + +/// Base class for schema inference for formats with -WithNames and -WithNamesAndTypes suffixes. +/// For formats with -WithNamesAndTypes suffix the schema will be determined by first two rows. +/// For formats with -WithNames suffix the names of columns will be determined by the first row +/// and types of columns by the rows with data. +/// For formats without suffixes default column names will be used +/// and types will be determined by the rows with data. 
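The comment above describes where names and types come from for the -WithNames and -WithNamesAndTypes variants. A tiny standalone sketch of that decision logic; the helper functions and the default column names are stand-ins for the real readNames/readTypes calls and data-driven inference.

    // Standalone sketch of the with_names / with_types branching described above.
    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    using Columns = std::vector<std::pair<std::string, std::string>>;

    static std::vector<std::string> headerNames() { return {"id", "value"}; }        // first header row
    static std::vector<std::string> headerTypes() { return {"UInt64", "Float64"}; }  // second header row
    static std::string inferFromData(size_t) { return "String"; }                    // fallback inference

    static Columns readSchema(bool with_names, bool with_types)
    {
        std::vector<std::string> names = with_names ? headerNames()
                                                    : std::vector<std::string>{"c1", "c2"};  // default names
        Columns result;
        for (size_t i = 0; i < names.size(); ++i)
        {
            // Types come from the second header row only for *WithNamesAndTypes formats;
            // otherwise they have to be inferred from the data rows.
            std::string type = with_types ? headerTypes()[i] : inferFromData(i);
            result.emplace_back(names[i], type);
        }
        return result;
    }

    int main()
    {
        const std::vector<std::pair<bool, bool>> modes = {{false, false}, {true, false}, {true, true}};
        for (auto [with_names, with_types] : modes)
            for (const auto & [name, type] : readSchema(with_names, with_types))
                std::cout << with_names << with_types << ": " << name << " " << type << '\n';
    }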
+class FormatWithNamesAndTypesSchemaReader : public IRowSchemaReader +{ +public: + FormatWithNamesAndTypesSchemaReader( + ReadBuffer & in, + size_t max_rows_to_read_, + bool with_names_, + bool with_types_, + FormatWithNamesAndTypesReader * format_reader_, + DataTypePtr default_type_ = nullptr); + + NamesAndTypesList readSchema() override; + +protected: + virtual DataTypes readRowAndGetDataTypes() override = 0; + + bool with_names; + bool with_types; + +private: + FormatWithNamesAndTypesReader * format_reader; +}; } + diff --git a/src/Processors/QueryPlan/BuildQueryPipelineSettings.cpp b/src/Processors/QueryPlan/BuildQueryPipelineSettings.cpp index 2480673d65e..fb3ed7f80fc 100644 --- a/src/Processors/QueryPlan/BuildQueryPipelineSettings.cpp +++ b/src/Processors/QueryPlan/BuildQueryPipelineSettings.cpp @@ -6,16 +6,13 @@ namespace DB { -BuildQueryPipelineSettings BuildQueryPipelineSettings::fromSettings(const Settings & from) +BuildQueryPipelineSettings BuildQueryPipelineSettings::fromContext(ContextPtr from) { BuildQueryPipelineSettings settings; - settings.actions_settings = ExpressionActionsSettings::fromSettings(from, CompileExpressions::yes); + settings.actions_settings = ExpressionActionsSettings::fromSettings(from->getSettingsRef(), CompileExpressions::yes); + settings.process_list_element = from->getProcessListElement(); + settings.progress_callback = from->getProgressCallback(); return settings; } -BuildQueryPipelineSettings BuildQueryPipelineSettings::fromContext(ContextPtr from) -{ - return fromSettings(from->getSettingsRef()); -} - } diff --git a/src/Processors/QueryPlan/BuildQueryPipelineSettings.h b/src/Processors/QueryPlan/BuildQueryPipelineSettings.h index c3282d43778..fadbd061fbd 100644 --- a/src/Processors/QueryPlan/BuildQueryPipelineSettings.h +++ b/src/Processors/QueryPlan/BuildQueryPipelineSettings.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -8,14 +9,15 @@ namespace DB { struct Settings; +class QueryStatus; struct BuildQueryPipelineSettings { ExpressionActionsSettings actions_settings; + QueryStatus * process_list_element = nullptr; + ProgressCallback progress_callback = nullptr; const ExpressionActionsSettings & getActionsSettings() const { return actions_settings; } - - static BuildQueryPipelineSettings fromSettings(const Settings & from); static BuildQueryPipelineSettings fromContext(ContextPtr from); }; diff --git a/src/Processors/QueryPlan/QueryPlan.cpp b/src/Processors/QueryPlan/QueryPlan.cpp index f319e562bfb..a271ef78dfa 100644 --- a/src/Processors/QueryPlan/QueryPlan.cpp +++ b/src/Processors/QueryPlan/QueryPlan.cpp @@ -180,6 +180,9 @@ QueryPipelineBuilderPtr QueryPlan::buildQueryPipeline( for (auto & context : interpreter_context) last_pipeline->addInterpreterContext(std::move(context)); + last_pipeline->setProgressCallback(build_pipeline_settings.progress_callback); + last_pipeline->setProcessListElement(build_pipeline_settings.process_list_element); + return last_pipeline; } diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index 826ef084d87..0f56e4ab33f 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -289,8 +289,6 @@ void ReadFromRemote::initializePipeline(QueryPipelineBuilder & pipeline, const B { for (const auto & shard : shards) { - auto coordinator = std::make_shared(); - if (shard.lazy) addLazyPipe(pipes, shard, /*coordinator=*/nullptr, /*pool*/{}, /*replica_info*/std::nullopt); else diff --git 
a/src/Processors/Sources/ShellCommandSource.cpp b/src/Processors/Sources/ShellCommandSource.cpp new file mode 100644 index 00000000000..dc272ace01e --- /dev/null +++ b/src/Processors/Sources/ShellCommandSource.cpp @@ -0,0 +1,586 @@ +#include + +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int UNSUPPORTED_METHOD; + extern const int TIMEOUT_EXCEEDED; + extern const int CANNOT_FCNTL; + extern const int CANNOT_READ_FROM_FILE_DESCRIPTOR; + extern const int CANNOT_POLL; + extern const int CANNOT_WRITE_TO_FILE_DESCRIPTOR; +} + +static bool tryMakeFdNonBlocking(int fd) +{ + int flags = fcntl(fd, F_GETFL, 0); + if (-1 == flags) + return false; + if (-1 == fcntl(fd, F_SETFL, flags | O_NONBLOCK)) + return false; + + return true; +} + +static void makeFdNonBlocking(int fd) +{ + bool result = tryMakeFdNonBlocking(fd); + if (!result) + throwFromErrno("Cannot set non-blocking mode of pipe", ErrorCodes::CANNOT_FCNTL); +} + +static bool tryMakeFdBlocking(int fd) +{ + int flags = fcntl(fd, F_GETFL, 0); + if (-1 == flags) + return false; + + if (-1 == fcntl(fd, F_SETFL, flags & (~O_NONBLOCK))) + return false; + + return true; +} + +static void makeFdBlocking(int fd) +{ + bool result = tryMakeFdBlocking(fd); + if (!result) + throwFromErrno("Cannot set blocking mode of pipe", ErrorCodes::CANNOT_FCNTL); +} + +static bool pollFd(int fd, size_t timeout_milliseconds, int events) +{ + pollfd pfd; + pfd.fd = fd; + pfd.events = events; + pfd.revents = 0; + + Stopwatch watch; + + int res; + + while (true) + { + res = poll(&pfd, 1, timeout_milliseconds); + + if (res < 0) + { + if (errno == EINTR) + { + watch.stop(); + timeout_milliseconds -= watch.elapsedMilliseconds(); + watch.start(); + + continue; + } + else + { + throwFromErrno("Cannot poll", ErrorCodes::CANNOT_POLL); + } + } + else + { + break; + } + } + + return res > 0; +} + +class TimeoutReadBufferFromFileDescriptor : public BufferWithOwnMemory +{ +public: + explicit TimeoutReadBufferFromFileDescriptor(int fd_, size_t timeout_milliseconds_) + : fd(fd_) + , timeout_milliseconds(timeout_milliseconds_) + { + makeFdNonBlocking(fd); + } + + bool nextImpl() override + { + size_t bytes_read = 0; + + while (!bytes_read) + { + if (!pollFd(fd, timeout_milliseconds, POLLIN)) + throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Pipe read timeout exceeded {} milliseconds", timeout_milliseconds); + + ssize_t res = ::read(fd, internal_buffer.begin(), internal_buffer.size()); + + if (-1 == res && errno != EINTR) + throwFromErrno("Cannot read from pipe ", ErrorCodes::CANNOT_READ_FROM_FILE_DESCRIPTOR); + + if (res == 0) + break; + + if (res > 0) + bytes_read += res; + } + + if (bytes_read > 0) + { + working_buffer = internal_buffer; + working_buffer.resize(bytes_read); + } + else + { + return false; + } + + return true; + } + + void reset() const + { + makeFdBlocking(fd); + } + + ~TimeoutReadBufferFromFileDescriptor() override + { + tryMakeFdBlocking(fd); + } + +private: + int fd; + size_t timeout_milliseconds; +}; + +class TimeoutWriteBufferFromFileDescriptor : public BufferWithOwnMemory +{ +public: + explicit TimeoutWriteBufferFromFileDescriptor(int fd_, size_t timeout_milliseconds_) + : fd(fd_) + , timeout_milliseconds(timeout_milliseconds_) + { + makeFdNonBlocking(fd); + } + + void nextImpl() override + { + if (!offset()) + return; + + size_t bytes_written = 0; + + while (bytes_written != offset()) + { + if (!pollFd(fd, timeout_milliseconds, POLLOUT)) + throw 
Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Pipe write timeout exceeded {} milliseconds", timeout_milliseconds); + + ssize_t res = ::write(fd, working_buffer.begin() + bytes_written, offset() - bytes_written); + + if ((-1 == res || 0 == res) && errno != EINTR) + throwFromErrno("Cannot write into pipe ", ErrorCodes::CANNOT_WRITE_TO_FILE_DESCRIPTOR); + + if (res > 0) + bytes_written += res; + } + } + + void reset() const + { + makeFdBlocking(fd); + } + + ~TimeoutWriteBufferFromFileDescriptor() override + { + tryMakeFdBlocking(fd); + } + +private: + int fd; + size_t timeout_milliseconds; +}; + +class ShellCommandHolder +{ +public: + using ShellCommandBuilderFunc = std::function()>; + + explicit ShellCommandHolder(ShellCommandBuilderFunc && func_) + : func(std::move(func_)) + {} + + std::unique_ptr buildCommand() + { + if (returned_command) + return std::move(returned_command); + + return func(); + } + + void returnCommand(std::unique_ptr command) + { + returned_command = std::move(command); + } + +private: + std::unique_ptr returned_command; + ShellCommandBuilderFunc func; +}; + +namespace +{ + /** A stream, that get child process and sends data using tasks in background threads. + * For each send data task background thread is created. Send data task must send data to process input pipes. + * ShellCommandPoolSource receives data from process stdout. + * + * If process_pool is passed in constructor then after source is destroyed process is returned to pool. + */ + class ShellCommandSource final : public SourceWithProgress + { + public: + + using SendDataTask = std::function; + + ShellCommandSource( + ContextPtr context_, + const std::string & format_, + size_t command_read_timeout_milliseconds, + const Block & sample_block_, + std::unique_ptr && command_, + std::vector && send_data_tasks = {}, + const ShellCommandSourceConfiguration & configuration_ = {}, + std::unique_ptr && command_holder_ = nullptr, + std::shared_ptr process_pool_ = nullptr) + : SourceWithProgress(sample_block_) + , context(context_) + , format(format_) + , sample_block(sample_block_) + , command(std::move(command_)) + , configuration(configuration_) + , timeout_command_out(command->out.getFD(), command_read_timeout_milliseconds) + , command_holder(std::move(command_holder_)) + , process_pool(process_pool_) + { + for (auto && send_data_task : send_data_tasks) + { + send_data_threads.emplace_back([task = std::move(send_data_task), this]() + { + try + { + task(); + } + catch (...) + { + std::lock_guard lock(send_data_lock); + exception_during_send_data = std::current_exception(); + } + }); + } + + size_t max_block_size = configuration.max_block_size; + + if (configuration.read_fixed_number_of_rows) + { + /** Currently parallel parsing input format cannot read exactly max_block_size rows from input, + * so it will be blocked on ReadBufferFromFileDescriptor because this file descriptor represent pipe that does not have eof. 
+ */ + auto context_for_reading = Context::createCopy(context); + context_for_reading->setSetting("input_format_parallel_parsing", false); + context = context_for_reading; + + if (configuration.read_number_of_rows_from_process_output) + { + /// Initialize executor in generate + return; + } + + max_block_size = configuration.number_of_rows_to_read; + } + + pipeline = QueryPipeline(Pipe(context->getInputFormat(format, timeout_command_out, sample_block, max_block_size))); + executor = std::make_unique(pipeline); + } + + ~ShellCommandSource() override + { + for (auto & thread : send_data_threads) + if (thread.joinable()) + thread.join(); + + if (command_is_invalid) + command = nullptr; + + if (command_holder && process_pool) + { + bool valid_command = configuration.read_fixed_number_of_rows && current_read_rows >= configuration.number_of_rows_to_read; + + if (command && valid_command) + command_holder->returnCommand(std::move(command)); + + process_pool->returnObject(std::move(command_holder)); + } + } + + protected: + + Chunk generate() override + { + rethrowExceptionDuringSendDataIfNeeded(); + + Chunk chunk; + + try + { + if (configuration.read_fixed_number_of_rows) + { + if (!executor && configuration.read_number_of_rows_from_process_output) + { + readText(configuration.number_of_rows_to_read, timeout_command_out); + char dummy; + readChar(dummy, timeout_command_out); + + size_t max_block_size = configuration.number_of_rows_to_read; + pipeline = QueryPipeline(Pipe(context->getInputFormat(format, timeout_command_out, sample_block, max_block_size))); + executor = std::make_unique(pipeline); + } + + if (current_read_rows >= configuration.number_of_rows_to_read) + return {}; + } + + if (!executor->pull(chunk)) + return {}; + + current_read_rows += chunk.getNumRows(); + } + catch (...) 
+ { + command_is_invalid = true; + throw; + } + + return chunk; + } + + Status prepare() override + { + auto status = SourceWithProgress::prepare(); + + if (status == Status::Finished) + { + for (auto & thread : send_data_threads) + if (thread.joinable()) + thread.join(); + + rethrowExceptionDuringSendDataIfNeeded(); + } + + return status; + } + + String getName() const override { return "ShellCommandSource"; } + + private: + + void rethrowExceptionDuringSendDataIfNeeded() + { + std::lock_guard lock(send_data_lock); + if (exception_during_send_data) + { + command_is_invalid = true; + std::rethrow_exception(exception_during_send_data); + } + } + + ContextPtr context; + std::string format; + Block sample_block; + + std::unique_ptr command; + ShellCommandSourceConfiguration configuration; + + TimeoutReadBufferFromFileDescriptor timeout_command_out; + + size_t current_read_rows = 0; + + ShellCommandHolderPtr command_holder; + std::shared_ptr process_pool; + + QueryPipeline pipeline; + std::unique_ptr executor; + + std::vector send_data_threads; + + std::mutex send_data_lock; + std::exception_ptr exception_during_send_data; + + std::atomic command_is_invalid {false}; + }; + + class SendingChunkHeaderTransform final : public ISimpleTransform + { + public: + SendingChunkHeaderTransform(const Block & header, std::shared_ptr buffer_) + : ISimpleTransform(header, header, false) + , buffer(buffer_) + { + } + + String getName() const override { return "SendingChunkHeaderTransform"; } + + protected: + + void transform(Chunk & chunk) override + { + writeText(chunk.getNumRows(), *buffer); + writeChar('\n', *buffer); + } + + private: + std::shared_ptr buffer; + }; + +} + +ShellCommandSourceCoordinator::ShellCommandSourceCoordinator(const Configuration & configuration_) + : configuration(configuration_) +{ + if (configuration.is_executable_pool) + process_pool = std::make_shared(configuration.pool_size ? 
configuration.pool_size : std::numeric_limits::max()); +} + +Pipe ShellCommandSourceCoordinator::createPipe( + const std::string & command, + const std::vector & arguments, + std::vector && input_pipes, + Block sample_block, + ContextPtr context, + const ShellCommandSourceConfiguration & source_configuration) +{ + ShellCommand::Config command_config(command); + command_config.arguments = arguments; + for (size_t i = 1; i < input_pipes.size(); ++i) + command_config.write_fds.emplace_back(i + 2); + + std::unique_ptr process; + std::unique_ptr process_holder; + + auto destructor_strategy = ShellCommand::DestructorStrategy{true /*terminate_in_destructor*/, configuration.command_termination_timeout_seconds}; + command_config.terminate_in_destructor_strategy = destructor_strategy; + + bool is_executable_pool = (process_pool != nullptr); + if (is_executable_pool) + { + bool execute_direct = configuration.execute_direct; + + bool result = process_pool->tryBorrowObject( + process_holder, + [command_config, execute_direct]() + { + ShellCommandHolder::ShellCommandBuilderFunc func = [command_config, execute_direct]() mutable + { + if (execute_direct) + return ShellCommand::executeDirect(command_config); + else + return ShellCommand::execute(command_config); + }; + + return std::make_unique(std::move(func)); + }, + configuration.max_command_execution_time_seconds * 10000); + + if (!result) + throw Exception( + ErrorCodes::TIMEOUT_EXCEEDED, + "Could not get process from pool, max command execution timeout exceeded {} seconds", + configuration.max_command_execution_time_seconds); + + process = process_holder->buildCommand(); + } + else + { + if (configuration.execute_direct) + process = ShellCommand::executeDirect(command_config); + else + process = ShellCommand::execute(command_config); + } + + std::vector tasks; + tasks.reserve(input_pipes.size()); + + for (size_t i = 0; i < input_pipes.size(); ++i) + { + WriteBufferFromFile * write_buffer = nullptr; + + if (i == 0) + { + write_buffer = &process->in; + } + else + { + auto descriptor = i + 2; + auto it = process->write_fds.find(descriptor); + if (it == process->write_fds.end()) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Process does not contain descriptor to write {}", descriptor); + + write_buffer = &it->second; + } + + int write_buffer_fd = write_buffer->getFD(); + auto timeout_write_buffer = std::make_shared(write_buffer_fd, configuration.command_write_timeout_milliseconds); + + input_pipes[i].resize(1); + + if (configuration.send_chunk_header) + { + auto transform = std::make_shared(input_pipes[i].getHeader(), timeout_write_buffer); + input_pipes[i].addTransform(std::move(transform)); + } + + auto pipeline = std::make_shared(std::move(input_pipes[i])); + auto out = context->getOutputFormat(configuration.format, *timeout_write_buffer, materializeBlock(pipeline->getHeader())); + out->setAutoFlush(); + pipeline->complete(std::move(out)); + + ShellCommandSource::SendDataTask task = [pipeline, timeout_write_buffer, write_buffer, is_executable_pool]() + { + CompletedPipelineExecutor executor(*pipeline); + executor.execute(); + + if (!is_executable_pool) + { + timeout_write_buffer->next(); + timeout_write_buffer->reset(); + + write_buffer->close(); + } + }; + + tasks.emplace_back(std::move(task)); + } + + auto source = std::make_unique( + context, + configuration.format, + configuration.command_read_timeout_milliseconds, + std::move(sample_block), + std::move(process), + std::move(tasks), + source_configuration, + std::move(process_holder), + 
process_pool); + auto pipe = Pipe(std::move(source)); + + return pipe; +} + +} diff --git a/src/Processors/Sources/ShellCommandSource.h b/src/Processors/Sources/ShellCommandSource.h index 4974c33f290..649c713afcb 100644 --- a/src/Processors/Sources/ShellCommandSource.h +++ b/src/Processors/Sources/ShellCommandSource.h @@ -19,14 +19,10 @@ namespace DB { -/** A stream, that get child process and sends data using tasks in background threads. - * For each send data task background thread is created. Send data task must send data to process input pipes. - * ShellCommandPoolSource receives data from process stdout. - * - * If process_pool is passed in constructor then after source is destroyed process is returned to pool. - */ +class ShellCommandHolder; +using ShellCommandHolderPtr = std::unique_ptr; -using ProcessPool = BorrowedObjectPool>; +using ProcessPool = BorrowedObjectPool; struct ShellCommandSourceConfiguration { @@ -37,148 +33,92 @@ struct ShellCommandSourceConfiguration /// Valid only if read_fixed_number_of_rows = true size_t number_of_rows_to_read = 0; /// Max block size - size_t max_block_size = DBMS_DEFAULT_BUFFER_SIZE; + size_t max_block_size = DEFAULT_BLOCK_SIZE; }; -class ShellCommandSource final : public SourceWithProgress +class ShellCommandSourceCoordinator { public: - using SendDataTask = std::function; + struct Configuration + { - ShellCommandSource( + /// Script output format + std::string format; + + /// Command termination timeout in seconds + size_t command_termination_timeout_seconds = 10; + + /// Timeout for reading data from command stdout + size_t command_read_timeout_milliseconds = 10000; + + /// Timeout for writing data to command stdin + size_t command_write_timeout_milliseconds = 10000; + + /// Pool size valid only if executable_pool = true + size_t pool_size = 16; + + /// Max command execution time in milliseconds. Valid only if executable_pool = true + size_t max_command_execution_time_seconds = 10; + + /// Should pool of processes be created. + bool is_executable_pool = false; + + /// Send number_of_rows\n before sending chunk to process. + bool send_chunk_header = false; + + /// Execute script direct or with /bin/bash. + bool execute_direct = true; + + }; + + explicit ShellCommandSourceCoordinator(const Configuration & configuration_); + + const Configuration & getConfiguration() const + { + return configuration; + } + + Pipe createPipe( + const std::string & command, + const std::vector & arguments, + std::vector && input_pipes, + Block sample_block, ContextPtr context, - const std::string & format, - const Block & sample_block, - std::unique_ptr && command_, - std::vector && send_data_tasks = {}, - const ShellCommandSourceConfiguration & configuration_ = {}, - std::shared_ptr process_pool_ = nullptr) - : SourceWithProgress(sample_block) - , command(std::move(command_)) - , configuration(configuration_) - , process_pool(process_pool_) + const ShellCommandSourceConfiguration & source_configuration = {}); + + Pipe createPipe( + const std::string & command, + std::vector && input_pipes, + Block sample_block, + ContextPtr context, + const ShellCommandSourceConfiguration & source_configuration = {}) { - for (auto && send_data_task : send_data_tasks) - { - send_data_threads.emplace_back([task = std::move(send_data_task), this]() - { - try - { - task(); - } - catch (...) 
- { - std::lock_guard lock(send_data_lock); - exception_during_send_data = std::current_exception(); - } - }); - } - - size_t max_block_size = configuration.max_block_size; - - if (configuration.read_fixed_number_of_rows) - { - /** Currently parallel parsing input format cannot read exactly max_block_size rows from input, - * so it will be blocked on ReadBufferFromFileDescriptor because this file descriptor represent pipe that does not have eof. - */ - auto context_for_reading = Context::createCopy(context); - context_for_reading->setSetting("input_format_parallel_parsing", false); - context = context_for_reading; - - if (configuration.read_number_of_rows_from_process_output) - { - readText(configuration.number_of_rows_to_read, command->out); - char dummy; - readChar(dummy, command->out); - } - - max_block_size = configuration.number_of_rows_to_read; - } - - pipeline = QueryPipeline(Pipe(context->getInputFormat(format, command->out, sample_block, max_block_size))); - executor = std::make_unique(pipeline); + return createPipe(command, {}, std::move(input_pipes), std::move(sample_block), std::move(context), source_configuration); } - ~ShellCommandSource() override + Pipe createPipe( + const std::string & command, + const std::vector & arguments, + Block sample_block, + ContextPtr context) { - for (auto & thread : send_data_threads) - if (thread.joinable()) - thread.join(); - - if (command && process_pool) - process_pool->returnObject(std::move(command)); + return createPipe(command, arguments, {}, std::move(sample_block), std::move(context), {}); } -protected: - - Chunk generate() override + Pipe createPipe( + const std::string & command, + Block sample_block, + ContextPtr context) { - rethrowExceptionDuringSendDataIfNeeded(); - - if (configuration.read_fixed_number_of_rows && configuration.number_of_rows_to_read == current_read_rows) - return {}; - - Chunk chunk; - - try - { - if (!executor->pull(chunk)) - return {}; - - current_read_rows += chunk.getNumRows(); - } - catch (...) 
- { - command = nullptr; - throw; - } - - return chunk; + return createPipe(command, {}, {}, std::move(sample_block), std::move(context), {}); } - Status prepare() override - { - auto status = SourceWithProgress::prepare(); - - if (status == Status::Finished) - { - for (auto & thread : send_data_threads) - if (thread.joinable()) - thread.join(); - - rethrowExceptionDuringSendDataIfNeeded(); - } - - return status; - } - - String getName() const override { return "ShellCommandSource"; } - private: - void rethrowExceptionDuringSendDataIfNeeded() - { - std::lock_guard lock(send_data_lock); - if (exception_during_send_data) - { - command = nullptr; - std::rethrow_exception(exception_during_send_data); - } - } + Configuration configuration; - std::unique_ptr command; - ShellCommandSourceConfiguration configuration; - - size_t current_read_rows = 0; - - std::shared_ptr process_pool; - - QueryPipeline pipeline; - std::unique_ptr executor; - - std::vector send_data_threads; - std::mutex send_data_lock; - std::exception_ptr exception_during_send_data; + std::shared_ptr process_pool = nullptr; }; + } diff --git a/src/Processors/Sources/SourceWithProgress.cpp b/src/Processors/Sources/SourceWithProgress.cpp index 9b7a5c6a762..60c39c919f6 100644 --- a/src/Processors/Sources/SourceWithProgress.cpp +++ b/src/Processors/Sources/SourceWithProgress.cpp @@ -26,6 +26,8 @@ SourceWithProgress::SourceWithProgress(Block header, bool enable_auto_progress) void SourceWithProgress::setProcessListElement(QueryStatus * elem) { process_list_elem = elem; + if (!elem) + return; /// Update total_rows_approx as soon as possible. /// diff --git a/src/Processors/Transforms/CountingTransform.cpp b/src/Processors/Transforms/CountingTransform.cpp index 79b6360f22e..eb191b36586 100644 --- a/src/Processors/Transforms/CountingTransform.cpp +++ b/src/Processors/Transforms/CountingTransform.cpp @@ -18,20 +18,21 @@ namespace DB void CountingTransform::onConsume(Chunk chunk) { - Progress local_progress(chunk.getNumRows(), chunk.bytes(), 0); + Progress local_progress{WriteProgress(chunk.getNumRows(), chunk.bytes())}; progress.incrementPiecewiseAtomically(local_progress); //std::cerr << "============ counting adding progress for " << static_cast(thread_status) << ' ' << chunk.getNumRows() << " rows\n"; if (thread_status) { - thread_status->performance_counters.increment(ProfileEvents::InsertedRows, local_progress.read_rows); - thread_status->performance_counters.increment(ProfileEvents::InsertedBytes, local_progress.read_bytes); + thread_status->performance_counters.increment(ProfileEvents::InsertedRows, local_progress.written_rows); + thread_status->performance_counters.increment(ProfileEvents::InsertedBytes, local_progress.written_bytes); + thread_status->progress_out.incrementPiecewiseAtomically(local_progress); } else { - ProfileEvents::increment(ProfileEvents::InsertedRows, local_progress.read_rows); - ProfileEvents::increment(ProfileEvents::InsertedBytes, local_progress.read_bytes); + ProfileEvents::increment(ProfileEvents::InsertedRows, local_progress.written_rows); + ProfileEvents::increment(ProfileEvents::InsertedBytes, local_progress.written_bytes); } if (process_elem) diff --git a/src/Processors/Transforms/PostgreSQLSource.cpp b/src/Processors/Transforms/PostgreSQLSource.cpp index ac8408d8338..88f092a2533 100644 --- a/src/Processors/Transforms/PostgreSQLSource.cpp +++ b/src/Processors/Transforms/PostgreSQLSource.cpp @@ -74,7 +74,17 @@ template void PostgreSQLSource::onStart() { if (!tx) - tx = 
std::make_shared(connection_holder->get()); + { + try + { + tx = std::make_shared(connection_holder->get()); + } + catch (const pqxx::broken_connection &) + { + connection_holder->update(); + tx = std::make_shared(connection_holder->get()); + } + } stream = std::make_unique(*tx, pqxx::from_query, std::string_view(query_str)); } diff --git a/src/Processors/Transforms/buildPushingToViewsChain.cpp b/src/Processors/Transforms/buildPushingToViewsChain.cpp index 82c2a337a45..17075e2b318 100644 --- a/src/Processors/Transforms/buildPushingToViewsChain.cpp +++ b/src/Processors/Transforms/buildPushingToViewsChain.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -14,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +25,12 @@ #include #include +namespace ProfileEvents +{ + extern const Event SelectedBytes; + extern const Event SelectedRows; +} + namespace DB { @@ -451,13 +459,6 @@ static QueryPipeline process(Block block, ViewRuntimeData & view, const ViewsDat pipeline.getHeader(), std::make_shared(std::move(converting)))); - pipeline.setProgressCallback([context](const Progress & progress) - { - CurrentThread::updateProgressIn(progress); - if (auto callback = context->getProgressCallback()) - callback(progress); - }); - return QueryPipelineBuilder::getPipeline(std::move(pipeline)); } @@ -595,7 +596,11 @@ void PushingToLiveViewSink::consume(Chunk chunk) { Progress local_progress(chunk.getNumRows(), chunk.bytes(), 0); StorageLiveView::writeIntoLiveView(live_view, getHeader().cloneWithColumns(chunk.detachColumns()), context); - CurrentThread::updateProgressIn(local_progress); + auto * process = context->getProcessListElement(); + if (process) + process->updateProgressIn(local_progress); + ProfileEvents::increment(ProfileEvents::SelectedRows, local_progress.read_rows); + ProfileEvents::increment(ProfileEvents::SelectedBytes, local_progress.read_bytes); } @@ -614,7 +619,11 @@ void PushingToWindowViewSink::consume(Chunk chunk) Progress local_progress(chunk.getNumRows(), chunk.bytes(), 0); StorageWindowView::writeIntoWindowView( window_view, getHeader().cloneWithColumns(chunk.detachColumns()), context); - CurrentThread::updateProgressIn(local_progress); + auto * process = context->getProcessListElement(); + if (process) + process->updateProgressIn(local_progress); + ProfileEvents::increment(ProfileEvents::SelectedRows, local_progress.read_rows); + ProfileEvents::increment(ProfileEvents::SelectedBytes, local_progress.read_bytes); } diff --git a/src/Processors/tests/gtest_exception_on_incorrect_pipeline.cpp b/src/Processors/tests/gtest_exception_on_incorrect_pipeline.cpp index df3901e2eb1..ee661b39fac 100644 --- a/src/Processors/tests/gtest_exception_on_incorrect_pipeline.cpp +++ b/src/Processors/tests/gtest_exception_on_incorrect_pipeline.cpp @@ -27,7 +27,8 @@ TEST(Processors, PortsConnected) processors.emplace_back(std::move(source)); processors.emplace_back(std::move(sink)); - PipelineExecutor executor(processors); + QueryStatus * element = nullptr; + PipelineExecutor executor(processors, element); executor.execute(1); } @@ -51,7 +52,8 @@ TEST(Processors, PortsNotConnected) try { - PipelineExecutor executor(processors); + QueryStatus * element = nullptr; + PipelineExecutor executor(processors, element); executor.execute(1); ASSERT_TRUE(false) << "Should have thrown."; } diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp index 40c64046560..dba7c7cb8f7 100644 --- 
a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -560,6 +560,7 @@ QueryPipeline QueryPipelineBuilder::getPipeline(QueryPipelineBuilder builder) { QueryPipeline res(std::move(builder.pipe)); res.setNumThreads(builder.getNumThreads()); + res.setProcessListElement(builder.process_list_element); return res; } diff --git a/src/Server/GRPCServer.h b/src/Server/GRPCServer.h index 25c3813c11d..e2b48f1c16b 100644 --- a/src/Server/GRPCServer.h +++ b/src/Server/GRPCServer.h @@ -4,6 +4,7 @@ #if USE_GRPC #include +#include #include "clickhouse_grpc.grpc.pb.h" namespace Poco { class Logger; } @@ -30,6 +31,9 @@ public: /// Stops the server. No new connections will be accepted. void stop(); + /// Returns the port this server is listening to. + UInt16 portNumber() const { return address_to_listen.port(); } + /// Returns the number of currently handled connections. size_t currentConnections() const; diff --git a/src/Server/HTTP/HTTPServer.cpp b/src/Server/HTTP/HTTPServer.cpp index 42e6467d0af..2e91fad1c0f 100644 --- a/src/Server/HTTP/HTTPServer.cpp +++ b/src/Server/HTTP/HTTPServer.cpp @@ -5,31 +5,13 @@ namespace DB { -HTTPServer::HTTPServer( - ContextPtr context, - HTTPRequestHandlerFactoryPtr factory_, - UInt16 port_number, - Poco::Net::HTTPServerParams::Ptr params) - : TCPServer(new HTTPServerConnectionFactory(context, params, factory_), port_number, params), factory(factory_) -{ -} - -HTTPServer::HTTPServer( - ContextPtr context, - HTTPRequestHandlerFactoryPtr factory_, - const Poco::Net::ServerSocket & socket, - Poco::Net::HTTPServerParams::Ptr params) - : TCPServer(new HTTPServerConnectionFactory(context, params, factory_), socket, params), factory(factory_) -{ -} - HTTPServer::HTTPServer( ContextPtr context, HTTPRequestHandlerFactoryPtr factory_, Poco::ThreadPool & thread_pool, - const Poco::Net::ServerSocket & socket, + Poco::Net::ServerSocket & socket_, Poco::Net::HTTPServerParams::Ptr params) - : TCPServer(new HTTPServerConnectionFactory(context, params, factory_), thread_pool, socket, params), factory(factory_) + : TCPServer(new HTTPServerConnectionFactory(context, params, factory_), thread_pool, socket_, params), factory(factory_) { } diff --git a/src/Server/HTTP/HTTPServer.h b/src/Server/HTTP/HTTPServer.h index 3518fd66d20..07ad54d267f 100644 --- a/src/Server/HTTP/HTTPServer.h +++ b/src/Server/HTTP/HTTPServer.h @@ -1,9 +1,9 @@ #pragma once #include +#include #include -#include #include @@ -13,26 +13,14 @@ namespace DB class Context; -class HTTPServer : public Poco::Net::TCPServer +class HTTPServer : public TCPServer { public: explicit HTTPServer( - ContextPtr context, - HTTPRequestHandlerFactoryPtr factory, - UInt16 port_number = 80, - Poco::Net::HTTPServerParams::Ptr params = new Poco::Net::HTTPServerParams); - - HTTPServer( - ContextPtr context, - HTTPRequestHandlerFactoryPtr factory, - const Poco::Net::ServerSocket & socket, - Poco::Net::HTTPServerParams::Ptr params); - - HTTPServer( ContextPtr context, HTTPRequestHandlerFactoryPtr factory, Poco::ThreadPool & thread_pool, - const Poco::Net::ServerSocket & socket, + Poco::Net::ServerSocket & socket, Poco::Net::HTTPServerParams::Ptr params); ~HTTPServer() override; diff --git a/src/Server/HTTP/HTTPServerConnection.cpp b/src/Server/HTTP/HTTPServerConnection.cpp index de81da20ead..7020b8e9a23 100644 --- a/src/Server/HTTP/HTTPServerConnection.cpp +++ b/src/Server/HTTP/HTTPServerConnection.cpp @@ -1,4 +1,5 @@ #include +#include #include @@ -7,10 +8,11 @@ namespace DB 
HTTPServerConnection::HTTPServerConnection( ContextPtr context_, + TCPServer & tcp_server_, const Poco::Net::StreamSocket & socket, Poco::Net::HTTPServerParams::Ptr params_, HTTPRequestHandlerFactoryPtr factory_) - : TCPServerConnection(socket), context(Context::createCopy(context_)), params(params_), factory(factory_), stopped(false) + : TCPServerConnection(socket), context(Context::createCopy(context_)), tcp_server(tcp_server_), params(params_), factory(factory_), stopped(false) { poco_check_ptr(factory); } @@ -20,12 +22,12 @@ void HTTPServerConnection::run() std::string server = params->getSoftwareVersion(); Poco::Net::HTTPServerSession session(socket(), params); - while (!stopped && session.hasMoreRequests()) + while (!stopped && tcp_server.isOpen() && session.hasMoreRequests()) { try { std::unique_lock lock(mutex); - if (!stopped) + if (!stopped && tcp_server.isOpen()) { HTTPServerResponse response(session); HTTPServerRequest request(context, response, session); @@ -48,6 +50,11 @@ void HTTPServerConnection::run() response.set("Server", server); try { + if (!tcp_server.isOpen()) + { + sendErrorResponse(session, Poco::Net::HTTPResponse::HTTP_SERVICE_UNAVAILABLE); + break; + } std::unique_ptr handler(factory->createRequestHandler(request)); if (handler) diff --git a/src/Server/HTTP/HTTPServerConnection.h b/src/Server/HTTP/HTTPServerConnection.h index 1c7ae6cd2b7..db3969f6ffb 100644 --- a/src/Server/HTTP/HTTPServerConnection.h +++ b/src/Server/HTTP/HTTPServerConnection.h @@ -9,12 +9,14 @@ namespace DB { +class TCPServer; class HTTPServerConnection : public Poco::Net::TCPServerConnection { public: HTTPServerConnection( ContextPtr context, + TCPServer & tcp_server, const Poco::Net::StreamSocket & socket, Poco::Net::HTTPServerParams::Ptr params, HTTPRequestHandlerFactoryPtr factory); @@ -26,6 +28,7 @@ protected: private: ContextPtr context; + TCPServer & tcp_server; Poco::Net::HTTPServerParams::Ptr params; HTTPRequestHandlerFactoryPtr factory; bool stopped; diff --git a/src/Server/HTTP/HTTPServerConnectionFactory.cpp b/src/Server/HTTP/HTTPServerConnectionFactory.cpp index 0e4fb6cfcec..008da222c79 100644 --- a/src/Server/HTTP/HTTPServerConnectionFactory.cpp +++ b/src/Server/HTTP/HTTPServerConnectionFactory.cpp @@ -11,9 +11,9 @@ HTTPServerConnectionFactory::HTTPServerConnectionFactory( poco_check_ptr(factory); } -Poco::Net::TCPServerConnection * HTTPServerConnectionFactory::createConnection(const Poco::Net::StreamSocket & socket) +Poco::Net::TCPServerConnection * HTTPServerConnectionFactory::createConnection(const Poco::Net::StreamSocket & socket, TCPServer & tcp_server) { - return new HTTPServerConnection(context, socket, params, factory); + return new HTTPServerConnection(context, tcp_server, socket, params, factory); } } diff --git a/src/Server/HTTP/HTTPServerConnectionFactory.h b/src/Server/HTTP/HTTPServerConnectionFactory.h index 3f11eca0f69..a19dc6d4d5c 100644 --- a/src/Server/HTTP/HTTPServerConnectionFactory.h +++ b/src/Server/HTTP/HTTPServerConnectionFactory.h @@ -2,19 +2,19 @@ #include #include +#include #include -#include namespace DB { -class HTTPServerConnectionFactory : public Poco::Net::TCPServerConnectionFactory +class HTTPServerConnectionFactory : public TCPServerConnectionFactory { public: HTTPServerConnectionFactory(ContextPtr context, Poco::Net::HTTPServerParams::Ptr params, HTTPRequestHandlerFactoryPtr factory); - Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket) override; + Poco::Net::TCPServerConnection * createConnection(const 
Poco::Net::StreamSocket & socket, TCPServer & tcp_server) override; private: ContextPtr context; diff --git a/src/Server/KeeperTCPHandlerFactory.h b/src/Server/KeeperTCPHandlerFactory.h index 67bb3dab268..58dc73d7c27 100644 --- a/src/Server/KeeperTCPHandlerFactory.h +++ b/src/Server/KeeperTCPHandlerFactory.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include #include #include @@ -10,7 +10,7 @@ namespace DB { -class KeeperTCPHandlerFactory : public Poco::Net::TCPServerConnectionFactory +class KeeperTCPHandlerFactory : public TCPServerConnectionFactory { private: IServer & server; @@ -29,7 +29,7 @@ public: { } - Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket) override + Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket, TCPServer &) override { try { diff --git a/src/Server/MySQLHandler.cpp b/src/Server/MySQLHandler.cpp index deebc073ad5..2836ee05c30 100644 --- a/src/Server/MySQLHandler.cpp +++ b/src/Server/MySQLHandler.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -62,10 +63,11 @@ static String showTableStatusReplacementQuery(const String & query); static String killConnectionIdReplacementQuery(const String & query); static String selectLimitReplacementQuery(const String & query); -MySQLHandler::MySQLHandler(IServer & server_, const Poco::Net::StreamSocket & socket_, +MySQLHandler::MySQLHandler(IServer & server_, TCPServer & tcp_server_, const Poco::Net::StreamSocket & socket_, bool ssl_enabled, size_t connection_id_) : Poco::Net::TCPServerConnection(socket_) , server(server_) + , tcp_server(tcp_server_) , log(&Poco::Logger::get("MySQLHandler")) , connection_id(connection_id_) , auth_plugin(new MySQLProtocol::Authentication::Native41()) @@ -138,11 +140,14 @@ void MySQLHandler::run() OKPacket ok_packet(0, handshake_response.capability_flags, 0, 0, 0); packet_endpoint->sendPacket(ok_packet, true); - while (true) + while (tcp_server.isOpen()) { packet_endpoint->resetSequenceId(); MySQLPacketPayloadReadBuffer payload = packet_endpoint->getPayload(); + while (!in->poll(1000000)) + if (!tcp_server.isOpen()) + return; char command = 0; payload.readStrict(command); @@ -152,6 +157,8 @@ void MySQLHandler::run() LOG_DEBUG(log, "Received command: {}. Connection id: {}.", static_cast(static_cast(command)), connection_id); + if (!tcp_server.isOpen()) + return; try { switch (command) @@ -369,8 +376,8 @@ void MySQLHandler::finishHandshakeSSL( } #if USE_SSL -MySQLHandlerSSL::MySQLHandlerSSL(IServer & server_, const Poco::Net::StreamSocket & socket_, bool ssl_enabled, size_t connection_id_, RSA & public_key_, RSA & private_key_) - : MySQLHandler(server_, socket_, ssl_enabled, connection_id_) +MySQLHandlerSSL::MySQLHandlerSSL(IServer & server_, TCPServer & tcp_server_, const Poco::Net::StreamSocket & socket_, bool ssl_enabled, size_t connection_id_, RSA & public_key_, RSA & private_key_) + : MySQLHandler(server_, tcp_server_, socket_, ssl_enabled, connection_id_) , public_key(public_key_) , private_key(private_key_) {} diff --git a/src/Server/MySQLHandler.h b/src/Server/MySQLHandler.h index 7ef212bf36e..3af5f7a0eb2 100644 --- a/src/Server/MySQLHandler.h +++ b/src/Server/MySQLHandler.h @@ -24,11 +24,14 @@ namespace CurrentMetrics namespace DB { +class ReadBufferFromPocoSocket; +class TCPServer; + /// Handler for MySQL wire protocol connections. Allows to connect to ClickHouse using MySQL client. 
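The MySQL and PostgreSQL handlers above now poll the socket with a short timeout instead of blocking indefinitely, re-checking whether the TCP server is still open so that stopping the server also terminates idle connections; ShellCommandSource applies the same poll-with-timeout idea to pipes. A standalone sketch of that wait loop; the atomic flag and the demo pipe are stand-ins for the real TCPServer::isOpen() check and the client socket.

    // Standalone sketch: wait for data with a bounded poll() so a shutdown flag can be re-checked.
    #include <atomic>
    #include <cerrno>
    #include <iostream>
    #include <poll.h>
    #include <unistd.h>

    /// Returns true if data is available, false if the server is shutting down or the fd failed.
    static bool waitForDataOrShutdown(int fd, const std::atomic<bool> & server_open)
    {
        while (server_open.load())
        {
            pollfd pfd{};
            pfd.fd = fd;
            pfd.events = POLLIN;
            int res = poll(&pfd, 1, /*timeout_ms*/ 1000);
            if (res < 0 && errno != EINTR)
                return false;                  // real error: give up on this connection
            if (res > 0 && (pfd.revents & POLLIN))
                return true;                   // data arrived
            // res == 0 (timeout) or EINTR: loop and re-check the shutdown flag
        }
        return false;
    }

    int main()
    {
        int fds[2];
        if (pipe(fds) != 0)
            return 1;
        std::atomic<bool> server_open{true};
        if (write(fds[1], "x", 1) != 1)
            return 1;
        std::cout << "readable: " << waitForDataOrShutdown(fds[0], server_open) << '\n';  // prints 1
        close(fds[0]);
        close(fds[1]);
    }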
class MySQLHandler : public Poco::Net::TCPServerConnection { public: - MySQLHandler(IServer & server_, const Poco::Net::StreamSocket & socket_, bool ssl_enabled, size_t connection_id_); + MySQLHandler(IServer & server_, TCPServer & tcp_server_, const Poco::Net::StreamSocket & socket_, bool ssl_enabled, size_t connection_id_); void run() final; @@ -52,6 +55,7 @@ protected: virtual void finishHandshakeSSL(size_t packet_size, char * buf, size_t pos, std::function read_bytes, MySQLProtocol::ConnectionPhase::HandshakeResponse & packet); IServer & server; + TCPServer & tcp_server; Poco::Logger * log; UInt64 connection_id = 0; @@ -68,7 +72,7 @@ protected: Replacements replacements; std::unique_ptr auth_plugin; - std::shared_ptr in; + std::shared_ptr in; std::shared_ptr out; bool secure_connection = false; }; @@ -77,7 +81,7 @@ protected: class MySQLHandlerSSL : public MySQLHandler { public: - MySQLHandlerSSL(IServer & server_, const Poco::Net::StreamSocket & socket_, bool ssl_enabled, size_t connection_id_, RSA & public_key_, RSA & private_key_); + MySQLHandlerSSL(IServer & server_, TCPServer & tcp_server_, const Poco::Net::StreamSocket & socket_, bool ssl_enabled, size_t connection_id_, RSA & public_key_, RSA & private_key_); private: void authPluginSSL() override; diff --git a/src/Server/MySQLHandlerFactory.cpp b/src/Server/MySQLHandlerFactory.cpp index 7a0bfd8ab09..f7bb073e275 100644 --- a/src/Server/MySQLHandlerFactory.cpp +++ b/src/Server/MySQLHandlerFactory.cpp @@ -118,14 +118,14 @@ void MySQLHandlerFactory::generateRSAKeys() } #endif -Poco::Net::TCPServerConnection * MySQLHandlerFactory::createConnection(const Poco::Net::StreamSocket & socket) +Poco::Net::TCPServerConnection * MySQLHandlerFactory::createConnection(const Poco::Net::StreamSocket & socket, TCPServer & tcp_server) { size_t connection_id = last_connection_id++; LOG_TRACE(log, "MySQL connection. Id: {}. 
Address: {}", connection_id, socket.peerAddress().toString()); #if USE_SSL - return new MySQLHandlerSSL(server, socket, ssl_enabled, connection_id, *public_key, *private_key); + return new MySQLHandlerSSL(server, tcp_server, socket, ssl_enabled, connection_id, *public_key, *private_key); #else - return new MySQLHandler(server, socket, ssl_enabled, connection_id); + return new MySQLHandler(server, tcp_server, socket, ssl_enabled, connection_id); #endif } diff --git a/src/Server/MySQLHandlerFactory.h b/src/Server/MySQLHandlerFactory.h index 106fdfdf341..25f1af85273 100644 --- a/src/Server/MySQLHandlerFactory.h +++ b/src/Server/MySQLHandlerFactory.h @@ -1,9 +1,9 @@ #pragma once -#include #include #include #include +#include #include @@ -13,8 +13,9 @@ namespace DB { +class TCPServer; -class MySQLHandlerFactory : public Poco::Net::TCPServerConnectionFactory +class MySQLHandlerFactory : public TCPServerConnectionFactory { private: IServer & server; @@ -43,7 +44,7 @@ public: void generateRSAKeys(); - Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket) override; + Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket, TCPServer & tcp_server) override; }; } diff --git a/src/Server/PostgreSQLHandler.cpp b/src/Server/PostgreSQLHandler.cpp index fee4ace3452..9808b538280 100644 --- a/src/Server/PostgreSQLHandler.cpp +++ b/src/Server/PostgreSQLHandler.cpp @@ -6,6 +6,7 @@ #include #include "PostgreSQLHandler.h" #include +#include #include #include #include @@ -28,11 +29,13 @@ namespace ErrorCodes PostgreSQLHandler::PostgreSQLHandler( const Poco::Net::StreamSocket & socket_, IServer & server_, + TCPServer & tcp_server_, bool ssl_enabled_, Int32 connection_id_, std::vector> & auth_methods_) : Poco::Net::TCPServerConnection(socket_) , server(server_) + , tcp_server(tcp_server_) , ssl_enabled(ssl_enabled_) , connection_id(connection_id_) , authentication_manager(auth_methods_) @@ -60,11 +63,18 @@ void PostgreSQLHandler::run() if (!startup()) return; - while (true) + while (tcp_server.isOpen()) { message_transport->send(PostgreSQLProtocol::Messaging::ReadyForQuery(), true); + + constexpr size_t connection_check_timeout = 1; // 1 second + while (!in->poll(1000000 * connection_check_timeout)) + if (!tcp_server.isOpen()) + return; PostgreSQLProtocol::Messaging::FrontMessageType message_type = message_transport->receiveMessageType(); + if (!tcp_server.isOpen()) + return; switch (message_type) { case PostgreSQLProtocol::Messaging::FrontMessageType::QUERY: diff --git a/src/Server/PostgreSQLHandler.h b/src/Server/PostgreSQLHandler.h index 1d33f41f255..4fd08cc2606 100644 --- a/src/Server/PostgreSQLHandler.h +++ b/src/Server/PostgreSQLHandler.h @@ -18,8 +18,9 @@ namespace CurrentMetrics namespace DB { - +class ReadBufferFromPocoSocket; class Session; +class TCPServer; /** PostgreSQL wire protocol implementation. 
* For more info see https://www.postgresql.org/docs/current/protocol.html @@ -30,6 +31,7 @@ public: PostgreSQLHandler( const Poco::Net::StreamSocket & socket_, IServer & server_, + TCPServer & tcp_server_, bool ssl_enabled_, Int32 connection_id_, std::vector> & auth_methods_); @@ -40,12 +42,13 @@ private: Poco::Logger * log = &Poco::Logger::get("PostgreSQLHandler"); IServer & server; + TCPServer & tcp_server; std::unique_ptr session; bool ssl_enabled = false; Int32 connection_id = 0; Int32 secret_key = 0; - std::shared_ptr in; + std::shared_ptr in; std::shared_ptr out; std::shared_ptr message_transport; diff --git a/src/Server/PostgreSQLHandlerFactory.cpp b/src/Server/PostgreSQLHandlerFactory.cpp index 1158cf5835e..6f2124861e7 100644 --- a/src/Server/PostgreSQLHandlerFactory.cpp +++ b/src/Server/PostgreSQLHandlerFactory.cpp @@ -1,5 +1,4 @@ #include "PostgreSQLHandlerFactory.h" -#include #include #include @@ -17,11 +16,11 @@ PostgreSQLHandlerFactory::PostgreSQLHandlerFactory(IServer & server_) }; } -Poco::Net::TCPServerConnection * PostgreSQLHandlerFactory::createConnection(const Poco::Net::StreamSocket & socket) +Poco::Net::TCPServerConnection * PostgreSQLHandlerFactory::createConnection(const Poco::Net::StreamSocket & socket, TCPServer & tcp_server) { Int32 connection_id = last_connection_id++; LOG_TRACE(log, "PostgreSQL connection. Id: {}. Address: {}", connection_id, socket.peerAddress().toString()); - return new PostgreSQLHandler(socket, server, ssl_enabled, connection_id, auth_methods); + return new PostgreSQLHandler(socket, server, tcp_server, ssl_enabled, connection_id, auth_methods); } } diff --git a/src/Server/PostgreSQLHandlerFactory.h b/src/Server/PostgreSQLHandlerFactory.h index dc3d4047d2a..e9241da6f0e 100644 --- a/src/Server/PostgreSQLHandlerFactory.h +++ b/src/Server/PostgreSQLHandlerFactory.h @@ -1,16 +1,16 @@ #pragma once -#include #include #include #include +#include #include #include namespace DB { -class PostgreSQLHandlerFactory : public Poco::Net::TCPServerConnectionFactory +class PostgreSQLHandlerFactory : public TCPServerConnectionFactory { private: IServer & server; @@ -28,6 +28,6 @@ private: public: explicit PostgreSQLHandlerFactory(IServer & server_); - Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket) override; + Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket, TCPServer & server) override; }; } diff --git a/src/Server/ProtocolServerAdapter.cpp b/src/Server/ProtocolServerAdapter.cpp index 6ec1ec572f7..b41ad2376f1 100644 --- a/src/Server/ProtocolServerAdapter.cpp +++ b/src/Server/ProtocolServerAdapter.cpp @@ -1,5 +1,5 @@ #include -#include +#include #if USE_GRPC #include @@ -11,20 +11,29 @@ namespace DB class ProtocolServerAdapter::TCPServerAdapterImpl : public Impl { public: - explicit TCPServerAdapterImpl(std::unique_ptr tcp_server_) : tcp_server(std::move(tcp_server_)) {} + explicit TCPServerAdapterImpl(std::unique_ptr tcp_server_) : tcp_server(std::move(tcp_server_)) {} ~TCPServerAdapterImpl() override = default; void start() override { tcp_server->start(); } void stop() override { tcp_server->stop(); } + bool isStopping() const override { return !tcp_server->isOpen(); } + UInt16 portNumber() const override { return tcp_server->portNumber(); } size_t currentConnections() const override { return tcp_server->currentConnections(); } size_t currentThreads() const override { return tcp_server->currentThreads(); } private: - std::unique_ptr tcp_server; + std::unique_ptr tcp_server; }; 
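Reviewer note: the PostgreSQL handler above no longer blocks indefinitely on the client socket; it loops on `tcp_server.isOpen()` and polls the input buffer in one-second slices, so a stopped server can wind down idle connections promptly. Below is a minimal, self-contained sketch of that polling pattern. The names `Server`, `pollSocket`, and `runConnection` are illustrative stand-ins, not the ClickHouse or Poco classes.

```cpp
#include <atomic>
#include <chrono>
#include <functional>
#include <iostream>
#include <thread>

// Stand-in for DB::TCPServer: exposes whether it is still accepting work.
struct Server
{
    std::atomic<bool> open{true};
    bool isOpen() const { return open.load(); }
    void stop() { open = false; }
};

// Stand-in for ReadBufferFromPocoSocket::poll(): returns true when client data is ready.
// The sketch just pretends to wait and never sees data.
bool pollSocket(std::chrono::microseconds /*timeout*/)
{
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
    return false;
}

// The handler loop: instead of blocking forever on a read, wait in short slices
// and bail out as soon as the owning server has been asked to stop.
void runConnection(Server & server)
{
    using namespace std::chrono_literals;
    while (server.isOpen())
    {
        while (!pollSocket(1s))       // wait up to the poll interval for a packet...
            if (!server.isOpen())
                return;               // ...but re-check the server state between waits
        // <process one client message here>
    }
}

int main()
{
    Server server;
    std::thread conn(runConnection, std::ref(server));
    std::this_thread::sleep_for(std::chrono::milliseconds(300));
    server.stop();                    // existing connections notice within one poll interval
    conn.join();
    std::cout << "connection exited after stop()\n";
}
```

The same idea appears in `TCPHandler::runImpl` further down, where the idle-wait loop adds a `tcp_server.isOpen()` check next to the existing `server.isCancelled()` check.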
-ProtocolServerAdapter::ProtocolServerAdapter(const char * port_name_, std::unique_ptr tcp_server_) - : port_name(port_name_), impl(std::make_unique(std::move(tcp_server_))) +ProtocolServerAdapter::ProtocolServerAdapter( + const std::string & listen_host_, + const char * port_name_, + const std::string & description_, + std::unique_ptr tcp_server_) + : listen_host(listen_host_) + , port_name(port_name_) + , description(description_) + , impl(std::make_unique(std::move(tcp_server_))) { } @@ -36,16 +45,30 @@ public: ~GRPCServerAdapterImpl() override = default; void start() override { grpc_server->start(); } - void stop() override { grpc_server->stop(); } + void stop() override + { + is_stopping = true; + grpc_server->stop(); + } + bool isStopping() const override { return is_stopping; } + UInt16 portNumber() const override { return grpc_server->portNumber(); } size_t currentConnections() const override { return grpc_server->currentConnections(); } size_t currentThreads() const override { return grpc_server->currentThreads(); } private: std::unique_ptr grpc_server; + bool is_stopping = false; }; -ProtocolServerAdapter::ProtocolServerAdapter(const char * port_name_, std::unique_ptr grpc_server_) - : port_name(port_name_), impl(std::make_unique(std::move(grpc_server_))) +ProtocolServerAdapter::ProtocolServerAdapter( + const std::string & listen_host_, + const char * port_name_, + const std::string & description_, + std::unique_ptr grpc_server_) + : listen_host(listen_host_) + , port_name(port_name_) + , description(description_) + , impl(std::make_unique(std::move(grpc_server_))) { } #endif diff --git a/src/Server/ProtocolServerAdapter.h b/src/Server/ProtocolServerAdapter.h index 04c46b53356..9b3b1af0301 100644 --- a/src/Server/ProtocolServerAdapter.h +++ b/src/Server/ProtocolServerAdapter.h @@ -2,14 +2,14 @@ #include +#include #include #include -namespace Poco::Net { class TCPServer; } - namespace DB { class GRPCServer; +class TCPServer; /// Provides an unified interface to access a protocol implementing server /// no matter what type it has (HTTPServer, TCPServer, MySQLServer, GRPCServer, ...). @@ -19,10 +19,10 @@ class ProtocolServerAdapter public: ProtocolServerAdapter(ProtocolServerAdapter && src) = default; ProtocolServerAdapter & operator =(ProtocolServerAdapter && src) = default; - ProtocolServerAdapter(const char * port_name_, std::unique_ptr tcp_server_); + ProtocolServerAdapter(const std::string & listen_host_, const char * port_name_, const std::string & description_, std::unique_ptr tcp_server_); #if USE_GRPC - ProtocolServerAdapter(const char * port_name_, std::unique_ptr grpc_server_); + ProtocolServerAdapter(const std::string & listen_host_, const char * port_name_, const std::string & description_, std::unique_ptr grpc_server_); #endif /// Starts the server. A new thread will be created that waits for and accepts incoming connections. @@ -31,14 +31,23 @@ public: /// Stops the server. No new connections will be accepted. void stop() { impl->stop(); } + bool isStopping() const { return impl->isStopping(); } + /// Returns the number of currently handled connections. size_t currentConnections() const { return impl->currentConnections(); } /// Returns the number of current threads. size_t currentThreads() const { return impl->currentThreads(); } + /// Returns the port this server is listening to. 
+ UInt16 portNumber() const { return impl->portNumber(); } + + const std::string & getListenHost() const { return listen_host; } + const std::string & getPortName() const { return port_name; } + const std::string & getDescription() const { return description; } + private: class Impl { @@ -46,13 +55,17 @@ private: virtual ~Impl() {} virtual void start() = 0; virtual void stop() = 0; + virtual bool isStopping() const = 0; + virtual UInt16 portNumber() const = 0; virtual size_t currentConnections() const = 0; virtual size_t currentThreads() const = 0; }; class TCPServerAdapterImpl; class GRPCServerAdapterImpl; + std::string listen_host; std::string port_name; + std::string description; std::unique_ptr impl; }; diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 3b1ce4cc846..6fa2b25d181 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -81,9 +82,10 @@ namespace ErrorCodes extern const int UNKNOWN_PROTOCOL; } -TCPHandler::TCPHandler(IServer & server_, const Poco::Net::StreamSocket & socket_, bool parse_proxy_protocol_, std::string server_display_name_) +TCPHandler::TCPHandler(IServer & server_, TCPServer & tcp_server_, const Poco::Net::StreamSocket & socket_, bool parse_proxy_protocol_, std::string server_display_name_) : Poco::Net::TCPServerConnection(socket_) , server(server_) + , tcp_server(tcp_server_) , parse_proxy_protocol(parse_proxy_protocol_) , log(&Poco::Logger::get("TCPHandler")) , server_display_name(std::move(server_display_name_)) @@ -172,13 +174,13 @@ void TCPHandler::runImpl() throw; } - while (true) + while (tcp_server.isOpen()) { /// We are waiting for a packet from the client. Thus, every `poll_interval` seconds check whether we need to shut down. { Stopwatch idle_time; UInt64 timeout_ms = std::min(poll_interval, idle_connection_timeout) * 1000000; - while (!server.isCancelled() && !static_cast(*in).poll(timeout_ms)) + while (tcp_server.isOpen() && !server.isCancelled() && !static_cast(*in).poll(timeout_ms)) { if (idle_time.elapsedSeconds() > idle_connection_timeout) { @@ -189,7 +191,7 @@ void TCPHandler::runImpl() } /// If we need to shut down, or client disconnects. - if (server.isCancelled() || in->eof()) + if (!tcp_server.isOpen() || server.isCancelled() || in->eof()) break; Stopwatch watch; @@ -323,7 +325,7 @@ void TCPHandler::runImpl() if (state.is_cancelled) return std::nullopt; - sendMergeTreeReadTaskRequstAssumeLocked(std::move(request)); + sendMergeTreeReadTaskRequestAssumeLocked(std::move(request)); return receivePartitionMergeTreeReadTaskResponseAssumeLocked(); }); @@ -803,7 +805,7 @@ void TCPHandler::sendReadTaskRequestAssumeLocked() } -void TCPHandler::sendMergeTreeReadTaskRequstAssumeLocked(PartitionReadRequest request) +void TCPHandler::sendMergeTreeReadTaskRequestAssumeLocked(PartitionReadRequest request) { writeVarUInt(Protocol::Server::MergeTreeReadTaskRequest, *out); request.serialize(*out); diff --git a/src/Server/TCPHandler.h b/src/Server/TCPHandler.h index 54af44759e7..6afda654e6a 100644 --- a/src/Server/TCPHandler.h +++ b/src/Server/TCPHandler.h @@ -35,6 +35,7 @@ class Session; struct Settings; class ColumnsDescription; struct ProfileInfo; +class TCPServer; /// State of query processing. struct QueryState @@ -127,7 +128,7 @@ public: * because it allows to check the IP ranges of the trusted proxy. * Proxy-forwarded (original client) IP address is used for quota accounting if quota is keyed by forwarded IP. 
*/ - TCPHandler(IServer & server_, const Poco::Net::StreamSocket & socket_, bool parse_proxy_protocol_, std::string server_display_name_); + TCPHandler(IServer & server_, TCPServer & tcp_server_, const Poco::Net::StreamSocket & socket_, bool parse_proxy_protocol_, std::string server_display_name_); ~TCPHandler() override; void run() override; @@ -137,6 +138,7 @@ public: private: IServer & server; + TCPServer & tcp_server; bool parse_proxy_protocol = false; Poco::Logger * log; @@ -237,7 +239,7 @@ private: void sendEndOfStream(); void sendPartUUIDs(); void sendReadTaskRequestAssumeLocked(); - void sendMergeTreeReadTaskRequstAssumeLocked(PartitionReadRequest request); + void sendMergeTreeReadTaskRequestAssumeLocked(PartitionReadRequest request); void sendProfileInfo(const ProfileInfo & info); void sendTotals(const Block & totals); void sendExtremes(const Block & extremes); diff --git a/src/Server/TCPHandlerFactory.h b/src/Server/TCPHandlerFactory.h index e610bea330c..03b2592198d 100644 --- a/src/Server/TCPHandlerFactory.h +++ b/src/Server/TCPHandlerFactory.h @@ -1,17 +1,17 @@ #pragma once -#include #include #include #include #include +#include namespace Poco { class Logger; } namespace DB { -class TCPHandlerFactory : public Poco::Net::TCPServerConnectionFactory +class TCPHandlerFactory : public TCPServerConnectionFactory { private: IServer & server; @@ -38,13 +38,13 @@ public: server_display_name = server.config().getString("display_name", getFQDNOrHostName()); } - Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket) override + Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket, TCPServer & tcp_server) override { try { LOG_TRACE(log, "TCP Request. Address: {}", socket.peerAddress().toString()); - return new TCPHandler(server, socket, parse_proxy_protocol, server_display_name); + return new TCPHandler(server, tcp_server, socket, parse_proxy_protocol, server_display_name); } catch (const Poco::Net::NetException &) { diff --git a/src/Server/TCPServer.cpp b/src/Server/TCPServer.cpp new file mode 100644 index 00000000000..380c4ef9924 --- /dev/null +++ b/src/Server/TCPServer.cpp @@ -0,0 +1,36 @@ +#include +#include + +namespace DB +{ + +class TCPServerConnectionFactoryImpl : public Poco::Net::TCPServerConnectionFactory +{ +public: + TCPServerConnectionFactoryImpl(TCPServer & tcp_server_, DB::TCPServerConnectionFactory::Ptr factory_) + : tcp_server(tcp_server_) + , factory(factory_) + {} + + Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket) override + { + return factory->createConnection(socket, tcp_server); + } +private: + TCPServer & tcp_server; + DB::TCPServerConnectionFactory::Ptr factory; +}; + +TCPServer::TCPServer( + TCPServerConnectionFactory::Ptr factory_, + Poco::ThreadPool & thread_pool, + Poco::Net::ServerSocket & socket_, + Poco::Net::TCPServerParams::Ptr params) + : Poco::Net::TCPServer(new TCPServerConnectionFactoryImpl(*this, factory_), thread_pool, socket_, params) + , factory(factory_) + , socket(socket_) + , is_open(true) + , port_number(socket.address().port()) +{} + +} diff --git a/src/Server/TCPServer.h b/src/Server/TCPServer.h new file mode 100644 index 00000000000..219fed5342b --- /dev/null +++ b/src/Server/TCPServer.h @@ -0,0 +1,47 @@ +#pragma once + +#include + +#include +#include + + +namespace DB +{ +class Context; + +class TCPServer : public Poco::Net::TCPServer +{ +public: + explicit TCPServer( + TCPServerConnectionFactory::Ptr factory, + Poco::ThreadPool & 
thread_pool, + Poco::Net::ServerSocket & socket, + Poco::Net::TCPServerParams::Ptr params = new Poco::Net::TCPServerParams); + + /// Close the socket and ask existing connections to stop serving queries + void stop() + { + Poco::Net::TCPServer::stop(); + // This notifies already established connections that they should stop serving + // queries and close their socket as soon as they can. + is_open = false; + // Poco's stop() stops listening on the socket but leaves it open. + // To be able to hand over control of the listening port to a new server, and + // to get fast connection refusal instead of timeouts, we also need to close + // the listening socket. + socket.close(); + } + + bool isOpen() const { return is_open; } + + UInt16 portNumber() const { return port_number; } + +private: + TCPServerConnectionFactory::Ptr factory; + Poco::Net::ServerSocket socket; + std::atomic is_open; + UInt16 port_number; +}; + +} diff --git a/src/Server/TCPServerConnectionFactory.h b/src/Server/TCPServerConnectionFactory.h new file mode 100644 index 00000000000..613f98352bd --- /dev/null +++ b/src/Server/TCPServerConnectionFactory.h @@ -0,0 +1,27 @@ +#pragma once + +#include + +namespace Poco +{ +namespace Net +{ + class StreamSocket; + class TCPServerConnection; +} +} +namespace DB +{ +class TCPServer; + +class TCPServerConnectionFactory +{ +public: + using Ptr = Poco::SharedPtr; + + virtual ~TCPServerConnectionFactory() = default; + + /// Same as Poco::Net::TCPServerConnectionFactory except we can pass the TCPServer + virtual Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket, TCPServer & tcp_server) = 0; +}; +} diff --git a/src/Storages/ExecutableSettings.h b/src/Storages/ExecutableSettings.h index 9c0cfc05fa5..c6c1f0b9eb2 100644 --- a/src/Storages/ExecutableSettings.h +++ b/src/Storages/ExecutableSettings.h @@ -9,16 +9,23 @@ namespace DB class ASTStorage; #define LIST_OF_EXECUTABLE_SETTINGS(M) \ - M(UInt64, send_chunk_header, false, "Send number_of_rows\n before sending chunk to process", 0) \ - M(UInt64, pool_size, 16, "Processes pool size. If size == 0, then there is no size restrictions", 0) \ + M(Bool, send_chunk_header, false, "Send number_of_rows\n before sending chunk to process.", 0) \ + M(UInt64, pool_size, 16, "Processes pool size. If size == 0, then there is no size restrictions.", 0) \ M(UInt64, max_command_execution_time, 10, "Max command execution time in seconds.", 0) \ M(UInt64, command_termination_timeout, 10, "Command termination timeout in seconds.", 0) \ + M(UInt64, command_read_timeout, 10000, "Timeout for reading data from command stdout in milliseconds.", 0) \ + M(UInt64, command_write_timeout, 10000, "Timeout for writing data to command stdin in milliseconds.", 0) DECLARE_SETTINGS_TRAITS(ExecutableSettingsTraits, LIST_OF_EXECUTABLE_SETTINGS) /// Settings for ExecutablePool engine. 
struct ExecutableSettings : public BaseSettings { + std::string script_name; + std::vector script_arguments; + + bool is_executable_pool = false; + void loadFromQuery(ASTStorage & storage_def); }; diff --git a/src/Storages/ExternalDataSourceConfiguration.cpp b/src/Storages/ExternalDataSourceConfiguration.cpp index 42b3b148551..265587d2b1a 100644 --- a/src/Storages/ExternalDataSourceConfiguration.cpp +++ b/src/Storages/ExternalDataSourceConfiguration.cpp @@ -16,6 +16,8 @@ #include #endif +#include + namespace DB { @@ -24,6 +26,12 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } +static const std::unordered_set dictionary_allowed_keys = { + "host", "port", "user", "password", "db", + "database", "table", "schema", "replica", + "update_field", "update_tag", "invalidate_query", "query", + "where", "name", "secure", "uri", "collection"}; + String ExternalDataSourceConfiguration::toString() const { WriteBufferFromOwnString configuration_info; @@ -159,10 +167,23 @@ std::optional getExternalDataSourceConfiguration(const return std::nullopt; } +static void validateConfigKeys( + const Poco::Util::AbstractConfiguration & dict_config, const String & config_prefix, HasConfigKeyFunc has_config_key_func) +{ + Poco::Util::AbstractConfiguration::Keys config_keys; + dict_config.keys(config_prefix, config_keys); + for (const auto & config_key : config_keys) + { + if (!has_config_key_func(config_key)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected key `{}` in dictionary source configuration", config_key); + } +} std::optional getExternalDataSourceConfiguration( - const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, ContextPtr context) + const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, + ContextPtr context, HasConfigKeyFunc has_config_key) { + validateConfigKeys(dict_config, dict_config_prefix, has_config_key); ExternalDataSourceConfiguration configuration; auto collection_name = dict_config.getString(dict_config_prefix + ".name", ""); @@ -170,6 +191,7 @@ std::optional getExternalDataSourceConfiguratio { const auto & config = context->getConfigRef(); const auto & collection_prefix = fmt::format("named_collections.{}", collection_name); + validateConfigKeys(dict_config, collection_prefix, has_config_key); if (!config.has(collection_prefix)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "There is no collection named `{}` in config", collection_name); @@ -178,14 +200,15 @@ std::optional getExternalDataSourceConfiguratio configuration.port = dict_config.getInt(dict_config_prefix + ".port", config.getUInt(collection_prefix + ".port", 0)); configuration.username = dict_config.getString(dict_config_prefix + ".user", config.getString(collection_prefix + ".user", "")); configuration.password = dict_config.getString(dict_config_prefix + ".password", config.getString(collection_prefix + ".password", "")); - configuration.database = dict_config.getString(dict_config_prefix + ".db", config.getString(collection_prefix + ".database", "")); + configuration.database = dict_config.getString(dict_config_prefix + ".db", config.getString(dict_config_prefix + ".database", + config.getString(collection_prefix + ".db", config.getString(collection_prefix + ".database", "")))); configuration.table = dict_config.getString(dict_config_prefix + ".table", config.getString(collection_prefix + ".table", "")); configuration.schema = dict_config.getString(dict_config_prefix + ".schema", config.getString(collection_prefix + ".schema", "")); if 
(configuration.host.empty() || configuration.port == 0 || configuration.username.empty() || configuration.table.empty()) { throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Named collection of connection parameters is missing some of the parameters and dictionary parameters are added"); + "Named collection of connection parameters is missing some of the parameters and dictionary parameters are not added"); } return configuration; } @@ -194,11 +217,12 @@ std::optional getExternalDataSourceConfiguratio ExternalDataSourcesByPriority getExternalDataSourceConfigurationByPriority( - const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, ContextPtr context) + const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, ContextPtr context, HasConfigKeyFunc has_config_key) { + validateConfigKeys(dict_config, dict_config_prefix, has_config_key); ExternalDataSourceConfiguration common_configuration; - auto named_collection = getExternalDataSourceConfiguration(dict_config, dict_config_prefix, context); + auto named_collection = getExternalDataSourceConfiguration(dict_config, dict_config_prefix, context, has_config_key); if (named_collection) { common_configuration = *named_collection; @@ -209,7 +233,7 @@ ExternalDataSourcesByPriority getExternalDataSourceConfigurationByPriority( common_configuration.port = dict_config.getUInt(dict_config_prefix + ".port", 0); common_configuration.username = dict_config.getString(dict_config_prefix + ".user", ""); common_configuration.password = dict_config.getString(dict_config_prefix + ".password", ""); - common_configuration.database = dict_config.getString(dict_config_prefix + ".db", ""); + common_configuration.database = dict_config.getString(dict_config_prefix + ".db", dict_config.getString(dict_config_prefix + ".database", "")); common_configuration.table = dict_config.getString(fmt::format("{}.table", dict_config_prefix), ""); common_configuration.schema = dict_config.getString(fmt::format("{}.schema", dict_config_prefix), ""); } @@ -233,8 +257,9 @@ ExternalDataSourcesByPriority getExternalDataSourceConfigurationByPriority( { ExternalDataSourceConfiguration replica_configuration(common_configuration); String replica_name = dict_config_prefix + "." 
+ config_key; - size_t priority = dict_config.getInt(replica_name + ".priority", 0); + validateConfigKeys(dict_config, replica_name, has_config_key); + size_t priority = dict_config.getInt(replica_name + ".priority", 0); replica_configuration.host = dict_config.getString(replica_name + ".host", common_configuration.host); replica_configuration.port = dict_config.getUInt(replica_name + ".port", common_configuration.port); replica_configuration.username = dict_config.getString(replica_name + ".user", common_configuration.username); diff --git a/src/Storages/ExternalDataSourceConfiguration.h b/src/Storages/ExternalDataSourceConfiguration.h index 502f8b800e3..930b83ffc71 100644 --- a/src/Storages/ExternalDataSourceConfiguration.h +++ b/src/Storages/ExternalDataSourceConfiguration.h @@ -64,8 +64,11 @@ struct ExternalDataSourceConfig */ std::optional getExternalDataSourceConfiguration(const ASTs & args, ContextPtr context, bool is_database_engine = false, bool throw_on_no_collection = true); +using HasConfigKeyFunc = std::function; + std::optional getExternalDataSourceConfiguration( - const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, ContextPtr context); + const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, + ContextPtr context, HasConfigKeyFunc has_config_key); /// Highest priority is 0, the bigger the number in map, the less the priority. @@ -80,7 +83,7 @@ struct ExternalDataSourcesByPriority }; ExternalDataSourcesByPriority -getExternalDataSourceConfigurationByPriority(const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, ContextPtr context); +getExternalDataSourceConfigurationByPriority(const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, ContextPtr context, HasConfigKeyFunc has_config_key); struct URLBasedDataSourceConfiguration @@ -88,7 +91,7 @@ struct URLBasedDataSourceConfiguration String url; String format; String compression_method = "auto"; - String structure; + String structure = "auto"; std::vector> headers; String http_method; diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 306c010d2cd..f22f6f66ced 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -14,7 +14,6 @@ #include #include -#include #include #include @@ -29,6 +28,8 @@ #include #include + +#include #include #include @@ -51,10 +52,70 @@ namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int ACCESS_DENIED; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } +namespace +{ + /* Recursive directory listing with matched paths as a result. + * Have the same method in StorageFile. 
+ */ + Strings LSWithRegexpMatching(const String & path_for_ls, const HDFSFSPtr & fs, const String & for_match) + { + const size_t first_glob = for_match.find_first_of("*?{"); -static Strings listFilesWithRegexpMatching(const String & path_for_ls, const HDFSFSPtr & fs, const String & for_match); + const size_t end_of_path_without_globs = for_match.substr(0, first_glob).rfind('/'); + const String suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/' + const String prefix_without_globs = path_for_ls + for_match.substr(1, end_of_path_without_globs); /// ends with '/' + const size_t next_slash = suffix_with_globs.find('/', 1); + re2::RE2 matcher(makeRegexpPatternFromGlobs(suffix_with_globs.substr(0, next_slash))); + + HDFSFileInfo ls; + ls.file_info = hdfsListDirectory(fs.get(), prefix_without_globs.data(), &ls.length); + Strings result; + for (int i = 0; i < ls.length; ++i) + { + const String full_path = String(ls.file_info[i].mName); + const size_t last_slash = full_path.rfind('/'); + const String file_name = full_path.substr(last_slash); + const bool looking_for_directory = next_slash != std::string::npos; + const bool is_directory = ls.file_info[i].mKind == 'D'; + /// Condition with type of current file_info means what kind of path is it in current iteration of ls + if (!is_directory && !looking_for_directory) + { + if (re2::RE2::FullMatch(file_name, matcher)) + { + result.push_back(String(ls.file_info[i].mName)); + } + } + else if (is_directory && looking_for_directory) + { + if (re2::RE2::FullMatch(file_name, matcher)) + { + Strings result_part = LSWithRegexpMatching(fs::path(full_path) / "", fs, suffix_with_globs.substr(next_slash)); + /// Recursion depth is limited by pattern. '*' works only for depth = 1, for depth = 2 pattern path is '*/*'. So we do not need additional check. 
+ std::move(result_part.begin(), result_part.end(), std::back_inserter(result)); + } + } + } + + return result; + } + + std::pair getPathFromUriAndUriWithoutPath(const String & uri) + { + const size_t begin_of_path = uri.find('/', uri.find("//") + 2); + return {uri.substr(begin_of_path), uri.substr(0, begin_of_path)}; + } + + std::vector getPathsList(const String & path_from_uri, const String & uri_without_path, ContextPtr context) + { + HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context->getGlobalContext()->getConfigRef()); + HDFSFSPtr fs = createHDFSFS(builder.get()); + + return LSWithRegexpMatching("/", fs, path_from_uri); + } +} StorageHDFS::StorageHDFS( const String & uri_, @@ -79,25 +140,52 @@ StorageHDFS::StorageHDFS( checkHDFSURL(uri); StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + + if (columns_.empty()) + { + auto columns = getTableStructureFromData(format_name, uri, compression_method, context_); + storage_metadata.setColumns(columns); + } + else + storage_metadata.setColumns(columns_); + storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); } +ColumnsDescription StorageHDFS::getTableStructureFromData( + const String & format, + const String & uri, + const String & compression_method, + ContextPtr ctx) +{ + auto read_buffer_creator = [&]() + { + const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(uri); + auto paths = getPathsList(path_from_uri, uri, ctx); + if (paths.empty()) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, because there are no files in HDFS with provided path. You must " + "specify table structure manually", + format); + + auto compression = chooseCompressionMethod(paths[0], compression_method); + return wrapReadBufferWithCompressionMethod( + std::make_unique(uri_without_path, paths[0], ctx->getGlobalContext()->getConfigRef()), compression); + }; + + return readSchemaFromFormat(format, std::nullopt, read_buffer_creator, ctx); +} + class HDFSSource::DisclosedGlobIterator::Impl { public: Impl(ContextPtr context_, const String & uri) { - const size_t begin_of_path = uri.find('/', uri.find("//") + 2); - const String path_from_uri = uri.substr(begin_of_path); - const String uri_without_path = uri.substr(0, begin_of_path); /// ends without '/' - - HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context_->getGlobalContext()->getConfigRef()); - HDFSFSPtr fs = createHDFSFS(builder.get()); - - uris = listFilesWithRegexpMatching("/", fs, path_from_uri); + const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(uri); + uris = getPathsList(path_from_uri, uri_without_path, context_); for (auto & elem : uris) elem = uri_without_path + elem; uris_iter = uris.begin(); @@ -176,6 +264,12 @@ HDFSSource::HDFSSource( initialize(); } +void HDFSSource::onCancel() +{ + if (reader) + reader->cancel(); +} + bool HDFSSource::initialize() { current_path = (*file_iterator)(); @@ -333,51 +427,6 @@ private: }; -/* Recursive directory listing with matched paths as a result. - * Have the same method in StorageFile. 
- */ -Strings listFilesWithRegexpMatching(const String & path_for_ls, const HDFSFSPtr & fs, const String & for_match) -{ - const size_t first_glob = for_match.find_first_of("*?{"); - - const size_t end_of_path_without_globs = for_match.substr(0, first_glob).rfind('/'); - const String suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/' - const String prefix_without_globs = path_for_ls + for_match.substr(1, end_of_path_without_globs); /// ends with '/' - - const size_t next_slash = suffix_with_globs.find('/', 1); - re2::RE2 matcher(makeRegexpPatternFromGlobs(suffix_with_globs.substr(0, next_slash))); - - HDFSFileInfo ls; - ls.file_info = hdfsListDirectory(fs.get(), prefix_without_globs.data(), &ls.length); - Strings result; - for (int i = 0; i < ls.length; ++i) - { - const String full_path = String(ls.file_info[i].mName); - const size_t last_slash = full_path.rfind('/'); - const String file_name = full_path.substr(last_slash); - const bool looking_for_directory = next_slash != std::string::npos; - const bool is_directory = ls.file_info[i].mKind == 'D'; - /// Condition with type of current file_info means what kind of path is it in current iteration of ls - if (!is_directory && !looking_for_directory) - { - if (re2::RE2::FullMatch(file_name, matcher)) - { - result.push_back(String(ls.file_info[i].mName)); - } - } - else if (is_directory && looking_for_directory) - { - if (re2::RE2::FullMatch(file_name, matcher)) - { - Strings result_part = listFilesWithRegexpMatching(fs::path(full_path) / "", fs, suffix_with_globs.substr(next_slash)); - /// Recursion depth is limited by pattern. '*' works only for depth = 1, for depth = 2 pattern path is '*/*'. So we do not need additional check. - std::move(result_part.begin(), result_part.end(), std::back_inserter(result)); - } - } - } - return result; -} - bool StorageHDFS::isColumnOriented() const { return format_name != "Distributed" && FormatFactory::instance().checkIfFormatIsColumnOriented(format_name); @@ -394,6 +443,7 @@ Pipe StorageHDFS::read( { bool need_path_column = false; bool need_file_column = false; + for (const auto & column : column_names) { if (column == "_path") @@ -522,6 +572,7 @@ void registerStorageHDFS(StorageFactory & factory) }, { .supports_sort_order = true, // for partition by + .supports_schema_inference = true, .source_access_type = AccessType::HDFS, }); } diff --git a/src/Storages/HDFS/StorageHDFS.h b/src/Storages/HDFS/StorageHDFS.h index 611ea3476b0..9e845d8fd74 100644 --- a/src/Storages/HDFS/StorageHDFS.h +++ b/src/Storages/HDFS/StorageHDFS.h @@ -31,7 +31,7 @@ public: size_t max_block_size, unsigned num_streams) override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr /*context*/) override; void truncate( const ASTPtr & query, @@ -49,6 +49,12 @@ public: /// format to read only them. Note: this hack cannot be done with ordinary formats like TSV. 
bool isColumnOriented() const; + static ColumnsDescription getTableStructureFromData( + const String & format, + const String & uri, + const String & compression_method, + ContextPtr ctx); + protected: friend class HDFSSource; StorageHDFS( @@ -118,6 +124,8 @@ public: Chunk generate() override; + void onCancel() override; + private: StorageHDFSPtr storage; StorageMetadataPtr metadata_snapshot; diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index 021335fea1f..a923258b111 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -139,7 +139,6 @@ void IStorage::alter(const AlterCommands & params, ContextPtr context, AlterLock setInMemoryMetadata(new_metadata); } - void IStorage::checkAlterIsPossible(const AlterCommands & commands, ContextPtr /* context */) const { for (const auto & command : commands) diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index 2855e21356d..2a964aecd4e 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -361,10 +361,10 @@ void Service::sendPartFromDiskRemoteMeta(const MergeTreeData::DataPartPtr & part MergeTreeData::DataPartPtr Service::findPart(const String & name) { - /// It is important to include PreCommitted and Outdated parts here because remote replicas cannot reliably + /// It is important to include PreActive and Outdated parts here because remote replicas cannot reliably /// determine the local state of the part, so queries for the parts in these states are completely normal. auto part = data.getPartIfExists( - name, {MergeTreeDataPartState::PreCommitted, MergeTreeDataPartState::Committed, MergeTreeDataPartState::Outdated}); + name, {MergeTreeDataPartState::PreActive, MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}); if (part) return part; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 83328594363..da412e4941e 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -28,6 +28,8 @@ namespace CurrentMetrics extern const Metric PartsTemporary; extern const Metric PartsPreCommitted; extern const Metric PartsCommitted; + extern const Metric PartsPreActive; + extern const Metric PartsActive; extern const Metric PartsOutdated; extern const Metric PartsDeleting; extern const Metric PartsDeleteOnDestroy; @@ -189,10 +191,12 @@ static void incrementStateMetric(IMergeTreeDataPart::State state) case IMergeTreeDataPart::State::Temporary: CurrentMetrics::add(CurrentMetrics::PartsTemporary); return; - case IMergeTreeDataPart::State::PreCommitted: + case IMergeTreeDataPart::State::PreActive: + CurrentMetrics::add(CurrentMetrics::PartsPreActive); CurrentMetrics::add(CurrentMetrics::PartsPreCommitted); return; - case IMergeTreeDataPart::State::Committed: + case IMergeTreeDataPart::State::Active: + CurrentMetrics::add(CurrentMetrics::PartsActive); CurrentMetrics::add(CurrentMetrics::PartsCommitted); return; case IMergeTreeDataPart::State::Outdated: @@ -214,10 +218,12 @@ static void decrementStateMetric(IMergeTreeDataPart::State state) case IMergeTreeDataPart::State::Temporary: CurrentMetrics::sub(CurrentMetrics::PartsTemporary); return; - case IMergeTreeDataPart::State::PreCommitted: + case IMergeTreeDataPart::State::PreActive: + CurrentMetrics::sub(CurrentMetrics::PartsPreActive); CurrentMetrics::sub(CurrentMetrics::PartsPreCommitted); return; - case IMergeTreeDataPart::State::Committed: 
+ case IMergeTreeDataPart::State::Active: + CurrentMetrics::sub(CurrentMetrics::PartsActive); CurrentMetrics::sub(CurrentMetrics::PartsCommitted); return; case IMergeTreeDataPart::State::Outdated: @@ -286,7 +292,7 @@ IMergeTreeDataPart::IMergeTreeDataPart( , parent_part(parent_part_) { if (parent_part) - state = State::Committed; + state = State::Active; incrementStateMetric(state); incrementTypeMetric(part_type); @@ -311,7 +317,7 @@ IMergeTreeDataPart::IMergeTreeDataPart( , parent_part(parent_part_) { if (parent_part) - state = State::Committed; + state = State::Active; incrementStateMetric(state); incrementTypeMetric(part_type); @@ -1153,6 +1159,14 @@ void IMergeTreeDataPart::renameTo(const String & new_relative_path, bool remove_ storage.lockSharedData(*this); } +void IMergeTreeDataPart::cleanupOldName(const String & old_part_name) const +{ + if (name == old_part_name) + return; + + storage.unlockSharedData(*this, old_part_name); +} + std::optional IMergeTreeDataPart::keepSharedDataInDecoupledStorage() const { /// NOTE: It's needed for zero-copy replication @@ -1615,6 +1629,12 @@ String IMergeTreeDataPart::getUniqueId() const } +UInt32 IMergeTreeDataPart::getNumberOfRefereneces() const +{ + return volume->getDisk()->getRefCount(fs::path(getFullRelativePath()) / "checksums.txt"); +} + + String IMergeTreeDataPart::getZeroLevelPartBlockID() const { if (info.level != 0) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index ab08ca1c33a..09449dc7521 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -218,19 +218,19 @@ public: * Part state should be modified under data_parts mutex. * * Possible state transitions: - * Temporary -> Precommitted: we are trying to commit a fetched, inserted or merged part to active set - * Precommitted -> Outdated: we could not add a part to active set and are doing a rollback (for example it is duplicated part) - * Precommitted -> Committed: we successfully committed a part to active dataset - * Precommitted -> Outdated: a part was replaced by a covering part or DROP PARTITION + * Temporary -> PreActive: we are trying to add a fetched, inserted or merged part to active set + * PreActive -> Outdated: we could not add a part to active set and are doing a rollback (for example it is duplicated part) + * PreActive -> Active: we successfully added a part to active dataset + * PreActive -> Outdated: a part was replaced by a covering part or DROP PARTITION * Outdated -> Deleting: a cleaner selected this part for deletion * Deleting -> Outdated: if an ZooKeeper error occurred during the deletion, we will retry deletion - * Committed -> DeleteOnDestroy: if part was moved to another disk + * Active -> DeleteOnDestroy: if part was moved to another disk */ enum class State { Temporary, /// the part is generating now, it is not in data_parts list - PreCommitted, /// the part is in data_parts, but not used for SELECTs - Committed, /// active data part, used by current and upcoming SELECTs + PreActive, /// the part is in data_parts, but not used for SELECTs + Active, /// active data part, used by current and upcoming SELECTs Outdated, /// not active data part, but could be used by only current SELECTs, could be deleted after SELECTs finishes Deleting, /// not active data part with identity refcounter, it is deleting right now by a cleaner DeleteOnDestroy, /// part was moved to another disk and should be deleted in own destructor @@ -338,6 +338,9 @@ public: /// 
Changes only relative_dir_name, you need to update other metadata (name, is_temp) explicitly virtual void renameTo(const String & new_relative_path, bool remove_new_dir_if_exists) const; + /// Cleanup shared locks made with old name after part renaming + virtual void cleanupOldName(const String & old_part_name) const; + /// Makes clone of a part in detached/ directory via hard links virtual void makeCloneInDetached(const String & prefix, const StorageMetadataPtr & metadata_snapshot) const; @@ -404,10 +407,14 @@ public: /// part creation (using alter query with materialize_ttl setting). bool checkAllTTLCalculated(const StorageMetadataPtr & metadata_snapshot) const; - /// Return some uniq string for file - /// Required for distinguish different copies of the same part on S3 + /// Return some uniq string for file. + /// Required for distinguish different copies of the same part on remote FS. String getUniqueId() const; + /// Return hardlink count for part. + /// Required for keep data on remote FS when part has shadow copies. + UInt32 getNumberOfRefereneces() const; + protected: /// Total size of all columns, calculated once in calcuateColumnSizesOnDisk diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index fbc818a7de9..5b69a4e68b6 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -22,6 +22,7 @@ namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER; extern const int LOGICAL_ERROR; + extern const int QUERY_WAS_CANCELLED; } @@ -131,8 +132,9 @@ bool MergeTreeBaseSelectProcessor::getTaskFromBuffer() if (Status::Accepted == res) return true; + /// To avoid any possibility of ignoring cancellation, exception will be thrown. 
if (Status::Cancelled == res) - break; + throw Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Query had been cancelled"); } return false; } @@ -165,8 +167,18 @@ Chunk MergeTreeBaseSelectProcessor::generate() { while (!isCancelled()) { - if ((!task || task->isFinished()) && !getNewTask()) - return {}; + try + { + if ((!task || task->isFinished()) && !getNewTask()) + return {}; + } + catch (const Exception & e) + { + /// See MergeTreeBaseSelectProcessor::getTaskFromBuffer() + if (e.code() == ErrorCodes::QUERY_WAS_CANCELLED) + return {}; + throw; + } auto res = readFromPart(); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 1b7be8ca98d..b38a0112116 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -62,6 +63,7 @@ #include #include +#include #include #include @@ -224,7 +226,6 @@ MergeTreeData::MergeTreeData( { try { - checkPartitionKeyAndInitMinMax(metadata_.partition_key); setProperties(metadata_, metadata_, attach); if (minmax_idx_date_column_pos == -1) @@ -358,10 +359,11 @@ static void checkKeyExpression(const ExpressionActions & expr, const Block & sam { const ColumnPtr & column = element.column; if (column && (isColumnConst(*column) || column->isDummy())) - throw Exception{key_name + " key cannot contain constants", ErrorCodes::ILLEGAL_COLUMN}; + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "{} key cannot contain constants", key_name); - if (!allow_nullable_key && element.type->isNullable()) - throw Exception{key_name + " key cannot contain nullable columns", ErrorCodes::ILLEGAL_COLUMN}; + if (!allow_nullable_key && hasNullable(element.type)) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, "{} key contains nullable columns, but `setting allow_nullable_key` is disabled", key_name); } } @@ -1038,8 +1040,8 @@ void MergeTreeData::loadDataPartsFromDisk( has_adaptive_parts.store(true, std::memory_order_relaxed); part->modification_time = part_disk_ptr->getLastModified(fs::path(relative_data_path) / part_name).epochTime(); - /// Assume that all parts are Committed, covered parts will be detected and marked as Outdated later - part->setState(DataPartState::Committed); + /// Assume that all parts are Active, covered parts will be detected and marked as Outdated later + part->setState(DataPartState::Active); std::lock_guard loading_lock(mutex); auto [it, inserted] = data_parts_indexes.insert(part); @@ -1131,12 +1133,12 @@ void MergeTreeData::loadDataPartsFromWAL( { for (auto & part : parts_from_wal) { - if (getActiveContainingPart(part->info, DataPartState::Committed, part_lock)) + if (getActiveContainingPart(part->info, DataPartState::Active, part_lock)) continue; part->modification_time = time(nullptr); - /// Assume that all parts are Committed, covered parts will be detected and marked as Outdated later - part->setState(DataPartState::Committed); + /// Assume that all parts are Active, covered parts will be detected and marked as Outdated later + part->setState(DataPartState::Active); auto [it, inserted] = data_parts_indexes.insert(part); if (!inserted) @@ -1292,9 +1294,9 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) removePartContributionToDataVolume(*it); }; - (*prev_jt)->assertState({DataPartState::Committed}); + (*prev_jt)->assertState({DataPartState::Active}); - while (curr_jt != data_parts_by_state_and_info.end() && (*curr_jt)->getState() == DataPartState::Committed) + while (curr_jt != 
data_parts_by_state_and_info.end() && (*curr_jt)->getState() == DataPartState::Active) { /// Don't consider data parts belonging to different partitions. if ((*curr_jt)->info.partition_id != (*prev_jt)->info.partition_id) @@ -2368,8 +2370,8 @@ MergeTreeData::DataPartsVector MergeTreeData::getActivePartsToReplace( DataPartsLock & /* data_parts_lock */) const { /// Parts contained in the part are consecutive in data_parts, intersecting the insertion place for the part itself. - auto it_middle = data_parts_by_state_and_info.lower_bound(DataPartStateAndInfo{DataPartState::Committed, new_part_info}); - auto committed_parts_range = getDataPartsStateRange(DataPartState::Committed); + auto it_middle = data_parts_by_state_and_info.lower_bound(DataPartStateAndInfo{DataPartState::Active, new_part_info}); + auto committed_parts_range = getDataPartsStateRange(DataPartState::Active); /// Go to the left. DataPartIteratorByStateAndInfo begin = it_middle; @@ -2457,6 +2459,8 @@ bool MergeTreeData::renameTempPartAndReplace( MergeTreePartInfo part_info = part->info; String part_name; + String old_part_name = part->name; + if (DataPartPtr existing_part_in_partition = getAnyPartInPartition(part->info.partition_id, lock)) { if (part->partition.value != existing_part_in_partition->partition.value) @@ -2520,10 +2524,11 @@ bool MergeTreeData::renameTempPartAndReplace( /// So, we maintain invariant: if a non-temporary part in filesystem then it is in data_parts /// /// If out_transaction is null, we commit the part to the active set immediately, else add it to the transaction. + part->name = part_name; part->info = part_info; part->is_temp = false; - part->setState(DataPartState::PreCommitted); + part->setState(DataPartState::PreActive); part->renameTo(part_name, true); auto part_it = data_parts_indexes.insert(part).first; @@ -2550,7 +2555,7 @@ bool MergeTreeData::renameTempPartAndReplace( decreaseDataVolume(reduce_bytes, reduce_rows, reduce_parts); - modifyPartState(part_it, DataPartState::Committed); + modifyPartState(part_it, DataPartState::Active); addPartContributionToColumnAndSecondaryIndexSizes(part); addPartContributionToDataVolume(part); } @@ -2568,6 +2573,9 @@ bool MergeTreeData::renameTempPartAndReplace( out_covered_parts->emplace_back(std::move(covered_part)); } + /// Cleanup shared locks made with old name + part->cleanupOldName(old_part_name); + return true; } @@ -2592,13 +2600,13 @@ void MergeTreeData::removePartsFromWorkingSet(const MergeTreeData::DataPartsVect for (const DataPartPtr & part : remove) { - if (part->getState() == IMergeTreeDataPart::State::Committed) + if (part->getState() == IMergeTreeDataPart::State::Active) { removePartContributionToColumnAndSecondaryIndexSizes(part); removePartContributionToDataVolume(part); } - if (part->getState() == IMergeTreeDataPart::State::Committed || clear_without_timeout) + if (part->getState() == IMergeTreeDataPart::State::Active || clear_without_timeout) part->remove_time.store(remove_time, std::memory_order_relaxed); if (part->getState() != IMergeTreeDataPart::State::Outdated) @@ -2634,7 +2642,7 @@ void MergeTreeData::removePartsFromWorkingSet(const DataPartsVector & remove, bo if (!data_parts_by_info.count(part->info)) throw Exception("Part " + part->getNameWithState() + " not found in data_parts", ErrorCodes::LOGICAL_ERROR); - part->assertState({DataPartState::PreCommitted, DataPartState::Committed, DataPartState::Outdated}); + part->assertState({DataPartState::PreActive, DataPartState::Active, DataPartState::Outdated}); } 
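Reviewer note: these hunks mechanically rename the part states `PreCommitted`/`Committed` to `PreActive`/`Active`; the life cycle itself, as documented in the updated comment in IMergeTreeDataPart.h, is unchanged. A small sketch of that state machine with the new names follows; the enum and checker are illustrative only, not the actual IMergeTreeDataPart code, and they encode exactly the transitions listed in that comment.

```cpp
#include <cassert>

// Part states after the rename (PreCommitted -> PreActive, Committed -> Active).
enum class PartState { Temporary, PreActive, Active, Outdated, Deleting, DeleteOnDestroy };

// Transitions documented in IMergeTreeDataPart.h:
//   Temporary -> PreActive        (fetched/inserted/merged part is being added to the active set)
//   PreActive -> Active           (successfully added to the active set)
//   PreActive -> Outdated         (rollback, or replaced by a covering part / DROP PARTITION)
//   Outdated  -> Deleting         (selected for deletion by the cleaner)
//   Deleting  -> Outdated         (ZooKeeper error during deletion, retry later)
//   Active    -> DeleteOnDestroy  (part was moved to another disk)
bool isAllowedTransition(PartState from, PartState to)
{
    switch (from)
    {
        case PartState::Temporary:       return to == PartState::PreActive;
        case PartState::PreActive:       return to == PartState::Active || to == PartState::Outdated;
        case PartState::Active:          return to == PartState::DeleteOnDestroy;
        case PartState::Outdated:        return to == PartState::Deleting;
        case PartState::Deleting:        return to == PartState::Outdated;
        case PartState::DeleteOnDestroy: return false;
    }
    return false;
}

int main()
{
    // Happy path of renameTempPartAndReplace() followed by a transaction commit.
    assert(isAllowedTransition(PartState::Temporary, PartState::PreActive));
    assert(isAllowedTransition(PartState::PreActive, PartState::Active));
    // A duplicate insert is rolled back instead of being activated.
    assert(isAllowedTransition(PartState::PreActive, PartState::Outdated));
    // A part never jumps straight from Temporary to Active.
    assert(!isAllowedTransition(PartState::Temporary, PartState::Active));
}
```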
removePartsFromWorkingSet(remove, clear_without_timeout, lock); @@ -2732,7 +2740,7 @@ restore_covered) /// What if part_to_detach is a reference to *it_part? Make a new owner just in case. DataPartPtr part = *it_part; - if (part->getState() == DataPartState::Committed) + if (part->getState() == DataPartState::Active) { removePartContributionToDataVolume(part); removePartContributionToColumnAndSecondaryIndexSizes(part); @@ -2759,7 +2767,7 @@ restore_covered) auto is_appropriate_state = [] (DataPartState state) { - return state == DataPartState::Committed || state == DataPartState::Outdated; + return state == DataPartState::Active || state == DataPartState::Outdated; }; auto update_error = [&] (DataPartIteratorByInfo it) @@ -2781,11 +2789,11 @@ restore_covered) if ((*it)->info.min_block != part->info.min_block) update_error(it); - if ((*it)->getState() != DataPartState::Committed) + if ((*it)->getState() != DataPartState::Active) { addPartContributionToColumnAndSecondaryIndexSizes(*it); addPartContributionToDataVolume(*it); - modifyPartState(it, DataPartState::Committed); // iterator is not invalidated here + modifyPartState(it, DataPartState::Active); // iterator is not invalidated here } pos = (*it)->info.max_block + 1; @@ -2812,11 +2820,11 @@ restore_covered) if ((*it)->info.min_block > pos) update_error(it); - if ((*it)->getState() != DataPartState::Committed) + if ((*it)->getState() != DataPartState::Active) { addPartContributionToColumnAndSecondaryIndexSizes(*it); addPartContributionToDataVolume(*it); - modifyPartState(it, DataPartState::Committed); + modifyPartState(it, DataPartState::Active); } pos = (*it)->info.max_block + 1; @@ -2930,7 +2938,7 @@ size_t MergeTreeData::getMaxPartsCountForPartitionWithState(DataPartState state) size_t MergeTreeData::getMaxPartsCountForPartition() const { - return getMaxPartsCountForPartitionWithState(DataPartState::Committed); + return getMaxPartsCountForPartitionWithState(DataPartState::Active); } @@ -2945,7 +2953,7 @@ std::optional MergeTreeData::getMinPartDataVersion() const auto lock = lockParts(); std::optional result; - for (const auto & part : getDataPartsStateRange(DataPartState::Committed)) + for (const auto & part : getDataPartsStateRange(DataPartState::Active)) { if (!result || *result > part->info.getDataVersion()) result = part->info.getDataVersion(); @@ -3051,7 +3059,7 @@ MergeTreeData::DataPartPtr MergeTreeData::getActiveContainingPart( void MergeTreeData::swapActivePart(MergeTreeData::DataPartPtr part_copy) { auto lock = lockParts(); - for (auto original_active_part : getDataPartsStateRange(DataPartState::Committed)) // NOLINT (copy is intended) + for (auto original_active_part : getDataPartsStateRange(DataPartState::Active)) // NOLINT (copy is intended) { if (part_copy->name == original_active_part->name) { @@ -3076,7 +3084,7 @@ void MergeTreeData::swapActivePart(MergeTreeData::DataPartPtr part_copy) data_parts_indexes.erase(active_part_it); auto part_it = data_parts_indexes.insert(part_copy).first; - modifyPartState(part_it, DataPartState::Committed); + modifyPartState(part_it, DataPartState::Active); removePartContributionToDataVolume(original_active_part); addPartContributionToDataVolume(part_copy); @@ -3101,7 +3109,7 @@ void MergeTreeData::swapActivePart(MergeTreeData::DataPartPtr part_copy) MergeTreeData::DataPartPtr MergeTreeData::getActiveContainingPart(const MergeTreePartInfo & part_info) const { auto lock = lockParts(); - return getActiveContainingPart(part_info, DataPartState::Committed, lock); + return 
getActiveContainingPart(part_info, DataPartState::Active, lock); } MergeTreeData::DataPartPtr MergeTreeData::getActiveContainingPart(const String & part_name) const @@ -3171,7 +3179,7 @@ void MergeTreeData::calculateColumnAndSecondaryIndexSizesImpl() column_sizes.clear(); /// Take into account only committed parts - auto committed_parts_range = getDataPartsStateRange(DataPartState::Committed); + auto committed_parts_range = getDataPartsStateRange(DataPartState::Active); for (const auto & part : committed_parts_range) addPartContributionToColumnAndSecondaryIndexSizes(part); } @@ -3266,7 +3274,7 @@ void MergeTreeData::checkAlterPartitionIsPossible( void MergeTreeData::checkPartitionCanBeDropped(const ASTPtr & partition) { const String partition_id = getPartitionIDFromQuery(partition, getContext()); - auto parts_to_remove = getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, partition_id); + auto parts_to_remove = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); UInt64 partition_size = 0; @@ -3279,7 +3287,7 @@ void MergeTreeData::checkPartitionCanBeDropped(const ASTPtr & partition) void MergeTreeData::checkPartCanBeDropped(const String & part_name) { - auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Committed}); + auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active}); if (!part) throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "No part {} in committed state", part_name); @@ -3305,7 +3313,7 @@ void MergeTreeData::movePartitionToDisk(const ASTPtr & partition, const String & throw Exception("Part " + partition_id + " is not exists or not active", ErrorCodes::NO_SUCH_DATA_PART); } else - parts = getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, partition_id); + parts = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); auto disk = getStoragePolicy()->getDiskByName(name); if (!disk) @@ -3350,7 +3358,7 @@ void MergeTreeData::movePartitionToVolume(const ASTPtr & partition, const String throw Exception("Part " + partition_id + " is not exists or not active", ErrorCodes::NO_SUCH_DATA_PART); } else - parts = getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, partition_id); + parts = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); auto volume = getStoragePolicy()->getVolumeByName(name); if (!volume) @@ -3532,7 +3540,7 @@ BackupEntries MergeTreeData::backup(const ASTs & partitions, ContextPtr local_co if (partitions.empty()) data_parts = getDataPartsVector(); else - data_parts = getDataPartsVectorInPartitions(MergeTreeDataPartState::Committed, getPartitionIDsFromQuery(partitions, local_context)); + data_parts = getDataPartsVectorInPartitions(MergeTreeDataPartState::Active, getPartitionIDsFromQuery(partitions, local_context)); return backupDataParts(data_parts); } @@ -3906,8 +3914,8 @@ void MergeTreeData::dropDetached(const ASTPtr & partition, bool part, ContextPtr for (auto & [old_name, new_name, disk] : renamed_parts.old_and_new_names) { - disk->removeRecursive(fs::path(relative_data_path) / "detached" / new_name / ""); - LOG_DEBUG(log, "Dropped detached part {}", old_name); + bool keep_shared = removeDetachedPart(disk, fs::path(relative_data_path) / "detached" / new_name / "", old_name, false); + LOG_DEBUG(log, "Dropped detached part {}, keep shared data: {}", old_name, keep_shared); old_name.clear(); } } @@ -4170,20 +4178,20 @@ MergeTreeData::DataParts MergeTreeData::getDataParts(const DataPartStates & affo MergeTreeData::DataParts 
MergeTreeData::getDataParts() const { - return getDataParts({DataPartState::Committed}); + return getDataParts({DataPartState::Active}); } MergeTreeData::DataPartsVector MergeTreeData::getDataPartsVector() const { - return getDataPartsVector({DataPartState::Committed}); + return getDataPartsVector({DataPartState::Active}); } MergeTreeData::DataPartPtr MergeTreeData::getAnyPartInPartition( const String & partition_id, DataPartsLock & /*data_parts_lock*/) const { - auto it = data_parts_by_state_and_info.lower_bound(DataPartStateAndPartitionID{DataPartState::Committed, partition_id}); + auto it = data_parts_by_state_and_info.lower_bound(DataPartStateAndPartitionID{DataPartState::Active, partition_id}); - if (it != data_parts_by_state_and_info.end() && (*it)->getState() == DataPartState::Committed && (*it)->info.partition_id == partition_id) + if (it != data_parts_by_state_and_info.end() && (*it)->getState() == DataPartState::Active && (*it)->info.partition_id == partition_id) return *it; return nullptr; @@ -4276,7 +4284,7 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(MergeTreeData: add_rows += part->rows_count; ++add_parts; - data.modifyPartState(part, DataPartState::Committed); + data.modifyPartState(part, DataPartState::Active); data.addPartContributionToColumnAndSecondaryIndexSizes(part); } } @@ -5195,7 +5203,9 @@ PartitionCommandsResultInfo MergeTreeData::freezePartitionsByMatcher( LOG_DEBUG(log, "Freezing part {} snapshot will be placed at {}", part->name, backup_path); - part->volume->getDisk()->createDirectories(backup_path); + auto disk = part->volume->getDisk(); + + disk->createDirectories(backup_path); String src_part_path = part->getFullRelativePath(); String backup_part_path = fs::path(backup_path) / relative_data_path / part->relative_path; @@ -5206,16 +5216,20 @@ PartitionCommandsResultInfo MergeTreeData::freezePartitionsByMatcher( src_part_path = fs::path(relative_data_path) / flushed_part_path / ""; } - localBackup(part->volume->getDisk(), src_part_path, backup_part_path); + localBackup(disk, src_part_path, backup_part_path); - part->volume->getDisk()->removeFileIfExists(fs::path(backup_part_path) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME); + // Store metadata for replicated table. + // Do nothing for non-replocated. 
+ createAndStoreFreezeMetadata(disk, part, backup_part_path); + + disk->removeFileIfExists(fs::path(backup_part_path) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME); part->is_frozen.store(true, std::memory_order_relaxed); result.push_back(PartitionCommandResultInfo{ .partition_id = part->info.partition_id, .part_name = part->name, - .backup_path = fs::path(part->volume->getDisk()->getPath()) / backup_path, - .part_backup_path = fs::path(part->volume->getDisk()->getPath()) / backup_part_path, + .backup_path = fs::path(disk->getPath()) / backup_path, + .part_backup_path = fs::path(disk->getPath()) / backup_part_path, .backup_name = backup_name, }); ++parts_processed; @@ -5225,6 +5239,11 @@ PartitionCommandsResultInfo MergeTreeData::freezePartitionsByMatcher( return result; } +void MergeTreeData::createAndStoreFreezeMetadata(DiskPtr, DataPartPtr, String) const +{ + +} + PartitionCommandsResultInfo MergeTreeData::unfreezePartition( const ASTPtr & partition, const String & backup_name, @@ -5242,6 +5261,13 @@ PartitionCommandsResultInfo MergeTreeData::unfreezeAll( return unfreezePartitionsByMatcher([] (const String &) { return true; }, backup_name, local_context); } +bool MergeTreeData::removeDetachedPart(DiskPtr disk, const String & path, const String &, bool) +{ + disk->removeRecursive(path); + + return false; +} + PartitionCommandsResultInfo MergeTreeData::unfreezePartitionsByMatcher(MatcherFn matcher, const String & backup_name, ContextPtr) { auto backup_path = fs::path("shadow") / escapeForFileName(backup_name) / relative_data_path; @@ -5270,7 +5296,7 @@ PartitionCommandsResultInfo MergeTreeData::unfreezePartitionsByMatcher(MatcherFn const auto & path = it->path(); - disk->removeRecursive(path); + bool keep_shared = removeDetachedPart(disk, path, partition_directory, true); result.push_back(PartitionCommandResultInfo{ .partition_id = partition_id, @@ -5280,7 +5306,7 @@ PartitionCommandsResultInfo MergeTreeData::unfreezePartitionsByMatcher(MatcherFn .backup_name = backup_name, }); - LOG_DEBUG(log, "Unfreezed part by path {}", disk->getPath() + path); + LOG_DEBUG(log, "Unfreezed part by path {}, keep shared data: {}", disk->getPath() + path, keep_shared); } } @@ -5723,7 +5749,7 @@ ReservationPtr MergeTreeData::balancedReservation( for (const auto & part : covered_parts) submerging_big_parts_from_partition.insert(part->name); - for (const auto & part : getDataPartsStateRange(MergeTreeData::DataPartState::Committed)) + for (const auto & part : getDataPartsStateRange(MergeTreeData::DataPartState::Active)) { if (part->isStoredOnDisk() && part->getBytesOnDisk() >= min_bytes_to_rebalance_partition_over_jbod && part_info.partition_id == part->info.partition_id) diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 380c2f4f4c5..f1d0abffc7a 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -233,7 +233,7 @@ public: const VolumePtr & volume, const String & relative_path, const IMergeTreeDataPart * parent_part = nullptr) const; /// Auxiliary object to add a set of parts into the working set in two steps: - /// * First, as PreCommitted parts (the parts are ready, but not yet in the active set). + /// * First, as PreActive parts (the parts are ready, but not yet in the active set). /// * Next, if commit() is called, the parts are added to the active set and the parts that are /// covered by them are marked Outdated. 
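The rename from Committed/PreCommitted to Active/PreActive that runs through the hunks above follows the part life cycle described in the header comment: a new part is staged as PreActive, promoted to Active on commit, and the parts it covers are marked Outdated. The following is a minimal standalone sketch of that state machine, for illustration only; the names mirror the patch but none of this is the MergeTreeData implementation.

#include <iostream>
#include <string>
#include <vector>

enum class DataPartState { Temporary, PreActive, Active, Outdated, Deleting };

struct Part
{
    std::string name;
    DataPartState state = DataPartState::Temporary;
};

/// Commit one new part: promote it and outdate the parts it replaces.
void commitPart(Part & new_part, std::vector<Part *> & covered)
{
    new_part.state = DataPartState::PreActive;   /// staged, not yet visible
    /// ... rename on disk, register in the working set ...
    new_part.state = DataPartState::Active;      /// now visible to queries
    for (Part * old_part : covered)
        old_part->state = DataPartState::Outdated;
}

int main()
{
    Part merged{"all_1_2_1"};
    Part p1{"all_1_1_0", DataPartState::Active};
    Part p2{"all_2_2_0", DataPartState::Active};
    std::vector<Part *> covered{&p1, &p2};
    commitPart(merged, covered);
    std::cout << "merged part active: " << (merged.state == DataPartState::Active) << "\n";
}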
/// If neither commit() nor rollback() was called, the destructor rollbacks the operation. @@ -452,7 +452,7 @@ public: MutableDataPartsVector tryLoadPartsToAttach(const ASTPtr & partition, bool attach_part, ContextPtr context, PartsTemporaryRename & renamed_parts); - /// Returns Committed parts + /// Returns Active parts DataParts getDataParts() const; DataPartsVector getDataPartsVector() const; @@ -494,7 +494,7 @@ public: /// Renames temporary part to a permanent part and adds it to the parts set. /// It is assumed that the part does not intersect with existing parts. /// If increment != nullptr, part index is determining using increment. Otherwise part index remains unchanged. - /// If out_transaction != nullptr, adds the part in the PreCommitted state (the part will be added to the + /// If out_transaction != nullptr, adds the part in the PreActive state (the part will be added to the /// active set later with out_transaction->commit()). /// Else, commits the part immediately. /// Returns true if part was added. Returns false if part is covered by bigger part. @@ -518,7 +518,7 @@ public: void removePartsFromWorkingSetImmediatelyAndSetTemporaryState(const DataPartsVector & remove); /// Removes parts from the working set parts. - /// Parts in add must already be in data_parts with PreCommitted, Committed, or Outdated states. + /// Parts in add must already be in data_parts with PreActive, Active, or Outdated states. /// If clear_without_timeout is true, the parts will be deleted at once, or during the next call to /// clearOldParts (ignoring old_parts_lifetime). void removePartsFromWorkingSet(const DataPartsVector & remove, bool clear_without_timeout, DataPartsLock * acquired_lock = nullptr); @@ -873,10 +873,21 @@ public: /// Overridden in StorageReplicatedMergeTree virtual bool unlockSharedData(const IMergeTreeDataPart &) const { return true; } + /// Remove lock with old name for shared data part after rename + virtual bool unlockSharedData(const IMergeTreeDataPart &, const String &) const { return true; } + /// Fetch part only if some replica has it on shared storage like S3 /// Overridden in StorageReplicatedMergeTree virtual bool tryToFetchIfShared(const IMergeTreeDataPart &, const DiskPtr &, const String &) { return false; } + /// Check shared data usage on other replicas for detached/freezed part + /// Remove local files and remote files if needed + virtual bool removeDetachedPart(DiskPtr disk, const String & path, const String & part_name, bool is_freezed); + + /// Store metadata for replicated tables + /// Do nothing for non-replicated tables + virtual void createAndStoreFreezeMetadata(DiskPtr disk, DataPartPtr part, String backup_part_path) const; + /// Parts that currently submerging (merging to bigger parts) or emerging /// (to be appeared after merging finished). These two variables have to be used /// with `currently_submerging_emerging_mutex`. @@ -1049,7 +1060,7 @@ protected: /// If there is no part in the partition with ID `partition_id`, returns empty ptr. Should be called under the lock. DataPartPtr getAnyPartInPartition(const String & partition_id, DataPartsLock & data_parts_lock) const; - /// Return parts in the Committed set that are covered by the new_part_info or the part that covers it. + /// Return parts in the Active set that are covered by the new_part_info or the part that covers it. /// Will check that the new part doesn't already exist and that it doesn't intersect existing part. 
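The new removeDetachedPart() and createAndStoreFreezeMetadata() virtuals declared above follow the same pattern as unlockSharedData(): MergeTreeData provides a plain local default, and a replicated storage is expected to override it to consult zero-copy state before deleting anything shared. Below is a standalone sketch of that hook pattern under assumed class and member names (Disk, TableData, the placeholder other_replicas_use_it flag); it only illustrates the shape of the override, not the real ZooKeeper lock check.

#include <iostream>
#include <string>

struct Disk { void removeRecursive(const std::string & path) { std::cout << "rm -r " << path << "\n"; } };

class TableData
{
public:
    virtual ~TableData() = default;

    /// Default: purely local table, nothing is shared, remove and report keep_shared == false.
    virtual bool removeDetachedPart(Disk & disk, const std::string & path, const std::string & /*part_name*/, bool /*is_freezed*/)
    {
        disk.removeRecursive(path);
        return false;
    }
};

class ReplicatedTableData : public TableData
{
public:
    bool removeDetachedPart(Disk & disk, const std::string & path, const std::string & part_name, bool is_freezed) override
    {
        /// A real implementation would check zero-copy locks in ZooKeeper here.
        bool other_replicas_use_it = true;  /// placeholder value for the sketch
        if (other_replicas_use_it)
        {
            std::cout << "keeping shared data of " << part_name << (is_freezed ? " (freezed)" : "") << "\n";
            return true;  /// remote data stays, only local metadata would be removed
        }
        return TableData::removeDetachedPart(disk, path, part_name, is_freezed);
    }
};

int main()
{
    Disk disk;
    ReplicatedTableData table;
    bool keep_shared = table.removeDetachedPart(disk, "store/detached/all_1_1_0", "all_1_1_0", false);
    std::cout << "keep shared data: " << std::boolalpha << keep_shared << "\n";
}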
DataPartsVector getActivePartsToReplace( const MergeTreePartInfo & new_part_info, diff --git a/src/Storages/MergeTree/MergeTreePartsMover.cpp b/src/Storages/MergeTree/MergeTreePartsMover.cpp index 83ffcc41fd8..5a889ea5e8b 100644 --- a/src/Storages/MergeTree/MergeTreePartsMover.cpp +++ b/src/Storages/MergeTree/MergeTreePartsMover.cpp @@ -123,6 +123,9 @@ bool MergeTreePartsMover::selectPartsForMove( auto metadata_snapshot = data->getInMemoryMetadataPtr(); + if (need_to_move.empty() && !metadata_snapshot->hasAnyMoveTTL()) + return false; + for (const auto & part : data_parts) { String reason; diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index b991166b3b6..6861599a1ac 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -125,8 +125,10 @@ struct Settings; M(UInt64, concurrent_part_removal_threshold, 100, "Activate concurrent part removal (see 'max_part_removal_threads') only if the number of inactive data parts is at least this.", 0) \ M(String, storage_policy, "default", "Name of storage disk policy", 0) \ M(Bool, allow_nullable_key, false, "Allow Nullable types as primary keys.", 0) \ - M(Bool, allow_remote_fs_zero_copy_replication, true, "Allow Zero-copy replication over remote fs", 0) \ - M(Bool, remove_empty_parts, true, "Remove empty parts after they were pruned by TTL, mutation, or collapsing merge algorithm", 0) \ + M(Bool, allow_remote_fs_zero_copy_replication, true, "Allow Zero-copy replication over remote fs.", 0) \ + M(String, remote_fs_zero_copy_zookeeper_path, "/clickhouse/zero_copy", "ZooKeeper path for Zero-copy table-independet info.", 0) \ + M(Bool, remote_fs_zero_copy_path_compatible_mode, false, "Run zero-copy in compatible mode during conversion process.", 0) \ + M(Bool, remove_empty_parts, true, "Remove empty parts after they were pruned by TTL, mutation, or collapsing merge algorithm.", 0) \ M(Bool, assign_part_uuids, false, "Generate UUIDs for parts. Before enabling check that all replicas support new format.", 0) \ M(Int64, max_partitions_to_read, -1, "Limit the max number of partitions that can be accessed in one query. <= 0 means unlimited. This setting is the default that can be overridden by the query-level setting with the same name.", 0) \ M(UInt64, max_concurrent_queries, 0, "Max number of concurrently executed queries related to the MergeTree table (0 - disabled). Queries will still be limited by other max_concurrent_queries settings.", 0) \ diff --git a/src/Storages/MergeTree/ReplicatedMergeMutateTaskBase.cpp b/src/Storages/MergeTree/ReplicatedMergeMutateTaskBase.cpp index 5fe7de70a20..db5ca15ce8a 100644 --- a/src/Storages/MergeTree/ReplicatedMergeMutateTaskBase.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeMutateTaskBase.cpp @@ -217,9 +217,9 @@ bool ReplicatedMergeMutateTaskBase::executeImpl() ReplicatedMergeMutateTaskBase::CheckExistingPartResult ReplicatedMergeMutateTaskBase::checkExistingPart() { /// If we already have this part or a part covering it, we do not need to do anything. - /// The part may be still in the PreCommitted -> Committed transition so we first search - /// among PreCommitted parts to definitely find the desired part if it exists. - MergeTreeData::DataPartPtr existing_part = storage.getPartIfExists(entry.new_part_name, {MergeTreeDataPartState::PreCommitted}); + /// The part may be still in the PreActive -> Active transition so we first search + /// among PreActive parts to definitely find the desired part if it exists. 
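The two new MergeTreeSettings rows above (remote_fs_zero_copy_zookeeper_path, remote_fs_zero_copy_path_compatible_mode) are added through the M(type, NAME, default, description, flags) list. As a side note, that list is an X-macro: each row is expanded several times to generate both the struct members and the metadata. The sketch below shows the general technique with a trimmed four-argument macro and invented setting names; it is not the real MergeTreeSettings machinery, which also carries flags and change tracking.

#include <iostream>
#include <map>
#include <string>

#define APPLY_FOR_DEMO_SETTINGS(M) \
    M(bool,        allow_remote_fs_zero_copy_replication, true,                    "Allow Zero-copy replication over remote fs.") \
    M(std::string, remote_fs_zero_copy_zookeeper_path,    "/clickhouse/zero_copy", "ZooKeeper path for Zero-copy table-independent info.") \
    M(bool,        remove_empty_parts,                    true,                    "Remove empty parts after they were pruned by TTL, mutation, or collapsing merge algorithm.")

struct DemoSettings
{
#define DECLARE(TYPE, NAME, DEFAULT, DESCRIPTION) TYPE NAME = DEFAULT;
    APPLY_FOR_DEMO_SETTINGS(DECLARE)
#undef DECLARE
};

std::map<std::string, std::string> settingsDocumentation()
{
    std::map<std::string, std::string> docs;
#define DOCUMENT(TYPE, NAME, DEFAULT, DESCRIPTION) docs[#NAME] = DESCRIPTION;
    APPLY_FOR_DEMO_SETTINGS(DOCUMENT)
#undef DOCUMENT
    return docs;
}

int main()
{
    DemoSettings settings;
    std::cout << settings.remote_fs_zero_copy_zookeeper_path << "\n";
    for (const auto & [name, description] : settingsDocumentation())
        std::cout << name << ": " << description << "\n";
}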
+ MergeTreeData::DataPartPtr existing_part = storage.getPartIfExists(entry.new_part_name, {MergeTreeDataPartState::PreActive}); if (!existing_part) existing_part = storage.getActiveContainingPart(entry.new_part_name); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp index 8d3cb146990..8fcaee66007 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp @@ -272,10 +272,10 @@ std::pair ReplicatedMergeTreePartCheckThread::findLo /// but checker thread will remove part from zookeeper and queue fetch. bool exists_in_zookeeper = zookeeper->exists(part_path); - /// If the part is still in the PreCommitted -> Committed transition, it is not lost + /// If the part is still in the PreActive -> Active transition, it is not lost /// and there is no need to go searching for it on other replicas. To definitely find the needed part - /// if it exists (or a part containing it) we first search among the PreCommitted parts. - auto part = storage.getPartIfExists(part_name, {MergeTreeDataPartState::PreCommitted}); + /// if it exists (or a part containing it) we first search among the PreActive parts. + auto part = storage.getPartIfExists(part_name, {MergeTreeDataPartState::PreActive}); if (!part) part = storage.getActiveContainingPart(part_name); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index b3da3d47684..1432728d00a 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -1177,7 +1177,7 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry( return false; } - auto part = data.getPartIfExists(name, {MergeTreeDataPartState::PreCommitted, MergeTreeDataPartState::Committed, MergeTreeDataPartState::Outdated}); + auto part = data.getPartIfExists(name, {MergeTreeDataPartState::PreActive, MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}); if (part) { if (auto part_in_memory = asInMemoryPart(part)) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp index 0cc6955ff72..7a5b82979bd 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp @@ -253,7 +253,7 @@ void ReplicatedMergeTreeRestartingThread::removeFailedQuorumParts() for (const auto & part_name : failed_parts) { auto part = storage.getPartIfExists( - part_name, {MergeTreeDataPartState::PreCommitted, MergeTreeDataPartState::Committed, MergeTreeDataPartState::Outdated}); + part_name, {MergeTreeDataPartState::PreActive, MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}); if (part) { diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 1ce748640dc..d2bf6ba308b 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -228,6 +228,8 @@ void ReplicatedMergeTreeSink::commitPart( bool is_already_existing_part = false; + String old_part_name = part->name; + while (true) { /// Obtain incremental block number and lock it. The lock holds our intention to add the block to the filesystem. 
@@ -370,7 +372,7 @@ void ReplicatedMergeTreeSink::commitPart( block_id, existing_part_name); /// If it does not exist, we will write a new part with existing name. - /// Note that it may also appear on filesystem right now in PreCommitted state due to concurrent inserts of the same data. + /// Note that it may also appear on filesystem right now in PreActive state due to concurrent inserts of the same data. /// It will be checked when we will try to rename directory. part->name = existing_part_name; @@ -508,6 +510,9 @@ void ReplicatedMergeTreeSink::commitPart( waitForQuorum(zookeeper, part->name, quorum_info.status_path, quorum_info.is_active_node_value); } + + /// Cleanup shared locks made with old name + part->cleanupOldName(old_part_name); } void ReplicatedMergeTreeSink::onStart() diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index cb52c8b86c0..fc3eff7459b 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -35,6 +35,7 @@ namespace ErrorCodes extern const int NO_ELEMENTS_IN_CONFIG; extern const int UNKNOWN_STORAGE; extern const int NO_REPLICA_NAME_GIVEN; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } @@ -258,6 +259,34 @@ If you use the Replicated version of engines, see https://clickhouse.com/docs/en return help; } +static ColumnsDescription getColumnsDescriptionFromZookeeper(const String & raw_zookeeper_path, ContextMutablePtr context) +{ + String zookeeper_name = zkutil::extractZooKeeperName(raw_zookeeper_path); + String zookeeper_path = zkutil::extractZooKeeperPath(raw_zookeeper_path, true); + + if (!context->hasZooKeeper() && !context->hasAuxiliaryZooKeeper(zookeeper_name)) + throw Exception{ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot get replica structure without zookeeper, you must specify the structure manually"}; + + zkutil::ZooKeeperPtr zookeeper; + try + { + if (zookeeper_name == StorageReplicatedMergeTree::getDefaultZooKeeperName()) + zookeeper = context->getZooKeeper(); + else + zookeeper = context->getAuxiliaryZooKeeper(zookeeper_name); + } + catch (...) + { + throw Exception{ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot get replica structure from zookeeper, because cannot get zookeeper: {}. You must specify structure manually", getCurrentExceptionMessage(false)}; + } + + if (!zookeeper->exists(zookeeper_path + "/replicas")) + throw Exception{ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot get replica structure, because there no other replicas in zookeeper. 
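In the ReplicatedMergeTreeSink::commitPart() hunk above, the part's original name is captured before the retry loop and passed to cleanupOldName() afterwards, because deduplication inside the loop may rename the part to an existing part name, leaving a shared-storage lock registered under the stale name (compare the new unlockSharedData(const IMergeTreeDataPart &, const String &) overload in MergeTreeData.h). A minimal standalone sketch of that bookkeeping, with an assumed std::set standing in for the zero-copy locks kept in ZooKeeper:

#include <iostream>
#include <set>
#include <string>

struct Part
{
    std::string name;

    void cleanupOldName(std::set<std::string> & lock_registry, const std::string & old_name)
    {
        if (old_name == name)
            return;                      /// nothing was renamed, nothing to clean
        lock_registry.erase(old_name);   /// drop the lock made under the original name
    }
};

int main()
{
    std::set<std::string> lock_registry{"all_5_5_0"};
    Part part{"all_5_5_0"};
    std::string old_part_name = part.name;     /// saved before the commit loop

    part.name = "all_3_3_0";                   /// deduplication reused an existing part name
    lock_registry.insert(part.name);

    part.cleanupOldName(lock_registry, old_part_name);
    std::cout << "locks left: " << lock_registry.size() << "\n";  /// 1
}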
You must specify the structure manually"}; + + Coordination::Stat columns_stat; + return ColumnsDescription::parse(zookeeper->get(fs::path(zookeeper_path) / "columns", &columns_stat)); +} + static StoragePtr create(const StorageFactory::Arguments & args) { @@ -638,7 +667,14 @@ static StoragePtr create(const StorageFactory::Arguments & args) String date_column_name; StorageInMemoryMetadata metadata; - metadata.setColumns(args.columns); + + ColumnsDescription columns; + if (args.columns.empty() && replicated) + columns = getColumnsDescriptionFromZookeeper(zookeeper_path, args.getContext()); + else + columns = args.columns; + + metadata.setColumns(columns); metadata.setComment(args.comment); std::unique_ptr storage_settings; @@ -705,12 +741,12 @@ static StoragePtr create(const StorageFactory::Arguments & args) if (args.query.columns_list && args.query.columns_list->indices) for (auto & index : args.query.columns_list->indices->children) - metadata.secondary_indices.push_back(IndexDescription::getIndexFromAST(index, args.columns, args.getContext())); + metadata.secondary_indices.push_back(IndexDescription::getIndexFromAST(index, columns, args.getContext())); if (args.query.columns_list && args.query.columns_list->projections) for (auto & projection_ast : args.query.columns_list->projections->children) { - auto projection = ProjectionDescription::getProjectionFromAST(projection_ast, args.columns, args.getContext()); + auto projection = ProjectionDescription::getProjectionFromAST(projection_ast, columns, args.getContext()); metadata.projections.add(std::move(projection)); } @@ -720,10 +756,10 @@ static StoragePtr create(const StorageFactory::Arguments & args) constraints.push_back(constraint); metadata.constraints = ConstraintsDescription(constraints); - auto column_ttl_asts = args.columns.getColumnTTLs(); + auto column_ttl_asts = columns.getColumnTTLs(); for (const auto & [name, ast] : column_ttl_asts) { - auto new_ttl_entry = TTLDescription::getTTLFromAST(ast, args.columns, args.getContext(), metadata.primary_key); + auto new_ttl_entry = TTLDescription::getTTLFromAST(ast, columns, args.getContext(), metadata.primary_key); metadata.column_ttls_by_name[name] = new_ttl_entry; } @@ -850,6 +886,7 @@ void registerStorageMergeTree(StorageFactory & factory) features.supports_replication = true; features.supports_deduplication = true; + features.supports_schema_inference = true; factory.registerStorage("ReplicatedMergeTree", create, features); factory.registerStorage("ReplicatedCollapsingMergeTree", create, features); diff --git a/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp b/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp index 4c66eda2fed..f02653d9167 100644 --- a/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp +++ b/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp @@ -18,6 +18,7 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int POSTGRESQL_REPLICATION_INTERNAL_ERROR; } MaterializedPostgreSQLConsumer::MaterializedPostgreSQLConsumer( @@ -29,7 +30,7 @@ MaterializedPostgreSQLConsumer::MaterializedPostgreSQLConsumer( const size_t max_block_size_, bool schema_as_a_part_of_table_name_, bool allow_automatic_update_, - Storages storages_, + StorageInfos storages_info_, const String & name_for_logger) : log(&Poco::Logger::get("PostgreSQLReplicaConsumer(" + name_for_logger + ")")) , context(context_) @@ -41,7 +42,6 @@ MaterializedPostgreSQLConsumer::MaterializedPostgreSQLConsumer( , max_block_size(max_block_size_) , 
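The getColumnsDescriptionFromZookeeper() helper and the supports_schema_inference flag above let a Replicated*MergeTree table be created without a column list: when the user omits the columns and the engine is replicated, the structure is read from the "columns" znode of an existing replica, otherwise the user-supplied columns are used. The sketch below only captures that decision logic; zookeeperGet() is a stand-in for a real ZooKeeper read and the returned string merely stands in for the serialized ColumnsDescription.

#include <iostream>
#include <optional>
#include <stdexcept>
#include <string>

std::optional<std::string> zookeeperGet(const std::string & path)
{
    /// Placeholder for zookeeper->get(path): pretend one replica already published its structure.
    if (path == "/clickhouse/tables/0/demo/columns")
        return "`key` UInt64, `value` String";
    return std::nullopt;
}

std::string resolveTableStructure(const std::string & user_columns, const std::string & zookeeper_path, bool replicated)
{
    if (!user_columns.empty() || !replicated)
        return user_columns;

    auto stored = zookeeperGet(zookeeper_path + "/columns");
    if (!stored)
        throw std::runtime_error("Cannot get replica structure, you must specify the structure manually");
    return *stored;
}

int main()
{
    std::cout << resolveTableStructure("", "/clickhouse/tables/0/demo", /*replicated=*/ true) << "\n";
}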
schema_as_a_part_of_table_name(schema_as_a_part_of_table_name_) , allow_automatic_update(allow_automatic_update_) - , storages(storages_) { final_lsn = start_lsn; auto tx = std::make_shared(connection->getRef()); @@ -49,19 +49,28 @@ MaterializedPostgreSQLConsumer::MaterializedPostgreSQLConsumer( LOG_TRACE(log, "Starting replication. LSN: {} (last: {})", getLSNValue(current_lsn), getLSNValue(final_lsn)); tx->commit(); - for (const auto & [table_name, storage] : storages) - { - buffers.emplace(table_name, Buffer(storage)); - } + for (const auto & [table_name, storage_info] : storages_info_) + storages.emplace(table_name, storage_info); } -void MaterializedPostgreSQLConsumer::Buffer::createEmptyBuffer(StoragePtr storage) +MaterializedPostgreSQLConsumer::StorageData::StorageData(const StorageInfo & storage_info) + : storage(storage_info.storage), buffer(storage_info.storage->getInMemoryMetadataPtr(), storage_info.attributes) +{ + auto table_id = storage_info.storage->getStorageID(); + LOG_TRACE(&Poco::Logger::get("StorageMaterializedPostgreSQL"), + "New buffer for table {}, number of attributes: {}, number if columns: {}, structure: {}", + table_id.getNameForLogs(), buffer.attributes.size(), buffer.getColumnsNum(), buffer.description.sample_block.dumpStructure()); +} + + +MaterializedPostgreSQLConsumer::StorageData::Buffer::Buffer( + StorageMetadataPtr storage_metadata, const PostgreSQLTableStructure::Attributes & attributes_) + : attributes(attributes_) { - const auto storage_metadata = storage->getInMemoryMetadataPtr(); const Block sample_block = storage_metadata->getSampleBlock(); - /// Need to clear type, because in description.init() the types are appended (emplace_back) + /// Need to clear type, because in description.init() the types are appended description.types.clear(); description.init(sample_block); @@ -69,13 +78,13 @@ void MaterializedPostgreSQLConsumer::Buffer::createEmptyBuffer(StoragePtr storag const auto & storage_columns = storage_metadata->getColumns().getAllPhysical(); auto insert_columns = std::make_shared(); - auto table_id = storage->getStorageID(); - LOG_TRACE(&Poco::Logger::get("MaterializedPostgreSQLBuffer"), "New buffer for table {}.{} ({}), structure: {}", - table_id.database_name, table_id.table_name, toString(table_id.uuid), sample_block.dumpStructure()); + auto columns_num = description.sample_block.columns(); + assert(columns_num == storage_columns.size()); + if (attributes.size() + 2 != columns_num) /// +2 because sign and version columns + throw Exception(ErrorCodes::LOGICAL_ERROR, "Columns number mismatch. 
Attributes: {}, buffer: {}", + attributes.size(), columns_num); - assert(description.sample_block.columns() == storage_columns.size()); size_t idx = 0; - for (const auto & column : storage_columns) { if (description.types[idx].first == ExternalResultDescription::ValueType::vtArray) @@ -85,37 +94,45 @@ void MaterializedPostgreSQLConsumer::Buffer::createEmptyBuffer(StoragePtr storag insert_columns->children.emplace_back(std::make_shared(column.name)); } - columnsAST = std::move(insert_columns); + columns_ast = std::move(insert_columns); } -void MaterializedPostgreSQLConsumer::insertValue(Buffer & buffer, const std::string & value, size_t column_idx) +void MaterializedPostgreSQLConsumer::insertValue(StorageData::Buffer & buffer, const std::string & value, size_t column_idx) { const auto & sample = buffer.description.sample_block.getByPosition(column_idx); bool is_nullable = buffer.description.types[column_idx].second; - if (is_nullable) + try { - ColumnNullable & column_nullable = assert_cast(*buffer.columns[column_idx]); - const auto & data_type = assert_cast(*sample.type); + if (is_nullable) + { + ColumnNullable & column_nullable = assert_cast(*buffer.columns[column_idx]); + const auto & data_type = assert_cast(*sample.type); - insertPostgreSQLValue( - column_nullable.getNestedColumn(), value, - buffer.description.types[column_idx].first, data_type.getNestedType(), buffer.array_info, column_idx); + insertPostgreSQLValue( + column_nullable.getNestedColumn(), value, + buffer.description.types[column_idx].first, data_type.getNestedType(), buffer.array_info, column_idx); - column_nullable.getNullMapData().emplace_back(0); + column_nullable.getNullMapData().emplace_back(0); + } + else + { + insertPostgreSQLValue( + *buffer.columns[column_idx], value, + buffer.description.types[column_idx].first, sample.type, + buffer.array_info, column_idx); + } } - else + catch (const pqxx::conversion_error & e) { - insertPostgreSQLValue( - *buffer.columns[column_idx], value, - buffer.description.types[column_idx].first, sample.type, - buffer.array_info, column_idx); + LOG_ERROR(log, "Conversion failed while inserting PostgreSQL value {}, will insert default value. Error: {}", value, e.what()); + insertDefaultValue(buffer, column_idx); } } -void MaterializedPostgreSQLConsumer::insertDefaultValue(Buffer & buffer, size_t column_idx) +void MaterializedPostgreSQLConsumer::insertDefaultValue(StorageData::Buffer & buffer, size_t column_idx) { const auto & sample = buffer.description.sample_block.getByPosition(column_idx); insertDefaultPostgreSQLValue(*buffer.columns[column_idx], *sample.column); @@ -186,10 +203,16 @@ Int8 MaterializedPostgreSQLConsumer::readInt8(const char * message, size_t & pos void MaterializedPostgreSQLConsumer::readTupleData( - Buffer & buffer, const char * message, size_t & pos, [[maybe_unused]] size_t size, PostgreSQLQuery type, bool old_value) + StorageData::Buffer & buffer, const char * message, size_t & pos, [[maybe_unused]] size_t size, PostgreSQLQuery type, bool old_value) { Int16 num_columns = readInt16(message, pos, size); + /// Sanity check. In fact, it was already checked. + if (static_cast(num_columns) + 2 != buffer.getColumnsNum()) /// +2 -- sign and version columns + throw Exception(ErrorCodes::POSTGRESQL_REPLICATION_INTERNAL_ERROR, + "Number of columns does not match. 
Got: {}, expected {}, current buffer structure: {}", + num_columns, buffer.getColumnsNum(), buffer.description.sample_block.dumpStructure()); + auto proccess_column_value = [&](Int8 identifier, Int16 column_idx) { switch (identifier) @@ -202,8 +225,15 @@ void MaterializedPostgreSQLConsumer::readTupleData( case 't': /// Text formatted value { Int32 col_len = readInt32(message, pos, size); - String value; + /// Sanity check for protocol misuse. + /// PostgreSQL uses a fixed page size (commonly 8 kB), and does not allow tuples to span multiple pages. + static constexpr Int32 sanity_check_max_col_len = 1024 * 8 * 2; /// *2 -- just in case. + if (unlikely(col_len > sanity_check_max_col_len)) + throw Exception(ErrorCodes::POSTGRESQL_REPLICATION_INTERNAL_ERROR, + "Column length is suspiciously long: {}", col_len); + + String value; for (Int32 i = 0; i < col_len; ++i) value += readInt8(message, pos, size); @@ -276,19 +306,20 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl { Int32 relation_id = readInt32(replication_message, pos, size); const auto & table_name = relation_id_to_name[relation_id]; - /// FIXME:If table name is empty here, it means we failed to load it, but it was included in publication. Need to remove? if (table_name.empty()) - LOG_WARNING(log, "No table mapping for relation id: {}. Probably table failed to be loaded", relation_id); + { + LOG_ERROR(log, "No table mapping for relation id: {}. It's a bug", relation_id); + return; + } if (!isSyncAllowed(relation_id, table_name)) return; Int8 new_tuple = readInt8(replication_message, pos, size); - auto buffer = buffers.find(table_name); - assert(buffer != buffers.end()); + auto & buffer = storages.find(table_name)->second.buffer; if (new_tuple) - readTupleData(buffer->second, replication_message, pos, size, PostgreSQLQuery::INSERT); + readTupleData(buffer, replication_message, pos, size, PostgreSQLQuery::INSERT); break; } @@ -296,15 +327,16 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl { Int32 relation_id = readInt32(replication_message, pos, size); const auto & table_name = relation_id_to_name[relation_id]; - /// FIXME:If table name is empty here, it means we failed to load it, but it was included in publication. Need to remove? if (table_name.empty()) - LOG_WARNING(log, "No table mapping for relation id: {}. Probably table failed to be loaded", relation_id); + { + LOG_ERROR(log, "No table mapping for relation id: {}. It's a bug", relation_id); + return; + } if (!isSyncAllowed(relation_id, table_name)) return; - auto buffer = buffers.find(table_name); - assert(buffer != buffers.end()); + auto & buffer = storages.find(table_name)->second.buffer; auto proccess_identifier = [&](Int8 identifier) -> bool { @@ -319,13 +351,13 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl /// it is much more efficient to use replica identity index, but support all possible cases. case 'O': { - readTupleData(buffer->second, replication_message, pos, size, PostgreSQLQuery::UPDATE, true); + readTupleData(buffer, replication_message, pos, size, PostgreSQLQuery::UPDATE, true); break; } case 'N': { /// New row. 
- readTupleData(buffer->second, replication_message, pos, size, PostgreSQLQuery::UPDATE); + readTupleData(buffer, replication_message, pos, size, PostgreSQLQuery::UPDATE); read_next = false; break; } @@ -347,9 +379,11 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl { Int32 relation_id = readInt32(replication_message, pos, size); const auto & table_name = relation_id_to_name[relation_id]; - /// FIXME:If table name is empty here, it means we failed to load it, but it was included in publication. Need to remove? if (table_name.empty()) - LOG_WARNING(log, "No table mapping for relation id: {}. Probably table failed to be loaded", relation_id); + { + LOG_ERROR(log, "No table mapping for relation id: {}. It's a bug", relation_id); + return; + } if (!isSyncAllowed(relation_id, table_name)) return; @@ -357,10 +391,8 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl /// 0 or 1 if replica identity is set to full. For now only default replica identity is supported (with primary keys). readInt8(replication_message, pos, size); - auto buffer = buffers.find(table_name); - assert(buffer != buffers.end()); - readTupleData(buffer->second, replication_message, pos, size, PostgreSQLQuery::DELETE); - + auto & buffer = storages.find(table_name)->second.buffer; + readTupleData(buffer, replication_message, pos, size, PostgreSQLQuery::DELETE); break; } case 'C': // Commit @@ -379,7 +411,6 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl Int32 relation_id = readInt32(replication_message, pos, size); String relation_namespace, relation_name; - readString(replication_message, pos, size, relation_namespace); readString(replication_message, pos, size, relation_name); @@ -389,22 +420,26 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl else table_name = relation_name; + if (!relation_id_to_name.contains(relation_id)) + relation_id_to_name[relation_id] = table_name; + if (!isSyncAllowed(relation_id, relation_name)) return; - if (storages.find(table_name) == storages.end()) + auto storage_iter = storages.find(table_name); + if (storage_iter == storages.end()) { - markTableAsSkipped(relation_id, table_name); - /// TODO: This can happen if we created a publication with this table but then got an exception that this + /// FIXME: This can happen if we created a publication with this table but then got an exception that this /// table has primary key or something else. LOG_ERROR(log, - "Storage for table {} does not exist, but is included in replication stream. (Storages number: {})", + "Storage for table {} does not exist, but is included in replication stream. (Storages number: {})" + "Please manually remove this table from replication (DETACH TABLE query) to avoid redundant replication", table_name, storages.size()); + markTableAsSkipped(relation_id, table_name); return; } - assert(buffers.contains(table_name)); - + auto & buffer = storage_iter->second.buffer; /// 'd' - default (primary key if any) /// 'n' - nothing @@ -412,7 +447,6 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl /// 'i' - user defined index with indisreplident set /// Only 'd' and 'i' - are supported. 
char replica_identity = readInt8(replication_message, pos, size); - if (replica_identity != 'd' && replica_identity != 'i') { LOG_WARNING(log, @@ -423,25 +457,29 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl Int16 num_columns = readInt16(replication_message, pos, size); - Int32 data_type_id; - Int32 type_modifier; /// For example, n in varchar(n) - - bool new_relation_definition = false; - if (schema_data.find(relation_id) == schema_data.end()) - { - relation_id_to_name[relation_id] = table_name; - schema_data.emplace(relation_id, SchemaData(num_columns)); - new_relation_definition = true; - } - - auto & current_schema_data = schema_data.find(relation_id)->second; - - if (current_schema_data.number_of_columns != num_columns) + if (static_cast(num_columns) + 2 != buffer.getColumnsNum()) /// +2 -- sign and version columns { markTableAsSkipped(relation_id, table_name); return; } + if (static_cast(num_columns) != buffer.attributes.size()) + { +#ifndef NDEBUG + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Mismatch in attributes size. Got {}, expected {}. It's a bug. Current buffer structure: {}", + num_columns, buffer.attributes.size(), buffer.description.sample_block.dumpStructure()); +#else + LOG_ERROR(log, "Mismatch in attributes size. Got {}, expected {}. It's a bug. Current buffer structure: {}", + num_columns, buffer.attributes.size(), buffer.description.sample_block.dumpStructure()); + markTableAsSkipped(relation_id, table_name); + return; +#endif + } + + Int32 data_type_id; + Int32 type_modifier; /// For example, n in varchar(n) + for (uint16_t i = 0; i < num_columns; ++i) { String column_name; @@ -451,23 +489,14 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl data_type_id = readInt32(replication_message, pos, size); type_modifier = readInt32(replication_message, pos, size); - if (new_relation_definition) + if (buffer.attributes[i].atttypid != data_type_id || buffer.attributes[i].atttypmod != type_modifier) { - current_schema_data.column_identifiers.emplace_back(std::make_pair(data_type_id, type_modifier)); - } - else - { - if (current_schema_data.column_identifiers[i].first != data_type_id - || current_schema_data.column_identifiers[i].second != type_modifier) - { - markTableAsSkipped(relation_id, table_name); - return; - } + markTableAsSkipped(relation_id, table_name); + return; } } tables_to_sync.insert(table_name); - break; } case 'O': // Origin @@ -489,19 +518,19 @@ void MaterializedPostgreSQLConsumer::syncTables() { for (const auto & table_name : tables_to_sync) { - auto & buffer = buffers.find(table_name)->second; - Block result_rows = buffer.description.sample_block.cloneWithColumns(std::move(buffer.columns)); + auto & storage_data = storages.find(table_name)->second; + Block result_rows = storage_data.buffer.description.sample_block.cloneWithColumns(std::move(storage_data.buffer.columns)); if (result_rows.rows()) { - auto storage = storages[table_name]; + auto storage = storage_data.storage; auto insert_context = Context::createCopy(context); insert_context->setInternalQuery(true); auto insert = std::make_shared(); insert->table_id = storage->getStorageID(); - insert->columns = buffer.columnsAST; + insert->columns = storage_data.buffer.columns_ast; InterpreterInsertQuery interpreter(insert, insert_context, true); auto io = interpreter.execute(); @@ -514,7 +543,7 @@ void MaterializedPostgreSQLConsumer::syncTables() CompletedPipelineExecutor executor(io.pipeline); executor.execute(); - buffer.columns = 
buffer.description.sample_block.cloneEmptyColumns(); + storage_data.buffer.columns = storage_data.buffer.description.sample_block.cloneEmptyColumns(); } } @@ -599,34 +628,21 @@ bool MaterializedPostgreSQLConsumer::isSyncAllowed(Int32 relation_id, const Stri void MaterializedPostgreSQLConsumer::markTableAsSkipped(Int32 relation_id, const String & relation_name) { - /// Empty lsn string means - continue waiting for valid lsn. - skip_list.insert({relation_id, ""}); + skip_list.insert({relation_id, ""}); /// Empty lsn string means - continue waiting for valid lsn. + storages.erase(relation_name); - if (storages.count(relation_name)) - { - /// Erase cached schema identifiers. It will be updated again once table is allowed back into replication stream - /// and it receives first data after update. - schema_data.erase(relation_id); - - /// Clear table buffer. - auto & buffer = buffers.find(relation_name)->second; - buffer.columns = buffer.description.sample_block.cloneEmptyColumns(); - - if (allow_automatic_update) - LOG_TRACE(log, "Table {} (relation_id: {}) is skipped temporarily. It will be reloaded in the background", relation_name, relation_id); - else - LOG_WARNING(log, "Table {} (relation_id: {}) is skipped, because table schema has changed", relation_name, relation_id); - } + if (allow_automatic_update) + LOG_TRACE(log, "Table {} (relation_id: {}) is skipped temporarily. It will be reloaded in the background", relation_name, relation_id); + else + LOG_WARNING(log, "Table {} (relation_id: {}) is skipped, because table schema has changed", relation_name, relation_id); } -void MaterializedPostgreSQLConsumer::addNested(const String & postgres_table_name, StoragePtr nested_storage, const String & table_start_lsn) +void MaterializedPostgreSQLConsumer::addNested( + const String & postgres_table_name, StorageInfo nested_storage_info, const String & table_start_lsn) { - /// Cache new pointer to replacingMergeTree table. - storages.emplace(postgres_table_name, nested_storage); - - /// Add new in-memory buffer. - buffers.emplace(postgres_table_name, Buffer(nested_storage)); + assert(!storages.contains(postgres_table_name)); + storages.emplace(postgres_table_name, nested_storage_info); /// Replication consumer will read wall and check for currently processed table whether it is allowed to start applying /// changes to this table. @@ -634,14 +650,10 @@ void MaterializedPostgreSQLConsumer::addNested(const String & postgres_table_nam } -void MaterializedPostgreSQLConsumer::updateNested(const String & table_name, StoragePtr nested_storage, Int32 table_id, const String & table_start_lsn) +void MaterializedPostgreSQLConsumer::updateNested(const String & table_name, StorageInfo nested_storage_info, Int32 table_id, const String & table_start_lsn) { - /// Cache new pointer to replacingMergeTree table. - storages[table_name] = nested_storage; - - /// Create a new empty buffer (with updated metadata), where data is first loaded before syncing into actual table. - auto & buffer = buffers.find(table_name)->second; - buffer.createEmptyBuffer(nested_storage); + assert(!storages.contains(table_name)); + storages.emplace(table_name, nested_storage_info); /// Set start position to valid lsn. Before it was an empty string. Further read for table allowed, if it has a valid lsn. 
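The markTableAsSkipped() / addNested() / updateNested() changes above all revolve around the skip list: a relation whose schema changed is parked with an empty LSN, and replication for it resumes only after the reloaded table is re-registered with a concrete start LSN that the stream has reached. A minimal standalone sketch of that handshake, assuming a simplified SkipList type; real LSNs are compared numerically via getLSNValue(), not as strings, and the real consumer also erases the entry once the table is caught up.

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>

struct SkipList
{
    /// relation_id -> start LSN ("" means: still waiting for the table to be reloaded)
    std::unordered_map<int32_t, std::string> skipped;

    void markSkipped(int32_t relation_id) { skipped[relation_id] = ""; }

    void resume(int32_t relation_id, const std::string & table_start_lsn) { skipped[relation_id] = table_start_lsn; }

    bool isSyncAllowed(int32_t relation_id, const std::string & current_lsn) const
    {
        auto it = skipped.find(relation_id);
        if (it == skipped.end())
            return true;                   /// never skipped
        if (it->second.empty())
            return false;                  /// waiting for reload
        return current_lsn >= it->second;  /// resume once the stream catches up
    }
};

int main()
{
    SkipList skip_list;
    skip_list.markSkipped(42);
    std::cout << skip_list.isSyncAllowed(42, "0/16B6C50") << "\n";  /// 0: parked
    skip_list.resume(42, "0/16B6C50");
    std::cout << skip_list.isSyncAllowed(42, "0/16B6D00") << "\n";  /// 1: resumed
}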
skip_list[table_id] = table_start_lsn; @@ -651,7 +663,6 @@ void MaterializedPostgreSQLConsumer::updateNested(const String & table_name, Sto void MaterializedPostgreSQLConsumer::removeNested(const String & postgres_table_name) { storages.erase(postgres_table_name); - buffers.erase(postgres_table_name); deleted_tables.insert(postgres_table_name); } @@ -706,7 +717,17 @@ bool MaterializedPostgreSQLConsumer::readFromReplicationSlot() current_lsn = (*row)[0]; lsn_value = getLSNValue(current_lsn); - processReplicationMessage((*row)[1].c_str(), (*row)[1].size()); + try + { + // LOG_DEBUG(log, "Current message: {}", (*row)[1]); + processReplicationMessage((*row)[1].c_str(), (*row)[1].size()); + } + catch (const Exception & e) + { + if (e.code() == ErrorCodes::POSTGRESQL_REPLICATION_INTERNAL_ERROR) + continue; + throw; + } } } catch (const Exception &) @@ -737,11 +758,6 @@ bool MaterializedPostgreSQLConsumer::readFromReplicationSlot() LOG_ERROR(log, "Conversion error: {}", e.what()); return false; } - catch (const pqxx::in_doubt_error & e) - { - LOG_ERROR(log, "PostgreSQL library has some doubts: {}", e.what()); - return false; - } catch (const pqxx::internal_error & e) { LOG_ERROR(log, "PostgreSQL library internal error: {}", e.what()); @@ -749,16 +765,8 @@ bool MaterializedPostgreSQLConsumer::readFromReplicationSlot() } catch (...) { - /// Since reading is done from a background task, it is important to catch any possible error - /// in order to understand why something does not work. - try - { - std::rethrow_exception(std::current_exception()); - } - catch (const std::exception& e) - { - LOG_ERROR(log, "Unexpected error: {}", e.what()); - } + tryLogCurrentException(__PRETTY_FUNCTION__); + return false; } if (!tables_to_sync.empty()) @@ -770,6 +778,11 @@ bool MaterializedPostgreSQLConsumer::readFromReplicationSlot() bool MaterializedPostgreSQLConsumer::consume(std::vector> & skipped_tables) { + /// Read up to max_block_size changed (approximately - in same cases might be more). + /// false: no data was read, reschedule. + /// true: some data was read, schedule as soon as possible. + auto read_next = readFromReplicationSlot(); + /// Check if there are tables, which are skipped from being updated by changes from replication stream, /// because schema changes were detected. Update them, if it is allowed. if (allow_automatic_update && !skip_list.empty()) @@ -786,10 +799,6 @@ bool MaterializedPostgreSQLConsumer::consume(std::vector #include #include +#include namespace DB { struct SettingChange; +struct StorageInfo +{ + StoragePtr storage; + PostgreSQLTableStructure::Attributes attributes; + + StorageInfo(StoragePtr storage_, const PostgreSQLTableStructure::Attributes & attributes_) + : storage(storage_), attributes(attributes_) {} +}; +using StorageInfos = std::unordered_map; + class MaterializedPostgreSQLConsumer { -public: - using Storages = std::unordered_map; +private: + struct StorageData + { + struct Buffer + { + ExternalResultDescription description; + MutableColumns columns; + /// Needed to pass to insert query columns list in syncTables(). + std::shared_ptr columns_ast; + /// Needed for insertPostgreSQLValue() method to parse array + std::unordered_map array_info; + /// To validate ddl. 
+ PostgreSQLTableStructure::Attributes attributes; + + Buffer(StorageMetadataPtr storage_metadata, const PostgreSQLTableStructure::Attributes & attributes_); + + size_t getColumnsNum() const + { + const auto & sample_block = description.sample_block; + return sample_block.columns(); + } + }; + + StoragePtr storage; + Buffer buffer; + + explicit StorageData(const StorageInfo & storage_info); + StorageData(const StorageData & other) = delete; + }; + + using Storages = std::unordered_map; + +public: MaterializedPostgreSQLConsumer( ContextPtr context_, std::shared_ptr connection_, const String & replication_slot_name_, const String & publication_name_, const String & start_lsn, - const size_t max_block_size_, + size_t max_block_size_, bool schema_as_a_part_of_table_name_, bool allow_automatic_update_, - Storages storages_, + StorageInfos storages_, const String & name_for_logger); bool consume(std::vector> & skipped_tables); /// Called from reloadFromSnapshot by replication handler. This method is needed to move a table back into synchronization /// process if it was skipped due to schema changes. - void updateNested(const String & table_name, StoragePtr nested_storage, Int32 table_id, const String & table_start_lsn); + void updateNested(const String & table_name, StorageInfo nested_storage_info, Int32 table_id, const String & table_start_lsn); - void addNested(const String & postgres_table_name, StoragePtr nested_storage, const String & table_start_lsn); + void addNested(const String & postgres_table_name, StorageInfo nested_storage_info, const String & table_start_lsn); void removeNested(const String & postgres_table_name); @@ -55,25 +97,8 @@ private: bool isSyncAllowed(Int32 relation_id, const String & relation_name); - struct Buffer - { - ExternalResultDescription description; - MutableColumns columns; - - /// Needed to pass to insert query columns list in syncTables(). - std::shared_ptr columnsAST; - - /// Needed for insertPostgreSQLValue() method to parse array - std::unordered_map array_info; - - Buffer(StoragePtr storage) { createEmptyBuffer(storage); } - void createEmptyBuffer(StoragePtr storage); - }; - - using Buffers = std::unordered_map; - - static void insertDefaultValue(Buffer & buffer, size_t column_idx); - static void insertValue(Buffer & buffer, const std::string & value, size_t column_idx); + static void insertDefaultValue(StorageData::Buffer & buffer, size_t column_idx); + void insertValue(StorageData::Buffer & buffer, const std::string & value, size_t column_idx); enum class PostgreSQLQuery { @@ -82,7 +107,7 @@ private: DELETE }; - void readTupleData(Buffer & buffer, const char * message, size_t & pos, size_t size, PostgreSQLQuery type, bool old_value = false); + void readTupleData(StorageData::Buffer & buffer, const char * message, size_t & pos, size_t size, PostgreSQLQuery type, bool old_value = false); template static T unhexN(const char * message, size_t pos, size_t n); @@ -95,7 +120,7 @@ private: void markTableAsSkipped(Int32 relation_id, const String & relation_name); /// lsn - log sequnce nuumber, like wal offset (64 bit). - Int64 getLSNValue(const std::string & lsn) + static Int64 getLSNValue(const std::string & lsn) { UInt32 upper_half, lower_half; std::sscanf(lsn.data(), "%X/%X", &upper_half, &lower_half); @@ -125,28 +150,11 @@ private: /// Holds `postgres_table_name` set. std::unordered_set tables_to_sync; - /// `postgres_table_name` -> ReplacingMergeTree table. + /// `postgres_table_name` -> StorageData. 
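The getLSNValue() helper shown above parses a PostgreSQL LSN, which is printed as two hexadecimal halves separated by '/'; the 64-bit position is the upper half shifted left by 32 bits plus the lower half, so later stream positions always map to larger numbers. A small standalone sketch of that conversion (the return type and function name here are illustrative):

#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <string>

uint64_t lsnToUInt64(const std::string & lsn)
{
    unsigned int upper_half = 0;
    unsigned int lower_half = 0;
    std::sscanf(lsn.c_str(), "%X/%X", &upper_half, &lower_half);
    return (static_cast<uint64_t>(upper_half) << 32) + lower_half;
}

int main()
{
    std::printf("%" PRIu64 "\n", lsnToUInt64("0/16B6C50"));  /// 23817296
    std::printf("%" PRIu64 "\n", lsnToUInt64("1/0"));        /// 4294967296
}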
Storages storages; - /// `postgres_table_name` -> In-memory buffer. - Buffers buffers; std::unordered_map relation_id_to_name; - struct SchemaData - { - Int16 number_of_columns; - /// data_type_id and type_modifier - std::vector> column_identifiers; - - SchemaData(Int16 number_of_columns_) : number_of_columns(number_of_columns_) {} - }; - - /// Cache for table schema data to be able to detect schema changes, because ddl is not - /// replicated with postgresql logical replication protocol, but some table schema info - /// is received if it is the first time we received dml message for given relation in current session or - /// if relation definition has changed since the last relation definition message. - std::unordered_map schema_data; - /// `postgres_relation_id` -> `start_lsn` /// skip_list contains relation ids for tables on which ddl was performed, which can break synchronization. /// This breaking changes are detected in replication stream in according replication message and table is added to skip list. diff --git a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp index 984a9cdd47a..a669504b2d7 100644 --- a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp +++ b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp @@ -20,6 +20,7 @@ namespace DB static const auto RESCHEDULE_MS = 1000; static const auto BACKOFF_TRESHOLD_MS = 10000; +static const auto CLEANUP_RESCHEDULE_MS = 600000 * 3; /// 30 min namespace ErrorCodes { @@ -28,6 +29,30 @@ namespace ErrorCodes extern const int POSTGRESQL_REPLICATION_INTERNAL_ERROR; } +class TemporaryReplicationSlot +{ +public: + TemporaryReplicationSlot( + PostgreSQLReplicationHandler * handler_, + std::shared_ptr tx_, + String & start_lsn, + String & snapshot_name) + : handler(handler_), tx(tx_) + { + handler->createReplicationSlot(*tx, start_lsn, snapshot_name, /* temporary */true); + } + + ~TemporaryReplicationSlot() + { + handler->dropReplicationSlot(*tx, /* temporary */true); + } + +private: + PostgreSQLReplicationHandler * handler; + std::shared_ptr tx; +}; + + PostgreSQLReplicationHandler::PostgreSQLReplicationHandler( const String & replication_identifier, const String & postgres_database_, @@ -69,6 +94,7 @@ PostgreSQLReplicationHandler::PostgreSQLReplicationHandler( startup_task = context->getSchedulePool().createTask("PostgreSQLReplicaStartup", [this]{ checkConnectionAndStart(); }); consumer_task = context->getSchedulePool().createTask("PostgreSQLReplicaStartup", [this]{ consumerFunc(); }); + cleanup_task = context->getSchedulePool().createTask("PostgreSQLReplicaStartup", [this]{ cleanupFunc(); }); } @@ -148,6 +174,8 @@ void PostgreSQLReplicationHandler::shutdown() stop_synchronization.store(true); startup_task->deactivate(); consumer_task->deactivate(); + cleanup_task->deactivate(); + consumer.reset(); /// Clear shared pointers to inner storages. } @@ -158,7 +186,7 @@ void PostgreSQLReplicationHandler::startSynchronization(bool throw_on_error) createPublicationIfNeeded(tx); /// List of nested tables (table_name -> nested_storage), which is passed to replication consumer. - std::unordered_map nested_storages; + std::unordered_map nested_storages; /// snapshot_name is initialized only if a new replication slot is created. 
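The TemporaryReplicationSlot class introduced above is an RAII guard: the temporary slot is created in the constructor and dropped in the destructor, so it cannot leak even if loading from the exported snapshot throws. A standalone sketch of the same pattern under stub Handler/Tx types (standing in for PostgreSQLReplicationHandler and pqxx::nontransaction), showing why the guard simplifies the error paths in addTableToReplication() and reloadFromSnapshot():

#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>

struct Tx { };

struct Handler
{
    void createReplicationSlot(Tx &, std::string & start_lsn, std::string & snapshot_name, bool /*temporary*/)
    {
        start_lsn = "0/16B6C50";
        snapshot_name = "exported-snapshot";
        std::cout << "slot created\n";
    }
    void dropReplicationSlot(Tx &, bool /*temporary*/) { std::cout << "slot dropped\n"; }
};

class TemporarySlotGuard
{
public:
    TemporarySlotGuard(Handler * handler_, std::shared_ptr<Tx> tx_, std::string & start_lsn, std::string & snapshot_name)
        : handler(handler_), tx(tx_)
    {
        handler->createReplicationSlot(*tx, start_lsn, snapshot_name, /* temporary */ true);
    }

    ~TemporarySlotGuard() { handler->dropReplicationSlot(*tx, /* temporary */ true); }

private:
    Handler * handler;
    std::shared_ptr<Tx> tx;
};

int main()
{
    Handler handler;
    auto tx = std::make_shared<Tx>();
    std::string start_lsn, snapshot_name;
    try
    {
        TemporarySlotGuard guard(&handler, tx, start_lsn, snapshot_name);
        throw std::runtime_error("snapshot load failed");  /// the slot is still dropped on unwind
    }
    catch (const std::exception & e)
    {
        std::cout << "caught: " << e.what() << "\n";
    }
}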
/// start_lsn is initialized in two places: @@ -193,7 +221,7 @@ void PostgreSQLReplicationHandler::startSynchronization(bool throw_on_error) { try { - nested_storages[table_name] = loadFromSnapshot(*tmp_connection, snapshot_name, table_name, storage->as()); + nested_storages.emplace(table_name, loadFromSnapshot(*tmp_connection, snapshot_name, table_name, storage->as())); } catch (Exception & e) { @@ -235,7 +263,12 @@ void PostgreSQLReplicationHandler::startSynchronization(bool throw_on_error) auto * materialized_storage = storage->as (); try { - nested_storages[table_name] = materialized_storage->getNested(); + auto [postgres_table_schema, postgres_table_name] = getSchemaAndTableName(table_name); + auto table_structure = fetchPostgreSQLTableStructure(tx, postgres_table_name, postgres_table_schema, true, true, true); + if (!table_structure.physical_columns) + throw Exception(ErrorCodes::LOGICAL_ERROR, "No columns"); + auto storage_info = StorageInfo(materialized_storage->getNested(), table_structure.physical_columns->attributes); + nested_storages.emplace(table_name, std::move(storage_info)); } catch (Exception & e) { @@ -268,6 +301,7 @@ void PostgreSQLReplicationHandler::startSynchronization(bool throw_on_error) (is_materialized_postgresql_database ? postgres_database : postgres_database + '.' + tables_list)); consumer_task->activateAndSchedule(); + cleanup_task->activateAndSchedule(); /// Do not rely anymore on saved storage pointers. materialized_storages.clear(); @@ -278,16 +312,16 @@ ASTPtr PostgreSQLReplicationHandler::getCreateNestedTableQuery(StorageMaterializ { postgres::Connection connection(connection_info); pqxx::nontransaction tx(connection.getRef()); - auto table_structure = std::make_unique(fetchPostgreSQLTableStructure(tx, table_name, postgres_schema, true, true, true)); - if (!table_structure) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Failed to get PostgreSQL table structure"); + + auto [postgres_table_schema, postgres_table_name] = getSchemaAndTableName(table_name); + auto table_structure = std::make_unique(fetchPostgreSQLTableStructure(tx, postgres_table_name, postgres_table_schema, true, true, true)); auto table_override = tryGetTableOverride(current_database_name, table_name); return storage->getCreateNestedTableQuery(std::move(table_structure), table_override ? table_override->as() : nullptr); } -StoragePtr PostgreSQLReplicationHandler::loadFromSnapshot(postgres::Connection & connection, String & snapshot_name, const String & table_name, +StorageInfo PostgreSQLReplicationHandler::loadFromSnapshot(postgres::Connection & connection, String & snapshot_name, const String & table_name, StorageMaterializedPostgreSQL * materialized_storage) { auto tx = std::make_shared(connection.getRef()); @@ -301,8 +335,13 @@ StoragePtr PostgreSQLReplicationHandler::loadFromSnapshot(postgres::Connection & query_str = fmt::format("SELECT * FROM {}", quoted_name); LOG_DEBUG(log, "Loading PostgreSQL table {}.{}", postgres_database, quoted_name); + auto table_structure = fetchTableStructure(*tx, table_name); + if (!table_structure->physical_columns) + throw Exception(ErrorCodes::LOGICAL_ERROR, "No table attributes"); + auto table_attributes = table_structure->physical_columns->attributes; + auto table_override = tryGetTableOverride(current_database_name, table_name); - materialized_storage->createNestedIfNeeded(fetchTableStructure(*tx, table_name), table_override ? table_override->as() : nullptr); + materialized_storage->createNestedIfNeeded(std::move(table_structure), table_override ? 
table_override->as() : nullptr); auto nested_storage = materialized_storage->getNested(); auto insert = std::make_shared(); @@ -327,7 +366,22 @@ StoragePtr PostgreSQLReplicationHandler::loadFromSnapshot(postgres::Connection & auto nested_table_id = nested_storage->getStorageID(); LOG_DEBUG(log, "Loaded table {}.{} (uuid: {})", nested_table_id.database_name, nested_table_id.table_name, toString(nested_table_id.uuid)); - return nested_storage; + return StorageInfo(nested_storage, std::move(table_attributes)); +} + + +void PostgreSQLReplicationHandler::cleanupFunc() +{ + /// It is very important to make sure temporary replication slots are removed! + /// So just in case every 30 minutes check if one still exists. + postgres::Connection connection(connection_info); + String last_committed_lsn; + connection.execWithRetry([&](pqxx::nontransaction & tx) + { + if (isReplicationSlotExist(tx, last_committed_lsn, /* temporary */true)) + dropReplicationSlot(tx, /* temporary */true); + }); + cleanup_task->scheduleAfter(CLEANUP_RESCHEDULE_MS); } @@ -516,17 +570,25 @@ void PostgreSQLReplicationHandler::dropPublication(pqxx::nontransaction & tx) void PostgreSQLReplicationHandler::addTableToPublication(pqxx::nontransaction & ntx, const String & table_name) { - std::string query_str = fmt::format("ALTER PUBLICATION {} ADD TABLE ONLY {}", publication_name, doubleQuoteString(table_name)); + std::string query_str = fmt::format("ALTER PUBLICATION {} ADD TABLE ONLY {}", publication_name, doubleQuoteWithSchema(table_name)); ntx.exec(query_str); - LOG_TRACE(log, "Added table `{}` to publication `{}`", table_name, publication_name); + LOG_TRACE(log, "Added table {} to publication `{}`", doubleQuoteWithSchema(table_name), publication_name); } void PostgreSQLReplicationHandler::removeTableFromPublication(pqxx::nontransaction & ntx, const String & table_name) { - std::string query_str = fmt::format("ALTER PUBLICATION {} DROP TABLE ONLY {}", publication_name, doubleQuoteString(table_name)); - ntx.exec(query_str); - LOG_TRACE(log, "Removed table `{}` from publication `{}`", table_name, publication_name); + try + { + std::string query_str = fmt::format("ALTER PUBLICATION {} DROP TABLE ONLY {}", publication_name, doubleQuoteWithSchema(table_name)); + ntx.exec(query_str); + LOG_TRACE(log, "Removed table `{}` from publication `{}`", doubleQuoteWithSchema(table_name), publication_name); + } + catch (const pqxx::undefined_table &) + { + /// Removing table from replication must succeed even if table does not exist in PostgreSQL. 
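The cleanupFunc() above is a belt-and-braces guard against leaked temporary replication slots: it checks for one, drops it if found, and re-arms itself with scheduleAfter(CLEANUP_RESCHEDULE_MS), where 600000 * 3 ms is indeed the 30 minutes stated in the comment. The following standalone sketch only illustrates the self-rescheduling shape of such a task; the DemoTask type is a stand-in for the BackgroundSchedulePool task API and deliberately blocks where the real pool would not.

#include <chrono>
#include <functional>
#include <iostream>
#include <thread>

struct DemoTask
{
    std::function<void()> body;

    void scheduleAfter(std::chrono::milliseconds delay)
    {
        std::this_thread::sleep_for(delay);  /// the real pool queues the task instead of blocking
        body();
    }
};

int main()
{
    constexpr auto cleanup_reschedule = std::chrono::milliseconds(10);  /// 30 minutes in the patch
    int runs = 0;
    DemoTask cleanup_task;
    cleanup_task.body = [&]
    {
        /// Real code: if a temporary replication slot still exists, drop it.
        std::cout << "cleanup pass " << ++runs << "\n";
        if (runs < 3)
            cleanup_task.scheduleAfter(cleanup_reschedule);
    };
    cleanup_task.body();
}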
+ LOG_WARNING(log, "Did not remove table {} from publication, because table does not exist in PostgreSQL", doubleQuoteWithSchema(table_name), publication_name); + } } @@ -736,9 +798,6 @@ std::set PostgreSQLReplicationHandler::fetchTablesFromPublication(pqxx:: PostgreSQLTableStructurePtr PostgreSQLReplicationHandler::fetchTableStructure( pqxx::ReplicationTransaction & tx, const std::string & table_name) const { - if (!is_materialized_postgresql_database) - return nullptr; - PostgreSQLTableStructure structure; try { @@ -764,13 +823,15 @@ void PostgreSQLReplicationHandler::addTableToReplication(StorageMaterializedPost LOG_TRACE(log, "Adding table `{}` to replication", postgres_table_name); postgres::Connection replication_connection(connection_info, /* replication */true); String snapshot_name, start_lsn; - StoragePtr nested_storage; + StorageInfo nested_storage_info{ nullptr, {} }; { - pqxx::nontransaction tx(replication_connection.getRef()); - if (isReplicationSlotExist(tx, start_lsn, /* temporary */true)) - dropReplicationSlot(tx, /* temporary */true); - createReplicationSlot(tx, start_lsn, snapshot_name, /* temporary */true); + auto tx = std::make_shared(replication_connection.getRef()); + + if (isReplicationSlotExist(*tx, start_lsn, /* temporary */true)) + dropReplicationSlot(*tx, /* temporary */true); + + TemporaryReplicationSlot temporary_slot(this, tx, start_lsn, snapshot_name); /// Protect against deadlock. auto nested = DatabaseCatalog::instance().tryGetTable(materialized_storage->getNestedStorageID(), materialized_storage->getNestedTableContext()); @@ -778,8 +839,8 @@ void PostgreSQLReplicationHandler::addTableToReplication(StorageMaterializedPost throw Exception(ErrorCodes::LOGICAL_ERROR, "Internal table was not created"); postgres::Connection tmp_connection(connection_info); - nested_storage = loadFromSnapshot(tmp_connection, snapshot_name, postgres_table_name, materialized_storage); - materialized_storage->set(nested_storage); + nested_storage_info = loadFromSnapshot(tmp_connection, snapshot_name, postgres_table_name, materialized_storage); + materialized_storage->set(nested_storage_info.storage); } { @@ -788,7 +849,7 @@ void PostgreSQLReplicationHandler::addTableToReplication(StorageMaterializedPost } /// Pass storage to consumer and lsn position, from which to start receiving replication messages for this table. - consumer->addNested(postgres_table_name, nested_storage, start_lsn); + consumer->addNested(postgres_table_name, nested_storage_info, start_lsn); LOG_TRACE(log, "Table `{}` successfully added to replication", postgres_table_name); } catch (...) 
@@ -841,81 +902,81 @@ void PostgreSQLReplicationHandler::reloadFromSnapshot(const std::vector(replication_connection.getRef()); - String snapshot_name, start_lsn; - - if (isReplicationSlotExist(tx, start_lsn, /* temporary */true)) - dropReplicationSlot(tx, /* temporary */true); - - createReplicationSlot(tx, start_lsn, snapshot_name, /* temporary */true); - postgres::Connection tmp_connection(connection_info); - - for (const auto & [relation_id, table_name] : relation_data) { - auto storage = DatabaseCatalog::instance().getTable(StorageID(current_database_name, table_name), context); - auto * materialized_storage = storage->as (); - auto materialized_table_lock = materialized_storage->lockForShare(String(), context->getSettingsRef().lock_acquire_timeout); + String snapshot_name, start_lsn; + if (isReplicationSlotExist(*tx, start_lsn, /* temporary */true)) + dropReplicationSlot(*tx, /* temporary */true); - /// If for some reason this temporary table already exists - also drop it. - auto temp_materialized_storage = materialized_storage->createTemporary(); + TemporaryReplicationSlot temporary_slot(this, tx, start_lsn, snapshot_name); + postgres::Connection tmp_connection(connection_info); - /// This snapshot is valid up to the end of the transaction, which exported it. - StoragePtr temp_nested_storage = loadFromSnapshot(tmp_connection, snapshot_name, table_name, - temp_materialized_storage->as ()); - - auto table_id = materialized_storage->getNestedStorageID(); - auto temp_table_id = temp_nested_storage->getStorageID(); - - LOG_DEBUG(log, "Starting background update of table {} ({} with {})", - table_name, table_id.getNameForLogs(), temp_table_id.getNameForLogs()); - - auto ast_rename = std::make_shared(); - ASTRenameQuery::Element elem + for (const auto & [relation_id, table_name] : relation_data) { - ASTRenameQuery::Table{table_id.database_name, table_id.table_name}, - ASTRenameQuery::Table{temp_table_id.database_name, temp_table_id.table_name} - }; - ast_rename->elements.push_back(std::move(elem)); - ast_rename->exchange = true; + auto storage = DatabaseCatalog::instance().getTable(StorageID(current_database_name, table_name), context); + auto * materialized_storage = storage->as (); + auto materialized_table_lock = materialized_storage->lockForShare(String(), context->getSettingsRef().lock_acquire_timeout); - auto nested_context = materialized_storage->getNestedTableContext(); + /// If for some reason this temporary table already exists - also drop it. + auto temp_materialized_storage = materialized_storage->createTemporary(); - try - { - InterpreterRenameQuery(ast_rename, nested_context).execute(); + /// This snapshot is valid up to the end of the transaction, which exported it. 
+ auto [temp_nested_storage, table_attributes] = loadFromSnapshot( + tmp_connection, snapshot_name, table_name, temp_materialized_storage->as ()); - auto nested_storage = DatabaseCatalog::instance().getTable(StorageID(table_id.database_name, table_id.table_name, temp_table_id.uuid), nested_context); - materialized_storage->set(nested_storage); + auto table_id = materialized_storage->getNestedStorageID(); + auto temp_table_id = temp_nested_storage->getStorageID(); - auto nested_sample_block = nested_storage->getInMemoryMetadataPtr()->getSampleBlock(); - auto materialized_sample_block = materialized_storage->getInMemoryMetadataPtr()->getSampleBlock(); - assertBlocksHaveEqualStructure(nested_sample_block, materialized_sample_block, "while reloading table in the background"); + LOG_DEBUG(log, "Starting background update of table {} ({} with {})", + table_name, table_id.getNameForLogs(), temp_table_id.getNameForLogs()); - LOG_INFO(log, "Updated table {}. New structure: {}", - nested_storage->getStorageID().getNameForLogs(), nested_sample_block.dumpStructure()); + auto ast_rename = std::make_shared(); + ASTRenameQuery::Element elem + { + ASTRenameQuery::Table{table_id.database_name, table_id.table_name}, + ASTRenameQuery::Table{temp_table_id.database_name, temp_table_id.table_name} + }; + ast_rename->elements.push_back(std::move(elem)); + ast_rename->exchange = true; - /// Pass pointer to new nested table into replication consumer, remove current table from skip list and set start lsn position. - consumer->updateNested(table_name, nested_storage, relation_id, start_lsn); + auto nested_context = materialized_storage->getNestedTableContext(); - auto table_to_drop = DatabaseCatalog::instance().getTable(StorageID(temp_table_id.database_name, temp_table_id.table_name, table_id.uuid), nested_context); - auto drop_table_id = table_to_drop->getStorageID(); + try + { + InterpreterRenameQuery(ast_rename, nested_context).execute(); - if (drop_table_id == nested_storage->getStorageID()) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Cannot drop table because is has the same uuid as new table: {}", drop_table_id.getNameForLogs()); + auto nested_storage = DatabaseCatalog::instance().getTable(StorageID(table_id.database_name, table_id.table_name, temp_table_id.uuid), nested_context); + materialized_storage->set(nested_storage); - LOG_DEBUG(log, "Dropping table {}", drop_table_id.getNameForLogs()); - InterpreterDropQuery::executeDropQuery(ASTDropQuery::Kind::Drop, nested_context, nested_context, drop_table_id, true); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); + auto nested_sample_block = nested_storage->getInMemoryMetadataPtr()->getSampleBlock(); + auto materialized_sample_block = materialized_storage->getInMemoryMetadataPtr()->getSampleBlock(); + assertBlocksHaveEqualStructure(nested_sample_block, materialized_sample_block, "while reloading table in the background"); + + LOG_INFO(log, "Updated table {}. New structure: {}", + nested_storage->getStorageID().getNameForLogs(), nested_sample_block.dumpStructure()); + + /// Pass pointer to new nested table into replication consumer, remove current table from skip list and set start lsn position. 
+ consumer->updateNested(table_name, StorageInfo(nested_storage, std::move(table_attributes)), relation_id, start_lsn); + + auto table_to_drop = DatabaseCatalog::instance().getTable(StorageID(temp_table_id.database_name, temp_table_id.table_name, table_id.uuid), nested_context); + auto drop_table_id = table_to_drop->getStorageID(); + + if (drop_table_id == nested_storage->getStorageID()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Cannot drop table because is has the same uuid as new table: {}", drop_table_id.getNameForLogs()); + + LOG_DEBUG(log, "Dropping table {}", drop_table_id.getNameForLogs()); + InterpreterDropQuery::executeDropQuery(ASTDropQuery::Kind::Drop, nested_context, nested_context, drop_table_id, true); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } } } - dropReplicationSlot(tx, /* temporary */true); - tx.commit(); + tx->commit(); } catch (...) { diff --git a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h index cf44101db76..4403fb57aca 100644 --- a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h +++ b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h @@ -15,6 +15,8 @@ struct SettingChange; class PostgreSQLReplicationHandler { +friend class TemporaryReplicationSlot; + public: PostgreSQLReplicationHandler( const String & replication_identifier, @@ -52,6 +54,8 @@ public: void setSetting(const SettingChange & setting); + void cleanupFunc(); + private: using MaterializedStorages = std::unordered_map; @@ -83,7 +87,7 @@ private: void consumerFunc(); - StoragePtr loadFromSnapshot(postgres::Connection & connection, std::string & snapshot_name, const String & table_name, StorageMaterializedPostgreSQL * materialized_storage); + StorageInfo loadFromSnapshot(postgres::Connection & connection, std::string & snapshot_name, const String & table_name, StorageMaterializedPostgreSQL * materialized_storage); void reloadFromSnapshot(const std::vector> & relation_data); @@ -133,7 +137,9 @@ private: /// Replication consumer. Manages decoding of replication stream and syncing into tables. 
std::shared_ptr consumer; - BackgroundSchedulePool::TaskHolder startup_task, consumer_task; + BackgroundSchedulePool::TaskHolder startup_task; + BackgroundSchedulePool::TaskHolder consumer_task; + BackgroundSchedulePool::TaskHolder cleanup_task; std::atomic stop_synchronization = false; diff --git a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp index aefd1aedbf7..b312f7284c3 100644 --- a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp +++ b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp @@ -365,7 +365,7 @@ ASTPtr StorageMaterializedPostgreSQL::getColumnDeclaration(const DataTypePtr & d ast_expression->name = "DateTime64"; ast_expression->arguments = std::make_shared(); ast_expression->arguments->children.emplace_back(std::make_shared(UInt32(6))); - return ast_expression; + return std::move(ast_expression); } return std::make_shared(data_type->getName()); @@ -423,7 +423,7 @@ ASTPtr StorageMaterializedPostgreSQL::getCreateNestedTableQuery( table_id.database_name, table_id.table_name); } - if (!table_structure->columns && (!table_override || !table_override->columns)) + if (!table_structure->physical_columns && (!table_override || !table_override->columns)) { throw Exception(ErrorCodes::LOGICAL_ERROR, "No columns returned for table {}.{}", table_id.database_name, table_id.table_name); @@ -465,7 +465,7 @@ ASTPtr StorageMaterializedPostgreSQL::getCreateNestedTableQuery( } else { - ordinary_columns_and_types = *table_structure->columns; + ordinary_columns_and_types = table_structure->physical_columns->columns; columns_declare_list->set(columns_declare_list->columns, getColumnsExpressionList(ordinary_columns_and_types)); } @@ -475,7 +475,7 @@ ASTPtr StorageMaterializedPostgreSQL::getCreateNestedTableQuery( } else { - ordinary_columns_and_types = *table_structure->columns; + ordinary_columns_and_types = table_structure->physical_columns->columns; columns_declare_list->set(columns_declare_list->columns, getColumnsExpressionList(ordinary_columns_and_types)); } @@ -485,9 +485,9 @@ ASTPtr StorageMaterializedPostgreSQL::getCreateNestedTableQuery( NamesAndTypesList merging_columns; if (table_structure->primary_key_columns) - merging_columns = *table_structure->primary_key_columns; + merging_columns = table_structure->primary_key_columns->columns; else - merging_columns = *table_structure->replica_identity_columns; + merging_columns = table_structure->replica_identity_columns->columns; order_by_expression->name = "tuple"; order_by_expression->arguments = std::make_shared(); @@ -524,7 +524,7 @@ ASTPtr StorageMaterializedPostgreSQL::getCreateNestedTableQuery( storage_metadata.setConstraints(constraints); setInMemoryMetadata(storage_metadata); - return create_table_query; + return std::move(create_table_query); } diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index f5526781f41..0cc401aa93c 100644 --- a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -126,7 +126,13 @@ StorageBuffer::StorageBuffer( , bg_pool(getContext()->getBufferFlushSchedulePool()) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + if (columns_.empty()) + { + auto dest_table = DatabaseCatalog::instance().getTable(destination_id, context_); + storage_metadata.setColumns(dest_table->getInMemoryMetadataPtr()->getColumns()); + } + else + storage_metadata.setColumns(columns_); storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); 
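// --- Illustrative aside (not part of the patch) ---------------------------------
// The Buffer (and, below, Distributed) constructor now falls back to the structure
// of the table it forwards data to when the CREATE query omits the column list.
// A small standalone sketch of that fallback, using simplified stand-ins rather
// than the real ColumnsDescription/DatabaseCatalog types:
#include <map>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

struct Columns
{
    std::vector<std::pair<std::string, std::string>> names_and_types;
    bool empty() const { return names_and_types.empty(); }
};

struct Catalog
{
    std::map<std::string, Columns> tables;  /// destination table name -> structure

    Columns get(const std::string & name) const
    {
        auto it = tables.find(name);
        if (it == tables.end())
            throw std::runtime_error("Unknown destination table " + name);
        return it->second;
    }
};

/// Keep the user-declared columns if any were given, otherwise inherit the
/// destination table's structure (this is what "schema inference" means here).
Columns resolveColumns(const Columns & declared, const std::string & destination, const Catalog & catalog)
{
    return declared.empty() ? catalog.get(destination) : declared;
}
// ---------------------------------------------------------------------------------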
setInMemoryMetadata(storage_metadata); @@ -455,10 +461,8 @@ static void appendBlock(const Block & from, Block & to) size_t rows = from.rows(); size_t bytes = from.bytes(); - CurrentMetrics::add(CurrentMetrics::StorageBufferRows, rows); - CurrentMetrics::add(CurrentMetrics::StorageBufferBytes, bytes); - size_t old_rows = to.rows(); + size_t old_bytes = to.bytes(); MutableColumnPtr last_col; try @@ -468,6 +472,8 @@ static void appendBlock(const Block & from, Block & to) if (to.rows() == 0) { to = from; + CurrentMetrics::add(CurrentMetrics::StorageBufferRows, rows); + CurrentMetrics::add(CurrentMetrics::StorageBufferBytes, bytes); } else { @@ -480,6 +486,8 @@ static void appendBlock(const Block & from, Block & to) to.getByPosition(column_no).column = std::move(last_col); } + CurrentMetrics::add(CurrentMetrics::StorageBufferRows, rows); + CurrentMetrics::add(CurrentMetrics::StorageBufferBytes, to.bytes() - old_bytes); } } catch (...) @@ -1165,6 +1173,7 @@ void registerStorageBuffer(StorageFactory & factory) }, { .supports_parallel_insert = true, + .supports_schema_inference = true, }); } diff --git a/src/Storages/StorageDictionary.cpp b/src/Storages/StorageDictionary.cpp index f6b330fe3df..da8c5f115b2 100644 --- a/src/Storages/StorageDictionary.cpp +++ b/src/Storages/StorageDictionary.cpp @@ -11,6 +11,7 @@ #include #include #include +#include namespace DB @@ -21,6 +22,7 @@ namespace ErrorCodes extern const int THERE_IS_NO_COLUMN; extern const int CANNOT_DETACH_DICTIONARY_AS_TABLE; extern const int DICTIONARY_ALREADY_EXISTS; + extern const int NOT_IMPLEMENTED; } namespace @@ -111,10 +113,11 @@ StorageDictionary::StorageDictionary( const StorageID & table_id_, const String & dictionary_name_, const DictionaryStructure & dictionary_structure_, + const String & comment, Location location_, ContextPtr context_) : StorageDictionary( - table_id_, dictionary_name_, ColumnsDescription{getNamesAndTypes(dictionary_structure_)}, String{}, location_, context_) + table_id_, dictionary_name_, ColumnsDescription{getNamesAndTypes(dictionary_structure_)}, comment, location_, context_) { } @@ -126,6 +129,7 @@ StorageDictionary::StorageDictionary( table_id, table_id.getFullNameNotQuoted(), context_->getExternalDictionariesLoader().getDictionaryStructure(*dictionary_configuration), + dictionary_configuration->getString("dictionary.comment", ""), Location::SameDatabaseAndNameAsDictionary, context_) { @@ -230,7 +234,7 @@ void StorageDictionary::renameInMemory(const StorageID & new_table_id) if (move_to_atomic) configuration->setString("dictionary.uuid", toString(new_table_id.uuid)); else if (move_to_ordinary) - configuration->remove("dictionary.uuid"); + configuration->remove("dictionary.uuid"); } /// Dictionary is moving between databases of different engines or is renaming inside Ordinary database @@ -260,6 +264,40 @@ void StorageDictionary::renameInMemory(const StorageID & new_table_id) } } +void StorageDictionary::checkAlterIsPossible(const AlterCommands & commands, ContextPtr /* context */) const +{ + for (const auto & command : commands) + { + if (location == Location::DictionaryDatabase || command.type != AlterCommand::COMMENT_TABLE) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Alter of type '{}' is not supported by storage {}", + command.type, getName()); + } +} + +void StorageDictionary::alter(const AlterCommands & params, ContextPtr alter_context, AlterLockHolder & lock_holder) +{ + IStorage::alter(params, alter_context, lock_holder); + + if (location == Location::Custom) + return; + + auto new_comment 
= getInMemoryMetadataPtr()->comment; + + auto storage_id = getStorageID(); + const auto & external_dictionaries_loader = getContext()->getExternalDictionariesLoader(); + auto result = external_dictionaries_loader.getLoadResult(storage_id.getInternalDictionaryName()); + + if (result.object) + { + auto dictionary = std::static_pointer_cast(result.object); + auto * dictionary_non_const = const_cast(dictionary.get()); + dictionary_non_const->setDictionaryComment(new_comment); + } + + std::lock_guard lock(dictionary_config_mutex); + configuration->setString("dictionary.comment", std::move(new_comment)); +} + void registerStorageDictionary(StorageFactory & factory) { factory.registerStorage("Dictionary", [](const StorageFactory::Arguments & args) diff --git a/src/Storages/StorageDictionary.h b/src/Storages/StorageDictionary.h index 7d0af8c0ee3..855d02b0947 100644 --- a/src/Storages/StorageDictionary.h +++ b/src/Storages/StorageDictionary.h @@ -42,6 +42,10 @@ public: void renameInMemory(const StorageID & new_table_id) override; + void checkAlterIsPossible(const AlterCommands & commands, ContextPtr /* context */) const override; + + void alter(const AlterCommands & params, ContextPtr alter_context, AlterLockHolder &) override; + Poco::Timestamp getUpdateTime() const; LoadablesConfigurationPtr getConfiguration() const; @@ -89,6 +93,7 @@ private: const StorageID & table_id_, const String & dictionary_name_, const DictionaryStructure & dictionary_structure, + const String & comment, Location location_, ContextPtr context_); diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index ddf363e3957..19869b77106 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -24,7 +25,6 @@ #include #include -#include #include #include #include @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include @@ -63,7 +62,6 @@ #include #include -#include #include #include @@ -71,8 +69,6 @@ #include #include -#include - #include #include #include @@ -329,7 +325,16 @@ StorageDistributed::StorageDistributed( , rng(randomSeed()) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + if (columns_.empty()) + { + StorageID id = StorageID::createEmpty(); + id.table_name = remote_table; + id.database_name = remote_database; + storage_metadata.setColumns(getStructureOfRemoteTable(*getCluster(), id, getContext(), remote_table_function_ptr)); + } + else + storage_metadata.setColumns(columns_); + storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); @@ -1398,6 +1403,7 @@ void registerStorageDistributed(StorageFactory & factory) { .supports_settings = true, .supports_parallel_insert = true, + .supports_schema_inference = true, .source_access_type = AccessType::REMOTE, }); } diff --git a/src/Storages/StorageExecutable.cpp b/src/Storages/StorageExecutable.cpp index 51ecfc1e884..21143438725 100644 --- a/src/Storages/StorageExecutable.cpp +++ b/src/Storages/StorageExecutable.cpp @@ -2,6 +2,8 @@ #include +#include + #include #include @@ -16,13 +18,12 @@ #include #include #include +#include #include #include #include #include -#include - namespace DB { @@ -30,80 +31,78 @@ namespace DB namespace ErrorCodes { extern const int UNSUPPORTED_METHOD; - extern const int LOGICAL_ERROR; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int TIMEOUT_EXCEEDED; } 
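// --- Illustrative aside (not part of the patch) ---------------------------------
// StorageDictionary::alter above applies a comment change in two places: the
// dictionary object that is already loaded, and the configuration that future
// reloads will read; the latter is shared with the reload path, hence the mutex.
// A simplified standalone sketch of that pattern (types are stand-ins):
#include <map>
#include <memory>
#include <mutex>
#include <string>

struct LoadedDictionary
{
    std::string comment;
    void setComment(std::string new_comment) { comment = std::move(new_comment); }
};

class DictionaryCommentUpdater
{
public:
    void alterComment(const std::string & new_comment)
    {
        /// 1. Update the live object if the loader has already created it.
        if (auto dictionary = loaded.lock())
            dictionary->setComment(new_comment);

        /// 2. Persist the comment into the configuration used for future reloads.
        std::lock_guard lock(config_mutex);
        config["dictionary.comment"] = new_comment;
    }

    std::weak_ptr<LoadedDictionary> loaded;
    std::map<std::string, std::string> config;
    std::mutex config_mutex;
};
// ---------------------------------------------------------------------------------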
-StorageExecutable::StorageExecutable( - const StorageID & table_id_, - const String & script_name_, - const std::vector & arguments_, - const String & format_, - const std::vector & input_queries_, - const ColumnsDescription & columns, - const ConstraintsDescription & constraints) - : IStorage(table_id_) - , script_name(script_name_) - , arguments(arguments_) - , format(format_) - , input_queries(input_queries_) - , log(&Poco::Logger::get("StorageExecutable")) +namespace { - StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns); - storage_metadata.setConstraints(constraints); - setInMemoryMetadata(storage_metadata); + void transformToSingleBlockSources(Pipes & inputs) + { + size_t inputs_size = inputs.size(); + for (size_t i = 0; i < inputs_size; ++i) + { + auto && input = inputs[i]; + QueryPipeline input_pipeline(std::move(input)); + PullingPipelineExecutor input_pipeline_executor(input_pipeline); + + auto header = input_pipeline_executor.getHeader(); + auto result_block = header.cloneEmpty(); + + size_t result_block_columns = result_block.columns(); + + Block result; + while (input_pipeline_executor.pull(result)) + { + for (size_t result_block_index = 0; result_block_index < result_block_columns; ++result_block_index) + { + auto & block_column = result.safeGetByPosition(result_block_index); + auto & result_block_column = result_block.safeGetByPosition(result_block_index); + + result_block_column.column->assumeMutable()->insertRangeFrom(*block_column.column, 0, block_column.column->size()); + } + } + + auto source = std::make_shared(std::move(result_block)); + inputs[i] = Pipe(std::move(source)); + } + } } StorageExecutable::StorageExecutable( const StorageID & table_id_, - const String & script_name_, - const std::vector & arguments_, - const String & format_, - const std::vector & input_queries_, + const String & format, const ExecutableSettings & settings_, + const std::vector & input_queries_, const ColumnsDescription & columns, const ConstraintsDescription & constraints) : IStorage(table_id_) - , script_name(script_name_) - , arguments(arguments_) - , format(format_) - , input_queries(input_queries_) , settings(settings_) - /// If pool size == 0 then there is no size restrictions. Poco max size of semaphore is integer type. - , process_pool(std::make_shared(settings.pool_size == 0 ? std::numeric_limits::max() : settings.pool_size)) - , log(&Poco::Logger::get("StorageExecutablePool")) + , input_queries(input_queries_) + , log(settings.is_executable_pool ? 
&Poco::Logger::get("StorageExecutablePool") : &Poco::Logger::get("StorageExecutable")) { StorageInMemoryMetadata storage_metadata; storage_metadata.setColumns(columns); storage_metadata.setConstraints(constraints); setInMemoryMetadata(storage_metadata); + + ShellCommandSourceCoordinator::Configuration configuration + { + .format = format, + .command_termination_timeout_seconds = settings.command_termination_timeout, + .command_read_timeout_milliseconds = settings.command_read_timeout, + .command_write_timeout_milliseconds = settings.command_write_timeout, + + .pool_size = settings.pool_size, + .max_command_execution_time_seconds = settings.max_command_execution_time, + + .is_executable_pool = settings.is_executable_pool, + .send_chunk_header = settings.send_chunk_header, + .execute_direct = true + }; + + coordinator = std::make_unique(std::move(configuration)); } -class SendingChunkHeaderTransform final : public ISimpleTransform -{ -public: - SendingChunkHeaderTransform(const Block & header, WriteBuffer & buffer_) - : ISimpleTransform(header, header, false) - , buffer(buffer_) - { - } - - String getName() const override { return "SendingChunkHeaderTransform"; } - -protected: - - void transform(Chunk & chunk) override - { - writeText(chunk.getNumRows(), buffer); - writeChar('\n', buffer); - } - -private: - WriteBuffer & buffer; -}; - Pipe StorageExecutable::read( const Names & /*column_names*/, const StorageMetadataPtr & metadata_snapshot, @@ -113,10 +112,12 @@ Pipe StorageExecutable::read( size_t max_block_size, unsigned /*threads*/) { + auto & script_name = settings.script_name; + auto user_scripts_path = context->getUserScriptsPath(); auto script_path = user_scripts_path + '/' + script_name; - if (!pathStartsWith(script_path, user_scripts_path)) + if (!fileOrSymlinkPathStartsWith(script_path, user_scripts_path)) throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Executable file {} must be inside user scripts folder {}", script_name, @@ -128,101 +129,31 @@ Pipe StorageExecutable::read( script_name, user_scripts_path); - std::vector inputs; + Pipes inputs; inputs.reserve(input_queries.size()); for (auto & input_query : input_queries) { InterpreterSelectWithUnionQuery interpreter(input_query, context, {}); - inputs.emplace_back(interpreter.buildQueryPipeline()); + inputs.emplace_back(QueryPipelineBuilder::getPipe(interpreter.buildQueryPipeline())); } - ShellCommand::Config config(script_path); - config.arguments = arguments; - for (size_t i = 1; i < inputs.size(); ++i) - config.write_fds.emplace_back(i + 2); - - std::unique_ptr process; - - bool is_executable_pool = (process_pool != nullptr); - if (is_executable_pool) - { - bool result = process_pool->tryBorrowObject(process, [&config, this]() - { - config.terminate_in_destructor_strategy = ShellCommand::DestructorStrategy{ true /*terminate_in_destructor*/, settings.command_termination_timeout }; - auto shell_command = ShellCommand::executeDirect(config); - return shell_command; - }, settings.max_command_execution_time * 10000); - - if (!result) - throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, - "Could not get process from pool, max command execution timeout exceeded {} seconds", - settings.max_command_execution_time); - } - else - { - process = ShellCommand::executeDirect(config); - } - - std::vector tasks; - tasks.reserve(inputs.size()); - - for (size_t i = 0; i < inputs.size(); ++i) - { - WriteBufferFromFile * write_buffer = nullptr; - - if (i == 0) - { - write_buffer = &process->in; - } - else - { - auto descriptor = i + 2; - auto it = 
process->write_fds.find(descriptor); - if (it == process->write_fds.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Process does not contain descriptor to write {}", descriptor); - - write_buffer = &it->second; - } - - inputs[i].resize(1); - if (settings.send_chunk_header) - { - auto transform = std::make_shared(inputs[i].getHeader(), *write_buffer); - inputs[i].addTransform(std::move(transform)); - } - - auto pipeline = std::make_shared(QueryPipelineBuilder::getPipeline(std::move(inputs[i]))); - - auto out = context->getOutputFormat(format, *write_buffer, materializeBlock(pipeline->getHeader())); - out->setAutoFlush(); - pipeline->complete(std::move(out)); - - ShellCommandSource::SendDataTask task = [pipeline, write_buffer, is_executable_pool]() - { - CompletedPipelineExecutor executor(*pipeline); - executor.execute(); - - if (!is_executable_pool) - write_buffer->close(); - }; - - tasks.emplace_back(std::move(task)); - } + /// For executable pool we read data from input streams and convert it to single blocks streams. + if (settings.is_executable_pool) + transformToSingleBlockSources(inputs); auto sample_block = metadata_snapshot->getSampleBlock(); ShellCommandSourceConfiguration configuration; configuration.max_block_size = max_block_size; - if (is_executable_pool) + if (settings.is_executable_pool) { configuration.read_fixed_number_of_rows = true; configuration.read_number_of_rows_from_process_output = true; } - Pipe pipe(std::make_unique(context, format, std::move(sample_block), std::move(process), std::move(tasks), configuration, process_pool)); - return pipe; + return coordinator->createPipe(script_path, settings.script_arguments, std::move(inputs), std::move(sample_block), context, configuration); } void registerStorageExecutable(StorageFactory & factory) @@ -262,6 +193,11 @@ void registerStorageExecutable(StorageFactory & factory) const auto & columns = args.columns; const auto & constraints = args.constraints; + ExecutableSettings settings; + settings.script_name = script_name; + settings.script_arguments = script_name_with_arguments; + settings.is_executable_pool = is_executable_pool; + if (is_executable_pool) { size_t max_command_execution_time = 10; @@ -270,28 +206,28 @@ void registerStorageExecutable(StorageFactory & factory) if (max_execution_time_seconds != 0 && max_command_execution_time > max_execution_time_seconds) max_command_execution_time = max_execution_time_seconds; - ExecutableSettings pool_settings; - pool_settings.max_command_execution_time = max_command_execution_time; - if (args.storage_def->settings) - pool_settings.loadFromQuery(*args.storage_def); + settings.max_command_execution_time = max_command_execution_time; + } - return StorageExecutable::create(args.table_id, script_name, script_name_with_arguments, format, input_queries, pool_settings, columns, constraints); - } - else - { - return StorageExecutable::create(args.table_id, script_name, script_name_with_arguments, format, input_queries, columns, constraints); - } + if (args.storage_def->settings) + settings.loadFromQuery(*args.storage_def); + + auto global_context = args.getContext()->getGlobalContext(); + return StorageExecutable::create(args.table_id, format, settings, input_queries, columns, constraints); }; + StorageFactory::StorageFeatures storage_features; + storage_features.supports_settings = true; + factory.registerStorage("Executable", [&](const StorageFactory::Arguments & args) { return register_storage(args, false /*is_executable_pool*/); - }); + }, storage_features); 
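// --- Illustrative aside (not part of the patch) ---------------------------------
// StorageExecutable::read above validates the script location with
// fileOrSymlinkPathStartsWith, so a symlink or a "../" component cannot point the
// server outside the user_scripts directory. A standalone sketch of such a
// containment check with std::filesystem (a simplification, not the real helper;
// it assumes `base` is passed without a trailing slash):
#include <filesystem>

namespace fs = std::filesystem;

bool pathIsInside(const fs::path & candidate, const fs::path & base)
{
    /// weakly_canonical resolves symlinks for the part of the path that exists and
    /// normalizes "." / ".." components, so "scripts/../../etc/passwd" or a symlink
    /// escaping `base` no longer looks like it starts with `base`.
    const auto canonical_candidate = fs::weakly_canonical(candidate);
    const auto canonical_base = fs::weakly_canonical(base);

    /// Compare component-wise; a plain string prefix check would wrongly accept
    /// "/scripts_evil" as being inside "/scripts".
    auto cand_it = canonical_candidate.begin();
    for (const auto & base_part : canonical_base)
    {
        if (cand_it == canonical_candidate.end() || *cand_it != base_part)
            return false;
        ++cand_it;
    }
    return true;
}
// ---------------------------------------------------------------------------------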
factory.registerStorage("ExecutablePool", [&](const StorageFactory::Arguments & args) { return register_storage(args, true /*is_executable_pool*/); - }); + }, storage_features); } }; diff --git a/src/Storages/StorageExecutable.h b/src/Storages/StorageExecutable.h index 74df17f1463..b6248abae97 100644 --- a/src/Storages/StorageExecutable.h +++ b/src/Storages/StorageExecutable.h @@ -23,7 +23,7 @@ public: String getName() const override { - if (process_pool) + if (settings.is_executable_pool) return "ExecutablePool"; else return "Executable"; @@ -42,31 +42,17 @@ protected: StorageExecutable( const StorageID & table_id, - const String & script_name_, - const std::vector & arguments_, - const String & format_, - const std::vector & input_queries_, - const ColumnsDescription & columns, - const ConstraintsDescription & constraints); - - StorageExecutable( - const StorageID & table_id, - const String & script_name_, - const std::vector & arguments_, - const String & format_, - const std::vector & input_queries_, - const ExecutableSettings & settings_, + const String & format, + const ExecutableSettings & settings, + const std::vector & input_queries, const ColumnsDescription & columns, const ConstraintsDescription & constraints); private: - String script_name; - std::vector arguments; - String format; - std::vector input_queries; ExecutableSettings settings; - std::shared_ptr process_pool; + std::vector input_queries; Poco::Logger * log; + std::unique_ptr coordinator; }; } diff --git a/src/Storages/StorageFactory.h b/src/Storages/StorageFactory.h index 20db1a44897..6ffa6327176 100644 --- a/src/Storages/StorageFactory.h +++ b/src/Storages/StorageFactory.h @@ -66,6 +66,7 @@ public: bool supports_deduplication = false; /// See also IStorage::supportsParallelInsert() bool supports_parallel_insert = false; + bool supports_schema_inference = false; AccessType source_access_type = AccessType::NONE; }; @@ -98,6 +99,7 @@ public: .supports_replication = false, .supports_deduplication = false, .supports_parallel_insert = false, + .supports_schema_inference = false, .source_access_type = AccessType::NONE, }); @@ -126,6 +128,12 @@ public: AccessType getSourceAccessType(const String & table_engine) const; + bool checkIfStorageSupportsSchemaInterface(const String & storage_name) + { + if (storages.contains(storage_name)) + return storages[storage_name].features.supports_schema_inference; + return false; + } private: Storages storages; }; diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 13a70af2ada..a479f982c70 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -15,8 +15,9 @@ #include #include -#include #include +#include +#include #include #include @@ -38,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -63,6 +65,7 @@ namespace ErrorCodes extern const int INCOMPATIBLE_COLUMNS; extern const int CANNOT_STAT; extern const int LOGICAL_ERROR; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } namespace @@ -135,6 +138,56 @@ void checkCreationIsAllowed(ContextPtr context_global, const std::string & db_di throw Exception("File must not be a directory", ErrorCodes::INCORRECT_FILE_NAME); } +std::unique_ptr createReadBuffer( + const String & current_path, + bool use_table_fd, + const String & storage_name, + int table_fd, + const String & compression_method, + ContextPtr context) +{ + std::unique_ptr nested_buffer; + CompressionMethod method; + + struct stat file_stat{}; + + if (use_table_fd) + { + /// Check if file descriptor allows 
random reads (and reading it twice). + if (0 != fstat(table_fd, &file_stat)) + throwFromErrno("Cannot stat table file descriptor, inside " + storage_name, ErrorCodes::CANNOT_STAT); + + if (S_ISREG(file_stat.st_mode)) + nested_buffer = std::make_unique(table_fd); + else + nested_buffer = std::make_unique(table_fd); + + method = chooseCompressionMethod("", compression_method); + } + else + { + /// Check if file descriptor allows random reads (and reading it twice). + if (0 != stat(current_path.c_str(), &file_stat)) + throwFromErrno("Cannot stat file " + current_path, ErrorCodes::CANNOT_STAT); + + if (S_ISREG(file_stat.st_mode)) + nested_buffer = std::make_unique(current_path, context->getSettingsRef().max_read_buffer_size); + else + nested_buffer = std::make_unique(current_path, context->getSettingsRef().max_read_buffer_size); + + method = chooseCompressionMethod(current_path, compression_method); + } + + /// For clickhouse-local add progress callback to display progress bar. + if (context->getApplicationType() == Context::ApplicationType::LOCAL) + { + auto & in = static_cast(*nested_buffer); + in.setProgressCallback(context); + } + + return wrapReadBufferWithCompressionMethod(std::move(nested_buffer), method); +} + } Strings StorageFile::getPathsList(const String & table_path, const String & user_files_path, ContextPtr context, size_t & total_bytes_to_read) @@ -164,6 +217,42 @@ Strings StorageFile::getPathsList(const String & table_path, const String & user return paths; } + +ColumnsDescription StorageFile::getTableStructureFromData( + const String & format, + const std::vector & paths, + const String & compression_method, + const std::optional & format_settings, + ContextPtr context) +{ + if (format == "Distributed") + { + if (paths.empty()) + throw Exception( + "Cannot get table structure from file, because no files match specified name", ErrorCodes::INCORRECT_FILE_NAME); + + auto source = StorageDistributedDirectoryMonitor::createSourceFromFile(paths[0]); + return ColumnsDescription(source->getOutputs().front().getHeader().getNamesAndTypesList()); + } + + auto read_buffer_creator = [&]() + { + String path; + auto it = std::find_if(paths.begin(), paths.end(), [](const String & p){ return std::filesystem::exists(p); }); + if (it == paths.end()) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, because there are no files with provided path. 
You must specify " + "table structure manually", + format); + + path = *it; + return createReadBuffer(path, false, "File", -1, compression_method, context); + }; + + return readSchemaFromFormat(format, format_settings, read_buffer_creator, context); +} + bool StorageFile::isColumnOriented() const { return format_name != "Distributed" && FormatFactory::instance().checkIfFormatIsColumnOriented(format_name); @@ -182,10 +271,13 @@ StorageFile::StorageFile(int table_fd_, CommonArguments args) throw Exception("Using file descriptor as source of storage isn't allowed for server daemons", ErrorCodes::DATABASE_ACCESS_DENIED); if (args.format_name == "Distributed") throw Exception("Distributed format is allowed only with explicit file path", ErrorCodes::INCORRECT_FILE_NAME); + if (args.columns.empty()) + throw Exception("Automatic schema inference is not allowed when using file descriptor as source of storage", ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE); is_db_table = false; use_table_fd = true; table_fd = table_fd_; + setStorageMetadata(args); } StorageFile::StorageFile(const std::string & table_path_, const std::string & user_files_path, CommonArguments args) @@ -194,22 +286,7 @@ StorageFile::StorageFile(const std::string & table_path_, const std::string & us is_db_table = false; paths = getPathsList(table_path_, user_files_path, args.getContext(), total_bytes_to_read); path_for_partitioned_write = table_path_; - - if (args.format_name == "Distributed") - { - if (paths.empty()) - throw Exception("Cannot get table structure from file, because no files match specified name", ErrorCodes::INCORRECT_FILE_NAME); - - auto & first_path = paths[0]; - Block header = StorageDistributedDirectoryMonitor::createSourceFromFile(first_path)->getOutputs().front().getHeader(); - - StorageInMemoryMetadata storage_metadata; - auto columns = ColumnsDescription(header.getNamesAndTypesList()); - if (!args.columns.empty() && columns != args.columns) - throw Exception("Table structure and file structure are different", ErrorCodes::INCOMPATIBLE_COLUMNS); - storage_metadata.setColumns(columns); - setInMemoryMetadata(storage_metadata); - } + setStorageMetadata(args); } StorageFile::StorageFile(const std::string & relative_table_dir_path, CommonArguments args) @@ -225,6 +302,8 @@ StorageFile::StorageFile(const std::string & relative_table_dir_path, CommonArgu paths = {getTablePath(table_dir_path, format_name)}; if (fs::exists(paths[0])) total_bytes_to_read = fs::file_size(paths[0]); + + setStorageMetadata(args); } StorageFile::StorageFile(CommonArguments args) @@ -233,9 +312,21 @@ StorageFile::StorageFile(CommonArguments args) , format_settings(args.format_settings) , compression_method(args.compression_method) , base_path(args.getContext()->getPath()) +{ +} + +void StorageFile::setStorageMetadata(CommonArguments args) { StorageInMemoryMetadata storage_metadata; - if (args.format_name != "Distributed") + + if (args.format_name == "Distributed" || args.columns.empty()) + { + auto columns = getTableStructureFromData(format_name, paths, compression_method, format_settings, args.getContext()); + if (!args.columns.empty() && args.columns != columns) + throw Exception("Table structure and file structure are different", ErrorCodes::INCOMPATIBLE_COLUMNS); + storage_metadata.setColumns(columns); + } + else storage_metadata.setColumns(args.columns); storage_metadata.setConstraints(args.constraints); @@ -350,46 +441,7 @@ public: } } - std::unique_ptr nested_buffer; - CompressionMethod method; - - struct stat file_stat{}; - - if 
(storage->use_table_fd) - { - /// Check if file descriptor allows random reads (and reading it twice). - if (0 != fstat(storage->table_fd, &file_stat)) - throwFromErrno("Cannot stat table file descriptor, inside " + storage->getName(), ErrorCodes::CANNOT_STAT); - - if (S_ISREG(file_stat.st_mode)) - nested_buffer = std::make_unique(storage->table_fd); - else - nested_buffer = std::make_unique(storage->table_fd); - - method = chooseCompressionMethod("", storage->compression_method); - } - else - { - /// Check if file descriptor allows random reads (and reading it twice). - if (0 != stat(current_path.c_str(), &file_stat)) - throwFromErrno("Cannot stat file " + current_path, ErrorCodes::CANNOT_STAT); - - if (S_ISREG(file_stat.st_mode)) - nested_buffer = std::make_unique(current_path, context->getSettingsRef().max_read_buffer_size); - else - nested_buffer = std::make_unique(current_path, context->getSettingsRef().max_read_buffer_size); - - method = chooseCompressionMethod(current_path, storage->compression_method); - } - - /// For clickhouse-local add progress callback to display progress bar. - if (context->getApplicationType() == Context::ApplicationType::LOCAL) - { - auto & in = static_cast(*nested_buffer); - in.setProgressCallback(context); - } - - read_buf = wrapReadBufferWithCompressionMethod(std::move(nested_buffer), method); + read_buf = createReadBuffer(current_path, storage->use_table_fd, storage->getName(), storage->table_fd, storage->compression_method, context); auto get_block_for_format = [&]() -> Block { @@ -853,7 +905,8 @@ void registerStorageFile(StorageFactory & factory) { StorageFactory::StorageFeatures storage_features{ .supports_settings = true, - .source_access_type = AccessType::FILE + .supports_schema_inference = true, + .source_access_type = AccessType::FILE, }; factory.registerStorage( diff --git a/src/Storages/StorageFile.h b/src/Storages/StorageFile.h index f48d1c285da..6b015976589 100644 --- a/src/Storages/StorageFile.h +++ b/src/Storages/StorageFile.h @@ -1,6 +1,7 @@ #pragma once #include + #include #include @@ -70,6 +71,13 @@ public: bool supportsPartitionBy() const override { return true; } + static ColumnsDescription getTableStructureFromData( + const String & format, + const std::vector & paths, + const String & compression_method, + const std::optional & format_settings, + ContextPtr context); + protected: friend class StorageFileSource; friend class StorageFileSink; @@ -86,6 +94,8 @@ protected: private: explicit StorageFile(CommonArguments args); + void setStorageMetadata(CommonArguments args); + std::string format_name; // We use format settings from global context + CREATE query for File table // function -- in this case, format_settings is set. diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index f82f9d21217..bdb7ddb744a 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -43,6 +43,7 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int SAMPLING_NOT_SUPPORTED; extern const int ALTER_OF_COLUMN_IS_FORBIDDEN; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } StorageMerge::StorageMerge( @@ -61,7 +62,7 @@ StorageMerge::StorageMerge( , database_is_regexp(database_is_regexp_) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + storage_metadata.setColumns(columns_.empty() ? 
getColumnsDescriptionFromSourceTables() : columns_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); } @@ -82,11 +83,19 @@ StorageMerge::StorageMerge( , database_is_regexp(database_is_regexp_) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + storage_metadata.setColumns(columns_.empty() ? getColumnsDescriptionFromSourceTables() : columns_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); } +ColumnsDescription StorageMerge::getColumnsDescriptionFromSourceTables() const +{ + auto table = getFirstTable([](auto && t) { return t; }); + if (!table) + throw Exception{ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "There are no tables satisfied provided regexp, you must specify table structure manually"}; + return table->getInMemoryMetadataPtr()->getColumns(); +} + template StoragePtr StorageMerge::getFirstTable(F && predicate) const { @@ -762,7 +771,6 @@ void StorageMerge::convertingSourceStream( IStorage::ColumnSizeByName StorageMerge::getColumnSizes() const { - auto first_materialized_mysql = getFirstTable([](const StoragePtr & table) { return table && table->getName() == "MaterializedMySQL"; }); if (!first_materialized_mysql) return {}; @@ -816,6 +824,9 @@ void registerStorageMerge(StorageFactory & factory) return StorageMerge::create( args.table_id, args.columns, args.comment, source_database_name_or_regexp, is_regexp, table_name_regexp, args.getContext()); + }, + { + .supports_schema_inference = true }); } diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index 56adeab9279..ad3075efd08 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -132,6 +132,8 @@ protected: static SelectQueryInfo getModifiedQueryInfo( const SelectQueryInfo & query_info, ContextPtr modified_context, const StorageID & current_storage_id, bool is_merge_engine); + + ColumnsDescription getColumnsDescriptionFromSourceTables() const; }; } diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 03ac27d0e46..11815d9ceef 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -229,7 +229,7 @@ std::optional StorageMergeTree::totalRows(const Settings &) const std::optional StorageMergeTree::totalRowsByPartitionPredicate(const SelectQueryInfo & query_info, ContextPtr local_context) const { - auto parts = getDataPartsVector({DataPartState::Committed}); + auto parts = getDataPartsVector({DataPartState::Active}); return totalRowsByPartitionPredicateImpl(query_info, local_context, parts); } @@ -1294,7 +1294,7 @@ MergeTreeDataPartPtr StorageMergeTree::outdatePart(const String & part_name, boo { /// Forcefully stop merges and make part outdated auto merge_blocker = stopMergesAndWait(); - auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Committed}); + auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active}); if (!part) throw Exception("Part " + part_name + " not found, won't try to drop it.", ErrorCodes::NO_SUCH_DATA_PART); removePartsFromWorkingSet({part}, true); @@ -1306,7 +1306,7 @@ MergeTreeDataPartPtr StorageMergeTree::outdatePart(const String & part_name, boo /// Wait merges selector std::unique_lock lock(currently_processing_in_background_mutex); - auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Committed}); + auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active}); /// It's okay, part was already removed if (!part) return nullptr; @@ -1344,7 +1344,7 @@ void 
StorageMergeTree::dropPartition(const ASTPtr & partition, bool detach, Cont /// This protects against "revival" of data for a removed partition after completion of merge. auto merge_blocker = stopMergesAndWait(); String partition_id = getPartitionIDFromQuery(partition, local_context); - parts_to_remove = getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, partition_id); + parts_to_remove = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); /// TODO should we throw an exception if parts_to_remove is empty? removePartsFromWorkingSet(parts_to_remove, true); @@ -1426,7 +1426,7 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con MergeTreeData & src_data = checkStructureAndGetMergeTreeData(source_table, source_metadata_snapshot, my_metadata_snapshot); String partition_id = getPartitionIDFromQuery(partition, local_context); - DataPartsVector src_parts = src_data.getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, partition_id); + DataPartsVector src_parts = src_data.getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); MutableDataPartsVector dst_parts; static const String TMP_PREFIX = "tmp_replace_from_"; @@ -1511,7 +1511,7 @@ void StorageMergeTree::movePartitionToTable(const StoragePtr & dest_table, const MergeTreeData & src_data = dest_table_storage->checkStructureAndGetMergeTreeData(*this, metadata_snapshot, dest_metadata_snapshot); String partition_id = getPartitionIDFromQuery(partition, local_context); - DataPartsVector src_parts = src_data.getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, partition_id); + DataPartsVector src_parts = src_data.getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); MutableDataPartsVector dst_parts; static const String TMP_PREFIX = "tmp_move_from_"; @@ -1591,7 +1591,7 @@ CheckResults StorageMergeTree::checkData(const ASTPtr & query, ContextPtr local_ if (const auto & check_query = query->as(); check_query.partition) { String partition_id = getPartitionIDFromQuery(check_query.partition, local_context); - data_parts = getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, partition_id); + data_parts = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); } else data_parts = getDataPartsVector(); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index a1f82e14868..91a9c8567ba 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -3,7 +3,6 @@ #include "Common/hex.h" #include #include -#include #include #include #include @@ -20,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -35,7 +33,6 @@ #include -#include #include #include @@ -45,7 +42,6 @@ #include #include #include -#include #include #include @@ -68,7 +64,6 @@ #include -#include #include #include @@ -194,56 +189,6 @@ zkutil::ZooKeeperPtr StorageReplicatedMergeTree::getZooKeeper() const return res; } -static std::string normalizeZooKeeperPath(std::string zookeeper_path, bool check_starts_with_slash, Poco::Logger * log = nullptr) -{ - if (!zookeeper_path.empty() && zookeeper_path.back() == '/') - zookeeper_path.resize(zookeeper_path.size() - 1); - /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. 
- if (!zookeeper_path.empty() && zookeeper_path.front() != '/') - { - /// Do not allow this for new tables, print warning for tables created in old versions - if (check_starts_with_slash) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "ZooKeeper path must starts with '/', got '{}'", zookeeper_path); - if (log) - LOG_WARNING(log, "ZooKeeper path ('{}') does not start with '/'. It will not be supported in future releases"); - zookeeper_path = "/" + zookeeper_path; - } - - return zookeeper_path; -} - -static String extractZooKeeperName(const String & path) -{ - static constexpr auto default_zookeeper_name = "default"; - if (path.empty()) - throw Exception("ZooKeeper path should not be empty", ErrorCodes::BAD_ARGUMENTS); - if (path[0] == '/') - return default_zookeeper_name; - auto pos = path.find(":/"); - if (pos != String::npos && pos < path.find('/')) - { - auto zookeeper_name = path.substr(0, pos); - if (zookeeper_name.empty()) - throw Exception("Zookeeper path should start with '/' or ':/'", ErrorCodes::BAD_ARGUMENTS); - return zookeeper_name; - } - return default_zookeeper_name; -} - -static String extractZooKeeperPath(const String & path, bool check_starts_with_slash, Poco::Logger * log = nullptr) -{ - if (path.empty()) - throw Exception("ZooKeeper path should not be empty", ErrorCodes::BAD_ARGUMENTS); - if (path[0] == '/') - return normalizeZooKeeperPath(path, check_starts_with_slash, log); - auto pos = path.find(":/"); - if (pos != String::npos && pos < path.find('/')) - { - return normalizeZooKeeperPath(path.substr(pos + 1, String::npos), check_starts_with_slash, log); - } - return normalizeZooKeeperPath(path, check_starts_with_slash, log); -} - static MergeTreePartInfo makeDummyDropRangeForMovePartitionOrAttachPartitionFrom(const String & partition_id) { /// NOTE We don't have special log entry type for MOVE PARTITION/ATTACH PARTITION FROM, @@ -287,8 +232,8 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( true, /// require_part_metadata attach, [this] (const std::string & name) { enqueuePartForCheck(name); }) - , zookeeper_name(extractZooKeeperName(zookeeper_path_)) - , zookeeper_path(extractZooKeeperPath(zookeeper_path_, /* check_starts_with_slash */ !attach, log)) + , zookeeper_name(zkutil::extractZooKeeperName(zookeeper_path_)) + , zookeeper_path(zkutil::extractZooKeeperPath(zookeeper_path_, /* check_starts_with_slash */ !attach, log)) , replica_name(replica_name_) , replica_path(fs::path(zookeeper_path) / "replicas" / replica_name_) , reader(*this) @@ -497,6 +442,8 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( createNewZooKeeperNodes(); syncPinnedPartUUIDs(); + + createTableSharedID(); } @@ -1175,8 +1122,8 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks) /// Parts in ZK. NameSet expected_parts(expected_parts_vec.begin(), expected_parts_vec.end()); - /// There are no PreCommitted parts at startup. - auto parts = getDataParts({MergeTreeDataPartState::Committed, MergeTreeDataPartState::Outdated}); + /// There are no PreActive parts at startup. + auto parts = getDataParts({MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}); /** Local parts that are not in ZK. * In very rare cases they may cover missing parts @@ -1515,9 +1462,9 @@ bool StorageReplicatedMergeTree::executeLogEntry(LogEntry & entry) if (is_get_or_attach || entry.type == LogEntry::MERGE_PARTS || entry.type == LogEntry::MUTATE_PART) { /// If we already have this part or a part covering it, we do not need to do anything. 
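// --- Illustrative aside (not part of the patch) ---------------------------------
// The functions removed above are not deleted but moved into zkutil; their logic is
// fully visible here: a path like "zoo2:/clickhouse/tables/t1" selects the auxiliary
// ZooKeeper "zoo2", while a path starting with '/' uses the default one. A standalone
// sketch of that split (function name and return type are illustrative):
#include <stdexcept>
#include <string>
#include <utility>

std::pair<std::string, std::string> splitZooKeeperPath(const std::string & path)
{
    if (path.empty())
        throw std::invalid_argument("ZooKeeper path should not be empty");

    if (path[0] == '/')
        return {"default", path};  /// no prefix -> default ZooKeeper

    const auto pos = path.find(":/");
    if (pos != std::string::npos && pos < path.find('/'))
    {
        auto zookeeper_name = path.substr(0, pos);
        if (zookeeper_name.empty())
            throw std::invalid_argument("ZooKeeper path should start with '/' or '<name>:/'");
        return {zookeeper_name, path.substr(pos + 1)};  /// keep the leading '/' of the path
    }

    return {"default", path};
}
// ---------------------------------------------------------------------------------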
- /// The part may be still in the PreCommitted -> Committed transition so we first search - /// among PreCommitted parts to definitely find the desired part if it exists. - DataPartPtr existing_part = getPartIfExists(entry.new_part_name, {MergeTreeDataPartState::PreCommitted}); + /// The part may be still in the PreActive -> Active transition so we first search + /// among PreActive parts to definitely find the desired part if it exists. + DataPartPtr existing_part = getPartIfExists(entry.new_part_name, {MergeTreeDataPartState::PreActive}); if (!existing_part) existing_part = getActiveContainingPart(entry.new_part_name); @@ -1958,7 +1905,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) for (const PartDescriptionPtr & part_desc : all_parts) { - if (!getActiveContainingPart(part_desc->new_part_info, MergeTreeDataPartState::Committed, data_parts_lock)) + if (!getActiveContainingPart(part_desc->new_part_info, MergeTreeDataPartState::Active, data_parts_lock)) parts_to_add.emplace_back(part_desc); } @@ -2016,7 +1963,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations); DataPartStates valid_states{ - MergeTreeDataPartState::PreCommitted, MergeTreeDataPartState::Committed, MergeTreeDataPartState::Outdated}; + MergeTreeDataPartState::PreActive, MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}; size_t num_clonable_parts = 0; for (PartDescriptionPtr & part_desc : parts_to_add) @@ -3342,7 +3289,7 @@ void StorageReplicatedMergeTree::removePartAndEnqueueFetch(const String & part_n /// It's quite dangerous, so clone covered parts to detached. auto broken_part_info = MergeTreePartInfo::fromPartName(part_name, format_version); - auto partition_range = getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, broken_part_info.partition_id); + auto partition_range = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, broken_part_info.partition_id); for (const auto & part : partition_range) { if (!broken_part_info.contains(part->info)) @@ -4274,7 +4221,7 @@ Pipe StorageReplicatedMergeTree::read( template -void StorageReplicatedMergeTree::foreachCommittedParts(Func && func, bool select_sequential_consistency) const +void StorageReplicatedMergeTree::foreachActiveParts(Func && func, bool select_sequential_consistency) const { std::optional max_added_blocks = {}; @@ -4285,7 +4232,7 @@ void StorageReplicatedMergeTree::foreachCommittedParts(Func && func, bool select max_added_blocks = getMaxAddedBlocks(); auto lock = lockParts(); - for (const auto & part : getDataPartsStateRange(DataPartState::Committed)) + for (const auto & part : getDataPartsStateRange(DataPartState::Active)) { if (part->isEmpty()) continue; @@ -4304,21 +4251,21 @@ void StorageReplicatedMergeTree::foreachCommittedParts(Func && func, bool select std::optional StorageReplicatedMergeTree::totalRows(const Settings & settings) const { UInt64 res = 0; - foreachCommittedParts([&res](auto & part) { res += part->rows_count; }, settings.select_sequential_consistency); + foreachActiveParts([&res](auto & part) { res += part->rows_count; }, settings.select_sequential_consistency); return res; } std::optional StorageReplicatedMergeTree::totalRowsByPartitionPredicate(const SelectQueryInfo & query_info, ContextPtr local_context) const { DataPartsVector parts; - foreachCommittedParts([&](auto & part) { parts.push_back(part); }, 
local_context->getSettingsRef().select_sequential_consistency); + foreachActiveParts([&](auto & part) { parts.push_back(part); }, local_context->getSettingsRef().select_sequential_consistency); return totalRowsByPartitionPredicateImpl(query_info, local_context, parts); } std::optional StorageReplicatedMergeTree::totalBytes(const Settings & settings) const { UInt64 res = 0; - foreachCommittedParts([&res](auto & part) { res += part->getBytesOnDisk(); }, settings.select_sequential_consistency); + foreachActiveParts([&res](auto & part) { res += part->getBytesOnDisk(); }, settings.select_sequential_consistency); return res; } @@ -4863,12 +4810,12 @@ void StorageReplicatedMergeTree::restoreMetadataInZooKeeper() const DataPartsVector all_parts = getAllDataPartsVector(); Strings active_parts_names; - /// Why all parts (not only Committed) are moved to detached/: + /// Why all parts (not only Active) are moved to detached/: /// After ZK metadata restoration ZK resets sequential counters (including block number counters), so one may /// potentially encounter a situation that a part we want to attach already exists. for (const auto & part : all_parts) { - if (part->getState() == DataPartState::Committed) + if (part->getState() == DataPartState::Active) active_parts_names.push_back(part->name); forgetPartAndMoveToDetached(part); @@ -5561,8 +5508,8 @@ void StorageReplicatedMergeTree::fetchPartition( info.table_id = getStorageID(); info.table_id.uuid = UUIDHelpers::Nil; auto expand_from = query_context->getMacros()->expand(from_, info); - String auxiliary_zookeeper_name = extractZooKeeperName(expand_from); - String from = extractZooKeeperPath(expand_from, /* check_starts_with_slash */ true); + String auxiliary_zookeeper_name = zkutil::extractZooKeeperName(expand_from); + String from = zkutil::extractZooKeeperPath(expand_from, /* check_starts_with_slash */ true); if (from.empty()) throw Exception("ZooKeeper path should not be empty", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); @@ -6216,7 +6163,7 @@ void StorageReplicatedMergeTree::replacePartitionFrom( String partition_id = getPartitionIDFromQuery(partition, query_context); /// NOTE: Some covered parts may be missing in src_all_parts if corresponding log entries are not executed yet. 
- DataPartsVector src_all_parts = src_data.getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, partition_id); + DataPartsVector src_all_parts = src_data.getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); LOG_DEBUG(log, "Cloning {} parts", src_all_parts.size()); @@ -6638,7 +6585,7 @@ void StorageReplicatedMergeTree::movePartitionToShard( if (!move_part) throw Exception("MOVE PARTITION TO SHARD is not supported, use MOVE PART instead", ErrorCodes::NOT_IMPLEMENTED); - if (normalizeZooKeeperPath(zookeeper_path, /* check_starts_with_slash */ true) == normalizeZooKeeperPath(to, /* check_starts_with_slash */ true)) + if (zkutil::normalizeZooKeeperPath(zookeeper_path, /* check_starts_with_slash */ true) == zkutil::normalizeZooKeeperPath(to, /* check_starts_with_slash */ true)) throw Exception("Source and destination are the same", ErrorCodes::BAD_ARGUMENTS); auto zookeeper = getZooKeeper(); @@ -6646,7 +6593,7 @@ void StorageReplicatedMergeTree::movePartitionToShard( String part_name = partition->as().value.safeGet(); auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version); - auto part = getPartIfExists(part_info, {MergeTreeDataPartState::Committed}); + auto part = getPartIfExists(part_info, {MergeTreeDataPartState::Active}); if (!part) throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "Part {} not found locally", part_name); @@ -6864,7 +6811,7 @@ bool StorageReplicatedMergeTree::dropPartImpl( { ReplicatedMergeTreeMergePredicate merge_pred = queue.getMergePredicate(zookeeper); - auto part = getPartIfExists(part_info, {MergeTreeDataPartState::Committed}); + auto part = getPartIfExists(part_info, {MergeTreeDataPartState::Active}); if (!part) { @@ -7039,7 +6986,7 @@ CheckResults StorageReplicatedMergeTree::checkData(const ASTPtr & query, Context if (const auto & check_query = query->as(); check_query.partition) { String partition_id = getPartitionIDFromQuery(check_query.partition, local_context); - data_parts = getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, partition_id); + data_parts = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); } else data_parts = getDataPartsVector(); @@ -7080,12 +7027,53 @@ void StorageReplicatedMergeTree::startBackgroundMovesIfNeeded() background_moves_assignee.start(); } + std::unique_ptr StorageReplicatedMergeTree::getDefaultSettings() const { return std::make_unique(getContext()->getReplicatedMergeTreeSettings()); } +String StorageReplicatedMergeTree::getTableSharedID() const +{ + return toString(table_shared_id); +} + + +void StorageReplicatedMergeTree::createTableSharedID() +{ + if (table_shared_id != UUIDHelpers::Nil) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Table shared id already initialized"); + + zkutil::ZooKeeperPtr zookeeper = getZooKeeper(); + String zookeeper_table_id_path = fs::path(zookeeper_path) / "table_shared_id"; + String id; + if (!zookeeper->tryGet(zookeeper_table_id_path, id)) + { + UUID table_id_candidate; + auto storage_id = getStorageID(); + if (storage_id.uuid != UUIDHelpers::Nil) + table_id_candidate = storage_id.uuid; + else + table_id_candidate = UUIDHelpers::generateV4(); + + id = toString(table_id_candidate); + + auto code = zookeeper->tryCreate(zookeeper_table_id_path, id, zkutil::CreateMode::Persistent); + if (code == Coordination::Error::ZNODEEXISTS) + { /// Another replica created the node earlier + id = zookeeper->get(zookeeper_table_id_path); + } + else if (code != Coordination::Error::ZOK) + { + throw zkutil::KeeperException(code, 
zookeeper_table_id_path); + } + } + + table_shared_id = parseFromString(id); +} + + void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part) const { if (!part.volume) @@ -7093,7 +7081,6 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part) DiskPtr disk = part.volume->getDisk(); if (!disk || !disk->supportZeroCopyReplication()) return; - String zero_copy = fmt::format("zero_copy_{}", toString(disk->getType())); zkutil::ZooKeeperPtr zookeeper = tryGetZooKeeper(); if (!zookeeper) @@ -7102,73 +7089,100 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part) String id = part.getUniqueId(); boost::replace_all(id, "/", "_"); - String zookeeper_node = fs::path(zookeeper_path) / zero_copy / "shared" / part.name / id / replica_name; - - LOG_TRACE(log, "Set zookeeper lock {}", zookeeper_node); - - /// In rare case other replica can remove path between createAncestors and createIfNotExists - /// So we make up to 5 attempts - for (int attempts = 5; attempts > 0; --attempts) + Strings zc_zookeeper_paths = getZeroCopyPartPath(*getSettings(), disk->getType(), getTableSharedID(), + part.name, zookeeper_path); + for (const auto & zc_zookeeper_path : zc_zookeeper_paths) { - try - { - zookeeper->createAncestors(zookeeper_node); - zookeeper->createIfNotExists(zookeeper_node, "lock"); - break; - } - catch (const zkutil::KeeperException & e) - { - if (e.code == Coordination::Error::ZNONODE) - continue; - throw; - } + String zookeeper_node = fs::path(zc_zookeeper_path) / id / replica_name; + + LOG_TRACE(log, "Set zookeeper lock {}", zookeeper_node); + createZeroCopyLockNode(zookeeper, zookeeper_node); } } bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const +{ + return unlockSharedData(part, part.name); +} + + +bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part, const String & name) const { if (!part.volume) return true; DiskPtr disk = part.volume->getDisk(); if (!disk || !disk->supportZeroCopyReplication()) return true; - String zero_copy = fmt::format("zero_copy_{}", toString(disk->getType())); zkutil::ZooKeeperPtr zookeeper = tryGetZooKeeper(); if (!zookeeper) return true; - String id = part.getUniqueId(); + auto ref_count = part.getNumberOfRefereneces(); + if (ref_count > 0) /// Keep part shard info for frozen backups + return false; + + return unlockSharedDataByID(part.getUniqueId(), getTableSharedID(), name, replica_name, disk, zookeeper, *getSettings(), log, + zookeeper_path); +} + + +bool StorageReplicatedMergeTree::unlockSharedDataByID(String id, const String & table_uuid, const String & part_name, + const String & replica_name_, DiskPtr disk, zkutil::ZooKeeperPtr zookeeper_ptr, const MergeTreeSettings & settings, + Poco::Logger * logger, const String & zookeeper_path_old) +{ boost::replace_all(id, "/", "_"); - String zookeeper_part_node = fs::path(zookeeper_path) / zero_copy / "shared" / part.name; - String zookeeper_part_uniq_node = fs::path(zookeeper_part_node) / id; - String zookeeper_node = fs::path(zookeeper_part_uniq_node) / replica_name; + Strings zc_zookeeper_paths = getZeroCopyPartPath(settings, disk->getType(), table_uuid, part_name, zookeeper_path_old); - LOG_TRACE(log, "Remove zookeeper lock {}", zookeeper_node); + bool res = true; - zookeeper->tryRemove(zookeeper_node); - - Strings children; - zookeeper->tryGetChildren(zookeeper_part_uniq_node, children); - - if (!children.empty()) + for (const auto & zc_zookeeper_path : zc_zookeeper_paths) { - 
LOG_TRACE(log, "Found zookeper locks for {}", zookeeper_part_uniq_node); - return false; + String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / id; + String zookeeper_node = fs::path(zookeeper_part_uniq_node) / replica_name_; + + LOG_TRACE(logger, "Remove zookeeper lock {}", zookeeper_node); + + zookeeper_ptr->tryRemove(zookeeper_node); + + Strings children; + zookeeper_ptr->tryGetChildren(zookeeper_part_uniq_node, children); + + if (!children.empty()) + { + LOG_TRACE(logger, "Found zookeper locks for {}", zookeeper_part_uniq_node); + res = false; + continue; + } + + auto e = zookeeper_ptr->tryRemove(zookeeper_part_uniq_node); + + LOG_TRACE(logger, "Remove parent zookeeper lock {} : {}", zookeeper_part_uniq_node, e != Coordination::Error::ZNOTEMPTY); + + /// Even when we have lock with same part name, but with different uniq, we can remove files on S3 + children.clear(); + String zookeeper_part_node = fs::path(zookeeper_part_uniq_node).parent_path(); + zookeeper_ptr->tryGetChildren(zookeeper_part_node, children); + if (children.empty()) + { + /// Cleanup after last uniq removing + e = zookeeper_ptr->tryRemove(zookeeper_part_node); + + LOG_TRACE(logger, "Remove parent zookeeper lock {} : {}", zookeeper_part_node, e != Coordination::Error::ZNOTEMPTY); + } + else + { + LOG_TRACE(logger, "Can't remove parent zookeeper lock {} : {}", zookeeper_part_node, children.size()); + for (auto & c : children) + { + LOG_TRACE(logger, "Child node {}", c); + } + } } - zookeeper->tryRemove(zookeeper_part_uniq_node); - - /// Even when we have lock with same part name, but with different uniq, we can remove files on S3 - children.clear(); - zookeeper->tryGetChildren(zookeeper_part_node, children); - if (children.empty()) - /// Cleanup after last uniq removing - zookeeper->tryRemove(zookeeper_part_node); - - return true; + return res; } @@ -7201,20 +7215,24 @@ String StorageReplicatedMergeTree::getSharedDataReplica( if (!zookeeper) return best_replica; - String zero_copy = fmt::format("zero_copy_{}", toString(disk_type)); - String zookeeper_part_node = fs::path(zookeeper_path) / zero_copy / "shared" / part.name; + Strings zc_zookeeper_paths = getZeroCopyPartPath(*getSettings(), disk_type, getTableSharedID(), part.name, + zookeeper_path); - Strings ids; - zookeeper->tryGetChildren(zookeeper_part_node, ids); + std::set replicas; - Strings replicas; - for (const auto & id : ids) + for (const auto & zc_zookeeper_path : zc_zookeeper_paths) { - String zookeeper_part_uniq_node = fs::path(zookeeper_part_node) / id; - Strings id_replicas; - zookeeper->tryGetChildren(zookeeper_part_uniq_node, id_replicas); - LOG_TRACE(log, "Found zookeper replicas for {}: {}", zookeeper_part_uniq_node, id_replicas.size()); - replicas.insert(replicas.end(), id_replicas.begin(), id_replicas.end()); + Strings ids; + zookeeper->tryGetChildren(zc_zookeeper_path, ids); + + for (const auto & id : ids) + { + String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / id; + Strings id_replicas; + zookeeper->tryGetChildren(zookeeper_part_uniq_node, id_replicas); + LOG_TRACE(log, "Found zookeper replicas for {}: {}", zookeeper_part_uniq_node, id_replicas.size()); + replicas.insert(id_replicas.begin(), id_replicas.end()); + } } LOG_TRACE(log, "Found zookeper replicas for part {}: {}", part.name, replicas.size()); @@ -7267,24 +7285,45 @@ String StorageReplicatedMergeTree::getSharedDataReplica( return best_replica; } -String StorageReplicatedMergeTree::findReplicaHavingPart( - const String & part_name, const String & zookeeper_path_, 
zkutil::ZooKeeper::Ptr zookeeper_) + +Strings StorageReplicatedMergeTree::getZeroCopyPartPath(const MergeTreeSettings & settings, DiskType disk_type, const String & table_uuid, + const String & part_name, const String & zookeeper_path_old) { - Strings replicas = zookeeper_->getChildren(fs::path(zookeeper_path_) / "replicas"); + Strings res; + + String zero_copy = fmt::format("zero_copy_{}", toString(disk_type)); + + String new_path = fs::path(settings.remote_fs_zero_copy_zookeeper_path.toString()) / zero_copy / table_uuid / part_name; + res.push_back(new_path); + if (settings.remote_fs_zero_copy_path_compatible_mode && !zookeeper_path_old.empty()) + { /// Compatibility mode for cluster with old and new versions + String old_path = fs::path(zookeeper_path_old) / zero_copy / "shared" / part_name; + res.push_back(old_path); + } + + return res; +} + + +String StorageReplicatedMergeTree::findReplicaHavingPart( + const String & part_name, const String & zookeeper_path_, zkutil::ZooKeeper::Ptr zookeeper_ptr) +{ + Strings replicas = zookeeper_ptr->getChildren(fs::path(zookeeper_path_) / "replicas"); /// Select replicas in uniformly random order. std::shuffle(replicas.begin(), replicas.end(), thread_local_rng); for (const String & replica : replicas) { - if (zookeeper_->exists(fs::path(zookeeper_path_) / "replicas" / replica / "parts" / part_name) - && zookeeper_->exists(fs::path(zookeeper_path_) / "replicas" / replica / "is_active")) + if (zookeeper_ptr->exists(fs::path(zookeeper_path_) / "replicas" / replica / "parts" / part_name) + && zookeeper_ptr->exists(fs::path(zookeeper_path_) / "replicas" / replica / "is_active")) return fs::path(zookeeper_path_) / "replicas" / replica; } return {}; } + bool StorageReplicatedMergeTree::checkIfDetachedPartExists(const String & part_name) { fs::directory_iterator dir_end; @@ -7295,6 +7334,7 @@ bool StorageReplicatedMergeTree::checkIfDetachedPartExists(const String & part_n return false; } + bool StorageReplicatedMergeTree::checkIfDetachedPartitionExists(const String & partition_name) { fs::directory_iterator dir_end; @@ -7485,4 +7525,180 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP return true; } + +void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node) +{ + /// In rare case other replica can remove path between createAncestors and createIfNotExists + /// So we make up to 5 attempts + + for (int attempts = 5; attempts > 0; --attempts) + { + try + { + zookeeper->createAncestors(zookeeper_node); + zookeeper->createIfNotExists(zookeeper_node, "lock"); + break; + } + catch (const zkutil::KeeperException & e) + { + if (e.code == Coordination::Error::ZNONODE) + continue; + throw; + } + } +} + + +namespace +{ + +/// Special metadata used during freeze table. Required for zero-copy +/// replication. 
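+/// For illustration only (not part of the original change): with the fields written by
+/// FreezeMetaData::save() below, the frozen_metadata.txt file looks roughly like this
+/// (the replica name, zookeeper name and UUID are hypothetical example values):
+///   1                                      <- version
+///   1                                      <- is_replicated
+///   1                                      <- is_remote
+///   r1                                     <- replica_name
+///   default                                <- zookeeper_name
+///   1f9bd1f6-5303-4fd0-a582-71d4cbbd305d   <- table_shared_id
+/// Each value is written on its own line, as save() and load() below show.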
+struct FreezeMetaData +{ +public: + void fill(const StorageReplicatedMergeTree & storage) + { + is_replicated = storage.supportsReplication(); + is_remote = storage.isRemote(); + replica_name = storage.getReplicaName(); + zookeeper_name = storage.getZooKeeperName(); + table_shared_id = storage.getTableSharedID(); + } + + void save(DiskPtr disk, const String & path) const + { + auto file_path = getFileName(path); + auto buffer = disk->writeMetaFile(file_path, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite); + writeIntText(version, *buffer); + buffer->write("\n", 1); + writeBoolText(is_replicated, *buffer); + buffer->write("\n", 1); + writeBoolText(is_remote, *buffer); + buffer->write("\n", 1); + writeString(replica_name, *buffer); + buffer->write("\n", 1); + writeString(zookeeper_name, *buffer); + buffer->write("\n", 1); + writeString(table_shared_id, *buffer); + buffer->write("\n", 1); + } + + bool load(DiskPtr disk, const String & path) + { + auto file_path = getFileName(path); + if (!disk->exists(file_path)) + return false; + auto buffer = disk->readMetaFile(file_path, ReadSettings(), {}); + readIntText(version, *buffer); + if (version != 1) + { + LOG_ERROR(&Poco::Logger::get("FreezeMetaData"), "Unknown freezed metadata version: {}", version); + return false; + } + DB::assertChar('\n', *buffer); + readBoolText(is_replicated, *buffer); + DB::assertChar('\n', *buffer); + readBoolText(is_remote, *buffer); + DB::assertChar('\n', *buffer); + readString(replica_name, *buffer); + DB::assertChar('\n', *buffer); + readString(zookeeper_name, *buffer); + DB::assertChar('\n', *buffer); + readString(table_shared_id, *buffer); + DB::assertChar('\n', *buffer); + return true; + } + + static void clean(DiskPtr disk, const String & path) + { + disk->removeMetaFileIfExists(getFileName(path)); + } + +private: + static String getFileName(const String & path) + { + return fs::path(path) / "frozen_metadata.txt"; + } + +public: + int version = 1; + bool is_replicated; + bool is_remote; + String replica_name; + String zookeeper_name; + String table_shared_id; +}; + +} + +bool StorageReplicatedMergeTree::removeDetachedPart(DiskPtr disk, const String & path, const String & part_name, bool is_freezed) +{ + if (disk->supportZeroCopyReplication()) + { + if (is_freezed) + { + FreezeMetaData meta; + if (meta.load(disk, path)) + { + FreezeMetaData::clean(disk, path); + return removeSharedDetachedPart(disk, path, part_name, meta.table_shared_id, meta.zookeeper_name, meta.replica_name, ""); + } + } + else + { + String table_id = getTableSharedID(); + + return removeSharedDetachedPart(disk, path, part_name, table_id, zookeeper_name, replica_name, zookeeper_path); + } + } + + disk->removeRecursive(path); + + return false; +} + + +bool StorageReplicatedMergeTree::removeSharedDetachedPart(DiskPtr disk, const String & path, const String & part_name, const String & table_uuid, + const String &, const String & detached_replica_name, const String & detached_zookeeper_path) +{ + bool keep_shared = false; + + zkutil::ZooKeeperPtr zookeeper = getZooKeeper(); + + if (zookeeper) + { + fs::path checksums = fs::path(path) / "checksums.txt"; + if (disk->exists(checksums)) + { + auto ref_count = disk->getRefCount(checksums); + if (ref_count == 0) + { + String id = disk->getUniqueId(checksums); + keep_shared = !StorageReplicatedMergeTree::unlockSharedDataByID(id, table_uuid, part_name, + detached_replica_name, disk, zookeeper, getContext()->getReplicatedMergeTreeSettings(), log, + detached_zookeeper_path); + } + else + keep_shared = true; + 
} + } + + disk->removeSharedRecursive(path, keep_shared); + + return keep_shared; +} + + +void StorageReplicatedMergeTree::createAndStoreFreezeMetadata(DiskPtr disk, DataPartPtr, String backup_part_path) const +{ + if (disk->supportZeroCopyReplication()) + { + FreezeMetaData meta; + meta.fill(*this); + meta.save(disk, backup_part_path); + } +} + + } diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 6861d89f070..e390a0bcea4 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -236,6 +237,16 @@ public: /// Return false if data is still used by another node bool unlockSharedData(const IMergeTreeDataPart & part) const override; + /// Remove lock with old name for shared data part after rename + bool unlockSharedData(const IMergeTreeDataPart & part, const String & name) const override; + + /// Unlock shared data part in zookeeper by part id + /// Return true if data unlocked + /// Return false if data is still used by another node + static bool unlockSharedDataByID(String id, const String & table_uuid, const String & part_name, const String & replica_name_, + DiskPtr disk, zkutil::ZooKeeperPtr zookeeper_, const MergeTreeSettings & settings, Poco::Logger * logger, + const String & zookeeper_path_old); + /// Fetch part only if some replica has it on shared storage like S3 bool tryToFetchIfShared(const IMergeTreeDataPart & part, const DiskPtr & disk, const String & path) override; @@ -245,7 +256,7 @@ public: inline String getReplicaName() const { return replica_name; } /// Restores table metadata if ZooKeeper lost it. - /// Used only on restarted readonly replicas (not checked). All active (Committed) parts are moved to detached/ + /// Used only on restarted readonly replicas (not checked). All active (Active) parts are moved to detached/ /// folder and attached. Parts in all other states are just moved to detached/ folder. void restoreMetadataInZooKeeper(); @@ -263,6 +274,14 @@ public: bool createEmptyPartInsteadOfLost(zkutil::ZooKeeperPtr zookeeper, const String & lost_part_name); + // Return default or custom zookeeper name for table + String getZooKeeperName() const { return zookeeper_name; } + + // Return table id, common for different replicas + String getTableSharedID() const; + + static const String getDefaultZooKeeperName() { return default_zookeeper_name; } + private: std::atomic_bool are_restoring_replica {false}; @@ -391,8 +410,11 @@ private: ThrottlerPtr replicated_fetches_throttler; ThrottlerPtr replicated_sends_throttler; + /// Global ID, synced via ZooKeeper between replicas + UUID table_shared_id; + template - void foreachCommittedParts(Func && func, bool select_sequential_consistency) const; + void foreachActiveParts(Func && func, bool select_sequential_consistency) const; /** Creates the minimum set of nodes in ZooKeeper and create first replica. * Returns true if was created, false if exists. 
@@ -436,7 +458,7 @@ private: String getChecksumsForZooKeeper(const MergeTreeDataPartChecksums & checksums) const; - /// Accepts a PreCommitted part, atomically checks its checksums with ones on other replicas and commit the part + /// Accepts a PreActive part, atomically checks its checksums with ones on other replicas and commit the part DataPartsVector checkPartChecksumsAndCommit(Transaction & transaction, const DataPartPtr & part); bool partIsAssignedToBackgroundOperation(const DataPartPtr & part) const override; @@ -720,6 +742,22 @@ private: PartitionBlockNumbersHolder allocateBlockNumbersInAffectedPartitions( const MutationCommands & commands, ContextPtr query_context, const zkutil::ZooKeeperPtr & zookeeper) const; + static Strings getZeroCopyPartPath(const MergeTreeSettings & settings, DiskType disk_type, const String & table_uuid, + const String & part_name, const String & zookeeper_path_old); + + static void createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node); + + bool removeDetachedPart(DiskPtr disk, const String & path, const String & part_name, bool is_freezed) override; + + bool removeSharedDetachedPart(DiskPtr disk, const String & path, const String & part_name, const String & table_uuid, + const String & zookeeper_name, const String & replica_name, const String & zookeeper_path); + + /// Create freeze metadata for table and save in zookeeper. Required only if zero-copy replication enabled. + void createAndStoreFreezeMetadata(DiskPtr disk, DataPartPtr part, String backup_part_path) const override; + + // Create table id if needed + void createTableSharedID(); + protected: /** If not 'attach', either creates a new table in ZK, or adds a replica to an existing table. */ diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 552aa1caa6b..3d988472b54 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -3,7 +3,6 @@ #if USE_AWS_S3 -#include #include #include @@ -25,9 +24,9 @@ #include #include -#include #include +#include #include #include @@ -70,6 +69,7 @@ namespace ErrorCodes extern const int S3_ERROR; extern const int UNEXPECTED_EXPRESSION; extern const int CANNOT_OPEN_FILE; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } class IOutputFormat; @@ -222,6 +222,13 @@ StorageS3Source::StorageS3Source( } +void StorageS3Source::onCancel() +{ + if (reader) + reader->cancel(); +} + + bool StorageS3Source::initialize() { String current_key = (*file_iterator)(); @@ -473,13 +480,39 @@ StorageS3::StorageS3( { context_->getGlobalContext()->getRemoteHostFilter().checkURL(uri_.uri); StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + + updateClientAndAuthSettings(context_, client_auth); + if (columns_.empty()) + { + auto columns = getTableStructureFromDataImpl(format_name, client_auth, max_single_read_retries_, compression_method, distributed_processing_, format_settings, context_); + storage_metadata.setColumns(columns); + } + else + storage_metadata.setColumns(columns_); + storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); - updateClientAndAuthSettings(context_, client_auth); } +std::shared_ptr StorageS3::createFileIterator(const ClientAuthentication & client_auth, bool distributed_processing, ContextPtr local_context) +{ + std::shared_ptr iterator_wrapper{nullptr}; + if (distributed_processing) + { + return std::make_shared( + [callback = local_context->getReadTaskCallback()]() -> String { + return 
callback(); + }); + } + + /// Iterate through disclosed globs and make a source for each file + auto glob_iterator = std::make_shared(*client_auth.client, client_auth.uri); + return std::make_shared([glob_iterator]() + { + return glob_iterator->next(); + }); +} Pipe StorageS3::read( const Names & column_names, @@ -503,23 +536,7 @@ Pipe StorageS3::read( need_file_column = true; } - std::shared_ptr iterator_wrapper{nullptr}; - if (distributed_processing) - { - iterator_wrapper = std::make_shared( - [callback = local_context->getReadTaskCallback()]() -> String { - return callback(); - }); - } - else - { - /// Iterate through disclosed globs and make a source for each file - auto glob_iterator = std::make_shared(*client_auth.client, client_auth.uri); - iterator_wrapper = std::make_shared([glob_iterator]() - { - return glob_iterator->next(); - }); - } + std::shared_ptr iterator_wrapper = createFileIterator(client_auth, distributed_processing, local_context); for (size_t i = 0; i < num_streams; ++i) { @@ -700,6 +717,51 @@ StorageS3Configuration StorageS3::getConfiguration(ASTs & engine_args, ContextPt return configuration; } +ColumnsDescription StorageS3::getTableStructureFromData( + const String & format, + const S3::URI & uri, + const String & access_key_id, + const String & secret_access_key, + UInt64 max_connections, + UInt64 max_single_read_retries, + const String & compression_method, + bool distributed_processing, + const std::optional & format_settings, + ContextPtr ctx) +{ + ClientAuthentication client_auth{uri, access_key_id, secret_access_key, max_connections, {}, {}}; + updateClientAndAuthSettings(ctx, client_auth); + return getTableStructureFromDataImpl(format, client_auth, max_single_read_retries, compression_method, distributed_processing, format_settings, ctx); +} + +ColumnsDescription StorageS3::getTableStructureFromDataImpl( + const String & format, + const ClientAuthentication & client_auth, + UInt64 max_single_read_retries, + const String & compression_method, + bool distributed_processing, + const std::optional & format_settings, + ContextPtr ctx) +{ + auto read_buffer_creator = [&]() + { + auto file_iterator = createFileIterator(client_auth, distributed_processing, ctx); + String current_key = (*file_iterator)(); + if (current_key.empty()) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, because there are no files with provided path in S3. 
You must specify " + "table structure manually", + format); + + return wrapReadBufferWithCompressionMethod( + std::make_unique(client_auth.client, client_auth.uri.bucket, current_key, max_single_read_retries, ctx->getReadSettings()), + chooseCompressionMethod(current_key, compression_method)); + }; + + return readSchemaFromFormat(format, format_settings, read_buffer_creator, ctx); +} + void registerStorageS3Impl(const String & name, StorageFactory & factory) { @@ -768,6 +830,7 @@ void registerStorageS3Impl(const String & name, StorageFactory & factory) { .supports_settings = true, .supports_sort_order = true, // for partition by + .supports_schema_inference = true, .source_access_type = AccessType::S3, }); } diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 8ce287ff681..0690040915d 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -68,6 +68,8 @@ public: Chunk generate() override; + void onCancel() override; + private: String name; String bucket; @@ -145,8 +147,19 @@ public: static StorageS3Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context); -private: + static ColumnsDescription getTableStructureFromData( + const String & format, + const S3::URI & uri, + const String & access_key_id, + const String & secret_access_key, + UInt64 max_connections, + UInt64 max_single_read_retries, + const String & compression_method, + bool distributed_processing, + const std::optional & format_settings, + ContextPtr ctx); +private: friend class StorageS3Cluster; friend class TableFunctionS3Cluster; @@ -173,6 +186,17 @@ private: ASTPtr partition_by; static void updateClientAndAuthSettings(ContextPtr, ClientAuthentication &); + + static std::shared_ptr createFileIterator(const ClientAuthentication & client_auth, bool distributed_processing, ContextPtr local_context); + + static ColumnsDescription getTableStructureFromDataImpl( + const String & format, + const ClientAuthentication & client_auth, + UInt64 max_single_read_retries, + const String & compression_method, + bool distributed_processing, + const std::optional & format_settings, + ContextPtr ctx); }; } diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 5fa16a25900..471b460d349 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -13,8 +13,9 @@ #include #include -#include +#include #include +#include #include #include @@ -40,7 +41,7 @@ namespace ErrorCodes IStorageURLBase::IStorageURLBase( const String & uri_, - ContextPtr /*context_*/, + ContextPtr context_, const StorageID & table_id_, const String & format_name_, const std::optional & format_settings_, @@ -61,12 +62,48 @@ IStorageURLBase::IStorageURLBase( , partition_by(partition_by_) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + if (columns_.empty()) + { + auto columns = getTableStructureFromData(format_name, uri, compression_method, headers, format_settings, context_); + storage_metadata.setColumns(columns); + } + else + storage_metadata.setColumns(columns_); storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); } +ColumnsDescription IStorageURLBase::getTableStructureFromData( + const String & format, + const String & uri, + const String & compression_method, + const ReadWriteBufferFromHTTP::HTTPHeaderEntries & headers, + const std::optional & format_settings, + ContextPtr context) +{ + auto read_buffer_creator = [&]() + { + auto parsed_uri = Poco::URI(uri); + return 
wrapReadBufferWithCompressionMethod( + std::make_unique( + parsed_uri, + Poco::Net::HTTPRequest::HTTP_GET, + nullptr, + ConnectionTimeouts::getHTTPTimeouts(context), + Poco::Net::HTTPBasicCredentials{}, + context->getSettingsRef().max_http_get_redirects, + DBMS_DEFAULT_BUFFER_SIZE, + context->getReadSettings(), + headers, + ReadWriteBufferFromHTTP::Range{}, + context->getRemoteHostFilter()), + chooseCompressionMethod(parsed_uri.getPath(), compression_method)); + }; + + return readSchemaFromFormat(format, format_settings, read_buffer_creator, context); +} + namespace { ReadWriteBufferFromHTTP::HTTPHeaderEntries getHeaders( @@ -107,6 +144,12 @@ namespace }; using URIInfoPtr = std::shared_ptr; + void onCancel() override + { + if (reader) + reader->cancel(); + } + StorageURLSource( URIInfoPtr uri_info_, const std::string & http_method, @@ -636,6 +679,7 @@ void registerStorageURL(StorageFactory & factory) }, { .supports_settings = true, + .supports_schema_inference = true, .source_access_type = AccessType::URL, }); } diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index cf72352a183..790f01135d3 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -41,6 +41,14 @@ public: bool supportsPartitionBy() const override { return true; } + static ColumnsDescription getTableStructureFromData( + const String & format, + const String & uri, + const String & compression_method, + const ReadWriteBufferFromHTTP::HTTPHeaderEntries & headers, + const std::optional & format_settings, + ContextPtr context); + protected: IStorageURLBase( const String & uri_, diff --git a/src/Storages/System/CMakeLists.txt b/src/Storages/System/CMakeLists.txt index 96c05a59173..133761cbe22 100644 --- a/src/Storages/System/CMakeLists.txt +++ b/src/Storages/System/CMakeLists.txt @@ -9,6 +9,36 @@ get_property (BUILD_COMPILE_DEFINITIONS DIRECTORY ${ClickHouse_SOURCE_DIR} PROPE get_property(TZDATA_VERSION GLOBAL PROPERTY TZDATA_VERSION_PROP) + +find_package(Git) +if(Git_FOUND) + # The commit's git hash, and whether the building workspace was dirty or not + execute_process(COMMAND + "${GIT_EXECUTABLE}" rev-parse HEAD + WORKING_DIRECTORY "${ClickHouse_SOURCE_DIR}" + OUTPUT_VARIABLE GIT_HASH + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + # Git branch name + execute_process(COMMAND + "${GIT_EXECUTABLE}" rev-parse --abbrev-ref HEAD + WORKING_DIRECTORY "${ClickHouse_SOURCE_DIR}" + OUTPUT_VARIABLE GIT_BRANCH + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + # The date of the commit + SET(ENV{TZ} "UTC") + execute_process(COMMAND + "${GIT_EXECUTABLE}" log -1 --format=%ad --date=iso-local + WORKING_DIRECTORY "${ClickHouse_SOURCE_DIR}" + OUTPUT_VARIABLE GIT_DATE + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + # The subject of the commit + execute_process(COMMAND + "${GIT_EXECUTABLE}" log -1 --format=%s + WORKING_DIRECTORY "${ClickHouse_SOURCE_DIR}" + OUTPUT_VARIABLE GIT_COMMIT_SUBJECT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) +endif() + configure_file (StorageSystemBuildOptions.generated.cpp.in ${CONFIG_BUILD}) include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake") diff --git a/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in b/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in index da563cc245b..9435bdcc65b 100644 --- a/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in +++ b/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in @@ -50,6 +50,31 @@ const char * auto_config_build[] "USE_KRB5", "@USE_KRB5@", "USE_FILELOG", "@USE_FILELOG@", 
"USE_BZIP2", "@USE_BZIP2@", + "USE_AMQPCPP", "@USE_AMQPCPP@", + "USE_ROCKSDB", "@USE_ROCKSDB@", + "USE_NURAFT", "@USE_NURAFT@", + "USE_NLP", "@USE_NLP@", + "USE_SQLITE", "@USE_SQLITE@", + "USE_INTERNAL_LLVM_LIBRARY", "@USE_INTERNAL_LLVM_LIBRARY@", + "USE_OPENCL", "@USE_OPENCL@", + "USE_LIBPQXX", "@USE_LIBPQXX@", + "USE_AZURE_BLOB_STORAGE", "@USE_AZURE_BLOB_STORAGE@", + "USE_INTERNAL_SSL_LIBRARY", "@USE_INTERNAL_SSL_LIBRARY@", + "USE_AWS_S3", "@USE_AWS_S3@", + "USE_CASSANDRA", "@USE_CASSANDRA@", + "USE_YAML_CPP", "@USE_YAML_CPP@", + "USE_INTERNAL_HDFS3_LIBRARY", "@USE_INTERNAL_HDFS3_LIBRARY@", + "CLICKHOUSE_SPLIT_BINARY", "@CLICKHOUSE_SPLIT_BINARY@", + "USE_SENTRY", "@USE_SENTRY@", + "USE_DATASKETCHES", "@USE_DATASKETCHES@", + "USE_AVRO", "@USE_AVRO@", + "USE_ARROW", "@USE_ARROW@", + "USE_ORC", "@USE_ORC@", + "USE_MSGPACK", "@USE_MSGPACK@", + "GIT_HASH", "@GIT_HASH@", + "GIT_BRANCH", R"IRjaNsZIL9Yh7FQ4(@GIT_BRANCH@)IRjaNsZIL9Yh7FQ4", + "GIT_DATE", "@GIT_DATE@", + "GIT_COMMIT_SUBJECT", R"Gi17KJMlbGCjErEN(@GIT_COMMIT_SUBJECT@)Gi17KJMlbGCjErEN", nullptr, nullptr }; diff --git a/src/Storages/System/StorageSystemClusters.cpp b/src/Storages/System/StorageSystemClusters.cpp index 1f5def6d6b4..1e303d1aeaa 100644 --- a/src/Storages/System/StorageSystemClusters.cpp +++ b/src/Storages/System/StorageSystemClusters.cpp @@ -45,7 +45,8 @@ void StorageSystemClusters::fillData(MutableColumns & res_columns, ContextPtr co // get an error when trying to get the info about DB from ZK. // Just ignore these inaccessible databases. A good example of a // failing test is `01526_client_start_and_exit`. - try { + try + { writeCluster(res_columns, {name_and_database.first, replicated->getCluster()}); } catch (...) diff --git a/src/Storages/System/StorageSystemDictionaries.cpp b/src/Storages/System/StorageSystemDictionaries.cpp index d8f92d38081..c0d7d8cc4ed 100644 --- a/src/Storages/System/StorageSystemDictionaries.cpp +++ b/src/Storages/System/StorageSystemDictionaries.cpp @@ -142,7 +142,9 @@ void StorageSystemDictionaries::fillData(MutableColumns & res_columns, ContextPt res_columns[i++]->insertDefault(); if (dict_ptr) + { res_columns[i++]->insert(dict_ptr->getDictionaryComment()); + } else { if (load_result.config && load_result.config->config->has("dictionary.comment")) diff --git a/src/Storages/System/StorageSystemParts.cpp b/src/Storages/System/StorageSystemParts.cpp index 6826082ef1d..2efb337b302 100644 --- a/src/Storages/System/StorageSystemParts.cpp +++ b/src/Storages/System/StorageSystemParts.cpp @@ -117,7 +117,7 @@ void StorageSystemParts::processNextStorage( if (columns_mask[src_index++]) columns[res_index++]->insert(part->getTypeName()); if (columns_mask[src_index++]) - columns[res_index++]->insert(part_state == State::Committed); + columns[res_index++]->insert(part_state == State::Active); if (columns_mask[src_index++]) columns[res_index++]->insert(part->getMarksCount()); if (columns_mask[src_index++]) diff --git a/src/Storages/System/StorageSystemPartsBase.cpp b/src/Storages/System/StorageSystemPartsBase.cpp index c730d5a95c9..6c8159ca720 100644 --- a/src/Storages/System/StorageSystemPartsBase.cpp +++ b/src/Storages/System/StorageSystemPartsBase.cpp @@ -57,12 +57,12 @@ StoragesInfo::getParts(MergeTreeData::DataPartStateVector & state, bool has_stat { /// If has_state_column is requested, return all states. 
if (!has_state_column) - return data->getDataPartsVector({State::Committed, State::Outdated}, &state, require_projection_parts); + return data->getDataPartsVector({State::Active, State::Outdated}, &state, require_projection_parts); return data->getAllDataPartsVector(&state, require_projection_parts); } - return data->getDataPartsVector({State::Committed}, &state, require_projection_parts); + return data->getDataPartsVector({State::Active}, &state, require_projection_parts); } StoragesInfoStream::StoragesInfoStream(const SelectQueryInfo & query_info, ContextPtr context) diff --git a/src/Storages/System/StorageSystemPartsColumns.cpp b/src/Storages/System/StorageSystemPartsColumns.cpp index f1b3a13c332..f5e9b82c136 100644 --- a/src/Storages/System/StorageSystemPartsColumns.cpp +++ b/src/Storages/System/StorageSystemPartsColumns.cpp @@ -132,7 +132,7 @@ void StorageSystemPartsColumns::processNextStorage( if (columns_mask[src_index++]) columns[res_index++]->insert(part->getTypeName()); if (columns_mask[src_index++]) - columns[res_index++]->insert(part_state == State::Committed); + columns[res_index++]->insert(part_state == State::Active); if (columns_mask[src_index++]) columns[res_index++]->insert(part->getMarksCount()); diff --git a/src/Storages/System/StorageSystemProjectionParts.cpp b/src/Storages/System/StorageSystemProjectionParts.cpp index 378437bd4ec..d15acc97cb1 100644 --- a/src/Storages/System/StorageSystemProjectionParts.cpp +++ b/src/Storages/System/StorageSystemProjectionParts.cpp @@ -125,7 +125,7 @@ void StorageSystemProjectionParts::processNextStorage( if (columns_mask[src_index++]) columns[res_index++]->insert(parent_part->getTypeName()); if (columns_mask[src_index++]) - columns[res_index++]->insert(part_state == State::Committed); + columns[res_index++]->insert(part_state == State::Active); if (columns_mask[src_index++]) columns[res_index++]->insert(part->getMarksCount()); if (columns_mask[src_index++]) diff --git a/src/Storages/System/StorageSystemProjectionPartsColumns.cpp b/src/Storages/System/StorageSystemProjectionPartsColumns.cpp index f6490177014..29c877733d8 100644 --- a/src/Storages/System/StorageSystemProjectionPartsColumns.cpp +++ b/src/Storages/System/StorageSystemProjectionPartsColumns.cpp @@ -146,7 +146,7 @@ void StorageSystemProjectionPartsColumns::processNextStorage( if (columns_mask[src_index++]) columns[res_index++]->insert(parent_part->getTypeName()); if (columns_mask[src_index++]) - columns[res_index++]->insert(part_state == State::Committed); + columns[res_index++]->insert(part_state == State::Active); if (columns_mask[src_index++]) columns[res_index++]->insert(part->getMarksCount()); if (columns_mask[src_index++]) diff --git a/src/Storages/System/StorageSystemTables.cpp b/src/Storages/System/StorageSystemTables.cpp index ac52f0afb32..24e3fe4f7a9 100644 --- a/src/Storages/System/StorageSystemTables.cpp +++ b/src/Storages/System/StorageSystemTables.cpp @@ -88,6 +88,26 @@ static ColumnPtr getFilteredDatabases(const SelectQueryInfo & query_info, Contex return block.getByPosition(0).column; } +static ColumnPtr getFilteredTables(const ASTPtr & query, const ColumnPtr & filtered_databases_column, ContextPtr context) +{ + MutableColumnPtr column = ColumnString::create(); + + for (size_t database_idx = 0; database_idx < filtered_databases_column->size(); ++database_idx) + { + const auto & database_name = filtered_databases_column->getDataAt(database_idx).toString(); + DatabasePtr database = DatabaseCatalog::instance().tryGetDatabase(database_name); + if (!database) + 
continue; + + for (auto table_it = database->getTablesIterator(context); table_it->isValid(); table_it->next()) + column->insert(table_it->name()); + } + + Block block {ColumnWithTypeAndName(std::move(column), std::make_shared(), "name")}; + VirtualColumnUtils::filterBlockWithQuery(query, block, context); + return block.getByPosition(0).column; +} + /// Avoid heavy operation on tables if we only queried columns that we can get without table object. /// Otherwise it will require table initialization for Lazy database. static bool needLockStructure(const DatabasePtr & database, const Block & header) @@ -112,12 +132,19 @@ public: Block header, UInt64 max_block_size_, ColumnPtr databases_, + ColumnPtr tables_, ContextPtr context_) : SourceWithProgress(std::move(header)) , columns_mask(std::move(columns_mask_)) , max_block_size(max_block_size_) , databases(std::move(databases_)) - , context(Context::createCopy(context_)) {} + , context(Context::createCopy(context_)) + { + size_t size = tables_->size(); + tables.reserve(size); + for (size_t idx = 0; idx < size; ++idx) + tables.insert(tables_->getDataAt(idx).toString()); + } String getName() const override { return "Tables"; } @@ -239,6 +266,9 @@ protected: for (; rows_count < max_block_size && tables_it->isValid(); tables_it->next()) { auto table_name = tables_it->name(); + if (!tables.contains(table_name)) + continue; + if (check_access_for_tables && !access->isGranted(AccessType::SHOW_TABLES, database_name, table_name)) continue; @@ -514,6 +544,7 @@ private: std::vector columns_mask; UInt64 max_block_size; ColumnPtr databases; + NameSet tables; size_t database_idx = 0; DatabaseTablesIteratorPtr tables_it; ContextPtr context; @@ -552,9 +583,10 @@ Pipe StorageSystemTables::read( } ColumnPtr filtered_databases_column = getFilteredDatabases(query_info, context); + ColumnPtr filtered_tables_column = getFilteredTables(query_info.query, filtered_databases_column, context); return Pipe(std::make_shared( - std::move(columns_mask), std::move(res_block), max_block_size, std::move(filtered_databases_column), context)); + std::move(columns_mask), std::move(res_block), max_block_size, std::move(filtered_databases_column), std::move(filtered_tables_column), context)); } } diff --git a/src/Storages/WindowView/StorageWindowView.cpp b/src/Storages/WindowView/StorageWindowView.cpp index 25ecc0e16ef..a81a5a9649a 100644 --- a/src/Storages/WindowView/StorageWindowView.cpp +++ b/src/Storages/WindowView/StorageWindowView.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -91,10 +92,6 @@ namespace data.is_hop = t->name == "hop"; auto temp_node = t->clone(); temp_node->setAlias(""); - if (startsWith(t->arguments->children[0]->getColumnName(), "toDateTime")) - throw Exception( - "The first argument of time window function should not be a constant value.", - ErrorCodes::QUERY_IS_NOT_SUPPORTED_IN_WINDOW_VIEW); if (!data.window_function) { data.serialized_window_function = serializeAST(*temp_node); @@ -565,7 +562,13 @@ std::shared_ptr StorageWindowView::getInnerTableCreateQuery( inner_create_query->setDatabase(database_name); inner_create_query->setTable(table_name); - auto inner_select_query = std::static_pointer_cast(inner_query); + Aliases aliases; + QueryAliasesVisitor(aliases).visit(inner_query); + auto inner_query_normalized = inner_query->clone(); + QueryNormalizer::Data normalizer_data(aliases, {}, false, getContext()->getSettingsRef(), false); + QueryNormalizer(normalizer_data).visit(inner_query_normalized); + + auto 
inner_select_query = std::static_pointer_cast(inner_query_normalized); auto t_sample_block = InterpreterSelectQuery( @@ -582,6 +585,8 @@ std::shared_ptr StorageWindowView::getInnerTableCreateQuery( columns_list->children.push_back(column_window); } + bool has_window_id = false; + for (const auto & column : t_sample_block.getColumnsWithTypeAndName()) { ParserIdentifierWithOptionalParameters parser; @@ -591,8 +596,18 @@ std::shared_ptr StorageWindowView::getInnerTableCreateQuery( column_dec->name = column.name; column_dec->type = ast; columns_list->children.push_back(column_dec); + if (!is_time_column_func_now && !has_window_id) + { + if (startsWith(column.name, "windowID")) + has_window_id = true; + } } + if (!is_time_column_func_now && !has_window_id) + throw Exception( + "The first argument of time window function should not be a constant value.", + ErrorCodes::QUERY_IS_NOT_SUPPORTED_IN_WINDOW_VIEW); + ToIdentifierMatcher::Data query_data; query_data.window_id_name = window_id_name; query_data.window_id_alias = window_id_alias; @@ -634,10 +649,15 @@ std::shared_ptr StorageWindowView::getInnerTableCreateQuery( /// tumble/hop -> windowID func_window_visitor.visit(node); to_identifier_visitor.visit(node); + QueryNormalizer(normalizer_data).visit(node); + node->setAlias(""); new_storage->set(field, node); } }; + for (auto & [alias_name, ast] : aliases) + ast = std::make_shared(ast->getColumnName()); + visit(storage->partition_by, new_storage->partition_by); visit(storage->primary_key, new_storage->primary_key); visit(storage->order_by, new_storage->order_by); @@ -877,12 +897,12 @@ void StorageWindowView::threadFuncFireEvent() std::unique_lock lock(fire_signal_mutex); while (!shutdown_called) { - LOG_TRACE(log, "Fire events: {}", fire_signal.size()); - bool signaled = std::cv_status::no_timeout == fire_signal_condition.wait_for(lock, std::chrono::seconds(5)); if (!signaled) continue; + LOG_TRACE(log, "Fire events: {}", fire_signal.size()); + while (!fire_signal.empty()) { fire(fire_signal.front()); diff --git a/src/Storages/tests/gtest_transform_query_for_external_database.cpp b/src/Storages/tests/gtest_transform_query_for_external_database.cpp index f161400630b..57b9e73bbbd 100644 --- a/src/Storages/tests/gtest_transform_query_for_external_database.cpp +++ b/src/Storages/tests/gtest_transform_query_for_external_database.cpp @@ -120,7 +120,7 @@ TEST(TransformQueryForExternalDatabase, InWithSingleElement) check(state, 1, "SELECT column FROM test.table WHERE 1 IN (1)", - R"(SELECT "column" FROM "test"."table" WHERE 1)"); + R"(SELECT "column" FROM "test"."table" WHERE 1 = 1)"); check(state, 1, "SELECT column FROM test.table WHERE column IN (1, 2)", R"(SELECT "column" FROM "test"."table" WHERE "column" IN (1, 2))"); @@ -135,7 +135,7 @@ TEST(TransformQueryForExternalDatabase, InWithMultipleColumns) check(state, 1, "SELECT column FROM test.table WHERE (1,1) IN ((1,1))", - R"(SELECT "column" FROM "test"."table" WHERE 1)"); + R"(SELECT "column" FROM "test"."table" WHERE 1 = 1)"); check(state, 1, "SELECT field, value FROM test.table WHERE (field, value) IN (('foo', 'bar'))", R"(SELECT "field", "value" FROM "test"."table" WHERE ("field", "value") IN (('foo', 'bar')))"); diff --git a/src/Storages/transformQueryForExternalDatabase.cpp b/src/Storages/transformQueryForExternalDatabase.cpp index 4d6c1787a34..c42fb7fa965 100644 --- a/src/Storages/transformQueryForExternalDatabase.cpp +++ b/src/Storages/transformQueryForExternalDatabase.cpp @@ -306,6 +306,18 @@ String transformQueryForExternalDatabase( throw 
Exception("Query contains non-compatible expressions (and external_table_strict_query=true)", ErrorCodes::INCORRECT_QUERY); } + auto * literal_expr = typeid_cast(original_where.get()); + UInt64 value; + if (literal_expr && literal_expr->value.tryGet(value) && (value == 0 || value == 1)) + { + /// WHERE 1 -> WHERE 1=1, WHERE 0 -> WHERE 1=0. + if (value) + original_where = makeASTFunction("equals", std::make_shared(1), std::make_shared(1)); + else + original_where = makeASTFunction("equals", std::make_shared(1), std::make_shared(0)); + select->setExpression(ASTSelectQuery::Expression::WHERE, std::move(original_where)); + } + ASTPtr select_ptr = select; dropAliases(select_ptr); diff --git a/src/TableFunctions/ITableFunction.cpp b/src/TableFunctions/ITableFunction.cpp index fa7f6e52220..42b24abdbbe 100644 --- a/src/TableFunctions/ITableFunction.cpp +++ b/src/TableFunctions/ITableFunction.cpp @@ -15,25 +15,23 @@ namespace DB { StoragePtr ITableFunction::execute(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, - ColumnsDescription cached_columns) const + ColumnsDescription cached_columns, bool use_global_context) const { ProfileEvents::increment(ProfileEvents::TableFunctionExecute); context->checkAccess(AccessType::CREATE_TEMPORARY_TABLE | StorageFactory::instance().getSourceAccessType(getStorageTypeName())); + auto context_to_use = use_global_context ? context->getGlobalContext() : context; + if (cached_columns.empty()) return executeImpl(ast_function, context, table_name, std::move(cached_columns)); - /// We have table structure, so it's CREATE AS table_function(). - /// We should use global context here because there will be no query context on server startup - /// and because storage lifetime is bigger than query context lifetime. - auto global_context = context->getGlobalContext(); if (hasStaticStructure() && cached_columns == getActualTableStructure(context)) - return executeImpl(ast_function, global_context, table_name, std::move(cached_columns)); + return executeImpl(ast_function, context_to_use, table_name, std::move(cached_columns)); auto this_table_function = shared_from_this(); auto get_storage = [=]() -> StoragePtr { - return this_table_function->executeImpl(ast_function, global_context, table_name, cached_columns); + return this_table_function->executeImpl(ast_function, context_to_use, table_name, cached_columns); }; /// It will request actual table structure and create underlying storage lazily diff --git a/src/TableFunctions/ITableFunction.h b/src/TableFunctions/ITableFunction.h index 56147ffd598..93cf5057e88 100644 --- a/src/TableFunctions/ITableFunction.h +++ b/src/TableFunctions/ITableFunction.h @@ -54,7 +54,7 @@ public: /// Create storage according to the query. 
StoragePtr - execute(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns_ = {}) const; + execute(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns_ = {}, bool use_global_context = false) const; virtual ~ITableFunction() = default; diff --git a/src/TableFunctions/ITableFunctionFileLike.cpp b/src/TableFunctions/ITableFunctionFileLike.cpp index 699ad698bd8..4395c318983 100644 --- a/src/TableFunctions/ITableFunctionFileLike.cpp +++ b/src/TableFunctions/ITableFunctionFileLike.cpp @@ -1,4 +1,3 @@ -#include #include #include @@ -6,16 +5,16 @@ #include #include -#include #include #include -#include #include #include +#include + namespace DB { @@ -23,10 +22,27 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int INCORRECT_FILE_NAME; extern const int BAD_ARGUMENTS; } +namespace +{ + void checkIfFormatSupportsAutoStructure(const String & name, const String & format) + { + if (name == "file" && format == "Distributed") + return; + + if (FormatFactory::instance().checkIfFormatHasAnySchemaReader(format)) + return; + + throw Exception( + "Table function '" + name + + "' allows automatic structure determination only for formats that support schema inference and for Distributed format in table function " + "'file'", + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + } +} + void ITableFunctionFileLike::parseArguments(const ASTPtr & ast_function, ContextPtr context) { /// Parse args @@ -46,21 +62,23 @@ void ITableFunctionFileLike::parseArguments(const ASTPtr & ast_function, Context filename = args[0]->as().value.safeGet(); format = args[1]->as().value.safeGet(); - if (args.size() == 2 && getName() == "file") + if (args.size() == 2) { - if (format == "Distributed") - return; - throw Exception("Table function '" + getName() + "' allows 2 arguments only for Distributed format.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + checkIfFormatSupportsAutoStructure(getName(), format); + return; } if (args.size() != 3 && args.size() != 4) - throw Exception("Table function '" + getName() + "' requires 3 or 4 arguments: filename, format, structure and compression method (default auto).", + throw Exception("Table function '" + getName() + "' requires 2, 3 or 4 arguments: filename, format, structure (default auto) and compression method (default auto)", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); structure = args[2]->as().value.safeGet(); + if (structure == "auto") + checkIfFormatSupportsAutoStructure(getName(), format); + if (structure.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Table structure is empty for table function '{}'", + "Table structure is empty for table function '{}'. 
If you want to use automatic schema inference, use 'auto'", ast_function->formatForErrorMessage()); if (args.size() == 4) @@ -69,25 +87,12 @@ void ITableFunctionFileLike::parseArguments(const ASTPtr & ast_function, Context StoragePtr ITableFunctionFileLike::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const { - auto columns = getActualTableStructure(context); + ColumnsDescription columns; + if (structure != "auto") + columns = parseColumnsListFromString(structure, context); StoragePtr storage = getStorage(filename, format, columns, context, table_name, compression_method); storage->startup(); return storage; } -ColumnsDescription ITableFunctionFileLike::getActualTableStructure(ContextPtr context) const -{ - if (structure.empty()) - { - assert(getName() == "file" && format == "Distributed"); - size_t total_bytes_to_read = 0; - Strings paths = StorageFile::getPathsList(filename, context->getUserFilesPath(), context, total_bytes_to_read); - if (paths.empty()) - throw Exception("Cannot get table structure from file, because no files match specified name", ErrorCodes::INCORRECT_FILE_NAME); - auto source = StorageDistributedDirectoryMonitor::createSourceFromFile(paths[0]); - return ColumnsDescription{source->getOutputs().front().getHeader().getNamesAndTypesList()}; - } - return parseColumnsListFromString(structure, context); -} - } diff --git a/src/TableFunctions/ITableFunctionFileLike.h b/src/TableFunctions/ITableFunctionFileLike.h index 2069f02b0dd..2ceafdee229 100644 --- a/src/TableFunctions/ITableFunctionFileLike.h +++ b/src/TableFunctions/ITableFunctionFileLike.h @@ -8,7 +8,7 @@ class ColumnsDescription; class Context; /* - * function(source, format, structure) - creates a temporary storage from formatted source + * function(source, format, structure[, compression_method]) - creates a temporary storage from formatted source */ class ITableFunctionFileLike : public ITableFunction { @@ -18,7 +18,7 @@ protected: String filename; String format; - String structure; + String structure = "auto"; String compression_method = "auto"; private: @@ -28,8 +28,7 @@ private: const String & source, const String & format, const ColumnsDescription & columns, ContextPtr global_context, const std::string & table_name, const String & compression_method) const = 0; - ColumnsDescription getActualTableStructure(ContextPtr context) const override; - - bool hasStaticStructure() const override { return true; } + bool hasStaticStructure() const override { return structure != "auto"; } }; + } diff --git a/src/TableFunctions/TableFunctionExecutable.cpp b/src/TableFunctions/TableFunctionExecutable.cpp index 9edb75b0a69..41ba2db5c33 100644 --- a/src/TableFunctions/TableFunctionExecutable.cpp +++ b/src/TableFunctions/TableFunctionExecutable.cpp @@ -75,7 +75,12 @@ ColumnsDescription TableFunctionExecutable::getActualTableStructure(ContextPtr c StoragePtr TableFunctionExecutable::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const { auto storage_id = StorageID(getDatabaseName(), table_name); - auto storage = StorageExecutable::create(storage_id, script_name, arguments, format, input_queries, getActualTableStructure(context), ConstraintsDescription{}); + auto global_context = context->getGlobalContext(); + ExecutableSettings settings; + settings.script_name = script_name; + settings.script_arguments = std::move(arguments); + + auto storage = 
StorageExecutable::create(storage_id, format, settings, input_queries, getActualTableStructure(context), ConstraintsDescription{}); storage->startup(); return storage; } diff --git a/src/TableFunctions/TableFunctionFile.cpp b/src/TableFunctions/TableFunctionFile.cpp index d8bdb3b45c4..71aba5494e8 100644 --- a/src/TableFunctions/TableFunctionFile.cpp +++ b/src/TableFunctions/TableFunctionFile.cpp @@ -1,4 +1,5 @@ #include +#include #include "registerTableFunctions.h" #include @@ -9,11 +10,13 @@ namespace DB { + StoragePtr TableFunctionFile::getStorage(const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, const std::string & table_name, const std::string & compression_method_) const { + LOG_DEBUG(&Poco::Logger::get("TableFunctionFile"), "getStorage"); // For `file` table function, we are going to use format settings from the // query context. StorageFile::CommonArguments args{ @@ -30,8 +33,21 @@ StoragePtr TableFunctionFile::getStorage(const String & source, return StorageFile::create(source, global_context->getUserFilesPath(), args); } +ColumnsDescription TableFunctionFile::getActualTableStructure(ContextPtr context) const +{ + if (structure == "auto") + { + size_t total_bytes_to_read = 0; + Strings paths = StorageFile::getPathsList(filename, context->getUserFilesPath(), context, total_bytes_to_read); + return StorageFile::getTableStructureFromData(format, paths, compression_method, std::nullopt, context); + } + + return parseColumnsListFromString(structure, context); +} + void registerTableFunctionFile(TableFunctionFactory & factory) { factory.registerFunction(); } + } diff --git a/src/TableFunctions/TableFunctionFile.h b/src/TableFunctions/TableFunctionFile.h index 460656a7218..f26e4a9c06d 100644 --- a/src/TableFunctions/TableFunctionFile.h +++ b/src/TableFunctions/TableFunctionFile.h @@ -6,7 +6,7 @@ namespace DB { -/* file(path, format, structure) - creates a temporary storage from file +/* file(path, format[, structure, compression]) - creates a temporary storage from file * * The file must be in the clickhouse data directory. * The relative path begins with the clickhouse data directory. 
@@ -20,9 +20,13 @@ public: return name; } + ColumnsDescription getActualTableStructure(ContextPtr context) const override; + private: StoragePtr getStorage( const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, const std::string & table_name, const std::string & compression_method_) const override; const char * getStorageTypeName() const override { return "File"; } -};} +}; + +} diff --git a/src/TableFunctions/TableFunctionHDFS.cpp b/src/TableFunctions/TableFunctionHDFS.cpp index 245674b0e06..b626f563977 100644 --- a/src/TableFunctions/TableFunctionHDFS.cpp +++ b/src/TableFunctions/TableFunctionHDFS.cpp @@ -6,9 +6,11 @@ #include #include #include +#include namespace DB { + StoragePtr TableFunctionHDFS::getStorage( const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, const std::string & table_name, const String & compression_method_) const @@ -24,12 +26,18 @@ StoragePtr TableFunctionHDFS::getStorage( compression_method_); } +ColumnsDescription TableFunctionHDFS::getActualTableStructure(ContextPtr context) const +{ + if (structure == "auto") + return StorageHDFS::getTableStructureFromData(format, filename, compression_method, context); + + return parseColumnsListFromString(structure, context); +} -#if USE_HDFS void registerTableFunctionHDFS(TableFunctionFactory & factory) { factory.registerFunction(); } -#endif + } #endif diff --git a/src/TableFunctions/TableFunctionHDFS.h b/src/TableFunctions/TableFunctionHDFS.h index 70bdc67efc8..74139818209 100644 --- a/src/TableFunctions/TableFunctionHDFS.h +++ b/src/TableFunctions/TableFunctionHDFS.h @@ -12,7 +12,7 @@ namespace DB class Context; -/* hdfs(URI, format, structure) - creates a temporary storage from hdfs files +/* hdfs(URI, format[, structure, compression]) - creates a temporary storage from hdfs files * */ class TableFunctionHDFS : public ITableFunctionFileLike @@ -24,6 +24,8 @@ public: return name; } + ColumnsDescription getActualTableStructure(ContextPtr context) const override; + private: StoragePtr getStorage( const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, diff --git a/src/TableFunctions/TableFunctionPostgreSQL.cpp b/src/TableFunctions/TableFunctionPostgreSQL.cpp index bcfe8d5444c..7e7424be38f 100644 --- a/src/TableFunctions/TableFunctionPostgreSQL.cpp +++ b/src/TableFunctions/TableFunctionPostgreSQL.cpp @@ -45,12 +45,13 @@ ColumnsDescription TableFunctionPostgreSQL::getActualTableStructure(ContextPtr c { const bool use_nulls = context->getSettingsRef().external_table_functions_use_nulls; auto connection_holder = connection_pool->get(); - auto columns = fetchPostgreSQLTableStructure( - connection_holder->get(), configuration->table, configuration->schema, use_nulls).columns; + auto columns_info = fetchPostgreSQLTableStructure( + connection_holder->get(), configuration->table, configuration->schema, use_nulls).physical_columns; - if (!columns) + if (!columns_info) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table structure not returned"); - return ColumnsDescription{*columns}; + + return ColumnsDescription{columns_info->columns}; } diff --git a/src/TableFunctions/TableFunctionRemote.h b/src/TableFunctions/TableFunctionRemote.h index 845c36182dc..976397ddc45 100644 --- a/src/TableFunctions/TableFunctionRemote.h +++ b/src/TableFunctions/TableFunctionRemote.h @@ -27,6 +27,7 @@ public: bool needStructureConversion() const override { return false; } private: + 
StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const override; const char * getStorageTypeName() const override { return "Distributed"; } diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp index e26c282c622..c4be01c6b5c 100644 --- a/src/TableFunctions/TableFunctionS3.cpp +++ b/src/TableFunctions/TableFunctionS3.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include "registerTableFunctions.h" @@ -28,6 +29,7 @@ void TableFunctionS3::parseArguments(const ASTPtr & ast_function, ContextPtr con const auto message = fmt::format( "The signature of table function {} could be the following:\n" \ + " - url, format\n" \ " - url, format, structure\n" \ " - url, format, structure, compression_method\n" \ " - url, access_key_id, secret_access_key, format, structure\n" \ @@ -69,17 +71,32 @@ void TableFunctionS3::parseArguments(const ASTPtr & ast_function, ContextPtr con /// Size -> argument indexes static auto size_to_args = std::map> { + {2, {{"format", 1}}}, {3, {{"format", 1}, {"structure", 2}}}, - {4, {{"format", 1}, {"structure", 2}, {"compression_method", 3}}}, {5, {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"structure", 4}}}, {6, {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"structure", 4}, {"compression_method", 5}}} }; + std::map args_to_idx; + /// For 4 arguments we support 2 possible variants: + /// s3(source, format, structure, compression_method) and s3(source, access_key_id, secret_access_key, format) + /// We can distinguish them by looking at the fourth argument: check if it's a format name or not. + if (args.size() == 4) + { + auto last_arg = args[3]->as().value.safeGet(); + if (FormatFactory::instance().getAllFormats().contains(last_arg)) + args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}}; + else + args_to_idx = {{"format", 1}, {"structure", 2}, {"compression_method", 3}}; + } + else + { + args_to_idx = size_to_args[args.size()]; + } + /// This argument is always the first configuration.url = args[0]->as().value.safeGet(); - auto & args_to_idx = size_to_args[args.size()]; - if (args_to_idx.contains("format")) configuration.format = args[args_to_idx["format"]]->as().value.safeGet(); @@ -101,6 +118,21 @@ void TableFunctionS3::parseArguments(const ASTPtr & ast_function, ContextPtr con ColumnsDescription TableFunctionS3::getActualTableStructure(ContextPtr context) const { + if (s3_configuration->structure == "auto") + { + return StorageS3::getTableStructureFromData( + s3_configuration->format, + S3::URI(Poco::URI(s3_configuration->url)), + s3_configuration->access_key_id, + s3_configuration->secret_access_key, + context->getSettingsRef().s3_max_connections, + context->getSettingsRef().s3_max_single_read_retries, + s3_configuration->compression_method, + false, + std::nullopt, + context); + } + return parseColumnsListFromString(s3_configuration->structure, context); } @@ -113,6 +145,10 @@ StoragePtr TableFunctionS3::executeImpl(const ASTPtr & /*ast_function*/, Context UInt64 max_single_part_upload_size = context->getSettingsRef().s3_max_single_part_upload_size; UInt64 max_connections = context->getSettingsRef().s3_max_connections; + ColumnsDescription columns; + if (s3_configuration->structure != "auto") + columns = parseColumnsListFromString(s3_configuration->structure, context); + StoragePtr storage = StorageS3::create( s3_uri, s3_configuration->access_key_id, diff --git 
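For illustration only (not part of this patch): the four-argument form of s3 is ambiguous, so parseArguments above checks whether the fourth argument is a known format name. A sketch of the two call shapes it distinguishes, written in the same integration-test style, where the bucket, credentials and file names are made-up placeholders:

def test_s3_four_argument_variants(started_cluster):
    # s3(url, format, structure, compression_method): the fourth argument ('gzip') is not a format name.
    node.query("SELECT * FROM s3('https://mybucket.s3.amazonaws.com/data.csv.gz', 'CSV', 'x UInt64', 'gzip')")
    # s3(url, access_key_id, secret_access_key, format): the fourth argument ('CSV') is a format name,
    # and the structure defaults to 'auto', i.e. it is inferred from the data.
    node.query("SELECT * FROM s3('https://mybucket.s3.amazonaws.com/data.csv', 'AKIAEXAMPLE', 'examplesecret', 'CSV')")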
a/src/TableFunctions/TableFunctionS3.h b/src/TableFunctions/TableFunctionS3.h index 8d4c1391236..374e653072e 100644 --- a/src/TableFunctions/TableFunctionS3.h +++ b/src/TableFunctions/TableFunctionS3.h @@ -13,7 +13,7 @@ namespace DB class Context; -/* s3(source, [access_key_id, secret_access_key,] format, structure) - creates a temporary storage for a file in S3 +/* s3(source, [access_key_id, secret_access_key,] format, structure[, compression]) - creates a temporary storage for a file in S3 */ class TableFunctionS3 : public ITableFunction { @@ -23,7 +23,7 @@ public: { return name; } - bool hasStaticStructure() const override { return true; } + bool hasStaticStructure() const override { return s3_configuration->structure != "auto"; } protected: StoragePtr executeImpl( diff --git a/src/TableFunctions/TableFunctionURL.cpp b/src/TableFunctions/TableFunctionURL.cpp index c3ea30f800f..7c4d7b4a444 100644 --- a/src/TableFunctions/TableFunctionURL.cpp +++ b/src/TableFunctions/TableFunctionURL.cpp @@ -2,11 +2,11 @@ #include "registerTableFunctions.h" #include -#include #include #include #include #include +#include #include @@ -59,20 +59,10 @@ void TableFunctionURL::parseArguments(const ASTPtr & ast_function, ContextPtr co } } - StoragePtr TableFunctionURL::getStorage( const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, const std::string & table_name, const String & compression_method_) const { - ReadWriteBufferFromHTTP::HTTPHeaderEntries headers; - for (const auto & [header, value] : configuration.headers) - { - auto value_literal = value.safeGet(); - if (header == "Range") - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Range headers are not allowed"); - headers.emplace_back(std::make_pair(header, value_literal)); - } - return StorageURL::create( source, StorageID(getDatabaseName(), table_name), @@ -83,10 +73,31 @@ StoragePtr TableFunctionURL::getStorage( String{}, global_context, compression_method_, - headers, + getHeaders(), configuration.http_method); } +ReadWriteBufferFromHTTP::HTTPHeaderEntries TableFunctionURL::getHeaders() const +{ + ReadWriteBufferFromHTTP::HTTPHeaderEntries headers; + for (const auto & [header, value] : configuration.headers) + { + auto value_literal = value.safeGet(); + if (header == "Range") + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Range headers are not allowed"); + headers.emplace_back(std::make_pair(header, value_literal)); + } + return headers; +} + +ColumnsDescription TableFunctionURL::getActualTableStructure(ContextPtr context) const +{ + if (structure == "auto") + return StorageURL::getTableStructureFromData(format, filename, compression_method, getHeaders(), std::nullopt, context); + + return parseColumnsListFromString(structure, context); +} + void registerTableFunctionURL(TableFunctionFactory & factory) { factory.registerFunction(); diff --git a/src/TableFunctions/TableFunctionURL.h b/src/TableFunctions/TableFunctionURL.h index 9425112acb2..798a37dc478 100644 --- a/src/TableFunctions/TableFunctionURL.h +++ b/src/TableFunctions/TableFunctionURL.h @@ -2,6 +2,7 @@ #include #include +#include namespace DB @@ -9,7 +10,7 @@ namespace DB class Context; -/* url(source, format, structure) - creates a temporary storage from url +/* url(source, format[, structure, compression]) - creates a temporary storage from url */ class TableFunctionURL : public ITableFunctionFileLike { @@ -20,6 +21,8 @@ public: return name; } + ColumnsDescription getActualTableStructure(ContextPtr context) const override; + protected: 
void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; @@ -29,6 +32,8 @@ private: const std::string & table_name, const String & compression_method_) const override; const char * getStorageTypeName() const override { return "URL"; } + ReadWriteBufferFromHTTP::HTTPHeaderEntries getHeaders() const; + URLBasedDataSourceConfiguration configuration; }; diff --git a/tests/ci/cancel_and_rerun_workflow_lambda/app.py b/tests/ci/cancel_and_rerun_workflow_lambda/app.py index bd1dc394086..b79eb292dc6 100644 --- a/tests/ci/cancel_and_rerun_workflow_lambda/app.py +++ b/tests/ci/cancel_and_rerun_workflow_lambda/app.py @@ -11,7 +11,6 @@ import boto3 NEED_RERUN_OR_CANCELL_WORKFLOWS = { 13241696, # PR 15834118, # Docs - 15522500, # MasterCI 15516108, # ReleaseCI 15797242, # BackportPR } @@ -86,10 +85,23 @@ WorkflowDescription = namedtuple('WorkflowDescription', def get_workflows_description_for_pull_request(pull_request_event): head_branch = pull_request_event['head']['ref'] print("PR", pull_request_event['number'], "has head ref", head_branch) - workflows = _exec_get_with_retry(API_URL + f"/actions/runs?branch={head_branch}") + workflows_data = [] + workflows = _exec_get_with_retry(API_URL + f"/actions/runs?branch={head_branch}&event=pull_request&page=1") + workflows_data += workflows['workflow_runs'] + i = 2 + while len(workflows['workflow_runs']) > 0: + workflows = _exec_get_with_retry(API_URL + f"/actions/runs?branch={head_branch}&event=pull_request&page={i}") + workflows_data += workflows['workflow_runs'] + i += 1 + if i > 30: + print("Too many workflows found") + break + workflow_descriptions = [] - for workflow in workflows['workflow_runs']: - if workflow['workflow_id'] in NEED_RERUN_OR_CANCELL_WORKFLOWS: + for workflow in workflows_data: + # unfortunately we cannot filter workflows from forks in request to API so doing it manually + if (workflow['head_repository']['full_name'] == pull_request_event['head']['repo']['full_name'] + and workflow['workflow_id'] in NEED_RERUN_OR_CANCELL_WORKFLOWS): workflow_descriptions.append(WorkflowDescription( run_id=workflow['id'], status=workflow['status'], diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index c420b76aaf3..8396303c5a3 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -1,13 +1,33 @@ #!/usr/bin/env python3 +import time from env_helper import GITHUB_REPOSITORY +RETRY = 5 + + +def get_commit(gh, commit_sha, retry_count=RETRY): + for i in range(retry_count): + try: + repo = gh.get_repo(GITHUB_REPOSITORY) + commit = repo.get_commit(commit_sha) + return commit + except Exception as ex: + if i == retry_count - 1: + raise ex + time.sleep(i) + + # just suppress warning + return None -def get_commit(gh, commit_sha): - repo = gh.get_repo(GITHUB_REPOSITORY) - commit = repo.get_commit(commit_sha) - return commit def post_commit_status(gh, sha, check_name, description, state, report_url): - commit = get_commit(gh, sha) - commit.create_status(context=check_name, description=description, state=state, target_url=report_url) + for i in range(RETRY): + try: + commit = get_commit(gh, sha, 1) + commit.create_status(context=check_name, description=description, state=state, target_url=report_url) + break + except Exception as ex: + if i == RETRY - 1: + raise ex + time.sleep(i) diff --git a/tests/ci/docs_release.py b/tests/ci/docs_release.py index 90588848f12..825bca0b68b 100644 --- a/tests/ci/docs_release.py +++ b/tests/ci/docs_release.py @@ -2,6 +2,7 @@ import logging import 
subprocess import os +import sys from github import Github @@ -13,6 +14,7 @@ from ssh import SSHKey from upload_result_helper import upload_results from docker_pull_helper import get_image_with_version from commit_status_helper import get_commit +from rerun_helper import RerunHelper NAME = "Docs Release (actions)" @@ -22,9 +24,12 @@ if __name__ == "__main__": temp_path = TEMP_PATH repo_path = REPO_COPY - pr_info = PRInfo(need_changed_files=True) - gh = Github(get_best_robot_token()) + pr_info = PRInfo(need_changed_files=True) + rerun_helper = RerunHelper(gh, pr_info, NAME) + if rerun_helper.is_already_finished_by_status(): + logging.info("Check is already finished according to github status, exiting") + sys.exit(0) if not os.path.exists(temp_path): os.makedirs(temp_path) diff --git a/tests/ci/integration_test_check.py b/tests/ci/integration_test_check.py index 20e33f2f2dc..e87528dd528 100644 --- a/tests/ci/integration_test_check.py +++ b/tests/ci/integration_test_check.py @@ -33,6 +33,7 @@ IMAGES = [ "clickhouse/integration-test", "clickhouse/kerberos-kdc", "clickhouse/integration-helper", + "clickhouse/dotnet-client", ] def get_json_params_dict(check_name, pr_info, docker_images, run_by_hash_total, run_by_hash_num): diff --git a/tests/ci/keeper_jepsen_check.py b/tests/ci/keeper_jepsen_check.py index 5c7582242a9..b7acc92b0f3 100644 --- a/tests/ci/keeper_jepsen_check.py +++ b/tests/ci/keeper_jepsen_check.py @@ -120,7 +120,9 @@ if __name__ == "__main__": pr_info = PRInfo() - if pr_info.number != 0 and 'jepsen-test' not in pr_info.labels(): + logging.info("Start at PR number %s, commit sha %s labels %s", pr_info.number, pr_info.sha, pr_info.labels) + + if pr_info.number != 0 and 'jepsen-test' not in pr_info.labels: logging.info("Not jepsen test label in labels list, skipping") sys.exit(0) diff --git a/tests/ci/pr_info.py b/tests/ci/pr_info.py index 812834824b7..48464439dbc 100644 --- a/tests/ci/pr_info.py +++ b/tests/ci/pr_info.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import json import os -import urllib import requests from unidiff import PatchSet @@ -140,16 +139,15 @@ class PRInfo: if not self.diff_url: raise Exception("Diff URL cannot be find for event") + response = requests.get(self.diff_url) + response.raise_for_status() if 'commits' in self.event and self.number == 0: - response = requests.get(self.diff_url) - response.raise_for_status() diff = response.json() if 'files' in diff: self.changed_files = [f['filename'] for f in diff['files']] else: - diff = urllib.request.urlopen(self.diff_url) - diff_object = PatchSet(diff, diff.headers.get_charsets()[0]) + diff_object = PatchSet(response.text) self.changed_files = {f.path for f in diff_object} def get_dict(self): diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 692cda18f20..c7156dbef26 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -68,6 +68,8 @@ TRUSTED_CONTRIBUTORS = {e.lower() for e in [ "YiuRULE", "zlobober", # Developer of YT "ilejn", # Arenadata, responsible for Kerberized Kafka + "thomoco", # ClickHouse + "BoloniniD", # Seasoned contributor, HSE ]} diff --git a/tests/ci/workflow_approve_rerun_lambda/app.py b/tests/ci/workflow_approve_rerun_lambda/app.py index f2502f605af..396431a2e5f 100644 --- a/tests/ci/workflow_approve_rerun_lambda/app.py +++ b/tests/ci/workflow_approve_rerun_lambda/app.py @@ -41,6 +41,7 @@ TRUSTED_ORG_IDS = { NEED_RERUN_WORKFLOWS = { 13241696, # PR + 14738810, # DocsRelease 15834118, # Docs 15522500, # MasterCI 15516108, # ReleaseCI @@ -92,6 +93,7 @@ TRUSTED_CONTRIBUTORS = 
{e.lower() for e in [ "vzakaznikov", "YiuRULE", "zlobober", # Developer of YT + "BoloniniD", # Seasoned contributor, HSE ]} diff --git a/tests/config/executable_pool_dictionary.xml b/tests/config/executable_pool_dictionary.xml index 13f34f0048e..212552a6776 100644 --- a/tests/config/executable_pool_dictionary.xml +++ b/tests/config/executable_pool_dictionary.xml @@ -61,10 +61,11 @@ - + TabSeparated while read read_data; do printf "$read_data\tvalue a\tvalue b\n"; done - + 5 + diff --git a/tests/config/test_function.xml b/tests/config/test_function.xml index 2e31c9677ec..928cbd75c78 100644 --- a/tests/config/test_function.xml +++ b/tests/config/test_function.xml @@ -11,6 +11,6 @@ TabSeparated cd /; clickhouse-local --input-format TabSeparated --output-format TabSeparated --structure 'x UInt64, y UInt64' --query "SELECT x + y FROM table" - 0 + 0 diff --git a/tests/integration/ci-runner.py b/tests/integration/ci-runner.py index 830b8e149f6..6058a332c29 100755 --- a/tests/integration/ci-runner.py +++ b/tests/integration/ci-runner.py @@ -228,6 +228,7 @@ class ClickhouseIntegrationTestsRunner: "clickhouse/mysql-java-client", "clickhouse/mysql-js-client", "clickhouse/mysql-php-client", "clickhouse/postgresql-java-client", "clickhouse/integration-test", "clickhouse/kerberos-kdc", + "clickhouse/dotnet-client", "clickhouse/integration-helper", ] @@ -252,7 +253,7 @@ class ClickhouseIntegrationTestsRunner: logging.info("Executing installation cmd %s", cmd) retcode = subprocess.Popen(cmd, shell=True, stderr=log, stdout=log).wait() if retcode == 0: - logging.info("Instsallation of %s successfull", full_path) + logging.info("Installation of %s successfull", full_path) else: raise Exception("Installation of %s failed", full_path) break diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index d440f2de0ca..bb36d3452d7 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -1476,6 +1476,18 @@ class ClickHouseCluster: common_opts = ['--verbose', 'up', '-d'] + images_pull_cmd = self.base_cmd + ['pull'] + # sometimes dockerhub/proxy can be flaky + for i in range(5): + try: + run_and_check(images_pull_cmd) + break + except Exception as ex: + if i == 4: + raise ex + logging.info("Got exception pulling images: %s", ex) + time.sleep(i * 3) + if self.with_zookeeper_secure and self.base_zookeeper_cmd: logging.debug('Setup ZooKeeper Secure') logging.debug(f'Creating internal ZooKeeper dirs: {self.zookeeper_dirs_to_create}') @@ -2043,7 +2055,8 @@ class ClickHouseInstance: user=user, password=password, database=database) # Connects to the instance via HTTP interface, sends a query and returns the answer - def http_query(self, sql, data=None, params=None, user=None, password=None, expect_fail_and_get_error=False): + def http_query(self, sql, data=None, params=None, user=None, password=None, expect_fail_and_get_error=False, + port=8123, timeout=None, retry_strategy=None): logging.debug(f"Executing query {sql} on {self.name} via HTTP interface") if params is None: params = {} @@ -2057,12 +2070,19 @@ class ClickHouseInstance: auth = requests.auth.HTTPBasicAuth(user, password) elif user: auth = requests.auth.HTTPBasicAuth(user, '') - url = "http://" + self.ip_address + ":8123/?" + urllib.parse.urlencode(params) + url = f"http://{self.ip_address}:{port}/?" 
+ urllib.parse.urlencode(params) - if data: - r = requests.post(url, data, auth=auth) + if retry_strategy is None: + requester = requests else: - r = requests.get(url, auth=auth) + adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy) + requester = requests.Session() + requester.mount("https://", adapter) + requester.mount("http://", adapter) + if data: + r = requester.post(url, data, auth=auth, timeout=timeout) + else: + r = requester.get(url, auth=auth, timeout=timeout) def http_code_and_message(): code = r.status_code @@ -2236,7 +2256,7 @@ class ClickHouseInstance: logging.debug('{} log line(s) matching "{}" appeared in a {:.3f} seconds'.format(repetitions, regexp, wait_duration)) return wait_duration - def file_exists(self, path): + def path_exists(self, path): return self.exec_in_container( ["bash", "-c", "echo $(if [ -e '{}' ]; then echo 'yes'; else echo 'no'; fi)".format(path)]) == 'yes\n' @@ -2674,6 +2694,20 @@ class ClickHouseInstance: if p.exists(self.path): shutil.rmtree(self.path) + def wait_for_path_exists(self, path, seconds): + while seconds > 0: + seconds -= 1 + if self.path_exists(path): + return + time.sleep(1) + + def get_backuped_s3_objects(self, disk, backup_name): + path = f'/var/lib/clickhouse/disks/{disk}/shadow/{backup_name}/store' + self.wait_for_path_exists(path, 10) + command = ['find', path, '-type', 'f', + '-exec', 'grep', '-o', 'r[01]\\{64\\}-file-[[:lower:]]\\{32\\}', '{}', ';'] + return self.exec_in_container(command).split('\n') + class ClickHouseKiller(object): def __init__(self, clickhouse_node): diff --git a/tests/integration/runner b/tests/integration/runner index 4d01b9737d1..3687ca4068c 100755 --- a/tests/integration/runner +++ b/tests/integration/runner @@ -226,6 +226,8 @@ if __name__ == "__main__": [image, tag] = img_tag.split(":") if image == "clickhouse/mysql-golang-client": env_tags += "-e {}={} ".format("DOCKER_MYSQL_GOLANG_CLIENT_TAG", tag) + elif image == "clickhouse/dotnet-client": + env_tags += "-e {}={} ".format("DOCKER_DOTNET_CLIENT_TAG", tag) elif image == "clickhouse/mysql-java-client": env_tags += "-e {}={} ".format("DOCKER_MYSQL_JAVA_CLIENT_TAG", tag) elif image == "clickhouse/mysql-js-client": @@ -237,7 +239,7 @@ if __name__ == "__main__": elif image == "clickhouse/integration-test": env_tags += "-e {}={} ".format("DOCKER_BASE_TAG", tag) elif image == "clickhouse/kerberos-kdc": - env_tags += "-e {}={}".format("DOCKER_KERBEROS_KDC_TAG", tag) + env_tags += "-e {}={} ".format("DOCKER_KERBEROS_KDC_TAG", tag) else: logging.info("Unknown image %s" % (image)) diff --git a/tests/integration/test_cluster_discovery/__init__.py b/tests/integration/test_cluster_discovery/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_cluster_discovery/config/config.xml b/tests/integration/test_cluster_discovery/config/config.xml new file mode 100644 index 00000000000..70cb010fe0e --- /dev/null +++ b/tests/integration/test_cluster_discovery/config/config.xml @@ -0,0 +1,23 @@ + + 1 + + + + /clickhouse/discovery/test_auto_cluster + + + + + + + node1 + 9000 + + + node2 + 9000 + + + + + diff --git a/tests/integration/test_cluster_discovery/config/config_shard1.xml b/tests/integration/test_cluster_discovery/config/config_shard1.xml new file mode 100644 index 00000000000..06a77a37263 --- /dev/null +++ b/tests/integration/test_cluster_discovery/config/config_shard1.xml @@ -0,0 +1,24 @@ + + 1 + + + + /clickhouse/discovery/test_auto_cluster + 1 + + + + + + + node1 + 9000 + + + node2 + 9000 + + + + + diff 
--git a/tests/integration/test_cluster_discovery/config/config_shard3.xml b/tests/integration/test_cluster_discovery/config/config_shard3.xml new file mode 100644 index 00000000000..ab66fdc2ab7 --- /dev/null +++ b/tests/integration/test_cluster_discovery/config/config_shard3.xml @@ -0,0 +1,24 @@ + + 1 + + + + /clickhouse/discovery/test_auto_cluster + 3 + + + + + + + node1 + 9000 + + + node2 + 9000 + + + + + diff --git a/tests/integration/test_cluster_discovery/test.py b/tests/integration/test_cluster_discovery/test.py new file mode 100644 index 00000000000..acddd855040 --- /dev/null +++ b/tests/integration/test_cluster_discovery/test.py @@ -0,0 +1,81 @@ +import pytest + +import functools +import time + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) + +shard_configs = { + i: f'config/config_shard{i}.xml' + for i in [1, 3] +} + +nodes = [ + cluster.add_instance( + f'node{i}', + main_configs=[shard_configs.get(i, 'config/config.xml')], + stay_alive=True, + with_zookeeper=True + ) for i in range(5) +] + + +@pytest.fixture(scope="module") +def start_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def check_on_cluster(nodes, expected, *, what, cluster_name='test_auto_cluster', msg=None, retries=5): + """ + Select data from `system.clusters` on specified nodes and check the result + """ + assert 1 <= retries <= 6 + + for retry in range(1, retries + 1): + nodes_res = { + node.name: int(node.query(f"SELECT {what} FROM system.clusters WHERE cluster = '{cluster_name}'")) + for node in nodes + } + if all(actual == expected for actual in nodes_res.values()): + break + + if retry != retries: + time.sleep(2 ** retry) + else: + msg = msg or f"Wrong '{what}' result" + raise Exception(f'{msg}: {nodes_res}, expected: {expected} (after {retries} retries)') + + +def test_cluster_discovery_startup_and_stop(start_cluster): + """ + Start cluster, check nodes count in system.clusters, + then stop/start some nodes and check that it (dis)appeared in cluster. 
+ """ + + check_nodes_count = functools.partial(check_on_cluster, what='count()', msg='Wrong nodes count in cluster') + check_shard_num = functools.partial(check_on_cluster, what='count(DISTINCT shard_num)', msg='Wrong shard_num count in cluster') + + total_shards = len(shard_configs) + 1 + check_nodes_count([nodes[0], nodes[2]], len(nodes)) + check_shard_num([nodes[0], nodes[2]], total_shards) + + nodes[1].stop_clickhouse(kill=True) + check_nodes_count([nodes[0], nodes[2]], len(nodes) - 1) + check_shard_num([nodes[0], nodes[2]], total_shards - 1) + + nodes[3].stop_clickhouse() + check_nodes_count([nodes[0], nodes[2]], len(nodes) - 2) + + nodes[1].start_clickhouse() + check_nodes_count([nodes[0], nodes[2]], len(nodes) - 1) + + nodes[3].start_clickhouse() + check_nodes_count([nodes[0], nodes[2]], len(nodes)) + + check_nodes_count([nodes[1], nodes[2]], 2, cluster_name='two_shards', retries=1) diff --git a/tests/integration/test_concurrent_queries_restriction_by_query_kind/__init__.py b/tests/integration/test_concurrent_queries_restriction_by_query_kind/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_concurrent_queries_restriction_by_query_kind/configs/concurrent_insert_restriction.xml b/tests/integration/test_concurrent_queries_restriction_by_query_kind/configs/concurrent_insert_restriction.xml new file mode 100644 index 00000000000..7753c579902 --- /dev/null +++ b/tests/integration/test_concurrent_queries_restriction_by_query_kind/configs/concurrent_insert_restriction.xml @@ -0,0 +1,3 @@ + + 2 + diff --git a/tests/integration/test_concurrent_queries_restriction_by_query_kind/configs/concurrent_select_restriction.xml b/tests/integration/test_concurrent_queries_restriction_by_query_kind/configs/concurrent_select_restriction.xml new file mode 100644 index 00000000000..c8f081e6804 --- /dev/null +++ b/tests/integration/test_concurrent_queries_restriction_by_query_kind/configs/concurrent_select_restriction.xml @@ -0,0 +1,3 @@ + + 2 + diff --git a/tests/integration/test_concurrent_queries_restriction_by_query_kind/test.py b/tests/integration/test_concurrent_queries_restriction_by_query_kind/test.py new file mode 100644 index 00000000000..2d16d9157f6 --- /dev/null +++ b/tests/integration/test_concurrent_queries_restriction_by_query_kind/test.py @@ -0,0 +1,77 @@ +import time +from multiprocessing.dummy import Pool + +import pytest +from helpers.cluster import ClickHouseCluster + + +cluster = ClickHouseCluster(__file__) +node_insert = cluster.add_instance('node_insert', main_configs=['configs/concurrent_insert_restriction.xml']) +node_select = cluster.add_instance('node_select', main_configs=['configs/concurrent_select_restriction.xml']) + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + node_select.query("create table test_concurrent_insert (x UInt64) ENGINE = MergeTree() order by tuple()") + node_insert.query("create table test_concurrent_insert (x UInt64) ENGINE = MergeTree() order by tuple()") + yield cluster + finally: + cluster.shutdown() + + +def execute_with_background(node, sql, background_sql, background_times, wait_times=3): + r = None + for _ in range(wait_times): + r = node.query('show processlist', stdin='') + if not r.strip(): + break + time.sleep(1) + else: + assert False, "there are unknown background queries: {}".format(r) + for _ in range(background_times): + node.get_query_request(background_sql, stdin='') + time.sleep(0.5) # wait background to start. 
+ return node.query(sql, stdin='') + + +def common_pattern(node, query_kind, restricted_sql, normal_sql, limit, wait_times): + # restriction is working + with pytest.raises(Exception, match=r".*Too many simultaneous {} queries.*".format(query_kind)): + execute_with_background(node, restricted_sql, restricted_sql, limit, wait_times) + + # different query kind is independent + execute_with_background(node, normal_sql, restricted_sql, limit, wait_times) + + # normal + execute_with_background(node, restricted_sql, '', 0, wait_times) + + +def test_select(started_cluster): + common_pattern( + node_select, 'select', + 'select sleep(3)', + 'insert into test_concurrent_insert values (0)', + 2, + 10 + ) + + # subquery is not counted + execute_with_background( + node_select, + 'select sleep(3)', + 'insert into test_concurrent_insert select sleep(3)', + 2, + 10 + ) + + +def test_insert(started_cluster): + common_pattern( + node_insert, 'insert', + 'insert into test_concurrent_insert select sleep(3)', + 'select 1', + 2, + 10 + ) diff --git a/tests/integration/test_config_xml_full/configs/config.xml b/tests/integration/test_config_xml_full/configs/config.xml index c277ff7341f..76eceedbcea 100644 --- a/tests/integration/test_config_xml_full/configs/config.xml +++ b/tests/integration/test_config_xml_full/configs/config.xml @@ -639,6 +639,24 @@ + + + + localhost + 9440 + + + + + + + + localhost + 9440 + + + + diff --git a/tests/integration/test_config_yaml_full/configs/config.yaml b/tests/integration/test_config_yaml_full/configs/config.yaml index 5958d463d21..21cf439f7ec 100644 --- a/tests/integration/test_config_yaml_full/configs/config.yaml +++ b/tests/integration/test_config_yaml_full/configs/config.yaml @@ -100,6 +100,12 @@ remote_servers: host: localhost port: 9440 secure: 1 + test_shard_localhost_secure_empty_tag: + shard: + replica: + host: localhost + port: 9440 + secure: test_unavailable_shard: shard: - replica: diff --git a/tests/integration/test_dictionaries_postgresql/test.py b/tests/integration/test_dictionaries_postgresql/test.py index 8869e9112d1..ce295e11586 100644 --- a/tests/integration/test_dictionaries_postgresql/test.py +++ b/tests/integration/test_dictionaries_postgresql/test.py @@ -369,6 +369,29 @@ def test_predefined_connection_configuration(started_cluster): assert(int(result.strip()) == 99) +def test_bad_configuration(started_cluster): + conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True) + cursor = conn.cursor() + + node1.query(''' + DROP DICTIONARY IF EXISTS postgres_dict; + CREATE DICTIONARY postgres_dict (id UInt32, value UInt32) + PRIMARY KEY id + SOURCE(POSTGRESQL( + port 5432 + host 'postgres1' + user 'postgres' + password 'mysecretpassword' + dbbb 'clickhouse' + table 'test_schema.test_table')) + LIFETIME(MIN 1 MAX 2) + LAYOUT(HASHED()); + ''') + + node1.query_and_get_error("SELECT dictGetUInt32(postgres_dict, 'value', toUInt64(1))") + assert node1.contains_in_log('Unexpected key `dbbb`') + + if __name__ == '__main__': cluster.start() input("Cluster created, press any key to destroy...") diff --git a/tests/integration/test_dictionaries_update_and_reload/test.py b/tests/integration/test_dictionaries_update_and_reload/test.py index 8e375b7b327..9bee5db8ce1 100644 --- a/tests/integration/test_dictionaries_update_and_reload/test.py +++ b/tests/integration/test_dictionaries_update_and_reload/test.py @@ -203,7 +203,7 @@ def test_reload_after_fail_by_timer(started_cluster): 
instance.copy_file_to_container(os.path.join(SCRIPT_DIR, "configs/dictionaries/file.txt"), "/etc/clickhouse-server/dictionaries/no_file_2.txt") # Check that file appears in container and wait if needed. - while not instance.file_exists("/etc/clickhouse-server/dictionaries/no_file_2.txt"): + while not instance.path_exists("/etc/clickhouse-server/dictionaries/no_file_2.txt"): time.sleep(1) assert("9\t10\n" == instance.exec_in_container(["cat", "/etc/clickhouse-server/dictionaries/no_file_2.txt"])) instance.query("SYSTEM RELOAD DICTIONARY no_file_2") diff --git a/tests/integration/test_dotnet_client/__init__.py b/tests/integration/test_dotnet_client/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_dotnet_client/configs/config.xml b/tests/integration/test_dotnet_client/configs/config.xml new file mode 100644 index 00000000000..9bcadc43f10 --- /dev/null +++ b/tests/integration/test_dotnet_client/configs/config.xml @@ -0,0 +1,16 @@ + + + + trace + /var/log/clickhouse-server/clickhouse-server.log + /var/log/clickhouse-server/clickhouse-server.err.log + 1000M + 10 + + + 8123 + 127.0.0.1 + + ./clickhouse/ + users.xml + diff --git a/tests/integration/test_dotnet_client/configs/users.xml b/tests/integration/test_dotnet_client/configs/users.xml new file mode 100644 index 00000000000..1874371871a --- /dev/null +++ b/tests/integration/test_dotnet_client/configs/users.xml @@ -0,0 +1,32 @@ + + + + + + + + + + 123 + + ::/0 + + default + default + + + + + + ::/0 + + default + default + + + + + + + + diff --git a/tests/integration/test_dotnet_client/dotnet.reference b/tests/integration/test_dotnet_client/dotnet.reference new file mode 100644 index 00000000000..a3d6e1d5ba8 Binary files /dev/null and b/tests/integration/test_dotnet_client/dotnet.reference differ diff --git a/tests/integration/test_dotnet_client/test.py b/tests/integration/test_dotnet_client/test.py new file mode 100644 index 00000000000..4cc16ac826e --- /dev/null +++ b/tests/integration/test_dotnet_client/test.py @@ -0,0 +1,47 @@ +# coding: utf-8 + +import datetime +import math +import os +import time + +import logging +import docker +import pytest +from docker.models.containers import Container +from helpers.cluster import ClickHouseCluster, get_docker_compose_path, run_and_check + +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +DOCKER_COMPOSE_PATH = get_docker_compose_path() + +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance('node', + user_configs=["configs/users.xml"], env_variables={'UBSAN_OPTIONS': 'print_stacktrace=1'}) + +@pytest.fixture(scope="module") +def started_cluster(): + cluster.start() + try: + yield cluster + finally: + cluster.shutdown() + + +@pytest.fixture(scope='module') +def dotnet_container(): + docker_compose = os.path.join(DOCKER_COMPOSE_PATH, 'docker_compose_dotnet_client.yml') + run_and_check( + ['docker-compose', '-p', cluster.project_name, '-f', docker_compose, 'up', '--no-recreate', '-d', '--no-build']) + yield docker.from_env().containers.get(cluster.project_name + '_dotnet1_1') + + +def test_dotnet_client(started_cluster, dotnet_container): + with open(os.path.join(SCRIPT_DIR, 'dotnet.reference'), 'rb') as fp: + reference = fp.read() + + code, (stdout, stderr) = dotnet_container.exec_run( + 'dotnet run --host {host} --port {port} --user default --password 123 --database default' + .format(host=started_cluster.get_instance_ip('node'), port=8123), demux=True) + + assert code == 0 + assert stdout == reference diff --git 
a/tests/integration/test_executable_dictionary/__init__.py b/tests/integration/test_executable_dictionary/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_executable_dictionary/config/dictionaries_config.xml b/tests/integration/test_executable_dictionary/config/dictionaries_config.xml new file mode 100644 index 00000000000..3cbf717bb67 --- /dev/null +++ b/tests/integration/test_executable_dictionary/config/dictionaries_config.xml @@ -0,0 +1,2 @@ + + diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_input_argument_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_input_argument_python_dictionary.xml new file mode 100644 index 00000000000..ddbb8e95abb --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_input_argument_python_dictionary.xml @@ -0,0 +1,99 @@ + + + executable_input_argument_python + + + TabSeparated + input_argument.py 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_input_argument_pool_python + + + TabSeparated + input_argument.py 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_argument_python + + + TabSeparated + input_implicit_argument.py 1 + 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_argument_pool_python + + + TabSeparated + input_implicit_argument.py 1 + 1 + 1 + + + + + + + + input + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_input_bash_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_input_bash_dictionary.xml new file mode 100644 index 00000000000..488a12de115 --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_input_bash_dictionary.xml @@ -0,0 +1,99 @@ + + + executable_input_bash + + + TabSeparated + input.sh + 1 + + + + + + + + input + + + result + String + + + + + + + executable_input_pool_bash + + + TabSeparated + input.sh + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_bash + + + TabSeparated + input_implicit.sh + 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_pool_bash + + + TabSeparated + input_implicit.sh + 1 + 1 + + + + + + + + input + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_input_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_input_python_dictionary.xml new file mode 100644 index 00000000000..5b551e51951 --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_input_python_dictionary.xml @@ -0,0 +1,99 @@ + + + executable_input_python + + + TabSeparated + input.py + 1 + + + + + + + + input + + + result + String + + + + + + + executable_input_pool_python + + + TabSeparated + input.py + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_python + + + TabSeparated + input_implicit.py + 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_pool_python + + + TabSeparated + input_implicit.py + 1 + 1 + + + + + + + + input + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_input_send_chunk_header_python_dictionary.xml 
b/tests/integration/test_executable_dictionary/dictionaries/executable_input_send_chunk_header_python_dictionary.xml new file mode 100644 index 00000000000..816cb0db2c5 --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_input_send_chunk_header_python_dictionary.xml @@ -0,0 +1,103 @@ + + + executable_input_send_chunk_header_python + + + TabSeparated + input_chunk_header.py + 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_input_send_chunk_header_pool_python + + + TabSeparated + input_chunk_header.py + 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_send_chunk_header_python + + + TabSeparated + input_implicit_chunk_header.py + 1 + 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_send_chunk_header_pool_python + + + TabSeparated + input_implicit_chunk_header.py + 1 + 1 + 1 + + + + + + + + input + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_input_signalled_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_input_signalled_python_dictionary.xml new file mode 100644 index 00000000000..71f8873b20e --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_input_signalled_python_dictionary.xml @@ -0,0 +1,103 @@ + + + executable_input_signalled_python + + + TabSeparated + input_signalled.py + 1 + 1000 + + + + + + + + input + + + result + String + Default result + + + + + + executable_input_signalled_pool_python + + + TabSeparated + input_signalled.py + 1 + 1000 + + + + + + + + input + + + result + String + Default result + + + + + + executable_implicit_input_signalled_python + + + TabSeparated + input_implicit_signalled.py + 1 + 1 + 1000 + + + + + + + + input + + + result + String + Default result + + + + + + executable_implicit_input_signalled_pool_python + + + TabSeparated + input_implicit_signalled.py + 1 + 1 + 1000 + + + + + + + + input + + + result + String + Default result + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_input_slow_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_input_slow_python_dictionary.xml new file mode 100644 index 00000000000..dee161a9b78 --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_input_slow_python_dictionary.xml @@ -0,0 +1,103 @@ + + + executable_input_slow_python + + + TabSeparated + input_slow.py + 1 + 1000 + + + + + + + + input + + + result + String + + + + + + + executable_input_slow_pool_python + + + TabSeparated + input_slow.py + 1 + 1000 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_slow_python + + + TabSeparated + input_implicit_slow.py + 1 + 1 + 1000 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_slow_pool_python + + + TabSeparated + input_implicit_slow.py + 1 + 1 + 1000 + + + + + + + + input + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_input_sum_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_input_sum_python_dictionary.xml new file mode 100644 index 00000000000..3f63e7b8671 --- /dev/null +++ 
b/tests/integration/test_executable_dictionary/dictionaries/executable_input_sum_python_dictionary.xml @@ -0,0 +1,128 @@ + + + executable_input_sum_python + + + TabSeparated + input_sum.py + 1 + + + + + + + + + first_argument + UInt64 + + + second_argument + UInt64 + + + + result + UInt64 + + + + + + + executable_input_sum_pool_python + + + TabSeparated + input_sum.py + 1 + + + + + + + + + first_argument + UInt64 + + + second_argument + UInt64 + + + + result + UInt64 + + + + + + + + executable_implicit_input_sum_python + + + TabSeparated + input_implicit_sum.py + 1 + 1 + + + + + + + + + first_argument + UInt64 + + + second_argument + UInt64 + + + + result + UInt64 + + + + + + + executable_implicit_input_sum_pool_python + + + TabSeparated + input_implicit_sum.py + 1 + 1 + + + + + + + + + first_argument + UInt64 + + + second_argument + UInt64 + + + + result + UInt64 + + + + + diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_non_direct_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_non_direct_dictionary.xml new file mode 100644 index 00000000000..3f77dae1ac6 --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_non_direct_dictionary.xml @@ -0,0 +1,95 @@ + + + executable_input_non_direct_bash + + + TabSeparated + while read read_data; do printf "$read_data\tKey $read_data\n"; done + + + + + + + + input + + + result + String + + + + + + + executable_input_non_direct_pool_bash + + + TabSeparated + while read read_data; do printf "$read_data\tKey $read_data\n"; done + + + + + + + + input + + + result + String + + + + + + + executable_input_implicit_non_direct_bash + + + TabSeparated + while read read_data; do printf "Key $read_data\n"; done + 1 + + + + + + + + input + + + result + String + + + + + + + executable_input_implicit_non_direct_pool_bash + + + TabSeparated + while read read_data; do printf "Key $read_data\n"; done + 1 + + + + + + + + input + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_source_argument_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_source_argument_python_dictionary.xml new file mode 100644 index 00000000000..3173eb5500d --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_source_argument_python_dictionary.xml @@ -0,0 +1,54 @@ + + + executable_source_simple_key_argument_python + + + TabSeparated + source_argument.py 1 + 1 + + + + + + 0 + + + input + + + result + String + + + + + + + executable_source_complex_key_argument_python + + + TabSeparated + source_argument.py 1 + 1 + + + + + + 0 + + + + input + UInt64 + + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_source_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_source_python_dictionary.xml new file mode 100644 index 00000000000..a2036fc67bb --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_source_python_dictionary.xml @@ -0,0 +1,54 @@ + + + executable_source_simple_key_python + + + TabSeparated + source.py + 1 + + + + + + 0 + + + input + + + result + String + + + + + + + executable_source_complex_key_python + + + TabSeparated + source.py + 1 + + + + + + 0 + + + + input + UInt64 + + + + result + String + + + + + \ No newline at end of file diff --git 
a/tests/integration/test_executable_dictionary/dictionaries/executable_source_updated_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_source_updated_python_dictionary.xml new file mode 100644 index 00000000000..10d1b1ca0c6 --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_source_updated_python_dictionary.xml @@ -0,0 +1,56 @@ + + + executable_source_simple_key_update_python + + + TabSeparated + source_update.py + 1 + 1 + + + + + + 5 + + + input + + + result + String + + + + + + + executable_source_complex_key_update_python + + + TabSeparated + source_update.py + 1 + 1 + + + + + + 5 + + + + input + UInt64 + + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/test.py b/tests/integration/test_executable_dictionary/test.py new file mode 100644 index 00000000000..5e50a092a29 --- /dev/null +++ b/tests/integration/test_executable_dictionary/test.py @@ -0,0 +1,175 @@ +import os +import sys +import time + +import pytest + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance('node', stay_alive=True, main_configs=[]) + + +def skip_test_msan(instance): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer cannot work with vfork") + +def copy_file_to_container(local_path, dist_path, container_id): + os.system("docker cp {local} {cont_id}:{dist}".format(local=local_path, cont_id=container_id, dist=dist_path)) + +config = ''' + /etc/clickhouse-server/dictionaries/*_dictionary.xml +''' + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + node.replace_config("/etc/clickhouse-server/config.d/dictionaries_config.xml", config) + + copy_file_to_container(os.path.join(SCRIPT_DIR, 'dictionaries/.'), '/etc/clickhouse-server/dictionaries', node.docker_id) + copy_file_to_container(os.path.join(SCRIPT_DIR, 'user_scripts/.'), '/var/lib/clickhouse/user_scripts', node.docker_id) + + node.restart_clickhouse() + + yield cluster + + finally: + cluster.shutdown() + +def test_executable_input_bash(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_bash', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_input_pool_bash', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_implicit_input_bash(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_implicit_input_bash', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_implicit_input_pool_bash', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_input_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_python', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_input_pool_python', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_implicit_input_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_implicit_input_python', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_implicit_input_pool_python', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_input_send_chunk_header_python(started_cluster): + skip_test_msan(node) + 
assert node.query("SELECT dictGet('executable_input_send_chunk_header_python', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_input_send_chunk_header_pool_python', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_implicit_input_send_chunk_header_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_implicit_input_send_chunk_header_python', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_implicit_input_send_chunk_header_pool_python', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_input_sum_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_sum_python', 'result', tuple(toUInt64(1), toUInt64(1)))") == '2\n' + assert node.query("SELECT dictGet('executable_input_sum_pool_python', 'result', tuple(toUInt64(1), toUInt64(1)))") == '2\n' + +def test_executable_implicit_input_sum_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_implicit_input_sum_python', 'result', tuple(toUInt64(1), toUInt64(1)))") == '2\n' + assert node.query("SELECT dictGet('executable_implicit_input_sum_pool_python', 'result', tuple(toUInt64(1), toUInt64(1)))") == '2\n' + +def test_executable_input_argument_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_argument_python', 'result', toUInt64(1))") == 'Key 1 1\n' + assert node.query("SELECT dictGet('executable_input_argument_pool_python', 'result', toUInt64(1))") == 'Key 1 1\n' + +def test_executable_implicit_input_argument_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_implicit_input_argument_python', 'result', toUInt64(1))") == 'Key 1 1\n' + assert node.query("SELECT dictGet('executable_implicit_input_argument_pool_python', 'result', toUInt64(1))") == 'Key 1 1\n' + +def test_executable_input_signalled_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_signalled_python', 'result', toUInt64(1))") == 'Default result\n' + assert node.query("SELECT dictGet('executable_input_signalled_pool_python', 'result', toUInt64(1))") == 'Default result\n' + +def test_executable_implicit_input_signalled_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_implicit_input_signalled_python', 'result', toUInt64(1))") == 'Default result\n' + assert node.query("SELECT dictGet('executable_implicit_input_signalled_pool_python', 'result', toUInt64(1))") == 'Default result\n' + +def test_executable_input_slow_python(started_cluster): + skip_test_msan(node) + assert node.query_and_get_error("SELECT dictGet('executable_input_slow_python', 'result', toUInt64(1))") + assert node.query_and_get_error("SELECT dictGet('executable_input_slow_pool_python', 'result', toUInt64(1))") + +def test_executable_implicit_input_slow_python(started_cluster): + skip_test_msan(node) + assert node.query_and_get_error("SELECT dictGet('executable_implicit_input_slow_python', 'result', toUInt64(1))") + assert node.query_and_get_error("SELECT dictGet('executable_implicit_input_slow_pool_python', 'result', toUInt64(1))") + +def test_executable_input_slow_python(started_cluster): + skip_test_msan(node) + assert node.query_and_get_error("SELECT dictGet('executable_input_slow_python', 'result', toUInt64(1))") + assert node.query_and_get_error("SELECT dictGet('executable_input_slow_pool_python', 'result', 
toUInt64(1))") + +def test_executable_implicit_input_slow_python(started_cluster): + skip_test_msan(node) + assert node.query_and_get_error("SELECT dictGet('executable_implicit_input_slow_python', 'result', toUInt64(1))") + assert node.query_and_get_error("SELECT dictGet('executable_implicit_input_slow_pool_python', 'result', toUInt64(1))") + +def test_executable_non_direct_input_bash(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_non_direct_bash', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_input_non_direct_pool_bash', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_implicit_non_direct_input_bash(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_implicit_non_direct_bash', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_input_implicit_non_direct_pool_bash', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_source_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT * FROM dictionary(executable_source_simple_key_python) ORDER BY input") == '1\tValue 1\n2\tValue 2\n3\tValue 3\n' + assert node.query("SELECT dictGet('executable_source_simple_key_python', 'result', toUInt64(1))") == 'Value 1\n' + assert node.query("SELECT dictGet('executable_source_simple_key_python', 'result', toUInt64(2))") == 'Value 2\n' + assert node.query("SELECT dictGet('executable_source_simple_key_python', 'result', toUInt64(3))") == 'Value 3\n' + + assert node.query("SELECT * FROM dictionary('executable_source_complex_key_python') ORDER BY input") == '1\tValue 1\n2\tValue 2\n3\tValue 3\n' + assert node.query("SELECT dictGet('executable_source_complex_key_python', 'result', tuple(toUInt64(1)))") == 'Value 1\n' + assert node.query("SELECT dictGet('executable_source_complex_key_python', 'result', tuple(toUInt64(2)))") == 'Value 2\n' + assert node.query("SELECT dictGet('executable_source_complex_key_python', 'result', tuple(toUInt64(3)))") == 'Value 3\n' + +def test_executable_source_argument_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT * FROM dictionary(executable_source_simple_key_argument_python) ORDER BY input") == '1\tValue 1 1\n2\tValue 1 2\n3\tValue 1 3\n' + assert node.query("SELECT dictGet('executable_source_simple_key_argument_python', 'result', toUInt64(1))") == 'Value 1 1\n' + assert node.query("SELECT dictGet('executable_source_simple_key_argument_python', 'result', toUInt64(2))") == 'Value 1 2\n' + assert node.query("SELECT dictGet('executable_source_simple_key_argument_python', 'result', toUInt64(3))") == 'Value 1 3\n' + + assert node.query("SELECT * FROM dictionary(executable_source_complex_key_argument_python) ORDER BY input") == '1\tValue 1 1\n2\tValue 1 2\n3\tValue 1 3\n' + assert node.query("SELECT dictGet('executable_source_complex_key_argument_python', 'result', toUInt64(1))") == 'Value 1 1\n' + assert node.query("SELECT dictGet('executable_source_complex_key_argument_python', 'result', toUInt64(2))") == 'Value 1 2\n' + assert node.query("SELECT dictGet('executable_source_complex_key_argument_python', 'result', toUInt64(3))") == 'Value 1 3\n' + +def test_executable_source_updated_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT * FROM dictionary(executable_source_simple_key_update_python) ORDER BY input") == '1\tValue 0 1\n' + assert node.query("SELECT dictGet('executable_source_simple_key_update_python', 'result', 
toUInt64(1))") == 'Value 0 1\n' + + time.sleep(10) + + assert node.query("SELECT * FROM dictionary(executable_source_simple_key_update_python) ORDER BY input") == '1\tValue 1 1\n' + assert node.query("SELECT dictGet('executable_source_simple_key_update_python', 'result', toUInt64(1))") == 'Value 1 1\n' + + assert node.query("SELECT * FROM dictionary(executable_source_complex_key_update_python) ORDER BY input") == '1\tValue 0 1\n' + assert node.query("SELECT dictGet('executable_source_complex_key_update_python', 'result', toUInt64(1))") == 'Value 0 1\n' + + time.sleep(10) + + assert node.query("SELECT * FROM dictionary(executable_source_complex_key_update_python) ORDER BY input") == '1\tValue 1 1\n' + assert node.query("SELECT dictGet('executable_source_complex_key_update_python', 'result', toUInt64(1))") == 'Value 1 1\n' + diff --git a/tests/integration/test_executable_dictionary/user_scripts/input.py b/tests/integration/test_executable_dictionary/user_scripts/input.py new file mode 100755 index 00000000000..e711dd8e306 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input.py @@ -0,0 +1,11 @@ +#!/usr/bin/python3 + +import sys +import os +import signal + +if __name__ == '__main__': + for line in sys.stdin: + updated_line = line.replace('\n', '') + print(updated_line + '\t' + "Key " + updated_line, end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input.sh b/tests/integration/test_executable_dictionary/user_scripts/input.sh new file mode 100755 index 00000000000..7712c392951 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +while read read_data; + do printf "$read_data\tKey $read_data\n"; +done diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_argument.py b/tests/integration/test_executable_dictionary/user_scripts/input_argument.py new file mode 100755 index 00000000000..163f9c4183f --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_argument.py @@ -0,0 +1,11 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + arg = int(sys.argv[1]) + + for line in sys.stdin: + updated_line = line.replace('\n', '') + print(updated_line + '\t' + "Key " + str(arg) + " " + updated_line, end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_chunk_header.py b/tests/integration/test_executable_dictionary/user_scripts/input_chunk_header.py new file mode 100755 index 00000000000..4eb00f64eb3 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_chunk_header.py @@ -0,0 +1,15 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + + while chunk_length != 0: + line = sys.stdin.readline() + updated_line = line.replace('\n', '') + chunk_length -= 1 + print(updated_line + '\t' + "Key " + updated_line, end='\n') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_implicit.py b/tests/integration/test_executable_dictionary/user_scripts/input_implicit.py new file mode 100755 index 00000000000..835ab1f441a --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_implicit.py @@ -0,0 +1,8 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for line in sys.stdin: + print("Key " + line, end='') + sys.stdout.flush() diff --git 
a/tests/integration/test_executable_dictionary/user_scripts/input_implicit.sh b/tests/integration/test_executable_dictionary/user_scripts/input_implicit.sh new file mode 100755 index 00000000000..aea51b82b1f --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_implicit.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +while read read_data; + do printf "Key $read_data\n"; +done diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_implicit_argument.py b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_argument.py new file mode 100755 index 00000000000..c1b2e5966d7 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_argument.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + arg = int(sys.argv[1]) + + for line in sys.stdin: + print("Key " + str(arg) + " " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_implicit_chunk_header.py b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_chunk_header.py new file mode 100755 index 00000000000..5dc03e1c507 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_chunk_header.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_implicit_signalled.py b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_signalled.py new file mode 100755 index 00000000000..27c8bc4840e --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_signalled.py @@ -0,0 +1,13 @@ +#!/usr/bin/python3 + +import sys +import os +import signal +import time + +if __name__ == '__main__': + for line in sys.stdin: + os.kill(os.getpid(), signal.SIGTERM) + + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_implicit_slow.py b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_slow.py new file mode 100755 index 00000000000..648a9eac918 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_slow.py @@ -0,0 +1,12 @@ +#!/usr/bin/python3 + +import sys +import os +import signal +import time + +if __name__ == '__main__': + for line in sys.stdin: + time.sleep(5) + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_implicit_sum.py b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_sum.py new file mode 100755 index 00000000000..432d7a13a2f --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_sum.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys +import re + +if __name__ == '__main__': + for line in sys.stdin: + line_split = re.split(r'\t+', line) + print(int(line_split[0]) + int(line_split[1]), end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_signalled.py b/tests/integration/test_executable_dictionary/user_scripts/input_signalled.py new file mode 100755 index 00000000000..a3a99f1e71e --- /dev/null +++ 
b/tests/integration/test_executable_dictionary/user_scripts/input_signalled.py @@ -0,0 +1,13 @@ +#!/usr/bin/python3 + +import sys +import os +import signal +import time + +if __name__ == '__main__': + for line in sys.stdin: + os.kill(os.getpid(), signal.SIGTERM) + updated_line = line.replace('\n', '') + print(updated_line + '\t' + "Key " + updated_line, end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_slow.py b/tests/integration/test_executable_dictionary/user_scripts/input_slow.py new file mode 100755 index 00000000000..a3b8c484b29 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_slow.py @@ -0,0 +1,13 @@ +#!/usr/bin/python3 + +import sys +import os +import signal +import time + +if __name__ == '__main__': + for line in sys.stdin: + time.sleep(5) + updated_line = line.replace('\n', '') + print(updated_line + '\t' + "Key " + updated_line, end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_sum.py b/tests/integration/test_executable_dictionary/user_scripts/input_sum.py new file mode 100755 index 00000000000..e9ec5028701 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_sum.py @@ -0,0 +1,12 @@ +#!/usr/bin/python3 + +import sys +import re + +if __name__ == '__main__': + for line in sys.stdin: + updated_line = line.replace('\n', '') + line_split = re.split(r'\t+', line) + sum = int(line_split[0]) + int(line_split[1]) + print(updated_line + '\t' + str(sum), end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/source.py b/tests/integration/test_executable_dictionary/user_scripts/source.py new file mode 100755 index 00000000000..e105773c467 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/source.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + print('1' + '\t' + 'Value 1', end='\n') + print('2' + '\t' + 'Value 2', end='\n') + print('3' + '\t' + 'Value 3', end='\n') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/source_argument.py b/tests/integration/test_executable_dictionary/user_scripts/source_argument.py new file mode 100755 index 00000000000..881e73adc97 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/source_argument.py @@ -0,0 +1,12 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + arg = int(sys.argv[1]) + + print('1' + '\t' + 'Value ' + str(arg) + ' 1', end='\n') + print('2' + '\t' + 'Value ' + str(arg) + ' 2', end='\n') + print('3' + '\t' + 'Value ' + str(arg) + ' 3', end='\n') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/source_update.py b/tests/integration/test_executable_dictionary/user_scripts/source_update.py new file mode 100755 index 00000000000..99388f9ada3 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/source_update.py @@ -0,0 +1,12 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + update_field_value = 0 + + if len(sys.argv) >= 2: + update_field_value = int(sys.argv[1]) + + print('1' + '\t' + 'Value ' + str(update_field_value) + ' 1', end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/test.py b/tests/integration/test_executable_table_function/test.py index f5537e26b94..7820396d20f 100644 --- a/tests/integration/test_executable_table_function/test.py +++ 
b/tests/integration/test_executable_table_function/test.py @@ -1,6 +1,5 @@ import os import sys -import time import pytest @@ -30,69 +29,353 @@ def started_cluster(): copy_file_to_container(os.path.join(SCRIPT_DIR, 'user_scripts/.'), '/var/lib/clickhouse/user_scripts', node.docker_id) node.restart_clickhouse() + node.query("CREATE TABLE test_data_table (id UInt64) ENGINE=TinyLog;") + node.query("INSERT INTO test_data_table VALUES (0), (1), (2);") + yield cluster finally: cluster.shutdown() -def test_executable_function_no_input(started_cluster): +def test_executable_function_no_input_bash(started_cluster): skip_test_msan(node) - assert node.query("SELECT * FROM executable('test_no_input.sh', 'TabSeparated', 'value UInt64')") == '1\n' + assert node.query("SELECT * FROM executable('no_input.sh', 'TabSeparated', 'value String')") == 'Key 0\nKey 1\nKey 2\n' -def test_executable_function_input(started_cluster): +def test_executable_function_no_input_python(started_cluster): skip_test_msan(node) - assert node.query("SELECT * FROM executable('test_input.sh', 'TabSeparated', 'value String', (SELECT 1))") == 'Key 1\n' + assert node.query("SELECT * FROM executable('no_input.py', 'TabSeparated', 'value String')") == 'Key 0\nKey 1\nKey 2\n' -def test_executable_function_input_multiple_pipes(started_cluster): +def test_executable_function_input_bash(started_cluster): skip_test_msan(node) - actual = node.query("SELECT * FROM executable('test_input_multiple_pipes.sh', 'TabSeparated', 'value String', (SELECT 1), (SELECT 2), (SELECT 3))") + + query = "SELECT * FROM executable('input.sh', 'TabSeparated', 'value String', {source})" + assert node.query(query.format(source='(SELECT 1)')) == 'Key 1\n' + assert node.query(query.format(source='(SELECT id FROM test_data_table)')) == 'Key 0\nKey 1\nKey 2\n' + +def test_executable_function_input_python(started_cluster): + skip_test_msan(node) + + query = "SELECT * FROM executable('input.py', 'TabSeparated', 'value String', {source})" + assert node.query(query.format(source='(SELECT 1)')) == 'Key 1\n' + assert node.query(query.format(source='(SELECT id FROM test_data_table)')) == 'Key 0\nKey 1\nKey 2\n' + +def test_executable_function_input_sum_python(started_cluster): + skip_test_msan(node) + + query = "SELECT * FROM executable('input_sum.py', 'TabSeparated', 'value UInt64', {source})" + assert node.query(query.format(source='(SELECT 1, 1)')) == '2\n' + assert node.query(query.format(source='(SELECT id, id FROM test_data_table)')) == '0\n2\n4\n' + +def test_executable_function_input_argument_python(started_cluster): + skip_test_msan(node) + + query = "SELECT * FROM executable('input_argument.py 1', 'TabSeparated', 'value String', {source})" + assert node.query(query.format(source='(SELECT 1)')) == 'Key 1 1\n' + assert node.query(query.format(source='(SELECT id FROM test_data_table)')) == 'Key 1 0\nKey 1 1\nKey 1 2\n' + +def test_executable_function_input_signalled_python(started_cluster): + skip_test_msan(node) + + query = "SELECT * FROM executable('input_signalled.py', 'TabSeparated', 'value String', {source})" + assert node.query(query.format(source='(SELECT 1)')) == '' + assert node.query(query.format(source='(SELECT id FROM test_data_table)')) == '' + +def test_executable_function_input_slow_python(started_cluster): + skip_test_msan(node) + + query = "SELECT * FROM executable('input_slow.py', 'TabSeparated', 'value String', {source})" + assert node.query_and_get_error(query.format(source='(SELECT 1)')) + assert 
node.query_and_get_error(query.format(source='(SELECT id FROM test_data_table)')) + +def test_executable_function_input_multiple_pipes_python(started_cluster): + skip_test_msan(node) + query = "SELECT * FROM executable('input_multiple_pipes.py', 'TabSeparated', 'value String', {source})" + actual = node.query(query.format(source='(SELECT 1), (SELECT 2), (SELECT 3)')) expected = 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 1\n' assert actual == expected -def test_executable_function_argument(started_cluster): - skip_test_msan(node) - assert node.query("SELECT * FROM executable('test_argument.sh 1', 'TabSeparated', 'value String')") == 'Key 1\n' - -def test_executable_storage_no_input(started_cluster): - skip_test_msan(node) - node.query("DROP TABLE IF EXISTS test_table") - node.query("CREATE TABLE test_table (value UInt64) ENGINE=Executable('test_no_input.sh', 'TabSeparated')") - assert node.query("SELECT * FROM test_table") == '1\n' - node.query("DROP TABLE test_table") - -def test_executable_storage_input(started_cluster): - skip_test_msan(node) - node.query("DROP TABLE IF EXISTS test_table") - node.query("CREATE TABLE test_table (value String) ENGINE=Executable('test_input.sh', 'TabSeparated', (SELECT 1))") - assert node.query("SELECT * FROM test_table") == 'Key 1\n' - node.query("DROP TABLE test_table") - -def test_executable_storage_input_multiple_pipes(started_cluster): - skip_test_msan(node) - node.query("DROP TABLE IF EXISTS test_table") - node.query("CREATE TABLE test_table (value String) ENGINE=Executable('test_input_multiple_pipes.sh', 'TabSeparated', (SELECT 1), (SELECT 2), (SELECT 3))") - actual = node.query("SELECT * FROM test_table") - expected = 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 1\n' + actual = node.query(query.format(source='(SELECT id FROM test_data_table), (SELECT 2), (SELECT 3)')) + expected = 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 0\nKey from 0 fd 1\nKey from 0 fd 2\n' assert actual == expected - node.query("DROP TABLE test_table") -def test_executable_storage_argument(started_cluster): +def test_executable_storage_no_input_bash(started_cluster): skip_test_msan(node) node.query("DROP TABLE IF EXISTS test_table") - node.query("CREATE TABLE test_table (value String) ENGINE=Executable('test_argument.sh 1', 'TabSeparated')") + node.query("CREATE TABLE test_table (value String) ENGINE=Executable('no_input.sh', 'TabSeparated')") + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + node.query("DROP TABLE test_table") + +def test_executable_storage_no_input_python(started_cluster): + skip_test_msan(node) + node.query("DROP TABLE IF EXISTS test_table") + node.query("CREATE TABLE test_table (value String) ENGINE=Executable('no_input.py', 'TabSeparated')") + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + node.query("DROP TABLE test_table") + +def test_executable_storage_input_bash(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input.sh', 'TabSeparated', {source})" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) assert node.query("SELECT * FROM test_table") == 'Key 1\n' node.query("DROP TABLE test_table") -def test_executable_pool_storage(started_cluster): + node.query(query.format(source='(SELECT id FROM test_data_table)')) + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + node.query("DROP TABLE test_table") + +def 
test_executable_storage_input_python(started_cluster): skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input.py', 'TabSeparated', {source})" + node.query("DROP TABLE IF EXISTS test_table") - node.query("CREATE TABLE test_table (value String) ENGINE=ExecutablePool('test_input_process_pool.sh', 'TabSeparated', (SELECT 1))") + node.query(query.format(source='(SELECT 1)')) assert node.query("SELECT * FROM test_table") == 'Key 1\n' node.query("DROP TABLE test_table") -def test_executable_pool_storage_multiple_pipes(started_cluster): + node.query(query.format(source='(SELECT id FROM test_data_table)')) + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + node.query("DROP TABLE test_table") + +def test_executable_storage_input_send_chunk_header_python(started_cluster): skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input_chunk_header.py', 'TabSeparated', {source}) SETTINGS send_chunk_header=1" + node.query("DROP TABLE IF EXISTS test_table") - node.query("CREATE TABLE test_table (value String) ENGINE=ExecutablePool('test_input_process_pool_multiple_pipes.sh', 'TabSeparated', (SELECT 1), (SELECT 2), (SELECT 3))") + node.query(query.format(source='(SELECT 1)')) + assert node.query("SELECT * FROM test_table") == 'Key 1\n' + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + node.query("DROP TABLE test_table") + +def test_executable_storage_input_sum_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value UInt64) ENGINE=Executable('input_sum.py', 'TabSeparated', {source})" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1, 1)')) + assert node.query("SELECT * FROM test_table") == '2\n' + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id, id FROM test_data_table)')) + assert node.query("SELECT * FROM test_table") == '0\n2\n4\n' + node.query("DROP TABLE test_table") + +def test_executable_storage_input_argument_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input_argument.py 1', 'TabSeparated', {source})" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + assert node.query("SELECT * FROM test_table") == 'Key 1 1\n' + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + assert node.query("SELECT * FROM test_table") == 'Key 1 0\nKey 1 1\nKey 1 2\n' + node.query("DROP TABLE test_table") + +def test_executable_storage_input_signalled_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input_signalled.py', 'TabSeparated', {source})" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + assert node.query("SELECT * FROM test_table") == '' + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + assert node.query("SELECT * FROM test_table") == '' + node.query("DROP TABLE test_table") + +def test_executable_storage_input_slow_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input_slow.py', 'TabSeparated', {source}) SETTINGS command_read_timeout=2500" + 
+ node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + assert node.query_and_get_error("SELECT * FROM test_table") + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + assert node.query_and_get_error("SELECT * FROM test_table") + node.query("DROP TABLE test_table") + +def test_executable_function_input_multiple_pipes_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input_multiple_pipes.py', 'TabSeparated', {source})" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1), (SELECT 2), (SELECT 3)')) assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 1\n' node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table), (SELECT 2), (SELECT 3)')) + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 0\nKey from 0 fd 1\nKey from 0 fd 2\n' + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_input_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=ExecutablePool('input_pool.py', 'TabSeparated', {source}) SETTINGS send_chunk_header=1, pool_size=1" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + + assert node.query("SELECT * FROM test_table") == 'Key 1\n' + assert node.query("SELECT * FROM test_table") == 'Key 1\n' + assert node.query("SELECT * FROM test_table") == 'Key 1\n' + + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_input_sum_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value UInt64) ENGINE=ExecutablePool('input_sum_pool.py', 'TabSeparated', {source}) SETTINGS send_chunk_header=1, pool_size=1" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1, 1)')) + + assert node.query("SELECT * FROM test_table") == '2\n' + assert node.query("SELECT * FROM test_table") == '2\n' + assert node.query("SELECT * FROM test_table") == '2\n' + + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id, id FROM test_data_table)')) + + assert node.query("SELECT * FROM test_table") == '0\n2\n4\n' + assert node.query("SELECT * FROM test_table") == '0\n2\n4\n' + assert node.query("SELECT * FROM test_table") == '0\n2\n4\n' + + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_input_argument_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=ExecutablePool('input_argument_pool.py 1', 'TabSeparated', {source}) SETTINGS send_chunk_header=1, pool_size=1" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + + assert node.query("SELECT * FROM test_table") == 'Key 1 1\n' + assert node.query("SELECT * FROM test_table") == 'Key 1 1\n' + assert node.query("SELECT * FROM test_table") == 'Key 1 1\n' + + node.query("DROP TABLE test_table") + + 
node.query(query.format(source='(SELECT id FROM test_data_table)')) + + assert node.query("SELECT * FROM test_table") == 'Key 1 0\nKey 1 1\nKey 1 2\n' + assert node.query("SELECT * FROM test_table") == 'Key 1 0\nKey 1 1\nKey 1 2\n' + assert node.query("SELECT * FROM test_table") == 'Key 1 0\nKey 1 1\nKey 1 2\n' + + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_input_signalled_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=ExecutablePool('input_signalled_pool.py', 'TabSeparated', {source}) SETTINGS send_chunk_header=1, pool_size=1" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_input_slow_python(started_cluster): + skip_test_msan(node) + + query = """CREATE TABLE test_table (value String) + ENGINE=ExecutablePool('input_slow_pool.py', 'TabSeparated', {source}) + SETTINGS send_chunk_header=1, pool_size=1, command_read_timeout=2500""" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_input_multiple_pipes_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=ExecutablePool('input_multiple_pipes_pool.py', 'TabSeparated', {source}) SETTINGS send_chunk_header=1, pool_size=1" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1), (SELECT 2), (SELECT 3)')) + + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 1\n' + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 1\n' + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 1\n' + + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table), (SELECT 2), (SELECT 3)')) + + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 0\nKey from 0 fd 1\nKey from 0 fd 2\n' + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 0\nKey from 0 fd 1\nKey from 0 fd 2\n' + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 0\nKey from 0 fd 1\nKey from 0 fd 2\n' + + node.query("DROP TABLE test_table") + +def 
test_executable_pool_storage_input_count_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=ExecutablePool('input_count_pool.py', 'TabSeparated', {source}) SETTINGS send_chunk_header=1, pool_size=1" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + + assert node.query("SELECT * FROM test_table") == '1\n' + assert node.query("SELECT * FROM test_table") == '1\n' + assert node.query("SELECT * FROM test_table") == '1\n' + + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT number FROM system.numbers LIMIT 250000)')) + + assert node.query("SELECT * FROM test_table") == '250000\n' + assert node.query("SELECT * FROM test_table") == '250000\n' + assert node.query("SELECT * FROM test_table") == '250000\n' + + node.query("DROP TABLE test_table") diff --git a/tests/integration/test_executable_table_function/user_scripts/input.py b/tests/integration/test_executable_table_function/user_scripts/input.py new file mode 100755 index 00000000000..835ab1f441a --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input.py @@ -0,0 +1,8 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for line in sys.stdin: + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/test_input.sh b/tests/integration/test_executable_table_function/user_scripts/input.sh similarity index 100% rename from tests/integration/test_executable_table_function/user_scripts/test_input.sh rename to tests/integration/test_executable_table_function/user_scripts/input.sh diff --git a/tests/integration/test_executable_table_function/user_scripts/input_argument.py b/tests/integration/test_executable_table_function/user_scripts/input_argument.py new file mode 100755 index 00000000000..c1b2e5966d7 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_argument.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + arg = int(sys.argv[1]) + + for line in sys.stdin: + print("Key " + str(arg) + " " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_argument_pool.py b/tests/integration/test_executable_table_function/user_scripts/input_argument_pool.py new file mode 100755 index 00000000000..378a6ef4391 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_argument_pool.py @@ -0,0 +1,17 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + arg = int(sys.argv[1]) + + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + print(str(chunk_length), end='\n') + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + str(arg) + " " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_chunk_header.py b/tests/integration/test_executable_table_function/user_scripts/input_chunk_header.py new file mode 100755 index 00000000000..5dc03e1c507 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_chunk_header.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + line, end='') + + sys.stdout.flush() diff 
--git a/tests/integration/test_executable_table_function/user_scripts/input_count_pool.py b/tests/integration/test_executable_table_function/user_scripts/input_count_pool.py new file mode 100755 index 00000000000..8b744168a82 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_count_pool.py @@ -0,0 +1,15 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + print(1, end='\n') + print(str(chunk_length), end='\n') + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_multiple_pipes.py b/tests/integration/test_executable_table_function/user_scripts/input_multiple_pipes.py new file mode 100755 index 00000000000..64590cbc16a --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_multiple_pipes.py @@ -0,0 +1,19 @@ +#!/usr/bin/python3 + +import sys +import os + +if __name__ == '__main__': + fd3 = os.fdopen(3) + fd4 = os.fdopen(4) + + for line in fd4: + print("Key from 4 fd " + line, end='') + + for line in fd3: + print("Key from 3 fd " + line, end='') + + for line in sys.stdin: + print("Key from 0 fd " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_multiple_pipes_pool.py b/tests/integration/test_executable_table_function/user_scripts/input_multiple_pipes_pool.py new file mode 100755 index 00000000000..a3a515899f9 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_multiple_pipes_pool.py @@ -0,0 +1,45 @@ +#!/usr/bin/python3 + +import sys +import os + +if __name__ == '__main__': + fd3 = os.fdopen(3) + fd4 = os.fdopen(4) + + lines = [] + + for chunk_header_fd4 in fd4: + fd4_chunk_length = int(chunk_header_fd4) + + while fd4_chunk_length != 0: + line = fd4.readline() + fd4_chunk_length -= 1 + lines.append("Key from 4 fd " + line) + + for chunk_header_fd3 in fd3: + fd3_chunk_length = int(chunk_header_fd3) + + while fd3_chunk_length != 0: + line = fd3.readline() + fd3_chunk_length -= 1 + lines.append("Key from 3 fd " + line) + + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + lines.append("Key from 0 fd " + line) + + break + break + + print(str(len(lines)), end='\n') + + for line in lines: + print(line, end='') + lines.clear() + + sys.stdout.flush() \ No newline at end of file diff --git a/tests/integration/test_executable_table_function/user_scripts/input_pool.py b/tests/integration/test_executable_table_function/user_scripts/input_pool.py new file mode 100755 index 00000000000..ec4e9af23cd --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_pool.py @@ -0,0 +1,15 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + print(str(chunk_length), end='\n') + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_signalled.py b/tests/integration/test_executable_table_function/user_scripts/input_signalled.py new file mode 100755 index 00000000000..93ce20fa8e7 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_signalled.py 
@@ -0,0 +1,12 @@ +#!/usr/bin/python3 + +import sys +import os +import signal + +if __name__ == '__main__': + for line in sys.stdin: + os.kill(os.getpid(), signal.SIGTERM) + + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_signalled_pool.py b/tests/integration/test_executable_table_function/user_scripts/input_signalled_pool.py new file mode 100755 index 00000000000..1ea0eddbd8d --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_signalled_pool.py @@ -0,0 +1,19 @@ +#!/usr/bin/python3 + +import sys +import os +import signal + +if __name__ == '__main__': + for chunk_header in sys.stdin: + os.kill(os.getpid(), signal.SIGTERM) + + chunk_length = int(chunk_header) + print(str(chunk_length), end='\n') + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_slow.py b/tests/integration/test_executable_table_function/user_scripts/input_slow.py new file mode 100755 index 00000000000..4c2abe89e33 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_slow.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys +import time + +if __name__ == '__main__': + for line in sys.stdin: + time.sleep(25) + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_slow_pool.py b/tests/integration/test_executable_table_function/user_scripts/input_slow_pool.py new file mode 100755 index 00000000000..c8df7e18c4c --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_slow_pool.py @@ -0,0 +1,18 @@ +#!/usr/bin/python3 + +import sys +import time + +if __name__ == '__main__': + for chunk_header in sys.stdin: + time.sleep(25) + + chunk_length = int(chunk_header) + print(str(chunk_length), end='\n') + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_sum.py b/tests/integration/test_executable_table_function/user_scripts/input_sum.py new file mode 100755 index 00000000000..432d7a13a2f --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_sum.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys +import re + +if __name__ == '__main__': + for line in sys.stdin: + line_split = re.split(r'\t+', line) + print(int(line_split[0]) + int(line_split[1]), end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_sum_pool.py b/tests/integration/test_executable_table_function/user_scripts/input_sum_pool.py new file mode 100755 index 00000000000..cd0de25fe87 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_sum_pool.py @@ -0,0 +1,17 @@ +#!/usr/bin/python3 + +import sys +import re + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + print(str(chunk_length), end='\n') + + while chunk_length != 0: + line = sys.stdin.readline() + line_split = re.split(r'\t+', line) + print(int(line_split[0]) + int(line_split[1]), end='\n') + chunk_length -= 1 + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/no_input.py 
b/tests/integration/test_executable_table_function/user_scripts/no_input.py new file mode 100755 index 00000000000..65b78f3d755 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/no_input.py @@ -0,0 +1,9 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + print("Key 0") + print("Key 1") + print("Key 2") + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/no_input.sh b/tests/integration/test_executable_table_function/user_scripts/no_input.sh new file mode 100755 index 00000000000..13d172a5be4 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/no_input.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +printf "Key 0\n"; +printf "Key 1\n"; +printf "Key 2\n"; diff --git a/tests/integration/test_executable_table_function/user_scripts/test_argument.sh b/tests/integration/test_executable_table_function/user_scripts/test_argument.sh deleted file mode 100755 index 89634031d2b..00000000000 --- a/tests/integration/test_executable_table_function/user_scripts/test_argument.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -echo "Key $1" diff --git a/tests/integration/test_executable_table_function/user_scripts/test_input_multiple_pipes.sh b/tests/integration/test_executable_table_function/user_scripts/test_input_multiple_pipes.sh deleted file mode 100755 index 1e53e3211dc..00000000000 --- a/tests/integration/test_executable_table_function/user_scripts/test_input_multiple_pipes.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -while read -t 250 -u 4 read_data; do printf "Key from 4 fd $read_data\n"; done -while read -t 250 -u 3 read_data; do printf "Key from 3 fd $read_data\n"; done -while read -t 250 read_data; do printf "Key from 0 fd $read_data\n"; done diff --git a/tests/integration/test_executable_table_function/user_scripts/test_no_input.sh b/tests/integration/test_executable_table_function/user_scripts/test_no_input.sh deleted file mode 100755 index 9e8b3be63d6..00000000000 --- a/tests/integration/test_executable_table_function/user_scripts/test_no_input.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -echo "1" diff --git a/tests/integration/test_executable_user_defined_function/__init__.py b/tests/integration/test_executable_user_defined_function/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_executable_user_defined_function/config/executable_user_defined_functions_config.xml b/tests/integration/test_executable_user_defined_function/config/executable_user_defined_functions_config.xml new file mode 100644 index 00000000000..3cbf717bb67 --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/config/executable_user_defined_functions_config.xml @@ -0,0 +1,2 @@ + + diff --git a/tests/integration/test_executable_user_defined_function/functions/test_function_config.xml b/tests/integration/test_executable_user_defined_function/functions/test_function_config.xml new file mode 100644 index 00000000000..d8f81a588a2 --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/functions/test_function_config.xml @@ -0,0 +1,196 @@ + + + executable + test_function_bash + String + + UInt64 + + TabSeparated + input.sh + + + + executable_pool + test_function_pool_bash + String + + UInt64 + + TabSeparated + input.sh + + + + executable + test_function_python + String + + UInt64 + + TabSeparated + input.py + + + + executable_pool + test_function_pool_python + String + + UInt64 + + TabSeparated + input.py + + + + executable + 
test_function_send_chunk_header_python + String + + UInt64 + + TabSeparated + 1 + input_chunk_header.py + + + + executable_pool + test_function_send_chunk_header_pool_python + String + + UInt64 + + TabSeparated + 1 + input_chunk_header.py + + + + executable + test_function_sum_python + String + + UInt64 + + + UInt64 + + TabSeparated + input_sum.py + + + + executable_pool + test_function_sum_pool_python + String + + UInt64 + + + UInt64 + + TabSeparated + input_sum.py + + + + executable + test_function_argument_python + String + + UInt64 + + TabSeparated + input_argument.py 1 + + + + executable_pool + test_function_argument_pool_python + String + + UInt64 + + TabSeparated + input_argument.py 1 + + + + executable + test_function_slow_python + String + + UInt64 + + TabSeparated + input_slow.py + 1 + 1000 + + + + executable_pool + test_function_slow_pool_python + String + + UInt64 + + TabSeparated + input_slow.py + 1 + 1000 + + + + executable + test_function_signalled_python + String + + UInt64 + + TabSeparated + input_signalled.py + 1 + 1000 + + + + executable_pool + test_function_signalled_pool_python + String + + UInt64 + + TabSeparated + input_signalled.py + 1 + 1000 + + + + executable + test_function_non_direct_bash + String + + UInt64 + + TabSeparated + while read read_data; do printf "Key $read_data\n"; done + 0 + + + + executable_pool + test_function_non_direct_pool_bash + String + + UInt64 + + TabSeparated + while read read_data; do printf "Key $read_data\n"; done + 0 + + + diff --git a/tests/integration/test_executable_user_defined_function/test.py b/tests/integration/test_executable_user_defined_function/test.py new file mode 100644 index 00000000000..94afdf8d8a9 --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/test.py @@ -0,0 +1,106 @@ +import os +import sys +import time + +import pytest + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance('node', stay_alive=True, main_configs=[]) + + +def skip_test_msan(instance): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer cannot work with vfork") + +def copy_file_to_container(local_path, dist_path, container_id): + os.system("docker cp {local} {cont_id}:{dist}".format(local=local_path, cont_id=container_id, dist=dist_path)) + +config = ''' + /etc/clickhouse-server/functions/test_function_config.xml +''' + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + node.replace_config("/etc/clickhouse-server/config.d/executable_user_defined_functions_config.xml", config) + + copy_file_to_container(os.path.join(SCRIPT_DIR, 'functions/.'), '/etc/clickhouse-server/functions', node.docker_id) + copy_file_to_container(os.path.join(SCRIPT_DIR, 'user_scripts/.'), '/var/lib/clickhouse/user_scripts', node.docker_id) + + node.restart_clickhouse() + + yield cluster + + finally: + cluster.shutdown() + +def test_executable_function_bash(started_cluster): + skip_test_msan(node) + assert node.query("SELECT test_function_bash(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_bash(1)") == 'Key 1\n' + + assert node.query("SELECT test_function_pool_bash(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_pool_bash(1)") == 'Key 1\n' + +def test_executable_function_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT 
test_function_python(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_python(1)") == 'Key 1\n' + + assert node.query("SELECT test_function_pool_python(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_pool_python(1)") == 'Key 1\n' + +def test_executable_function_send_chunk_header_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT test_function_send_chunk_header_python(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_send_chunk_header_python(1)") == 'Key 1\n' + + assert node.query("SELECT test_function_send_chunk_header_pool_python(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_send_chunk_header_pool_python(1)") == 'Key 1\n' + +def test_executable_function_sum_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT test_function_sum_python(toUInt64(1), toUInt64(1))") == '2\n' + assert node.query("SELECT test_function_sum_python(1, 1)") == '2\n' + + assert node.query("SELECT test_function_sum_pool_python(toUInt64(1), toUInt64(1))") == '2\n' + assert node.query("SELECT test_function_sum_pool_python(1, 1)") == '2\n' + +def test_executable_function_argument_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT test_function_argument_python(toUInt64(1))") == 'Key 1 1\n' + assert node.query("SELECT test_function_argument_python(1)") == 'Key 1 1\n' + + assert node.query("SELECT test_function_argument_pool_python(toUInt64(1))") == 'Key 1 1\n' + assert node.query("SELECT test_function_argument_pool_python(1)") == 'Key 1 1\n' + +def test_executable_function_signalled_python(started_cluster): + skip_test_msan(node) + assert node.query_and_get_error("SELECT test_function_signalled_python(toUInt64(1))") + assert node.query_and_get_error("SELECT test_function_signalled_python(1)") + + assert node.query_and_get_error("SELECT test_function_signalled_pool_python(toUInt64(1))") + assert node.query_and_get_error("SELECT test_function_signalled_pool_python(1)") + +def test_executable_function_slow_python(started_cluster): + skip_test_msan(node) + assert node.query_and_get_error("SELECT test_function_slow_python(toUInt64(1))") + assert node.query_and_get_error("SELECT test_function_slow_python(1)") + + assert node.query_and_get_error("SELECT test_function_slow_pool_python(toUInt64(1))") + assert node.query_and_get_error("SELECT test_function_slow_pool_python(1)") + +def test_executable_function_non_direct_bash(started_cluster): + skip_test_msan(node) + assert node.query("SELECT test_function_non_direct_bash(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_non_direct_bash(1)") == 'Key 1\n' + + assert node.query("SELECT test_function_non_direct_pool_bash(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_non_direct_pool_bash(1)") == 'Key 1\n' diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input.py b/tests/integration/test_executable_user_defined_function/user_scripts/input.py new file mode 100755 index 00000000000..835ab1f441a --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input.py @@ -0,0 +1,8 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for line in sys.stdin: + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input.sh b/tests/integration/test_executable_user_defined_function/user_scripts/input.sh new file mode 100755 index 
00000000000..aea51b82b1f --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +while read read_data; + do printf "Key $read_data\n"; +done diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input_argument.py b/tests/integration/test_executable_user_defined_function/user_scripts/input_argument.py new file mode 100755 index 00000000000..c1b2e5966d7 --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input_argument.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + arg = int(sys.argv[1]) + + for line in sys.stdin: + print("Key " + str(arg) + " " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input_chunk_header.py b/tests/integration/test_executable_user_defined_function/user_scripts/input_chunk_header.py new file mode 100755 index 00000000000..5dc03e1c507 --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input_chunk_header.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input_signalled.py b/tests/integration/test_executable_user_defined_function/user_scripts/input_signalled.py new file mode 100755 index 00000000000..27c8bc4840e --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input_signalled.py @@ -0,0 +1,13 @@ +#!/usr/bin/python3 + +import sys +import os +import signal +import time + +if __name__ == '__main__': + for line in sys.stdin: + os.kill(os.getpid(), signal.SIGTERM) + + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input_slow.py b/tests/integration/test_executable_user_defined_function/user_scripts/input_slow.py new file mode 100755 index 00000000000..648a9eac918 --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input_slow.py @@ -0,0 +1,12 @@ +#!/usr/bin/python3 + +import sys +import os +import signal +import time + +if __name__ == '__main__': + for line in sys.stdin: + time.sleep(5) + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input_sum.py b/tests/integration/test_executable_user_defined_function/user_scripts/input_sum.py new file mode 100755 index 00000000000..432d7a13a2f --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input_sum.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys +import re + +if __name__ == '__main__': + for line in sys.stdin: + line_split = re.split(r'\t+', line) + print(int(line_split[0]) + int(line_split[1]), end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config.xml b/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config.xml index f2a7d6e67b1..d0bd6e5ab88 100644 --- a/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config.xml +++ 
b/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config.xml @@ -7,8 +7,7 @@ UInt64 TabSeparated - while read read_data; do printf "Key_1 $read_data\n"; done - 0 + test_input_1.sh diff --git a/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config2.xml b/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config2.xml index fe02146a6b8..80ae21a086d 100644 --- a/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config2.xml +++ b/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config2.xml @@ -7,8 +7,7 @@ UInt64 TabSeparated - while read read_data; do printf "Key_2 $read_data\n"; done - 0 + test_input_2.sh diff --git a/tests/integration/test_executable_user_defined_functions_config_reload/test.py b/tests/integration/test_executable_user_defined_functions_config_reload/test.py index 3117b3e72b1..629c426a28c 100644 --- a/tests/integration/test_executable_user_defined_functions_config_reload/test.py +++ b/tests/integration/test_executable_user_defined_functions_config_reload/test.py @@ -28,6 +28,8 @@ def started_cluster(): cluster.start() copy_file_to_container(os.path.join(SCRIPT_DIR, 'functions/.'), '/etc/clickhouse-server/functions', node.docker_id) + copy_file_to_container(os.path.join(SCRIPT_DIR, 'user_scripts/.'), '/var/lib/clickhouse/user_scripts', node.docker_id) + node.restart_clickhouse() yield cluster diff --git a/tests/integration/test_executable_user_defined_functions_config_reload/user_scripts/test_input_1.sh b/tests/integration/test_executable_user_defined_functions_config_reload/user_scripts/test_input_1.sh new file mode 100755 index 00000000000..a6cffe83bba --- /dev/null +++ b/tests/integration/test_executable_user_defined_functions_config_reload/user_scripts/test_input_1.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +while read read_data; + do printf "Key_1 $read_data\n"; +done diff --git a/tests/integration/test_executable_user_defined_functions_config_reload/user_scripts/test_input_2.sh b/tests/integration/test_executable_user_defined_functions_config_reload/user_scripts/test_input_2.sh new file mode 100755 index 00000000000..a673cfd18fb --- /dev/null +++ b/tests/integration/test_executable_user_defined_functions_config_reload/user_scripts/test_input_2.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +while read read_data; + do printf "Key_2 $read_data\n"; +done diff --git a/tests/integration/test_http_handlers_config/test.py b/tests/integration/test_http_handlers_config/test.py index 818a1e54640..01872a1d0c3 100644 --- a/tests/integration/test_http_handlers_config/test.py +++ b/tests/integration/test_http_handlers_config/test.py @@ -58,9 +58,9 @@ def test_predefined_query_handler(): 'test_predefined_handler_get?max_threads=1&setting_name=max_threads', method='GET', headers={'XXX': 'xxx'}).content - assert b'max_threads\t1\nmax_alter_threads\t1\n' == cluster.instance.http_request( - 'query_param_with_url/max_threads?max_threads=1&max_alter_threads=1', - headers={'XXX': 'max_alter_threads'}).content + assert b'max_final_threads\t1\nmax_threads\t1\n' == cluster.instance.http_request( + 'query_param_with_url/max_threads?max_threads=1&max_final_threads=1', + headers={'XXX': 'max_final_threads'}).content def test_fixed_static_handler(): diff --git a/tests/integration/test_input_format_parallel_parsing_memory_tracking/configs/conf.xml 
b/tests/integration/test_input_format_parallel_parsing_memory_tracking/configs/conf.xml index 3e4c885d1f6..3adba1d402a 100644 --- a/tests/integration/test_input_format_parallel_parsing_memory_tracking/configs/conf.xml +++ b/tests/integration/test_input_format_parallel_parsing_memory_tracking/configs/conf.xml @@ -1,4 +1,23 @@ - - 3000000000 + + 4000000000 + + + + + + + + + + + + + + diff --git a/tests/integration/test_input_format_parallel_parsing_memory_tracking/test.py b/tests/integration/test_input_format_parallel_parsing_memory_tracking/test.py index bc7f32bf544..1c686c7982e 100644 --- a/tests/integration/test_input_format_parallel_parsing_memory_tracking/test.py +++ b/tests/integration/test_input_format_parallel_parsing_memory_tracking/test.py @@ -24,16 +24,13 @@ def start_cluster(): # max_memory_usage_for_user cannot be used, since the memory for user accounted -# correctly, only total is not +# correctly, only total is not (it is set via conf.xml) def test_memory_tracking_total(): - instance.query(''' - CREATE TABLE null (row String) ENGINE=Null; - ''') + instance.query('CREATE TABLE null (row String) ENGINE=Null') instance.exec_in_container(['bash', '-c', 'clickhouse local -q "SELECT arrayStringConcat(arrayMap(x->toString(cityHash64(x)), range(1000)), \' \') from numbers(10000)" > data.json']) for it in range(0, 20): # the problem can be triggered only via HTTP, # since clickhouse-client parses the data by itself. assert instance.exec_in_container(['curl', '--silent', '--show-error', '--data-binary', '@data.json', - 'http://127.1:8123/?query=INSERT%20INTO%20null%20FORMAT%20TSV']) == '', 'Failed on {} iteration'.format( - it) + 'http://127.1:8123/?query=INSERT%20INTO%20null%20FORMAT%20TSV']) == '', f'Failed on {it} iteration' diff --git a/tests/integration/test_jemalloc_percpu_arena/test.py b/tests/integration/test_jemalloc_percpu_arena/test.py index 19ce520295a..bdd0ada966f 100755 --- a/tests/integration/test_jemalloc_percpu_arena/test.py +++ b/tests/integration/test_jemalloc_percpu_arena/test.py @@ -50,7 +50,6 @@ def skip_if_jemalloc_disabled(): if output != b'ON' and output != b'1': pytest.skip(f'Compiled w/o jemalloc (USE_JEMALLOC={output})') - # Ensure that clickhouse works even when number of online CPUs # (_SC_NPROCESSORS_ONLN) is smaller then available (_SC_NPROCESSORS_CONF). 
# diff --git a/tests/integration/test_keeper_auth/test.py b/tests/integration/test_keeper_auth/test.py index 276fe3d8518..6be78f95483 100644 --- a/tests/integration/test_keeper_auth/test.py +++ b/tests/integration/test_keeper_auth/test.py @@ -36,6 +36,38 @@ def get_genuine_zk(): get_fake_zk ] ) + +def test_remove_acl(started_cluster, get_zk): + auth_connection = get_zk() + + auth_connection.add_auth('digest', 'user1:password1') + + # Consistent with zookeeper, accept generated digest + auth_connection.create("/test_remove_acl1", b"dataX", acl=[make_acl("digest", "user1:XDkd2dsEuhc9ImU3q8pa8UOdtpI=", read=True, write=False, create=False, delete=False, admin=False)]) + auth_connection.create("/test_remove_acl2", b"dataX", acl=[make_acl("digest", "user1:XDkd2dsEuhc9ImU3q8pa8UOdtpI=", read=True, write=True, create=False, delete=False, admin=False)]) + auth_connection.create("/test_remove_acl3", b"dataX", acl=[make_acl("digest", "user1:XDkd2dsEuhc9ImU3q8pa8UOdtpI=", all=True)]) + + auth_connection.delete("/test_remove_acl2") + + auth_connection.create("/test_remove_acl4", b"dataX", acl=[make_acl("digest", "user1:XDkd2dsEuhc9ImU3q8pa8UOdtpI=", read=True, write=True, create=True, delete=False, admin=False)]) + + acls, stat = auth_connection.get_acls("/test_remove_acl3") + + assert stat.aversion == 0 + assert len(acls) == 1 + for acl in acls: + assert acl.acl_list == ['ALL'] + assert acl.perms == 31 + + +@pytest.mark.parametrize( + ('get_zk'), + [ + get_genuine_zk, + get_fake_zk + ] +) + def test_digest_auth_basic(started_cluster, get_zk): auth_connection = get_zk() @@ -43,12 +75,11 @@ def test_digest_auth_basic(started_cluster, get_zk): auth_connection.create("/test_no_acl", b"") auth_connection.create("/test_all_acl", b"data", acl=[make_acl("auth", "", all=True)]) - # for some reason original zookeeper accepts this ACL, but doesn't allow to do anything with this node - # even with correct credentials. 
- auth_connection.create("/test_all_digest_acl", b"dataX", acl=[make_acl("digest", "user1:password1", all=True)]) + # Consistent with zookeeper, accept generated digest + auth_connection.create("/test_all_digest_acl", b"dataX", acl=[make_acl("digest", "user1:XDkd2dsEuhc9ImU3q8pa8UOdtpI=", all=True)]) assert auth_connection.get("/test_all_acl")[0] == b"data" - #assert auth_connection.get("/test_all_digest_acl")[0] == b"dataX" + assert auth_connection.get("/test_all_digest_acl")[0] == b"dataX" no_auth_connection = get_zk() no_auth_connection.set("/test_no_acl", b"hello") diff --git a/tests/integration/test_log_lz4_streaming/test.py b/tests/integration/test_log_lz4_streaming/test.py index 7f2f22f28c9..75b46a378c5 100644 --- a/tests/integration/test_log_lz4_streaming/test.py +++ b/tests/integration/test_log_lz4_streaming/test.py @@ -18,7 +18,7 @@ def started_cluster(): def check_log_file(): - assert node.file_exists("/var/log/clickhouse-server/clickhouse-server.log.lz4") + assert node.path_exists("/var/log/clickhouse-server/clickhouse-server.log.lz4") lz4_output = node.exec_in_container(["bash", "-c", "lz4 -t /var/log/clickhouse-server/clickhouse-server.log.lz4 2>&1"], user='root') assert lz4_output.count('Error') == 0, lz4_output diff --git a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py index 7265105c8df..1528103e1cb 100644 --- a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py +++ b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py @@ -1079,9 +1079,41 @@ def table_overrides(clickhouse_node, mysql_node, service_name): check_query(clickhouse_node, "SELECT count() FROM table_overrides.t1", "1000\n") mysql_node.query("INSERT INTO table_overrides.t1 VALUES(1001, '2021-10-01 00:00:00', 42.0)") check_query(clickhouse_node, "SELECT count() FROM table_overrides.t1", "1001\n") + + explain_with_table_func = f"EXPLAIN TABLE OVERRIDE mysql('{service_name}:3306', 'table_overrides', 't1', 'root', 'clickhouse')" + + for what in ['ORDER BY', 'PRIMARY KEY', 'SAMPLE BY', 'PARTITION BY', 'TTL']: + with pytest.raises(QueryRuntimeException) as exc: + clickhouse_node.query(f"{explain_with_table_func} {what} temperature") + assert f'{what} override refers to nullable column `temperature`' in \ + str(exc.value) + assert f"{what} uses columns: `temperature` Nullable(Float32)" in \ + clickhouse_node.query(f"{explain_with_table_func} {what} assumeNotNull(temperature)") + + for testcase in [ + ('COLUMNS (temperature Nullable(Float32) MATERIALIZED 1.0)', + 'column `temperature`: modifying default specifier is not allowed'), + ('COLUMNS (sensor_id UInt64 ALIAS 42)', + 'column `sensor_id`: modifying default specifier is not allowed') + ]: + with pytest.raises(QueryRuntimeException) as exc: + clickhouse_node.query(f"{explain_with_table_func} {testcase[0]}") + assert testcase[1] in str(exc.value) + + for testcase in [ + ('COLUMNS (temperature Nullable(Float64))', + 'Modified columns: `temperature` Nullable(Float32) -> Nullable(Float64)'), + ('COLUMNS (temp_f Nullable(Float32) ALIAS if(temperature IS NULL, NULL, (temperature * 9.0 / 5.0) + 32),\ + temp_k Nullable(Float32) ALIAS if(temperature IS NULL, NULL, temperature + 273.15))', + 'Added columns: `temp_f` Nullable(Float32), `temp_k` Nullable(Float32)') + ]: + assert testcase[1] in clickhouse_node.query( + f"{explain_with_table_func} {testcase[0]}") + clickhouse_node.query("DROP DATABASE IF EXISTS table_overrides") 
mysql_node.query("DROP DATABASE IF EXISTS table_overrides") + def materialized_database_support_all_kinds_of_mysql_datatype(clickhouse_node, mysql_node, service_name): mysql_node.query("DROP DATABASE IF EXISTS test_database_datatype") clickhouse_node.query("DROP DATABASE IF EXISTS test_database_datatype") diff --git a/tests/integration/test_merge_tree_azure_blob_storage/test.py b/tests/integration/test_merge_tree_azure_blob_storage/test.py index 80cfed7d17c..92b9d52cf86 100644 --- a/tests/integration/test_merge_tree_azure_blob_storage/test.py +++ b/tests/integration/test_merge_tree_azure_blob_storage/test.py @@ -32,6 +32,27 @@ def cluster(): finally: cluster.shutdown() +# Note: use this for selects and inserts and create table queries. +# For inserts there is no guarantee that retries will not result in duplicates. +# But it is better to retry anyway because 'Connection was closed by the server' error +# happens in fact only for inserts because reads already have build-in retries in code. +def azure_query(node, query, try_num=3): + for i in range(try_num): + try: + return node.query(query) + except Exception as ex: + retriable_errors = [ + 'DB::Exception: Azure::Core::Http::TransportException: Connection was closed by the server while trying to read a response', + ] + retry = False + for error in retriable_errors: + if error in str(ex): + retry = True + logging.info(f"Try num: {i}. Having retriable error: {ex}") + break + if not retry or i == try_num - 1: + raise Exception(ex) + continue def create_table(node, table_name, **additional_settings): settings = { @@ -53,8 +74,8 @@ def create_table(node, table_name, **additional_settings): SETTINGS {",".join((k+"="+repr(v) for k, v in settings.items()))}""" node.query(f"DROP TABLE IF EXISTS {table_name}") - node.query(create_table_statement) - assert node.query(f"SELECT COUNT(*) FROM {table_name} FORMAT Values") == "(0)" + azure_query(node, create_table_statement) + assert azure_query(node, f"SELECT COUNT(*) FROM {table_name} FORMAT Values") == "(0)" def test_create_table(cluster): @@ -68,13 +89,13 @@ def test_read_after_cache_is_wiped(cluster): values = "('2021-11-13',3,'hello'),('2021-11-14',4,'heyo')" - node.query(f"INSERT INTO {TABLE_NAME} VALUES {values}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {values}") # Wipe cache cluster.exec_in_container(cluster.get_container_id(NODE_NAME), ["rm", "-rf", "/var/lib/clickhouse/disks/blob_storage_disk/cache/"]) # After cache is populated again, only .bin files should be accessed from Blob Storage. 
- assert node.query(f"SELECT * FROM {TABLE_NAME} order by dt, id FORMAT Values") == values + assert azure_query(node, f"SELECT * FROM {TABLE_NAME} order by dt, id FORMAT Values") == values def test_simple_insert_select(cluster): @@ -82,8 +103,8 @@ def test_simple_insert_select(cluster): create_table(node, TABLE_NAME) values = "('2021-11-13',3,'hello')" - node.query(f"INSERT INTO {TABLE_NAME} VALUES {values}") - assert node.query(f"SELECT dt, id, data FROM {TABLE_NAME} FORMAT Values") == values + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {values}") + assert azure_query(node, f"SELECT dt, id, data FROM {TABLE_NAME} FORMAT Values") == values blob_container_client = cluster.blob_service_client.get_container_client(CONTAINER_NAME) assert len(list(blob_container_client.list_blobs())) >= 12 # 1 format file + 2 skip index files + 9 regular MergeTree files + leftovers from other tests @@ -93,14 +114,14 @@ def test_inserts_selects(cluster): create_table(node, TABLE_NAME) values1 = generate_values('2020-01-03', 4096) - node.query(f"INSERT INTO {TABLE_NAME} VALUES {values1}") - assert node.query(f"SELECT * FROM {TABLE_NAME} order by dt, id FORMAT Values") == values1 + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {values1}") + assert azure_query(node, f"SELECT * FROM {TABLE_NAME} order by dt, id FORMAT Values") == values1 values2 = generate_values('2020-01-04', 4096) - node.query(f"INSERT INTO {TABLE_NAME} VALUES {values2}") - assert node.query(f"SELECT * FROM {TABLE_NAME} ORDER BY dt, id FORMAT Values") == values1 + "," + values2 + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {values2}") + assert azure_query(node, f"SELECT * FROM {TABLE_NAME} ORDER BY dt, id FORMAT Values") == values1 + "," + values2 - assert node.query(f"SELECT count(*) FROM {TABLE_NAME} where id = 1 FORMAT Values") == "(2)" + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} where id = 1 FORMAT Values") == "(2)" @pytest.mark.parametrize( @@ -118,20 +139,20 @@ def test_insert_same_partition_and_merge(cluster, merge_vertical): create_table(node, TABLE_NAME, **settings) node.query(f"SYSTEM STOP MERGES {TABLE_NAME}") - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 1024)}") - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 2048)}") - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096)}") - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 1024, -1)}") - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 2048, -1)}") - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096, -1)}") - assert node.query(f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" - assert node.query(f"SELECT count(distinct(id)) FROM {TABLE_NAME} FORMAT Values") == "(8192)" + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 1024)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 2048)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 1024, -1)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 2048, -1)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096, -1)}") + assert azure_query(node, f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT 
count(distinct(id)) FROM {TABLE_NAME} FORMAT Values") == "(8192)" node.query(f"SYSTEM START MERGES {TABLE_NAME}") # Wait for merges and old parts deletion for attempt in range(0, 10): - parts_count = node.query(f"SELECT COUNT(*) FROM system.parts WHERE table = '{TABLE_NAME}' FORMAT Values") + parts_count = azure_query(node, f"SELECT COUNT(*) FROM system.parts WHERE table = '{TABLE_NAME}' FORMAT Values") if parts_count == "(1)": break @@ -140,64 +161,64 @@ def test_insert_same_partition_and_merge(cluster, merge_vertical): time.sleep(1) - assert node.query(f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" - assert node.query(f"SELECT count(distinct(id)) FROM {TABLE_NAME} FORMAT Values") == "(8192)" + assert azure_query(node, f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT count(distinct(id)) FROM {TABLE_NAME} FORMAT Values") == "(8192)" def test_alter_table_columns(cluster): node = cluster.instances[NODE_NAME] create_table(node, TABLE_NAME) - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096)}") - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096, -1)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096, -1)}") node.query(f"ALTER TABLE {TABLE_NAME} ADD COLUMN col1 UInt64 DEFAULT 1") # To ensure parts have been merged node.query(f"OPTIMIZE TABLE {TABLE_NAME}") - assert node.query(f"SELECT sum(col1) FROM {TABLE_NAME} FORMAT Values") == "(8192)" - assert node.query(f"SELECT sum(col1) FROM {TABLE_NAME} WHERE id > 0 FORMAT Values") == "(4096)" + assert azure_query(node, f"SELECT sum(col1) FROM {TABLE_NAME} FORMAT Values") == "(8192)" + assert azure_query(node, f"SELECT sum(col1) FROM {TABLE_NAME} WHERE id > 0 FORMAT Values") == "(4096)" node.query(f"ALTER TABLE {TABLE_NAME} MODIFY COLUMN col1 String", settings={"mutations_sync": 2}) - assert node.query(f"SELECT distinct(col1) FROM {TABLE_NAME} FORMAT Values") == "('1')" + assert azure_query(node, f"SELECT distinct(col1) FROM {TABLE_NAME} FORMAT Values") == "('1')" def test_attach_detach_partition(cluster): node = cluster.instances[NODE_NAME] create_table(node, TABLE_NAME) - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096)}") - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-04', 4096)}") - assert node.query(f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(8192)" + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-04', 4096)}") + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(8192)" node.query(f"ALTER TABLE {TABLE_NAME} DETACH PARTITION '2020-01-03'") - assert node.query(f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(4096)" + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(4096)" node.query(f"ALTER TABLE {TABLE_NAME} ATTACH PARTITION '2020-01-03'") - assert node.query(f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(8192)" + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(8192)" node.query(f"ALTER TABLE {TABLE_NAME} DROP PARTITION '2020-01-03'") - assert node.query(f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(4096)" + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT 
Values") == "(4096)" node.query(f"ALTER TABLE {TABLE_NAME} DETACH PARTITION '2020-01-04'") node.query(f"ALTER TABLE {TABLE_NAME} DROP DETACHED PARTITION '2020-01-04'", settings={"allow_drop_detached": 1}) - assert node.query(f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(0)" def test_move_partition_to_another_disk(cluster): node = cluster.instances[NODE_NAME] create_table(node, TABLE_NAME) - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096)}") - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-04', 4096)}") - assert node.query(f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(8192)" + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-04', 4096)}") + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(8192)" node.query(f"ALTER TABLE {TABLE_NAME} MOVE PARTITION '2020-01-04' TO DISK '{LOCAL_DISK}'") - assert node.query(f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(8192)" + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(8192)" node.query(f"ALTER TABLE {TABLE_NAME} MOVE PARTITION '2020-01-04' TO DISK '{AZURE_BLOB_STORAGE_DISK}'") - assert node.query(f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(8192)" + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(8192)" def test_table_manipulations(cluster): @@ -210,17 +231,17 @@ def test_table_manipulations(cluster): node.query_with_retry(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-04', 4096)}") node.query(f"RENAME TABLE {TABLE_NAME} TO {renamed_table}") - assert node.query(f"SELECT count(*) FROM {renamed_table} FORMAT Values") == "(8192)" + assert azure_query(node, f"SELECT count(*) FROM {renamed_table} FORMAT Values") == "(8192)" node.query(f"RENAME TABLE {renamed_table} TO {TABLE_NAME}") assert node.query(f"CHECK TABLE {TABLE_NAME} FORMAT Values") == "(1)" node.query(f"DETACH TABLE {TABLE_NAME}") node.query(f"ATTACH TABLE {TABLE_NAME}") - assert node.query(f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(8192)" + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(8192)" node.query(f"TRUNCATE TABLE {TABLE_NAME}") - assert node.query(f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(0)" @pytest.mark.long_run @@ -230,38 +251,38 @@ def test_move_replace_partition_to_another_table(cluster): table_clone_name = TABLE_NAME + "_clone" - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 256)}") - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-04', 256)}") - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-05', 256, -1)}") - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-06', 256, -1)}") - assert node.query(f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" - assert node.query(f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(1024)" + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 256)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-04', 256)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-05', 256, -1)}") + 
azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-06', 256, -1)}") + assert azure_query(node, f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(1024)" create_table(node, table_clone_name) node.query(f"ALTER TABLE {TABLE_NAME} MOVE PARTITION '2020-01-03' TO TABLE {table_clone_name}") node.query(f"ALTER TABLE {TABLE_NAME} MOVE PARTITION '2020-01-05' TO TABLE {table_clone_name}") - assert node.query(f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" - assert node.query(f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(512)" - assert node.query(f"SELECT sum(id) FROM {table_clone_name} FORMAT Values") == "(0)" - assert node.query(f"SELECT count(*) FROM {table_clone_name} FORMAT Values") == "(512)" + assert azure_query(node, f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(512)" + assert azure_query(node, f"SELECT sum(id) FROM {table_clone_name} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT count(*) FROM {table_clone_name} FORMAT Values") == "(512)" # Add new partitions to source table, but with different values and replace them from copied table. - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 256, -1)}") - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-05', 256)}") - assert node.query(f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" - assert node.query(f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(1024)" + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 256, -1)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-05', 256)}") + assert azure_query(node, f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(1024)" node.query(f"ALTER TABLE {TABLE_NAME} REPLACE PARTITION '2020-01-03' FROM {table_clone_name}") node.query(f"ALTER TABLE {TABLE_NAME} REPLACE PARTITION '2020-01-05' FROM {table_clone_name}") - assert node.query(f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" - assert node.query(f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(1024)" - assert node.query(f"SELECT sum(id) FROM {table_clone_name} FORMAT Values") == "(0)" - assert node.query(f"SELECT count(*) FROM {table_clone_name} FORMAT Values") == "(512)" + assert azure_query(node, f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(1024)" + assert azure_query(node, f"SELECT sum(id) FROM {table_clone_name} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT count(*) FROM {table_clone_name} FORMAT Values") == "(512)" node.query(f"DROP TABLE {table_clone_name} NO DELAY") - assert node.query(f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" - assert node.query(f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(1024)" + assert azure_query(node, f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT count(*) FROM {TABLE_NAME} FORMAT Values") == "(1024)" node.query(f"ALTER TABLE {TABLE_NAME} FREEZE") @@ -275,12 +296,12 @@ def test_freeze_unfreeze(cluster): backup1 = 'backup1' backup2 = 'backup2' - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096)}") + 
azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096)}") node.query(f"ALTER TABLE {TABLE_NAME} FREEZE WITH NAME '{backup1}'") - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-04', 4096)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-04', 4096)}") node.query(f"ALTER TABLE {TABLE_NAME} FREEZE WITH NAME '{backup2}'") - node.query(f"TRUNCATE TABLE {TABLE_NAME}") + azure_query(node, f"TRUNCATE TABLE {TABLE_NAME}") # Unfreeze single partition from backup1. node.query(f"ALTER TABLE {TABLE_NAME} UNFREEZE PARTITION '2020-01-03' WITH NAME '{backup1}'") @@ -292,7 +313,7 @@ def test_apply_new_settings(cluster): node = cluster.instances[NODE_NAME] create_table(node, TABLE_NAME) - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096)}") # Force multi-part upload mode. replace_config( @@ -301,7 +322,7 @@ def test_apply_new_settings(cluster): "4096") node.query("SYSTEM RELOAD CONFIG") - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-04', 4096, -1)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-04', 4096, -1)}") # NOTE: this test takes a couple of minutes when run together with other tests @@ -313,14 +334,14 @@ def test_restart_during_load(cluster): # Force multi-part upload mode. replace_config(CONFIG_PATH, "false", "") - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-04', 4096)}") - node.query(f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-05', 4096, -1)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-04', 4096)}") + azure_query(node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-05', 4096, -1)}") def read(): for ii in range(0, 5): logging.info(f"Executing {ii} query") - assert node.query(f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" + assert azure_query(node, f"SELECT sum(id) FROM {TABLE_NAME} FORMAT Values") == "(0)" logging.info(f"Query {ii} executed") time.sleep(0.2) @@ -347,5 +368,5 @@ def test_restart_during_load(cluster): def test_big_insert(cluster): node = cluster.instances[NODE_NAME] create_table(node, TABLE_NAME) - node.query(f"INSERT INTO {TABLE_NAME} select '2020-01-03', number, toString(number) from numbers(5000000)") - assert int(node.query(f"SELECT count() FROM {TABLE_NAME}")) == 5000000 + azure_query(node, f"INSERT INTO {TABLE_NAME} select '2020-01-03', number, toString(number) from numbers(5000000)") + assert int(azure_query(node, f"SELECT count() FROM {TABLE_NAME}")) == 5000000 diff --git a/tests/integration/test_parts_delete_zookeeper/test.py b/tests/integration/test_parts_delete_zookeeper/test.py index 8a4aafaa55c..62e14b68bd1 100644 --- a/tests/integration/test_parts_delete_zookeeper/test.py +++ b/tests/integration/test_parts_delete_zookeeper/test.py @@ -49,12 +49,16 @@ def test_merge_doesnt_work_without_zookeeper(start_cluster): node1.query("INSERT INTO test_table VALUES ('2018-10-01', 1), ('2018-10-02', 2), ('2018-10-03', 3)") node1.query("INSERT INTO test_table VALUES ('2018-10-01', 4), ('2018-10-02', 5), ('2018-10-03', 6)") - assert node1.query("SELECT count(*) from system.parts where table = 'test_table'") == "2\n" + assert node1.query("SELECT count(*) from system.parts where table = 'test_table' and active") == "2\n" with PartitionManager() as pm: node1.query("OPTIMIZE TABLE test_table FINAL") 
pm.drop_instance_zk_connections(node1) - time.sleep(10) # > old_parts_lifetime - assert node1.query("SELECT count(*) from system.parts where table = 'test_table'") == "3\n" + # Unfortunately we can be too fast: the old parts (and their ZK nodes) may already be removed before the partition with ZK takes effect + if node1.query("SELECT count(*) from system.parts where table = 'test_table'") == "1\n": + print("We were too fast and deleted parts before the partition with ZK") + else: + time.sleep(10) # > old_parts_lifetime + assert node1.query("SELECT count(*) from system.parts where table = 'test_table'") == "3\n" assert_eq_with_retry(node1, "SELECT count(*) from system.parts where table = 'test_table' and active = 1", "1") diff --git a/tests/integration/test_postgresql_replica_database_engine_2/test.py b/tests/integration/test_postgresql_replica_database_engine_2/test.py index c8b63d8e667..9a1e2cd9a38 100644 --- a/tests/integration/test_postgresql_replica_database_engine_2/test.py +++ b/tests/integration/test_postgresql_replica_database_engine_2/test.py @@ -178,7 +178,7 @@ def assert_number_of_columns(expected, table_name, database_name='test_database' def check_tables_are_synchronized(table_name, order_by='key', postgres_database='postgres_database', materialized_database='test_database', schema_name=''): assert_nested_table_is_created(table_name, materialized_database, schema_name) - print("Checking table is synchronized:", table_name) + print(f"Checking table is synchronized. Table name: {table_name}, table schema: {schema_name}") expected = instance.query('select * from {}.{} order by {};'.format(postgres_database, table_name, order_by)) if len(schema_name) == 0: result = instance.query('select * from {}.{} order by {};'.format(materialized_database, table_name, order_by)) @@ -356,6 +356,11 @@ def test_remove_table_from_replication(started_cluster): for i in range(NUM_TABLES): cursor.execute('drop table if exists postgresql_replica_{};'.format(i)) + # Removing a table which does not exist in PostgreSQL from replication must be ok.
+ instance.query('DETACH TABLE test_database.postgresql_replica_0'); + assert instance.contains_in_log("from publication, because table does not exist in PostgreSQL") + drop_materialized_db() + def test_predefined_connection_configuration(started_cluster): drop_materialized_db() @@ -379,6 +384,7 @@ def test_database_with_single_non_default_schema(started_cluster): NUM_TABLES=5 schema_name = 'test_schema' + materialized_db = 'test_database' clickhouse_postgres_db = 'postgres_database_with_schema' global insert_counter insert_counter = 0 @@ -430,6 +436,14 @@ def test_database_with_single_non_default_schema(started_cluster): instance.query(f"INSERT INTO {clickhouse_postgres_db}.postgresql_replica_{altered_table} SELECT number, number, number from numbers(5000, 1000)") assert_number_of_columns(3, f'postgresql_replica_{altered_table}') check_tables_are_synchronized(f"postgresql_replica_{altered_table}", postgres_database=clickhouse_postgres_db); + + print('DETACH-ATTACH') + detached_table_name = "postgresql_replica_1" + instance.query(f"DETACH TABLE {materialized_db}.{detached_table_name}") + assert not instance.contains_in_log("from publication, because table does not exist in PostgreSQL") + instance.query(f"ATTACH TABLE {materialized_db}.{detached_table_name}") + check_tables_are_synchronized(detached_table_name, postgres_database=clickhouse_postgres_db); + drop_materialized_db() @@ -440,6 +454,7 @@ def test_database_with_multiple_non_default_schemas_1(started_cluster): NUM_TABLES = 5 schema_name = 'test_schema' clickhouse_postgres_db = 'postgres_database_with_schema' + materialized_db = 'test_database' publication_tables = '' global insert_counter insert_counter = 0 @@ -494,6 +509,15 @@ def test_database_with_multiple_non_default_schemas_1(started_cluster): instance.query(f"INSERT INTO {clickhouse_postgres_db}.postgresql_replica_{altered_table} SELECT number, number, number from numbers(5000, 1000)") assert_number_of_columns(3, f'{schema_name}.postgresql_replica_{altered_table}') check_tables_are_synchronized(f"postgresql_replica_{altered_table}", schema_name=schema_name, postgres_database=clickhouse_postgres_db); + + print('DETACH-ATTACH') + detached_table_name = "postgresql_replica_1" + instance.query(f"DETACH TABLE {materialized_db}.`{schema_name}.{detached_table_name}`") + assert not instance.contains_in_log("from publication, because table does not exist in PostgreSQL") + instance.query(f"ATTACH TABLE {materialized_db}.`{schema_name}.{detached_table_name}`") + assert_show_tables("test_schema.postgresql_replica_0\ntest_schema.postgresql_replica_1\ntest_schema.postgresql_replica_2\ntest_schema.postgresql_replica_3\ntest_schema.postgresql_replica_4\n") + check_tables_are_synchronized(detached_table_name, schema_name=schema_name, postgres_database=clickhouse_postgres_db); + drop_materialized_db() @@ -504,6 +528,7 @@ def test_database_with_multiple_non_default_schemas_2(started_cluster): NUM_TABLES = 2 schemas_num = 2 schema_list = 'schema0, schema1' + materialized_db = 'test_database' global insert_counter insert_counter = 0 @@ -557,11 +582,23 @@ def test_database_with_multiple_non_default_schemas_2(started_cluster): print('ALTER') altered_schema = random.randint(0, schemas_num-1) altered_table = random.randint(0, NUM_TABLES-1) + clickhouse_postgres_db = f'clickhouse_postgres_db{altered_schema}' cursor.execute(f"ALTER TABLE schema{altered_schema}.postgresql_replica_{altered_table} ADD COLUMN value2 integer") instance.query(f"INSERT INTO 
clickhouse_postgres_db{altered_schema}.postgresql_replica_{altered_table} SELECT number, number, number from numbers(1000 * {insert_counter}, 1000)") assert_number_of_columns(3, f'schema{altered_schema}.postgresql_replica_{altered_table}') - check_tables_are_synchronized(f"postgresql_replica_{altered_table}", schema_name=schema_name, postgres_database=clickhouse_postgres_db); + check_tables_are_synchronized(f"postgresql_replica_{altered_table}", schema_name=f"schema{altered_schema}", postgres_database=clickhouse_postgres_db); + + print('DETACH-ATTACH') + detached_table_name = "postgresql_replica_1" + detached_table_schema = "schema0" + clickhouse_postgres_db = f'clickhouse_postgres_db0' + instance.query(f"DETACH TABLE {materialized_db}.`{detached_table_schema}.{detached_table_name}`") + assert not instance.contains_in_log("from publication, because table does not exist in PostgreSQL") + instance.query(f"ATTACH TABLE {materialized_db}.`{detached_table_schema}.{detached_table_name}`") + assert_show_tables("schema0.postgresql_replica_0\nschema0.postgresql_replica_1\nschema1.postgresql_replica_0\nschema1.postgresql_replica_1\n") + check_tables_are_synchronized(f"postgresql_replica_{altered_table}", schema_name=detached_table_schema, postgres_database=clickhouse_postgres_db); + drop_materialized_db() @@ -589,6 +626,53 @@ def test_table_override(started_cluster): drop_postgres_table(cursor, table_name) +def test_table_schema_changes_2(started_cluster): + drop_materialized_db() + conn = get_postgres_conn(ip=started_cluster.postgres_ip, + port=started_cluster.postgres_port, + database=True) + cursor = conn.cursor() + + table_name = "test_table" + + create_postgres_table(cursor, table_name, template=postgres_table_template_2); + instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, number, number, number from numbers(25)") + + create_materialized_db(ip=started_cluster.postgres_ip, + port=started_cluster.postgres_port, + settings=["materialized_postgresql_allow_automatic_update = 1, materialized_postgresql_tables_list='test_table'"]) + + instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, number, number, number from numbers(25, 25)") + check_tables_are_synchronized(table_name); + + cursor.execute(f"ALTER TABLE {table_name} DROP COLUMN value1") + cursor.execute(f"ALTER TABLE {table_name} DROP COLUMN value2") + cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN value1 Text") + cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN value2 Text") + cursor.execute(f"ALTER TABLE {table_name} DROP COLUMN value3") + cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN value3 Text") + cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN value4 Text") + cursor.execute(f"UPDATE {table_name} SET value3 = 'kek' WHERE key%2=0") + check_tables_are_synchronized(table_name); + instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, toString(number), toString(number), toString(number), toString(number) from numbers(50, 25)") + cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN value5 Integer") + cursor.execute(f"ALTER TABLE {table_name} DROP COLUMN value2") + instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, toString(number), toString(number), toString(number), number from numbers(75, 25)") + check_tables_are_synchronized(table_name); + instance.restart_clickhouse() + check_tables_are_synchronized(table_name); + cursor.execute(f"ALTER TABLE {table_name} DROP COLUMN value5") + cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN 
value5 Text") + instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, toString(number), toString(number), toString(number), toString(number) from numbers(100, 25)") + check_tables_are_synchronized(table_name); + cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN value6 Text") + cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN value7 Integer") + cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN value8 Integer") + cursor.execute(f"ALTER TABLE {table_name} DROP COLUMN value5") + instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, toString(number), toString(number), toString(number), toString(number), number, number from numbers(125, 25)") + check_tables_are_synchronized(table_name); + + if __name__ == '__main__': cluster.start() input("Cluster created, press any key to destroy...") diff --git a/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml b/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml index 5d10ac0d959..181144b0473 100644 --- a/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml +++ b/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml @@ -7,18 +7,21 @@ http://minio1:9001/root/data/ minio minio123 + true s3 http://minio1:9001/root/data/ minio minio123 + true s3 http://minio1:9001/root/data2/ minio minio123 + true diff --git a/tests/integration/test_s3_zero_copy_replication/test.py b/tests/integration/test_s3_zero_copy_replication/test.py index 1c3713c02a2..fb30a83877b 100644 --- a/tests/integration/test_s3_zero_copy_replication/test.py +++ b/tests/integration/test_s3_zero_copy_replication/test.py @@ -32,11 +32,30 @@ def get_large_objects_count(cluster, size=100, folder='data'): minio = cluster.minio_client counter = 0 for obj in minio.list_objects(cluster.minio_bucket, '{}/'.format(folder)): - if obj.size >= size: + if obj.size is not None and obj.size >= size: counter = counter + 1 return counter +def check_objects_exisis(cluster, object_list, folder='data'): + minio = cluster.minio_client + for obj in object_list: + if obj: + minio.stat_object(cluster.minio_bucket, '{}/{}'.format(folder, obj)) + + +def check_objects_not_exisis(cluster, object_list, folder='data'): + minio = cluster.minio_client + for obj in object_list: + if obj: + try: + minio.stat_object(cluster.minio_bucket, '{}/{}'.format(folder, obj)) + except Exception as error: + assert "NoSuchKey" in str(error) + else: + assert False, "Object {} should not be exists".format(obj) + + def wait_for_large_objects_count(cluster, expected, size=100, timeout=30): while timeout > 0: if get_large_objects_count(cluster, size=size) == expected: @@ -266,6 +285,138 @@ def test_s3_zero_copy_with_ttl_delete(cluster, large_data, iterations): node2.query("DROP TABLE IF EXISTS ttl_delete_test NO DELAY") +def wait_mutations(node, table, seconds): + time.sleep(1) + while seconds > 0: + seconds -= 1 + mutations = node.query(f"SELECT count() FROM system.mutations WHERE table='{table}' AND is_done=0") + if mutations == '0\n': + return + time.sleep(1) + mutations = node.query(f"SELECT count() FROM system.mutations WHERE table='{table}' AND is_done=0") + assert mutations == '0\n' + + +def test_s3_zero_copy_unfreeze(cluster): + node1 = cluster.instances["node1"] + node2 = cluster.instances["node2"] + + node1.query("DROP TABLE IF EXISTS unfreeze_test NO DELAY") + node2.query("DROP TABLE IF EXISTS unfreeze_test NO DELAY") + + node1.query( + """ + CREATE TABLE unfreeze_test ON CLUSTER test_cluster (d UInt64) + 
ENGINE=ReplicatedMergeTree('/clickhouse/tables/unfreeze_test', '{}') + ORDER BY d + SETTINGS storage_policy='s3' + """ + .format('{replica}') + ) + + node1.query("INSERT INTO unfreeze_test VALUES (0)") + + node1.query("ALTER TABLE unfreeze_test FREEZE WITH NAME 'freeze_backup1'") + node2.query("ALTER TABLE unfreeze_test FREEZE WITH NAME 'freeze_backup2'") + wait_mutations(node1, "unfreeze_test", 10) + wait_mutations(node2, "unfreeze_test", 10) + + objects01 = node1.get_backuped_s3_objects("s31", "freeze_backup1") + objects02 = node2.get_backuped_s3_objects("s31", "freeze_backup2") + + assert objects01 == objects02 + + check_objects_exisis(cluster, objects01) + + node1.query("TRUNCATE TABLE unfreeze_test") + + objects11 = node1.get_backuped_s3_objects("s31", "freeze_backup1") + objects12 = node2.get_backuped_s3_objects("s31", "freeze_backup2") + + assert objects01 == objects11 + assert objects01 == objects12 + + check_objects_exisis(cluster, objects11) + + node1.query("ALTER TABLE unfreeze_test UNFREEZE WITH NAME 'freeze_backup1'") + wait_mutations(node1, "unfreeze_test", 10) + + check_objects_exisis(cluster, objects12) + + node2.query("ALTER TABLE unfreeze_test UNFREEZE WITH NAME 'freeze_backup2'") + wait_mutations(node2, "unfreeze_test", 10) + + check_objects_not_exisis(cluster, objects12) + + node1.query("DROP TABLE IF EXISTS unfreeze_test NO DELAY") + node2.query("DROP TABLE IF EXISTS unfreeze_test NO DELAY") + + +def test_s3_zero_copy_drop_detached(cluster): + node1 = cluster.instances["node1"] + node2 = cluster.instances["node2"] + + node1.query("DROP TABLE IF EXISTS drop_detached_test NO DELAY") + node2.query("DROP TABLE IF EXISTS drop_detached_test NO DELAY") + + node1.query( + """ + CREATE TABLE drop_detached_test ON CLUSTER test_cluster (d UInt64) + ENGINE=ReplicatedMergeTree('/clickhouse/tables/drop_detached_test', '{}') + ORDER BY d PARTITION BY d + SETTINGS storage_policy='s3' + """ + .format('{replica}') + ) + + node1.query("INSERT INTO drop_detached_test VALUES (0)") + node1.query("ALTER TABLE drop_detached_test FREEZE WITH NAME 'detach_backup1'") + node1.query("INSERT INTO drop_detached_test VALUES (1)") + node1.query("ALTER TABLE drop_detached_test FREEZE WITH NAME 'detach_backup2'") + + objects1 = node1.get_backuped_s3_objects("s31", "detach_backup1") + objects2 = node1.get_backuped_s3_objects("s31", "detach_backup2") + + objects_diff = list(set(objects2) - set(objects1)) + + node1.query("ALTER TABLE drop_detached_test UNFREEZE WITH NAME 'detach_backup2'") + node1.query("ALTER TABLE drop_detached_test UNFREEZE WITH NAME 'detach_backup1'") + + node1.query("ALTER TABLE drop_detached_test DETACH PARTITION '0'") + node1.query("ALTER TABLE drop_detached_test DETACH PARTITION '1'") + wait_mutations(node1, "drop_detached_test", 10) + wait_mutations(node2, "drop_detached_test", 10) + + check_objects_exisis(cluster, objects1) + check_objects_exisis(cluster, objects2) + + node2.query("ALTER TABLE drop_detached_test DROP DETACHED PARTITION '1'", settings={"allow_drop_detached": 1}) + wait_mutations(node1, "drop_detached_test", 10) + wait_mutations(node2, "drop_detached_test", 10) + + check_objects_exisis(cluster, objects1) + check_objects_exisis(cluster, objects2) + + node1.query("ALTER TABLE drop_detached_test DROP DETACHED PARTITION '1'", settings={"allow_drop_detached": 1}) + wait_mutations(node1, "drop_detached_test", 10) + wait_mutations(node2, "drop_detached_test", 10) + + check_objects_exisis(cluster, objects1) + check_objects_not_exisis(cluster, objects_diff) + + 
node1.query("ALTER TABLE drop_detached_test DROP DETACHED PARTITION '0'", settings={"allow_drop_detached": 1}) + wait_mutations(node1, "drop_detached_test", 10) + wait_mutations(node2, "drop_detached_test", 10) + + check_objects_exisis(cluster, objects1) + + node2.query("ALTER TABLE drop_detached_test DROP DETACHED PARTITION '0'", settings={"allow_drop_detached": 1}) + wait_mutations(node1, "drop_detached_test", 10) + wait_mutations(node2, "drop_detached_test", 10) + + check_objects_not_exisis(cluster, objects1) + + def test_s3_zero_copy_concurrent_merge(cluster): node1 = cluster.instances["node1"] node2 = cluster.instances["node2"] diff --git a/tests/integration/test_server_reload/.gitignore b/tests/integration/test_server_reload/.gitignore new file mode 100644 index 00000000000..edf565ec632 --- /dev/null +++ b/tests/integration/test_server_reload/.gitignore @@ -0,0 +1 @@ +_gen diff --git a/tests/integration/test_server_reload/__init__.py b/tests/integration/test_server_reload/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_server_reload/configs/default_passwd.xml b/tests/integration/test_server_reload/configs/default_passwd.xml new file mode 100644 index 00000000000..5c23be0dcb0 --- /dev/null +++ b/tests/integration/test_server_reload/configs/default_passwd.xml @@ -0,0 +1,13 @@ + + + + + + + + + + 123 + + + diff --git a/tests/integration/test_server_reload/configs/dhparam.pem b/tests/integration/test_server_reload/configs/dhparam.pem new file mode 100644 index 00000000000..fb935b9c898 --- /dev/null +++ b/tests/integration/test_server_reload/configs/dhparam.pem @@ -0,0 +1,8 @@ +-----BEGIN DH PARAMETERS----- +MIIBCAKCAQEAkPGhfLY5nppeQkFBKYRpiisxzrRQfyyTUu6aabZP2CbAMAuoYzaC +Z+iqeWSQZKRYeA21SZXkC9xE1e5FJsc5IWzCRiMNZeLuj4ApUNysMu89DpX8/b91 ++Ka6wRJnaO43ZqHj/9FpU4JiYtxoIpXDC9HeiSAnwLwJc3L+nkYfnSGgvzWIxhGV +gCoVmVBoTe7wrqCyVlM5nrNZSjhlSugvXmu2bSK3MwYF08QLKvlF68eedbs0PMWh +WC0bFM/X7gMBEqL4DiINufAShbZPKxD6eL2APiHPUo6xun3ed/Po/5j8QBmiku0c +5Jb12ZhOTRTQjaRg2aFF8LPdW2tDE7HmewIBAg== +-----END DH PARAMETERS----- diff --git a/tests/integration/test_server_reload/configs/ports_from_zk.xml b/tests/integration/test_server_reload/configs/ports_from_zk.xml new file mode 100644 index 00000000000..ae3435a3d3c --- /dev/null +++ b/tests/integration/test_server_reload/configs/ports_from_zk.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/tests/integration/test_server_reload/configs/server.crt b/tests/integration/test_server_reload/configs/server.crt new file mode 100644 index 00000000000..6f4deca038f --- /dev/null +++ b/tests/integration/test_server_reload/configs/server.crt @@ -0,0 +1,18 @@ +-----BEGIN CERTIFICATE----- +MIIC+zCCAeOgAwIBAgIJAIhI9ozZJ+TWMA0GCSqGSIb3DQEBCwUAMBQxEjAQBgNV +BAMMCWxvY2FsaG9zdDAeFw0xOTA0MjIwNDMyNTJaFw0yMDA0MjEwNDMyNTJaMBQx +EjAQBgNVBAMMCWxvY2FsaG9zdDCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoC +ggEBAK+wVUEdqF2uXvN0MJBgnAHyXi6JTi4p/F6igsrCjSNjJWzHH0vQmK8ujfcF +CkifW88i+W5eHctuEtQqNHK+t9x9YiZtXrj6m/XkOXs20mYgENSmbbbHbriTPnZB +zZrq6UqMlwIHNNAa+I3NMORQxVRaI0ybXnGVO5elr70xHpk03xL0JWKHpEqYp4db +2aBQgF6y3Ww4khxjIYqpUYXWXGFnVIRU7FKVEAM1xyKqvQzXjQ5sVM/wyHknveEF +3b/X4ggN+KNl5KOc0cWDh1/XaatJAPaUUPqZcq76tynLbP64Xm3dxHcj+gtRkO67 +ef6MSg6l63m3XQP6Qb+MIkd06OsCAwEAAaNQME4wHQYDVR0OBBYEFDmODTO8QLDN +ykR3x0LIOnjNhrKhMB8GA1UdIwQYMBaAFDmODTO8QLDNykR3x0LIOnjNhrKhMAwG +A1UdEwQFMAMBAf8wDQYJKoZIhvcNAQELBQADggEBAAwaiJc7uqEpnH3aukbftDwX +m8GfEnj1HVdgg+9GGNq+9rvUYBF6gdPmjRCX9dO0cclLFx8jc2org0rTSq9WoOhX 
+E6qL4Eqrmc5SE3Y9jZM0h6GRD4oXK014FmtZ3T6ddZU3dQLj3BS2r1XrvmubTvGN +ZuTJNY8nx8Hh6H5XINmsEjUF9E5hog+PwCE03xt2adIdYL+gsbxASeNYyeUFpZv5 +zcXR3VoakBWnAaOVgCHq2qh96QAnL7ZKzFkGf/MdwV10KU3dmb+ICbQUUdf9Gc17 +aaDCIRws312F433FdXBkGs2UkB7ZZme9dfn6O1QbeTNvex2VLMqYx/CTkfFbOQA= +-----END CERTIFICATE----- diff --git a/tests/integration/test_server_reload/configs/server.key b/tests/integration/test_server_reload/configs/server.key new file mode 100644 index 00000000000..6eddb3295db --- /dev/null +++ b/tests/integration/test_server_reload/configs/server.key @@ -0,0 +1,28 @@ +-----BEGIN PRIVATE KEY----- +MIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQCvsFVBHahdrl7z +dDCQYJwB8l4uiU4uKfxeooLKwo0jYyVsxx9L0JivLo33BQpIn1vPIvluXh3LbhLU +KjRyvrfcfWImbV64+pv15Dl7NtJmIBDUpm22x264kz52Qc2a6ulKjJcCBzTQGviN +zTDkUMVUWiNMm15xlTuXpa+9MR6ZNN8S9CVih6RKmKeHW9mgUIBest1sOJIcYyGK +qVGF1lxhZ1SEVOxSlRADNcciqr0M140ObFTP8Mh5J73hBd2/1+IIDfijZeSjnNHF +g4df12mrSQD2lFD6mXKu+rcpy2z+uF5t3cR3I/oLUZDuu3n+jEoOpet5t10D+kG/ +jCJHdOjrAgMBAAECggEARF66zrxb6RkSmmt8+rKeA6PuQu3sHsr4C1vyyjUr97l9 +tvdGlpp20LWtSZQMjHZ3pARYTTsTHTeY3DgQcRcHNicVKx8k3ZepWeeW9vw+pL+V +zSt3RsoVrH6gsCSrfr4sS3aqzX9AbjwQvh48CJ3mLQ1m70kHV+xbZIh1+4pB/hyP +1wKyUE18ZkOptXvO/TtoHzLQCecpkXtWzmry1Eh2isvXA+NMrAtLibGsyM1mtm7i +5ozevzHabvvCDBEe+KgZdONgVhhhvm2eOd+/s4w3rw4ETud4fI/ZAJyWXhiIKFnA +VJbElWruSAoVBW7p2bsF5PbmVzvo8vXL+VylxYD+AQKBgQDhLoRKTVhNkn/QjKxq +sdOh+QZra0LzjVpAmkQzu7wZMSHEz9qePQciDQQrYKrmRF1vNcIRCVUTqWYheJ/1 +lKRrCGa0ab6k96zkWMqLHD5u+UeJV7r1dJIx08ME9kNJ+x/XtB8klRIji16NiQUS +qc6p8z0M2AnbJzsRfWZRH8FeYwKBgQDHu8dzdtVGI7MtxfPOE/bfajiopDg8BdTC +pdug2T8XofRHRq7Q+0vYjTAZFT/slib91Pk6VvvPdo9VBZiL4omv4dAq6mOOdX/c +U14mJe1X5GCrr8ExZ8BfNJ3t/6sV1fcxyJwAw7iBguqxA2JqdM/wFk10K8XqvzVn +CD6O9yGt2QKBgFX1BMi8N538809vs41S7l9hCQNOQZNo/O+2M5yv6ECRkbtoQKKw +1x03bMUGNJaLuELweXE5Z8GGo5bZTe5X3F+DKHlr+DtO1C+ieUaa9HY2MAmMdLCn +2/qrREGLo+oEs4YKmuzC/taUp/ZNPKOAMISNdluFyFVg51pozPrgrVbTAoGBAKkE +LBl3O67o0t0vH8sJdeVFG8EJhlS0koBMnfgVHqC++dm+5HwPyvTrNQJkyv1HaqNt +r6FArkG3ED9gRuBIyT6+lctbIPgSUip9mbQqcBfqOCvQxGksZMur2ODncz09HLtS +CUFUXjOqNzOnq4ZuZu/Bz7U4vXiSaXxQq6+LTUKxAoGAFZU/qrI06XxnrE9A1X0W +l7DSkpZaDcu11NrZ473yONih/xOZNh4SSBpX8a7F6Pmh9BdtGqphML8NFPvQKcfP +b9H2iid2tc292uyrUEb5uTMmv61zoTwtitqLzO0+tS6PT3fXobX+eyeEWKzPBljL +HFtxG5CCXpkdnWRmaJnhTzA= +-----END PRIVATE KEY----- diff --git a/tests/integration/test_server_reload/configs/ssl_conf.xml b/tests/integration/test_server_reload/configs/ssl_conf.xml new file mode 100644 index 00000000000..43b25032059 --- /dev/null +++ b/tests/integration/test_server_reload/configs/ssl_conf.xml @@ -0,0 +1,18 @@ + + + + + + + /etc/clickhouse-server/config.d/server.crt + /etc/clickhouse-server/config.d/server.key + + /etc/clickhouse-server/config.d/dhparam.pem + none + true + true + sslv2,sslv3 + true + + + diff --git a/tests/integration/test_server_reload/protos/clickhouse_grpc.proto b/tests/integration/test_server_reload/protos/clickhouse_grpc.proto new file mode 100644 index 00000000000..c6cafaf6e40 --- /dev/null +++ b/tests/integration/test_server_reload/protos/clickhouse_grpc.proto @@ -0,0 +1,174 @@ +/* This file describes gRPC protocol supported in ClickHouse. + * + * To use this protocol a client should send one or more messages of the QueryInfo type + * and then receive one or more messages of the Result type. 
+ * According to that the service provides four methods for that: + * ExecuteQuery(QueryInfo) returns (Result) + * ExecuteQueryWithStreamInput(stream QueryInfo) returns (Result) + * ExecuteQueryWithStreamOutput(QueryInfo) returns (stream Result) + * ExecuteQueryWithStreamIO(stream QueryInfo) returns (stream Result) + * It's up to the client to choose which method to use. + * For example, ExecuteQueryWithStreamInput() allows the client to add data multiple times + * while executing a query, which is suitable for inserting many rows. + */ + +syntax = "proto3"; + +package clickhouse.grpc; + +message NameAndType { + string name = 1; + string type = 2; +} + +// Describes an external table - a table which will exists only while a query is executing. +message ExternalTable { + // Name of the table. If omitted, "_data" is used. + string name = 1; + + // Columns of the table. Types are required, names can be omitted. If the names are omitted, "_1", "_2", ... is used. + repeated NameAndType columns = 2; + + // Data to insert to the external table. + // If a method with streaming input (i.e. ExecuteQueryWithStreamInput() or ExecuteQueryWithStreamIO()) is used, + // then data for insertion to the same external table can be split between multiple QueryInfos. + bytes data = 3; + + // Format of the data to insert to the external table. + string format = 4; + + // Settings for executing that insertion, applied after QueryInfo.settings. + map settings = 5; +} + +enum CompressionAlgorithm { + NO_COMPRESSION = 0; + DEFLATE = 1; + GZIP = 2; + STREAM_GZIP = 3; +} + +enum CompressionLevel { + COMPRESSION_NONE = 0; + COMPRESSION_LOW = 1; + COMPRESSION_MEDIUM = 2; + COMPRESSION_HIGH = 3; +} + +message Compression { + CompressionAlgorithm algorithm = 1; + CompressionLevel level = 2; +} + +// Information about a query which a client sends to a ClickHouse server. +// The first QueryInfo can set any of the following fields. Extra QueryInfos only add extra data. +// In extra QueryInfos only `input_data`, `external_tables`, `next_query_info` and `cancel` fields can be set. +message QueryInfo { + string query = 1; + string query_id = 2; + map settings = 3; + + // Default database. + string database = 4; + + // Input data, used both as data for INSERT query and as data for the input() function. + bytes input_data = 5; + + // Delimiter for input_data, inserted between input_data from adjacent QueryInfos. + bytes input_data_delimiter = 6; + + // Default output format. If not specified, 'TabSeparated' is used. + string output_format = 7; + + repeated ExternalTable external_tables = 8; + + string user_name = 9; + string password = 10; + string quota = 11; + + // Works exactly like sessions in the HTTP protocol. + string session_id = 12; + bool session_check = 13; + uint32 session_timeout = 14; + + // Set `cancel` to true to stop executing the query. + bool cancel = 15; + + // If true there will be at least one more QueryInfo in the input stream. + // `next_query_info` is allowed to be set only if a method with streaming input (i.e. ExecuteQueryWithStreamInput() or ExecuteQueryWithStreamIO()) is used. + bool next_query_info = 16; + + /// Controls how a ClickHouse server will compress query execution results before sending back to the client. + /// If not set the compression settings from the configuration file will be used. 
+ Compression result_compression = 17; +} + +enum LogsLevel { + LOG_NONE = 0; + LOG_FATAL = 1; + LOG_CRITICAL = 2; + LOG_ERROR = 3; + LOG_WARNING = 4; + LOG_NOTICE = 5; + LOG_INFORMATION = 6; + LOG_DEBUG = 7; + LOG_TRACE = 8; +} + +message LogEntry { + uint32 time = 1; + uint32 time_microseconds = 2; + uint64 thread_id = 3; + string query_id = 4; + LogsLevel level = 5; + string source = 6; + string text = 7; +} + +message Progress { + uint64 read_rows = 1; + uint64 read_bytes = 2; + uint64 total_rows_to_read = 3; + uint64 written_rows = 4; + uint64 written_bytes = 5; +} + +message Stats { + uint64 rows = 1; + uint64 blocks = 2; + uint64 allocated_bytes = 3; + bool applied_limit = 4; + uint64 rows_before_limit = 5; +} + +message Exception { + int32 code = 1; + string name = 2; + string display_text = 3; + string stack_trace = 4; +} + +// Result of execution of a query which is sent back by the ClickHouse server to the client. +message Result { + // Output of the query, represented in the `output_format` or in a format specified in `query`. + bytes output = 1; + bytes totals = 2; + bytes extremes = 3; + + repeated LogEntry logs = 4; + Progress progress = 5; + Stats stats = 6; + + // Set by the ClickHouse server if there was an exception thrown while executing. + Exception exception = 7; + + // Set by the ClickHouse server if executing was cancelled by the `cancel` field in QueryInfo. + bool cancelled = 8; +} + +service ClickHouse { + rpc ExecuteQuery(QueryInfo) returns (Result) {} + rpc ExecuteQueryWithStreamInput(stream QueryInfo) returns (Result) {} + rpc ExecuteQueryWithStreamOutput(QueryInfo) returns (stream Result) {} + rpc ExecuteQueryWithStreamIO(stream QueryInfo) returns (stream Result) {} +} diff --git a/tests/integration/test_server_reload/test.py b/tests/integration/test_server_reload/test.py new file mode 100644 index 00000000000..3c22b476f64 --- /dev/null +++ b/tests/integration/test_server_reload/test.py @@ -0,0 +1,284 @@ +import contextlib +import grpc +import psycopg2 +import pymysql.connections +import pymysql.err +import pytest +import sys +import time +from helpers.cluster import ClickHouseCluster, run_and_check +from helpers.client import Client, QueryRuntimeException +from kazoo.exceptions import NodeExistsError +from pathlib import Path +from requests.exceptions import ConnectionError +from urllib3.util.retry import Retry + +cluster = ClickHouseCluster(__file__) +instance = cluster.add_instance( + "instance", + main_configs=[ + "configs/ports_from_zk.xml", "configs/ssl_conf.xml", "configs/dhparam.pem", "configs/server.crt", "configs/server.key" + ], + user_configs=["configs/default_passwd.xml"], + with_zookeeper=True) + + +LOADS_QUERY = "SELECT value FROM system.events WHERE event = 'MainConfigLoads'" + + +# Use grpcio-tools to generate *pb2.py files from *.proto. 
+ +proto_dir = Path(__file__).parent / "protos" +gen_dir = Path(__file__).parent / "_gen" +gen_dir.mkdir(exist_ok=True) +run_and_check( + f"python3 -m grpc_tools.protoc -I{proto_dir!s} --python_out={gen_dir!s} --grpc_python_out={gen_dir!s} \ + {proto_dir!s}/clickhouse_grpc.proto", shell=True) + +sys.path.append(str(gen_dir)) +import clickhouse_grpc_pb2 +import clickhouse_grpc_pb2_grpc + + +@pytest.fixture(name="cluster", scope="module") +def fixture_cluster(): + try: + cluster.add_zookeeper_startup_command(configure_ports_from_zk) + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +@pytest.fixture(name="zk", scope="module") +def fixture_zk(cluster): + return cluster.get_kazoo_client("zoo1") + + +def get_client(cluster, port): + return Client(host=cluster.get_instance_ip("instance"), port=port, command=cluster.client_bin_path) + + +def get_mysql_client(cluster, port): + start_time = time.monotonic() + while True: + try: + return pymysql.connections.Connection( + host=cluster.get_instance_ip("instance"), user="default", password="", database="default", port=port) + except pymysql.err.OperationalError: + if time.monotonic() - start_time > 10: + raise + time.sleep(0.1) + + +def get_pgsql_client(cluster, port): + start_time = time.monotonic() + while True: + try: + return psycopg2.connect( + host=cluster.get_instance_ip("instance"), user="postgresql", password="123", database="default", port=port) + except psycopg2.OperationalError: + if time.monotonic() - start_time > 10: + raise + time.sleep(0.1) + + +def get_grpc_channel(cluster, port): + host_port = cluster.get_instance_ip("instance") + f":{port}" + channel = grpc.insecure_channel(host_port) + grpc.channel_ready_future(channel).result(timeout=10) + return channel + + +def grpc_query(channel, query_text): + query_info = clickhouse_grpc_pb2.QueryInfo(query=query_text) + stub = clickhouse_grpc_pb2_grpc.ClickHouseStub(channel) + result = stub.ExecuteQuery(query_info) + if result and result.HasField("exception"): + raise Exception(result.exception.display_text) + return result.output.decode() + + +def configure_ports_from_zk(zk, querier=None): + default_config = [ + ("/clickhouse/listen_hosts", b"0.0.0.0"), + ("/clickhouse/ports/tcp", b"9000"), + ("/clickhouse/ports/http", b"8123"), + ("/clickhouse/ports/mysql", b"9004"), + ("/clickhouse/ports/postgresql", b"9005"), + ("/clickhouse/ports/grpc", b"9100"), + ] + for path, value in default_config: + if querier is not None: + loads_before = querier(LOADS_QUERY) + has_changed = False + try: + zk.create(path=path, value=value, makepath=True) + has_changed = True + except NodeExistsError: + if zk.get(path) != value: + zk.set(path=path, value=value) + has_changed = True + if has_changed and querier is not None: + wait_loaded_config_changed(loads_before, querier) + + +@contextlib.contextmanager +def sync_loaded_config(querier): + # Depending on whether we test a change on tcp or http + # we monitor canges using the other, untouched, protocol + loads_before = querier(LOADS_QUERY) + yield + wait_loaded_config_changed(loads_before, querier) + + +def wait_loaded_config_changed(loads_before, querier): + loads_after = None + start_time = time.monotonic() + while time.monotonic() - start_time < 10: + try: + loads_after = querier(LOADS_QUERY) + if loads_after != loads_before: + return + except (QueryRuntimeException, ConnectionError): + pass + time.sleep(0.1) + assert loads_after is not None and loads_after != loads_before + + +@contextlib.contextmanager +def default_client(cluster, zk, 
restore_via_http=False): + client = get_client(cluster, port=9000) + try: + yield client + finally: + querier = instance.http_query if restore_via_http else client.query + configure_ports_from_zk(zk, querier) + + +def test_change_tcp_port(cluster, zk): + with default_client(cluster, zk, restore_via_http=True) as client: + assert client.query("SELECT 1") == "1\n" + with sync_loaded_config(instance.http_query): + zk.set("/clickhouse/ports/tcp", b"9090") + with pytest.raises(QueryRuntimeException, match="Connection refused"): + client.query("SELECT 1") + client_on_new_port = get_client(cluster, port=9090) + assert client_on_new_port.query("SELECT 1") == "1\n" + + +def test_change_http_port(cluster, zk): + with default_client(cluster, zk) as client: + retry_strategy = Retry(total=10, backoff_factor=0.1) + assert instance.http_query("SELECT 1", retry_strategy=retry_strategy) == "1\n" + with sync_loaded_config(client.query): + zk.set("/clickhouse/ports/http", b"9090") + with pytest.raises(ConnectionError, match="Connection refused"): + instance.http_query("SELECT 1") + assert instance.http_query("SELECT 1", port=9090) == "1\n" + + +def test_change_mysql_port(cluster, zk): + with default_client(cluster, zk) as client: + mysql_client = get_mysql_client(cluster, port=9004) + assert mysql_client.query("SELECT 1") == 1 + with sync_loaded_config(client.query): + zk.set("/clickhouse/ports/mysql", b"9090") + with pytest.raises(pymysql.err.OperationalError, match="Lost connection"): + mysql_client.query("SELECT 1") + mysql_client_on_new_port = get_mysql_client(cluster, port=9090) + assert mysql_client_on_new_port.query("SELECT 1") == 1 + + +def test_change_postgresql_port(cluster, zk): + with default_client(cluster, zk) as client: + pgsql_client = get_pgsql_client(cluster, port=9005) + cursor = pgsql_client.cursor() + cursor.execute("SELECT 1") + assert cursor.fetchall() == [(1,)] + with sync_loaded_config(client.query): + zk.set("/clickhouse/ports/postgresql", b"9090") + with pytest.raises(psycopg2.OperationalError, match="closed"): + cursor.execute("SELECT 1") + pgsql_client_on_new_port = get_pgsql_client(cluster, port=9090) + cursor = pgsql_client_on_new_port.cursor() + cursor.execute("SELECT 1") + assert cursor.fetchall() == [(1,)] + + +def test_change_grpc_port(cluster, zk): + with default_client(cluster, zk) as client: + grpc_channel = get_grpc_channel(cluster, port=9100) + assert grpc_query(grpc_channel, "SELECT 1") == "1\n" + with sync_loaded_config(client.query): + zk.set("/clickhouse/ports/grpc", b"9090") + with pytest.raises(grpc._channel._InactiveRpcError, match="StatusCode.UNAVAILABLE"): + grpc_query(grpc_channel, "SELECT 1") + grpc_channel_on_new_port = get_grpc_channel(cluster, port=9090) + assert grpc_query(grpc_channel_on_new_port, "SELECT 1") == "1\n" + + +def test_remove_tcp_port(cluster, zk): + with default_client(cluster, zk, restore_via_http=True) as client: + assert client.query("SELECT 1") == "1\n" + with sync_loaded_config(instance.http_query): + zk.delete("/clickhouse/ports/tcp") + with pytest.raises(QueryRuntimeException, match="Connection refused"): + client.query("SELECT 1") + + +def test_remove_http_port(cluster, zk): + with default_client(cluster, zk) as client: + assert instance.http_query("SELECT 1") == "1\n" + with sync_loaded_config(client.query): + zk.delete("/clickhouse/ports/http") + with pytest.raises(ConnectionError, match="Connection refused"): + instance.http_query("SELECT 1") + + +def test_remove_mysql_port(cluster, zk): + with default_client(cluster, zk) as client: +
mysql_client = get_mysql_client(cluster, port=9004) + assert mysql_client.query("SELECT 1") == 1 + with sync_loaded_config(client.query): + zk.delete("/clickhouse/ports/mysql") + with pytest.raises(pymysql.err.OperationalError, match="Lost connection"): + mysql_client.query("SELECT 1") + + +def test_remove_postgresql_port(cluster, zk): + with default_client(cluster, zk) as client: + pgsql_client = get_pgsql_client(cluster, port=9005) + cursor = pgsql_client.cursor() + cursor.execute("SELECT 1") + assert cursor.fetchall() == [(1,)] + with sync_loaded_config(client.query): + zk.delete("/clickhouse/ports/postgresql") + with pytest.raises(psycopg2.OperationalError, match="closed"): + cursor.execute("SELECT 1") + + +def test_remove_grpc_port(cluster, zk): + with default_client(cluster, zk) as client: + grpc_channel = get_grpc_channel(cluster, port=9100) + assert grpc_query(grpc_channel, "SELECT 1") == "1\n" + with sync_loaded_config(client.query): + zk.delete("/clickhouse/ports/grpc") + with pytest.raises(grpc._channel._InactiveRpcError, match="StatusCode.UNAVAILABLE"): + grpc_query(grpc_channel, "SELECT 1") + + +def test_change_listen_host(cluster, zk): + localhost_client = Client(host="127.0.0.1", port=9000, command="/usr/bin/clickhouse") + localhost_client.command = ["docker", "exec", "-i", instance.docker_id] + localhost_client.command + try: + client = get_client(cluster, port=9000) + with sync_loaded_config(localhost_client.query): + zk.set("/clickhouse/listen_hosts", b"127.0.0.1") + with pytest.raises(QueryRuntimeException, match="Connection refused"): + client.query("SELECT 1") + assert localhost_client.query("SELECT 1") == "1\n" + finally: + with sync_loaded_config(localhost_client.query): + configure_ports_from_zk(zk) + diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 33ce94a7a29..f317fb5429a 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -310,6 +310,7 @@ def test_seekable_formats(started_cluster): result = node1.query(f"SELECT count() FROM {table_function}") assert(int(result) == 5000000) + def test_read_table_with_default(started_cluster): hdfs_api = started_cluster.hdfs_api @@ -322,6 +323,22 @@ def test_read_table_with_default(started_cluster): "select * from hdfs('hdfs://hdfs1:9000/simple_table_function', 'TSVWithNames', 'n UInt32, m UInt32 DEFAULT n * 2') FORMAT TSVWithNames") == output +def test_schema_inference(started_cluster): + node1.query(f"insert into table function hdfs('hdfs://hdfs1:9000/native', 'Native', 'a Int32, b String') SELECT number, randomString(100) FROM numbers(5000000)") + + result = node1.query(f"desc hdfs('hdfs://hdfs1:9000/native', 'Native')") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = node1.query(f"select count(*) from hdfs('hdfs://hdfs1:9000/native', 'Native')") + assert(int(result) == 5000000) + + node1.query(f"create table schema_inference engine=HDFS('hdfs://hdfs1:9000/native', 'Native')") + result = node1.query(f"desc schema_inference") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = node1.query(f"select count(*) from schema_inference") + assert(int(result) == 5000000) + def test_hdfsCluster(started_cluster): hdfs_api = started_cluster.hdfs_api diff --git a/tests/integration/test_storage_kafka/test.py b/tests/integration/test_storage_kafka/test.py index 1ee7f3cf125..a92dafa0b8a 100644 --- a/tests/integration/test_storage_kafka/test.py +++ 
b/tests/integration/test_storage_kafka/test.py @@ -445,15 +445,21 @@ def test_kafka_formats(kafka_cluster): # /src/Processors/Formats/IRowInputFormat.cpp:0: DB::IRowInputFormat::generate() @ 0x1de72710 in /usr/bin/clickhouse ], }, - # 'Template' : { - # 'data_sample' : [ - # '(id = 0, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)', - # # '(id = 1, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 2, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 3, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 4, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 5, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 6, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 7, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 8, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 9, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 10, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 11, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 12, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 13, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 14, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 15, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)', - # # '(id = 0, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)', - # # '' # tolerates - # ], - # 'extra_settings': ", format_template_row='template_row.format'" - # }, + 'CustomSeparated' : { + 'data_sample' : [ + '0\t0\tAM\t0.5\t1\n', + '1\t0\tAM\t0.5\t1\n2\t0\tAM\t0.5\t1\n3\t0\tAM\t0.5\t1\n4\t0\tAM\t0.5\t1\n5\t0\tAM\t0.5\t1\n6\t0\tAM\t0.5\t1\n7\t0\tAM\t0.5\t1\n8\t0\tAM\t0.5\t1\n9\t0\tAM\t0.5\t1\n10\t0\tAM\t0.5\t1\n11\t0\tAM\t0.5\t1\n12\t0\tAM\t0.5\t1\n13\t0\tAM\t0.5\t1\n14\t0\tAM\t0.5\t1\n15\t0\tAM\t0.5\t1\n', + '0\t0\tAM\t0.5\t1\n', + ], + }, + 'Template' : { + 'data_sample' : [ + '(id = 0, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)', + '(id = 1, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 2, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 3, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 4, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 5, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 6, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 7, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 8, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 9, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 10, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 11, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 12, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 13, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 14, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 15, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)', + '(id = 0, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)', + ], + 'extra_settings': ", format_template_row='template_row.format'" + }, 'Regexp': { 'data_sample': [ '(id = 0, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)', @@ -1498,6 +1504,13 @@ def test_kafka_flush_on_big_message(kafka_cluster): def test_kafka_virtual_columns(kafka_cluster): + admin_client = KafkaAdminClient(bootstrap_servers="localhost:{}".format(kafka_cluster.kafka_port)) + topic_config = { + # default retention, since predefined timestamp_ms is used. 
+ 'retention.ms': '-1', + } + kafka_create_topic(admin_client, "virt1", config=topic_config) + instance.query(''' CREATE TABLE test.kafka (key UInt64, value UInt64) ENGINE = Kafka @@ -1530,6 +1543,13 @@ def test_kafka_virtual_columns(kafka_cluster): def test_kafka_virtual_columns_with_materialized_view(kafka_cluster): + admin_client = KafkaAdminClient(bootstrap_servers="localhost:{}".format(kafka_cluster.kafka_port)) + topic_config = { + # default retention, since predefined timestamp_ms is used. + 'retention.ms': '-1', + } + kafka_create_topic(admin_client, "virt2", config=topic_config) + instance.query(''' DROP TABLE IF EXISTS test.view; DROP TABLE IF EXISTS test.consumer; @@ -1738,8 +1758,12 @@ def test_kafka_commit_on_block_write(kafka_cluster): def test_kafka_virtual_columns2(kafka_cluster): admin_client = KafkaAdminClient(bootstrap_servers="localhost:{}".format(kafka_cluster.kafka_port)) - kafka_create_topic(admin_client, "virt2_0", num_partitions=2) - kafka_create_topic(admin_client, "virt2_1", num_partitions=2) + topic_config = { + # default retention, since predefined timestamp_ms is used. + 'retention.ms': '-1', + } + kafka_create_topic(admin_client, "virt2_0", num_partitions=2, config=topic_config) + kafka_create_topic(admin_client, "virt2_1", num_partitions=2, config=topic_config) instance.query(''' CREATE TABLE test.kafka (value UInt64) @@ -1867,6 +1891,13 @@ def test_kafka_produce_key_timestamp(kafka_cluster): def test_kafka_insert_avro(kafka_cluster): + admin_client = KafkaAdminClient(bootstrap_servers="localhost:{}".format(kafka_cluster.kafka_port)) + topic_config = { + # default retention, since predefined timestamp_ms is used. + 'retention.ms': '-1', + } + kafka_create_topic(admin_client, "avro1", config=topic_config) + instance.query(''' DROP TABLE IF EXISTS test.kafka; CREATE TABLE test.kafka (key UInt64, value UInt64, _timestamp DateTime('UTC')) diff --git a/tests/integration/test_storage_postgresql/test.py b/tests/integration/test_storage_postgresql/test.py index 6f43036e64d..b6ac121cd0c 100644 --- a/tests/integration/test_storage_postgresql/test.py +++ b/tests/integration/test_storage_postgresql/test.py @@ -424,6 +424,21 @@ def test_predefined_connection_configuration(started_cluster): cursor.execute(f'DROP TABLE test_table ') +def test_where_false(started_cluster): + cursor = started_cluster.postgres_conn.cursor() + cursor.execute("DROP TABLE IF EXISTS test") + cursor.execute('CREATE TABLE test (a Integer)') + cursor.execute("INSERT INTO test SELECT 1") + + result = node1.query("SELECT count() FROM postgresql('postgres1:5432', 'postgres', 'test', 'postgres', 'mysecretpassword') WHERE 1=0") + assert(int(result) == 0) + result = node1.query("SELECT count() FROM postgresql('postgres1:5432', 'postgres', 'test', 'postgres', 'mysecretpassword') WHERE 0") + assert(int(result) == 0) + result = node1.query("SELECT count() FROM postgresql('postgres1:5432', 'postgres', 'test', 'postgres', 'mysecretpassword') WHERE 1=1") + assert(int(result) == 1) + cursor.execute("DROP TABLE test") + + if __name__ == '__main__': cluster.start() input("Cluster created, press any key to destroy...") diff --git a/tests/integration/test_storage_rabbitmq/test.py b/tests/integration/test_storage_rabbitmq/test.py index 2c2a9e41509..a3d99159cb2 100644 --- a/tests/integration/test_storage_rabbitmq/test.py +++ b/tests/integration/test_storage_rabbitmq/test.py @@ -35,6 +35,17 @@ def rabbitmq_check_result(result, check=False, ref_file='test_rabbitmq_json.refe else: return TSV(result) == TSV(reference) 
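The Kafka test changes above create their topics with retention.ms set to -1 (keep messages indefinitely), so that records carrying predefined, old timestamps are not expired before the test reads them. The kafka_create_topic() helper comes from the shared test harness and is not part of this diff; a minimal sketch of such a helper, assuming the kafka-python admin API, could look like this:

    from kafka.admin import KafkaAdminClient, NewTopic

    def kafka_create_topic(admin_client, name, num_partitions=1, config=None):
        # Sketch only: create a topic and apply per-topic settings such as retention.ms.
        topic = NewTopic(name=name,
                         num_partitions=num_partitions,
                         replication_factor=1,
                         topic_configs=config or {})
        admin_client.create_topics(new_topics=[topic])

    # Example use (broker address assumed):
    # admin = KafkaAdminClient(bootstrap_servers="localhost:9092")
    # kafka_create_topic(admin, "virt1", config={"retention.ms": "-1"})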
+def wait_rabbitmq_to_start(rabbitmq_docker_id, timeout=180): + start = time.time() + while time.time() - start < timeout: + try: + if instance.cluster.check_rabbitmq_is_available(rabbitmq_docker_id): + logging.debug("RabbitMQ is available") + return + time.sleep(0.5) + except Exception as ex: + logging.debug("Can't connect to RabbitMQ " + str(ex)) + time.sleep(0.5) def kill_rabbitmq(rabbitmq_id): p = subprocess.Popen(('docker', 'stop', rabbitmq_id), stdout=subprocess.PIPE) @@ -45,7 +56,7 @@ def kill_rabbitmq(rabbitmq_id): def revive_rabbitmq(rabbitmq_id): p = subprocess.Popen(('docker', 'start', rabbitmq_id), stdout=subprocess.PIPE) p.communicate() - return p.returncode == 0 + wait_rabbitmq_to_start(rabbitmq_id) # Fixtures diff --git a/tests/integration/test_storage_s3/configs/named_collections.xml b/tests/integration/test_storage_s3/configs/named_collections.xml index dfcbeeb2d4a..efadedc1bde 100644 --- a/tests/integration/test_storage_s3/configs/named_collections.xml +++ b/tests/integration/test_storage_s3/configs/named_collections.xml @@ -15,5 +15,10 @@ minio minio123 + + http://minio1:9001/root/test_native + minio + minio123 + diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index f3c4b1dd0cf..885a37f875c 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -126,7 +126,7 @@ def run_query(instance, query, stdin=None, settings=None): pytest.param("'wrongid','wrongkey',", False, 'xz', id="xz"), pytest.param("'wrongid','wrongkey',", False, 'zstd', id="zstd") ]) -def test_put(started_cluster, maybe_auth, positive, compression): +def _test_put(started_cluster, maybe_auth, positive, compression): # type: (ClickHouseCluster) -> None bucket = started_cluster.minio_bucket if not maybe_auth else started_cluster.minio_restricted_bucket @@ -148,7 +148,7 @@ def test_put(started_cluster, maybe_auth, positive, compression): assert values_csv == get_s3_file_content(started_cluster, bucket, filename) -def test_partition_by(started_cluster): +def _test_partition_by(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" @@ -173,7 +173,7 @@ def test_partition_by(started_cluster): assert "78,43,45\n" == get_s3_file_content(started_cluster, bucket, "test2_45.csv") -def test_partition_by_string_column(started_cluster): +def _test_partition_by_string_column(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance table_format = "col_num UInt32, col_str String" @@ -191,7 +191,7 @@ def test_partition_by_string_column(started_cluster): assert '78,"你好"\n' == get_s3_file_content(started_cluster, bucket, "test_你好.csv") -def test_partition_by_const_column(started_cluster): +def _test_partition_by_const_column(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" @@ -212,7 +212,7 @@ def test_partition_by_const_column(started_cluster): "space", "plus" ]) -def test_get_file_with_special(started_cluster, special): +def _test_get_file_with_special(started_cluster, special): symbol = {"space": " ", "plus": "+"}[special] urlsafe_symbol = {"space": "%20", "plus": "%2B"}[special] auth = "'minio','minio123'," @@ -239,7 +239,7 @@ def test_get_file_with_special(started_cluster, 
special): "plus", "plus2" ]) -def test_get_path_with_special(started_cluster, special): +def _test_get_path_with_special(started_cluster, special): symbol = {"space": "%20", "plus": "%2B", "plus2": "%2B"}[special] safe_symbol = {"space": "%20", "plus": "+", "plus2": "%2B"}[special] auth = "'minio','minio123'," @@ -253,7 +253,7 @@ def test_get_path_with_special(started_cluster, special): @pytest.mark.parametrize("auth", [ pytest.param("'minio','minio123',", id="minio") ]) -def test_empty_put(started_cluster, auth): +def _test_empty_put(started_cluster, auth): # type: (ClickHouseCluster, str) -> None bucket = started_cluster.minio_bucket @@ -291,7 +291,7 @@ def test_empty_put(started_cluster, auth): pytest.param("'minio','minio123',", True, id="auth_positive"), pytest.param("'wrongid','wrongkey',", False, id="negative"), ]) -def test_put_csv(started_cluster, maybe_auth, positive): +def _test_put_csv(started_cluster, maybe_auth, positive): # type: (ClickHouseCluster, bool, str) -> None bucket = started_cluster.minio_bucket if not maybe_auth else started_cluster.minio_restricted_bucket @@ -313,7 +313,7 @@ def test_put_csv(started_cluster, maybe_auth, positive): # Test put and get with S3 server redirect. -def test_put_get_with_redirect(started_cluster): +def _test_put_get_with_redirect(started_cluster): # type: (ClickHouseCluster) -> None bucket = started_cluster.minio_bucket @@ -340,7 +340,7 @@ def test_put_get_with_redirect(started_cluster): # Test put with restricted S3 server redirect. -def test_put_with_zero_redirect(started_cluster): +def _test_put_with_zero_redirect(started_cluster): # type: (ClickHouseCluster) -> None bucket = started_cluster.minio_bucket @@ -367,7 +367,7 @@ def test_put_with_zero_redirect(started_cluster): assert exception_raised -def test_put_get_with_globs(started_cluster): +def _test_put_get_with_globs(started_cluster): # type: (ClickHouseCluster) -> None unique_prefix = random.randint(1,10000) bucket = started_cluster.minio_bucket @@ -399,7 +399,7 @@ def test_put_get_with_globs(started_cluster): pytest.param("'wrongid','wrongkey'", False, id="negative"), # ("'minio','minio123',",True), Redirect with credentials not working with nginx. 
]) -def test_multipart_put(started_cluster, maybe_auth, positive): +def _test_multipart_put(started_cluster, maybe_auth, positive): # type: (ClickHouseCluster) -> None bucket = started_cluster.minio_bucket if not maybe_auth else started_cluster.minio_restricted_bucket @@ -439,7 +439,7 @@ def test_multipart_put(started_cluster, maybe_auth, positive): assert csv_data == get_s3_file_content(started_cluster, bucket, filename) -def test_remote_host_filter(started_cluster): +def _test_remote_host_filter(started_cluster): instance = started_cluster.instances["restricted_dummy"] format = "column1 UInt32, column2 UInt32, column3 UInt32" @@ -457,7 +457,7 @@ def test_remote_host_filter(started_cluster): pytest.param("''", id="1_argument"), pytest.param("'','','','','',''", id="6_arguments"), ]) -def test_wrong_s3_syntax(started_cluster, s3_storage_args): +def _test_wrong_s3_syntax(started_cluster, s3_storage_args): instance = started_cluster.instances["dummy"] # type: ClickHouseInstance expected_err_msg = "Code: 42" # NUMBER_OF_ARGUMENTS_DOESNT_MATCH @@ -466,7 +466,7 @@ def test_wrong_s3_syntax(started_cluster, s3_storage_args): # https://en.wikipedia.org/wiki/One_Thousand_and_One_Nights -def test_s3_glob_scheherazade(started_cluster): +def _test_s3_glob_scheherazade(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" @@ -535,7 +535,7 @@ def replace_config(old, new): config.close() -def test_custom_auth_headers(started_cluster): +def _test_custom_auth_headers(started_cluster): table_format = "column1 UInt32, column2 UInt32, column3 UInt32" filename = "test.csv" get_query = "select * from s3('http://resolver:8080/{bucket}/{file}', 'CSV', '{table_format}')".format( @@ -566,7 +566,7 @@ def test_custom_auth_headers(started_cluster): instance.query("DROP TABLE test") -def test_custom_auth_headers_exclusion(started_cluster): +def _test_custom_auth_headers_exclusion(started_cluster): table_format = "column1 UInt32, column2 UInt32, column3 UInt32" filename = "test.csv" get_query = f"SELECT * FROM s3('http://resolver:8080/{started_cluster.minio_restricted_bucket}/restricteddirectory/{filename}', 'CSV', '{table_format}')" @@ -580,7 +580,7 @@ def test_custom_auth_headers_exclusion(started_cluster): assert 'Forbidden Error' in ei.value.stderr -def test_infinite_redirect(started_cluster): +def _test_infinite_redirect(started_cluster): bucket = "redirected" table_format = "column1 UInt32, column2 UInt32, column3 UInt32" filename = "test.csv" @@ -598,7 +598,7 @@ def test_infinite_redirect(started_cluster): pytest.param("bin", "gzip", id="bin"), pytest.param("gz", "auto", id="gz"), ]) -def test_storage_s3_get_gzip(started_cluster, extension, method): +def _test_storage_s3_get_gzip(started_cluster, extension, method): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] filename = f"test_get_gzip.{extension}" @@ -638,7 +638,7 @@ def test_storage_s3_get_gzip(started_cluster, extension, method): run_query(instance, f"DROP TABLE {name}") -def test_storage_s3_get_unstable(started_cluster): +def _test_storage_s3_get_unstable(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] table_format = "column1 Int64, column2 Int64, column3 Int64, column4 Int64" @@ -647,7 +647,7 @@ def test_storage_s3_get_unstable(started_cluster): assert result.splitlines() == ["500001,500000,0"] -def 
test_storage_s3_put_uncompressed(started_cluster): +def _test_storage_s3_put_uncompressed(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] filename = "test_put_uncompressed.bin" @@ -684,7 +684,7 @@ def test_storage_s3_put_uncompressed(started_cluster): pytest.param("bin", "gzip", id="bin"), pytest.param("gz", "auto", id="gz") ]) -def test_storage_s3_put_gzip(started_cluster, extension, method): +def _test_storage_s3_put_gzip(started_cluster, extension, method): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] filename = f"test_put_gzip.{extension}" @@ -721,7 +721,7 @@ def test_storage_s3_put_gzip(started_cluster, extension, method): assert sum([ int(i.split(',')[1]) for i in uncompressed_content.splitlines() ]) == 708 -def test_truncate_table(started_cluster): +def _test_truncate_table(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance name = "truncate" @@ -745,7 +745,7 @@ def test_truncate_table(started_cluster): assert instance.query("SELECT * FROM {}".format(name)) == "" -def test_predefined_connection_configuration(started_cluster): +def _test_predefined_connection_configuration(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance name = "test_table" @@ -762,7 +762,7 @@ def test_predefined_connection_configuration(started_cluster): result = "" -def test_url_reconnect_in_the_middle(started_cluster): +def _test_url_reconnect_in_the_middle(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] table_format = "id String, data String" @@ -783,7 +783,7 @@ def test_url_reconnect_in_the_middle(started_cluster): f"""select sum(cityHash64(x)) from (select toUInt64(id) + sleep(0.1) as x from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'TSV', '{table_format}') settings http_max_tries = 10, http_retry_max_backoff_ms=2000, http_send_timeout=1, http_receive_timeout=1)""") - assert(int(result), 3914219105369203805) + assert(int(result) == 3914219105369203805) thread = threading.Thread(target=select) thread.start() @@ -796,10 +796,10 @@ def test_url_reconnect_in_the_middle(started_cluster): thread.join() - assert(int(result), 3914219105369203805) + assert(int(result) == 3914219105369203805) -def test_seekable_formats(started_cluster): +def _test_seekable_formats(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance @@ -821,7 +821,7 @@ def test_seekable_formats(started_cluster): assert(int(result[:3]) < 200) -def test_seekable_formats_url(started_cluster): +def _test_seekable_formats_url(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] @@ -865,3 +865,53 @@ def test_insert_with_path_with_globs(started_cluster): table_function_3 = f"s3('http://minio1:9001/root/test_parquet*', 'minio', 'minio123', 'Parquet', 'a Int32, b String')" instance.query_and_get_error(f"insert into table function {table_function_3} SELECT number, randomString(100) FROM numbers(500)") + + +def test_s3_schema_inference(started_cluster): + bucket = started_cluster.minio_bucket + instance = started_cluster.instances["dummy"] + + instance.query(f"insert into table function s3(s3_native, structure='a Int32, b String', format='Native') select number, randomString(100) from 
numbers(5000000)") + result = instance.query(f"desc s3(s3_native, format='Native')") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = instance.query(f"select count(*) from s3(s3_native, format='Native')") + assert(int(result) == 5000000) + + instance.query(f"create table schema_inference engine=S3(s3_native, format='Native')") + result = instance.query(f"desc schema_inference") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = instance.query(f"select count(*) from schema_inference") + assert(int(result) == 5000000) + + + table_function = f"url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_native', 'Native')" + result = instance.query(f"desc {table_function}") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = instance.query(f"select count(*) from {table_function}") + assert(int(result) == 5000000) + + instance.query(f"create table schema_inference_2 engine=URL('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_native', 'Native')") + result = instance.query(f"desc schema_inference_2") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = instance.query(f"select count(*) from schema_inference_2") + assert(int(result) == 5000000) + + +def test_empty_file(started_cluster): + bucket = started_cluster.minio_bucket + instance = started_cluster.instances["dummy"] + + name = "empty" + url = f'http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{name}' + + minio = started_cluster.minio_client + minio.put_object(bucket, name, io.BytesIO(b""), 0) + + table_function = f"s3('{url}', 'CSV', 'id Int32')" + result = instance.query(f"SELECT count() FROM {table_function}") + assert(int(result) == 0) + diff --git a/tests/integration/test_system_metrics/test.py b/tests/integration/test_system_metrics/test.py index 9e8eac162f6..efcc6f88a24 100644 --- a/tests/integration/test_system_metrics/test.py +++ b/tests/integration/test_system_metrics/test.py @@ -59,3 +59,32 @@ def test_readonly_metrics(start_cluster): node1.query("ATTACH TABLE test.test_table") assert_eq_with_retry(node1, "SELECT value FROM system.metrics WHERE metric = 'ReadonlyReplica'", "0\n", retry_count=300, sleep_time=1) +#For LowCardinality-columns, the bytes for N rows is not N*size of 1 row. 
+def test_metrics_storage_buffer_size(start_cluster): + node1.query(''' + CREATE TABLE test.test_mem_table + ( + `str` LowCardinality(String) + ) + ENGINE = Memory; + + CREATE TABLE test.buffer_table + ( + `str` LowCardinality(String) + ) + ENGINE = Buffer('test', 'test_mem_table', 1, 600, 600, 1000, 100000, 100000, 10000000); + ''') + + #before flush + node1.query("INSERT INTO test.buffer_table VALUES('hello');") + assert node1.query("SELECT value FROM system.metrics WHERE metric = 'StorageBufferRows'") == "1\n" + assert node1.query("SELECT value FROM system.metrics WHERE metric = 'StorageBufferBytes'") == "24\n" + + node1.query("INSERT INTO test.buffer_table VALUES('hello');") + assert node1.query("SELECT value FROM system.metrics WHERE metric = 'StorageBufferRows'") == "2\n" + assert node1.query("SELECT value FROM system.metrics WHERE metric = 'StorageBufferBytes'") == "25\n" + + #flush + node1.query("OPTIMIZE TABLE test.buffer_table") + assert node1.query("SELECT value FROM system.metrics WHERE metric = 'StorageBufferRows'") == "0\n" + assert node1.query("SELECT value FROM system.metrics WHERE metric = 'StorageBufferBytes'") == "0\n" diff --git a/tests/integration/test_table_functions_access_rights/test.py b/tests/integration/test_table_functions_access_rights/test.py index 16f18407960..90106303315 100644 --- a/tests/integration/test_table_functions_access_rights/test.py +++ b/tests/integration/test_table_functions_access_rights/test.py @@ -39,7 +39,7 @@ def test_merge(): instance.query("GRANT CREATE TEMPORARY TABLE ON *.* TO A") assert "no tables in database matches" in instance.query_and_get_error(select_query, user = 'A') - + instance.query("GRANT SELECT ON default.table1 TO A") assert instance.query(select_query, user = 'A') == "1\n" diff --git a/tests/performance/set_index.xml b/tests/performance/set_index.xml index 1fb7cf967f3..631cad9986e 100644 --- a/tests/performance/set_index.xml +++ b/tests/performance/set_index.xml @@ -3,17 +3,17 @@ INSERT INTO test_in SELECT number FROM numbers(500000000) - SELECT count() FROM test_in WHERE a IN (SELECT rand(1) FROM numbers(200000)) SETTINGS max_rows_to_read = 1, read_overflow_mode = 'break' + SELECT count() FROM test_in WHERE a IN (SELECT rand(1) FROM numbers(200000)) SETTINGS max_rows_to_read = 200001, read_overflow_mode = 'break' - SELECT count() FROM test_in WHERE toInt64(a) IN (SELECT toInt64(rand(1)) FROM numbers(200000)) settings max_rows_to_read=1, read_overflow_mode='break' + SELECT count() FROM test_in WHERE toInt64(a) IN (SELECT toInt64(rand(1)) FROM numbers(200000)) settings max_rows_to_read=200001, read_overflow_mode='break' - SELECT count() FROM test_in WHERE -toInt64(a) IN (SELECT toInt64(rand(1)) FROM numbers(200000)) settings max_rows_to_read=1, read_overflow_mode='break' + SELECT count() FROM test_in WHERE -toInt64(a) IN (SELECT toInt64(rand(1)) FROM numbers(200000)) settings max_rows_to_read=200001, read_overflow_mode='break' - SELECT count() FROM test_in WHERE -toInt64(a) NOT IN (SELECT toInt64(rand(1)) FROM numbers(200000)) settings max_rows_to_read=1, read_overflow_mode='break' + SELECT count() FROM test_in WHERE -toInt64(a) NOT IN (SELECT toInt64(rand(1)) FROM numbers(200000)) settings max_rows_to_read=200001, read_overflow_mode='break' SELECT (rand(), rand()) IN ((17258, 93148), (4508, 52749), (68660, 70017), (77797, 23528), (1136, 37393), (53237, 15379), (68370, 73211), (15782, 54962), (59432, 45415), (68396, 920), (96154, 21016), (12700, 26887), (88016, 43191), (68153, 51575), (91315, 40005), (18070, 73178), (86, 
631), (77717, 20324), (3227, 76188), (74960, 43147), (77538, 19628), (82292, 6525), (24293, 12566), (85244, 96287), (93982, 1329), (38064, 54723), (83999, 45810), (71921, 53673), (88638, 9669), (1959, 39535), (82235, 95796), (27907, 90975), (42383, 91015), (9948, 91514), (81712, 47309), (400, 25808), (31791, 46948), (39740, 36098), (25943, 84598), (99598, 52939), (77134, 15845), (40313, 72174), (85017, 94036), (36595, 14303), (83961, 68078), (55792, 72759), (73574, 43606), (9853, 63560), (28580, 56721), (74804, 41025), (32095, 55657), (52881, 63416), (91368, 90310), (23922, 38883), (30592, 10758), (66448, 61183), (31880, 96697), (11362, 20633), (75331, 2015), (71129, 8785), (1115, 70955), (7886, 83698), (18961, 84556), (16677, 43028), (37347, 70220), (31699, 71244), (10578, 96159), (67600, 39041), (78791, 86687), (21545, 54174), (68774, 37637), (46132, 81768), (98413, 20605), (2960, 23665), (31507, 35719), (96209, 18368), (60558, 38035), (21952, 3264), (11834, 86458), (21651, 17650), (86276, 36087), (18818, 24849), (61951, 3390), (59637, 62545), (30346, 72253), (36281, 2992), (78340, 49872), (94326, 93723), (3416, 94405), (12272, 8741), (22600, 22095), (57636, 37106), (38702, 14889), (70238, 11276), (17325, 60648), (16492, 41271), (52100, 1304), (93416, 7795), (57209, 71008), (48010, 36078), (20384, 74420), (77440, 34439), (69224, 45099), (30374, 33884), (49038, 90140), (1154, 84725), (64926, 86985), (91746, 73472), (59757, 75755), (45860, 71557), (45833, 36526), (74618, 73598), (91360, 65168), (58029, 30793), (56332, 14973), (99943, 96877), (97454, 6450), (64502, 77301), (73182, 31853), (76809, 83964), (82916, 86188), (78736, 65427), (36495, 7422), (76196, 2804), (96117, 61093), (9177, 26099), (52942, 63007), (48578, 47876), (50638, 89903), (7113, 97316), (35301, 12750), (47807, 7254), (38217, 55418), (56970, 41687), (20527, 62886), (358, 14021), (64018, 18582), (91740, 21683), (81967, 53589), (45437, 38450), (45476, 67752), (76851, 72072), (7304, 60091), (40097, 12897), (39906, 29247), (84262, 58734), (30857, 43791), (56087, 78929), (20498, 45954), (48726, 500), (62723, 43763), (28368, 30756), (74048, 52403), (15045, 95926), (75542, 55384), (52543, 22525), (56001, 6935), (11431, 46745), (77731, 7310), (36718, 59909), (32235, 91254), (92417, 25917), (21782, 79277), (46378, 87536), (35324, 26075), (6310, 76915), (1551, 69473), (50642, 68865), (55190, 72934), (49780, 21873), (99466, 29686), (90761, 13179), (72959, 57033), (20020, 90200), (46186, 79105), (73871, 52382), (59559, 38801), (59916, 16082), (33610, 94966), (46001, 45225), (86679, 26469), (77245, 91929), (32887, 36623), (11179, 46898), (87881, 68087), (45438, 47991), (24950, 94525), (91664, 51656), (43914, 47805), (15736, 96156), (56346, 20283), (85053, 48931), (17790, 26179), (96195, 55728), (43765, 54807), (44988, 89269), (55911, 99411), (52446, 47397), (28346, 65442), (96669, 68226), (66194, 26848), (37276, 55864), (14116, 41583), (18058, 16317), (93136, 85318), (35616, 86252), (29222, 29969), (33386, 85372), (71094, 44238), (27733, 31838), (64626, 16692), (52904, 97899), (97619, 12663), (50165, 4688), (67557, 44053), (69184, 66269), (73164, 89705), (39822, 15169), (65499, 72808), (30068, 63697), (30154, 64235), (97016, 58716), (94366, 36592), (1592, 16261), (87985, 52102), (12554, 23652), (15909, 25292), (2527, 91531), (92139, 36031), (28986, 30032), (3038, 56314), (32239, 26707), (15973, 34901), (70246, 39680), (82529, 38132), (45827, 74783), (53665, 64111), (55218, 84170), (20466, 16130), (55734, 71203), (31438, 96906), 
(66338, 85858), (35988, 68511), (78391, 15191), (80747, 59213), (5357, 11546), (16822, 16607), (36607, 41106), (74949, 30739), (45726, 64887), (1524, 54847), (37371, 89195), (28726, 27788), (22600, 44777), (53999, 63625), (84304, 98338), (49260, 76480), (74564, 53907), (89867, 97096), (60157, 61299), (17165, 10146), (56334, 36268), (62114, 49222), (22715, 23620), (42830, 11539), (41091, 69151), (75471, 68364), (18681, 43249), (42738, 63219), (35474, 98454), (76815, 46024), (66310, 36521), (86095, 77013), (63693, 77319), (80731, 63031), (95478, 92387), (23787, 63724), (46299, 68994), (4800, 2460), (9663, 80639), (77231, 85814), (81615, 11311), (35638, 27340), (13598, 14322), (30657, 17238), (90957, 96846), (69962, 52140), (41681, 65962), (96836, 58177), (36190, 11623), (4231, 40500), (43049, 41949), (71177, 98492), (30193, 39750), (19744, 33204), (63358, 30210), (45638, 58918), (43641, 38741), (35598, 40932), (33238, 36236), (50835, 20968), (25099, 34071), (84986, 88456), (35333, 1529), (79771, 23985), (647, 61658), (9424, 11743), (77766, 31528), (77811, 86973), (76403, 74377), (55568, 79251), (68858, 20762), (68520, 66773), (93598, 89823), (8080, 82539), (87760, 52247), (25191, 16905), (17837, 8339), (85177, 59050), (51680, 77374), (3287, 43018), (43479, 62141), (34909, 46322), (11869, 5885), (96193, 58417), (101, 47460), (34937, 88582), (83216, 88388), (28571, 15292), (66683, 62613), (34478, 8924), (2680, 89973), (62438, 44460), (11724, 4791), (5383, 72888), (88206, 67586), (8124, 21690), (28779, 75789), (66791, 4757), (6176, 47760), (6403, 78084), (78122, 35446), (99494, 73608), (39691, 89098), (59182, 19484), (25389, 98963), (96487, 3692), (76222, 67381), (21199, 50358), (95998, 58137), (28777, 43913), (14176, 60117), (52257, 81703), (14604, 13438), (71301, 14401), (19758, 66914), (15506, 29873), (87205, 29449), (93295, 15930), (63651, 11287), (19785, 15966), (30795, 75112), (69462, 37655), (18793, 85764), (36240, 31236), (98153, 73724), (72491, 4223), (66930, 35048), (25686, 13269), (13940, 13259), (69163, 11235), (1183, 86961), (54323, 67315), (85044, 60872), (48875, 3683), (43052, 92861), (87574, 32969), (92552, 80564), (94832, 47682), (72011, 80994), (60182, 917), (97788, 34169), (66432, 47940), (87468, 80954), (35385, 68758), (50555, 63710), (55311, 44337), (87065, 26514), (84581, 98736), (23212, 56499), (75120, 72447), (56087, 38285), (58171, 45629), (28401, 44319), (70432, 27883), (18891, 14646), (26206, 49924), (79957, 44914), (56064, 27529), (99090, 29197), (49435, 340), (53525, 65601), (76998, 88349), (50416, 70860), (42506, 75290), (34024, 13295), (86663, 46523), (88814, 231), (57809, 21), (84914, 84771), (43042, 66892), (17288, 33908), (4934, 63195), (50590, 1516), (97843, 80208), (20091, 86717), (71566, 15929), (19531, 23634), (41646, 45549), (89226, 82902), (96683, 63386), (31072, 53788), (51135, 41099), (78912, 65609), (36094, 23603), (88403, 51455), (73795, 47066), (26448, 82852), (22829, 2894), (30041, 92548), (27733, 20608), (70180, 19892), (51650, 63440), (76328, 13666), (40514, 6677), (2786, 51059), (40809, 16499), (10857, 82541), (78221, 61067), (17982, 51969), (85369, 66965), (47153, 47149), (43965, 75796), (82725, 60767), (42407, 97249), (51475, 81224), (60957, 89414), (33065, 21663), (36601, 5290), (95842, 67301), (64630, 60398), (55212, 35638), (41750, 44235), (75260, 82400), (91291, 25843), (6477, 8311), (14919, 52306), (66220, 33180), (45736, 2313), (37450, 64444), (98614, 61344), (75007, 50946), (56701, 28117), (66632, 5174), (92323, 76613), (6796, 73695), 
(33696, 76280), (86876, 5614), (50863, 67993), (36068, 17049), (91912, 34271), (70706, 1904), (97798, 41117), (68154, 72483), (83862, 25578), (61643, 17204), (69974, 64232), (77926, 19637), (64901, 88988), (71424, 91703), (91655, 17147), (46872, 56530), (44189, 98087), (95939, 54420), (72651, 68785), (67624, 84875), (92587, 87663), (65275, 81256), (53798, 2506), (14702, 3638), (71291, 50452), (14909, 13903), (66965, 26606), (14127, 60345), (35306, 1738), (77234, 10468), (53521, 41218), (80681, 82583), (44227, 26521), (32263, 21482), (82270, 56963), (50580, 80567), (11593, 22346), (20074, 26867), (73126, 28667), (62996, 24317), (20295, 57163), (1506, 57668), (69567, 45236), (43366, 26001), (88052, 40181), (1599, 89349), (36789, 1579), (39895, 46673), (30381, 3206), (31723, 5625), (19252, 31317), (16932, 77149), (48794, 34409), (55986, 30328), (47551, 75088), (57363, 78365), (95221, 63385), (26449, 5733), (96588, 53077), (52980, 41140), (8187, 85947), (36723, 26520), (23579, 38909), (33350, 19275), (63930, 19357), (43536, 59941), (31117, 77322), (44638, 94812), (44730, 99097), (95108, 48170), (57813, 49503), (79959, 89436), (86980, 62031), (8275, 44009), (36666, 94645), (22064, 38882), (40471, 16939), (31156, 11337), (13101, 96977), (17906, 26835), (89861, 51405), (73369, 67946), (99141, 58572), (27131, 98703), (15900, 43412), (51768, 93125), (78579, 46689), (23029, 13895), (60870, 55830), (22553, 8236), (76449, 96207), (83766, 51024), (27630, 50614), (53484, 90104), (77626, 21944), (46755, 41583), (53616, 34240), (94159, 44415), (13914, 90059), (44387, 89012), (27499, 64579), (83415, 30809), (77558, 82619), (88880, 9814), (8466, 4424), (43598, 91921), (24695, 3349), (46295, 65208), (51256, 82461), (49126, 93012), (16186, 96585), (43284, 22655), (93130, 90393), (77495, 34372), (85509, 65856), (86662, 61906), (50988, 44393), (29828, 17737), (91651, 35308), (29796, 49716), (14019, 87751), (29688, 71207), (82845, 19100), (11989, 50132), (21158, 99905), (54732, 42547), (32314, 12851), (46405, 43794), (87849, 45643), (53524, 21212), (61925, 75491), (12498, 21937), (30185, 69475), (48421, 52487), (15112, 90935), (33187, 17801), (61704, 25514), (17889, 23917), (18758, 57197), (7693, 47232), (47905, 24618), (11494, 78950), (95662, 54561), (8075, 33909), (90427, 46065), (73962, 19821), (50691, 79400), (58218, 4881), (94106, 2509), (60633, 55169), (49600, 83054), (23339, 13270), (70262, 58946), (48417, 97266), (27629, 46905), (74465, 75514), (41687, 2564), (12814, 19492), (78899, 30168), (17745, 35206), (37972, 35296), (22288, 80001), diff --git a/tests/queries/0_stateless/00398_url_functions.reference b/tests/queries/0_stateless/00398_url_functions.reference index 9cd18350d78..e5f89fbea78 100644 --- a/tests/queries/0_stateless/00398_url_functions.reference +++ b/tests/queries/0_stateless/00398_url_functions.reference @@ -45,6 +45,7 @@ com /?query=hello world+foo+bar /?query=hello world+foo+bar /?query=hello world+foo+bar +/?query=hello world foo+bar /a/b/c /a/b/c @@ -57,6 +58,7 @@ query=hello world+foo+bar query=hello world+foo+bar query=hello world+foo+bar query=hello world+foo+bar +query=hello world foo+bar ====FRAGMENT==== @@ -71,6 +73,7 @@ query=hello world+foo+bar#a=b query=hello world+foo+bar#a=b query=hello world+foo+bar#a=b #a=b +query=hello world foo+bar#a=b ====CUT TO FIRST SIGNIFICANT SUBDOMAIN==== example.com example.com diff --git a/tests/queries/0_stateless/00398_url_functions.sql b/tests/queries/0_stateless/00398_url_functions.sql index af03a6d487a..ea71ed226d7 100644 --- 
a/tests/queries/0_stateless/00398_url_functions.sql +++ b/tests/queries/0_stateless/00398_url_functions.sql @@ -49,6 +49,7 @@ SELECT decodeURLComponent(pathFull('//127.0.0.1/?query=hello%20world+foo%2Bbar') SELECT decodeURLComponent(pathFull('http://127.0.0.1/?query=hello%20world+foo%2Bbar')) AS Path; SELECT decodeURLComponent(materialize(pathFull('http://127.0.0.1/?query=hello%20world+foo%2Bbar'))) AS Path; SELECT decodeURLComponent(materialize(pathFull('//127.0.0.1/?query=hello%20world+foo%2Bbar'))) AS Path; +SELECT decodeURLFormComponent(materialize(pathFull('//127.0.0.1/?query=hello%20world+foo%2Bbar'))) AS Path; SELECT path('http://127.0.0.1') AS Path; SELECT path('http://127.0.0.1/a/b/c') AS Path; SELECT path('http://127.0.0.1:443/a/b/c') AS Path; @@ -62,6 +63,7 @@ SELECT decodeURLComponent(queryString('http://127.0.0.1/?query=hello%20world+foo SELECT decodeURLComponent(queryString('http://127.0.0.1:443/?query=hello%20world+foo%2Bbar')); SELECT decodeURLComponent(queryString('http://paul@127.0.0.1:443/?query=hello%20world+foo%2Bbar')); SELECT decodeURLComponent(queryString('//paul@127.0.0.1:443/?query=hello%20world+foo%2Bbar')); +SELECT decodeURLFormComponent(queryString('//paul@127.0.0.1:443/?query=hello%20world+foo%2Bbar')); SELECT '====FRAGMENT===='; SELECT decodeURLComponent(fragment('http://127.0.0.1/?query=hello%20world+foo%2Bbar')); @@ -78,6 +80,7 @@ SELECT decodeURLComponent(queryStringAndFragment('http://127.0.0.1/?query=hello% SELECT decodeURLComponent(queryStringAndFragment('http://paul@127.0.0.1/?query=hello%20world+foo%2Bbar#a=b')); SELECT decodeURLComponent(queryStringAndFragment('//paul@127.0.0.1/?query=hello%20world+foo%2Bbar#a=b')); SELECT decodeURLComponent(queryStringAndFragment('//paul@127.0.0.1/#a=b')); +SELECT decodeURLFormComponent(queryStringAndFragment('//paul@127.0.0.1/?query=hello%20world+foo%2Bbar#a=b')); SELECT '====CUT TO FIRST SIGNIFICANT SUBDOMAIN===='; SELECT cutToFirstSignificantSubdomain('http://www.example.com'); diff --git a/tests/queries/0_stateless/00646_url_engine.python b/tests/queries/0_stateless/00646_url_engine.python index 85ae3e776ed..4f47e819328 100644 --- a/tests/queries/0_stateless/00646_url_engine.python +++ b/tests/queries/0_stateless/00646_url_engine.python @@ -156,6 +156,7 @@ def test_select(table_name="", schema="str String,numuint UInt32,numint Int32,do if table_name: get_ch_answer("drop table if exists {}".format(table_name)) + def test_insert(table_name="", schema="str String,numuint UInt32,numint Int32,double Float64", requests_insert=[], requests_select=[], answers=[]): with open(CSV_DATA, 'w') as f: # flush test file f.write('') diff --git a/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.sh b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.sh index 2731e4bcce3..8d9e2689e26 100755 --- a/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.sh +++ b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.sh @@ -25,15 +25,15 @@ SELECT * FROM enum_mapping_protobuf_00825; EOF BINARY_FILE_PATH=$(mktemp "$CURDIR/00825_protobuf_format_enum_mapping.XXXXXX.binary") -$CLICKHOUSE_CLIENT --query "SELECT * FROM enum_mapping_protobuf_00825 FORMAT Protobuf SETTINGS format_schema = '$SCHEMADIR/00825_protobuf_format_enum_mapping:Message'" > "$BINARY_FILE_PATH" +$CLICKHOUSE_CLIENT --query "SELECT * FROM enum_mapping_protobuf_00825 FORMAT Protobuf SETTINGS format_schema = '$SCHEMADIR/00825_protobuf_format_enum_mapping:EnumMessage'" > "$BINARY_FILE_PATH" # Check the output in the protobuf format echo 
-$CURDIR/helpers/protobuf_length_delimited_encoder.py --decode_and_check --format_schema "$SCHEMADIR/00825_protobuf_format_enum_mapping:Message" --input "$BINARY_FILE_PATH" +$CURDIR/helpers/protobuf_length_delimited_encoder.py --decode_and_check --format_schema "$SCHEMADIR/00825_protobuf_format_enum_mapping:EnumMessage" --input "$BINARY_FILE_PATH" # Check the input in the protobuf format (now the table contains the same data twice). echo -$CLICKHOUSE_CLIENT --query "INSERT INTO enum_mapping_protobuf_00825 FORMAT Protobuf SETTINGS format_schema='$SCHEMADIR/00825_protobuf_format_enum_mapping:Message'" < "$BINARY_FILE_PATH" +$CLICKHOUSE_CLIENT --query "INSERT INTO enum_mapping_protobuf_00825 FORMAT Protobuf SETTINGS format_schema='$SCHEMADIR/00825_protobuf_format_enum_mapping:EnumMessage'" < "$BINARY_FILE_PATH" $CLICKHOUSE_CLIENT --query "SELECT * FROM enum_mapping_protobuf_00825" rm "$BINARY_FILE_PATH" diff --git a/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.yaodzJ.binary b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.yaodzJ.binary new file mode 100644 index 00000000000..4b7b97a300f Binary files /dev/null and b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.yaodzJ.binary differ diff --git a/tests/queries/0_stateless/00907_set_index_with_nullable_and_low_cardinality_bug.sql b/tests/queries/0_stateless/00907_set_index_with_nullable_and_low_cardinality_bug.sql index 3a55a69c726..336d9984e69 100644 --- a/tests/queries/0_stateless/00907_set_index_with_nullable_and_low_cardinality_bug.sql +++ b/tests/queries/0_stateless/00907_set_index_with_nullable_and_low_cardinality_bug.sql @@ -8,7 +8,7 @@ CREATE TABLE null_lc_set_index ( INDEX test_user_idx (user) TYPE set(0) GRANULARITY 8192 ) ENGINE=MergeTree PARTITION BY toYYYYMMDD(timestamp) - ORDER BY (timestamp, action, cityHash64(user)); + ORDER BY (timestamp, action, cityHash64(user)) SETTINGS allow_nullable_key = 1; INSERT INTO null_lc_set_index VALUES (1550883010, 'subscribe', 'alice'); INSERT INTO null_lc_set_index VALUES (1550883020, 'follow', 'bob'); diff --git a/tests/queries/0_stateless/00945_bloom_filter_index.sql b/tests/queries/0_stateless/00945_bloom_filter_index.sql index f45c4c04290..497b0cdb641 100644 --- a/tests/queries/0_stateless/00945_bloom_filter_index.sql +++ b/tests/queries/0_stateless/00945_bloom_filter_index.sql @@ -14,10 +14,10 @@ SELECT COUNT() FROM single_column_bloom_filter WHERE i32 IN (1, 2) SETTINGS max_ SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i32) IN ((1, 2), (2, 3)) SETTINGS max_rows_to_read = 6; SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i64) IN ((1, 1), (2, 2)) SETTINGS max_rows_to_read = 6; SELECT COUNT() FROM single_column_bloom_filter WHERE (i64, (i64, i32)) IN ((1, (1, 1)), (2, (2, 2))) SETTINGS max_rows_to_read = 6; -SELECT COUNT() FROM single_column_bloom_filter WHERE i32 IN (SELECT arrayJoin([toInt32(1), toInt32(2)])) SETTINGS max_rows_to_read = 6; -SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i32) IN (SELECT arrayJoin([(toInt32(1), toInt32(2)), (toInt32(2), toInt32(3))])) SETTINGS max_rows_to_read = 6; -SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i64) IN (SELECT arrayJoin([(toInt32(1), toUInt64(1)), (toInt32(2), toUInt64(2))])) SETTINGS max_rows_to_read = 6; -SELECT COUNT() FROM single_column_bloom_filter WHERE (i64, (i64, i32)) IN (SELECT arrayJoin([(toUInt64(1), (toUInt64(1), toInt32(1))), (toUInt64(2), (toUInt64(2), toInt32(2)))])) SETTINGS max_rows_to_read = 6; +SELECT COUNT() FROM 
single_column_bloom_filter WHERE i32 IN (SELECT arrayJoin([toInt32(1), toInt32(2)])) SETTINGS max_rows_to_read = 7; +SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i32) IN (SELECT arrayJoin([(toInt32(1), toInt32(2)), (toInt32(2), toInt32(3))])) SETTINGS max_rows_to_read = 7; +SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i64) IN (SELECT arrayJoin([(toInt32(1), toUInt64(1)), (toInt32(2), toUInt64(2))])) SETTINGS max_rows_to_read = 7; +SELECT COUNT() FROM single_column_bloom_filter WHERE (i64, (i64, i32)) IN (SELECT arrayJoin([(toUInt64(1), (toUInt64(1), toInt32(1))), (toUInt64(2), (toUInt64(2), toInt32(2)))])) SETTINGS max_rows_to_read = 7; WITH (1, 2) AS liter_prepared_set SELECT COUNT() FROM single_column_bloom_filter WHERE i32 IN liter_prepared_set SETTINGS max_rows_to_read = 6; WITH ((1, 2), (2, 3)) AS liter_prepared_set SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i32) IN liter_prepared_set SETTINGS max_rows_to_read = 6; WITH ((1, 1), (2, 2)) AS liter_prepared_set SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i64) IN liter_prepared_set SETTINGS max_rows_to_read = 6; @@ -183,7 +183,7 @@ CREATE TABLE bloom_filter_array_lc_null_types_test ( fixed_string Array(LowCardinality(Nullable(FixedString(5)))), INDEX idx (i8, i16, i32, i64, u8, u16, u32, u64, f32, f64, date, date_time, str, fixed_string) TYPE bloom_filter GRANULARITY 1) -ENGINE = MergeTree() ORDER BY order_key SETTINGS index_granularity = 6; +ENGINE = MergeTree() ORDER BY order_key SETTINGS index_granularity = 6, allow_nullable_key = 1; INSERT INTO bloom_filter_array_lc_null_types_test SELECT groupArray(number) AS order_key, diff --git a/tests/queries/0_stateless/00974_query_profiler.reference b/tests/queries/0_stateless/00974_query_profiler.reference index e37cf5f7642..708c4988416 100644 --- a/tests/queries/0_stateless/00974_query_profiler.reference +++ b/tests/queries/0_stateless/00974_query_profiler.reference @@ -1,4 +1,4 @@ 0 0 1 -1000000000 0 +10000000000 0 1 diff --git a/tests/queries/0_stateless/00974_query_profiler.sql b/tests/queries/0_stateless/00974_query_profiler.sql index 45ba6504a79..24e4241b813 100644 --- a/tests/queries/0_stateless/00974_query_profiler.sql +++ b/tests/queries/0_stateless/00974_query_profiler.sql @@ -15,7 +15,7 @@ SELECT count() > 0 FROM system.trace_log t WHERE query_id = (SELECT query_id FRO SET query_profiler_real_time_period_ns = 0; SET query_profiler_cpu_time_period_ns = 1000000; SET log_queries = 1; -SELECT count(), ignore('test cpu time query profiler') FROM numbers(1000000000); +SELECT count(), ignore('test cpu time query profiler') FROM numbers_mt(10000000000); SET log_queries = 0; SYSTEM FLUSH LOGS; diff --git a/tests/queries/0_stateless/01045_bloom_filter_null_array.sql b/tests/queries/0_stateless/01045_bloom_filter_null_array.sql index 3dfc04ae8ff..4a5741b4e72 100644 --- a/tests/queries/0_stateless/01045_bloom_filter_null_array.sql +++ b/tests/queries/0_stateless/01045_bloom_filter_null_array.sql @@ -1,6 +1,6 @@ DROP TABLE IF EXISTS bloom_filter_null_array; -CREATE TABLE bloom_filter_null_array (v Array(LowCardinality(Nullable(String))), INDEX idx v TYPE bloom_filter(0.1) GRANULARITY 1) ENGINE = MergeTree() ORDER BY v; +CREATE TABLE bloom_filter_null_array (v Array(LowCardinality(Nullable(String))), INDEX idx v TYPE bloom_filter(0.1) GRANULARITY 1) ENGINE = MergeTree() ORDER BY v SETTINGS allow_nullable_key = 1; INSERT INTO bloom_filter_null_array VALUES ([]); INSERT INTO bloom_filter_null_array VALUES (['1', '2']) ([]) ([]); diff --git 
a/tests/queries/0_stateless/01047_window_view_parser_inner_table.reference b/tests/queries/0_stateless/01047_window_view_parser_inner_table.reference index 77f48f2832c..96f7cbb1d69 100644 --- a/tests/queries/0_stateless/01047_window_view_parser_inner_table.reference +++ b/tests/queries/0_stateless/01047_window_view_parser_inner_table.reference @@ -1,8 +1,12 @@ ---TUMBLE--- +||---DEFAULT ENGINE WITH DATA COLUMN ALIAS--- +CREATE TABLE test_01047.`.inner.wv`\n(\n `b` Int32,\n `windowID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'))`\nORDER BY (`windowID(timestamp, toIntervalSecond(\'1\'))`, b)\nSETTINGS index_granularity = 8192 ||---WINDOW COLUMN NAME--- CREATE TABLE test_01047.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nORDER BY `windowID(timestamp, toIntervalSecond(\'1\'))`\nSETTINGS index_granularity = 8192 ||---WINDOW COLUMN ALIAS--- CREATE TABLE test_01047.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nORDER BY `windowID(timestamp, toIntervalSecond(\'1\'))`\nSETTINGS index_granularity = 8192 +||---DATA COLUMN ALIAS--- +CREATE TABLE test_01047.`.inner.wv`\n(\n `b` Int32,\n `windowID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nORDER BY b\nSETTINGS index_granularity = 8192 ||---IDENTIFIER--- CREATE TABLE test_01047.`.inner.wv`\n(\n `b` Int32,\n `windowID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'))`\nORDER BY (`windowID(timestamp, toIntervalSecond(\'1\'))`, b)\nSETTINGS index_granularity = 8192 ||---FUNCTION--- @@ -10,10 +14,14 @@ CREATE TABLE test_01047.`.inner.wv`\n(\n `plus(a, b)` Int64,\n `windowID(t ||---PARTITION--- CREATE TABLE test_01047.`.inner.wv`\n(\n `windowID(____timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPARTITION BY `windowID(____timestamp, toIntervalSecond(\'1\'))`\nORDER BY `windowID(____timestamp, toIntervalSecond(\'1\'))`\nSETTINGS index_granularity = 8192 ---HOP--- +||---DEFAULT ENGINE WITH DATA COLUMN ALIAS--- +CREATE TABLE test_01047.`.inner.wv`\n(\n `b` Int32,\n `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nORDER BY (`windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`, b)\nSETTINGS index_granularity = 8192 ||---WINDOW COLUMN NAME--- CREATE TABLE test_01047.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nORDER BY `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nSETTINGS index_granularity = 8192 ||---WINDOW COLUMN ALIAS--- CREATE TABLE test_01047.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nORDER BY `windowID(timestamp, 
toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nSETTINGS index_granularity = 8192 +||---DATA COLUMN ALIAS--- +CREATE TABLE test_01047.`.inner.wv`\n(\n `b` Int32,\n `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nORDER BY b\nSETTINGS index_granularity = 8192 ||---IDENTIFIER--- CREATE TABLE test_01047.`.inner.wv`\n(\n `b` Int32,\n `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nORDER BY (`windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`, b)\nSETTINGS index_granularity = 8192 ||---FUNCTION--- diff --git a/tests/queries/0_stateless/01047_window_view_parser_inner_table.sql b/tests/queries/0_stateless/01047_window_view_parser_inner_table.sql index 777c5ae2a5a..595d93e0771 100644 --- a/tests/queries/0_stateless/01047_window_view_parser_inner_table.sql +++ b/tests/queries/0_stateless/01047_window_view_parser_inner_table.sql @@ -9,6 +9,12 @@ DROP TABLE IF EXISTS test_01047.mt; CREATE TABLE test_01047.mt(a Int32, b Int32, timestamp DateTime) ENGINE=MergeTree ORDER BY tuple(); SELECT '---TUMBLE---'; +SELECT '||---DEFAULT ENGINE WITH DATA COLUMN ALIAS---'; +DROP TABLE IF EXISTS test_01047.wv; +DROP TABLE IF EXISTS test_01047.`.inner.wv`; +CREATE WINDOW VIEW test_01047.wv AS SELECT count(a) AS count, b as id FROM test_01047.mt GROUP BY id, tumble(timestamp, INTERVAL '1' SECOND); +SHOW CREATE TABLE test_01047.`.inner.wv`; + SELECT '||---WINDOW COLUMN NAME---'; DROP TABLE IF EXISTS test_01047.wv; DROP TABLE IF EXISTS test_01047.`.inner.wv`; @@ -21,6 +27,12 @@ DROP TABLE IF EXISTS test_01047.`.inner.wv`; CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY wid AS SELECT count(a) AS count, tumble(timestamp, INTERVAL '1' SECOND) AS wid FROM test_01047.mt GROUP BY wid; SHOW CREATE TABLE test_01047.`.inner.wv`; +SELECT '||---DATA COLUMN ALIAS---'; +DROP TABLE IF EXISTS test_01047.wv; +DROP TABLE IF EXISTS test_01047.`.inner.wv`; +CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY id AS SELECT count(a) AS count, b as id FROM test_01047.mt GROUP BY id, tumble(timestamp, INTERVAL '1' SECOND); +SHOW CREATE TABLE test_01047.`.inner.wv`; + SELECT '||---IDENTIFIER---'; DROP TABLE IF EXISTS test_01047.wv; DROP TABLE IF EXISTS test_01047.`.inner.wv`; @@ -41,6 +53,12 @@ SHOW CREATE TABLE test_01047.`.inner.wv`; SELECT '---HOP---'; +SELECT '||---DEFAULT ENGINE WITH DATA COLUMN ALIAS---'; +DROP TABLE IF EXISTS test_01047.wv; +DROP TABLE IF EXISTS test_01047.`.inner.wv`; +CREATE WINDOW VIEW test_01047.wv AS SELECT count(a) AS count, b as id FROM test_01047.mt GROUP BY id, hop(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND); +SHOW CREATE TABLE test_01047.`.inner.wv`; + SELECT '||---WINDOW COLUMN NAME---'; DROP TABLE IF EXISTS test_01047.wv; DROP TABLE IF EXISTS test_01047.`.inner.wv`; @@ -53,6 +71,12 @@ DROP TABLE IF EXISTS test_01047.`.inner.wv`; CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY wid AS SELECT count(a) AS count, hop(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS wid FROM test_01047.mt GROUP BY wid; SHOW CREATE TABLE test_01047.`.inner.wv`; +SELECT '||---DATA COLUMN ALIAS---'; +DROP TABLE IF EXISTS test_01047.wv; +DROP TABLE IF EXISTS test_01047.`.inner.wv`; +CREATE WINDOW VIEW test_01047.wv ENGINE 
AggregatingMergeTree ORDER BY id AS SELECT count(a) AS count, b as id FROM test_01047.mt GROUP BY id, hop(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND); +SHOW CREATE TABLE test_01047.`.inner.wv`; + SELECT '||---IDENTIFIER---'; DROP TABLE IF EXISTS test_01047.wv; DROP TABLE IF EXISTS test_01047.`.inner.wv`; diff --git a/tests/queries/0_stateless/01064_incremental_streaming_from_2_src_with_feedback.sql b/tests/queries/0_stateless/01064_incremental_streaming_from_2_src_with_feedback.sql index a653206fe18..0bc5fcd1db8 100644 --- a/tests/queries/0_stateless/01064_incremental_streaming_from_2_src_with_feedback.sql +++ b/tests/queries/0_stateless/01064_incremental_streaming_from_2_src_with_feedback.sql @@ -89,8 +89,11 @@ INSERT INTO checkouts SELECT number as id, '2000-01-01 10:00:00' from numbers(50 -- by this time we should have 3 parts for target_table because of prev inserts -- and we plan to make two more inserts. With index_granularity=128 and max id=1000 -- we expect to read not more than: +-- 1000 rows read from numbers(1000) in the INSERT itself +-- 1000 rows in the `IN (SELECT id FROM table)` in the mat views -- (1000/128) marks per part * (3 + 2) parts * 128 granularity = 5120 rows -set max_rows_to_read = 5120; +-- Total: 7120 +set max_rows_to_read = 7120; INSERT INTO logins SELECT number as id, '2000-01-01 11:00:00' from numbers(1000); INSERT INTO checkouts SELECT number as id, '2000-01-01 11:10:00' from numbers(1000); @@ -98,8 +101,8 @@ INSERT INTO checkouts SELECT number as id, '2000-01-01 11:10:00' from numbers(10 -- by this time we should have 5 parts for target_table because of prev inserts -- and we plan to make two more inserts. With index_granularity=128 and max id=1 -- we expect to read not more than: --- 1 mark per part * (5 + 2) parts * 128 granularity = 896 rows -set max_rows_to_read = 896; +-- 1 mark per part * (5 + 2) parts * 128 granularity + 1 (numbers(1)) = 897 rows +set max_rows_to_read = 897; INSERT INTO logins SELECT number+2 as id, '2001-01-01 11:10:01' from numbers(1); INSERT INTO checkouts SELECT number+2 as id, '2001-01-01 11:10:02' from numbers(1); diff --git a/tests/queries/0_stateless/01176_mysql_client_interactive.expect b/tests/queries/0_stateless/01176_mysql_client_interactive.expect index 37087dd85f7..5bbc77ccf14 100755 --- a/tests/queries/0_stateless/01176_mysql_client_interactive.expect +++ b/tests/queries/0_stateless/01176_mysql_client_interactive.expect @@ -5,11 +5,12 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail + expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01179_insert_values_semicolon.expect b/tests/queries/0_stateless/01179_insert_values_semicolon.expect index 0e65e5c4cbf..bf937c3a6a4 100755 --- a/tests/queries/0_stateless/01179_insert_values_semicolon.expect +++ b/tests/queries/0_stateless/01179_insert_values_semicolon.expect @@ -1,13 +1,14 @@ #!/usr/bin/expect -f +# Tags: long log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git 
a/tests/queries/0_stateless/01180_client_syntax_errors.expect b/tests/queries/0_stateless/01180_client_syntax_errors.expect index c20982b2991..6e4e975988e 100755 --- a/tests/queries/0_stateless/01180_client_syntax_errors.expect +++ b/tests/queries/0_stateless/01180_client_syntax_errors.expect @@ -3,11 +3,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01213_alter_rename_with_default_zookeeper_long.reference b/tests/queries/0_stateless/01213_alter_rename_with_default_zookeeper_long.reference index 2a6b00cdddb..968247bd35b 100644 --- a/tests/queries/0_stateless/01213_alter_rename_with_default_zookeeper_long.reference +++ b/tests/queries/0_stateless/01213_alter_rename_with_default_zookeeper_long.reference @@ -8,10 +8,10 @@ Hello 1 Word 1 date1 date2 value1 value2 2019-10-02 2018-10-02 1 1 -CREATE TABLE default.table_rename_with_ttl\n(\n `date1` Date,\n `date2` Date,\n `value1` String,\n `value2` String TTL date1 + toIntervalMonth(10000)\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/default/test_01213/table_rename_with_ttl\', \'1\')\nORDER BY tuple()\nTTL date2 + toIntervalMonth(10000)\nSETTINGS index_granularity = 8192 +CREATE TABLE default.table_rename_with_ttl\n(\n `date1` Date,\n `date2` Date,\n `value1` String,\n `value2` String TTL date1 + toIntervalMonth(500)\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/default/test_01213/table_rename_with_ttl\', \'1\')\nORDER BY tuple()\nTTL date2 + toIntervalMonth(500)\nSETTINGS index_granularity = 8192 renamed_date1 date2 value1 value2 2019-10-02 2018-10-02 1 1 -CREATE TABLE default.table_rename_with_ttl\n(\n `renamed_date1` Date,\n `date2` Date,\n `value1` String,\n `value2` String TTL renamed_date1 + toIntervalMonth(10000)\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/default/test_01213/table_rename_with_ttl\', \'1\')\nORDER BY tuple()\nTTL date2 + toIntervalMonth(10000)\nSETTINGS index_granularity = 8192 +CREATE TABLE default.table_rename_with_ttl\n(\n `renamed_date1` Date,\n `date2` Date,\n `value1` String,\n `value2` String TTL renamed_date1 + toIntervalMonth(500)\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/default/test_01213/table_rename_with_ttl\', \'1\')\nORDER BY tuple()\nTTL date2 + toIntervalMonth(500)\nSETTINGS index_granularity = 8192 renamed_date1 renamed_date2 value1 value2 2019-10-02 2018-10-02 1 1 -CREATE TABLE default.table_rename_with_ttl\n(\n `renamed_date1` Date,\n `renamed_date2` Date,\n `value1` String,\n `value2` String TTL renamed_date1 + toIntervalMonth(10000)\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/default/test_01213/table_rename_with_ttl\', \'1\')\nORDER BY tuple()\nTTL renamed_date2 + toIntervalMonth(10000)\nSETTINGS index_granularity = 8192 +CREATE TABLE default.table_rename_with_ttl\n(\n `renamed_date1` Date,\n `renamed_date2` Date,\n `value1` String,\n `value2` String TTL renamed_date1 + toIntervalMonth(500)\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/default/test_01213/table_rename_with_ttl\', \'1\')\nORDER BY tuple()\nTTL renamed_date2 + toIntervalMonth(500)\nSETTINGS index_granularity = 8192 diff --git a/tests/queries/0_stateless/01213_alter_rename_with_default_zookeeper_long.sql b/tests/queries/0_stateless/01213_alter_rename_with_default_zookeeper_long.sql index 
986947d5979..a831fd18bfe 100644 --- a/tests/queries/0_stateless/01213_alter_rename_with_default_zookeeper_long.sql +++ b/tests/queries/0_stateless/01213_alter_rename_with_default_zookeeper_long.sql @@ -38,11 +38,11 @@ CREATE TABLE table_rename_with_ttl date1 Date, date2 Date, value1 String, - value2 String TTL date1 + INTERVAL 10000 MONTH + value2 String TTL date1 + INTERVAL 500 MONTH ) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_01213/table_rename_with_ttl', '1') ORDER BY tuple() -TTL date2 + INTERVAL 10000 MONTH; +TTL date2 + INTERVAL 500 MONTH; INSERT INTO table_rename_with_ttl SELECT toDateTime(toDate('2019-10-01') + number % 3, 'Europe/Moscow'), toDateTime(toDate('2018-10-01') + number % 3, 'Europe/Moscow'), toString(number), toString(number) from numbers(9); diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index d347f149230..cc237a40a3f 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -114,6 +114,7 @@ SYSTEM RESTORE REPLICA ['RESTORE REPLICA'] TABLE SYSTEM SYSTEM FLUSH DISTRIBUTED ['FLUSH DISTRIBUTED'] TABLE SYSTEM FLUSH SYSTEM FLUSH LOGS ['FLUSH LOGS'] GLOBAL SYSTEM FLUSH SYSTEM FLUSH [] \N SYSTEM +SYSTEM THREAD FUZZER ['SYSTEM START THREAD FUZZER','SYSTEM STOP THREAD FUZZER','START THREAD FUZZER','STOP THREAD FUZZER'] GLOBAL SYSTEM SYSTEM [] \N ALL dictGet ['dictHas','dictGetHierarchy','dictIsIn'] DICTIONARY ALL addressToLine [] GLOBAL INTROSPECTION diff --git a/tests/queries/0_stateless/01281_parseDateTime64BestEffort.reference b/tests/queries/0_stateless/01281_parseDateTime64BestEffort.reference index 5d2507d2a08..b76db01a8ab 100644 --- a/tests/queries/0_stateless/01281_parseDateTime64BestEffort.reference +++ b/tests/queries/0_stateless/01281_parseDateTime64BestEffort.reference @@ -13,3 +13,7 @@ Formats 2020-05-14 03:37:03.253 2020-05-14 03:37:03.000 2020-05-14 03:37:03.000 +Unix Timestamp with Milliseconds +2021-12-28 00:00:00.123 +2021-12-28 00:00:00.1 +2021-12-28 00:00:00.123000 diff --git a/tests/queries/0_stateless/01281_parseDateTime64BestEffort.sql b/tests/queries/0_stateless/01281_parseDateTime64BestEffort.sql index 5c0bbe1b4c2..ac1186284be 100644 --- a/tests/queries/0_stateless/01281_parseDateTime64BestEffort.sql +++ b/tests/queries/0_stateless/01281_parseDateTime64BestEffort.sql @@ -30,4 +30,9 @@ SELECT parseDateTime64BestEffort('2020-05-14T03:37:03.253184Z', 3, 'Europe/Minsk SELECT 'Formats'; SELECT parseDateTime64BestEffort('2020-05-14T03:37:03.253184', 3, 'UTC'); SELECT parseDateTime64BestEffort('2020-05-14T03:37:03', 3, 'UTC'); -SELECT parseDateTime64BestEffort('2020-05-14 03:37:03', 3, 'UTC'); \ No newline at end of file +SELECT parseDateTime64BestEffort('2020-05-14 03:37:03', 3, 'UTC'); + +SELECT 'Unix Timestamp with Milliseconds'; +SELECT parseDateTime64BestEffort('1640649600123', 3, 'UTC'); +SELECT parseDateTime64BestEffort('1640649600123', 1, 'UTC'); +SELECT parseDateTime64BestEffort('1640649600123', 6, 'UTC'); diff --git a/tests/queries/0_stateless/01293_client_interactive_vertical_multiline.expect b/tests/queries/0_stateless/01293_client_interactive_vertical_multiline.expect index 5e845754402..e4442047c87 100755 --- a/tests/queries/0_stateless/01293_client_interactive_vertical_multiline.expect +++ b/tests/queries/0_stateless/01293_client_interactive_vertical_multiline.expect @@ -4,11 +4,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do 
nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01293_client_interactive_vertical_singleline.expect b/tests/queries/0_stateless/01293_client_interactive_vertical_singleline.expect index c68b153d5d3..2f871ab46d8 100755 --- a/tests/queries/0_stateless/01293_client_interactive_vertical_singleline.expect +++ b/tests/queries/0_stateless/01293_client_interactive_vertical_singleline.expect @@ -3,11 +3,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01300_client_save_history_when_terminated_long.expect b/tests/queries/0_stateless/01300_client_save_history_when_terminated_long.expect index 05d9d408228..ad5b7625929 100755 --- a/tests/queries/0_stateless/01300_client_save_history_when_terminated_long.expect +++ b/tests/queries/0_stateless/01300_client_save_history_when_terminated_long.expect @@ -4,11 +4,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01370_client_autocomplete_word_break_characters.expect b/tests/queries/0_stateless/01370_client_autocomplete_word_break_characters.expect index e0d01d905bb..9c20b7c517e 100755 --- a/tests/queries/0_stateless/01370_client_autocomplete_word_break_characters.expect +++ b/tests/queries/0_stateless/01370_client_autocomplete_word_break_characters.expect @@ -3,11 +3,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] @@ -20,6 +20,7 @@ expect "SET max_distributed" # Wait for suggestions to load, they are loaded in background set is_done 0 +set timeout 1 while {$is_done == 0} { send -- "\t" expect { @@ -27,10 +28,15 @@ while {$is_done == 0} { set is_done 1 } default { - sleep 1 + # expect "_" will wait for timeout, + # if completion was not loaded it will fail, + # and we will retry, + # but for retry on timeout the default should be reset, + # which is what this block is for.
} } } +set timeout 60 send -- "\3\4" expect eof diff --git a/tests/queries/0_stateless/01410_nullable_key_and_index.sql b/tests/queries/0_stateless/01410_nullable_key_and_index.sql index fd1712b5d24..46a58152700 100644 --- a/tests/queries/0_stateless/01410_nullable_key_and_index.sql +++ b/tests/queries/0_stateless/01410_nullable_key_and_index.sql @@ -65,3 +65,12 @@ CREATE TABLE xxxx_null (`ts` Nullable(DateTime)) ENGINE = MergeTree ORDER BY toS INSERT INTO xxxx_null SELECT '2021-11-11 00:00:00'; SELECT * FROM xxxx_null WHERE ts > '2021-10-11 00:00:00'; DROP TABLE xxxx_null; + +-- nullable keys are forbidden when `allow_nullable_key = 0` +CREATE TABLE invalid_null (id Nullable(String)) ENGINE = MergeTree ORDER BY id; -- { serverError 44 } +CREATE TABLE invalid_lc_null (id LowCardinality(Nullable(String))) ENGINE = MergeTree ORDER BY id; -- { serverError 44 } +CREATE TABLE invalid_array_null (id Array(Nullable(String))) ENGINE = MergeTree ORDER BY id; -- { serverError 44 } +CREATE TABLE invalid_tuple_null (id Tuple(Nullable(String), UInt8)) ENGINE = MergeTree ORDER BY id; -- { serverError 44 } +CREATE TABLE invalid_map_null (id Map(UInt8, Nullable(String))) ENGINE = MergeTree ORDER BY id; -- { serverError 44 } +CREATE TABLE invalid_simple_agg_state_null (id SimpleAggregateFunction(sum, Nullable(UInt64))) ENGINE = MergeTree ORDER BY id; -- { serverError 44 } +-- AggregateFunctions are not comparable and cannot be used in key expressions. No need to test it. diff --git a/tests/queries/0_stateless/01410_nullable_key_more_tests.reference b/tests/queries/0_stateless/01410_nullable_key_more_tests.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01410_nullable_key_more_tests.sh b/tests/queries/0_stateless/01410_nullable_key_more_tests.sh new file mode 100755 index 00000000000..03bebed324b --- /dev/null +++ b/tests/queries/0_stateless/01410_nullable_key_more_tests.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +# Tags: no-parallel + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +test_func() +{ + engine=$1 + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "drop table if exists table_with_nullable_keys" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "create table table_with_nullable_keys (nullable_int Nullable(UInt32), nullable_str Nullable(String), nullable_lc LowCardinality(Nullable(String)), nullable_ints Array(Nullable(UInt32)), nullable_misc Tuple(Nullable(String), Nullable(UInt32)), nullable_val Map(UInt32, Nullable(String)), value UInt8) engine $engine order by (nullable_int, nullable_str, nullable_lc, nullable_ints, nullable_misc, nullable_val) settings allow_nullable_key = 1, index_granularity = 1" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "insert into table_with_nullable_keys select * replace (cast(nullable_val as Map(UInt32, Nullable(String))) as nullable_val) from generateRandom('nullable_int Nullable(UInt32), nullable_str Nullable(String), nullable_lc Nullable(String), nullable_ints Array(Nullable(UInt32)), nullable_misc Tuple(Nullable(String), Nullable(UInt32)), nullable_val Array(Tuple(UInt32, Nullable(String))), value UInt8', 1, 30, 30) limit 1024" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "select * from table_with_nullable_keys where nullable_str = (select randomPrintableASCII(30)) or nullable_str in (select randomPrintableASCII(30) from numbers(3)) format Null" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "select * from table_with_nullable_keys where nullable_lc = (select randomPrintableASCII(30)) or nullable_lc in (select randomPrintableASCII(30) from numbers(3)) format Null" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "select * from table_with_nullable_keys where nullable_ints = [1, 2, null] or nullable_ints in (select * from generateRandom('nullable_ints Array(Nullable(UInt32))', 1, 30, 30) limit 3) format Null" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "select * from table_with_nullable_keys where nullable_misc = (select (randomPrintableASCII(30), rand())) or nullable_misc in (select arrayJoin([(randomPrintableASCII(30), null), (null, rand())]))" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "select * from table_with_nullable_keys where nullable_val = (select map(rand(), randomPrintableASCII(10), rand(2), randomPrintableASCII(20), rand(3), null)) or nullable_val in (select cast(nullable_ints as Map(UInt32, Nullable(String))) from generateRandom('nullable_ints Array(Tuple(UInt32, Nullable(String)))', 1, 30, 30) limit 3) format Null" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "drop table table_with_nullable_keys" +} + +test_func MergeTree +test_func AggregatingMergeTree +test_func ReplacingMergeTree diff --git a/tests/queries/0_stateless/01414_low_cardinality_nullable.sql b/tests/queries/0_stateless/01414_low_cardinality_nullable.sql index 596e90adfd6..c11e990cea8 100644 --- a/tests/queries/0_stateless/01414_low_cardinality_nullable.sql +++ b/tests/queries/0_stateless/01414_low_cardinality_nullable.sql @@ -19,7 +19,7 @@ CREATE TABLE lc_nullable ( str Array(LowCardinality(Nullable(String))), fixed_string Array(LowCardinality(Nullable(FixedString(5)))) -) ENGINE = MergeTree() ORDER BY order_key; +) ENGINE = MergeTree() ORDER BY order_key SETTINGS allow_nullable_key = 1; INSERT INTO lc_nullable SELECT groupArray(number) AS order_key, diff --git a/tests/queries/0_stateless/01442_date_time_with_params.reference b/tests/queries/0_stateless/01442_date_time_with_params.reference index 19f78c83f82..726e59d4d35 100644 --- a/tests/queries/0_stateless/01442_date_time_with_params.reference +++ 
b/tests/queries/0_stateless/01442_date_time_with_params.reference @@ -13,6 +13,8 @@ parseDateTimeBestEffort 2020-05-14 03:37:03.253 DateTime64(3, \'UTC\') 2020-05-14 06:37:03.253 DateTime64(3, \'Europe/Minsk\') 2020-05-14 03:37:03.253 DateTime64(3, \'UTC\') +2021-12-28 00:00:00.123 DateTime64(3, \'UTC\') +2021-12-28 00:00:00 DateTime(\'UTC\') parseDateTimeBestEffortOrNull \N Nullable(DateTime64(3)) 2020-05-14 03:37:03.000 Nullable(DateTime64(3, \'UTC\')) @@ -25,6 +27,8 @@ parseDateTimeBestEffortOrNull 2020-05-14 03:37:03.253 Nullable(DateTime64(3, \'UTC\')) 2020-05-14 06:37:03.253 Nullable(DateTime64(3, \'Europe/Minsk\')) 2020-05-14 03:37:03.253 Nullable(DateTime64(3, \'UTC\')) +2021-12-28 00:00:00.123 Nullable(DateTime64(3, \'UTC\')) +2021-12-28 00:00:00 Nullable(DateTime(\'UTC\')) parseDateTimeBestEffortOrZero 1970-01-01 00:00:00.000 DateTime64(3, \'UTC\') 2020-05-14 03:37:03.000 DateTime64(3, \'UTC\') @@ -37,6 +41,8 @@ parseDateTimeBestEffortOrZero 2020-05-14 03:37:03.253 DateTime64(3, \'UTC\') 2020-05-14 06:37:03.253 DateTime64(3, \'Europe/Minsk\') 2020-05-14 03:37:03.253 DateTime64(3, \'UTC\') +2021-12-28 00:00:00.123 DateTime64(3, \'UTC\') +2021-12-28 00:00:00 DateTime(\'UTC\') parseDateTime32BestEffort 2020-05-14 03:37:03 DateTime(\'UTC\') 2020-05-14 03:37:03 DateTime(\'UTC\') @@ -48,6 +54,7 @@ parseDateTime32BestEffort 2020-05-14 03:37:03 DateTime(\'UTC\') 2020-05-14 06:37:03 DateTime(\'Europe/Minsk\') 2020-05-14 03:37:03 DateTime(\'UTC\') +2021-12-28 00:00:00 DateTime(\'UTC\') parseDateTime32BestEffortOrNull \N Nullable(DateTime) 2020-05-14 03:37:03 Nullable(DateTime(\'UTC\')) @@ -60,6 +67,7 @@ parseDateTime32BestEffortOrNull 2020-05-14 03:37:03 Nullable(DateTime(\'UTC\')) 2020-05-14 06:37:03 Nullable(DateTime(\'Europe/Minsk\')) 2020-05-14 03:37:03 Nullable(DateTime(\'UTC\')) +2021-12-28 00:00:00 Nullable(DateTime(\'UTC\')) parseDateTime32BestEffortOrZero 1970-01-01 00:00:00 DateTime(\'UTC\') 2020-05-14 03:37:03 DateTime(\'UTC\') @@ -72,3 +80,4 @@ parseDateTime32BestEffortOrZero 2020-05-14 03:37:03 DateTime(\'UTC\') 2020-05-14 06:37:03 DateTime(\'Europe/Minsk\') 2020-05-14 03:37:03 DateTime(\'UTC\') +2021-12-28 00:00:00 DateTime(\'UTC\') diff --git a/tests/queries/0_stateless/01442_date_time_with_params.sql b/tests/queries/0_stateless/01442_date_time_with_params.sql index 52815460245..5a57aabdb0c 100644 --- a/tests/queries/0_stateless/01442_date_time_with_params.sql +++ b/tests/queries/0_stateless/01442_date_time_with_params.sql @@ -24,6 +24,8 @@ SELECT parseDateTimeBestEffort('2020-05-14T03:37:03.253184', 3, 'UTC') AS a, toT SELECT parseDateTimeBestEffort('2020-05-14T03:37:03.253184Z', 3, 'UTC') AS a, toTypeName(a); SELECT parseDateTimeBestEffort('2020-05-14T03:37:03.253184Z', 3, 'Europe/Minsk') AS a, toTypeName(a); SELECT parseDateTimeBestEffort(materialize('2020-05-14T03:37:03.253184Z'), 3, 'UTC') AS a, toTypeName(a); +SELECT parseDateTimeBestEffort('1640649600123', 3, 'UTC') AS a, toTypeName(a); +SELECT parseDateTimeBestEffort('1640649600123', 'UTC') AS a, toTypeName(a); SELECT 'parseDateTimeBestEffortOrNull'; SELECT parseDateTimeBestEffortOrNull('', 3) AS a, toTypeName(a); @@ -37,6 +39,8 @@ SELECT parseDateTimeBestEffortOrNull('2020-05-14T03:37:03.253184', 3, 'UTC') AS SELECT parseDateTimeBestEffortOrNull('2020-05-14T03:37:03.253184Z', 3, 'UTC') AS a, toTypeName(a); SELECT parseDateTimeBestEffortOrNull('2020-05-14T03:37:03.253184Z', 3, 'Europe/Minsk') AS a, toTypeName(a); SELECT parseDateTimeBestEffortOrNull(materialize('2020-05-14T03:37:03.253184Z'), 3, 'UTC') AS a, 
toTypeName(a); +SELECT parseDateTimeBestEffortOrNull('1640649600123', 3, 'UTC') AS a, toTypeName(a); +SELECT parseDateTimeBestEffortOrNull('1640649600123', 'UTC') AS a, toTypeName(a); SELECT 'parseDateTimeBestEffortOrZero'; SELECT parseDateTimeBestEffortOrZero('', 3, 'UTC') AS a, toTypeName(a); @@ -50,6 +54,8 @@ SELECT parseDateTimeBestEffortOrZero('2020-05-14T03:37:03.253184', 3, 'UTC') AS SELECT parseDateTimeBestEffortOrZero('2020-05-14T03:37:03.253184Z', 3, 'UTC') AS a, toTypeName(a); SELECT parseDateTimeBestEffortOrZero('2020-05-14T03:37:03.253184Z', 3, 'Europe/Minsk') AS a, toTypeName(a); SELECT parseDateTimeBestEffortOrZero(materialize('2020-05-14T03:37:03.253184Z'), 3, 'UTC') AS a, toTypeName(a); +SELECT parseDateTimeBestEffortOrZero('1640649600123', 3, 'UTC') AS a, toTypeName(a); +SELECT parseDateTimeBestEffortOrZero('1640649600123', 'UTC') AS a, toTypeName(a); SELECT 'parseDateTime32BestEffort'; SELECT parseDateTime32BestEffort('') AS a, toTypeName(a); -- {serverError 41} @@ -63,6 +69,7 @@ SELECT parseDateTime32BestEffort('2020-05-14T03:37:03.253184', 'UTC') AS a, toTy SELECT parseDateTime32BestEffort('2020-05-14T03:37:03.253184Z', 'UTC') AS a, toTypeName(a); SELECT parseDateTime32BestEffort('2020-05-14T03:37:03.253184Z', 'Europe/Minsk') AS a, toTypeName(a); SELECT parseDateTime32BestEffort(materialize('2020-05-14T03:37:03.253184Z'), 'UTC') AS a, toTypeName(a); +SELECT parseDateTime32BestEffort('1640649600123', 'UTC') AS a, toTypeName(a); SELECT 'parseDateTime32BestEffortOrNull'; SELECT parseDateTime32BestEffortOrNull('') AS a, toTypeName(a); @@ -76,6 +83,7 @@ SELECT parseDateTime32BestEffortOrNull('2020-05-14T03:37:03.253184', 'UTC') AS a SELECT parseDateTime32BestEffortOrNull('2020-05-14T03:37:03.253184Z', 'UTC') AS a, toTypeName(a); SELECT parseDateTime32BestEffortOrNull('2020-05-14T03:37:03.253184Z', 'Europe/Minsk') AS a, toTypeName(a); SELECT parseDateTime32BestEffortOrNull(materialize('2020-05-14T03:37:03.253184Z'), 'UTC') AS a, toTypeName(a); +SELECT parseDateTime32BestEffortOrNull('1640649600123', 'UTC') AS a, toTypeName(a); SELECT 'parseDateTime32BestEffortOrZero'; SELECT parseDateTime32BestEffortOrZero('', 'UTC') AS a, toTypeName(a); @@ -89,6 +97,6 @@ SELECT parseDateTime32BestEffortOrZero('2020-05-14T03:37:03.253184', 'UTC') AS a SELECT parseDateTime32BestEffortOrZero('2020-05-14T03:37:03.253184Z', 'UTC') AS a, toTypeName(a); SELECT parseDateTime32BestEffortOrZero('2020-05-14T03:37:03.253184Z', 'Europe/Minsk') AS a, toTypeName(a); SELECT parseDateTime32BestEffortOrZero(materialize('2020-05-14T03:37:03.253184Z'), 'UTC') AS a, toTypeName(a); - +SELECT parseDateTime32BestEffortOrZero('1640649600123', 'UTC') AS a, toTypeName(a); DROP TABLE IF EXISTS test; diff --git a/tests/queries/0_stateless/01503_if_const_optimization.reference b/tests/queries/0_stateless/01503_if_const_optimization.reference index e69de29bb2d..dec7d2fabd2 100644 --- a/tests/queries/0_stateless/01503_if_const_optimization.reference +++ b/tests/queries/0_stateless/01503_if_const_optimization.reference @@ -0,0 +1 @@ +\N diff --git a/tests/queries/0_stateless/01503_if_const_optimization.sql b/tests/queries/0_stateless/01503_if_const_optimization.sql index 047f6f757e8..a64be6bc80b 100644 --- a/tests/queries/0_stateless/01503_if_const_optimization.sql +++ b/tests/queries/0_stateless/01503_if_const_optimization.sql @@ -1 +1 @@ -SELECT if(CAST(NULL), '2.55', NULL) AS x; -- { serverError 42 } +SELECT if(CAST(NULL AS Nullable(UInt8)), '2.55', NULL) AS x; diff --git a/tests/queries/0_stateless/01504_rocksdb.sql 
b/tests/queries/0_stateless/01504_rocksdb.sql index 9f9e6c3b1ac..f79f31139fe 100644 --- a/tests/queries/0_stateless/01504_rocksdb.sql +++ b/tests/queries/0_stateless/01504_rocksdb.sql @@ -34,7 +34,7 @@ INSERT INTO 01504_test_memory SELECT number % 77 AS k, SUM(number) AS value, (1, SELECT A.a = B.a, A.b = B.b, A.c = B.c, A.d = B.d, A.e = B.e FROM ( SELECT 0 AS a, groupBitmapMerge(bm) AS b , SUM(k) AS c, SUM(value) AS d, SUM(dummy.1) AS e FROM 01504_test) A ANY LEFT JOIN (SELECT 0 AS a, groupBitmapMerge(bm) AS b , SUM(k) AS c, SUM(value) AS d, SUM(dummy.1) AS e FROM 01504_test_memory) B USING a ORDER BY a; -CREATE TEMPORARY TABLE keys AS SELECT * FROM numbers(1000); +CREATE TEMPORARY TABLE keys AS SELECT * FROM system.numbers LIMIT 1 OFFSET 4; SET max_rows_to_read = 2; SELECT dummy == (1,1.2) FROM 01504_test WHERE k IN (1, 3) OR k IN (1) OR k IN (3, 1) OR k IN [1] OR k IN [1, 3] ; diff --git a/tests/queries/0_stateless/01520_client_print_query_id.expect b/tests/queries/0_stateless/01520_client_print_query_id.expect index b0ff5d9d165..8b6e0e17a85 100755 --- a/tests/queries/0_stateless/01520_client_print_query_id.expect +++ b/tests/queries/0_stateless/01520_client_print_query_id.expect @@ -3,11 +3,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01534_lambda_array_join.sql b/tests/queries/0_stateless/01534_lambda_array_join.sql index aee9dd1411a..092c569b25f 100644 --- a/tests/queries/0_stateless/01534_lambda_array_join.sql +++ b/tests/queries/0_stateless/01534_lambda_array_join.sql @@ -6,7 +6,7 @@ SELECT count() AS c FROM numbers(10) GROUP BY - arrayMap(x -> reinterpretAsUInt8(substring(randomString(randomString(range(randomString(255), NULL)), NULL))), range(3)), + arrayMap(x -> reinterpretAsUInt8(substring(randomString(randomString(range(randomString(255), NULL)), NULL), NULL)), range(3)), randomString(range(randomString(1048577), NULL), NULL), byte ORDER BY byte ASC; diff --git a/tests/queries/0_stateless/01565_reconnect_after_client_error.expect b/tests/queries/0_stateless/01565_reconnect_after_client_error.expect index 712fe4ff64a..819450ffd30 100755 --- a/tests/queries/0_stateless/01565_reconnect_after_client_error.expect +++ b/tests/queries/0_stateless/01565_reconnect_after_client_error.expect @@ -1,4 +1,5 @@ #!/usr/bin/expect -f +# Tags: long # This is a separate test, because we want to test the interactive mode. 
# https://github.com/ClickHouse/ClickHouse/issues/19353 @@ -7,11 +8,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01583_const_column_in_set_index.sql b/tests/queries/0_stateless/01583_const_column_in_set_index.sql index e40249eaf08..b781efb0f13 100644 --- a/tests/queries/0_stateless/01583_const_column_in_set_index.sql +++ b/tests/queries/0_stateless/01583_const_column_in_set_index.sql @@ -3,7 +3,7 @@ drop table if exists insub; create table insub (i int, j int) engine MergeTree order by i settings index_granularity = 1; insert into insub select number a, a + 2 from numbers(10); -SET max_rows_to_read = 2; +SET max_rows_to_read = 12; -- 10 from numbers + 2 from table select * from insub where i in (select toInt32(3) from numbers(10)); drop table if exists insub; diff --git a/tests/queries/0_stateless/01585_use_index_for_global_in.sql b/tests/queries/0_stateless/01585_use_index_for_global_in.sql index a0a5b90ac1f..1dd7609350f 100644 --- a/tests/queries/0_stateless/01585_use_index_for_global_in.sql +++ b/tests/queries/0_stateless/01585_use_index_for_global_in.sql @@ -8,10 +8,12 @@ create table xp_d as xp engine Distributed(test_shard_localhost, currentDatabase insert into xp select number, number + 2 from numbers(10); -set max_rows_to_read = 2; +set max_rows_to_read = 4; -- 2 from numbers, 2 from tables select * from xp where i in (select * from numbers(2)); select * from xp where i global in (select * from numbers(2)); select * from xp_d where i in (select * from numbers(2)); + +set max_rows_to_read = 6; -- 2 from numbers, 2 from GLOBAL temp table (pushed from numbers), 2 from local xp select * from xp_d where i global in (select * from numbers(2)); drop table if exists xp; diff --git a/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.reference b/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.reference index de0116f9eaa..0cb1993057f 100644 --- a/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.reference +++ b/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.reference @@ -14,6 +14,14 @@ 1 3 0 2 1 3 +0 2 +1 3 +0 2 +1 3 +0 2 +1 3 +0 2 +1 3 \N 100 \N 100 \N 100 diff --git a/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.sql b/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.sql index 6129c92c888..d4147a445ec 100644 --- a/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.sql +++ b/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.sql @@ -12,17 +12,29 @@ insert into xp select null, 100; optimize table xp final; set max_rows_to_read = 2; +select * from xp where i in [0, 1]; +select * from xp where i global in [0, 1]; +select * from xp_d where i in [0, 1]; +select * from xp_d where i global in [0, 1]; + +set max_rows_to_read = 4; -- 2 in the subquery, 2 in the query itself select * from xp where i in (select * from numbers(2)); select * from xp where i global in (select * from numbers(2)); select * from xp_d where i in (select * from numbers(2)); + +set max_rows_to_read = 6; -- 2 subquery, 2 from global temp table (GLOBAL IN), 2 from local xp table select * from xp_d where i global in (select * from numbers(2)); set transform_null_in = 1; +set max_rows_to_read = 4; -- 2 
in the subquery, 2 in the query itself select * from xp where i in (select * from numbers(2)); select * from xp where i global in (select * from numbers(2)); select * from xp_d where i in (select * from numbers(2)); + +set max_rows_to_read = 6; -- 2 subquery, 2 from global temp table (GLOBAL IN), 2 from local xp table select * from xp_d where i global in (select * from numbers(2)); +set max_rows_to_read = 0; -- No rows should be read select * from xp where i in (null); select * from xp where i global in (null); select * from xp_d where i in (null); diff --git a/tests/queries/0_stateless/01600_parts_states_metrics_long.sh b/tests/queries/0_stateless/01600_parts_states_metrics_long.sh index 9c0d28fdd91..f47d0863e69 100755 --- a/tests/queries/0_stateless/01600_parts_states_metrics_long.sh +++ b/tests/queries/0_stateless/01600_parts_states_metrics_long.sh @@ -7,7 +7,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # NOTE: database = $CLICKHOUSE_DATABASE is unwanted verify_sql="SELECT - (SELECT sumIf(value, metric = 'PartsCommitted'), sumIf(value, metric = 'PartsOutdated') FROM system.metrics) + (SELECT sumIf(value, metric = 'PartsActive'), sumIf(value, metric = 'PartsOutdated') FROM system.metrics) = (SELECT sum(active), sum(NOT active) FROM system.parts)" # The query is not atomic - it can compare states between system.parts and system.metrics from different points in time. diff --git a/tests/queries/0_stateless/01650_drop_part_and_deduplication_zookeeper_long.reference b/tests/queries/0_stateless/01650_drop_part_and_deduplication_zookeeper_long.reference index ebb0b033d5b..05f7d08de7d 100644 --- a/tests/queries/0_stateless/01650_drop_part_and_deduplication_zookeeper_long.reference +++ b/tests/queries/0_stateless/01650_drop_part_and_deduplication_zookeeper_long.reference @@ -1,3 +1,4 @@ +~~~~source parts~~~~~ 1 1_0_0_0 1 1_1_1_0 2 2_0_0_0 @@ -10,6 +11,7 @@ 2_ 2_1_1_0 3_ 3_0_0_0 3_ 3_1_1_0 +~~~~parts after deduplication~~~~~ 1 1_0_0_0 1 1_1_1_0 2 2_0_0_0 @@ -22,6 +24,7 @@ 2_ 2_1_1_0 3_ 3_0_0_0 3_ 3_1_1_0 +~~~~parts after drop 3_1_1_0~~~~~ 1 1_0_0_0 1 1_1_1_0 2 2_0_0_0 @@ -32,6 +35,7 @@ 2_ 2_0_0_0 2_ 2_1_1_0 3_ 3_0_0_0 +~~~~parts after new part without deduplication~~~~~ 1 1_0_0_0 1 1_1_1_0 2 2_0_0_0 diff --git a/tests/queries/0_stateless/01650_drop_part_and_deduplication_zookeeper_long.sql b/tests/queries/0_stateless/01650_drop_part_and_deduplication_zookeeper_long.sql index 1aa568c1663..c77f29d89c2 100644 --- a/tests/queries/0_stateless/01650_drop_part_and_deduplication_zookeeper_long.sql +++ b/tests/queries/0_stateless/01650_drop_part_and_deduplication_zookeeper_long.sql @@ -17,25 +17,33 @@ SYSTEM STOP MERGES partitioned_table; INSERT INTO partitioned_table VALUES (1, 1, 'A'), (2, 2, 'B'), (3, 3, 'C'); INSERT INTO partitioned_table VALUES (11, 1, 'AA'), (22, 2, 'BB'), (33, 3, 'CC'); -SELECT partition_id, name FROM system.parts WHERE table = 'partitioned_table' AND database = currentDatabase() ORDER BY name; +SELECT '~~~~source parts~~~~~'; + +SELECT partition_id, name FROM system.parts WHERE table = 'partitioned_table' AND database = currentDatabase() and active ORDER BY name; SELECT substring(name, 1, 2), value FROM system.zookeeper WHERE path='/clickhouse/' || currentDatabase() || '/01650_drop_part_and_deduplication_partitioned_table/blocks/' ORDER BY value; INSERT INTO partitioned_table VALUES (33, 3, 'CC'); -- must be deduplicated -SELECT partition_id, name FROM system.parts WHERE table = 'partitioned_table' AND database = currentDatabase() ORDER BY name; +SELECT '~~~~parts 
after deduplication~~~~~'; + +SELECT partition_id, name FROM system.parts WHERE table = 'partitioned_table' AND database = currentDatabase() and active ORDER BY name; SELECT substring(name, 1, 2), value FROM system.zookeeper WHERE path='/clickhouse/' || currentDatabase() || '/01650_drop_part_and_deduplication_partitioned_table/blocks/' ORDER BY value; ALTER TABLE partitioned_table DROP PART '3_1_1_0'; -SELECT partition_id, name FROM system.parts WHERE table = 'partitioned_table' AND database = currentDatabase() ORDER BY name; +SELECT '~~~~parts after drop 3_1_1_0~~~~~'; + +SELECT partition_id, name FROM system.parts WHERE table = 'partitioned_table' AND database = currentDatabase() and active ORDER BY name; SELECT substring(name, 1, 2), value FROM system.zookeeper WHERE path='/clickhouse/' || currentDatabase() || '/01650_drop_part_and_deduplication_partitioned_table/blocks/' ORDER BY value; INSERT INTO partitioned_table VALUES (33, 3, 'CC'); -- mustn't be deduplicated -SELECT partition_id, name FROM system.parts WHERE table = 'partitioned_table' AND database = currentDatabase() ORDER BY name; +SELECT '~~~~parts after new part without deduplication~~~~~'; + +SELECT partition_id, name FROM system.parts WHERE table = 'partitioned_table' AND database = currentDatabase() and active ORDER BY name; SELECT substring(name, 1, 2), value FROM system.zookeeper WHERE path='/clickhouse/' || currentDatabase() || '/01650_drop_part_and_deduplication_partitioned_table/blocks/' ORDER BY value; diff --git a/tests/queries/0_stateless/01660_system_parts_smoke.reference b/tests/queries/0_stateless/01660_system_parts_smoke.reference index f21fab8e539..36550f31bd0 100644 --- a/tests/queries/0_stateless/01660_system_parts_smoke.reference +++ b/tests/queries/0_stateless/01660_system_parts_smoke.reference @@ -1,13 +1,13 @@ # two parts -Committed -Committed -all_1_1_0 Committed -all_2_2_0 Committed +Active +Active +all_1_1_0 Active +all_2_2_0 Active all_1_1_0 1 all_2_2_0 1 # optimize +1 Active 2 Outdated -1 Committed # truncate Outdated Outdated diff --git a/tests/queries/0_stateless/01676_long_clickhouse_client_autocomplete.sh b/tests/queries/0_stateless/01676_long_clickhouse_client_autocomplete.sh index e029d90a686..1be082a6aae 100755 --- a/tests/queries/0_stateless/01676_long_clickhouse_client_autocomplete.sh +++ b/tests/queries/0_stateless/01676_long_clickhouse_client_autocomplete.sh @@ -20,11 +20,11 @@ function test_completion_word_client() log_user 0 set timeout 3 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } spawn bash -c "$CLICKHOUSE_CLIENT_BINARY $CLICKHOUSE_CLIENT_OPT" @@ -104,11 +104,11 @@ function test_completion_word_local() log_user 0 set timeout 3 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } spawn bash -c "$CLICKHOUSE_LOCAL" diff --git a/tests/queries/0_stateless/01748_partition_id_pruning.sql b/tests/queries/0_stateless/01748_partition_id_pruning.sql index 17a405e17ad..e0d45884c60 100644 --- a/tests/queries/0_stateless/01748_partition_id_pruning.sql +++ b/tests/queries/0_stateless/01748_partition_id_pruning.sql @@ -8,12 +8,12 @@ set max_rows_to_read = 3; select * from x 
where _partition_id = partitionId(1); -set max_rows_to_read = 4; -- one row for subquery +set max_rows_to_read = 5; -- one row for subquery + subquery select * from x where _partition_id in (select partitionId(number + 1) from numbers(1)); -- trivial count optimization test -set max_rows_to_read = 1; -- one row for subquery +set max_rows_to_read = 2; -- one row for subquery + subquery itself select count() from x where _partition_id in (select partitionId(number + 1) from numbers(1)); drop table x; diff --git a/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect b/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect index 5543af4dd05..022320e2d4b 100755 --- a/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect +++ b/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect @@ -3,11 +3,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 2 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.sql b/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.sql index 6e4337fc05f..220d5d91a0b 100644 --- a/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.sql +++ b/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.sql @@ -32,7 +32,7 @@ select '(0, 2)'; with (select currentDatabase()) as id_no select *, ignore(id_no) from dist_01756 where dummy in (0, 2); system flush logs; select query from system.query_log where - event_date = today() and + event_date >= yesterday() and event_time > now() - interval 1 hour and not is_initial_query and query not like '%system%query_log%' and @@ -51,7 +51,7 @@ select 'optimize_skip_unused_shards_rewrite_in(0, 2)'; with (select currentDatabase()) as id_02 select *, ignore(id_02) from dist_01756 where dummy in (0, 2); system flush logs; select query from system.query_log where - event_date = today() and + event_date >= yesterday() and event_time > now() - interval 1 hour and not is_initial_query and query not like '%system%query_log%' and @@ -63,7 +63,7 @@ select 'optimize_skip_unused_shards_rewrite_in(2,)'; with (select currentDatabase()) as id_2 select *, ignore(id_2) from dist_01756 where dummy in (2,); system flush logs; select query from system.query_log where - event_date = today() and + event_date >= yesterday() and event_time > now() - interval 1 hour and not is_initial_query and query not like '%system%query_log%' and @@ -75,7 +75,7 @@ select 'optimize_skip_unused_shards_rewrite_in(0,)'; with (select currentDatabase()) as id_0 select *, ignore(id_0) from dist_01756 where dummy in (0,); system flush logs; select query from system.query_log where - event_date = today() and + event_date >= yesterday() and event_time > now() - interval 1 hour and not is_initial_query and query not like '%system%query_log%' and diff --git a/tests/queries/0_stateless/01810_max_part_removal_threads_long.sh b/tests/queries/0_stateless/01810_max_part_removal_threads_long.sh index a2945de5b0c..c5aaa794ac9 100755 --- a/tests/queries/0_stateless/01810_max_part_removal_threads_long.sh +++ b/tests/queries/0_stateless/01810_max_part_removal_threads_long.sh @@ -20,7 +20,7 @@ $CLICKHOUSE_CLIENT -nm -q """ 
insert into data_01810 select * from numbers(50); drop table data_01810 settings log_queries=1; system flush logs; - select throwIf(length(thread_ids)<50) from system.query_log where event_date = today() and current_database = currentDatabase() and query = 'drop table data_01810 settings log_queries=1;' and type = 'QueryFinish' format Null; + select throwIf(length(thread_ids)<50) from system.query_log where event_date >= yesterday() and current_database = currentDatabase() and query = 'drop table data_01810 settings log_queries=1;' and type = 'QueryFinish' format Null; """ # ReplicatedMergeTree @@ -31,7 +31,7 @@ $CLICKHOUSE_CLIENT -nm -q """ insert into rep_data_01810 select * from numbers(50); drop table rep_data_01810 settings log_queries=1; system flush logs; - select throwIf(length(thread_ids)<50) from system.query_log where event_date = today() and current_database = currentDatabase() and query = 'drop table rep_data_01810 settings log_queries=1;' and type = 'QueryFinish' format Null; + select throwIf(length(thread_ids)<50) from system.query_log where event_date >= yesterday() and current_database = currentDatabase() and query = 'drop table rep_data_01810 settings log_queries=1;' and type = 'QueryFinish' format Null; """ $CLICKHOUSE_CLIENT -nm -q "drop database ordinary_$CLICKHOUSE_DATABASE" diff --git a/tests/queries/0_stateless/01814_distributed_push_down_limit.sh b/tests/queries/0_stateless/01814_distributed_push_down_limit.sh index 81ed4568092..1412ea3be65 100755 --- a/tests/queries/0_stateless/01814_distributed_push_down_limit.sh +++ b/tests/queries/0_stateless/01814_distributed_push_down_limit.sh @@ -69,7 +69,7 @@ function test_distributed_push_down_limit_with_query_log() system flush logs; select read_rows from system.query_log where - event_date = today() + event_date >= yesterday() and query_kind = 'Select' /* exclude DESC TABLE */ and initial_query_id = '$query_id' and initial_query_id != query_id; " | xargs # convert new lines to spaces diff --git a/tests/queries/0_stateless/01910_client_replxx_container_overflow_long.expect b/tests/queries/0_stateless/01910_client_replxx_container_overflow_long.expect index 138727d296d..d5ce4c3cbf2 100755 --- a/tests/queries/0_stateless/01910_client_replxx_container_overflow_long.expect +++ b/tests/queries/0_stateless/01910_client_replxx_container_overflow_long.expect @@ -4,11 +4,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01922_sum_null_for_remote.reference b/tests/queries/0_stateless/01922_sum_null_for_remote.reference new file mode 100644 index 00000000000..dec7d2fabd2 --- /dev/null +++ b/tests/queries/0_stateless/01922_sum_null_for_remote.reference @@ -0,0 +1 @@ +\N diff --git a/tests/queries/0_stateless/01922_sum_null_for_remote.sql b/tests/queries/0_stateless/01922_sum_null_for_remote.sql new file mode 100644 index 00000000000..a19740364a1 --- /dev/null +++ b/tests/queries/0_stateless/01922_sum_null_for_remote.sql @@ -0,0 +1 @@ +select sum(null) from remote('127.0.0.{1,2}', 'system', 'one') diff --git a/tests/queries/0_stateless/01927_query_views_log_current_database.reference b/tests/queries/0_stateless/01927_query_views_log_current_database.reference index ff9eca2d97f..eaa1e98c55c 100644 --- 
a/tests/queries/0_stateless/01927_query_views_log_current_database.reference +++ b/tests/queries/0_stateless/01927_query_views_log_current_database.reference @@ -1,70 +1,94 @@ Row 1: ────── -stage: Query log rows -read_rows: 100 -written_rows: 201 -databases: ['_table_function','default'] -tables: ['_table_function.numbers','default.table_a','default.table_b','default.table_b_live_view','default.table_c'] -views: ['default.matview_a_to_b','default.matview_b_to_c','default.table_b_live_view'] -sleep_calls: 200 -sleep_us: 298 +stage: Query log rows +read_rows: 400 +written_rows: 201 +databases: ['_table_function','default'] +tables: ['_table_function.numbers','default.table_a','default.table_b','default.table_b_live_view','default.table_c'] +views: ['default.matview_a_to_b','default.matview_b_to_c','default.table_b_live_view'] +sleep_calls: 200 +sleep_us: 298 +profile_select_rows: 400 +profile_select_bytes: 5200 +profile_insert_rows: 201 +profile_insert_bytes: 2808 Row 1: ────── -stage: Depending views -view_name: default.matview_a_to_b -view_type: Materialized -status: QueryFinish -view_target: default.table_b -view_query: SELECT toFloat64(a) AS a, b + sleepEachRow(0.000001) AS count FROM default.table_a -read_rows: 100 -written_rows: 100 -sleep_calls: 100 -sleep_us: 99 +stage: Depending views +view_name: default.matview_a_to_b +view_type: Materialized +status: QueryFinish +view_target: default.table_b +view_query: SELECT toFloat64(a) AS a, b + sleepEachRow(0.000001) AS count FROM default.table_a +read_rows: 100 +written_rows: 100 +sleep_calls: 100 +sleep_us: 99 +profile_select_rows: 100 +profile_select_bytes: 2000 +profile_insert_rows: 100 +profile_insert_bytes: 800 Row 2: ────── -stage: Depending views -view_name: default.matview_b_to_c -view_type: Materialized -status: QueryFinish -view_target: default.table_c -view_query: SELECT sum(a + sleepEachRow(0.000002)) AS a FROM default.table_b -read_rows: 100 -written_rows: 1 -sleep_calls: 100 -sleep_us: 199 +stage: Depending views +view_name: default.matview_b_to_c +view_type: Materialized +status: QueryFinish +view_target: default.table_c +view_query: SELECT sum(a + sleepEachRow(0.000002)) AS a FROM default.table_b +read_rows: 100 +written_rows: 1 +sleep_calls: 100 +sleep_us: 199 +profile_select_rows: 100 +profile_select_bytes: 800 +profile_insert_rows: 1 +profile_insert_bytes: 8 Row 3: ────── -stage: Depending views -view_name: default.table_b_live_view -view_type: Live -status: QueryFinish -view_target: default.table_b_live_view -view_query: SELECT sum(a + b) FROM default.table_b -read_rows: 100 -written_rows: 0 -sleep_calls: 0 -sleep_us: 0 +stage: Depending views +view_name: default.table_b_live_view +view_type: Live +status: QueryFinish +view_target: default.table_b_live_view +view_query: SELECT sum(a + b) FROM default.table_b +read_rows: 100 +written_rows: 0 +sleep_calls: 0 +sleep_us: 0 +profile_select_rows: 100 +profile_select_bytes: 1600 +profile_insert_rows: 0 +profile_insert_bytes: 0 Row 1: ────── -stage: Query log rows 2 -read_rows: 50 -written_rows: 100 -databases: ['_table_function','default'] -tables: ['_table_function.numbers','default.table_d','default.table_e','default.table_f'] -views: ['default.matview_join_d_e'] -sleep_calls: 50 -sleep_us: 150 +stage: Query log rows 2 +read_rows: 100 +written_rows: 100 +databases: ['_table_function','default'] +tables: ['_table_function.numbers','default.table_d','default.table_e','default.table_f'] +views: ['default.matview_join_d_e'] +sleep_calls: 50 +sleep_us: 150 +profile_select_rows: 
100 +profile_select_bytes: 800 +profile_insert_rows: 100 +profile_insert_bytes: 1600 Row 1: ────── -stage: Depending views 2 -view_name: default.matview_join_d_e -view_type: Materialized -status: QueryFinish -view_target: default.table_f -view_query: SELECT table_d.a AS a, table_e.count + sleepEachRow(0.000003) AS count FROM default.table_d LEFT JOIN default.table_e ON table_d.a = table_e.a -read_rows: 50 -written_rows: 50 -sleep_calls: 50 -sleep_us: 150 +stage: Depending views 2 +view_name: default.matview_join_d_e +view_type: Materialized +status: QueryFinish +view_target: default.table_f +view_query: SELECT table_d.a AS a, table_e.count + sleepEachRow(0.000003) AS count FROM default.table_d LEFT JOIN default.table_e ON table_d.a = table_e.a +read_rows: 50 +written_rows: 50 +sleep_calls: 50 +sleep_us: 150 +profile_select_rows: 50 +profile_select_bytes: 400 +profile_insert_rows: 50 +profile_insert_bytes: 800 diff --git a/tests/queries/0_stateless/01927_query_views_log_current_database.sql b/tests/queries/0_stateless/01927_query_views_log_current_database.sql index 40ab8c8e16a..fbfbeab0167 100644 --- a/tests/queries/0_stateless/01927_query_views_log_current_database.sql +++ b/tests/queries/0_stateless/01927_query_views_log_current_database.sql @@ -45,7 +45,11 @@ SELECT arraySort(tables) as tables, arraySort(views) as views, ProfileEvents['SleepFunctionCalls'] as sleep_calls, - ProfileEvents['SleepFunctionMicroseconds'] as sleep_us + ProfileEvents['SleepFunctionMicroseconds'] as sleep_us, + ProfileEvents['SelectedRows'] as profile_select_rows, + ProfileEvents['SelectedBytes'] as profile_select_bytes, + ProfileEvents['InsertedRows'] as profile_insert_rows, + ProfileEvents['InsertedBytes'] as profile_insert_bytes FROM system.query_log WHERE query like '-- INSERT 1%INSERT INTO table_a%' AND current_database = currentDatabase() @@ -62,7 +66,11 @@ SELECT read_rows, written_rows, ProfileEvents['SleepFunctionCalls'] as sleep_calls, - ProfileEvents['SleepFunctionMicroseconds'] as sleep_us + ProfileEvents['SleepFunctionMicroseconds'] as sleep_us, + ProfileEvents['SelectedRows'] as profile_select_rows, + ProfileEvents['SelectedBytes'] as profile_select_bytes, + ProfileEvents['InsertedRows'] as profile_insert_rows, + ProfileEvents['InsertedBytes'] as profile_insert_bytes FROM system.query_views_log WHERE initial_query_id = ( @@ -85,7 +93,11 @@ SELECT arraySort(tables) as tables, arraySort(views) as views, ProfileEvents['SleepFunctionCalls'] as sleep_calls, - ProfileEvents['SleepFunctionMicroseconds'] as sleep_us + ProfileEvents['SleepFunctionMicroseconds'] as sleep_us, + ProfileEvents['SelectedRows'] as profile_select_rows, + ProfileEvents['SelectedBytes'] as profile_select_bytes, + ProfileEvents['InsertedRows'] as profile_insert_rows, + ProfileEvents['InsertedBytes'] as profile_insert_bytes FROM system.query_log WHERE query like '-- INSERT 2%INSERT INTO table_d%' AND current_database = currentDatabase() @@ -102,7 +114,11 @@ SELECT read_rows, written_rows, ProfileEvents['SleepFunctionCalls'] as sleep_calls, - ProfileEvents['SleepFunctionMicroseconds'] as sleep_us + ProfileEvents['SleepFunctionMicroseconds'] as sleep_us, + ProfileEvents['SelectedRows'] as profile_select_rows, + ProfileEvents['SelectedBytes'] as profile_select_bytes, + ProfileEvents['InsertedRows'] as profile_insert_rows, + ProfileEvents['InsertedBytes'] as profile_insert_bytes FROM system.query_views_log WHERE initial_query_id = ( diff --git a/tests/queries/0_stateless/01933_client_replxx_convert_history.expect 
b/tests/queries/0_stateless/01933_client_replxx_convert_history.expect index 59231161d91..c5645179ab3 100755 --- a/tests/queries/0_stateless/01933_client_replxx_convert_history.expect +++ b/tests/queries/0_stateless/01933_client_replxx_convert_history.expect @@ -5,11 +5,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01945_show_debug_warning.expect b/tests/queries/0_stateless/01945_show_debug_warning.expect index 402ad9a1f35..2f74b6e33ae 100755 --- a/tests/queries/0_stateless/01945_show_debug_warning.expect +++ b/tests/queries/0_stateless/01945_show_debug_warning.expect @@ -7,11 +7,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/02003_memory_limit_in_client.expect b/tests/queries/0_stateless/02003_memory_limit_in_client.expect index 47ac4926537..a3d6d04110a 100755 --- a/tests/queries/0_stateless/02003_memory_limit_in_client.expect +++ b/tests/queries/0_stateless/02003_memory_limit_in_client.expect @@ -8,14 +8,18 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] + +# +# Check that the query will fail in clickhouse-client +# spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --max_memory_usage_in_client=1" expect ":) " @@ -28,7 +32,24 @@ expect ":) " send -- "\4" expect eof -set basedir [file dirname $argv0] +# +# Check that the query will fail in clickhouse-client +# +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --max_memory_usage_in_client=1" +expect ":) " + +send -- "SELECT arrayMap(x -> range(x), range(number)) FROM numbers(1000)\r" +expect "Code: 241" + +expect ":) " + +# Exit. 
+send -- "\4" +expect eof + +# +# Check that the query will not fail (due to max_untracked_memory) +# spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --max_memory_usage_in_client=1" expect ":) " diff --git a/tests/queries/0_stateless/02015_global_in_threads.sh b/tests/queries/0_stateless/02015_global_in_threads.sh index c112e47fe92..9437187d462 100755 --- a/tests/queries/0_stateless/02015_global_in_threads.sh +++ b/tests/queries/0_stateless/02015_global_in_threads.sh @@ -6,4 +6,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) ${CLICKHOUSE_CLIENT} --log_queries=1 --max_threads=32 --query_id "2015_${CLICKHOUSE_DATABASE}_query" -q "select count() from remote('127.0.0.{2,3}', numbers(10)) where number global in (select number % 5 from numbers_mt(1000000))" ${CLICKHOUSE_CLIENT} -q "system flush logs" -${CLICKHOUSE_CLIENT} -q "select length(thread_ids) >= 32 from system.query_log where event_date = today() and query_id = '2015_${CLICKHOUSE_DATABASE}_query' and type = 'QueryFinish' and current_database = currentDatabase()" +${CLICKHOUSE_CLIENT} -q "select length(thread_ids) >= 32 from system.query_log where event_date >= yesterday() and query_id = '2015_${CLICKHOUSE_DATABASE}_query' and type = 'QueryFinish' and current_database = currentDatabase()" diff --git a/tests/queries/0_stateless/02047_client_exception.expect b/tests/queries/0_stateless/02047_client_exception.expect index 57a38c4f6aa..f7d4bfb555d 100755 --- a/tests/queries/0_stateless/02047_client_exception.expect +++ b/tests/queries/0_stateless/02047_client_exception.expect @@ -4,11 +4,11 @@ log_user 0 set timeout 20 match_max 100000 -# A default timeout action is to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/02049_clickhouse_local_merge_tree.expect b/tests/queries/0_stateless/02049_clickhouse_local_merge_tree.expect index 89271805fb3..ffa25b964db 100755 --- a/tests/queries/0_stateless/02049_clickhouse_local_merge_tree.expect +++ b/tests/queries/0_stateless/02049_clickhouse_local_merge_tree.expect @@ -4,12 +4,11 @@ log_user 0 set timeout 20 match_max 100000 -# A default timeout action is to fail expect_after { - timeout { - exit 1 - } - + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/02051_read_settings.reference.j2 b/tests/queries/0_stateless/02051_read_settings.reference.j2 index 86aa67a9d2d..391cf3adf35 100644 --- a/tests/queries/0_stateless/02051_read_settings.reference.j2 +++ b/tests/queries/0_stateless/02051_read_settings.reference.j2 @@ -1,3 +1,4 @@ +{% for index_granularity_bytes in [0, 10 * 1024 * 1024] -%} {% for read_method in ['read', 'mmap', 'pread_threadpool', 'pread_fake_async'] -%} {% for direct_io in [0, 1] -%} {% for prefetch in [0, 1] -%} @@ -9,3 +10,4 @@ {% endfor -%} {% endfor -%} {% endfor -%} +{% endfor -%} diff --git a/tests/queries/0_stateless/02051_read_settings.sql.j2 b/tests/queries/0_stateless/02051_read_settings.sql.j2 index 9f02274e732..fa19fbd3036 100644 --- a/tests/queries/0_stateless/02051_read_settings.sql.j2 +++ b/tests/queries/0_stateless/02051_read_settings.sql.j2 @@ -4,7 +4,15 @@ drop table if exists data_02051; -create table 
data_02051 (key Int, value String) engine=MergeTree() order by key +{# check each index_granularity_bytes #} +{% for index_granularity_bytes in [0, 10 * 1024 * 1024] %} +create table data_02051 (key Int, value String) +engine=MergeTree() +order by key +settings + index_granularity_bytes={{ index_granularity_bytes }}, + /* to suppress "Table can't create parts with adaptive granularity, but settings ..." warning */ + min_bytes_for_wide_part=0 as select number, repeat(toString(number), 5) from numbers(1e6); {# check each local_filesystem_read_method #} @@ -29,3 +37,7 @@ select count(ignore(*)) from data_02051 settings {% endfor %} {% endfor %} {% endfor %} + +drop table data_02051; +{# index_granularity_bytes #} +{% endfor %} diff --git a/tests/queries/0_stateless/02096_date_time_1970_saturation.reference b/tests/queries/0_stateless/02096_date_time_1970_saturation.reference new file mode 100644 index 00000000000..3c073b9262e --- /dev/null +++ b/tests/queries/0_stateless/02096_date_time_1970_saturation.reference @@ -0,0 +1,30 @@ +1970-01-01 +1970-01-01 03:00:00 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-02 03:00:00 +1970-01-01 03:00:00 +1970-01-01 03:00:00 +1970-01-01 03:00:00 +1970-01-01 03:00:00 +1970-01-01 03:00:00 +1969-12-31 16:00:00 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-02 16:00:00 +1969-12-31 16:00:00 +1969-12-31 16:00:00 +1969-12-31 16:00:00 +1969-12-31 16:00:00 +1969-12-31 16:00:00 diff --git a/tests/queries/0_stateless/02096_date_time_1970_saturation.sql b/tests/queries/0_stateless/02096_date_time_1970_saturation.sql new file mode 100644 index 00000000000..e0c401443a7 --- /dev/null +++ b/tests/queries/0_stateless/02096_date_time_1970_saturation.sql @@ -0,0 +1,31 @@ +select toDate(0); +select toDateTime(0, 'Europe/Moscow'); +select toMonday(toDate(0)); +select toMonday(toDateTime(0, 'Europe/Moscow')); +select toStartOfWeek(toDate(0)); +select toStartOfWeek(toDateTime(0, 'Europe/Moscow')); +select toStartOfMonth(toDate(0)); +select toStartOfMonth(toDateTime(0, 'Europe/Moscow')); +select toStartOfQuarter(toDate(0)); +select toStartOfQuarter(toDateTime(0, 'Europe/Moscow')); +select toStartOfYear(toDate(0)); +select toStartOfYear(toDateTime(0, 'Europe/Moscow')); +select toTime(toDateTime(0, 'Europe/Moscow')); +select toStartOfMinute(toDateTime(0, 'Europe/Moscow')); +select toStartOfFiveMinute(toDateTime(0, 'Europe/Moscow')); +select toStartOfTenMinutes(toDateTime(0, 'Europe/Moscow')); +select toStartOfFifteenMinutes(toDateTime(0, 'Europe/Moscow')); +select toStartOfHour(toDateTime(0, 'Europe/Moscow')); + +select toDateTime(0, 'America/Los_Angeles'); +select toMonday(toDateTime(0, 'America/Los_Angeles')); +select toStartOfWeek(toDateTime(0, 'America/Los_Angeles')); +select toStartOfMonth(toDateTime(0, 'America/Los_Angeles')); +select toStartOfQuarter(toDateTime(0, 'America/Los_Angeles')); +select toStartOfYear(toDateTime(0, 'America/Los_Angeles')); +select toTime(toDateTime(0, 'America/Los_Angeles'), 'America/Los_Angeles'); +select toStartOfMinute(toDateTime(0, 'America/Los_Angeles')); +select toStartOfFiveMinute(toDateTime(0, 'America/Los_Angeles')); +select toStartOfTenMinutes(toDateTime(0, 'America/Los_Angeles')); +select toStartOfFifteenMinutes(toDateTime(0, 'America/Los_Angeles')); +select toStartOfHour(toDateTime(0, 'America/Los_Angeles')); diff --git a/tests/queries/0_stateless/02105_backslash_letter_commands.expect 
b/tests/queries/0_stateless/02105_backslash_letter_commands.expect index 89d896fdedc..e67d60912fa 100755 --- a/tests/queries/0_stateless/02105_backslash_letter_commands.expect +++ b/tests/queries/0_stateless/02105_backslash_letter_commands.expect @@ -3,11 +3,11 @@ log_user 0 set timeout 02 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/02112_delayed_clickhouse_client_with_queries_file.expect b/tests/queries/0_stateless/02112_delayed_clickhouse_client_with_queries_file.expect index 73b12637906..0abe25e60f4 100755 --- a/tests/queries/0_stateless/02112_delayed_clickhouse_client_with_queries_file.expect +++ b/tests/queries/0_stateless/02112_delayed_clickhouse_client_with_queries_file.expect @@ -5,23 +5,24 @@ log_user 0 set timeout 20 match_max 100000 -# A default timeout action is to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } - -spawn bash -c "\$CLICKHOUSE_TESTS_DIR/helpers/02112_prepare.sh" - set basedir [file dirname $argv0] -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT --disable_suggestion --interactive --queries-file \$CURDIR/file_02112" + +system "$basedir/helpers/02112_prepare.sh" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT --disable_suggestion --interactive --queries-file $basedir/file_02112" expect ":) " send -- "select * from t format TSV\r" expect "1" expect ":) " -spawn bash -c "\$CLICKHOUSE_TESTS_DIR/helpers/02112_clean.sh" +send "" +expect eof +system "$basedir/helpers/02112_clean.sh" diff --git a/tests/queries/0_stateless/02112_delayed_clickhouse_local.expect b/tests/queries/0_stateless/02112_delayed_clickhouse_local.expect index fa146577234..c846464b011 100755 --- a/tests/queries/0_stateless/02112_delayed_clickhouse_local.expect +++ b/tests/queries/0_stateless/02112_delayed_clickhouse_local.expect @@ -4,11 +4,11 @@ log_user 0 set timeout 20 match_max 100000 -# A default timeout action is to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/02112_delayed_clickhouse_local_with_queries_file.expect b/tests/queries/0_stateless/02112_delayed_clickhouse_local_with_queries_file.expect index fbf79629f71..c64f149a93c 100755 --- a/tests/queries/0_stateless/02112_delayed_clickhouse_local_with_queries_file.expect +++ b/tests/queries/0_stateless/02112_delayed_clickhouse_local_with_queries_file.expect @@ -5,23 +5,24 @@ log_user 0 set timeout 20 match_max 100000 -# A default timeout action is to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } - -spawn bash -c "\$CLICKHOUSE_TESTS_DIR/helpers/02112_prepare.sh" - set basedir [file dirname $argv0] -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_LOCAL --disable_suggestion --interactive --queries-file \$CURDIR/file_02112" + +system "$basedir/helpers/02112_prepare.sh" +spawn bash -c "source 
$basedir/../shell_config.sh ; \$CLICKHOUSE_LOCAL --disable_suggestion --interactive --queries-file $basedir/file_02112" expect ":) " send -- "select * from t format TSV\r" expect "1" expect ":) " -spawn bash -c "\$CLICKHOUSE_TESTS_DIR/helpers/02112_clean.sh" +send "" +expect eof +system "$basedir/helpers/02112_clean.sh" diff --git a/tests/queries/0_stateless/02116_interactive_hello.expect b/tests/queries/0_stateless/02116_interactive_hello.expect index 49a167e5a6e..e659cf8703c 100755 --- a/tests/queries/0_stateless/02116_interactive_hello.expect +++ b/tests/queries/0_stateless/02116_interactive_hello.expect @@ -1,14 +1,15 @@ #!/usr/bin/expect -f +# Tags: long log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference b/tests/queries/0_stateless/02117_show_create_table_system.reference index 35de7f8e82c..b52a8712087 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.reference +++ b/tests/queries/0_stateless/02117_show_create_table_system.reference @@ -20,7 +20,7 @@ CREATE TABLE system.errors\n(\n `name` String,\n `code` Int32,\n `value CREATE TABLE system.events\n(\n `event` String,\n `value` UInt64,\n `description` String\n)\nENGINE = SystemEvents()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.formats\n(\n `name` String,\n `is_input` UInt8,\n `is_output` UInt8\n)\nENGINE = SystemFormats()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.functions\n(\n `name` String,\n `is_aggregate` UInt8,\n `case_insensitive` UInt8,\n `alias_to` String,\n `create_query` String,\n `origin` Enum8(\'System\' = 0, \'SQLUserDefined\' = 1, \'ExecutableUserDefined\' = 2)\n)\nENGINE = SystemFunctions()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' -CREATE TABLE system.grants\n(\n `user_name` Nullable(String),\n `role_name` Nullable(String),\n `access_type` Enum8(\'SQLITE\' = -128, \'ODBC\' = -127, \'JDBC\' = -126, \'HDFS\' = -125, \'S3\' = -124, \'SOURCES\' = -123, \'ALL\' = -122, \'NONE\' = -121, \'SHOW DATABASES\' = 0, \'SHOW TABLES\' = 1, \'SHOW COLUMNS\' = 2, \'SHOW DICTIONARIES\' = 3, \'SHOW\' = 4, \'SELECT\' = 5, \'INSERT\' = 6, \'ALTER UPDATE\' = 7, \'ALTER DELETE\' = 8, \'ALTER ADD COLUMN\' = 9, \'ALTER MODIFY COLUMN\' = 10, \'ALTER DROP COLUMN\' = 11, \'ALTER COMMENT COLUMN\' = 12, \'ALTER CLEAR COLUMN\' = 13, \'ALTER RENAME COLUMN\' = 14, \'ALTER MATERIALIZE COLUMN\' = 15, \'ALTER COLUMN\' = 16, \'ALTER MODIFY COMMENT\' = 17, \'ALTER ORDER BY\' = 18, \'ALTER SAMPLE BY\' = 19, \'ALTER ADD INDEX\' = 20, \'ALTER DROP INDEX\' = 21, \'ALTER MATERIALIZE INDEX\' = 22, \'ALTER CLEAR INDEX\' = 23, \'ALTER INDEX\' = 24, \'ALTER ADD PROJECTION\' = 25, \'ALTER DROP PROJECTION\' = 26, \'ALTER MATERIALIZE PROJECTION\' = 27, \'ALTER CLEAR PROJECTION\' = 28, \'ALTER PROJECTION\' = 29, \'ALTER ADD CONSTRAINT\' = 30, \'ALTER DROP CONSTRAINT\' = 31, \'ALTER CONSTRAINT\' = 32, \'ALTER TTL\' = 33, \'ALTER MATERIALIZE TTL\' = 34, \'ALTER SETTINGS\' = 35, \'ALTER MOVE PARTITION\' = 36, \'ALTER FETCH PARTITION\' = 37, \'ALTER FREEZE PARTITION\' = 38, \'ALTER DATABASE SETTINGS\' = 39, \'ALTER TABLE\' = 40, \'ALTER DATABASE\' = 41, \'ALTER VIEW REFRESH\' = 42, \'ALTER VIEW MODIFY QUERY\' = 43, \'ALTER VIEW\' = 44, \'ALTER\' = 45, 
\'CREATE DATABASE\' = 46, \'CREATE TABLE\' = 47, \'CREATE VIEW\' = 48, \'CREATE DICTIONARY\' = 49, \'CREATE TEMPORARY TABLE\' = 50, \'CREATE FUNCTION\' = 51, \'CREATE\' = 52, \'DROP DATABASE\' = 53, \'DROP TABLE\' = 54, \'DROP VIEW\' = 55, \'DROP DICTIONARY\' = 56, \'DROP FUNCTION\' = 57, \'DROP\' = 58, \'TRUNCATE\' = 59, \'OPTIMIZE\' = 60, \'KILL QUERY\' = 61, \'MOVE PARTITION BETWEEN SHARDS\' = 62, \'CREATE USER\' = 63, \'ALTER USER\' = 64, \'DROP USER\' = 65, \'CREATE ROLE\' = 66, \'ALTER ROLE\' = 67, \'DROP ROLE\' = 68, \'ROLE ADMIN\' = 69, \'CREATE ROW POLICY\' = 70, \'ALTER ROW POLICY\' = 71, \'DROP ROW POLICY\' = 72, \'CREATE QUOTA\' = 73, \'ALTER QUOTA\' = 74, \'DROP QUOTA\' = 75, \'CREATE SETTINGS PROFILE\' = 76, \'ALTER SETTINGS PROFILE\' = 77, \'DROP SETTINGS PROFILE\' = 78, \'SHOW USERS\' = 79, \'SHOW ROLES\' = 80, \'SHOW ROW POLICIES\' = 81, \'SHOW QUOTAS\' = 82, \'SHOW SETTINGS PROFILES\' = 83, \'SHOW ACCESS\' = 84, \'ACCESS MANAGEMENT\' = 85, \'SYSTEM SHUTDOWN\' = 86, \'SYSTEM DROP DNS CACHE\' = 87, \'SYSTEM DROP MARK CACHE\' = 88, \'SYSTEM DROP UNCOMPRESSED CACHE\' = 89, \'SYSTEM DROP MMAP CACHE\' = 90, \'SYSTEM DROP COMPILED EXPRESSION CACHE\' = 91, \'SYSTEM DROP CACHE\' = 92, \'SYSTEM RELOAD CONFIG\' = 93, \'SYSTEM RELOAD SYMBOLS\' = 94, \'SYSTEM RELOAD DICTIONARY\' = 95, \'SYSTEM RELOAD MODEL\' = 96, \'SYSTEM RELOAD FUNCTION\' = 97, \'SYSTEM RELOAD EMBEDDED DICTIONARIES\' = 98, \'SYSTEM RELOAD\' = 99, \'SYSTEM RESTART DISK\' = 100, \'SYSTEM MERGES\' = 101, \'SYSTEM TTL MERGES\' = 102, \'SYSTEM FETCHES\' = 103, \'SYSTEM MOVES\' = 104, \'SYSTEM DISTRIBUTED SENDS\' = 105, \'SYSTEM REPLICATED SENDS\' = 106, \'SYSTEM SENDS\' = 107, \'SYSTEM REPLICATION QUEUES\' = 108, \'SYSTEM DROP REPLICA\' = 109, \'SYSTEM SYNC REPLICA\' = 110, \'SYSTEM RESTART REPLICA\' = 111, \'SYSTEM RESTORE REPLICA\' = 112, \'SYSTEM FLUSH DISTRIBUTED\' = 113, \'SYSTEM FLUSH LOGS\' = 114, \'SYSTEM FLUSH\' = 115, \'SYSTEM\' = 116, \'dictGet\' = 117, \'addressToLine\' = 118, \'addressToSymbol\' = 119, \'demangle\' = 120, \'INTROSPECTION\' = 121, \'FILE\' = 122, \'URL\' = 123, \'REMOTE\' = 124, \'MONGO\' = 125, \'MYSQL\' = 126, \'POSTGRES\' = 127),\n `database` Nullable(String),\n `table` Nullable(String),\n `column` Nullable(String),\n `is_partial_revoke` UInt8,\n `grant_option` UInt8\n)\nENGINE = SystemGrants()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' +CREATE TABLE system.grants\n(\n `user_name` Nullable(String),\n `role_name` Nullable(String),\n `access_type` Enum8(\'POSTGRES\' = -128, \'SQLITE\' = -127, \'ODBC\' = -126, \'JDBC\' = -125, \'HDFS\' = -124, \'S3\' = -123, \'SOURCES\' = -122, \'ALL\' = -121, \'NONE\' = -120, \'SHOW DATABASES\' = 0, \'SHOW TABLES\' = 1, \'SHOW COLUMNS\' = 2, \'SHOW DICTIONARIES\' = 3, \'SHOW\' = 4, \'SELECT\' = 5, \'INSERT\' = 6, \'ALTER UPDATE\' = 7, \'ALTER DELETE\' = 8, \'ALTER ADD COLUMN\' = 9, \'ALTER MODIFY COLUMN\' = 10, \'ALTER DROP COLUMN\' = 11, \'ALTER COMMENT COLUMN\' = 12, \'ALTER CLEAR COLUMN\' = 13, \'ALTER RENAME COLUMN\' = 14, \'ALTER MATERIALIZE COLUMN\' = 15, \'ALTER COLUMN\' = 16, \'ALTER MODIFY COMMENT\' = 17, \'ALTER ORDER BY\' = 18, \'ALTER SAMPLE BY\' = 19, \'ALTER ADD INDEX\' = 20, \'ALTER DROP INDEX\' = 21, \'ALTER MATERIALIZE INDEX\' = 22, \'ALTER CLEAR INDEX\' = 23, \'ALTER INDEX\' = 24, \'ALTER ADD PROJECTION\' = 25, \'ALTER DROP PROJECTION\' = 26, \'ALTER MATERIALIZE PROJECTION\' = 27, \'ALTER CLEAR PROJECTION\' = 28, \'ALTER PROJECTION\' = 29, \'ALTER ADD CONSTRAINT\' = 30, \'ALTER DROP CONSTRAINT\' = 31, \'ALTER CONSTRAINT\' = 32, \'ALTER 
TTL\' = 33, \'ALTER MATERIALIZE TTL\' = 34, \'ALTER SETTINGS\' = 35, \'ALTER MOVE PARTITION\' = 36, \'ALTER FETCH PARTITION\' = 37, \'ALTER FREEZE PARTITION\' = 38, \'ALTER DATABASE SETTINGS\' = 39, \'ALTER TABLE\' = 40, \'ALTER DATABASE\' = 41, \'ALTER VIEW REFRESH\' = 42, \'ALTER VIEW MODIFY QUERY\' = 43, \'ALTER VIEW\' = 44, \'ALTER\' = 45, \'CREATE DATABASE\' = 46, \'CREATE TABLE\' = 47, \'CREATE VIEW\' = 48, \'CREATE DICTIONARY\' = 49, \'CREATE TEMPORARY TABLE\' = 50, \'CREATE FUNCTION\' = 51, \'CREATE\' = 52, \'DROP DATABASE\' = 53, \'DROP TABLE\' = 54, \'DROP VIEW\' = 55, \'DROP DICTIONARY\' = 56, \'DROP FUNCTION\' = 57, \'DROP\' = 58, \'TRUNCATE\' = 59, \'OPTIMIZE\' = 60, \'KILL QUERY\' = 61, \'MOVE PARTITION BETWEEN SHARDS\' = 62, \'CREATE USER\' = 63, \'ALTER USER\' = 64, \'DROP USER\' = 65, \'CREATE ROLE\' = 66, \'ALTER ROLE\' = 67, \'DROP ROLE\' = 68, \'ROLE ADMIN\' = 69, \'CREATE ROW POLICY\' = 70, \'ALTER ROW POLICY\' = 71, \'DROP ROW POLICY\' = 72, \'CREATE QUOTA\' = 73, \'ALTER QUOTA\' = 74, \'DROP QUOTA\' = 75, \'CREATE SETTINGS PROFILE\' = 76, \'ALTER SETTINGS PROFILE\' = 77, \'DROP SETTINGS PROFILE\' = 78, \'SHOW USERS\' = 79, \'SHOW ROLES\' = 80, \'SHOW ROW POLICIES\' = 81, \'SHOW QUOTAS\' = 82, \'SHOW SETTINGS PROFILES\' = 83, \'SHOW ACCESS\' = 84, \'ACCESS MANAGEMENT\' = 85, \'SYSTEM SHUTDOWN\' = 86, \'SYSTEM DROP DNS CACHE\' = 87, \'SYSTEM DROP MARK CACHE\' = 88, \'SYSTEM DROP UNCOMPRESSED CACHE\' = 89, \'SYSTEM DROP MMAP CACHE\' = 90, \'SYSTEM DROP COMPILED EXPRESSION CACHE\' = 91, \'SYSTEM DROP CACHE\' = 92, \'SYSTEM RELOAD CONFIG\' = 93, \'SYSTEM RELOAD SYMBOLS\' = 94, \'SYSTEM RELOAD DICTIONARY\' = 95, \'SYSTEM RELOAD MODEL\' = 96, \'SYSTEM RELOAD FUNCTION\' = 97, \'SYSTEM RELOAD EMBEDDED DICTIONARIES\' = 98, \'SYSTEM RELOAD\' = 99, \'SYSTEM RESTART DISK\' = 100, \'SYSTEM MERGES\' = 101, \'SYSTEM TTL MERGES\' = 102, \'SYSTEM FETCHES\' = 103, \'SYSTEM MOVES\' = 104, \'SYSTEM DISTRIBUTED SENDS\' = 105, \'SYSTEM REPLICATED SENDS\' = 106, \'SYSTEM SENDS\' = 107, \'SYSTEM REPLICATION QUEUES\' = 108, \'SYSTEM DROP REPLICA\' = 109, \'SYSTEM SYNC REPLICA\' = 110, \'SYSTEM RESTART REPLICA\' = 111, \'SYSTEM RESTORE REPLICA\' = 112, \'SYSTEM FLUSH DISTRIBUTED\' = 113, \'SYSTEM FLUSH LOGS\' = 114, \'SYSTEM FLUSH\' = 115, \'SYSTEM THREAD FUZZER\' = 116, \'SYSTEM\' = 117, \'dictGet\' = 118, \'addressToLine\' = 119, \'addressToSymbol\' = 120, \'demangle\' = 121, \'INTROSPECTION\' = 122, \'FILE\' = 123, \'URL\' = 124, \'REMOTE\' = 125, \'MONGO\' = 126, \'MYSQL\' = 127),\n `database` Nullable(String),\n `table` Nullable(String),\n `column` Nullable(String),\n `is_partial_revoke` UInt8,\n `grant_option` UInt8\n)\nENGINE = SystemGrants()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.graphite_retentions\n(\n `config_name` String,\n `regexp` String,\n `function` String,\n `age` UInt64,\n `precision` UInt64,\n `priority` UInt16,\n `is_default` UInt8,\n `Tables.database` Array(String),\n `Tables.table` Array(String)\n)\nENGINE = SystemGraphite()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.licenses\n(\n `library_name` String,\n `license_type` String,\n `license_path` String,\n `license_text` String\n)\nENGINE = SystemLicenses()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.macros\n(\n `macro` String,\n `substitution` String\n)\nENGINE = SystemMacros()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' @@ -35,7 +35,7 @@ CREATE TABLE system.one\n(\n `dummy` UInt8\n)\nENGINE = SystemOne()\nCOMMENT CREATE TABLE 
system.part_moves_between_shards\n(\n `database` String,\n `table` String,\n `task_name` String,\n `task_uuid` UUID,\n `create_time` DateTime,\n `part_name` String,\n `part_uuid` UUID,\n `to_shard` String,\n `dst_part_name` String,\n `update_time` DateTime,\n `state` String,\n `rollback` UInt8,\n `num_tries` UInt32,\n `last_exception` String\n)\nENGINE = SystemShardMoves()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.parts\n(\n `partition` String,\n `name` String,\n `uuid` UUID,\n `part_type` String,\n `active` UInt8,\n `marks` UInt64,\n `rows` UInt64,\n `bytes_on_disk` UInt64,\n `data_compressed_bytes` UInt64,\n `data_uncompressed_bytes` UInt64,\n `marks_bytes` UInt64,\n `secondary_indices_compressed_bytes` UInt64,\n `secondary_indices_uncompressed_bytes` UInt64,\n `secondary_indices_marks_bytes` UInt64,\n `modification_time` DateTime,\n `remove_time` DateTime,\n `refcount` UInt32,\n `min_date` Date,\n `max_date` Date,\n `min_time` DateTime,\n `max_time` DateTime,\n `partition_id` String,\n `min_block_number` Int64,\n `max_block_number` Int64,\n `level` UInt32,\n `data_version` UInt64,\n `primary_key_bytes_in_memory` UInt64,\n `primary_key_bytes_in_memory_allocated` UInt64,\n `is_frozen` UInt8,\n `database` String,\n `table` String,\n `engine` String,\n `disk_name` String,\n `path` String,\n `hash_of_all_files` String,\n `hash_of_uncompressed_files` String,\n `uncompressed_hash_of_compressed_files` String,\n `delete_ttl_info_min` DateTime,\n `delete_ttl_info_max` DateTime,\n `move_ttl_info.expression` Array(String),\n `move_ttl_info.min` Array(DateTime),\n `move_ttl_info.max` Array(DateTime),\n `default_compression_codec` String,\n `recompression_ttl_info.expression` Array(String),\n `recompression_ttl_info.min` Array(DateTime),\n `recompression_ttl_info.max` Array(DateTime),\n `group_by_ttl_info.expression` Array(String),\n `group_by_ttl_info.min` Array(DateTime),\n `group_by_ttl_info.max` Array(DateTime),\n `rows_where_ttl_info.expression` Array(String),\n `rows_where_ttl_info.min` Array(DateTime),\n `rows_where_ttl_info.max` Array(DateTime),\n `projections` Array(String),\n `bytes` UInt64,\n `marks_size` UInt64\n)\nENGINE = SystemParts()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.parts_columns\n(\n `partition` String,\n `name` String,\n `uuid` UUID,\n `part_type` String,\n `active` UInt8,\n `marks` UInt64,\n `rows` UInt64,\n `bytes_on_disk` UInt64,\n `data_compressed_bytes` UInt64,\n `data_uncompressed_bytes` UInt64,\n `marks_bytes` UInt64,\n `modification_time` DateTime,\n `remove_time` DateTime,\n `refcount` UInt32,\n `min_date` Date,\n `max_date` Date,\n `min_time` DateTime,\n `max_time` DateTime,\n `partition_id` String,\n `min_block_number` Int64,\n `max_block_number` Int64,\n `level` UInt32,\n `data_version` UInt64,\n `primary_key_bytes_in_memory` UInt64,\n `primary_key_bytes_in_memory_allocated` UInt64,\n `database` String,\n `table` String,\n `engine` String,\n `disk_name` String,\n `path` String,\n `column` String,\n `type` String,\n `column_position` UInt64,\n `default_kind` String,\n `default_expression` String,\n `column_bytes_on_disk` UInt64,\n `column_data_compressed_bytes` UInt64,\n `column_data_uncompressed_bytes` UInt64,\n `column_marks_bytes` UInt64,\n `serialization_kind` String,\n `subcolumns.names` Array(String),\n `subcolumns.types` Array(String),\n `subcolumns.serializations` Array(String),\n `bytes` UInt64,\n `marks_size` UInt64\n)\nENGINE = SystemPartsColumns()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' -CREATE 
TABLE system.privileges\n(\n `privilege` Enum8(\'SQLITE\' = -128, \'ODBC\' = -127, \'JDBC\' = -126, \'HDFS\' = -125, \'S3\' = -124, \'SOURCES\' = -123, \'ALL\' = -122, \'NONE\' = -121, \'SHOW DATABASES\' = 0, \'SHOW TABLES\' = 1, \'SHOW COLUMNS\' = 2, \'SHOW DICTIONARIES\' = 3, \'SHOW\' = 4, \'SELECT\' = 5, \'INSERT\' = 6, \'ALTER UPDATE\' = 7, \'ALTER DELETE\' = 8, \'ALTER ADD COLUMN\' = 9, \'ALTER MODIFY COLUMN\' = 10, \'ALTER DROP COLUMN\' = 11, \'ALTER COMMENT COLUMN\' = 12, \'ALTER CLEAR COLUMN\' = 13, \'ALTER RENAME COLUMN\' = 14, \'ALTER MATERIALIZE COLUMN\' = 15, \'ALTER COLUMN\' = 16, \'ALTER MODIFY COMMENT\' = 17, \'ALTER ORDER BY\' = 18, \'ALTER SAMPLE BY\' = 19, \'ALTER ADD INDEX\' = 20, \'ALTER DROP INDEX\' = 21, \'ALTER MATERIALIZE INDEX\' = 22, \'ALTER CLEAR INDEX\' = 23, \'ALTER INDEX\' = 24, \'ALTER ADD PROJECTION\' = 25, \'ALTER DROP PROJECTION\' = 26, \'ALTER MATERIALIZE PROJECTION\' = 27, \'ALTER CLEAR PROJECTION\' = 28, \'ALTER PROJECTION\' = 29, \'ALTER ADD CONSTRAINT\' = 30, \'ALTER DROP CONSTRAINT\' = 31, \'ALTER CONSTRAINT\' = 32, \'ALTER TTL\' = 33, \'ALTER MATERIALIZE TTL\' = 34, \'ALTER SETTINGS\' = 35, \'ALTER MOVE PARTITION\' = 36, \'ALTER FETCH PARTITION\' = 37, \'ALTER FREEZE PARTITION\' = 38, \'ALTER DATABASE SETTINGS\' = 39, \'ALTER TABLE\' = 40, \'ALTER DATABASE\' = 41, \'ALTER VIEW REFRESH\' = 42, \'ALTER VIEW MODIFY QUERY\' = 43, \'ALTER VIEW\' = 44, \'ALTER\' = 45, \'CREATE DATABASE\' = 46, \'CREATE TABLE\' = 47, \'CREATE VIEW\' = 48, \'CREATE DICTIONARY\' = 49, \'CREATE TEMPORARY TABLE\' = 50, \'CREATE FUNCTION\' = 51, \'CREATE\' = 52, \'DROP DATABASE\' = 53, \'DROP TABLE\' = 54, \'DROP VIEW\' = 55, \'DROP DICTIONARY\' = 56, \'DROP FUNCTION\' = 57, \'DROP\' = 58, \'TRUNCATE\' = 59, \'OPTIMIZE\' = 60, \'KILL QUERY\' = 61, \'MOVE PARTITION BETWEEN SHARDS\' = 62, \'CREATE USER\' = 63, \'ALTER USER\' = 64, \'DROP USER\' = 65, \'CREATE ROLE\' = 66, \'ALTER ROLE\' = 67, \'DROP ROLE\' = 68, \'ROLE ADMIN\' = 69, \'CREATE ROW POLICY\' = 70, \'ALTER ROW POLICY\' = 71, \'DROP ROW POLICY\' = 72, \'CREATE QUOTA\' = 73, \'ALTER QUOTA\' = 74, \'DROP QUOTA\' = 75, \'CREATE SETTINGS PROFILE\' = 76, \'ALTER SETTINGS PROFILE\' = 77, \'DROP SETTINGS PROFILE\' = 78, \'SHOW USERS\' = 79, \'SHOW ROLES\' = 80, \'SHOW ROW POLICIES\' = 81, \'SHOW QUOTAS\' = 82, \'SHOW SETTINGS PROFILES\' = 83, \'SHOW ACCESS\' = 84, \'ACCESS MANAGEMENT\' = 85, \'SYSTEM SHUTDOWN\' = 86, \'SYSTEM DROP DNS CACHE\' = 87, \'SYSTEM DROP MARK CACHE\' = 88, \'SYSTEM DROP UNCOMPRESSED CACHE\' = 89, \'SYSTEM DROP MMAP CACHE\' = 90, \'SYSTEM DROP COMPILED EXPRESSION CACHE\' = 91, \'SYSTEM DROP CACHE\' = 92, \'SYSTEM RELOAD CONFIG\' = 93, \'SYSTEM RELOAD SYMBOLS\' = 94, \'SYSTEM RELOAD DICTIONARY\' = 95, \'SYSTEM RELOAD MODEL\' = 96, \'SYSTEM RELOAD FUNCTION\' = 97, \'SYSTEM RELOAD EMBEDDED DICTIONARIES\' = 98, \'SYSTEM RELOAD\' = 99, \'SYSTEM RESTART DISK\' = 100, \'SYSTEM MERGES\' = 101, \'SYSTEM TTL MERGES\' = 102, \'SYSTEM FETCHES\' = 103, \'SYSTEM MOVES\' = 104, \'SYSTEM DISTRIBUTED SENDS\' = 105, \'SYSTEM REPLICATED SENDS\' = 106, \'SYSTEM SENDS\' = 107, \'SYSTEM REPLICATION QUEUES\' = 108, \'SYSTEM DROP REPLICA\' = 109, \'SYSTEM SYNC REPLICA\' = 110, \'SYSTEM RESTART REPLICA\' = 111, \'SYSTEM RESTORE REPLICA\' = 112, \'SYSTEM FLUSH DISTRIBUTED\' = 113, \'SYSTEM FLUSH LOGS\' = 114, \'SYSTEM FLUSH\' = 115, \'SYSTEM\' = 116, \'dictGet\' = 117, \'addressToLine\' = 118, \'addressToSymbol\' = 119, \'demangle\' = 120, \'INTROSPECTION\' = 121, \'FILE\' = 122, \'URL\' = 123, \'REMOTE\' = 124, \'MONGO\' = 
125, \'MYSQL\' = 126, \'POSTGRES\' = 127),\n `aliases` Array(String),\n `level` Nullable(Enum8(\'GLOBAL\' = 0, \'DATABASE\' = 1, \'TABLE\' = 2, \'DICTIONARY\' = 3, \'VIEW\' = 4, \'COLUMN\' = 5)),\n `parent_group` Nullable(Enum8(\'SQLITE\' = -128, \'ODBC\' = -127, \'JDBC\' = -126, \'HDFS\' = -125, \'S3\' = -124, \'SOURCES\' = -123, \'ALL\' = -122, \'NONE\' = -121, \'SHOW DATABASES\' = 0, \'SHOW TABLES\' = 1, \'SHOW COLUMNS\' = 2, \'SHOW DICTIONARIES\' = 3, \'SHOW\' = 4, \'SELECT\' = 5, \'INSERT\' = 6, \'ALTER UPDATE\' = 7, \'ALTER DELETE\' = 8, \'ALTER ADD COLUMN\' = 9, \'ALTER MODIFY COLUMN\' = 10, \'ALTER DROP COLUMN\' = 11, \'ALTER COMMENT COLUMN\' = 12, \'ALTER CLEAR COLUMN\' = 13, \'ALTER RENAME COLUMN\' = 14, \'ALTER MATERIALIZE COLUMN\' = 15, \'ALTER COLUMN\' = 16, \'ALTER MODIFY COMMENT\' = 17, \'ALTER ORDER BY\' = 18, \'ALTER SAMPLE BY\' = 19, \'ALTER ADD INDEX\' = 20, \'ALTER DROP INDEX\' = 21, \'ALTER MATERIALIZE INDEX\' = 22, \'ALTER CLEAR INDEX\' = 23, \'ALTER INDEX\' = 24, \'ALTER ADD PROJECTION\' = 25, \'ALTER DROP PROJECTION\' = 26, \'ALTER MATERIALIZE PROJECTION\' = 27, \'ALTER CLEAR PROJECTION\' = 28, \'ALTER PROJECTION\' = 29, \'ALTER ADD CONSTRAINT\' = 30, \'ALTER DROP CONSTRAINT\' = 31, \'ALTER CONSTRAINT\' = 32, \'ALTER TTL\' = 33, \'ALTER MATERIALIZE TTL\' = 34, \'ALTER SETTINGS\' = 35, \'ALTER MOVE PARTITION\' = 36, \'ALTER FETCH PARTITION\' = 37, \'ALTER FREEZE PARTITION\' = 38, \'ALTER DATABASE SETTINGS\' = 39, \'ALTER TABLE\' = 40, \'ALTER DATABASE\' = 41, \'ALTER VIEW REFRESH\' = 42, \'ALTER VIEW MODIFY QUERY\' = 43, \'ALTER VIEW\' = 44, \'ALTER\' = 45, \'CREATE DATABASE\' = 46, \'CREATE TABLE\' = 47, \'CREATE VIEW\' = 48, \'CREATE DICTIONARY\' = 49, \'CREATE TEMPORARY TABLE\' = 50, \'CREATE FUNCTION\' = 51, \'CREATE\' = 52, \'DROP DATABASE\' = 53, \'DROP TABLE\' = 54, \'DROP VIEW\' = 55, \'DROP DICTIONARY\' = 56, \'DROP FUNCTION\' = 57, \'DROP\' = 58, \'TRUNCATE\' = 59, \'OPTIMIZE\' = 60, \'KILL QUERY\' = 61, \'MOVE PARTITION BETWEEN SHARDS\' = 62, \'CREATE USER\' = 63, \'ALTER USER\' = 64, \'DROP USER\' = 65, \'CREATE ROLE\' = 66, \'ALTER ROLE\' = 67, \'DROP ROLE\' = 68, \'ROLE ADMIN\' = 69, \'CREATE ROW POLICY\' = 70, \'ALTER ROW POLICY\' = 71, \'DROP ROW POLICY\' = 72, \'CREATE QUOTA\' = 73, \'ALTER QUOTA\' = 74, \'DROP QUOTA\' = 75, \'CREATE SETTINGS PROFILE\' = 76, \'ALTER SETTINGS PROFILE\' = 77, \'DROP SETTINGS PROFILE\' = 78, \'SHOW USERS\' = 79, \'SHOW ROLES\' = 80, \'SHOW ROW POLICIES\' = 81, \'SHOW QUOTAS\' = 82, \'SHOW SETTINGS PROFILES\' = 83, \'SHOW ACCESS\' = 84, \'ACCESS MANAGEMENT\' = 85, \'SYSTEM SHUTDOWN\' = 86, \'SYSTEM DROP DNS CACHE\' = 87, \'SYSTEM DROP MARK CACHE\' = 88, \'SYSTEM DROP UNCOMPRESSED CACHE\' = 89, \'SYSTEM DROP MMAP CACHE\' = 90, \'SYSTEM DROP COMPILED EXPRESSION CACHE\' = 91, \'SYSTEM DROP CACHE\' = 92, \'SYSTEM RELOAD CONFIG\' = 93, \'SYSTEM RELOAD SYMBOLS\' = 94, \'SYSTEM RELOAD DICTIONARY\' = 95, \'SYSTEM RELOAD MODEL\' = 96, \'SYSTEM RELOAD FUNCTION\' = 97, \'SYSTEM RELOAD EMBEDDED DICTIONARIES\' = 98, \'SYSTEM RELOAD\' = 99, \'SYSTEM RESTART DISK\' = 100, \'SYSTEM MERGES\' = 101, \'SYSTEM TTL MERGES\' = 102, \'SYSTEM FETCHES\' = 103, \'SYSTEM MOVES\' = 104, \'SYSTEM DISTRIBUTED SENDS\' = 105, \'SYSTEM REPLICATED SENDS\' = 106, \'SYSTEM SENDS\' = 107, \'SYSTEM REPLICATION QUEUES\' = 108, \'SYSTEM DROP REPLICA\' = 109, \'SYSTEM SYNC REPLICA\' = 110, \'SYSTEM RESTART REPLICA\' = 111, \'SYSTEM RESTORE REPLICA\' = 112, \'SYSTEM FLUSH DISTRIBUTED\' = 113, \'SYSTEM FLUSH LOGS\' = 114, \'SYSTEM FLUSH\' = 115, \'SYSTEM\' = 116, 
\'dictGet\' = 117, \'addressToLine\' = 118, \'addressToSymbol\' = 119, \'demangle\' = 120, \'INTROSPECTION\' = 121, \'FILE\' = 122, \'URL\' = 123, \'REMOTE\' = 124, \'MONGO\' = 125, \'MYSQL\' = 126, \'POSTGRES\' = 127))\n)\nENGINE = SystemPrivileges()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' +CREATE TABLE system.privileges\n(\n `privilege` Enum8(\'POSTGRES\' = -128, \'SQLITE\' = -127, \'ODBC\' = -126, \'JDBC\' = -125, \'HDFS\' = -124, \'S3\' = -123, \'SOURCES\' = -122, \'ALL\' = -121, \'NONE\' = -120, \'SHOW DATABASES\' = 0, \'SHOW TABLES\' = 1, \'SHOW COLUMNS\' = 2, \'SHOW DICTIONARIES\' = 3, \'SHOW\' = 4, \'SELECT\' = 5, \'INSERT\' = 6, \'ALTER UPDATE\' = 7, \'ALTER DELETE\' = 8, \'ALTER ADD COLUMN\' = 9, \'ALTER MODIFY COLUMN\' = 10, \'ALTER DROP COLUMN\' = 11, \'ALTER COMMENT COLUMN\' = 12, \'ALTER CLEAR COLUMN\' = 13, \'ALTER RENAME COLUMN\' = 14, \'ALTER MATERIALIZE COLUMN\' = 15, \'ALTER COLUMN\' = 16, \'ALTER MODIFY COMMENT\' = 17, \'ALTER ORDER BY\' = 18, \'ALTER SAMPLE BY\' = 19, \'ALTER ADD INDEX\' = 20, \'ALTER DROP INDEX\' = 21, \'ALTER MATERIALIZE INDEX\' = 22, \'ALTER CLEAR INDEX\' = 23, \'ALTER INDEX\' = 24, \'ALTER ADD PROJECTION\' = 25, \'ALTER DROP PROJECTION\' = 26, \'ALTER MATERIALIZE PROJECTION\' = 27, \'ALTER CLEAR PROJECTION\' = 28, \'ALTER PROJECTION\' = 29, \'ALTER ADD CONSTRAINT\' = 30, \'ALTER DROP CONSTRAINT\' = 31, \'ALTER CONSTRAINT\' = 32, \'ALTER TTL\' = 33, \'ALTER MATERIALIZE TTL\' = 34, \'ALTER SETTINGS\' = 35, \'ALTER MOVE PARTITION\' = 36, \'ALTER FETCH PARTITION\' = 37, \'ALTER FREEZE PARTITION\' = 38, \'ALTER DATABASE SETTINGS\' = 39, \'ALTER TABLE\' = 40, \'ALTER DATABASE\' = 41, \'ALTER VIEW REFRESH\' = 42, \'ALTER VIEW MODIFY QUERY\' = 43, \'ALTER VIEW\' = 44, \'ALTER\' = 45, \'CREATE DATABASE\' = 46, \'CREATE TABLE\' = 47, \'CREATE VIEW\' = 48, \'CREATE DICTIONARY\' = 49, \'CREATE TEMPORARY TABLE\' = 50, \'CREATE FUNCTION\' = 51, \'CREATE\' = 52, \'DROP DATABASE\' = 53, \'DROP TABLE\' = 54, \'DROP VIEW\' = 55, \'DROP DICTIONARY\' = 56, \'DROP FUNCTION\' = 57, \'DROP\' = 58, \'TRUNCATE\' = 59, \'OPTIMIZE\' = 60, \'KILL QUERY\' = 61, \'MOVE PARTITION BETWEEN SHARDS\' = 62, \'CREATE USER\' = 63, \'ALTER USER\' = 64, \'DROP USER\' = 65, \'CREATE ROLE\' = 66, \'ALTER ROLE\' = 67, \'DROP ROLE\' = 68, \'ROLE ADMIN\' = 69, \'CREATE ROW POLICY\' = 70, \'ALTER ROW POLICY\' = 71, \'DROP ROW POLICY\' = 72, \'CREATE QUOTA\' = 73, \'ALTER QUOTA\' = 74, \'DROP QUOTA\' = 75, \'CREATE SETTINGS PROFILE\' = 76, \'ALTER SETTINGS PROFILE\' = 77, \'DROP SETTINGS PROFILE\' = 78, \'SHOW USERS\' = 79, \'SHOW ROLES\' = 80, \'SHOW ROW POLICIES\' = 81, \'SHOW QUOTAS\' = 82, \'SHOW SETTINGS PROFILES\' = 83, \'SHOW ACCESS\' = 84, \'ACCESS MANAGEMENT\' = 85, \'SYSTEM SHUTDOWN\' = 86, \'SYSTEM DROP DNS CACHE\' = 87, \'SYSTEM DROP MARK CACHE\' = 88, \'SYSTEM DROP UNCOMPRESSED CACHE\' = 89, \'SYSTEM DROP MMAP CACHE\' = 90, \'SYSTEM DROP COMPILED EXPRESSION CACHE\' = 91, \'SYSTEM DROP CACHE\' = 92, \'SYSTEM RELOAD CONFIG\' = 93, \'SYSTEM RELOAD SYMBOLS\' = 94, \'SYSTEM RELOAD DICTIONARY\' = 95, \'SYSTEM RELOAD MODEL\' = 96, \'SYSTEM RELOAD FUNCTION\' = 97, \'SYSTEM RELOAD EMBEDDED DICTIONARIES\' = 98, \'SYSTEM RELOAD\' = 99, \'SYSTEM RESTART DISK\' = 100, \'SYSTEM MERGES\' = 101, \'SYSTEM TTL MERGES\' = 102, \'SYSTEM FETCHES\' = 103, \'SYSTEM MOVES\' = 104, \'SYSTEM DISTRIBUTED SENDS\' = 105, \'SYSTEM REPLICATED SENDS\' = 106, \'SYSTEM SENDS\' = 107, \'SYSTEM REPLICATION QUEUES\' = 108, \'SYSTEM DROP REPLICA\' = 109, \'SYSTEM SYNC REPLICA\' = 110, \'SYSTEM RESTART REPLICA\' 
= 111, \'SYSTEM RESTORE REPLICA\' = 112, \'SYSTEM FLUSH DISTRIBUTED\' = 113, \'SYSTEM FLUSH LOGS\' = 114, \'SYSTEM FLUSH\' = 115, \'SYSTEM THREAD FUZZER\' = 116, \'SYSTEM\' = 117, \'dictGet\' = 118, \'addressToLine\' = 119, \'addressToSymbol\' = 120, \'demangle\' = 121, \'INTROSPECTION\' = 122, \'FILE\' = 123, \'URL\' = 124, \'REMOTE\' = 125, \'MONGO\' = 126, \'MYSQL\' = 127),\n `aliases` Array(String),\n `level` Nullable(Enum8(\'GLOBAL\' = 0, \'DATABASE\' = 1, \'TABLE\' = 2, \'DICTIONARY\' = 3, \'VIEW\' = 4, \'COLUMN\' = 5)),\n `parent_group` Nullable(Enum8(\'POSTGRES\' = -128, \'SQLITE\' = -127, \'ODBC\' = -126, \'JDBC\' = -125, \'HDFS\' = -124, \'S3\' = -123, \'SOURCES\' = -122, \'ALL\' = -121, \'NONE\' = -120, \'SHOW DATABASES\' = 0, \'SHOW TABLES\' = 1, \'SHOW COLUMNS\' = 2, \'SHOW DICTIONARIES\' = 3, \'SHOW\' = 4, \'SELECT\' = 5, \'INSERT\' = 6, \'ALTER UPDATE\' = 7, \'ALTER DELETE\' = 8, \'ALTER ADD COLUMN\' = 9, \'ALTER MODIFY COLUMN\' = 10, \'ALTER DROP COLUMN\' = 11, \'ALTER COMMENT COLUMN\' = 12, \'ALTER CLEAR COLUMN\' = 13, \'ALTER RENAME COLUMN\' = 14, \'ALTER MATERIALIZE COLUMN\' = 15, \'ALTER COLUMN\' = 16, \'ALTER MODIFY COMMENT\' = 17, \'ALTER ORDER BY\' = 18, \'ALTER SAMPLE BY\' = 19, \'ALTER ADD INDEX\' = 20, \'ALTER DROP INDEX\' = 21, \'ALTER MATERIALIZE INDEX\' = 22, \'ALTER CLEAR INDEX\' = 23, \'ALTER INDEX\' = 24, \'ALTER ADD PROJECTION\' = 25, \'ALTER DROP PROJECTION\' = 26, \'ALTER MATERIALIZE PROJECTION\' = 27, \'ALTER CLEAR PROJECTION\' = 28, \'ALTER PROJECTION\' = 29, \'ALTER ADD CONSTRAINT\' = 30, \'ALTER DROP CONSTRAINT\' = 31, \'ALTER CONSTRAINT\' = 32, \'ALTER TTL\' = 33, \'ALTER MATERIALIZE TTL\' = 34, \'ALTER SETTINGS\' = 35, \'ALTER MOVE PARTITION\' = 36, \'ALTER FETCH PARTITION\' = 37, \'ALTER FREEZE PARTITION\' = 38, \'ALTER DATABASE SETTINGS\' = 39, \'ALTER TABLE\' = 40, \'ALTER DATABASE\' = 41, \'ALTER VIEW REFRESH\' = 42, \'ALTER VIEW MODIFY QUERY\' = 43, \'ALTER VIEW\' = 44, \'ALTER\' = 45, \'CREATE DATABASE\' = 46, \'CREATE TABLE\' = 47, \'CREATE VIEW\' = 48, \'CREATE DICTIONARY\' = 49, \'CREATE TEMPORARY TABLE\' = 50, \'CREATE FUNCTION\' = 51, \'CREATE\' = 52, \'DROP DATABASE\' = 53, \'DROP TABLE\' = 54, \'DROP VIEW\' = 55, \'DROP DICTIONARY\' = 56, \'DROP FUNCTION\' = 57, \'DROP\' = 58, \'TRUNCATE\' = 59, \'OPTIMIZE\' = 60, \'KILL QUERY\' = 61, \'MOVE PARTITION BETWEEN SHARDS\' = 62, \'CREATE USER\' = 63, \'ALTER USER\' = 64, \'DROP USER\' = 65, \'CREATE ROLE\' = 66, \'ALTER ROLE\' = 67, \'DROP ROLE\' = 68, \'ROLE ADMIN\' = 69, \'CREATE ROW POLICY\' = 70, \'ALTER ROW POLICY\' = 71, \'DROP ROW POLICY\' = 72, \'CREATE QUOTA\' = 73, \'ALTER QUOTA\' = 74, \'DROP QUOTA\' = 75, \'CREATE SETTINGS PROFILE\' = 76, \'ALTER SETTINGS PROFILE\' = 77, \'DROP SETTINGS PROFILE\' = 78, \'SHOW USERS\' = 79, \'SHOW ROLES\' = 80, \'SHOW ROW POLICIES\' = 81, \'SHOW QUOTAS\' = 82, \'SHOW SETTINGS PROFILES\' = 83, \'SHOW ACCESS\' = 84, \'ACCESS MANAGEMENT\' = 85, \'SYSTEM SHUTDOWN\' = 86, \'SYSTEM DROP DNS CACHE\' = 87, \'SYSTEM DROP MARK CACHE\' = 88, \'SYSTEM DROP UNCOMPRESSED CACHE\' = 89, \'SYSTEM DROP MMAP CACHE\' = 90, \'SYSTEM DROP COMPILED EXPRESSION CACHE\' = 91, \'SYSTEM DROP CACHE\' = 92, \'SYSTEM RELOAD CONFIG\' = 93, \'SYSTEM RELOAD SYMBOLS\' = 94, \'SYSTEM RELOAD DICTIONARY\' = 95, \'SYSTEM RELOAD MODEL\' = 96, \'SYSTEM RELOAD FUNCTION\' = 97, \'SYSTEM RELOAD EMBEDDED DICTIONARIES\' = 98, \'SYSTEM RELOAD\' = 99, \'SYSTEM RESTART DISK\' = 100, \'SYSTEM MERGES\' = 101, \'SYSTEM TTL MERGES\' = 102, \'SYSTEM FETCHES\' = 103, \'SYSTEM MOVES\' = 104, \'SYSTEM 
DISTRIBUTED SENDS\' = 105, \'SYSTEM REPLICATED SENDS\' = 106, \'SYSTEM SENDS\' = 107, \'SYSTEM REPLICATION QUEUES\' = 108, \'SYSTEM DROP REPLICA\' = 109, \'SYSTEM SYNC REPLICA\' = 110, \'SYSTEM RESTART REPLICA\' = 111, \'SYSTEM RESTORE REPLICA\' = 112, \'SYSTEM FLUSH DISTRIBUTED\' = 113, \'SYSTEM FLUSH LOGS\' = 114, \'SYSTEM FLUSH\' = 115, \'SYSTEM THREAD FUZZER\' = 116, \'SYSTEM\' = 117, \'dictGet\' = 118, \'addressToLine\' = 119, \'addressToSymbol\' = 120, \'demangle\' = 121, \'INTROSPECTION\' = 122, \'FILE\' = 123, \'URL\' = 124, \'REMOTE\' = 125, \'MONGO\' = 126, \'MYSQL\' = 127))\n)\nENGINE = SystemPrivileges()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.processes\n(\n `is_initial_query` UInt8,\n `user` String,\n `query_id` String,\n `address` IPv6,\n `port` UInt16,\n `initial_user` String,\n `initial_query_id` String,\n `initial_address` IPv6,\n `initial_port` UInt16,\n `interface` UInt8,\n `os_user` String,\n `client_hostname` String,\n `client_name` String,\n `client_revision` UInt64,\n `client_version_major` UInt64,\n `client_version_minor` UInt64,\n `client_version_patch` UInt64,\n `http_method` UInt8,\n `http_user_agent` String,\n `http_referer` String,\n `forwarded_for` String,\n `quota_key` String,\n `elapsed` Float64,\n `is_cancelled` UInt8,\n `read_rows` UInt64,\n `read_bytes` UInt64,\n `total_rows_approx` UInt64,\n `written_rows` UInt64,\n `written_bytes` UInt64,\n `memory_usage` Int64,\n `peak_memory_usage` Int64,\n `query` String,\n `thread_ids` Array(UInt64),\n `ProfileEvents` Map(String, UInt64),\n `Settings` Map(String, String),\n `current_database` String,\n `ProfileEvents.Names` Array(String),\n `ProfileEvents.Values` Array(UInt64),\n `Settings.Names` Array(String),\n `Settings.Values` Array(String)\n)\nENGINE = SystemProcesses()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.projection_parts\n(\n `partition` String,\n `name` String,\n `part_type` String,\n `parent_name` String,\n `parent_uuid` UUID,\n `parent_part_type` String,\n `active` UInt8,\n `marks` UInt64,\n `rows` UInt64,\n `bytes_on_disk` UInt64,\n `data_compressed_bytes` UInt64,\n `data_uncompressed_bytes` UInt64,\n `marks_bytes` UInt64,\n `parent_marks` UInt64,\n `parent_rows` UInt64,\n `parent_bytes_on_disk` UInt64,\n `parent_data_compressed_bytes` UInt64,\n `parent_data_uncompressed_bytes` UInt64,\n `parent_marks_bytes` UInt64,\n `modification_time` DateTime,\n `remove_time` DateTime,\n `refcount` UInt32,\n `min_date` Date,\n `max_date` Date,\n `min_time` DateTime,\n `max_time` DateTime,\n `partition_id` String,\n `min_block_number` Int64,\n `max_block_number` Int64,\n `level` UInt32,\n `data_version` UInt64,\n `primary_key_bytes_in_memory` UInt64,\n `primary_key_bytes_in_memory_allocated` UInt64,\n `is_frozen` UInt8,\n `database` String,\n `table` String,\n `engine` String,\n `disk_name` String,\n `path` String,\n `hash_of_all_files` String,\n `hash_of_uncompressed_files` String,\n `uncompressed_hash_of_compressed_files` String,\n `delete_ttl_info_min` DateTime,\n `delete_ttl_info_max` DateTime,\n `move_ttl_info.expression` Array(String),\n `move_ttl_info.min` Array(DateTime),\n `move_ttl_info.max` Array(DateTime),\n `default_compression_codec` String,\n `recompression_ttl_info.expression` Array(String),\n `recompression_ttl_info.min` Array(DateTime),\n `recompression_ttl_info.max` Array(DateTime),\n `group_by_ttl_info.expression` Array(String),\n `group_by_ttl_info.min` Array(DateTime),\n `group_by_ttl_info.max` Array(DateTime),\n 
`rows_where_ttl_info.expression` Array(String),\n `rows_where_ttl_info.min` Array(DateTime),\n `rows_where_ttl_info.max` Array(DateTime),\n `bytes` UInt64,\n `marks_size` UInt64\n)\nENGINE = SystemProjectionParts()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.projection_parts_columns\n(\n `partition` String,\n `name` String,\n `part_type` String,\n `parent_name` String,\n `parent_uuid` UUID,\n `parent_part_type` String,\n `active` UInt8,\n `marks` UInt64,\n `rows` UInt64,\n `bytes_on_disk` UInt64,\n `data_compressed_bytes` UInt64,\n `data_uncompressed_bytes` UInt64,\n `marks_bytes` UInt64,\n `parent_marks` UInt64,\n `parent_rows` UInt64,\n `parent_bytes_on_disk` UInt64,\n `parent_data_compressed_bytes` UInt64,\n `parent_data_uncompressed_bytes` UInt64,\n `parent_marks_bytes` UInt64,\n `modification_time` DateTime,\n `remove_time` DateTime,\n `refcount` UInt32,\n `min_date` Date,\n `max_date` Date,\n `min_time` DateTime,\n `max_time` DateTime,\n `partition_id` String,\n `min_block_number` Int64,\n `max_block_number` Int64,\n `level` UInt32,\n `data_version` UInt64,\n `primary_key_bytes_in_memory` UInt64,\n `primary_key_bytes_in_memory_allocated` UInt64,\n `database` String,\n `table` String,\n `engine` String,\n `disk_name` String,\n `path` String,\n `column` String,\n `type` String,\n `column_position` UInt64,\n `default_kind` String,\n `default_expression` String,\n `column_bytes_on_disk` UInt64,\n `column_data_compressed_bytes` UInt64,\n `column_data_uncompressed_bytes` UInt64,\n `column_marks_bytes` UInt64,\n `bytes` UInt64,\n `marks_size` UInt64\n)\nENGINE = SystemProjectionPartsColumns()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' diff --git a/tests/queries/0_stateless/02122_4letter_words_stress_zookeeper.sh b/tests/queries/0_stateless/02122_4letter_words_stress_zookeeper.sh index 4a94beddbba..2deaf788ecf 100755 --- a/tests/queries/0_stateless/02122_4letter_words_stress_zookeeper.sh +++ b/tests/queries/0_stateless/02122_4letter_words_stress_zookeeper.sh @@ -18,7 +18,7 @@ function four_letter_thread() function create_drop_thread() { while true; do - num=$RANDOM + num=$(($RANDOM % 10 + 1)) $CLICKHOUSE_CLIENT --query "CREATE TABLE test_table$num (key UInt64, value1 UInt8, value2 UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/test_table$num', '0') ORDER BY key" sleep 0.$RANDOM $CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS test_table$num" @@ -43,5 +43,12 @@ timeout $TIMEOUT bash -c create_drop_thread 2> /dev/null & wait +for num in $(seq 1 10); do + $CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS test_table$num" 2>/dev/null + while [ $? 
-ne 0 ]; do + $CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS test_table$num" 2>/dev/null + done +done + # still alive $CLICKHOUSE_CLIENT --query "SELECT 1" diff --git a/tests/queries/0_stateless/02125_query_views_log.reference b/tests/queries/0_stateless/02125_query_views_log.reference index 3ae4af9b4d0..fac70027113 100644 --- a/tests/queries/0_stateless/02125_query_views_log.reference +++ b/tests/queries/0_stateless/02125_query_views_log.reference @@ -18,7 +18,7 @@ written_bytes: 4000000 select read_rows, read_bytes, written_rows, written_bytes from system.query_log where type = 'QueryFinish' and query_kind = 'Insert' and current_database = currentDatabase() format Vertical; Row 1: ────── -read_rows: 1000000 -read_bytes: 8000000 +read_rows: 3000000 +read_bytes: 16000000 written_rows: 3000000 written_bytes: 12000000 diff --git a/tests/queries/0_stateless/02132_client_history_navigation.expect b/tests/queries/0_stateless/02132_client_history_navigation.expect index cd83454c85e..b722a0af04c 100755 --- a/tests/queries/0_stateless/02132_client_history_navigation.expect +++ b/tests/queries/0_stateless/02132_client_history_navigation.expect @@ -3,11 +3,12 @@ log_user 0 set timeout 3 match_max 100000 -# A default timeout action is to do nothing, change it to fail + expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } # useful debugging configuration diff --git a/tests/queries/0_stateless/02136_kill_scalar_queries.reference b/tests/queries/0_stateless/02136_kill_scalar_queries.reference new file mode 100644 index 00000000000..a598447cff5 --- /dev/null +++ b/tests/queries/0_stateless/02136_kill_scalar_queries.reference @@ -0,0 +1,2 @@ +finished default_TEST02132KILL_QUERY1 default select (SELECT max(number) from system.numbers) + 1; +finished default_TEST02132KILL_QUERY2 default SELECT (SELECT number FROM system.numbers WHERE number = 1000000000000); diff --git a/tests/queries/0_stateless/02136_kill_scalar_queries.sh b/tests/queries/0_stateless/02136_kill_scalar_queries.sh new file mode 100755 index 00000000000..382f6555c66 --- /dev/null +++ b/tests/queries/0_stateless/02136_kill_scalar_queries.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +# Ref: https://github.com/ClickHouse/ClickHouse/issues/1576 +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +function wait_for_query_to_start() +{ + while [[ $($CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL" -d "SELECT count() FROM system.processes WHERE query_id = '$1'") == 0 ]]; do sleep 0.1; done +} + +QUERY_1_ID="${CLICKHOUSE_DATABASE}_TEST02132KILL_QUERY1" +(${CLICKHOUSE_CLIENT} --query_id="${QUERY_1_ID}" --query='select (SELECT max(number) from system.numbers) + 1;' 2>&1 | grep -q "Code: 394." || echo 'FAIL') & +wait_for_query_to_start "${QUERY_1_ID}" +${CLICKHOUSE_CLIENT} --query="KILL QUERY WHERE query_id='${QUERY_1_ID}' SYNC" + +QUERY_2_ID="${CLICKHOUSE_DATABASE}_TEST02132KILL_QUERY2" +(${CLICKHOUSE_CLIENT} --query_id="${QUERY_2_ID}" --query='SELECT (SELECT number FROM system.numbers WHERE number = 1000000000000);' 2>&1 | grep -q "Code: 394." 
|| echo 'FAIL') & +wait_for_query_to_start "${QUERY_2_ID}" +${CLICKHOUSE_CLIENT} --query="KILL QUERY WHERE query_id='${QUERY_2_ID}' SYNC" + +wait diff --git a/tests/queries/0_stateless/02136_scalar_progress.reference b/tests/queries/0_stateless/02136_scalar_progress.reference new file mode 100644 index 00000000000..21f6d3e0043 --- /dev/null +++ b/tests/queries/0_stateless/02136_scalar_progress.reference @@ -0,0 +1,6 @@ +< X-ClickHouse-Progress: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} +< X-ClickHouse-Progress: {"read_rows":"65505","read_bytes":"524040","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} +< X-ClickHouse-Progress: {"read_rows":"131010","read_bytes":"1048080","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} +< X-ClickHouse-Progress: {"read_rows":"131011","read_bytes":"1048081","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} +< X-ClickHouse-Progress: {"read_rows":"131011","read_bytes":"1048081","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} +< X-ClickHouse-Summary: {"read_rows":"131011","read_bytes":"1048081","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} diff --git a/tests/queries/0_stateless/02136_scalar_progress.sh b/tests/queries/0_stateless/02136_scalar_progress.sh new file mode 100755 index 00000000000..4608031f83d --- /dev/null +++ b/tests/queries/0_stateless/02136_scalar_progress.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +# Ref: https://github.com/ClickHouse/ClickHouse/issues/1576 +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CURL -sS "${CLICKHOUSE_URL}&wait_end_of_query=1&send_progress_in_http_headers=1&http_headers_progress_interval_ms=0" -d "SELECT (SELECT max(number), count(number) FROM numbers(100000));" -v 2>&1 | grep -E "X-ClickHouse-Summary|X-ClickHouse-Progress" diff --git a/tests/queries/0_stateless/02136_scalar_read_rows_json.reference b/tests/queries/0_stateless/02136_scalar_read_rows_json.reference new file mode 100644 index 00000000000..49020a4432f --- /dev/null +++ b/tests/queries/0_stateless/02136_scalar_read_rows_json.reference @@ -0,0 +1,50 @@ +#1 +{ + "meta": + [ + { + "name": "count()", + "type": "UInt64" + } + ], + + "data": + [ + { + "count()": "100" + } + ], + + "rows": 1, + + "rows_before_limit_at_least": 100, + + "statistics": + { + "rows_read": 100, + "bytes_read": 800 + } +} +#2 +{ + "meta": + [ + { + "type": "Tuple(UInt64, UInt64)" + } + ], + + "data": + [ + { + } + ], + + "rows": 1, + + "statistics": + { + "rows_read": 131011, + "bytes_read": 1048081 + } +} diff --git a/tests/queries/0_stateless/02136_scalar_read_rows_json.sh b/tests/queries/0_stateless/02136_scalar_read_rows_json.sh new file mode 100755 index 00000000000..d589cb60086 --- /dev/null +++ b/tests/queries/0_stateless/02136_scalar_read_rows_json.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +# Ref: https://github.com/ClickHouse/ClickHouse/issues/1576 +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +echo "#1" +${CLICKHOUSE_CLIENT} --query='SELECT count() FROM numbers(100) FORMAT JSON;' | grep -a -v "elapsed" +echo "#2" +${CLICKHOUSE_CLIENT} --query='SELECT (SELECT max(number), count(number) FROM numbers(100000) as n) FORMAT JSON;' | grep -a -v "elapsed" | grep -v "_subquery" diff --git a/tests/queries/0_stateless/02136_scalar_subquery_metrics.reference b/tests/queries/0_stateless/02136_scalar_subquery_metrics.reference new file mode 100644 index 00000000000..7bef11d008f --- /dev/null +++ b/tests/queries/0_stateless/02136_scalar_subquery_metrics.reference @@ -0,0 +1,9 @@ +#02136_scalar_subquery_1 999 +#02136_scalar_subquery_2 999 0 +#02136_scalar_subquery_3 999 999 +#02136_scalar_subquery_4 999 +#02136_scalar_subquery_4 999 +1001 SELECT \'#02136_scalar_subquery_1\', (SELECT max(number) FROM numbers(1000)) as n; +2001 SELECT \'#02136_scalar_subquery_2\', (SELECT max(number) FROM numbers(1000)) as n, (SELECT min(number) FROM numbers(1000)) as n2; +1001 SELECT \'#02136_scalar_subquery_3\', (SELECT max(number) FROM numbers(1000)) as n, (SELECT max(number) FROM numbers(1000)) as n2; +1002 SELECT \'#02136_scalar_subquery_4\', (SELECT max(number) FROM numbers(1000)) as n FROM system.numbers LIMIT 2; diff --git a/tests/queries/0_stateless/02136_scalar_subquery_metrics.sql b/tests/queries/0_stateless/02136_scalar_subquery_metrics.sql new file mode 100644 index 00000000000..180610288aa --- /dev/null +++ b/tests/queries/0_stateless/02136_scalar_subquery_metrics.sql @@ -0,0 +1,13 @@ +SELECT '#02136_scalar_subquery_1', (SELECT max(number) FROM numbers(1000)) as n; +SELECT '#02136_scalar_subquery_2', (SELECT max(number) FROM numbers(1000)) as n, (SELECT min(number) FROM numbers(1000)) as n2; +SELECT '#02136_scalar_subquery_3', (SELECT max(number) FROM numbers(1000)) as n, (SELECT max(number) FROM numbers(1000)) as n2; -- Cached +SELECT '#02136_scalar_subquery_4', (SELECT max(number) FROM numbers(1000)) as n FROM system.numbers LIMIT 2; -- Cached + +SYSTEM FLUSH LOGS; +SELECT read_rows, query FROM system.query_log +WHERE + event_date > yesterday() + AND type = 'QueryFinish' + AND current_database == currentDatabase() + AND query LIKE 'SELECT ''#02136_scalar_subquery_%' +ORDER BY query ASC; diff --git a/tests/queries/0_stateless/02149_external_schema_inference.reference b/tests/queries/0_stateless/02149_external_schema_inference.reference new file mode 100644 index 00000000000..875659c7fb6 --- /dev/null +++ b/tests/queries/0_stateless/02149_external_schema_inference.reference @@ -0,0 +1,168 @@ +Protobuf + +a_b_c Array(Array(Array(Int32))) + +a String +b_c Array(Array(Float64)) + +x Enum8(\'FIRST\' = 0, \'SECOND\' = 1, \'TEN\' = 10, \'HUNDRED\' = 100) + +a Map(String, UInt32) + +x_y_z Array(Array(Int32)) + +uuid String +name String +surname String +gender Enum8(\'female\' = 0, \'male\' = 1) +birthDate UInt32 +photo String +phoneNumber String +isOnline UInt8 +visitTime UInt32 +age UInt32 +zodiacSign Enum8(\'aries\' = 0, \'taurus\' = 1, \'gemini\' = 2, \'cancer\' = 3, \'leo\' = 4, \'virgo\' = 5, \'libra\' = 6, \'scorpius\' = 7, \'sagittarius\' = 8, \'capricorn\' = 9, \'aquarius\' = 10, \'pisces\' = 11) +songs Array(String) +color Array(UInt32) +hometown String +location Array(Float32) +pi Float64 +lotteryWin Float64 +someRatio Float32 +temperature Float32 +randomBigNumber Int64 +measureUnits Array(Tuple(unit String, coef Float32)) +nestiness_a_b_c Tuple(d UInt32, e Array(UInt32)) + +location Array(Int32) +pi Float32 +uuid String +newFieldBool UInt8 +name String +gender 
Enum8(\'male\' = 0, \'female\' = 1) +zodiacSign Int32 +birthDate Int64 +age String +isOnline Enum8(\'offline\' = 0, \'online\' = 1) +someRatio Float64 +visitTime UInt64 +newMessage Tuple(empty Array(Tuple()), z Float32) +randomBigNumber Int64 +newFieldInt Array(Int32) +color Array(Float32) +lotteryWin UInt64 +surname String +phoneNumber UInt64 +temperature Int32 +newFieldStr String +measureUnits_unit Array(String) +measureUnits_coef Array(Float32) +nestiness_a_b_c_d UInt32 +nestiness_a_b_c_e Array(UInt32) + +uuid String +name String +surname String +gender String +birthDate String +phoneNumber String +isOnline String +visitTime String +age String +zodiacSign String +songs Array(String) +color Array(String) +hometown String +location Array(String) +pi String +lotteryWin String +someRatio String +temperature String +randomBigNumber String +measureUnits Tuple(unit Array(String), coef Array(String)) +nestiness_a_b_c Tuple(d String, e Array(String)) + +uuid String +name String +surname String +gender Enum8(\'female\' = 0, \'male\' = 1) +birthDate UInt32 +photo String +phoneNumber String +isOnline UInt8 +visitTime UInt32 +age UInt32 +zodiacSign Enum8(\'aries\' = 0, \'taurus\' = 1, \'gemini\' = 2, \'cancer\' = 3, \'leo\' = 4, \'virgo\' = 5, \'libra\' = 6, \'scorpius\' = 7, \'sagittarius\' = 8, \'capricorn\' = 9, \'aquarius\' = 10, \'pisces\' = 11) +songs Array(String) +color Array(UInt32) +hometown String +location Array(Float32) +pi Float64 +lotteryWin Float64 +someRatio Float32 +temperature Float32 +randomBigNumber Int64 +measureunits Tuple(coef Array(Float32), unit Array(String)) +nestiness_a_b_c Tuple(d UInt32, e Array(UInt32)) +newFieldStr String +newFieldInt Int32 +newBool UInt8 + +identifier String +modules Array(Tuple(module_id UInt32, supply UInt32, temp UInt32, nodes Array(Tuple(node_id UInt32, opening_time UInt32, closing_time UInt32, current UInt32, coords_y Float32)))) + +Capnproto + +value Enum8(\'one\' = 0, \'two\' = 1, \'tHrEe\' = 2) + +value UInt64 +list1 Array(UInt64) +list2 Array(Array(Array(UInt64))) + +lc1 String +lc2 Nullable(String) +lc3 Array(Nullable(String)) + +value UInt64 +nested Tuple(a Tuple(b UInt64, c Array(Array(UInt64))), d Array(Tuple(e Array(Array(Tuple(f UInt64, g UInt64))), h Array(Tuple(k Array(UInt64)))))) + +nested Tuple(value Array(UInt64), array Array(Array(UInt64)), tuple Array(Tuple(one UInt64, two UInt64))) + +a Tuple(b UInt64, c Tuple(d UInt64, e Tuple(f UInt64))) + +nullable Nullable(UInt64) +array Array(Nullable(UInt64)) +tuple Tuple(nullable Nullable(UInt64)) + +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +float32 Float32 +float64 Float64 +string String +fixed String +data String +date UInt16 +datetime UInt32 +datetime64 Int64 + +value UInt64 +tuple1 Tuple(one UInt64, two Tuple(three UInt64, four UInt64)) +tuple2 Tuple(nested1 Tuple(nested2 Tuple(x UInt64))) + +RawBLOB + +raw_blob String + +LineAsString + +line String + +JSONAsString + +json String diff --git a/tests/queries/0_stateless/02149_external_schema_inference.sh b/tests/queries/0_stateless/02149_external_schema_inference.sh new file mode 100755 index 00000000000..df2b9a43565 --- /dev/null +++ b/tests/queries/0_stateless/02149_external_schema_inference.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02149.data +DATA_FILE=$USER_FILES_PATH/$FILE_NAME + +touch $DATA_FILE + +SCHEMADIR=$(clickhouse-client --query "select * from file('$FILE_NAME', 'CapnProto', 'val1 char') settings format_schema='nonexist:Message'" 2>&1 | grep Exception | grep -oP "file \K.*(?=/nonexist.capnp)") +CLIENT_SCHEMADIR=$CURDIR/format_schemas +SERVER_SCHEMADIR=test_02149 +mkdir -p $SCHEMADIR/$SERVER_SCHEMADIR +cp -r $CLIENT_SCHEMADIR/* $SCHEMADIR/$SERVER_SCHEMADIR/ + +echo -e "Protobuf\n" +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_array_3dim:ABC'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_array_of_arrays:AA'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_enum_mapping.proto:EnumMessage'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_map:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_nested_in_nested:MessageType'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_persons:Person'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_persons:AltPerson'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_persons:StrPerson'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_persons_syntax2:Syntax2Person'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_skipped_column_in_nested:UpdateMessage'" + + +echo -e "\nCapnproto\n" +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_lists:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_low_cardinality:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_lists_and_tuples:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_table:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_tuples:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_nullable:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_simple_types:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings 
format_schema='$SERVER_SCHEMADIR/02030_capnp_tuples:Message'" + +echo -e "\nRawBLOB\n" +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'RawBLOB')" + +echo -e "\nLineAsString\n" +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'LineAsString')" + +echo -e "\nJSONAsString\n" +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONAsString')" + + + +rm -rf ${SCHEMADIR:?}/$SERVER_SCHEMADIR +rm $DATA_FILE diff --git a/tests/queries/0_stateless/02149_schema_inference.reference b/tests/queries/0_stateless/02149_schema_inference.reference new file mode 100644 index 00000000000..f46e3bee101 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference.reference @@ -0,0 +1,170 @@ +TSV +c1 Nullable(String) +c2 Nullable(String) +c3 Nullable(String) +c4 Nullable(String) +42 Some string [1, 2, 3, 4] (1, 2, 3) +42 abcd [] (4, 5, 6) +TSVWithNames +number Nullable(String) +string Nullable(String) +array Nullable(String) +tuple Nullable(String) +42 Some string [1, 2, 3, 4] (1, 2, 3) +42 abcd [] (4, 5, 6) +CSV +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +c4 Array(Nullable(Float64)) +\N Some string [([1,2.3],'String'),([],NULL)] [1,NULL,3] +42 \N [([1,2.3],'String'),([3],'abcd')] [4,5,6] +c1 Nullable(String) +c2 Nullable(String) +42 String +String 42 +c1 Nullable(String) +c2 Nullable(String) +\N [NULL, NULL] +\N [] +CSVWithNames +a Nullable(Float64) +b Nullable(String) +c Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +d Array(Nullable(Float64)) +\N Some string [([1,2.3],'String'),([],NULL)] [1,NULL,3] +42 \N [([1,2.3],'String'),([3],'abcd')] [4,5,6] +JSONCompactEachRow +c1 Nullable(Float64) +c2 Array(Tuple(Nullable(Float64), Nullable(String))) +c3 Map(String, Nullable(Float64)) +c4 Nullable(UInt8) +42.42 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 1 +c1 Nullable(Float64) +c2 Array(Tuple(Nullable(Float64), Nullable(String))) +c3 Map(String, Nullable(Float64)) +c4 Nullable(UInt8) +\N [(1,'String'),(2,NULL)] {'key':NULL,'key2':24} \N +32 [(2,'String 2'),(3,'hello')] {'key3':4242,'key4':2424} 1 +JSONCompactEachRowWithNames +a Nullable(Float64) +b Array(Tuple(Nullable(Float64), Nullable(String))) +c Map(String, Nullable(Float64)) +d Nullable(UInt8) +42.42 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 1 +JSONEachRow +d Nullable(UInt8) +b Array(Tuple(Nullable(Float64), Nullable(String))) +c Map(String, Nullable(Float64)) +a Nullable(Float64) +1 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 42.42 +d Nullable(UInt8) +b Array(Tuple(Nullable(Float64), Nullable(String))) +c Map(String, Nullable(Float64)) +a Nullable(Float64) +\N [(1,'String'),(2,NULL)] {'key':NULL,'key2':24} \N +1 [(2,'String 2'),(3,'hello')] {'key3':4242,'key4':2424} 32 +b Nullable(String) +c Array(Nullable(Float64)) +a Nullable(Float64) +s1 [] 1 +\N [2] 2 +\N [] \N +\N [] \N +\N [3] \N +TSKV +b Nullable(String) +c Nullable(String) +a Nullable(String) +s1 \N 1 +} [2] 2 +\N \N \N +\N \N \N +\N [3] \N +Values +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Nullable(Float64)) +c4 Tuple(Nullable(Float64), Nullable(String)) +c5 Tuple(Array(Nullable(Float64)), Array(Tuple(Nullable(Float64), Nullable(String)))) +42.42 Some string [1,2,3] (1,'2') ([1,2],[(3,'4'),(5,'6')]) +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Nullable(Float64)) +c4 Tuple(Nullable(Float64), Nullable(Float64)) +c5 Tuple(Array(Nullable(Float64)), Array(Tuple(Nullable(Float64), Nullable(String)))) +42.42 \N [1,NULL,3] (1,NULL) ([1,2],[(3,'4'),(5,'6')]) +\N Some string [10] (1,2) ([],[]) +Regexp +c1 
Nullable(String) +c2 Nullable(String) +c3 Nullable(String) +42 Some string 1 [([1, 2, 3], String 1), ([], String 1)] +2 Some string 2 [([4, 5, 6], String 2), ([], String 2)] +312 Some string 3 [([1, 2, 3], String 2), ([], String 2)] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42 Some string 1 [([1,2,3],'String 1'),([],'String 1')] +3 Some string 2 [([3,5,1],'String 2'),([],'String 2')] +244 Some string 3 [([],'String 3'),([],'String 3')] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42 Some string 1 [([1,2,3],'String 1'),([],'String 1')] +2 Some string 2 [([],'String 2'),([],'String 2')] +43 Some string 3 [([1,5,3],'String 3'),([],'String 3')] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +52 Some string 2 [([],'String 2'),([1],'String 2')] +24 Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +CustomSeparated +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +Template +column_1 Nullable(Float64) +column_2 Nullable(String) +column_3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +column_1 Nullable(Float64) +column_2 Nullable(String) +column_3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +column_1 Nullable(Float64) +column_2 Nullable(String) +column_3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +MsgPack +c1 Nullable(Int64) +c2 Nullable(Int64) +c3 Nullable(Float32) +c4 Nullable(String) +c5 Array(Array(Nullable(Int64))) +c6 Map(Int64, Array(Nullable(Int64))) +\N 0 0 Str: 0 [[0,1],[0]] {0:[0,1]} +1 \N 1 Str: 1 [[1,2],[1]] {1:[1,2]} +\N 2 2 Str: 2 [[2,3],[2]] {2:[2,3]} diff --git a/tests/queries/0_stateless/02149_schema_inference.sh b/tests/queries/0_stateless/02149_schema_inference.sh new file mode 100755 index 00000000000..1ccec240627 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference.sh @@ -0,0 +1,251 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02149.data +DATA_FILE=${USER_FILES_PATH:?}/$FILE_NAME + +touch $DATA_FILE + +SCHEMADIR=$(clickhouse-client --query "select * from file('$FILE_NAME', 'Template', 'val1 char') settings format_template_row='nonexist'" 2>&1 | grep Exception | grep -oP "file \K.*(?=/nonexist)") + +echo "TSV" + +echo -e "42\tSome string\t[1, 2, 3, 4]\t(1, 2, 3) +42\tabcd\t[]\t(4, 5, 6)" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo "TSVWithNames" + +echo -e "number\tstring\tarray\ttuple +42\tSome string\t[1, 2, 3, 4]\t(1, 2, 3) +42\tabcd\t[]\t(4, 5, 6)" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSVWithNames')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSVWithNames')" + +echo "CSV" + +echo -e "\N,\"Some string\",\"[([1, 2.3], 'String'), ([], NULL)]\",\"[1, NULL, 3]\" +42,\N,\"[([1, 2.3], 'String'), ([3.], 'abcd')]\",\"[4, 5, 6]\"" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo -e "42,\"String\" +\"String\",42" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo -e "\N,\"[NULL, NULL]\" +\N,[]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo "CSVWithNames" + +echo -e "a,b,c,d +\N,\"Some string\",\"[([1, 2.3], 'String'), ([], NULL)]\",\"[1, NULL, 3]\" +42,\N,\"[([1, 2.3], 'String'), ([3.], 'abcd')]\",\"[4, 5, 6]\"" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSVWithNames')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSVWithNames')" + +echo "JSONCompactEachRow" + +echo -e "[42.42, [[1, \"String\"], [2, \"abcd\"]], {\"key\" : 42, \"key2\" : 24}, true]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONCompactEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONCompactEachRow')" + +echo -e "[null, [[1, \"String\"], [2, null]], {\"key\" : null, \"key2\" : 24}, null] +[32, [[2, \"String 2\"], [3, \"hello\"]], {\"key3\" : 4242, \"key4\" : 2424}, true]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONCompactEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONCompactEachRow')" + +echo "JSONCompactEachRowWithNames" + +echo -e "[\"a\", \"b\", \"c\", \"d\"] +[42.42, [[1, \"String\"], [2, \"abcd\"]], {\"key\" : 42, \"key2\" : 24}, true]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONCompactEachRowWithNames')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONCompactEachRowWithNames')" + + +echo "JSONEachRow" +echo -e '{"a" : 42.42, "b" : [[1, "String"], [2, "abcd"]], "c" : {"key" : 42, "key2" : 24}, "d" : true}' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')" + +echo -e '{"a" : null, "b" : [[1, "String"], [2, null]], "c" : {"key" : null, "key2" : 24}, "d" : null} +{"a" : 32, "b" : [[2, "String 2"], [3, "hello"]], "c" : {"key3" : 4242, "key4" : 2424}, "d" : true}' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')" 
+$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')" + +echo -e '{"a" : 1, "b" : "s1", "c" : null} +{"c" : [2], "a" : 2, "b" : null} +{} +{"a" : null} +{"c" : [3], "a" : null}' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')" + + +echo "TSKV" + +echo -e 'a=1\tb=s1\tc=\N +c=[2]\ta=2\tb=\N} + +a=\N +c=[3]\ta=\N' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSKV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSKV')" + + +echo "Values" + +echo -e "(42.42, 'Some string', [1, 2, 3], (1, '2'), ([1, 2], [(3, '4'), (5, '6')]))" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Values')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Values')" + +echo -e "(42.42, NULL, [1, NULL, 3], (1, NULL), ([1, 2], [(3, '4'), (5, '6')])), (NULL, 'Some string', [10], (1, 2), ([], []))" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Values')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Values')" + + +echo "Regexp" + +REGEXP="^Line: value_1=(.+?), value_2=(.+?), value_3=(.+?)" + +echo "Line: value_1=42, value_2=Some string 1, value_3=[([1, 2, 3], String 1), ([], String 1)] +Line: value_1=2, value_2=Some string 2, value_3=[([4, 5, 6], String 2), ([], String 2)] +Line: value_1=312, value_2=Some string 3, value_3=[([1, 2, 3], String 2), ([], String 2)]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='Escaped'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='Escaped'" + + +echo "Line: value_1=42, value_2=\"Some string 1\", value_3=\"[([1, 2, 3], 'String 1'), ([], 'String 1')]\" +Line: value_1=3, value_2=\"Some string 2\", value_3=\"[([3, 5, 1], 'String 2'), ([], 'String 2')]\" +Line: value_1=244, value_2=\"Some string 3\", value_3=\"[([], 'String 3'), ([], 'String 3')]\"" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='CSV'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='CSV'" + + +echo "Line: value_1=42, value_2='Some string 1', value_3=[([1, 2, 3], 'String 1'), ([], 'String 1')] +Line: value_1=2, value_2='Some string 2', value_3=[([], 'String 2'), ([], 'String 2')] +Line: value_1=43, value_2='Some string 3', value_3=[([1, 5, 3], 'String 3'), ([], 'String 3')]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='Quoted'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='Quoted'" + + +echo "Line: value_1=42, value_2=\"Some string 1\", value_3=[[[1, 2, 3], \"String 1\"], [[1], \"String 1\"]] +Line: value_1=52, value_2=\"Some string 2\", value_3=[[[], \"String 2\"], [[1], \"String 2\"]] +Line: value_1=24, value_2=\"Some string 3\", value_3=[[[1, 2, 3], \"String 3\"], [[1], \"String 3\"]]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='JSON'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='JSON'" + + +echo "CustomSeparated" + +CUSTOM_SETTINGS="SETTINGS 
format_custom_row_before_delimiter='', format_custom_row_after_delimiter='\n', format_custom_row_between_delimiter='\n', format_custom_result_before_delimiter='\n', format_custom_result_after_delimiter='\n', format_custom_field_delimiter=''" + +echo -e " +42.42\"Some string 1\"\"[([1, 2, 3], 'String 1'), ([1], 'String 1')]\" + +42\"Some string 2\"\"[([], 'String 2'), ([], 'String 2')]\" + +\N\"Some string 3\"\"[([1, 2, 3], 'String 3'), ([1], 'String 3')]\" +" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='CSV'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='CSV'" + +echo -e " +42.42'Some string 1'[([1, 2, 3], 'String 1'), ([1], 'String 1')] + +42'Some string 2'[([], 'String 2'), ([], 'String 2')] + +NULL'Some string 3'[([1, 2, 3], 'String 3'), ([1], 'String 3')] +" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='Quoted'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='Quoted'" + +echo -e " +42.42\"Some string 1\"[[[1, 2, 3], \"String 1\"], [[1], \"String 1\"]] + +42\"Some string 2\"[[[], \"String 2\"], [[], \"String 2\"]] + +null\"Some string 3\"[[[1, 2, 3], \"String 3\"], [[1], \"String 3\"]] +" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='JSON'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='JSON'" + + +echo "Template" + +echo -e " +\${data}" > $SCHEMADIR/resultset_format_02149 + +echo -e "\${column_1:CSV}\${column_2:CSV}\${column_3:CSV}" > $SCHEMADIR/row_format_02149 + +TEMPLATE_SETTINGS="SETTINGS format_template_rows_between_delimiter='\n', format_template_row='row_format_02149', format_template_resultset='resultset_format_02149'" + +echo -e " +42.42\"Some string 1\"\"[([1, 2, 3], 'String 1'), ([1], 'String 1')]\" + +42\"Some string 2\"\"[([], 'String 2'), ([], 'String 2')]\" + +\N\"Some string 3\"\"[([1, 2, 3], 'String 3'), ([1], 'String 3')]\" +" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS" + +echo -e "\${column_1:Quoted}\${column_2:Quoted}\${column_3:Quoted}" > $SCHEMADIR/row_format_02149 + +echo -e " +42.42'Some string 1'[([1, 2, 3], 'String 1'), ([1], 'String 1')] + +42'Some string 2'[([], 'String 2'), ([], 'String 2')] + +NULL'Some string 3'[([1, 2, 3], 'String 3'), ([1], 'String 3')] +" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS" + +echo -e "\${column_1:JSON}\${column_2:JSON}\${column_3:JSON}" > $SCHEMADIR/row_format_02149 + +echo -e " +42.42\"Some string 1\"[[[1, 2, 3], \"String 1\"], [[1], \"String 1\"]] + +42\"Some string 2\"[[[], \"String 2\"], [[], \"String 2\"]] + +null\"Some string 3\"[[[1, 2, 3], \"String 3\"], [[1], \"String 3\"]] +" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS" + + +echo "MsgPack" + +$CLICKHOUSE_CLIENT -q "select toInt32(number % 2 ? number : NULL) as int, toUInt64(number % 2 ? 
NULL : number) as uint, toFloat32(number) as float, concat('Str: ', toString(number)) as str, [[number, number + 1], [number]] as arr, map(number, [number, number + 1]) as map from numbers(3) format MsgPack" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'MsgPack') settings input_format_msgpack_number_of_columns=6" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'MsgPack') settings input_format_msgpack_number_of_columns=6" + + +rm $SCHEMADIR/resultset_format_02149 $SCHEMADIR/row_format_02149 +rm $DATA_FILE + diff --git a/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.reference b/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.reference new file mode 100644 index 00000000000..dae12318ce0 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.reference @@ -0,0 +1,40 @@ +0 Str: 0 [0,1] +1 Str: 1 [1,2] +2 Str: 2 [2,3] +3 Str: 3 [3,4] +4 Str: 4 [4,5] +5 Str: 5 [5,6] +6 Str: 6 [6,7] +7 Str: 7 [7,8] +8 Str: 8 [8,9] +9 Str: 9 [9,10] +0 0 [0,1] +1 1 [1,2] +2 2 [2,3] +3 3 [3,4] +4 4 [4,5] +5 5 [5,6] +6 6 [6,7] +7 7 [7,8] +8 8 [8,9] +9 9 [9,10] +0 0 [0,1] +1 1 [1,2] +2 2 [2,3] +3 3 [3,4] +4 4 [4,5] +5 5 [5,6] +6 6 [6,7] +7 7 [7,8] +8 8 [8,9] +9 9 [9,10] +0 0 [0,1] +1 1 [1,2] +2 2 [2,3] +3 3 [3,4] +4 4 [4,5] +5 5 [5,6] +6 6 [6,7] +7 7 [7,8] +8 8 [8,9] +9 9 [9,10] diff --git a/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.sh b/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.sh new file mode 100755 index 00000000000..f00f2531dd0 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +mkdir $USER_FILES_PATH/test_02149 +FILE_NAME=test_02149/data.Parquet +DATA_FILE=$USER_FILES_PATH/$FILE_NAME + +$CLICKHOUSE_CLIENT -q "select number as num, concat('Str: ', toString(number)) as str, [number, number + 1] as arr from numbers(10) format Parquet" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "drop table if exists test_02149" +$CLICKHOUSE_CLIENT -q "create table test_02149 engine=File('Parquet', '$FILE_NAME')" +$CLICKHOUSE_CLIENT -q "select * from test_02149" +$CLICKHOUSE_CLIENT -q "drop table test_02149" + +$CLICKHOUSE_CLIENT -q "create table test_02149 (x UInt32, s String, a Array(UInt32)) engine=Memory" +$CLICKHOUSE_CLIENT -q "insert into test_02149 select number, toString(number), [number, number + 1] from numbers(10)" + +$CLICKHOUSE_CLIENT -q "drop table if exists test_merge" +$CLICKHOUSE_CLIENT -q "create table test_merge engine=Merge(currentDatabase(), 'test_02149')" +$CLICKHOUSE_CLIENT -q "select * from test_merge" +$CLICKHOUSE_CLIENT -q "drop table test_merge" + +$CLICKHOUSE_CLIENT -q "drop table if exists test_distributed" +$CLICKHOUSE_CLIENT -q "create table test_distributed engine=Distributed(test_shard_localhost, currentDatabase(), 'test_02149')" +$CLICKHOUSE_CLIENT -q "select * from test_distributed" +$CLICKHOUSE_CLIENT -q "drop table test_distributed" + +$CLICKHOUSE_CLIENT -q "drop table if exists test_buffer" +$CLICKHOUSE_CLIENT -q "create table test_buffer engine=Buffer(currentDatabase(), 'test_02149', 16, 10, 100, 10000, 1000000, 10000000, 100000000)" +$CLICKHOUSE_CLIENT -q "select * from test_buffer" +$CLICKHOUSE_CLIENT -q "drop table test_buffer" + +rm -rf ${USER_FILES_PATH:?}/test_02149 + diff --git a/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.reference b/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.reference new file mode 100644 index 00000000000..d3d2d86d696 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.reference @@ -0,0 +1,435 @@ +Arrow +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date UInt16 +date32 Date32 +0 1970-01-01 +1 1970-01-02 +str String +fixed_string String +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(`tuple.0` UInt64, `tuple.1` String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64))) +nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +ArrowStream +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date UInt16 +date32 Date32 +0 1970-01-01 +1 1970-01-02 +str String 
+fixed_string String +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(`tuple.0` UInt64, `tuple.1` String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64))) +nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +Parquet +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 Int64 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date UInt16 +date32 Date32 +0 1970-01-01 +1 1970-01-02 +str String +fixed_string String +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(`tuple.0` UInt64, `tuple.1` String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64))) +nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +ORC +int8 Int8 +uint8 Int8 +int16 Int16 +uint16 Int16 +int32 Int32 +uint32 Int32 +int64 Int64 +uint64 Int64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date32 +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string String +Str: 0 100 +Str: 1 200 +array Array(Int64) +tuple Tuple(`tuple.0` Int64, `tuple.1` String) +map Map(String, Int64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(`nested1.0` Array(Int64), `nested1.1` Map(String, Int64))) +nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(Int64)), `nested2.0.1` Map(Int64, Array(Tuple(`nested2.0.1.0` Int64, `nested2.0.1.1` String)))), `nested2.1` Int8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +Native +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +TSVWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 
+uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +TSVRawWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +CSVWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +JSONCompactEachRowWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +JSONCompactStringsEachRowWithNamesAndTypes +int8 Int8 
+uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +RowBinaryWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +CustomSeparatedWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +Avro +CustomSeparatedWithNamesAndTypes +int8 Int32 +uint8 Int32 +int16 Int32 +uint16 Int32 +int32 Int32 +uint32 Int32 +int64 Int64 +uint64 Int64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +0 0 +1.2 0.7692307692307692 +date Int32 +0 +1 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(Int64) +nested Array(Array(Array(Int64))) +[0,1] [[[0],[1]]] +[1,2] [[[1],[2]]] diff --git a/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.sh b/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.sh new file mode 100755 index 00000000000..d263ef63681 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh 
+. "$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02149.data +DATA_FILE=$USER_FILES_PATH/$FILE_NAME + +for format in Arrow ArrowStream Parquet ORC Native TSVWithNamesAndTypes TSVRawWithNamesAndTypes CSVWithNamesAndTypes JSONCompactEachRowWithNamesAndTypes JSONCompactStringsEachRowWithNamesAndTypes RowBinaryWithNamesAndTypes CustomSeparatedWithNamesAndTypes +do + echo $format + $CLICKHOUSE_CLIENT -q "select toInt8(-number) as int8, toUInt8(number) as uint8, toInt16(-number) as int16, toUInt16(number) as uint16, toInt32(-number) as int32, toUInt32(number) as uint32, toInt64(-number) as int64, toUInt64(number) as uint64 from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" + + $CLICKHOUSE_CLIENT -q "select toFloat32(number * 1.2) as float32, toFloat64(number / 1.3) as float64, toDecimal32(number / 0.3, 5) as decimal32, toDecimal64(number / 0.003, 5) as decimal64 from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" + + $CLICKHOUSE_CLIENT -q "select toDate(number) as date, toDate32(number) as date32 from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" + + $CLICKHOUSE_CLIENT -q "select concat('Str: ', toString(number)) as str, toFixedString(toString((number + 1) * 100 % 1000), 3) as fixed_string from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" + + $CLICKHOUSE_CLIENT -q "select [number, number + 1] as array, (number, toString(number)) as tuple, map(toString(number), number) as map from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" + + $CLICKHOUSE_CLIENT -q "select [([number, number + 1], map('42', number)), ([], map()), ([42], map('42', 42))] as nested1, (([[number], [number + 1], []], map(number, [(number, '42'), (number + 1, '42')])), 42) as nested2 from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" +done + +echo "Avro" + +echo $format +$CLICKHOUSE_CLIENT -q "select toInt8(-number) as int8, toUInt8(number) as uint8, toInt16(-number) as int16, toUInt16(number) as uint16, toInt32(-number) as int32, toUInt32(number) as uint32, toInt64(-number) as int64, toUInt64(number) as uint64 from numbers(2) format Avro" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Avro')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Avro')" + +$CLICKHOUSE_CLIENT -q "select toFloat32(number * 1.2) as float32, toFloat64(number / 1.3) as float64 from numbers(2) format Avro" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Avro')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Avro')" + +$CLICKHOUSE_CLIENT -q "select toDate(number) as date from numbers(2) format Avro" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Avro')" +$CLICKHOUSE_CLIENT -q 
"select * from file('$FILE_NAME', 'Avro')" + +$CLICKHOUSE_CLIENT -q "select concat('Str: ', toString(number)) as str, toFixedString(toString((number + 1) * 100 % 1000), 3) as fixed_string from numbers(2) format Avro" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Avro')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Avro')" + +$CLICKHOUSE_CLIENT -q "select [number, number + 1] as array, [[[number], [number + 1]]] as nested from numbers(2) format Avro" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Avro')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Avro')" + +rm $DATA_FILE + diff --git a/tests/queries/0_stateless/02151_clickhouse_client_hints.sh b/tests/queries/0_stateless/02151_clickhouse_client_hints.sh index 3e6c6cb16a5..7221acc2504 100755 --- a/tests/queries/0_stateless/02151_clickhouse_client_hints.sh +++ b/tests/queries/0_stateless/02151_clickhouse_client_hints.sh @@ -5,4 +5,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . "$CURDIR"/../shell_config.sh -$CLICKHOUSE_CLIENT --hardware_utilization 2>&1 | grep -q "Code: 552. DB::Exception: Unrecognized option '--hardware_utilization'. Maybe you meant \['--hardware-utilization'\]. (UNRECOGNIZED_ARGUMENTS)" && echo 'OK' || echo 'FAIL' ||: +$CLICKHOUSE_CLIENT --secuer 2>&1 | grep -q "Code: 552. DB::Exception: Unrecognized option '--secuer'. Maybe you meant \['--secure'\]. (UNRECOGNIZED_ARGUMENTS)" && echo 'OK' || echo 'FAIL' ||: diff --git a/tests/queries/0_stateless/02154_parser_backtracking.reference b/tests/queries/0_stateless/02154_parser_backtracking.reference new file mode 100644 index 00000000000..23751ef6c1f --- /dev/null +++ b/tests/queries/0_stateless/02154_parser_backtracking.reference @@ -0,0 +1,14 @@ +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 diff --git a/tests/queries/0_stateless/02154_parser_backtracking.sh b/tests/queries/0_stateless/02154_parser_backtracking.sh new file mode 100755 index 00000000000..af032008069 --- /dev/null +++ b/tests/queries/0_stateless/02154_parser_backtracking.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# Should finish in reasonable time (milliseconds). +# In previous versions this query led to exponential backtracking. 
+ +echo 'SELECT '"$(perl -e 'print "CAST(" x 100')"'a b c'"$(perl -e 'print ")" x 100')" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'Syntax error' +echo 'SELECT '"$(perl -e 'print "CAST(" x 100')"'a, b'"$(perl -e 'print ")" x 100')" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'Syntax error' +echo 'SELECT '"$(perl -e 'print "CAST(" x 100')"'a AS b'"$(perl -e 'print ")" x 100')" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'Syntax error' +echo 'SELECT '"$(perl -e 'print "CAST(" x 100')"'1'"$(perl -e 'print ", '"'UInt8'"')" x 100')" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 +echo 'SELECT '"$(perl -e 'print "CAST(" x 100')"'1'"$(perl -e 'print " AS UInt8)" x 100')" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 + +echo "SELECT fo,22222?LUTAY(SELECT(NOT CAUTAY(SELECT(NOT CAST(NOTT(NOT CAST(NOT NOT LEfT(NOT coARRAYlumnsFLuTAY(SELECT(NO0?LUTAY(SELECT(NOT CAUTAY(SELECT(NOT CAST(NOTT(NOT CAST(NOT NOT LEfT(NOT coARRAYlumnsFLuTAY(SELECT(NOTAYTAY(SELECT(NOTAYEFAULT(fo,22222?LUTAY(%SELECT(NOT CAST(NOT NOTAYTAY(SELECT(NOTAYEFAULT(fo,22222?LUTAY(SELECT(NOT CAST(NOT NOT (NOe)))))))))))))))))))))))))))))))))" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'Syntax error' +echo "SELECT position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(a b))))))))))))))))))))" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'Syntax error' +echo "SELECT position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(a, b))))))))))))))))))))" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'UNKNOWN_IDENTIFIER' +echo "SELECT position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(a, b, c))))))))))))))))))))" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'UNKNOWN_IDENTIFIER' + +echo 'SELECT '"$(perl -e 'print "position(" x 100')"'x'"$(perl -e 'print ")" x 100')" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'UNKNOWN_IDENTIFIER' +echo 'SELECT '"$(perl -e 'print "position(" x 100')"'x y'"$(perl -e 'print ")" x 100')" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'Syntax error' +echo 'SELECT '"$(perl -e 'print "position(" x 100')"'x IN y'"$(perl -e 'print ")" x 100')" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'UNKNOWN_IDENTIFIER' +echo 'SELECT '"$(perl -e 'print "position(" x 100')"'x'"$(perl -e 'print " IN x)" x 100')" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'UNKNOWN_IDENTIFIER' +echo 'SELECT '"$(perl -e 'print "position(" x 100')"'x'"$(perl -e 'print ", x)" x 100')" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'UNKNOWN_IDENTIFIER' diff --git a/tests/queries/0_stateless/02155_create_table_w_timezone.reference b/tests/queries/0_stateless/02155_create_table_w_timezone.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02155_create_table_w_timezone.sql b/tests/queries/0_stateless/02155_create_table_w_timezone.sql new file mode 100644 index 00000000000..0b72122ce39 --- /dev/null +++ b/tests/queries/0_stateless/02155_create_table_w_timezone.sql @@ -0,0 +1,8 @@ +create table t02155_t64_tz ( a DateTime64(9, 
America/Chicago)) Engine = Memory; -- { clientError 62 } +create table t02155_t_tz ( a DateTime(America/Chicago)) Engine = Memory; -- { clientError 62 } + +create table t02155_t64_tz ( a DateTime64(9, 'America/Chicago')) Engine = Memory; +create table t02155_t_tz ( a DateTime('America/Chicago')) Engine = Memory; + +drop table t02155_t64_tz; +drop table t02155_t_tz; diff --git a/tests/queries/0_stateless/02155_csv_with_strings_with_slash.reference b/tests/queries/0_stateless/02155_csv_with_strings_with_slash.reference new file mode 100644 index 00000000000..db750f36364 --- /dev/null +++ b/tests/queries/0_stateless/02155_csv_with_strings_with_slash.reference @@ -0,0 +1,62 @@ +input_format_null_as_default = 1 +0 \\asdf 2000-01-01 +1 x\\x\\ 2000-01-01 +2 x\\x 2000-01-01 +3 x\\ 2000-01-01 +4 x\\ 2000-01-01 +5 \\x 2000-01-01 +6 2000-01-01 +7 \\r\\n 2000-01-01 +8 \\\\r\\\\n 2000-01-01 +9 x\\\\ 2000-01-01 +10 \\asdf 2000-01-01 +11 x\\x\\ 2000-01-01 +12 x\\x 2000-01-01 +13 x\\ 2000-01-01 +14 x\\ 2000-01-01 +15 \\x 2000-01-01 +16 \\N 2000-01-01 +17 \\r\\n 2000-01-01 +18 \\\\r\\\\n 2000-01-01 +19 x\\\\ 2000-01-01 +20 \\asdf 2000-01-01 +21 x\\x\\ 2000-01-01 +22 x\\x 2000-01-01 +23 x\\ 2000-01-01 +24 x\\ 2000-01-01 +25 \\x 2000-01-01 +26 \\N 2000-01-01 +27 \\r\\n 2000-01-01 +28 \\\\r\\\\n 2000-01-01 +29 x\\\\ 2000-01-01 +input_format_null_as_default = 0 +0 \\asdf 2000-01-01 +1 x\\x\\ 2000-01-01 +2 x\\x 2000-01-01 +3 x\\ 2000-01-01 +4 x\\ 2000-01-01 +5 \\x 2000-01-01 +6 \\N 2000-01-01 +7 \\r\\n 2000-01-01 +8 \\\\r\\\\n 2000-01-01 +9 x\\\\ 2000-01-01 +10 \\asdf 2000-01-01 +11 x\\x\\ 2000-01-01 +12 x\\x 2000-01-01 +13 x\\ 2000-01-01 +14 x\\ 2000-01-01 +15 \\x 2000-01-01 +16 \\N 2000-01-01 +17 \\r\\n 2000-01-01 +18 \\\\r\\\\n 2000-01-01 +19 x\\\\ 2000-01-01 +20 \\asdf 2000-01-01 +21 x\\x\\ 2000-01-01 +22 x\\x 2000-01-01 +23 x\\ 2000-01-01 +24 x\\ 2000-01-01 +25 \\x 2000-01-01 +26 \\N 2000-01-01 +27 \\r\\n 2000-01-01 +28 \\\\r\\\\n 2000-01-01 +29 x\\\\ 2000-01-01 diff --git a/tests/queries/0_stateless/02155_csv_with_strings_with_slash.sh b/tests/queries/0_stateless/02155_csv_with_strings_with_slash.sh new file mode 100755 index 00000000000..ab2577e6138 --- /dev/null +++ b/tests/queries/0_stateless/02155_csv_with_strings_with_slash.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test_02155_csv" + +${CLICKHOUSE_CLIENT} --query="create table test_02155_csv (A Int64, S String, D Date) Engine=Memory;" + + +echo "input_format_null_as_default = 1" +cat $CUR_DIR/data_csv/csv_with_slash.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_02155_csv FORMAT CSV SETTINGS input_format_null_as_default = 1" +${CLICKHOUSE_CLIENT} --query="SELECT * FROM test_02155_csv" + +${CLICKHOUSE_CLIENT} --query="TRUNCATE TABLE test_02155_csv" + +echo "input_format_null_as_default = 0" +cat $CUR_DIR/data_csv/csv_with_slash.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_02155_csv FORMAT CSV SETTINGS input_format_null_as_default = 0" +${CLICKHOUSE_CLIENT} --query="SELECT * FROM test_02155_csv" + + +${CLICKHOUSE_CLIENT} --query="DROP TABLE test_02155_csv" + diff --git a/tests/queries/0_stateless/02155_dictionary_comment.reference b/tests/queries/0_stateless/02155_dictionary_comment.reference new file mode 100644 index 00000000000..69b871a6925 --- /dev/null +++ b/tests/queries/0_stateless/02155_dictionary_comment.reference @@ -0,0 +1,11 @@ +02155_test_dictionary +02155_test_dictionary 02155_test_dictionary_comment_0 +02155_test_dictionary 02155_test_dictionary_comment_0 +0 Value +02155_test_dictionary 02155_test_dictionary_comment_0 +02155_test_dictionary 02155_test_dictionary_comment_0 +02155_test_dictionary 02155_test_dictionary_comment_1 +02155_test_dictionary 02155_test_dictionary_comment_1 +0 Value +02155_test_dictionary_view 02155_test_dictionary_view_comment_0 +02155_test_dictionary_view 02155_test_dictionary_view_comment_0 diff --git a/tests/queries/0_stateless/02155_dictionary_comment.sql b/tests/queries/0_stateless/02155_dictionary_comment.sql new file mode 100644 index 00000000000..e31d9d28366 --- /dev/null +++ b/tests/queries/0_stateless/02155_dictionary_comment.sql @@ -0,0 +1,53 @@ +DROP TABLE IF EXISTS 02155_test_table; +CREATE TABLE 02155_test_table +( + id UInt64, + value String +) ENGINE=TinyLog; + +INSERT INTO 02155_test_table VALUES (0, 'Value'); + +DROP DICTIONARY IF EXISTS 02155_test_dictionary; +CREATE DICTIONARY 02155_test_dictionary +( + id UInt64, + value String +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE '02155_test_table')) +LAYOUT(DIRECT()); + +SELECT name, comment FROM system.dictionaries WHERE name == '02155_test_dictionary' AND database == currentDatabase(); + +ALTER TABLE 02155_test_dictionary COMMENT COLUMN value 'value_column'; --{serverError 48} + +ALTER TABLE 02155_test_dictionary MODIFY COMMENT '02155_test_dictionary_comment_0'; +SELECT name, comment FROM system.dictionaries WHERE name == '02155_test_dictionary' AND database == currentDatabase(); +SELECT name, comment FROM system.tables WHERE name == '02155_test_dictionary' AND database == currentDatabase(); + +SELECT * FROM 02155_test_dictionary; +SELECT name, comment FROM system.dictionaries WHERE name == '02155_test_dictionary' AND database == currentDatabase(); +SELECT name, comment FROM system.tables WHERE name == '02155_test_dictionary' AND database == currentDatabase(); + +ALTER TABLE 02155_test_dictionary MODIFY COMMENT '02155_test_dictionary_comment_1'; +SELECT name, comment FROM system.dictionaries WHERE name == '02155_test_dictionary' AND database == currentDatabase(); +SELECT name, comment FROM system.tables WHERE name == '02155_test_dictionary' AND database == currentDatabase(); + +DROP TABLE IF EXISTS 02155_test_dictionary_view; +CREATE TABLE 02155_test_dictionary_view +( + id UInt64, + value String +) 
ENGINE=Dictionary(concat(currentDatabase(), '.02155_test_dictionary')); + +SELECT * FROM 02155_test_dictionary_view; + +ALTER TABLE 02155_test_dictionary_view COMMENT COLUMN value 'value_column'; --{serverError 48} + +ALTER TABLE 02155_test_dictionary_view MODIFY COMMENT '02155_test_dictionary_view_comment_0'; +SELECT name, comment FROM system.tables WHERE name == '02155_test_dictionary_view' AND database == currentDatabase(); +SELECT name, comment FROM system.tables WHERE name == '02155_test_dictionary_view' AND database == currentDatabase(); + +DROP TABLE 02155_test_dictionary_view; +DROP TABLE 02155_test_table; +DROP DICTIONARY 02155_test_dictionary; diff --git a/tests/queries/0_stateless/02155_parse_date_lowcard_default_throw.reference b/tests/queries/0_stateless/02155_parse_date_lowcard_default_throw.reference new file mode 100644 index 00000000000..e599dcc71e5 --- /dev/null +++ b/tests/queries/0_stateless/02155_parse_date_lowcard_default_throw.reference @@ -0,0 +1 @@ +2016-07-15 00:00:00 diff --git a/tests/queries/0_stateless/02155_parse_date_lowcard_default_throw.sql b/tests/queries/0_stateless/02155_parse_date_lowcard_default_throw.sql new file mode 100644 index 00000000000..703cf1fed7a --- /dev/null +++ b/tests/queries/0_stateless/02155_parse_date_lowcard_default_throw.sql @@ -0,0 +1 @@ +SELECT parseDateTimeBestEffort(toLowCardinality(materialize('15-JUL-16'))); diff --git a/tests/queries/0_stateless/02156_async_insert_query_log.reference b/tests/queries/0_stateless/02156_async_insert_query_log.reference new file mode 100644 index 00000000000..404dbfe753d --- /dev/null +++ b/tests/queries/0_stateless/02156_async_insert_query_log.reference @@ -0,0 +1,4 @@ +1 a +2 b +INSERT INTO async_inserts_2156 VALUES 1 Insert 1 0 +INSERT INTO async_inserts_2156 VALUES 1 Insert 1 diff --git a/tests/queries/0_stateless/02156_async_insert_query_log.sh b/tests/queries/0_stateless/02156_async_insert_query_log.sh new file mode 100755 index 00000000000..d7177fbe70c --- /dev/null +++ b/tests/queries/0_stateless/02156_async_insert_query_log.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS async_inserts_2156" +${CLICKHOUSE_CLIENT} -q "CREATE TABLE async_inserts_2156 (id UInt32, s String) ENGINE = Memory" + +${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&async_insert=1&wait_for_async_insert=0" -d "INSERT INTO async_inserts_2156 VALUES (1, 'a')" +${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&async_insert=1&wait_for_async_insert=1" -d "INSERT INTO async_inserts_2156 VALUES (2, 'b')" + +${CLICKHOUSE_CLIENT} -q "SELECT * FROM async_inserts_2156 ORDER BY id" + +${CLICKHOUSE_CLIENT} -q "SYSTEM FLUSH LOGS" + +${CLICKHOUSE_CLIENT} -q "SELECT query, arrayExists(x -> x LIKE '%async_inserts_2156', tables), \ + query_kind, Settings['async_insert'], Settings['wait_for_async_insert'] FROM system.query_log \ + WHERE event_date >= yesterday() AND current_database = '$CLICKHOUSE_DATABASE' \ + AND query ILIKE 'INSERT INTO async_inserts_2156 VALUES%' AND type = 'QueryFinish' \ + ORDER BY query_start_time_microseconds" + +${CLICKHOUSE_CLIENT} -q "DROP TABLE async_inserts_2156" diff --git a/tests/queries/0_stateless/02157_line_as_string_output_format.reference b/tests/queries/0_stateless/02157_line_as_string_output_format.reference new file mode 100644 index 00000000000..196aafcda30 --- /dev/null +++ b/tests/queries/0_stateless/02157_line_as_string_output_format.reference @@ -0,0 +1 @@ +Hello \ World diff --git a/tests/queries/0_stateless/02157_line_as_string_output_format.sql b/tests/queries/0_stateless/02157_line_as_string_output_format.sql new file mode 100644 index 00000000000..f1c567cf41d --- /dev/null +++ b/tests/queries/0_stateless/02157_line_as_string_output_format.sql @@ -0,0 +1 @@ +SELECT 'Hello \\ World' FORMAT LineAsString; diff --git a/tests/queries/0_stateless/02157_readonly_system_suspend.reference b/tests/queries/0_stateless/02157_readonly_system_suspend.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/02157_readonly_system_suspend.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/02157_readonly_system_suspend.sh b/tests/queries/0_stateless/02157_readonly_system_suspend.sh new file mode 100755 index 00000000000..77fe7b5f291 --- /dev/null +++ b/tests/queries/0_stateless/02157_readonly_system_suspend.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL=none + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT --readonly 1 --query "SYSTEM SUSPEND FOR 1 SECOND" 2>&1 | grep -c -F 'Code: 164' diff --git a/tests/queries/0_stateless/02158_contingency.reference b/tests/queries/0_stateless/02158_contingency.reference new file mode 100644 index 00000000000..ac475c7f204 --- /dev/null +++ b/tests/queries/0_stateless/02158_contingency.reference @@ -0,0 +1,5 @@ +0 0 -0 -0 0 +1 nan -1 -1 0.7 +0.95 0.95 -1 -1 0.23 +0.89 0.87 -0.7 -1 0.14 +0.95 0.89 -1 -0.89 0.23 diff --git a/tests/queries/0_stateless/02158_contingency.sql b/tests/queries/0_stateless/02158_contingency.sql new file mode 100644 index 00000000000..d1e1c76c066 --- /dev/null +++ b/tests/queries/0_stateless/02158_contingency.sql @@ -0,0 +1,5 @@ +SELECT round(cramersV(a, b), 2), round(cramersVBiasCorrected(a, b), 2), round(theilsU(a, b), 2), round(theilsU(b, a), 2), round(contingency(a, b), 2) FROM (SELECT number % 3 AS a, number % 5 AS b FROM numbers(150)); +SELECT round(cramersV(a, b), 2), round(cramersVBiasCorrected(a, b), 2), round(theilsU(a, b), 2), round(theilsU(b, a), 2), round(contingency(a, b), 2) FROM (SELECT number AS a, number + 1 AS b FROM numbers(150)); +SELECT round(cramersV(a, b), 2), round(cramersVBiasCorrected(a, b), 2), round(theilsU(a, b), 2), round(theilsU(b, a), 2), round(contingency(a, b), 2) FROM (SELECT number % 10 AS a, number % 10 AS b FROM numbers(150)); +SELECT round(cramersV(a, b), 2), round(cramersVBiasCorrected(a, b), 2), round(theilsU(a, b), 2), round(theilsU(b, a), 2), round(contingency(a, b), 2) FROM (SELECT number % 10 AS a, number % 5 AS b FROM numbers(150)); +SELECT round(cramersV(a, b), 2), round(cramersVBiasCorrected(a, b), 2), round(theilsU(a, b), 2), round(theilsU(b, a), 2), round(contingency(a, b), 2) FROM (SELECT number % 10 AS a, number % 10 = 0 ? 
number : a AS b FROM numbers(150)); diff --git a/tests/queries/0_stateless/02158_interval_length_sum.reference b/tests/queries/0_stateless/02158_interval_length_sum.reference new file mode 100644 index 00000000000..b4de3947675 --- /dev/null +++ b/tests/queries/0_stateless/02158_interval_length_sum.reference @@ -0,0 +1 @@ +11 diff --git a/tests/queries/0_stateless/02158_interval_length_sum.sql b/tests/queries/0_stateless/02158_interval_length_sum.sql new file mode 100644 index 00000000000..af22a707caf --- /dev/null +++ b/tests/queries/0_stateless/02158_interval_length_sum.sql @@ -0,0 +1 @@ +SELECT intervalLengthSum(x, y) FROM values('x Int64, y Int64', (0, 10), (5, 5), (5, 6), (1, -1)); diff --git a/tests/queries/0_stateless/02159_left_right.reference b/tests/queries/0_stateless/02159_left_right.reference new file mode 100644 index 00000000000..8d7ba1686a7 --- /dev/null +++ b/tests/queries/0_stateless/02159_left_right.reference @@ -0,0 +1,230 @@ +-- { echo } + +SELECT left('Hello', 3); +Hel +SELECT left('Hello', -3); +He +SELECT left('Hello', 5); +Hello +SELECT left('Hello', -5); + +SELECT left('Hello', 6); +Hello +SELECT left('Hello', -6); + +SELECT left('Hello', 0); + +SELECT left('Hello', NULL); +\N +SELECT left(materialize('Привет'), 4); +Пр +SELECT LEFT('Привет', -4); +Прив +SELECT left(toNullable('Привет'), 12); +Привет +SELECT lEFT('Привет', -12); + +SELECT left(materialize(toNullable('Привет')), 13); +Привет +SELECT left('Привет', -13); + +SELECT Left('Привет', 0); + +SELECT left('Привет', NULL); +\N +SELECT leftUTF8('Привет', 4); +Прив +SELECT leftUTF8('Привет', -4); +Пр +SELECT leftUTF8('Привет', 12); +Привет +SELECT leftUTF8('Привет', -12); + +SELECT leftUTF8('Привет', 13); +Привет +SELECT leftUTF8('Привет', -13); + +SELECT leftUTF8('Привет', 0); + +SELECT leftUTF8('Привет', NULL); +\N +SELECT left('Hello', number) FROM numbers(10); + +H +He +Hel +Hell +Hello +Hello +Hello +Hello +Hello +SELECT leftUTF8('Привет', number) FROM numbers(10); + +П +Пр +При +Прив +Приве +Привет +Привет +Привет +Привет +SELECT left('Hello', -number) FROM numbers(10); + +Hell +Hel +He +H + + + + + +SELECT leftUTF8('Привет', -number) FROM numbers(10); + +Приве +Прив +При +Пр +П + + + + +SELECT leftUTF8('Привет', number % 3 = 0 ? NULL : (number % 2 ? toInt64(number) : -number)) FROM numbers(10); +\N +П +Прив +\N +Пр +Приве +\N +Привет + +\N +SELECT leftUTF8(number < 5 ? 'Hello' : 'Привет', number % 3 = 0 ? NULL : (number % 2 ? 
toInt64(number) : -number)) FROM numbers(10); +\N +H +Hel +\N +H +Приве +\N +Привет + +\N +SELECT right('Hello', 3); +llo +SELECT right('Hello', -3); +lo +SELECT right('Hello', 5); +Hello +SELECT right('Hello', -5); + +SELECT right('Hello', 6); +Hello +SELECT right('Hello', -6); + +SELECT right('Hello', 0); + +SELECT right('Hello', NULL); +\N +SELECT RIGHT(materialize('Привет'), 4); +ет +SELECT right('Привет', -4); +ивет +SELECT Right(toNullable('Привет'), 12); +Привет +SELECT right('Привет', -12); + +SELECT rIGHT(materialize(toNullable('Привет')), 13); +Привет +SELECT right('Привет', -13); + +SELECT rIgHt('Привет', 0); + +SELECT RiGhT('Привет', NULL); +\N +SELECT rightUTF8('Привет', 4); +ивет +SELECT rightUTF8('Привет', -4); +ет +SELECT rightUTF8('Привет', 12); +Привет +SELECT rightUTF8('Привет', -12); + +SELECT rightUTF8('Привет', 13); +Привет +SELECT rightUTF8('Привет', -13); + +SELECT rightUTF8('Привет', 0); + +SELECT rightUTF8('Привет', NULL); +\N +SELECT right('Hello', number) FROM numbers(10); + +o +lo +llo +ello +Hello +Hello +Hello +Hello +Hello +SELECT rightUTF8('Привет', number) FROM numbers(10); + +т +ет +вет +ивет +ривет +Привет +Привет +Привет +Привет +SELECT right('Hello', -number) FROM numbers(10); + +ello +llo +lo +o + + + + + +SELECT rightUTF8('Привет', -number) FROM numbers(10); + +ривет +ивет +вет +ет +т + + + + +SELECT rightUTF8('Привет', number % 3 = 0 ? NULL : (number % 2 ? toInt64(number) : -number)) FROM numbers(10); +\N +т +ивет +\N +ет +ривет +\N +Привет + +\N +SELECT rightUTF8(number < 5 ? 'Hello' : 'Привет', number % 3 = 0 ? NULL : (number % 2 ? toInt64(number) : -number)) FROM numbers(10); +\N +o +llo +\N +o +ривет +\N +Привет + +\N diff --git a/tests/queries/0_stateless/02159_left_right.sql b/tests/queries/0_stateless/02159_left_right.sql new file mode 100644 index 00000000000..a45ca3db961 --- /dev/null +++ b/tests/queries/0_stateless/02159_left_right.sql @@ -0,0 +1,71 @@ +-- { echo } + +SELECT left('Hello', 3); +SELECT left('Hello', -3); +SELECT left('Hello', 5); +SELECT left('Hello', -5); +SELECT left('Hello', 6); +SELECT left('Hello', -6); +SELECT left('Hello', 0); +SELECT left('Hello', NULL); + +SELECT left(materialize('Привет'), 4); +SELECT LEFT('Привет', -4); +SELECT left(toNullable('Привет'), 12); +SELECT lEFT('Привет', -12); +SELECT left(materialize(toNullable('Привет')), 13); +SELECT left('Привет', -13); +SELECT Left('Привет', 0); +SELECT left('Привет', NULL); + +SELECT leftUTF8('Привет', 4); +SELECT leftUTF8('Привет', -4); +SELECT leftUTF8('Привет', 12); +SELECT leftUTF8('Привет', -12); +SELECT leftUTF8('Привет', 13); +SELECT leftUTF8('Привет', -13); +SELECT leftUTF8('Привет', 0); +SELECT leftUTF8('Привет', NULL); + +SELECT left('Hello', number) FROM numbers(10); +SELECT leftUTF8('Привет', number) FROM numbers(10); +SELECT left('Hello', -number) FROM numbers(10); +SELECT leftUTF8('Привет', -number) FROM numbers(10); + +SELECT leftUTF8('Привет', number % 3 = 0 ? NULL : (number % 2 ? toInt64(number) : -number)) FROM numbers(10); +SELECT leftUTF8(number < 5 ? 'Hello' : 'Привет', number % 3 = 0 ? NULL : (number % 2 ? 
toInt64(number) : -number)) FROM numbers(10); + +SELECT right('Hello', 3); +SELECT right('Hello', -3); +SELECT right('Hello', 5); +SELECT right('Hello', -5); +SELECT right('Hello', 6); +SELECT right('Hello', -6); +SELECT right('Hello', 0); +SELECT right('Hello', NULL); + +SELECT RIGHT(materialize('Привет'), 4); +SELECT right('Привет', -4); +SELECT Right(toNullable('Привет'), 12); +SELECT right('Привет', -12); +SELECT rIGHT(materialize(toNullable('Привет')), 13); +SELECT right('Привет', -13); +SELECT rIgHt('Привет', 0); +SELECT RiGhT('Привет', NULL); + +SELECT rightUTF8('Привет', 4); +SELECT rightUTF8('Привет', -4); +SELECT rightUTF8('Привет', 12); +SELECT rightUTF8('Привет', -12); +SELECT rightUTF8('Привет', 13); +SELECT rightUTF8('Привет', -13); +SELECT rightUTF8('Привет', 0); +SELECT rightUTF8('Привет', NULL); + +SELECT right('Hello', number) FROM numbers(10); +SELECT rightUTF8('Привет', number) FROM numbers(10); +SELECT right('Hello', -number) FROM numbers(10); +SELECT rightUTF8('Привет', -number) FROM numbers(10); + +SELECT rightUTF8('Привет', number % 3 = 0 ? NULL : (number % 2 ? toInt64(number) : -number)) FROM numbers(10); +SELECT rightUTF8(number < 5 ? 'Hello' : 'Привет', number % 3 = 0 ? NULL : (number % 2 ? toInt64(number) : -number)) FROM numbers(10); diff --git a/tests/queries/0_stateless/02160_monthname.reference b/tests/queries/0_stateless/02160_monthname.reference new file mode 100644 index 00000000000..a3386cb33c7 --- /dev/null +++ b/tests/queries/0_stateless/02160_monthname.reference @@ -0,0 +1,12 @@ +January January January +February February February +March March March +April April April +May May May +June June June +July July July +August August August +September September September +October October October +November November November +December December December diff --git a/tests/queries/0_stateless/02160_monthname.sql b/tests/queries/0_stateless/02160_monthname.sql new file mode 100644 index 00000000000..2c5bd5b576b --- /dev/null +++ b/tests/queries/0_stateless/02160_monthname.sql @@ -0,0 +1,71 @@ +WITH + toDate('2021-01-14') AS date_value, + toDateTime('2021-01-14 11:22:33') AS date_time_value, + toDateTime64('2021-01-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); + +WITH + toDate('2021-02-14') AS date_value, + toDateTime('2021-02-14 11:22:33') AS date_time_value, + toDateTime64('2021-02-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); + +WITH + toDate('2021-03-14') AS date_value, + toDateTime('2021-03-14 11:22:33') AS date_time_value, + toDateTime64('2021-03-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); + +WITH + toDate('2021-04-14') AS date_value, + toDateTime('2021-04-14 11:22:33') AS date_time_value, + toDateTime64('2021-04-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); + +WITH + toDate('2021-05-14') AS date_value, + toDateTime('2021-05-14 11:22:33') AS date_time_value, + toDateTime64('2021-05-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); + +WITH + toDate('2021-06-14') AS date_value, + toDateTime('2021-06-14 11:22:33') AS date_time_value, + toDateTime64('2021-06-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), 
monthName(date_time_64_value); + +WITH + toDate('2021-07-14') AS date_value, + toDateTime('2021-07-14 11:22:33') AS date_time_value, + toDateTime64('2021-07-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); + +WITH + toDate('2021-08-14') AS date_value, + toDateTime('2021-08-14 11:22:33') AS date_time_value, + toDateTime64('2021-08-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); + +WITH + toDate('2021-09-14') AS date_value, + toDateTime('2021-09-14 11:22:33') AS date_time_value, + toDateTime64('2021-09-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); + +WITH + toDate('2021-10-14') AS date_value, + toDateTime('2021-10-14 11:22:33') AS date_time_value, + toDateTime64('2021-10-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); + +WITH + toDate('2021-11-14') AS date_value, + toDateTime('2021-11-14 11:22:33') AS date_time_value, + toDateTime64('2021-11-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); + +WITH + toDate('2021-12-14') AS date_value, + toDateTime('2021-12-14 11:22:33') AS date_time_value, + toDateTime64('2021-12-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); diff --git a/tests/queries/0_stateless/02160_special_functions.reference b/tests/queries/0_stateless/02160_special_functions.reference new file mode 100644 index 00000000000..3a1dcd88902 --- /dev/null +++ b/tests/queries/0_stateless/02160_special_functions.reference @@ -0,0 +1,36 @@ +1 +[] +1 +world +world +world +world +def +abc +bcde +abcdef +abcdef + abcdef +2022 +Hello +3 +3 +2023-01-01 +2023-01-01 +2023-01-01 +2023-01-01 +2023-01-01 +2023-01-01 +2023-01-01 +2023-01-01 +2021-01-01 +2021-01-01 +2021-01-01 +2021-01-01 +2021-01-01 +2021-01-01 +2021-01-01 +2021-01-01 +1 +1 +1 diff --git a/tests/queries/0_stateless/02160_special_functions.sql b/tests/queries/0_stateless/02160_special_functions.sql new file mode 100644 index 00000000000..6d18e7d0d25 --- /dev/null +++ b/tests/queries/0_stateless/02160_special_functions.sql @@ -0,0 +1,44 @@ +SELECT CAST(1 AS UInt8); +SELECT CAST([] AS Array(UInt8)); +SELECT CAST(1, 'UInt8'); + +SELECT SUBSTRING('Hello, world' FROM 8); +SELECT SUBSTRING('Hello, world' FROM 8 FOR 5); +SELECT SUBSTRING('Hello, world', 8); +SELECT SUBSTRING('Hello, world', 8, 5); + +SELECT TRIM(LEADING 'abc' FROM 'abcdef'); +SELECT TRIM(TRAILING 'def' FROM 'abcdef'); +SELECT TRIM(BOTH 'af' FROM 'abcdef'); +SELECT TRIM(' abcdef '); +SELECT LTRIM(' abcdef '); +SELECT RTRIM(' abcdef '); + +SELECT EXTRACT(YEAR FROM DATE '2022-01-01'); +SELECT EXTRACT('Hello, world', '^\w+'); + +SELECT POSITION('ll' IN 'Hello'); +SELECT POSITION('Hello', 'll'); + +SELECT DATE_ADD(YEAR, 1, DATE '2022-01-01'); +SELECT DATE_ADD(INTERVAL 1 YEAR, DATE '2022-01-01'); +SELECT DATEADD(YEAR, 1, DATE '2022-01-01'); +SELECT DATEADD(INTERVAL 1 YEAR, DATE '2022-01-01'); +SELECT TIMESTAMP_ADD(YEAR, 1, DATE '2022-01-01'); +SELECT TIMESTAMP_ADD(INTERVAL 1 YEAR, DATE '2022-01-01'); +SELECT TIMESTAMPADD(YEAR, 1, DATE '2022-01-01'); +SELECT TIMESTAMPADD(INTERVAL 1 YEAR, DATE '2022-01-01'); + +SELECT DATE_SUB(YEAR, 1, DATE '2022-01-01'); +SELECT DATE_SUB(DATE '2022-01-01', INTERVAL 1 YEAR); 
+SELECT DATESUB(YEAR, 1, DATE '2022-01-01'); +SELECT DATESUB(DATE '2022-01-01', INTERVAL 1 YEAR); +SELECT TIMESTAMP_SUB(YEAR, 1, DATE '2022-01-01'); +SELECT TIMESTAMP_SUB(DATE '2022-01-01', INTERVAL 1 YEAR); +SELECT TIMESTAMPSUB(YEAR, 1, DATE '2022-01-01'); +SELECT TIMESTAMPSUB(DATE '2022-01-01', INTERVAL 1 YEAR); + +SELECT DATE_DIFF(YEAR, DATE '2021-01-01', DATE '2022-01-01'); +SELECT DATEDIFF(YEAR, DATE '2021-01-01', DATE '2022-01-01'); + +SELECT EXISTS (SELECT 1); diff --git a/tests/queries/0_stateless/02160_untuple_exponential_growth.reference b/tests/queries/0_stateless/02160_untuple_exponential_growth.reference new file mode 100644 index 00000000000..6ed281c757a --- /dev/null +++ b/tests/queries/0_stateless/02160_untuple_exponential_growth.reference @@ -0,0 +1,2 @@ +1 +1 diff --git a/tests/queries/0_stateless/02160_untuple_exponential_growth.sh b/tests/queries/0_stateless/02160_untuple_exponential_growth.sh new file mode 100755 index 00000000000..9ec6594af69 --- /dev/null +++ b/tests/queries/0_stateless/02160_untuple_exponential_growth.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# Should finish in reasonable time (milliseconds). +# In previous versions this query led to exponential complexity of query analysis. + +${CLICKHOUSE_LOCAL} --query "SELECT untuple(tuple(untuple((1, untuple((untuple(tuple(untuple(tuple(untuple((untuple((1, 1, 1, 1)), 1, 1, 1)))))), 1, 1))))))" 2>&1 | grep -cF 'TOO_BIG_AST' +${CLICKHOUSE_LOCAL} --query "SELECT untuple(tuple(untuple(tuple(untuple(tuple(untuple(tuple(untuple(tuple(untuple(tuple(untuple(tuple(untuple(tuple(untuple(tuple(untuple(tuple(untuple(tuple(untuple(tuple(untuple((1, 1, 1, 1, 1))))))))))))))))))))))))))" 2>&1 | grep -cF 'TOO_BIG_AST' diff --git a/tests/queries/0_stateless/02161_array_first_last.reference b/tests/queries/0_stateless/02161_array_first_last.reference new file mode 100644 index 00000000000..25734a6f01c --- /dev/null +++ b/tests/queries/0_stateless/02161_array_first_last.reference @@ -0,0 +1,18 @@ +ArrayFirst constant predicate +0 +0 +1 +0 +ArrayFirst non constant predicate +0 +2 +2 +ArrayLast constant predicate +0 +0 +3 +0 +ArrayLast non constant predicate +0 +3 +3 diff --git a/tests/queries/0_stateless/02161_array_first_last.sql b/tests/queries/0_stateless/02161_array_first_last.sql new file mode 100644 index 00000000000..f5be8cd26df --- /dev/null +++ b/tests/queries/0_stateless/02161_array_first_last.sql @@ -0,0 +1,21 @@ +SELECT 'ArrayFirst constant predicate'; +SELECT arrayFirst(x -> 1, emptyArrayUInt8()); +SELECT arrayFirst(x -> 0, emptyArrayUInt8()); +SELECT arrayFirst(x -> 1, [1, 2, 3]); +SELECT arrayFirst(x -> 0, [1, 2, 3]); + +SELECT 'ArrayFirst non constant predicate'; +SELECT arrayFirst(x -> x >= 2, emptyArrayUInt8()); +SELECT arrayFirst(x -> x >= 2, [1, 2, 3]); +SELECT arrayFirst(x -> x >= 2, materialize([1, 2, 3])); + +SELECT 'ArrayLast constant predicate'; +SELECT arrayLast(x -> 1, emptyArrayUInt8()); +SELECT arrayLast(x -> 0, emptyArrayUInt8()); +SELECT arrayLast(x -> 1, [1, 2, 3]); +SELECT arrayLast(x -> 0, [1, 2, 3]); + +SELECT 'ArrayLast non constant predicate'; +SELECT arrayLast(x -> x >= 2, emptyArrayUInt8()); +SELECT arrayLast(x -> x >= 2, [1, 2, 3]); +SELECT arrayLast(x -> x >= 2, materialize([1, 2, 3])); diff --git a/tests/queries/0_stateless/02162_array_first_last_index.reference b/tests/queries/0_stateless/02162_array_first_last_index.reference new file mode 100644 index 
00000000000..24bd1442598 --- /dev/null +++ b/tests/queries/0_stateless/02162_array_first_last_index.reference @@ -0,0 +1,18 @@ +ArrayFirstIndex constant predicate +0 +0 +1 +0 +ArrayFirstIndex non constant predicate +0 +2 +2 +ArrayLastIndex constant predicate +0 +0 +3 +0 +ArrayLastIndex non constant predicate +0 +3 +3 diff --git a/tests/queries/0_stateless/02162_array_first_last_index.sql b/tests/queries/0_stateless/02162_array_first_last_index.sql new file mode 100644 index 00000000000..af107f0f4c9 --- /dev/null +++ b/tests/queries/0_stateless/02162_array_first_last_index.sql @@ -0,0 +1,21 @@ +SELECT 'ArrayFirstIndex constant predicate'; +SELECT arrayFirstIndex(x -> 1, emptyArrayUInt8()); +SELECT arrayFirstIndex(x -> 0, emptyArrayUInt8()); +SELECT arrayFirstIndex(x -> 1, [1, 2, 3]); +SELECT arrayFirstIndex(x -> 0, [1, 2, 3]); + +SELECT 'ArrayFirstIndex non constant predicate'; +SELECT arrayFirstIndex(x -> x >= 2, emptyArrayUInt8()); +SELECT arrayFirstIndex(x -> x >= 2, [1, 2, 3]); +SELECT arrayFirstIndex(x -> x >= 2, [1, 2, 3]); + +SELECT 'ArrayLastIndex constant predicate'; +SELECT arrayLastIndex(x -> 1, emptyArrayUInt8()); +SELECT arrayLastIndex(x -> 0, emptyArrayUInt8()); +SELECT arrayLastIndex(x -> 1, [1, 2, 3]); +SELECT arrayLastIndex(x -> 0, materialize([1, 2, 3])); + +SELECT 'ArrayLastIndex non constant predicate'; +SELECT arrayLastIndex(x -> x >= 2, emptyArrayUInt8()); +SELECT arrayLastIndex(x -> x >= 2, [1, 2, 3]); +SELECT arrayLastIndex(x -> x >= 2, materialize([1, 2, 3])); diff --git a/tests/queries/0_stateless/02162_range_hashed_dictionary_ddl_expression.reference b/tests/queries/0_stateless/02162_range_hashed_dictionary_ddl_expression.reference new file mode 100644 index 00000000000..d366ce64c27 --- /dev/null +++ b/tests/queries/0_stateless/02162_range_hashed_dictionary_ddl_expression.reference @@ -0,0 +1 @@ +0 1 1 Value 1 diff --git a/tests/queries/0_stateless/02162_range_hashed_dictionary_ddl_expression.sql b/tests/queries/0_stateless/02162_range_hashed_dictionary_ddl_expression.sql new file mode 100644 index 00000000000..24eb08137e1 --- /dev/null +++ b/tests/queries/0_stateless/02162_range_hashed_dictionary_ddl_expression.sql @@ -0,0 +1,29 @@ +DROP TABLE IF EXISTS 02162_test_table; +CREATE TABLE 02162_test_table +( + id UInt64, + value String, + range_value UInt64 +) ENGINE=TinyLog; + +INSERT INTO 02162_test_table VALUES (0, 'Value', 1); + +DROP DICTIONARY IF EXISTS 02162_test_dictionary; +CREATE DICTIONARY 02162_test_dictionary +( + id UInt64, + value String, + range_value UInt64, + start UInt64 EXPRESSION range_value, + end UInt64 EXPRESSION range_value +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE '02162_test_table')) +LAYOUT(RANGE_HASHED()) +RANGE(MIN start MAX end) +LIFETIME(0); + +SELECT * FROM 02162_test_dictionary; + +DROP DICTIONARY 02162_test_dictionary; +DROP TABLE 02162_test_table; diff --git a/tests/queries/0_stateless/02163_operators.reference b/tests/queries/0_stateless/02163_operators.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/02163_operators.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/02163_operators.sql b/tests/queries/0_stateless/02163_operators.sql new file mode 100644 index 00000000000..4968e448ab2 --- /dev/null +++ b/tests/queries/0_stateless/02163_operators.sql @@ -0,0 +1,2 @@ +WITH 2 AS `b.c`, [4, 5] AS a, 6 AS u, 3 AS v, 2 AS d, TRUE AS e, 1 AS f, 0 AS g, 2 AS h, 'Hello' AS i, 'World' AS j, TIMESTAMP '2022-02-02 02:02:02' AS w, [] AS k, (1, 2) AS l, 2 AS m, 3 AS n, 
[] AS o, [1] AS p, 1 AS q, q AS r, 1 AS s, 1 AS t +SELECT INTERVAL CASE CASE WHEN NOT -a[b.c] * u DIV v + d IS NOT NULL AND e OR f BETWEEN g AND h THEN i ELSE j END WHEN w THEN k END || [l, (m, n)] MINUTE IS NULL OR NOT o::Array(INT) = p <> q < r > s != t AS upyachka; diff --git a/tests/queries/0_stateless/data_csv/csv_with_slash.csv b/tests/queries/0_stateless/data_csv/csv_with_slash.csv new file mode 100644 index 00000000000..0f2c166faa8 --- /dev/null +++ b/tests/queries/0_stateless/data_csv/csv_with_slash.csv @@ -0,0 +1,30 @@ +0,\asdf,2000-01-01 +1,x\x\,2000-01-01 +2,x\x,2000-01-01 +3,x\,2000-01-01 +4,x\,2000-01-01 +5,\x,2000-01-01 +6,\N,2000-01-01 +7,\r\n,2000-01-01 +8,\\r\\n,2000-01-01 +9,x\\,2000-01-01 +10,'\asdf',2000-01-01 +11,'x\x\',2000-01-01 +12,'x\x',2000-01-01 +13,'x\',2000-01-01 +14,'x\',2000-01-01 +15,'\x',2000-01-01 +16,'\N',2000-01-01 +17,'\r\n',2000-01-01 +18,"\\r\\n",2000-01-01 +19,"x\\",2000-01-01 +20,"\asdf",2000-01-01 +21,"x\x\",2000-01-01 +22,"x\x",2000-01-01 +23,"x\",2000-01-01 +24,"x\",2000-01-01 +25,"\x",2000-01-01 +26,"\N",2000-01-01 +27,"\r\n",2000-01-01 +28,"\\r\\n",2000-01-01 +29,"x\\",2000-01-01 diff --git a/tests/queries/0_stateless/format_schemas/00825_protobuf_format_enum_mapping.proto b/tests/queries/0_stateless/format_schemas/00825_protobuf_format_enum_mapping.proto index ba558dbbadb..048a689d021 100644 --- a/tests/queries/0_stateless/format_schemas/00825_protobuf_format_enum_mapping.proto +++ b/tests/queries/0_stateless/format_schemas/00825_protobuf_format_enum_mapping.proto @@ -1,6 +1,6 @@ syntax = "proto3"; -message Message +message EnumMessage { enum Enum { @@ -10,4 +10,4 @@ message Message HUNDRED = 100; }; Enum x = 1; -}; \ No newline at end of file +}; diff --git a/tests/queries/0_stateless/helpers/02112_clean.sh b/tests/queries/0_stateless/helpers/02112_clean.sh index 910c0709955..95af0cede9c 100755 --- a/tests/queries/0_stateless/helpers/02112_clean.sh +++ b/tests/queries/0_stateless/helpers/02112_clean.sh @@ -1,6 +1,5 @@ #!/usr/bin/env bash -FILE=${CURDIR}/file_02112 -if [ -f $FILE ]; then - rm $FILE -fi +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +FILE=${CURDIR}/../file_02112 +rm "$FILE" diff --git a/tests/queries/0_stateless/helpers/02112_prepare.sh b/tests/queries/0_stateless/helpers/02112_prepare.sh index 1f371789f86..c2791b01140 100755 --- a/tests/queries/0_stateless/helpers/02112_prepare.sh +++ b/tests/queries/0_stateless/helpers/02112_prepare.sh @@ -1,7 +1,5 @@ #!/usr/bin/env bash -FILE=${CURDIR}/file_02112 -if [ -f $FILE ]; then - rm $FILE -fi -echo "drop table if exists t;create table t(i Int32) engine=Memory; insert into t select 1" >> $FILE +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +FILE=${CURDIR}/../file_02112 +echo "drop table if exists t;create table t(i Int32) engine=Memory; insert into t select 1" > "$FILE" diff --git a/tests/queries/1_stateful/00168_parallel_processing_on_replicas_part_1.sh b/tests/queries/1_stateful/00168_parallel_processing_on_replicas_part_1.sh index d025dae5b2e..699700bcd3e 100755 --- a/tests/queries/1_stateful/00168_parallel_processing_on_replicas_part_1.sh +++ b/tests/queries/1_stateful/00168_parallel_processing_on_replicas_part_1.sh @@ -48,6 +48,10 @@ SkipList=( for TESTPATH in "$CURDIR"/*.sql; do TESTNAME=$(basename $TESTPATH) + NUM=$(echo "${TESTNAME}" | grep -o -P '^\d+' | sed 's/^0*//') + if [[ "${NUM}" -ge 168 ]]; then + continue + fi if [[ " ${SkipList[*]} " =~ ${TESTNAME} ]]; then echo "Skipping $TESTNAME " diff --git a/tests/queries/1_stateful/00169_contingency.reference 
b/tests/queries/1_stateful/00169_contingency.reference new file mode 100644 index 00000000000..8e881081445 --- /dev/null +++ b/tests/queries/1_stateful/00169_contingency.reference @@ -0,0 +1,5 @@ +1 1 -1 -1 0.09 +0.49 0.49 -0.45 -0.69 0.03 +0.81 0.81 -0.91 -0.85 0.08 +0.96 0.96 -0.9 -0.98 0.14 +0.6 0.6 -0.78 -0.8 0.01 diff --git a/tests/queries/1_stateful/00169_contingency.sql b/tests/queries/1_stateful/00169_contingency.sql new file mode 100644 index 00000000000..cc44bba8509 --- /dev/null +++ b/tests/queries/1_stateful/00169_contingency.sql @@ -0,0 +1,14 @@ +WITH URLDomain AS a, URLDomain AS b +SELECT round(cramersV(a, b), 2), round(cramersVBiasCorrected(a, b), 2), round(theilsU(a, b), 2), round(theilsU(b, a), 2), round(contingency(a, b), 2) FROM test.hits; + +WITH URLDomain AS a, RefererDomain AS b +SELECT round(cramersV(a, b), 2), round(cramersVBiasCorrected(a, b), 2), round(theilsU(a, b), 2), round(theilsU(b, a), 2), round(contingency(a, b), 2) FROM test.hits; + +WITH URLDomain AS a, CounterID AS b +SELECT round(cramersV(a, b), 2), round(cramersVBiasCorrected(a, b), 2), round(theilsU(a, b), 2), round(theilsU(b, a), 2), round(contingency(a, b), 2) FROM test.hits; + +WITH ClientIP AS a, RemoteIP AS b +SELECT round(cramersV(a, b), 2), round(cramersVBiasCorrected(a, b), 2), round(theilsU(a, b), 2), round(theilsU(b, a), 2), round(contingency(a, b), 2) FROM test.hits; + +WITH ResolutionWidth AS a, ResolutionHeight AS b +SELECT round(cramersV(a, b), 2), round(cramersVBiasCorrected(a, b), 2), round(theilsU(a, b), 2), round(theilsU(b, a), 2), round(contingency(a, b), 2) FROM test.hits; diff --git a/utils/check-style/check-style b/utils/check-style/check-style index c65099f2582..22b5faa0fcb 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -184,7 +184,6 @@ tables_with_database_column=( tests_with_database_column=( $( find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' | grep -vP $EXCLUDE_DIRS | - grep -v -x -e $ROOT_PATH/tests/queries/query_test.py | xargs grep --with-filename $(printf -- "-e %s " "${tables_with_database_column[@]}") | cut -d: -f1 | sort -u ) ) for test_case in "${tests_with_database_column[@]}"; do @@ -299,6 +298,20 @@ for src in "${sources_with_std_cerr_cout[@]}"; do fi done +# Queries with event_date should have yesterday() not today() +# +# NOTE: it is not that accurate, but at least something. +tests_with_event_time_date=( $( + find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' -or -iname '*.j2' | + grep -vP $EXCLUDE_DIRS | + xargs grep --with-filename -e event_time -e event_date | cut -d: -f1 | sort -u +) ) +for test_case in "${tests_with_event_time_date[@]}"; do + cat "$test_case" | tr '\n' ' ' | grep -q -i -e 'WHERE.*event_date[ ]*=[ ]*today()' -e 'WHERE.*event_date[ ]*=[ ]*today()' && { + echo "event_time/event_date should be filtered using >=yesterday() in $test_case (to avoid flakiness)" + } +done + # Conflict markers find $ROOT_PATH/{src,base,programs,utils,tests,docs,website,cmake} -name '*.md' -or -name '*.cpp' -or -name '*.h' | xargs grep -P '^(<<<<<<<|=======|>>>>>>>)$' | grep -P '.' 
&& echo "Conflict markers are found in files" diff --git a/utils/ci/jobs/quick-build/README.md b/utils/ci/jobs/quick-build/README.md deleted file mode 100644 index 803acae0f93..00000000000 --- a/utils/ci/jobs/quick-build/README.md +++ /dev/null @@ -1,5 +0,0 @@ -## Build with debug mode and without many libraries - -This job is intended as first check that build is not broken on wide variety of platforms. - -Results of this build are not intended for production usage. diff --git a/utils/ci/jobs/quick-build/run.sh b/utils/ci/jobs/quick-build/run.sh deleted file mode 100755 index af977d14465..00000000000 --- a/utils/ci/jobs/quick-build/run.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash -set -e -x - -# How to run: -# From "ci" directory: -# jobs/quick-build/run.sh -# or: -# ./run-with-docker.sh ubuntu:bionic jobs/quick-build/run.sh - -cd "$(dirname $0)"/../.. - -. default-config - -SOURCES_METHOD=local -COMPILER=clang -COMPILER_INSTALL_METHOD=packages -COMPILER_PACKAGE_VERSION=6.0 -BUILD_METHOD=normal -BUILD_TARGETS=clickhouse -BUILD_TYPE=Debug -ENABLE_EMBEDDED_COMPILER=0 - -CMAKE_FLAGS="-D CMAKE_C_FLAGS_ADD=-g0 -D CMAKE_CXX_FLAGS_ADD=-g0 -D ENABLE_JEMALLOC=0 -D ENABLE_CAPNP=0 -D ENABLE_RDKAFKA=0 -D ENABLE_UNWIND=0 -D ENABLE_ICU=0 -D ENABLE_POCO_MONGODB=0 -D ENABLE_POCO_REDIS=0 -D ENABLE_POCO_NETSSL=0 -D ENABLE_ODBC=0 -D ENABLE_MYSQL=0 -D ENABLE_SSL=0 -D ENABLE_POCO_NETSSL=0 -D ENABLE_CASSANDRA=0 -D ENABLE_LDAP=0" - -[[ $(uname) == "FreeBSD" ]] && COMPILER_PACKAGE_VERSION=devel && export COMPILER_PATH=/usr/local/bin - -. get-sources.sh -. prepare-toolchain.sh -. install-libraries.sh -. build-normal.sh diff --git a/utils/ci/vagrant-freebsd/.gitignore b/utils/ci/vagrant-freebsd/.gitignore deleted file mode 100644 index 8000dd9db47..00000000000 --- a/utils/ci/vagrant-freebsd/.gitignore +++ /dev/null @@ -1 +0,0 @@ -.vagrant diff --git a/utils/ci/vagrant-freebsd/Vagrantfile b/utils/ci/vagrant-freebsd/Vagrantfile deleted file mode 100644 index c01ae5fa6e2..00000000000 --- a/utils/ci/vagrant-freebsd/Vagrantfile +++ /dev/null @@ -1,3 +0,0 @@ -Vagrant.configure("2") do |config| - config.vm.box = "generic/freebsd11" -end diff --git a/utils/clickhouse-diagnostics/README.md b/utils/clickhouse-diagnostics/README.md new file mode 100644 index 00000000000..991efefdf5a --- /dev/null +++ b/utils/clickhouse-diagnostics/README.md @@ -0,0 +1,2657 @@ +## Installation + +``` +python3 -m pip install -r requirements.txt +``` + +## Usage + +``` +./clickhouse-diagnostics +``` + +Example output: + +### Diagnostics data for host clickhouse01.test_net_3697 +Version: **21.11.8.4** +Timestamp: **2021-12-25 15:34:02** +Uptime: **13 minutes and 51 seconds** +#### ClickHouse configuration +**result** +```XML + + + + trace + /var/log/clickhouse-server/clickhouse-server.log + /var/log/clickhouse-server/clickhouse-server.err.log + 1000M + 10 + 1 + + 8123 + 9000 + 9004 + 9005 + 9009 + 4096 + 3 + + false + /path/to/ssl_cert_file + /path/to/ssl_key_file + false + /path/to/ssl_ca_cert_file + deflate + medium + -1 + -1 + false + + + + /etc/clickhouse-server/server.crt + /etc/clickhouse-server/server.key + + none + true + true + sslv2,sslv3 + true + + + true + true + sslv2,sslv3,tlsv1,tlsv1_1 + true + + RejectCertificateHandler + + /etc/clickhouse-server/server.crt + /etc/clickhouse-server/server.key + /etc/clickhouse-server/allCAs.pem + + + 100 + 0 + 10000 + 0.9 + 4194304 + 0 + 8589934592 + 5368709120 + 1000 + 134217728 + 10000 + /var/lib/clickhouse/ + /var/lib/clickhouse/tmp/ + /var/lib/clickhouse/user_files/ + + + + 
+    <!-- remainder of the example server configuration (tag markup not preserved): users.xml and the local
+         access control path; remote_servers cluster definitions; zookeeper (zookeeper01.test_net_3697:2281,
+         chroot /clickhouse01) and macros (clickhouse01 / shard1); the system log tables query_log, trace_log,
+         query_thread_log, query_views_log, part_log, metric_log, asynchronous_metric_log, opentelemetry_span_log,
+         crash_log and session_log with their partition_by and flush intervals; dictionaries_config and
+         user_defined_executable_functions_config; distributed_ddl path /clickhouse/task_queue/ddl; graphite_rollup
+         windows; format_schemas path; query_masking_rules for encrypt/decrypt arguments; crash report (sentry)
+         settings; listen_host and secure ports; storage_configuration with default, hdd1, hdd2 and s3 disks and
+         the corresponding default, s3 and tiered policies -->
+``` +#### Access configuration +**query** +```sql +SHOW ACCESS +``` +**result** +``` +CREATE USER default IDENTIFIED WITH plaintext_password SETTINGS PROFILE default +CREATE SETTINGS PROFILE default SETTINGS max_memory_usage = 10000000000, load_balancing = 'random' +CREATE SETTINGS PROFILE readonly SETTINGS readonly = 1 +CREATE QUOTA default KEYED BY user_name FOR INTERVAL 1 hour TRACKING ONLY TO default +GRANT ALL ON *.* TO default WITH GRANT OPTION +``` +#### Quotas +**query** +```sql +SHOW QUOTA +``` +**result** +``` +Row 1: +────── +quota_name: default +quota_key: default +start_time: 2021-12-25 15:00:00 +end_time: 2021-12-25 16:00:00 +duration: 3600 +queries: 49 +max_queries: ᴺᵁᴸᴸ +query_selects: 49 +max_query_selects: ᴺᵁᴸᴸ +query_inserts: 0 +max_query_inserts: ᴺᵁᴸᴸ +errors: 6 +max_errors: ᴺᵁᴸᴸ +result_rows: 607 +max_result_rows: ᴺᵁᴸᴸ +result_bytes: 237632 +max_result_bytes: ᴺᵁᴸᴸ +read_rows: 1256 +max_read_rows: ᴺᵁᴸᴸ +read_bytes: 778936 +max_read_bytes: ᴺᵁᴸᴸ +execution_time: 0 +max_execution_time: ᴺᵁᴸᴸ +``` +#### Schema +##### Database engines +**query** +```sql +SELECT + engine, + count() "count" +FROM system.databases +GROUP BY engine +``` +**result** +``` +┌─engine─┬─count─┐ +│ Memory │ 2 │ +│ Atomic │ 2 │ +└────────┴───────┘ +``` +##### Databases (top 10 by size) +**query** +```sql +SELECT + name, + engine, + tables, + partitions, + parts, + formatReadableSize(bytes_on_disk) "disk_size" +FROM system.databases db +LEFT JOIN +( + SELECT + database, + uniq(table) "tables", + uniq(table, partition) "partitions", + count() AS parts, + sum(bytes_on_disk) "bytes_on_disk" + FROM system.parts + WHERE active + GROUP BY database +) AS db_stats ON db.name = db_stats.database +ORDER BY bytes_on_disk DESC +LIMIT 10 +``` +**result** +``` +┌─name───────────────┬─engine─┬─tables─┬─partitions─┬─parts─┬─disk_size──┐ +│ system │ Atomic │ 6 │ 6 │ 22 │ 716.29 KiB │ +│ INFORMATION_SCHEMA │ Memory │ 0 │ 0 │ 0 │ 0.00 B │ +│ default │ Atomic │ 0 │ 0 │ 0 │ 0.00 B │ +│ information_schema │ Memory │ 0 │ 0 │ 0 │ 0.00 B │ +└────────────────────┴────────┴────────┴────────────┴───────┴────────────┘ +``` +##### Table engines +**query** +```sql +SELECT + engine, + count() "count" +FROM system.tables +WHERE database != 'system' +GROUP BY engine +``` +**result** +``` +┌─engine─┬─count─┐ +│ View │ 8 │ +└────────┴───────┘ +``` +##### Dictionaries +**query** +```sql +SELECT + source, + type, + status, + count() "count" +FROM system.dictionaries +GROUP BY source, type, status +ORDER BY status DESC, source +``` +**result** +``` + +``` +#### Replication +##### Replicated tables (top 10 by absolute delay) +**query** +```sql +SELECT + database, + table, + is_leader, + is_readonly, + absolute_delay, + queue_size, + inserts_in_queue, + merges_in_queue +FROM system.replicas +ORDER BY absolute_delay DESC +LIMIT 10 +``` +**result** +``` + +``` +##### Replication queue (top 20 oldest tasks) +**query** +```sql +SELECT + database, + table, + replica_name, + position, + node_name, + type, + source_replica, + parts_to_merge, + new_part_name, + create_time, + required_quorum, + is_detach, + is_currently_executing, + num_tries, + last_attempt_time, + last_exception, + concat('time: ', toString(last_postpone_time), ', number: ', toString(num_postponed), ', reason: ', postpone_reason) postpone +FROM system.replication_queue +ORDER BY create_time ASC +LIMIT 20 +``` +**result** +``` + +``` +##### Replicated fetches +**query** +```sql +SELECT + database, + table, + round(elapsed, 1) "elapsed", + round(100 * progress, 1) "progress", + 
partition_id, + result_part_name, + result_part_path, + total_size_bytes_compressed, + bytes_read_compressed, + source_replica_path, + source_replica_hostname, + source_replica_port, + interserver_scheme, + to_detached, + thread_id +FROM system.replicated_fetches +``` +**result** +``` + +``` +#### Top 10 tables by max parts per partition +**query** +```sql +SELECT + database, + table, + count() "partitions", + sum(part_count) "parts", + max(part_count) "max_parts_per_partition" +FROM +( + SELECT + database, + table, + partition, + count() "part_count" + FROM system.parts + WHERE active + GROUP BY database, table, partition +) partitions +GROUP BY database, table +ORDER BY max_parts_per_partition DESC +LIMIT 10 +``` +**result** +``` +┌─database─┬─table───────────────────┬─partitions─┬─parts─┬─max_parts_per_partition─┐ +│ system │ metric_log │ 1 │ 5 │ 5 │ +│ system │ trace_log │ 1 │ 5 │ 5 │ +│ system │ query_thread_log │ 1 │ 3 │ 3 │ +│ system │ query_log │ 1 │ 3 │ 3 │ +│ system │ asynchronous_metric_log │ 1 │ 3 │ 3 │ +│ system │ session_log │ 1 │ 3 │ 3 │ +└──────────┴─────────────────────────┴────────────┴───────┴─────────────────────────┘ +``` +#### Merges in progress +**query** +```sql +SELECT + database, + table, + round(elapsed, 1) "elapsed", + round(100 * progress, 1) "progress", + is_mutation, + partition_id, +result_part_path, + source_part_paths, +num_parts, + formatReadableSize(total_size_bytes_compressed) "total_size_compressed", + formatReadableSize(bytes_read_uncompressed) "read_uncompressed", + formatReadableSize(bytes_written_uncompressed) "written_uncompressed", + columns_written, +formatReadableSize(memory_usage) "memory_usage", + thread_id +FROM system.merges +``` +**result** +``` + +``` +#### Mutations in progress +**query** +```sql +SELECT + database, + table, + mutation_id, + command, + create_time, +parts_to_do_names, +parts_to_do, + is_done, + latest_failed_part, + latest_fail_time, + latest_fail_reason +FROM system.mutations +WHERE NOT is_done +ORDER BY create_time DESC +``` +**result** +``` + +``` +#### Recent data parts (modification time within last 3 minutes) +**query** +```sql +SELECT + database, + table, + engine, + partition_id, + name, +part_type, +active, + level, +disk_name, +path, + marks, + rows, + bytes_on_disk, + data_compressed_bytes, + data_uncompressed_bytes, + marks_bytes, + modification_time, + remove_time, + refcount, + is_frozen, + min_date, + max_date, + min_time, + max_time, + min_block_number, + max_block_number +FROM system.parts +WHERE modification_time > now() - INTERVAL 3 MINUTE +ORDER BY modification_time DESC +``` +**result** +``` +Row 1: +────── +database: system +table: metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_110_110_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/9a2/9a2fb3b4-8ced-4c0b-9a2f-b3b48ced4c0b/202112_110_110_0/ +marks: 2 +rows: 8 +bytes_on_disk: 21752 +data_compressed_bytes: 11699 +data_uncompressed_bytes: 19952 +marks_bytes: 10032 +modification_time: 2021-12-25 15:33:59 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 110 +max_block_number: 110 + +Row 2: +────── +database: system +table: asynchronous_metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_118_118_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: 
/var/lib/clickhouse/store/78e/78e6eec8-3f71-4724-b8e6-eec83f71a724/202112_118_118_0/ +marks: 2 +rows: 4767 +bytes_on_disk: 10856 +data_compressed_bytes: 10656 +data_uncompressed_bytes: 128675 +marks_bytes: 176 +modification_time: 2021-12-25 15:33:58 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 118 +max_block_number: 118 + +Row 3: +────── +database: system +table: asynchronous_metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_117_117_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/78e/78e6eec8-3f71-4724-b8e6-eec83f71a724/202112_117_117_0/ +marks: 2 +rows: 4767 +bytes_on_disk: 11028 +data_compressed_bytes: 10828 +data_uncompressed_bytes: 128675 +marks_bytes: 176 +modification_time: 2021-12-25 15:33:51 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 117 +max_block_number: 117 + +Row 4: +────── +database: system +table: metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_109_109_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/9a2/9a2fb3b4-8ced-4c0b-9a2f-b3b48ced4c0b/202112_109_109_0/ +marks: 2 +rows: 7 +bytes_on_disk: 21802 +data_compressed_bytes: 11749 +data_uncompressed_bytes: 17458 +marks_bytes: 10032 +modification_time: 2021-12-25 15:33:51 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 109 +max_block_number: 109 + +Row 5: +────── +database: system +table: trace_log +engine: MergeTree +partition_id: 202112 +name: 202112_53_53_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/c0b/c0bc3be3-22d7-45a3-80bc-3be322d7b5a3/202112_53_53_0/ +marks: 2 +rows: 6 +bytes_on_disk: 1057 +data_compressed_bytes: 700 +data_uncompressed_bytes: 1894 +marks_bytes: 336 +modification_time: 2021-12-25 15:33:49 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 53 +max_block_number: 53 + +Row 6: +────── +database: system +table: asynchronous_metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_116_116_0 +part_type: Compact +active: 0 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/78e/78e6eec8-3f71-4724-b8e6-eec83f71a724/202112_116_116_0/ +marks: 2 +rows: 4767 +bytes_on_disk: 10911 +data_compressed_bytes: 10711 +data_uncompressed_bytes: 128675 +marks_bytes: 176 +modification_time: 2021-12-25 15:33:44 +remove_time: 2021-12-25 15:33:44 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 116 +max_block_number: 116 + +Row 7: +────── +database: system +table: asynchronous_metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_1_116_23 +part_type: Wide +active: 1 +level: 23 +disk_name: default +path: /var/lib/clickhouse/store/78e/78e6eec8-3f71-4724-b8e6-eec83f71a724/202112_1_116_23/ +marks: 69 +rows: 553071 +bytes_on_disk: 435279 +data_compressed_bytes: 424915 +data_uncompressed_bytes: 13289123 +marks_bytes: 9936 +modification_time: 2021-12-25 15:33:44 +remove_time: 
1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 1 +max_block_number: 116 + +Row 8: +────── +database: system +table: metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_108_108_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/9a2/9a2fb3b4-8ced-4c0b-9a2f-b3b48ced4c0b/202112_108_108_0/ +marks: 2 +rows: 8 +bytes_on_disk: 21833 +data_compressed_bytes: 11780 +data_uncompressed_bytes: 19952 +marks_bytes: 10032 +modification_time: 2021-12-25 15:33:44 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 108 +max_block_number: 108 + +Row 9: +─────── +database: system +table: asynchronous_metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_115_115_0 +part_type: Compact +active: 0 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/78e/78e6eec8-3f71-4724-b8e6-eec83f71a724/202112_115_115_0/ +marks: 2 +rows: 4767 +bytes_on_disk: 11146 +data_compressed_bytes: 10946 +data_uncompressed_bytes: 128675 +marks_bytes: 176 +modification_time: 2021-12-25 15:33:37 +remove_time: 2021-12-25 15:33:44 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 115 +max_block_number: 115 + +Row 10: +─────── +database: system +table: metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_107_107_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/9a2/9a2fb3b4-8ced-4c0b-9a2f-b3b48ced4c0b/202112_107_107_0/ +marks: 2 +rows: 7 +bytes_on_disk: 21996 +data_compressed_bytes: 11943 +data_uncompressed_bytes: 17458 +marks_bytes: 10032 +modification_time: 2021-12-25 15:33:36 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 107 +max_block_number: 107 + +Row 11: +─────── +database: system +table: session_log +engine: MergeTree +partition_id: 202112 +name: 202112_3_3_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/9f3/9f3dd592-781c-48d8-9f3d-d592781c48d8/202112_3_3_0/ +marks: 2 +rows: 44 +bytes_on_disk: 2208 +data_compressed_bytes: 1498 +data_uncompressed_bytes: 5130 +marks_bytes: 688 +modification_time: 2021-12-25 15:33:34 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 3 +max_block_number: 3 + +Row 12: +─────── +database: system +table: query_log +engine: MergeTree +partition_id: 202112 +name: 202112_3_3_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/1a3/1a3ec308-d42e-4f3c-9a3e-c308d42e2f3c/202112_3_3_0/ +marks: 2 +rows: 43 +bytes_on_disk: 17843 +data_compressed_bytes: 15725 +data_uncompressed_bytes: 61869 +marks_bytes: 2096 +modification_time: 2021-12-25 15:33:34 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 3 +max_block_number: 3 + +Row 13: +─────── +database: system +table: query_thread_log +engine: MergeTree +partition_id: 202112 +name: 
202112_3_3_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/afa/afa652ef-f91d-4a48-afa6-52eff91daa48/202112_3_3_0/ +marks: 2 +rows: 43 +bytes_on_disk: 11878 +data_compressed_bytes: 10432 +data_uncompressed_bytes: 52339 +marks_bytes: 1424 +modification_time: 2021-12-25 15:33:34 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 3 +max_block_number: 3 + +Row 14: +─────── +database: system +table: trace_log +engine: MergeTree +partition_id: 202112 +name: 202112_52_52_0 +part_type: Compact +active: 1 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/c0b/c0bc3be3-22d7-45a3-80bc-3be322d7b5a3/202112_52_52_0/ +marks: 2 +rows: 4 +bytes_on_disk: 1078 +data_compressed_bytes: 721 +data_uncompressed_bytes: 1252 +marks_bytes: 336 +modification_time: 2021-12-25 15:33:34 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 52 +max_block_number: 52 + +Row 15: +─────── +database: system +table: asynchronous_metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_114_114_0 +part_type: Compact +active: 0 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/78e/78e6eec8-3f71-4724-b8e6-eec83f71a724/202112_114_114_0/ +marks: 2 +rows: 4767 +bytes_on_disk: 11447 +data_compressed_bytes: 11247 +data_uncompressed_bytes: 128675 +marks_bytes: 176 +modification_time: 2021-12-25 15:33:30 +remove_time: 2021-12-25 15:33:44 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 114 +max_block_number: 114 + +Row 16: +─────── +database: system +table: metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_1_106_21 +part_type: Compact +active: 1 +level: 21 +disk_name: default +path: /var/lib/clickhouse/store/9a2/9a2fb3b4-8ced-4c0b-9a2f-b3b48ced4c0b/202112_1_106_21/ +marks: 2 +rows: 798 +bytes_on_disk: 84853 +data_compressed_bytes: 74798 +data_uncompressed_bytes: 1990212 +marks_bytes: 10032 +modification_time: 2021-12-25 15:33:29 +remove_time: 1970-01-01 03:00:00 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 1 +max_block_number: 106 + +Row 17: +─────── +database: system +table: metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_106_106_0 +part_type: Compact +active: 0 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/9a2/9a2fb3b4-8ced-4c0b-9a2f-b3b48ced4c0b/202112_106_106_0/ +marks: 2 +rows: 8 +bytes_on_disk: 21863 +data_compressed_bytes: 11810 +data_uncompressed_bytes: 19952 +marks_bytes: 10032 +modification_time: 2021-12-25 15:33:28 +remove_time: 2021-12-25 15:33:29 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 106 +max_block_number: 106 + +Row 18: +─────── +database: system +table: asynchronous_metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_113_113_0 +part_type: Compact +active: 0 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/78e/78e6eec8-3f71-4724-b8e6-eec83f71a724/202112_113_113_0/ +marks: 2 +rows: 4767 +bytes_on_disk: 11191 +data_compressed_bytes: 10991 +data_uncompressed_bytes: 128675 +marks_bytes: 176 
+modification_time: 2021-12-25 15:33:23 +remove_time: 2021-12-25 15:33:44 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 113 +max_block_number: 113 + +Row 19: +─────── +database: system +table: metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_105_105_0 +part_type: Compact +active: 0 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/9a2/9a2fb3b4-8ced-4c0b-9a2f-b3b48ced4c0b/202112_105_105_0/ +marks: 2 +rows: 7 +bytes_on_disk: 21786 +data_compressed_bytes: 11733 +data_uncompressed_bytes: 17458 +marks_bytes: 10032 +modification_time: 2021-12-25 15:33:21 +remove_time: 2021-12-25 15:33:29 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 105 +max_block_number: 105 + +Row 20: +─────── +database: system +table: asynchronous_metric_log +engine: MergeTree +partition_id: 202112 +name: 202112_112_112_0 +part_type: Compact +active: 0 +level: 0 +disk_name: default +path: /var/lib/clickhouse/store/78e/78e6eec8-3f71-4724-b8e6-eec83f71a724/202112_112_112_0/ +marks: 2 +rows: 4767 +bytes_on_disk: 11281 +data_compressed_bytes: 11081 +data_uncompressed_bytes: 128675 +marks_bytes: 176 +modification_time: 2021-12-25 15:33:16 +remove_time: 2021-12-25 15:33:44 +refcount: 1 +is_frozen: 0 +min_date: 2021-12-25 +max_date: 2021-12-25 +min_time: 1970-01-01 03:00:00 +max_time: 1970-01-01 03:00:00 +min_block_number: 112 +max_block_number: 112 +``` +#### Detached data +##### system.detached_parts +**query** +```sql +SELECT + database, + table, + partition_id, + name, + disk, + reason, + min_block_number, + max_block_number, + level +FROM system.detached_parts +``` +**result** +``` +┌─database─┬─table─┬─partition_id─┬─name─┬─disk─┬─reason─┬─min_block_number─┬─max_block_number─┬─level─┐ +└──────────┴───────┴──────────────┴──────┴──────┴────────┴──────────────────┴──────────────────┴───────┘ +``` +##### Disk space usage +**command** +``` +du -sh -L -c /var/lib/clickhouse/data/*/*/detached/* | sort -rsh +``` +**result** +``` +0 total + +``` +#### Queries +##### Queries in progress (process list) +**query** +```sql +SELECT + elapsed, + query_id, + query, + is_cancelled, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + formatReadableSize(memory_usage) AS "memory usage", + user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + thread_ids, + ProfileEvents, + Settings + FROM system.processes +ORDER BY elapsed DESC +``` +**result** +``` +Row 1: +────── +elapsed: 0.000785246 +query_id: b51cbc7a-2260-4c9b-b26c-6307b10ad948 +query: SELECT + elapsed, + query_id, + query, + is_cancelled, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + formatReadableSize(memory_usage) AS "memory usage", + user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + thread_ids, + ProfileEvents, + Settings + FROM system.processes +ORDER BY elapsed DESC FORMAT Vertical + +is_cancelled: 0 +read: 0 
rows / 0.00 B +written: 0 rows / 0.00 B +memory usage: 0.00 B +user: default +client: python-requests/2.26.0 +thread_ids: [66] +ProfileEvents: {'Query':1,'SelectQuery':1,'ContextLock':38,'RWLockAcquiredReadLocks':1} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} +``` +##### Top 10 queries by duration +**query** +```sql +SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + query, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + ProfileEvents, + Settings + FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY +ORDER BY query_duration_ms DESC +LIMIT 10 +``` +**result** +``` +Row 1: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:25:01 +query_duration_ms: 60 +query_id: f72e1120-cc66-434c-9809-3a99077ed842 +query_kind: Select +is_initial_query: 1 +query: SELECT + database, + table, + count() "partitions", + sum(part_count) "parts", + max(part_count) "max_parts_per_partition" +FROM +( + SELECT + database, + table, + partition, + count() "part_count" + FROM system.parts + WHERE active + GROUP BY database, table, partition +) partitions +GROUP BY database, table +ORDER BY max_parts_per_partition DESC +LIMIT 10 FORMAT PrettyCompactNoEscapes + +read: 5 rows / 262.00 B +written: 0 rows / 0.00 B +result: 3 rows / 488.00 B +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.parts'] +columns: ['system.parts.active','system.parts.database','system.parts.partition','system.parts.table'] +used_aggregate_functions: ['count','max','sum'] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['PrettyCompactNoEscapes'] +used_functions: [] +used_storages: [] +used_table_functions: [] +thread_ids: [66] +ProfileEvents: {'Query':1,'SelectQuery':1,'ArenaAllocChunks':2,'ArenaAllocBytes':8192,'CompileFunction':1,'CompileExpressionsMicroseconds':52574,'CompileExpressionsBytes':8192,'SelectedRows':5,'SelectedBytes':262,'ContextLock':58,'RWLockAcquiredReadLocks':6,'RealTimeMicroseconds':61493,'UserTimeMicroseconds':34154,'SystemTimeMicroseconds':9874,'SoftPageFaults':170,'HardPageFaults':33,'OSIOWaitMicroseconds':10000,'OSCPUWaitMicroseconds':2433,'OSCPUVirtualTimeMicroseconds':43706,'OSReadBytes':3080192,'OSWriteBytes':4096,'OSReadChars':863,'OSWriteChars':5334} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 2: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:26:26 +query_duration_ms: 12 +query_id: 
eabd7483-70df-4d60-a668-d8961416e3fb +query_kind: Select +is_initial_query: 1 +query: SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + query, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + ProfileEvents, + Settings + FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY +ORDER BY query_duration_ms DESC +LIMIT 10 FORMAT Vertical + +read: 40 rows / 67.42 KiB +written: 0 rows / 0.00 B +result: 10 rows / 41.23 KiB +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.query_log'] +columns: ['system.query_log.ProfileEvents','system.query_log.Settings','system.query_log.client_hostname','system.query_log.client_name','system.query_log.client_version_major','system.query_log.client_version_minor','system.query_log.client_version_patch','system.query_log.columns','system.query_log.databases','system.query_log.event_date','system.query_log.event_time','system.query_log.exception','system.query_log.http_user_agent','system.query_log.initial_user','system.query_log.is_initial_query','system.query_log.memory_usage','system.query_log.query','system.query_log.query_duration_ms','system.query_log.query_id','system.query_log.query_kind','system.query_log.query_start_time','system.query_log.read_bytes','system.query_log.read_rows','system.query_log.result_bytes','system.query_log.result_rows','system.query_log.stack_trace','system.query_log.tables','system.query_log.thread_ids','system.query_log.type','system.query_log.used_aggregate_function_combinators','system.query_log.used_aggregate_functions','system.query_log.used_data_type_families','system.query_log.used_database_engines','system.query_log.used_dictionaries','system.query_log.used_formats','system.query_log.used_functions','system.query_log.used_storages','system.query_log.used_table_functions','system.query_log.user','system.query_log.written_bytes','system.query_log.written_rows'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['Vertical'] +used_functions: ['empty','and','now','concat','today','toIntervalDay','formatReadableSize','minus','greaterOrEquals','multiIf','toString','subtractDays','notEquals'] +used_storages: [] +used_table_functions: [] +thread_ids: [66] +ProfileEvents: 
{'Query':1,'SelectQuery':1,'FileOpen':2,'Seek':3,'ReadBufferFromFileDescriptorRead':10,'ReadBufferFromFileDescriptorReadBytes':16873,'ReadCompressedBytes':12855,'CompressedReadBufferBlocks':41,'CompressedReadBufferBytes':61376,'IOBufferAllocs':5,'IOBufferAllocBytes':26594,'FunctionExecute':28,'MarkCacheHits':1,'MarkCacheMisses':1,'CreatedReadBufferOrdinary':3,'DiskReadElapsedMicroseconds':30,'SelectedParts':1,'SelectedRanges':1,'SelectedMarks':1,'SelectedRows':40,'SelectedBytes':69039,'ContextLock':342,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':14451,'UserTimeMicroseconds':10009,'SystemTimeMicroseconds':1515,'SoftPageFaults':44,'OSCPUWaitMicroseconds':3050,'OSCPUVirtualTimeMicroseconds':11523,'OSReadChars':17311,'OSWriteChars':7288} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 3: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:33:29 +query_duration_ms: 12 +query_id: d9557845-5b5e-44ef-befa-55f837065d00 +query_kind: Select +is_initial_query: 1 +query: SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + query, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + ProfileEvents, + Settings + FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY +ORDER BY query_duration_ms DESC +LIMIT 10 FORMAT Vertical + +read: 83 rows / 130.00 KiB +written: 0 rows / 0.00 B +result: 10 rows / 183.10 KiB +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.query_log'] +columns: 
['system.query_log.ProfileEvents','system.query_log.Settings','system.query_log.client_hostname','system.query_log.client_name','system.query_log.client_version_major','system.query_log.client_version_minor','system.query_log.client_version_patch','system.query_log.columns','system.query_log.databases','system.query_log.event_date','system.query_log.event_time','system.query_log.exception','system.query_log.http_user_agent','system.query_log.initial_user','system.query_log.is_initial_query','system.query_log.memory_usage','system.query_log.query','system.query_log.query_duration_ms','system.query_log.query_id','system.query_log.query_kind','system.query_log.query_start_time','system.query_log.read_bytes','system.query_log.read_rows','system.query_log.result_bytes','system.query_log.result_rows','system.query_log.stack_trace','system.query_log.tables','system.query_log.thread_ids','system.query_log.type','system.query_log.used_aggregate_function_combinators','system.query_log.used_aggregate_functions','system.query_log.used_data_type_families','system.query_log.used_database_engines','system.query_log.used_dictionaries','system.query_log.used_formats','system.query_log.used_functions','system.query_log.used_storages','system.query_log.used_table_functions','system.query_log.user','system.query_log.written_bytes','system.query_log.written_rows'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['Vertical'] +used_functions: ['empty','and','now','concat','today','toIntervalDay','formatReadableSize','minus','greaterOrEquals','multiIf','toString','subtractDays','notEquals'] +used_storages: [] +used_table_functions: [] +thread_ids: [66,283,225,281,282] +ProfileEvents: {'Query':1,'SelectQuery':1,'FileOpen':3,'Seek':6,'ReadBufferFromFileDescriptorRead':18,'ReadBufferFromFileDescriptorReadBytes':32140,'ReadCompressedBytes':25892,'CompressedReadBufferBlocks':82,'CompressedReadBufferBytes':116215,'IOBufferAllocs':9,'IOBufferAllocBytes':47368,'FunctionExecute':51,'MarkCacheHits':3,'MarkCacheMisses':1,'CreatedReadBufferOrdinary':5,'DiskReadElapsedMicroseconds':13,'SelectedParts':2,'SelectedRanges':2,'SelectedMarks':2,'SelectedRows':83,'SelectedBytes':133125,'ContextLock':351,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':19368,'UserTimeMicroseconds':12036,'SystemTimeMicroseconds':2047,'SoftPageFaults':42,'OSCPUWaitMicroseconds':710,'OSCPUVirtualTimeMicroseconds':13623,'OSWriteBytes':4096,'OSReadChars':34225,'OSWriteChars':8142} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 4: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:33:29 +query_duration_ms: 11 +query_id: bae8a338-eee9-406b-80d2-4596af2ba31f +query_kind: Select +is_initial_query: 1 +query: SELECT + name, + engine, + tables, + partitions, + parts, + formatReadableSize(bytes_on_disk) "disk_size" +FROM system.databases db +LEFT JOIN +( + SELECT + database, + uniq(table) "tables", + uniq(table, partition) "partitions", + count() AS parts, + sum(bytes_on_disk) "bytes_on_disk" + FROM system.parts + WHERE active + GROUP BY database +) AS db_stats ON db.name = db_stats.database +ORDER BY bytes_on_disk DESC +LIMIT 10 FORMAT PrettyCompactNoEscapes + +read: 17 rows / 1.31 KiB +written: 0 rows / 0.00 B +result: 4 rows / 640.00 B +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: 
['system.databases','system.parts'] +columns: ['system.databases.engine','system.databases.name','system.parts.active','system.parts.bytes_on_disk','system.parts.database','system.parts.partition','system.parts.table'] +used_aggregate_functions: ['count','sum','uniq'] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['PrettyCompactNoEscapes'] +used_functions: ['formatReadableSize'] +used_storages: [] +used_table_functions: [] +thread_ids: [66] +ProfileEvents: {'Query':1,'SelectQuery':1,'ArenaAllocChunks':5,'ArenaAllocBytes':20480,'FunctionExecute':1,'SelectedRows':17,'SelectedBytes':1345,'ContextLock':69,'RWLockAcquiredReadLocks':9,'RealTimeMicroseconds':12225,'UserTimeMicroseconds':10731,'SystemTimeMicroseconds':1146,'SoftPageFaults':2,'OSCPUWaitMicroseconds':720,'OSCPUVirtualTimeMicroseconds':11876,'OSWriteBytes':4096,'OSReadChars':438,'OSWriteChars':8938} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 5: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:26:26 +query_duration_ms: 9 +query_id: f0c62bc7-36da-4542-a3d5-68a40c1c4b48 +query_kind: Select +is_initial_query: 1 +query: SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + query, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + ProfileEvents, + Settings + FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY + AND exception != '' +ORDER BY query_start_time DESC +LIMIT 10 FORMAT Vertical + +read: 40 rows / 67.42 KiB +written: 0 rows / 0.00 B +result: 4 rows / 43.13 KiB +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.query_log'] +columns: 
['system.query_log.ProfileEvents','system.query_log.Settings','system.query_log.client_hostname','system.query_log.client_name','system.query_log.client_version_major','system.query_log.client_version_minor','system.query_log.client_version_patch','system.query_log.columns','system.query_log.databases','system.query_log.event_date','system.query_log.event_time','system.query_log.exception','system.query_log.http_user_agent','system.query_log.initial_user','system.query_log.is_initial_query','system.query_log.memory_usage','system.query_log.query','system.query_log.query_duration_ms','system.query_log.query_id','system.query_log.query_kind','system.query_log.query_start_time','system.query_log.read_bytes','system.query_log.read_rows','system.query_log.result_bytes','system.query_log.result_rows','system.query_log.stack_trace','system.query_log.tables','system.query_log.thread_ids','system.query_log.type','system.query_log.used_aggregate_function_combinators','system.query_log.used_aggregate_functions','system.query_log.used_data_type_families','system.query_log.used_database_engines','system.query_log.used_dictionaries','system.query_log.used_formats','system.query_log.used_functions','system.query_log.used_storages','system.query_log.used_table_functions','system.query_log.user','system.query_log.written_bytes','system.query_log.written_rows'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['Vertical'] +used_functions: ['empty','and','now','concat','today','toIntervalDay','formatReadableSize','minus','greaterOrEquals','multiIf','toString','subtractDays','notEquals'] +used_storages: [] +used_table_functions: [] +thread_ids: [66] +ProfileEvents: {'Query':1,'SelectQuery':1,'FileOpen':1,'Seek':3,'ReadBufferFromFileDescriptorRead':8,'ReadBufferFromFileDescriptorReadBytes':15561,'ReadCompressedBytes':12855,'CompressedReadBufferBlocks':41,'CompressedReadBufferBytes':61376,'IOBufferAllocs':4,'IOBufferAllocBytes':25506,'FunctionExecute':31,'MarkCacheHits':2,'CreatedReadBufferOrdinary':2,'DiskReadElapsedMicroseconds':16,'SelectedParts':1,'SelectedRanges':1,'SelectedMarks':1,'SelectedRows':40,'SelectedBytes':69039,'ContextLock':361,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':11353,'UserTimeMicroseconds':8910,'SystemTimeMicroseconds':533,'SoftPageFaults':7,'HardPageFaults':2,'OSCPUWaitMicroseconds':1117,'OSCPUVirtualTimeMicroseconds':9443,'OSReadBytes':16384,'OSWriteBytes':4096,'OSReadChars':15999,'OSWriteChars':7714,'QueryProfilerRuns':1} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 6: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:33:29 +query_duration_ms: 8 +query_id: 72f3f9de-d17c-456b-8316-d494bea2096a +query_kind: Select +is_initial_query: 1 +query: SELECT name FROM system.tables WHERE database = 'system' FORMAT JSONCompact + +read: 74 rows / 2.61 KiB +written: 0 rows / 0.00 B +result: 74 rows / 2.00 KiB +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.tables'] +columns: ['system.tables.database','system.tables.name'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['JSONCompact'] +used_functions: ['equals'] +used_storages: [] +used_table_functions: [] +thread_ids: [66] 
+ProfileEvents: {'Query':1,'SelectQuery':1,'IOBufferAllocs':2,'IOBufferAllocBytes':8192,'FunctionExecute':4,'SelectedRows':74,'SelectedBytes':2675,'ContextLock':23,'RWLockAcquiredReadLocks':75,'RealTimeMicroseconds':9190,'UserTimeMicroseconds':6468,'SystemTimeMicroseconds':517,'OSCPUWaitMicroseconds':2237,'OSCPUVirtualTimeMicroseconds':6984,'OSReadChars':438,'OSWriteChars':1270} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 7: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:33:29 +query_duration_ms: 8 +query_id: d55da87f-b030-4b5d-95fc-f9103ce58601 +query_kind: Select +is_initial_query: 1 +query: SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + query, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + ProfileEvents, + Settings + FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY +ORDER BY memory_usage DESC +LIMIT 10 FORMAT Vertical + +read: 83 rows / 130.00 KiB +written: 0 rows / 0.00 B +result: 10 rows / 178.41 KiB +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.query_log'] +columns: ['system.query_log.ProfileEvents','system.query_log.Settings','system.query_log.client_hostname','system.query_log.client_name','system.query_log.client_version_major','system.query_log.client_version_minor','system.query_log.client_version_patch','system.query_log.columns','system.query_log.databases','system.query_log.event_date','system.query_log.event_time','system.query_log.exception','system.query_log.http_user_agent','system.query_log.initial_user','system.query_log.is_initial_query','system.query_log.memory_usage','system.query_log.query','system.query_log.query_duration_ms','system.query_log.query_id','system.query_log.query_kind','system.query_log.query_start_time','system.query_log.read_bytes','system.query_log.read_rows','system.query_log.result_bytes','system.query_log.result_rows','system.query_log.stack_trace','system.query_log.tables','system.query_log.thread_ids','system.query_log.type','system.query_log.used_aggregate_function_combinators','system.query_log.used_aggregate_functions','system.query_log.used_data_type_families','system.query_log.used_database_engines','system.query_log.used_dictionaries','system.query_log.used_formats','system.query_log.used_functions','system.query_log.used_storages','system.query_log.used_table_functions','system.query_log.user','system.query_log.written_bytes','system.query_log.written_rows'] +used_aggregate_functions: [] +used_aggregate_function_combinators: 
[] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['Vertical'] +used_functions: ['empty','and','now','concat','today','toIntervalDay','formatReadableSize','minus','greaterOrEquals','multiIf','toString','subtractDays','notEquals'] +used_storages: [] +used_table_functions: [] +thread_ids: [66,283,283,225,282] +ProfileEvents: {'Query':1,'SelectQuery':1,'FileOpen':2,'Seek':6,'ReadBufferFromFileDescriptorRead':16,'ReadBufferFromFileDescriptorReadBytes':30044,'ReadCompressedBytes':25892,'CompressedReadBufferBlocks':82,'CompressedReadBufferBytes':116215,'IOBufferAllocs':8,'IOBufferAllocBytes':45272,'FunctionExecute':51,'MarkCacheHits':4,'CreatedReadBufferOrdinary':4,'SelectedParts':2,'SelectedRanges':2,'SelectedMarks':2,'SelectedRows':83,'SelectedBytes':133125,'ContextLock':351,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':12416,'UserTimeMicroseconds':7727,'SystemTimeMicroseconds':1247,'SoftPageFaults':41,'OSCPUWaitMicroseconds':1058,'OSCPUVirtualTimeMicroseconds':9018,'OSWriteBytes':4096,'OSReadChars':32137,'OSWriteChars':8108} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 8: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:33:29 +query_duration_ms: 8 +query_id: cc2a0e7a-3b9b-47d2-9255-009c62584bc4 +query_kind: Select +is_initial_query: 1 +query: SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + query, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + ProfileEvents, + Settings + FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY + AND exception != '' +ORDER BY query_start_time DESC +LIMIT 10 FORMAT Vertical + +read: 83 rows / 130.00 KiB +written: 0 rows / 0.00 B +result: 5 rows / 57.80 KiB +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.query_log'] +columns: 
['system.query_log.ProfileEvents','system.query_log.Settings','system.query_log.client_hostname','system.query_log.client_name','system.query_log.client_version_major','system.query_log.client_version_minor','system.query_log.client_version_patch','system.query_log.columns','system.query_log.databases','system.query_log.event_date','system.query_log.event_time','system.query_log.exception','system.query_log.http_user_agent','system.query_log.initial_user','system.query_log.is_initial_query','system.query_log.memory_usage','system.query_log.query','system.query_log.query_duration_ms','system.query_log.query_id','system.query_log.query_kind','system.query_log.query_start_time','system.query_log.read_bytes','system.query_log.read_rows','system.query_log.result_bytes','system.query_log.result_rows','system.query_log.stack_trace','system.query_log.tables','system.query_log.thread_ids','system.query_log.type','system.query_log.used_aggregate_function_combinators','system.query_log.used_aggregate_functions','system.query_log.used_data_type_families','system.query_log.used_database_engines','system.query_log.used_dictionaries','system.query_log.used_formats','system.query_log.used_functions','system.query_log.used_storages','system.query_log.used_table_functions','system.query_log.user','system.query_log.written_bytes','system.query_log.written_rows'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['Vertical'] +used_functions: ['empty','and','now','concat','today','toIntervalDay','formatReadableSize','minus','greaterOrEquals','multiIf','toString','subtractDays','notEquals'] +used_storages: [] +used_table_functions: [] +thread_ids: [66,281,283,282,225] +ProfileEvents: {'Query':1,'SelectQuery':1,'FileOpen':2,'Seek':6,'ReadBufferFromFileDescriptorRead':16,'ReadBufferFromFileDescriptorReadBytes':31464,'ReadCompressedBytes':25892,'CompressedReadBufferBlocks':82,'CompressedReadBufferBytes':116215,'IOBufferAllocs':8,'IOBufferAllocBytes':46860,'FunctionExecute':56,'MarkCacheHits':4,'CreatedReadBufferOrdinary':4,'SelectedParts':2,'SelectedRanges':2,'SelectedMarks':2,'SelectedRows':83,'SelectedBytes':133125,'ContextLock':370,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':13096,'UserTimeMicroseconds':9503,'SystemTimeMicroseconds':195,'SoftPageFaults':23,'OSCPUWaitMicroseconds':1380,'OSCPUVirtualTimeMicroseconds':9661,'OSWriteBytes':4096,'OSReadChars':33567,'OSWriteChars':8310} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 9: +─────── +type: QueryFinish +query_start_time: 2021-12-25 15:25:01 +query_duration_ms: 8 +query_id: a3d717fd-c43f-4723-a18d-557c733299f6 +query_kind: Select +is_initial_query: 1 +query: SELECT + name, + engine, + tables, + partitions, + parts, + formatReadableSize(bytes_on_disk) "disk_size" +FROM system.databases db +LEFT JOIN +( + SELECT + database, + uniq(table) "tables", + uniq(table, partition) "partitions", + count() AS parts, + sum(bytes_on_disk) "bytes_on_disk" + FROM system.parts + WHERE active + GROUP BY database +) AS db_stats ON db.name = db_stats.database +ORDER BY bytes_on_disk DESC +LIMIT 10 FORMAT PrettyCompactNoEscapes + +read: 9 rows / 845.00 B +written: 0 rows / 0.00 B +result: 4 rows / 640.00 B +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.databases','system.parts'] +columns: 
['system.databases.engine','system.databases.name','system.parts.active','system.parts.bytes_on_disk','system.parts.database','system.parts.partition','system.parts.table'] +used_aggregate_functions: ['count','sum','uniq'] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['PrettyCompactNoEscapes'] +used_functions: ['formatReadableSize'] +used_storages: [] +used_table_functions: [] +thread_ids: [66] +ProfileEvents: {'Query':1,'SelectQuery':1,'ArenaAllocChunks':5,'ArenaAllocBytes':20480,'FunctionExecute':1,'SelectedRows':9,'SelectedBytes':845,'ContextLock':69,'RWLockAcquiredReadLocks':6,'RealTimeMicroseconds':9090,'UserTimeMicroseconds':4654,'SystemTimeMicroseconds':1171,'SoftPageFaults':8,'HardPageFaults':2,'OSCPUWaitMicroseconds':2126,'OSCPUVirtualTimeMicroseconds':5824,'OSReadBytes':212992,'OSWriteBytes':4096,'OSReadChars':427,'OSWriteChars':8936} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 10: +─────── +type: QueryFinish +query_start_time: 2021-12-25 15:26:26 +query_duration_ms: 7 +query_id: 49305759-0f08-4d5a-81d8-c1a11cfc0eb4 +query_kind: Select +is_initial_query: 1 +query: SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + query, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + ProfileEvents, + Settings + FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY +ORDER BY memory_usage DESC +LIMIT 10 FORMAT Vertical + +read: 40 rows / 67.42 KiB +written: 0 rows / 0.00 B +result: 10 rows / 57.95 KiB +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.query_log'] +columns: 
['system.query_log.ProfileEvents','system.query_log.Settings','system.query_log.client_hostname','system.query_log.client_name','system.query_log.client_version_major','system.query_log.client_version_minor','system.query_log.client_version_patch','system.query_log.columns','system.query_log.databases','system.query_log.event_date','system.query_log.event_time','system.query_log.exception','system.query_log.http_user_agent','system.query_log.initial_user','system.query_log.is_initial_query','system.query_log.memory_usage','system.query_log.query','system.query_log.query_duration_ms','system.query_log.query_id','system.query_log.query_kind','system.query_log.query_start_time','system.query_log.read_bytes','system.query_log.read_rows','system.query_log.result_bytes','system.query_log.result_rows','system.query_log.stack_trace','system.query_log.tables','system.query_log.thread_ids','system.query_log.type','system.query_log.used_aggregate_function_combinators','system.query_log.used_aggregate_functions','system.query_log.used_data_type_families','system.query_log.used_database_engines','system.query_log.used_dictionaries','system.query_log.used_formats','system.query_log.used_functions','system.query_log.used_storages','system.query_log.used_table_functions','system.query_log.user','system.query_log.written_bytes','system.query_log.written_rows'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['Vertical'] +used_functions: ['empty','and','now','concat','today','toIntervalDay','formatReadableSize','minus','greaterOrEquals','multiIf','toString','subtractDays','notEquals'] +used_storages: [] +used_table_functions: [] +thread_ids: [66] +ProfileEvents: {'Query':1,'SelectQuery':1,'FileOpen':1,'Seek':3,'ReadBufferFromFileDescriptorRead':8,'ReadBufferFromFileDescriptorReadBytes':14777,'ReadCompressedBytes':12855,'CompressedReadBufferBlocks':41,'CompressedReadBufferBytes':61376,'IOBufferAllocs':4,'IOBufferAllocBytes':24498,'FunctionExecute':28,'MarkCacheHits':2,'CreatedReadBufferOrdinary':2,'DiskReadElapsedMicroseconds':16,'SelectedParts':1,'SelectedRanges':1,'SelectedMarks':1,'SelectedRows':40,'SelectedBytes':69039,'ContextLock':342,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':9159,'UserTimeMicroseconds':4713,'SystemTimeMicroseconds':1942,'SoftPageFaults':19,'OSCPUWaitMicroseconds':2421,'OSCPUVirtualTimeMicroseconds':6655,'OSWriteBytes':4096,'OSReadChars':15215,'OSWriteChars':7278} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} +``` +##### Top 10 queries by memory usage +**query** +```sql +SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + query, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + 
used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + ProfileEvents, + Settings + FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY +ORDER BY memory_usage DESC +LIMIT 10 +``` +**result** +``` +Row 1: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:26:25 +query_duration_ms: 0 +query_id: c6b6a96c-d5c5-4406-98cd-80857a8412d4 +query_kind: +is_initial_query: 1 +query: SHOW ACCESS FORMAT TSVRaw + +read: 5 rows / 405.00 B +written: 0 rows / 0.00 B +result: 5 rows / 4.50 KiB +memory usage: 1.82 KiB +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: [] +tables: [] +columns: [] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['TSVRaw'] +used_functions: [] +used_storages: [] +used_table_functions: [] +thread_ids: [66,283,225,281] +ProfileEvents: {'Query':1,'IOBufferAllocs':3,'IOBufferAllocBytes':3145728,'SelectedRows':5,'SelectedBytes':405,'ContextLock':8,'RealTimeMicroseconds':959,'UserTimeMicroseconds':452,'SystemTimeMicroseconds':238,'OSCPUWaitMicroseconds':90,'OSCPUVirtualTimeMicroseconds':690,'OSWriteBytes':4096,'OSReadChars':846,'OSWriteChars':880} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 2: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:33:29 +query_duration_ms: 2 +query_id: 253362ba-40a1-4593-a4cc-30d3dfdfe0ab +query_kind: +is_initial_query: 1 +query: SHOW ACCESS FORMAT TSVRaw + +read: 5 rows / 405.00 B +written: 0 rows / 0.00 B +result: 5 rows / 4.50 KiB +memory usage: 1.82 KiB +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: [] +tables: [] +columns: [] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['TSVRaw'] +used_functions: [] +used_storages: [] +used_table_functions: [] +thread_ids: [66,225,283,282] +ProfileEvents: {'Query':1,'IOBufferAllocs':3,'IOBufferAllocBytes':3145728,'SelectedRows':5,'SelectedBytes':405,'ContextLock':8,'RealTimeMicroseconds':4687,'UserTimeMicroseconds':2171,'SystemTimeMicroseconds':1264,'OSCPUWaitMicroseconds':513,'OSCPUVirtualTimeMicroseconds':3335,'OSReadChars':848,'OSWriteChars':880} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 3: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:25:01 +query_duration_ms: 1 +query_id: 61b20c8c-ca63-4384-adb4-ce7765d77389 +query_kind: +is_initial_query: 1 +query: SHOW ACCESS FORMAT TSVRaw + +read: 5 rows / 405.00 B +written: 0 rows / 0.00 B +result: 5 rows / 4.50 KiB +memory usage: 1.82 KiB +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: [] +tables: [] +columns: [] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['TSVRaw'] +used_functions: [] +used_storages: [] +used_table_functions: [] +thread_ids: [66,225,281,283] +ProfileEvents: 
{'Query':1,'IOBufferAllocs':3,'IOBufferAllocBytes':3145728,'SelectedRows':5,'SelectedBytes':405,'ContextLock':8,'RealTimeMicroseconds':3442,'UserTimeMicroseconds':715,'SystemTimeMicroseconds':485,'SoftPageFaults':1,'OSCPUWaitMicroseconds':443,'OSCPUVirtualTimeMicroseconds':1170,'OSReadChars':833,'OSWriteChars':880} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 4: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:26:25 +query_duration_ms: 1 +query_id: 13ebdab7-e368-4f9f-b47e-023dbd9e91ce +query_kind: Select +is_initial_query: 1 +query: +SELECT formatReadableTimeDelta(uptime()) + + +read: 1 rows / 1.00 B +written: 0 rows / 0.00 B +result: 1 rows / 128.00 B +memory usage: 1.49 KiB +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.one'] +columns: ['system.one.dummy'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['TabSeparated'] +used_functions: ['uptime','formatReadableTimeDelta'] +used_storages: [] +used_table_functions: [] +thread_ids: [66,283,282,225,281] +ProfileEvents: {'Query':1,'SelectQuery':1,'IOBufferAllocs':3,'IOBufferAllocBytes':3145728,'SelectedRows':1,'SelectedBytes':1,'ContextLock':17,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':1613,'UserTimeMicroseconds':708,'SystemTimeMicroseconds':274,'SoftPageFaults':3,'OSCPUWaitMicroseconds':2,'OSCPUVirtualTimeMicroseconds':980,'OSReadChars':846,'OSWriteChars':1190} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 5: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:33:29 +query_duration_ms: 2 +query_id: ff330183-854b-46bc-a548-30e12a7bee9c +query_kind: Select +is_initial_query: 1 +query: +SELECT formatReadableTimeDelta(uptime()) + + +read: 1 rows / 1.00 B +written: 0 rows / 0.00 B +result: 1 rows / 128.00 B +memory usage: 1.49 KiB +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.one'] +columns: ['system.one.dummy'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['TabSeparated'] +used_functions: ['formatReadableTimeDelta','uptime'] +used_storages: [] +used_table_functions: [] +thread_ids: [66,225,283,281,282] +ProfileEvents: {'Query':1,'SelectQuery':1,'IOBufferAllocs':3,'IOBufferAllocBytes':3145728,'SelectedRows':1,'SelectedBytes':1,'ContextLock':17,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':4372,'UserTimeMicroseconds':1022,'SystemTimeMicroseconds':177,'OSCPUWaitMicroseconds':2070,'OSCPUVirtualTimeMicroseconds':1198,'OSWriteBytes':4096,'OSReadChars':848,'OSWriteChars':1190} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 6: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:25:01 +query_duration_ms: 3 +query_id: b763c2f9-6234-47f7-8b30-43d619909289 +query_kind: Select +is_initial_query: 1 +query: +SELECT formatReadableTimeDelta(uptime()) + + +read: 1 rows / 1.00 B +written: 0 rows / 0.00 B +result: 1 rows / 128.00 B +memory usage: 1.49 KiB +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.one'] +columns: ['system.one.dummy'] +used_aggregate_functions: [] 
+used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['TabSeparated'] +used_functions: ['uptime','formatReadableTimeDelta'] +used_storages: [] +used_table_functions: [] +thread_ids: [66,225,281,283,282] +ProfileEvents: {'Query':1,'SelectQuery':1,'IOBufferAllocs':3,'IOBufferAllocBytes':3145728,'SelectedRows':1,'SelectedBytes':1,'ContextLock':17,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':6367,'UserTimeMicroseconds':3329,'SystemTimeMicroseconds':531,'SoftPageFaults':6,'HardPageFaults':1,'OSCPUWaitMicroseconds':1090,'OSCPUVirtualTimeMicroseconds':3859,'OSReadBytes':102400,'OSReadChars':830,'OSWriteChars':1190} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 7: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:26:25 +query_duration_ms: 1 +query_id: e9c25bd1-00d3-4239-9611-1c3d391178da +query_kind: Select +is_initial_query: 1 +query: SELECT version() + +read: 1 rows / 1.00 B +written: 0 rows / 0.00 B +result: 1 rows / 128.00 B +memory usage: 1.45 KiB +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.one'] +columns: ['system.one.dummy'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['TabSeparated'] +used_functions: ['version'] +used_storages: [] +used_table_functions: [] +thread_ids: [66,283,225,282] +ProfileEvents: {'Query':1,'SelectQuery':1,'IOBufferAllocs':3,'IOBufferAllocBytes':3145728,'SelectedRows':1,'SelectedBytes':1,'ContextLock':15,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':2720,'UserTimeMicroseconds':648,'SystemTimeMicroseconds':1144,'OSCPUWaitMicroseconds':110,'OSCPUVirtualTimeMicroseconds':1790,'OSReadChars':845,'OSWriteChars':1140} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 8: +────── +type: QueryFinish +query_start_time: 2021-12-25 15:33:29 +query_duration_ms: 4 +query_id: 69762642-8a75-4149-aaf5-bc1969558747 +query_kind: Select +is_initial_query: 1 +query: SELECT version() + +read: 1 rows / 1.00 B +written: 0 rows / 0.00 B +result: 1 rows / 128.00 B +memory usage: 1.45 KiB +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.one'] +columns: ['system.one.dummy'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['TabSeparated'] +used_functions: ['version'] +used_storages: [] +used_table_functions: [] +thread_ids: [66,282,283] +ProfileEvents: {'Query':1,'SelectQuery':1,'IOBufferAllocs':3,'IOBufferAllocBytes':3145728,'SelectedRows':1,'SelectedBytes':1,'ContextLock':15,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':10137,'UserTimeMicroseconds':6289,'SystemTimeMicroseconds':47,'SoftPageFaults':2,'HardPageFaults':1,'OSCPUWaitMicroseconds':859,'OSCPUVirtualTimeMicroseconds':6336,'OSReadBytes':12288,'OSReadChars':845,'OSWriteChars':1140} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 9: +─────── +type: QueryFinish +query_start_time: 2021-12-25 15:25:01 +query_duration_ms: 4 +query_id: 9e31242c-62c5-4bb1-9a3e-f96e99f3bddf +query_kind: Select +is_initial_query: 1 +query: SELECT version() + +read: 1 rows / 1.00 B +written: 0 rows / 0.00 B +result: 1 
rows / 128.00 B +memory usage: 1.45 KiB +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.one'] +columns: ['system.one.dummy'] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['TabSeparated'] +used_functions: ['version'] +used_storages: [] +used_table_functions: [] +thread_ids: [66,225,282,281,283] +ProfileEvents: {'Query':1,'SelectQuery':1,'IOBufferAllocs':3,'IOBufferAllocBytes':3145728,'SelectedRows':1,'SelectedBytes':1,'ContextLock':15,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':8688,'UserTimeMicroseconds':3598,'SystemTimeMicroseconds':1288,'SoftPageFaults':42,'HardPageFaults':1,'OSCPUWaitMicroseconds':214,'OSCPUVirtualTimeMicroseconds':4885,'OSReadBytes':98304,'OSReadChars':818,'OSWriteChars':1140} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} + +Row 10: +─────── +type: QueryFinish +query_start_time: 2021-12-25 15:26:26 +query_duration_ms: 2 +query_id: de1fc64c-09c3-420a-8801-a2f9f04407cd +query_kind: Select +is_initial_query: 1 +query: SELECT + database, + table, + count() "partitions", + sum(part_count) "parts", + max(part_count) "max_parts_per_partition" +FROM +( + SELECT + database, + table, + partition, + count() "part_count" + FROM system.parts + WHERE active + GROUP BY database, table, partition +) partitions +GROUP BY database, table +ORDER BY max_parts_per_partition DESC +LIMIT 10 FORMAT PrettyCompactNoEscapes + +read: 12 rows / 643.00 B +written: 0 rows / 0.00 B +result: 6 rows / 752.00 B +memory usage: 0.00 B +exception: +stack_trace: + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: ['system'] +tables: ['system.parts'] +columns: ['system.parts.active','system.parts.database','system.parts.partition','system.parts.table'] +used_aggregate_functions: ['count','max','sum'] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: ['PrettyCompactNoEscapes'] +used_functions: [] +used_storages: [] +used_table_functions: [] +thread_ids: [66] +ProfileEvents: {'Query':1,'SelectQuery':1,'ArenaAllocChunks':2,'ArenaAllocBytes':8192,'SelectedRows':12,'SelectedBytes':643,'ContextLock':58,'RWLockAcquiredReadLocks':9,'RWLockReadersWaitMilliseconds':1,'RealTimeMicroseconds':2924,'UserTimeMicroseconds':1583,'SystemTimeMicroseconds':892,'SoftPageFaults':6,'OSCPUVirtualTimeMicroseconds':3423,'OSReadChars':438,'OSWriteChars':5086} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} +``` +##### Last 10 failed queries +**query** +```sql +SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + query, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + databases, + tables, + columns, + used_aggregate_functions, + 
used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + ProfileEvents, + Settings + FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY + AND exception != '' +ORDER BY query_start_time DESC +LIMIT 10 +``` +**result** +``` +Row 1: +────── +type: ExceptionBeforeStart +query_start_time: 2021-12-25 15:33:29 +query_duration_ms: 0 +query_id: 323743ef-4dff-4ed3-9559-f405c64fbd4a +query_kind: Select +is_initial_query: 1 +query: SELECT + '\n' || arrayStringConcat( + arrayMap( + x, + y -> concat(x, ': ', y), + arrayMap(x -> addressToLine(x), trace), + arrayMap(x -> demangle(addressToSymbol(x)), trace)), + '\n') AS trace +FROM system.stack_trace FORMAT Vertical + +read: 0 rows / 0.00 B +written: 0 rows / 0.00 B +result: 0 rows / 0.00 B +memory usage: 0.00 B +exception: Code: 446. DB::Exception: default: Introspection functions are disabled, because setting 'allow_introspection_functions' is set to 0: While processing concat('\n', arrayStringConcat(arrayMap((x, y) -> concat(x, ': ', y), arrayMap(x -> addressToLine(x), trace), arrayMap(x -> demangle(addressToSymbol(x)), trace)), '\n')) AS trace. (FUNCTION_NOT_ALLOWED) (version 21.11.8.4 (official build)) +stack_trace: +0. DB::Exception::Exception(std::__1::basic_string, std::__1::allocator > const&, int, bool) @ 0x9b682d4 in /usr/bin/clickhouse +1. bool DB::ContextAccess::checkAccessImplHelper(DB::AccessFlags const&) const::'lambda'(std::__1::basic_string, std::__1::allocator > const&, int)::operator()(std::__1::basic_string, std::__1::allocator > const&, int) const @ 0x119786bc in /usr/bin/clickhouse +2. bool DB::ContextAccess::checkAccessImplHelper(DB::AccessFlags const&) const @ 0x11977416 in /usr/bin/clickhouse +3. DB::Context::checkAccess(DB::AccessFlags const&) const @ 0x11eb2f08 in /usr/bin/clickhouse +4. ? @ 0xf96aefb in /usr/bin/clickhouse +5. DB::FunctionFactory::tryGetImpl(std::__1::basic_string, std::__1::allocator > const&, std::__1::shared_ptr) const @ 0x118f74b4 in /usr/bin/clickhouse +6. DB::FunctionFactory::getImpl(std::__1::basic_string, std::__1::allocator > const&, std::__1::shared_ptr) const @ 0x118f71fc in /usr/bin/clickhouse +7. DB::ActionsMatcher::visit(DB::ASTFunction const&, std::__1::shared_ptr const&, DB::ActionsMatcher::Data&) @ 0x120c3abf in /usr/bin/clickhouse +8. DB::ActionsMatcher::visit(DB::ASTFunction const&, std::__1::shared_ptr const&, DB::ActionsMatcher::Data&) @ 0x120c6b9f in /usr/bin/clickhouse +9. DB::ActionsMatcher::visit(DB::ASTFunction const&, std::__1::shared_ptr const&, DB::ActionsMatcher::Data&) @ 0x120c41ed in /usr/bin/clickhouse +10. DB::ActionsMatcher::visit(DB::ASTFunction const&, std::__1::shared_ptr const&, DB::ActionsMatcher::Data&) @ 0x120c41ed in /usr/bin/clickhouse +11. DB::ActionsMatcher::visit(DB::ASTFunction const&, std::__1::shared_ptr const&, DB::ActionsMatcher::Data&) @ 0x120c41ed in /usr/bin/clickhouse +12. DB::ActionsMatcher::visit(DB::ASTExpressionList&, std::__1::shared_ptr const&, DB::ActionsMatcher::Data&) @ 0x120ca818 in /usr/bin/clickhouse +13. DB::InDepthNodeVisitor const>::visit(std::__1::shared_ptr const&) @ 0x12099bb7 in /usr/bin/clickhouse +14. DB::ExpressionAnalyzer::getRootActions(std::__1::shared_ptr const&, bool, std::__1::shared_ptr&, bool) @ 0x120999cb in /usr/bin/clickhouse +15. 
DB::SelectQueryExpressionAnalyzer::appendSelect(DB::ExpressionActionsChain&, bool) @ 0x120a4409 in /usr/bin/clickhouse +16. DB::ExpressionAnalysisResult::ExpressionAnalysisResult(DB::SelectQueryExpressionAnalyzer&, std::__1::shared_ptr const&, bool, bool, bool, std::__1::shared_ptr const&, DB::Block const&) @ 0x120a9070 in /usr/bin/clickhouse +17. DB::InterpreterSelectQuery::getSampleBlockImpl() @ 0x1232fd0d in /usr/bin/clickhouse +18. ? @ 0x12328864 in /usr/bin/clickhouse +19. DB::InterpreterSelectQuery::InterpreterSelectQuery(std::__1::shared_ptr const&, std::__1::shared_ptr, std::__1::optional, std::__1::shared_ptr const&, DB::SelectQueryOptions const&, std::__1::vector, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&, std::__1::shared_ptr const&, std::__1::unordered_map, DB::PreparedSetKey::Hash, std::__1::equal_to, std::__1::allocator > > >) @ 0x123232c7 in /usr/bin/clickhouse +20. DB::InterpreterSelectQuery::InterpreterSelectQuery(std::__1::shared_ptr const&, std::__1::shared_ptr, DB::SelectQueryOptions const&, std::__1::vector, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&) @ 0x12321c54 in /usr/bin/clickhouse +21. DB::InterpreterSelectWithUnionQuery::buildCurrentChildInterpreter(std::__1::shared_ptr const&, std::__1::vector, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&) @ 0x12547fa2 in /usr/bin/clickhouse +22. DB::InterpreterSelectWithUnionQuery::InterpreterSelectWithUnionQuery(std::__1::shared_ptr const&, std::__1::shared_ptr, DB::SelectQueryOptions const&, std::__1::vector, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&) @ 0x12546680 in /usr/bin/clickhouse +23. DB::InterpreterFactory::get(std::__1::shared_ptr&, std::__1::shared_ptr, DB::SelectQueryOptions const&) @ 0x122c6216 in /usr/bin/clickhouse +24. ? @ 0x1277dd26 in /usr/bin/clickhouse +25. DB::executeQuery(DB::ReadBuffer&, DB::WriteBuffer&, bool, std::__1::shared_ptr, std::__1::function, std::__1::allocator > const&, std::__1::basic_string, std::__1::allocator > const&, std::__1::basic_string, std::__1::allocator > const&, std::__1::basic_string, std::__1::allocator > const&)>, std::__1::optional const&) @ 0x12781319 in /usr/bin/clickhouse +26. DB::HTTPHandler::processQuery(DB::HTTPServerRequest&, DB::HTMLForm&, DB::HTTPServerResponse&, DB::HTTPHandler::Output&, std::__1::optional&) @ 0x130c20fa in /usr/bin/clickhouse +27. DB::HTTPHandler::handleRequest(DB::HTTPServerRequest&, DB::HTTPServerResponse&) @ 0x130c6760 in /usr/bin/clickhouse +28. DB::HTTPServerConnection::run() @ 0x1312b5e8 in /usr/bin/clickhouse +29. Poco::Net::TCPServerConnection::start() @ 0x15d682cf in /usr/bin/clickhouse +30. Poco::Net::TCPServerDispatcher::run() @ 0x15d6a6c1 in /usr/bin/clickhouse +31. 
Poco::PooledThread::run() @ 0x15e7f069 in /usr/bin/clickhouse + +user: default +initial_user: default +client: python-requests/2.26.0 +client_hostname: +databases: [] +tables: [] +columns: [] +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: [] +used_functions: [] +used_storages: [] +used_table_functions: [] +thread_ids: [] +ProfileEvents: {} +Settings: {} + +``` +#### Stack traces +**query** +```sql +SELECT + '\n' || arrayStringConcat( + arrayMap( + x, + y -> concat(x, ': ', y), + arrayMap(x -> addressToLine(x), trace), + arrayMap(x -> demangle(addressToSymbol(x)), trace)), + '\n') AS trace +FROM system.stack_trace +``` +**result** +``` +ClickhouseError("Code: 446. DB::Exception: default: Introspection functions are disabled, because setting 'allow_introspection_functions' is set to 0: While processing concat('\\n', arrayStringConcat(arrayMap((x, y) -> concat(x, ': ', y), arrayMap(x -> addressToLine(x), trace), arrayMap(x -> demangle(addressToSymbol(x)), trace)), '\\n')) AS trace. (FUNCTION_NOT_ALLOWED) (version 21.11.8.4 (official build))",) +``` +#### uname +**command** +``` +uname -a +``` +**result** +``` +Linux clickhouse01 5.10.76-linuxkit #1 SMP Mon Nov 8 10:21:19 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux +``` diff --git a/utils/clickhouse-diagnostics/clickhouse-diagnostics b/utils/clickhouse-diagnostics/clickhouse-diagnostics new file mode 100644 index 00000000000..ffddee0bdc4 --- /dev/null +++ b/utils/clickhouse-diagnostics/clickhouse-diagnostics @@ -0,0 +1,960 @@ +#!/usr/bin/env python3 + +import argparse +import gzip +import io +import json +import socket +import subprocess +import sys +from copy import deepcopy +from datetime import datetime +from typing import MutableMapping + +import jinja2 +import requests +import sqlparse +import tenacity +import xmltodict +import yaml + +SELECT_VERSION = r'SELECT version()' + +SELECT_UPTIME = r''' +{% if version_ge('21.3') -%} +SELECT formatReadableTimeDelta(uptime()) +{% else -%} +SELECT + toString(floor(uptime() / 3600 / 24)) || ' days ' || + toString(floor(uptime() % (24 * 3600) / 3600, 1)) || ' hours' +{% endif -%} +''' + +SELECT_SYSTEM_TABLES = "SELECT name FROM system.tables WHERE database = 'system'" + +SELECT_DATABASE_ENGINES = r'''SELECT + engine, + count() "count" +FROM system.databases +GROUP BY engine +''' + +SELECT_DATABASES = r'''SELECT + name, + engine, + tables, + partitions, + parts, + formatReadableSize(bytes_on_disk) "disk_size" +FROM system.databases db +LEFT JOIN +( + SELECT + database, + uniq(table) "tables", + uniq(table, partition) "partitions", + count() AS parts, + sum(bytes_on_disk) "bytes_on_disk" + FROM system.parts + WHERE active + GROUP BY database +) AS db_stats ON db.name = db_stats.database +ORDER BY bytes_on_disk DESC +LIMIT 10 +''' + +SELECT_TABLE_ENGINES = r'''SELECT + engine, + count() "count" +FROM system.tables +WHERE database != 'system' +GROUP BY engine +''' + +SELECT_DICTIONARIES = r'''SELECT + source, + type, + status, + count() "count" +FROM system.dictionaries +GROUP BY source, type, status +ORDER BY status DESC, source +''' + +SELECT_ACCESS = "SHOW ACCESS" + +SELECT_QUOTA_USAGE = "SHOW QUOTA" + +SELECT_REPLICAS = r'''SELECT + database, + table, + is_leader, + is_readonly, + absolute_delay, + queue_size, + inserts_in_queue, + merges_in_queue +FROM system.replicas +ORDER BY absolute_delay DESC +LIMIT 10 +''' + +SELECT_REPLICATION_QUEUE = r'''SELECT + database, + table, + replica_name, 
+ position, + node_name, + type, + source_replica, + parts_to_merge, + new_part_name, + create_time, + required_quorum, + is_detach, + is_currently_executing, + num_tries, + last_attempt_time, + last_exception, + concat('time: ', toString(last_postpone_time), ', number: ', toString(num_postponed), ', reason: ', postpone_reason) postpone +FROM system.replication_queue +ORDER BY create_time ASC +LIMIT 20 +''' + +SELECT_REPLICATED_FETCHES = r'''SELECT + database, + table, + round(elapsed, 1) "elapsed", + round(100 * progress, 1) "progress", + partition_id, + result_part_name, + result_part_path, + total_size_bytes_compressed, + bytes_read_compressed, + source_replica_path, + source_replica_hostname, + source_replica_port, + interserver_scheme, + to_detached, + thread_id +FROM system.replicated_fetches +''' + +SELECT_PARTS_PER_TABLE = r'''SELECT + database, + table, + count() "partitions", + sum(part_count) "parts", + max(part_count) "max_parts_per_partition" +FROM +( + SELECT + database, + table, + partition, + count() "part_count" + FROM system.parts + WHERE active + GROUP BY database, table, partition +) partitions +GROUP BY database, table +ORDER BY max_parts_per_partition DESC +LIMIT 10 +''' + +SELECT_MERGES = r'''SELECT + database, + table, + round(elapsed, 1) "elapsed", + round(100 * progress, 1) "progress", + is_mutation, + partition_id, +{% if version_ge('20.3') -%} + result_part_path, + source_part_paths, +{% endif -%} + num_parts, + formatReadableSize(total_size_bytes_compressed) "total_size_compressed", + formatReadableSize(bytes_read_uncompressed) "read_uncompressed", + formatReadableSize(bytes_written_uncompressed) "written_uncompressed", + columns_written, +{% if version_ge('20.3') -%} + formatReadableSize(memory_usage) "memory_usage", + thread_id +{% else -%} + formatReadableSize(memory_usage) "memory_usage" +{% endif -%} +FROM system.merges +''' + +SELECT_MUTATIONS = r'''SELECT + database, + table, + mutation_id, + command, + create_time, +{% if version_ge('20.3') -%} + parts_to_do_names, +{% endif -%} + parts_to_do, + is_done, + latest_failed_part, + latest_fail_time, + latest_fail_reason +FROM system.mutations +WHERE NOT is_done +ORDER BY create_time DESC +''' + +SELECT_RECENT_DATA_PARTS = r'''SELECT + database, + table, + engine, + partition_id, + name, +{% if version_ge('20.3') -%} + part_type, +{% endif -%} + active, + level, +{% if version_ge('20.3') -%} + disk_name, +{% endif -%} + path, + marks, + rows, + bytes_on_disk, + data_compressed_bytes, + data_uncompressed_bytes, + marks_bytes, + modification_time, + remove_time, + refcount, + is_frozen, + min_date, + max_date, + min_time, + max_time, + min_block_number, + max_block_number +FROM system.parts +WHERE modification_time > now() - INTERVAL 3 MINUTE +ORDER BY modification_time DESC +''' + +SELECT_DETACHED_DATA_PARTS = r'''SELECT + database, + table, + partition_id, + name, + disk, + reason, + min_block_number, + max_block_number, + level +FROM system.detached_parts +''' + +SELECT_PROCESSES = r'''SELECT + elapsed, + query_id, + {% if normalize_queries -%} + normalizeQuery(query) AS normalized_query, + {% else -%} + query, + {% endif -%} + is_cancelled, + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + formatReadableSize(memory_usage) AS "memory usage", + user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', 
toString(client_version_minor), '.', toString(client_version_patch))) AS client, + {% if version_ge('21.3') -%} + thread_ids, + {% endif -%} + {% if version_ge('21.8') -%} + ProfileEvents, + Settings + {% else -%} + ProfileEvents.Names, + ProfileEvents.Values, + Settings.Names, + Settings.Values + {% endif -%} +FROM system.processes +ORDER BY elapsed DESC +''' + +SELECT_TOP_QUERIES_BY_DURATION = r'''SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + {% if normalize_queries -%} + normalizeQuery(query) AS normalized_query, + {% else -%} + query, + {% endif -%} + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + {% if version_ge('21.3') -%} + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + {% endif -%} + {% if version_ge('21.8') -%} + ProfileEvents, + Settings + {% else -%} + ProfileEvents.Names, + ProfileEvents.Values, + Settings.Names, + Settings.Values + {% endif -%} +FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY +ORDER BY query_duration_ms DESC +LIMIT 10 +''' + +SELECT_TOP_QUERIES_BY_MEMORY_USAGE = r'''SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + {% if normalize_queries -%} + normalizeQuery(query) AS normalized_query, + {% else -%} + query, + {% endif -%} + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + {% if version_ge('21.3') -%} + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + {% endif -%} + {% if version_ge('21.8') -%} + ProfileEvents, + Settings + {% else -%} + ProfileEvents.Names, + ProfileEvents.Values, + Settings.Names, + Settings.Values + {% endif -%} +FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY +ORDER BY memory_usage DESC +LIMIT 10 +''' + +SELECT_FAILED_QUERIES = r'''SELECT + type, + query_start_time, + query_duration_ms, + query_id, + query_kind, + is_initial_query, + {% if normalize_queries -%} + 
normalizeQuery(query) AS normalized_query, + {% else -%} + query, + {% endif -%} + concat(toString(read_rows), ' rows / ', formatReadableSize(read_bytes)) AS read, + concat(toString(written_rows), ' rows / ', formatReadableSize(written_bytes)) AS written, + concat(toString(result_rows), ' rows / ', formatReadableSize(result_bytes)) AS result, + formatReadableSize(memory_usage) AS "memory usage", + exception, + '\n' || stack_trace AS stack_trace, + user, + initial_user, + multiIf(empty(client_name), http_user_agent, concat(client_name, ' ', toString(client_version_major), '.', toString(client_version_minor), '.', toString(client_version_patch))) AS client, + client_hostname, + {% if version_ge('21.3') -%} + databases, + tables, + columns, + used_aggregate_functions, + used_aggregate_function_combinators, + used_database_engines, + used_data_type_families, + used_dictionaries, + used_formats, + used_functions, + used_storages, + used_table_functions, + thread_ids, + {% endif -%} + {% if version_ge('21.8') -%} + ProfileEvents, + Settings + {% else -%} + ProfileEvents.Names, + ProfileEvents.Values, + Settings.Names, + Settings.Values + {% endif -%} +FROM system.query_log +WHERE type != 'QueryStart' + AND event_date >= today() - 1 + AND event_time >= now() - INTERVAL 1 DAY + AND exception != '' +ORDER BY query_start_time DESC +LIMIT 10 +''' + +SELECT_STACK_TRACES = r'''SELECT + '\n' || arrayStringConcat( + arrayMap( + x, + y -> concat(x, ': ', y), + arrayMap(x -> addressToLine(x), trace), + arrayMap(x -> demangle(addressToSymbol(x)), trace)), + '\n') AS trace +FROM system.stack_trace +''' + +SELECT_CRASH_LOG = r'''SELECT + event_time, + signal, + thread_id, + query_id, + '\n' || arrayStringConcat(trace_full, '\n') AS trace, + version +FROM system.crash_log +ORDER BY event_time DESC +''' + + +def retry(exception_types, max_attempts=5, max_interval=5): + """ + Function decorator that retries wrapped function on failures. + """ + return tenacity.retry( + retry=tenacity.retry_if_exception_type(exception_types), + wait=tenacity.wait_random_exponential(multiplier=0.5, max=max_interval), + stop=tenacity.stop_after_attempt(max_attempts), + reraise=True) + + +class ClickhouseError(Exception): + """ + ClickHouse interaction error. + """ + + def __init__(self, response): + self.response = response + super().__init__(self.response.text.strip()) + + +class ClickhouseClient: + """ + ClickHouse client. + """ + + def __init__(self, *, host, port=8123, user=None): + self._session = requests.Session() + if user: + self._session.headers['X-ClickHouse-User'] = user + self._url = f'http://{host}:{port}' + self._timeout = 60 + self._ch_version = None + + @property + def clickhouse_version(self): + if self._ch_version is None: + self._ch_version = self.query(SELECT_VERSION) + + return self._ch_version + + @retry(requests.exceptions.ConnectionError) + def query(self, query, query_args=None, format=None, post_data=None, timeout=None, echo=False, dry_run=False): + """ + Execute query. 
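+
+        Renders the query template when query_args are given, appends a FORMAT
+        clause if one is requested and sends the query over the HTTP interface.
+        JSON and JSONCompact results are returned parsed; other formats are
+        returned as stripped text.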
+ """ + if query_args: + query = self.render_query(query, **query_args) + + if format: + query += f' FORMAT {format}' + + if timeout is None: + timeout = self._timeout + + if echo: + print(sqlparse.format(query, reindent=True), '\n') + + if dry_run: + return None + + try: + response = self._session.post(self._url, + params={ + 'query': query, + }, + json=post_data, + timeout=timeout) + + response.raise_for_status() + + if format in ('JSON', 'JSONCompact'): + return response.json() + + return response.text.strip() + except requests.exceptions.HTTPError as e: + raise ClickhouseError(e.response) from None + + def render_query(self, query, **kwargs): + env = jinja2.Environment() + + env.globals['version_ge'] = lambda version: version_ge(self.clickhouse_version, version) + + template = env.from_string(query) + return template.render(kwargs) + + +class ClickhouseConfig: + """ + ClickHouse server configuration. + """ + + def __init__(self, config): + self._config = config + + def dump(self, mask_secrets=True): + config = deepcopy(self._config) + if mask_secrets: + self._mask_secrets(config) + + return xmltodict.unparse(config, pretty=True) + + @classmethod + def load(cls): + return ClickhouseConfig(cls._load_config('/var/lib/clickhouse/preprocessed_configs/config.xml')) + + @staticmethod + def _load_config(config_path): + with open(config_path, 'r') as file: + return xmltodict.parse(file.read()) + + @classmethod + def _mask_secrets(cls, config): + if isinstance(config, MutableMapping): + for key, value in list(config.items()): + if isinstance(value, MutableMapping): + cls._mask_secrets(config[key]) + elif key in ('password', 'secret_access_key', 'header', 'identity'): + config[key] = '*****' + + +class DiagnosticsData: + """ + Diagnostics data. + """ + + def __init__(self, args, host): + self.args = args + self.host = host + self._sections = [{'section': None, 'data': {}}] + + def add_string(self, name, value, section=None): + self._section(section)[name] = { + 'type': 'string', + 'value': value, + } + + def add_xml_document(self, name, document, section=None): + self._section(section)[name] = { + 'type': 'xml', + 'value': document, + } + + def add_query(self, name, query, result, section=None): + self._section(section)[name] = { + 'type': 'query', + 'query': query, + 'result': result, + } + + def add_command(self, name, command, result, section=None): + self._section(section)[name] = { + 'type': 'command', + 'command': command, + 'result': result, + } + + def dump(self, format): + if format.startswith('json'): + result = self._dump_json() + elif format.startswith('yaml'): + result = self._dump_yaml() + else: + result = self._dump_wiki() + + if format.endswith('.gz'): + compressor = gzip.GzipFile(mode='wb', fileobj=sys.stdout.buffer) + compressor.write(result.encode()) + else: + print(result) + + def _section(self, name=None): + if self._sections[-1]['section'] != name: + self._sections.append({'section': name, 'data': {}}) + + return self._sections[-1]['data'] + + def _dump_json(self): + """ + Dump diagnostic data in JSON format. + """ + return json.dumps(self._sections, indent=2, ensure_ascii=False) + + def _dump_yaml(self): + """ + Dump diagnostic data in YAML format. + """ + return yaml.dump(self._sections, default_flow_style=False, allow_unicode=True) + + def _dump_wiki(self): + """ + Dump diagnostic data in Yandex wiki format. 
+ """ + + def _write_title(buffer, value): + buffer.write(f'### {value}\n') + + def _write_subtitle(buffer, value): + buffer.write(f'#### {value}\n') + + def _write_string_item(buffer, name, item): + value = item['value'] + if value != '': + value = f'**{value}**' + buffer.write(f'{name}: {value}\n') + + def _write_xml_item(buffer, section_name, name, item): + if section_name: + buffer.write(f'##### {name}\n') + else: + _write_subtitle(buffer, name) + + _write_result(buffer, item['value'], format='XML') + + def _write_query_item(buffer, section_name, name, item): + if section_name: + buffer.write(f'##### {name}\n') + else: + _write_subtitle(buffer, name) + + _write_query(buffer, item['query']) + _write_result(buffer, item['result']) + + def _write_command_item(buffer, section_name, name, item): + if section_name: + buffer.write(f'##### {name}\n') + else: + _write_subtitle(buffer, name) + + _write_command(buffer, item['command']) + _write_result(buffer, item['result']) + + def _write_unknown_item(buffer, section_name, name, item): + if section_name: + buffer.write(f'**{name}**\n') + else: + _write_subtitle(buffer, name) + + json.dump(item, buffer, indent=2) + + def _write_query(buffer, query): + buffer.write('**query**\n') + buffer.write('```sql\n') + buffer.write(query) + buffer.write('\n```\n') + + def _write_command(buffer, command): + buffer.write('**command**\n') + buffer.write('```\n') + buffer.write(command) + buffer.write('\n```\n') + + def _write_result(buffer, result, format=None): + buffer.write('**result**\n') + buffer.write(f'```{format}\n' if format else '```\n') + buffer.write(result) + buffer.write('\n```\n') + + buffer = io.StringIO() + + _write_title(buffer, f'Diagnostics data for host {self.host}') + for section in self._sections: + section_name = section['section'] + if section_name: + _write_subtitle(buffer, section_name) + + for name, item in section['data'].items(): + if item['type'] == 'string': + _write_string_item(buffer, name, item) + elif item['type'] == 'query': + _write_query_item(buffer, section_name, name, item) + elif item['type'] == 'command': + _write_command_item(buffer, section_name, name, item) + elif item['type'] == 'xml': + _write_xml_item(buffer, section_name, name, item) + else: + _write_unknown_item(buffer, section_name, name, item) + + return buffer.getvalue() + + +def main(): + """ + Program entry point. 
+ """ + args = parse_args() + + host = socket.getfqdn() + timestamp = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S') + client = ClickhouseClient(host=host) + ch_config = ClickhouseConfig.load() + version = client.clickhouse_version + system_tables = [row[0] for row in execute_query(client, SELECT_SYSTEM_TABLES, format='JSONCompact')['data']] + + diagnostics = DiagnosticsData(args, host) + diagnostics.add_string('Version', version) + diagnostics.add_string('Timestamp', timestamp) + diagnostics.add_string('Uptime', execute_query(client, SELECT_UPTIME)) + + diagnostics.add_xml_document('ClickHouse configuration', ch_config.dump()) + + if version_ge(version, '20.8'): + add_query(diagnostics, 'Access configuration', + client=client, + query=SELECT_ACCESS, + format='TSVRaw') + add_query(diagnostics, 'Quotas', + client=client, + query=SELECT_QUOTA_USAGE, + format='Vertical') + + add_query(diagnostics, 'Database engines', + client=client, + query=SELECT_DATABASE_ENGINES, + format='PrettyCompactNoEscapes', + section='Schema') + add_query(diagnostics, 'Databases (top 10 by size)', + client=client, + query=SELECT_DATABASES, + format='PrettyCompactNoEscapes', + section='Schema') + add_query(diagnostics, 'Table engines', + client=client, + query=SELECT_TABLE_ENGINES, + format='PrettyCompactNoEscapes', + section='Schema') + add_query(diagnostics, 'Dictionaries', + client=client, + query=SELECT_DICTIONARIES, + format='PrettyCompactNoEscapes', + section='Schema') + + add_query(diagnostics, 'Replicated tables (top 10 by absolute delay)', + client=client, + query=SELECT_REPLICAS, + format='PrettyCompactNoEscapes', + section='Replication') + add_query(diagnostics, 'Replication queue (top 20 oldest tasks)', + client=client, + query=SELECT_REPLICATION_QUEUE, + format='Vertical', + section='Replication') + if version_ge(version, '21.3'): + add_query(diagnostics, 'Replicated fetches', + client=client, + query=SELECT_REPLICATED_FETCHES, + format='Vertical', + section='Replication') + + add_query(diagnostics, 'Top 10 tables by max parts per partition', + client=client, + query=SELECT_PARTS_PER_TABLE, + format='PrettyCompactNoEscapes') + add_query(diagnostics, 'Merges in progress', + client=client, + query=SELECT_MERGES, + format='Vertical') + add_query(diagnostics, 'Mutations in progress', + client=client, + query=SELECT_MUTATIONS, + format='Vertical') + add_query(diagnostics, 'Recent data parts (modification time within last 3 minutes)', + client=client, + query=SELECT_RECENT_DATA_PARTS, + format='Vertical') + + add_query(diagnostics, 'system.detached_parts', + client=client, + query=SELECT_DETACHED_DATA_PARTS, + format='PrettyCompactNoEscapes', + section='Detached data') + add_command(diagnostics, 'Disk space usage', + command='du -sh -L -c /var/lib/clickhouse/data/*/*/detached/* | sort -rsh', + section='Detached data') + + add_query(diagnostics, 'Queries in progress (process list)', + client=client, + query=SELECT_PROCESSES, + format='Vertical', + section='Queries') + add_query(diagnostics, 'Top 10 queries by duration', + client=client, + query=SELECT_TOP_QUERIES_BY_DURATION, + format='Vertical', + section='Queries') + add_query(diagnostics, 'Top 10 queries by memory usage', + client=client, + query=SELECT_TOP_QUERIES_BY_MEMORY_USAGE, + format='Vertical', + section='Queries') + add_query(diagnostics, 'Last 10 failed queries', + client=client, + query=SELECT_FAILED_QUERIES, + format='Vertical', + section='Queries') + + add_query(diagnostics, 'Stack traces', + client=client, + query=SELECT_STACK_TRACES, + 
format='Vertical') + + if 'crash_log' in system_tables: + add_query(diagnostics, 'Crash log', + client=client, + query=SELECT_CRASH_LOG, + format='Vertical') + + add_command(diagnostics, 'uname', 'uname -a') + + diagnostics.dump(args.format) + + +def parse_args(): + """ + Parse command-line arguments. + """ + parser = argparse.ArgumentParser() + parser.add_argument('--format', + choices=['json', 'yaml', 'json.gz', 'yaml.gz', 'wiki', 'wiki.gz'], + default='wiki') + parser.add_argument('--normalize-queries', + action='store_true', + default=False) + return parser.parse_args() + + +def add_query(diagnostics, name, client, query, format, section=None): + query_args = { + 'normalize_queries': diagnostics.args.normalize_queries, + } + query = client.render_query(query, **query_args) + diagnostics.add_query( + name=name, + query=query, + result=execute_query(client, query, render_query=False, format=format), + section=section) + + +def execute_query(client, query, render_query=True, format=None): + if render_query: + query = client.render_query(query) + + try: + return client.query(query, format=format) + except Exception as e: + return repr(e) + + +def add_command(diagnostics, name, command, section=None): + diagnostics.add_command( + name=name, + command=command, + result=execute_command(command), + section=section) + + +def execute_command(command, input=None): + proc = subprocess.Popen(command, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + if isinstance(input, str): + input = input.encode() + + stdout, stderr = proc.communicate(input=input) + + if proc.returncode: + return f'failed with exit code {proc.returncode}\n{stderr.decode()}' + + return stdout.decode() + + +def version_ge(version1, version2): + """ + Return True if version1 is greater or equal than version2. + """ + return parse_version(version1) >= parse_version(version2) + + +def parse_version(version): + """ + Parse version string. 
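+
+    For example, '21.11.8.4' becomes [21, 11, 8, 4].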
+ """ + return [int(x) for x in version.strip().split('.')] + + +if __name__ == '__main__': + main() diff --git a/utils/clickhouse-diagnostics/requirements.txt b/utils/clickhouse-diagnostics/requirements.txt new file mode 100644 index 00000000000..1d2b6ef3916 --- /dev/null +++ b/utils/clickhouse-diagnostics/requirements.txt @@ -0,0 +1,6 @@ +Jinja2 +PyYAML +requests +sqlparse +tenacity +xmltodict diff --git a/utils/grammar-fuzzer/ClickHouseUnlexer.py b/utils/grammar-fuzzer/ClickHouseUnlexer.py deleted file mode 100644 index c91522bd7be..00000000000 --- a/utils/grammar-fuzzer/ClickHouseUnlexer.py +++ /dev/null @@ -1,1771 +0,0 @@ -# Generated by Grammarinator 19.3 - -from itertools import chain -from grammarinator.runtime import * - -charset_0 = list(chain(*multirange_diff(printable_unicode_ranges, [(39, 40),(92, 93)]))) -charset_1 = list(chain(range(97, 98), range(65, 66))) -charset_2 = list(chain(range(98, 99), range(66, 67))) -charset_3 = list(chain(range(99, 100), range(67, 68))) -charset_4 = list(chain(range(100, 101), range(68, 69))) -charset_5 = list(chain(range(101, 102), range(69, 70))) -charset_6 = list(chain(range(102, 103), range(70, 71))) -charset_7 = list(chain(range(103, 104), range(71, 72))) -charset_8 = list(chain(range(104, 105), range(72, 73))) -charset_9 = list(chain(range(105, 106), range(73, 74))) -charset_10 = list(chain(range(106, 107), range(74, 75))) -charset_11 = list(chain(range(107, 108), range(75, 76))) -charset_12 = list(chain(range(108, 109), range(76, 77))) -charset_13 = list(chain(range(109, 110), range(77, 78))) -charset_14 = list(chain(range(110, 111), range(78, 79))) -charset_15 = list(chain(range(111, 112), range(79, 80))) -charset_16 = list(chain(range(112, 113), range(80, 81))) -charset_17 = list(chain(range(113, 114), range(81, 82))) -charset_18 = list(chain(range(114, 115), range(82, 83))) -charset_19 = list(chain(range(115, 116), range(83, 84))) -charset_20 = list(chain(range(116, 117), range(84, 85))) -charset_21 = list(chain(range(117, 118), range(85, 86))) -charset_22 = list(chain(range(118, 119), range(86, 87))) -charset_23 = list(chain(range(119, 120), range(87, 88))) -charset_24 = list(chain(range(120, 121), range(88, 89))) -charset_25 = list(chain(range(121, 122), range(89, 90))) -charset_26 = list(chain(range(122, 123), range(90, 91))) -charset_27 = list(chain(range(97, 123), range(65, 91))) -charset_28 = list(chain(range(48, 58))) -charset_29 = list(chain(range(48, 58), range(97, 103), range(65, 71))) -charset_30 = list(chain(*multirange_diff(printable_unicode_ranges, [(92, 93),(92, 93)]))) -charset_31 = list(chain(range(32, 33), range(11, 12), range(12, 13), range(9, 10), range(13, 14), range(10, 11))) - - -class ClickHouseUnlexer(Grammarinator): - - def __init__(self, *, max_depth=float('inf'), weights=None, cooldown=1.0): - super(ClickHouseUnlexer, self).__init__() - self.unlexer = self - self.max_depth = max_depth - self.weights = weights or dict() - self.cooldown = cooldown - - def EOF(self, *args, **kwargs): - pass - - @depthcontrol - def INTERVAL_TYPE(self): - current = self.create_node(UnlexerRule(name='INTERVAL_TYPE')) - choice = self.choice([0 if [2, 2, 2, 2, 2, 2, 2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_0', i), 1) for i, w in enumerate([1, 1, 1, 1, 1, 1, 1, 1])]) - self.unlexer.weights[('alt_0', choice)] = self.unlexer.weights.get(('alt_0', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.SECOND() - elif choice == 1: - current += self.unlexer.MINUTE() - elif choice 
== 2: - current += self.unlexer.HOUR() - elif choice == 3: - current += self.unlexer.DAY() - elif choice == 4: - current += self.unlexer.WEEK() - elif choice == 5: - current += self.unlexer.MONTH() - elif choice == 6: - current += self.unlexer.QUARTER() - elif choice == 7: - current += self.unlexer.YEAR() - return current - INTERVAL_TYPE.min_depth = 2 - - @depthcontrol - def ALIAS(self): - current = self.create_node(UnlexerRule(name='ALIAS')) - current += self.unlexer.A() - current += self.unlexer.L() - current += self.unlexer.I() - current += self.unlexer.A() - current += self.unlexer.S() - return current - ALIAS.min_depth = 1 - - @depthcontrol - def ALL(self): - current = self.create_node(UnlexerRule(name='ALL')) - current += self.unlexer.A() - current += self.unlexer.L() - current += self.unlexer.L() - return current - ALL.min_depth = 1 - - @depthcontrol - def AND(self): - current = self.create_node(UnlexerRule(name='AND')) - current += self.unlexer.A() - current += self.unlexer.N() - current += self.unlexer.D() - return current - AND.min_depth = 1 - - @depthcontrol - def ANTI(self): - current = self.create_node(UnlexerRule(name='ANTI')) - current += self.unlexer.A() - current += self.unlexer.N() - current += self.unlexer.T() - current += self.unlexer.I() - return current - ANTI.min_depth = 1 - - @depthcontrol - def ANY(self): - current = self.create_node(UnlexerRule(name='ANY')) - current += self.unlexer.A() - current += self.unlexer.N() - current += self.unlexer.Y() - return current - ANY.min_depth = 1 - - @depthcontrol - def ARRAY(self): - current = self.create_node(UnlexerRule(name='ARRAY')) - current += self.unlexer.A() - current += self.unlexer.R() - current += self.unlexer.R() - current += self.unlexer.A() - current += self.unlexer.Y() - return current - ARRAY.min_depth = 1 - - @depthcontrol - def AS(self): - current = self.create_node(UnlexerRule(name='AS')) - current += self.unlexer.A() - current += self.unlexer.S() - return current - AS.min_depth = 1 - - @depthcontrol - def ASCENDING(self): - current = self.create_node(UnlexerRule(name='ASCENDING')) - choice = self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_9', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_9', choice)] = self.unlexer.weights.get(('alt_9', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.A() - current += self.unlexer.S() - current += self.unlexer.C() - elif choice == 1: - current += self.unlexer.A() - current += self.unlexer.S() - current += self.unlexer.C() - current += self.unlexer.E() - current += self.unlexer.N() - current += self.unlexer.D() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.G() - return current - ASCENDING.min_depth = 1 - - @depthcontrol - def ASOF(self): - current = self.create_node(UnlexerRule(name='ASOF')) - current += self.unlexer.A() - current += self.unlexer.S() - current += self.unlexer.O() - current += self.unlexer.F() - return current - ASOF.min_depth = 1 - - @depthcontrol - def BETWEEN(self): - current = self.create_node(UnlexerRule(name='BETWEEN')) - current += self.unlexer.B() - current += self.unlexer.E() - current += self.unlexer.T() - current += self.unlexer.W() - current += self.unlexer.E() - current += self.unlexer.E() - current += self.unlexer.N() - return current - BETWEEN.min_depth = 1 - - @depthcontrol - def BOTH(self): - current = self.create_node(UnlexerRule(name='BOTH')) - current += self.unlexer.B() - current += self.unlexer.O() - 
current += self.unlexer.T() - current += self.unlexer.H() - return current - BOTH.min_depth = 1 - - @depthcontrol - def BY(self): - current = self.create_node(UnlexerRule(name='BY')) - current += self.unlexer.B() - current += self.unlexer.Y() - return current - BY.min_depth = 1 - - @depthcontrol - def CASE(self): - current = self.create_node(UnlexerRule(name='CASE')) - current += self.unlexer.C() - current += self.unlexer.A() - current += self.unlexer.S() - current += self.unlexer.E() - return current - CASE.min_depth = 1 - - @depthcontrol - def CAST(self): - current = self.create_node(UnlexerRule(name='CAST')) - current += self.unlexer.C() - current += self.unlexer.A() - current += self.unlexer.S() - current += self.unlexer.T() - return current - CAST.min_depth = 1 - - @depthcontrol - def CLUSTER(self): - current = self.create_node(UnlexerRule(name='CLUSTER')) - current += self.unlexer.C() - current += self.unlexer.L() - current += self.unlexer.U() - current += self.unlexer.S() - current += self.unlexer.T() - current += self.unlexer.E() - current += self.unlexer.R() - return current - CLUSTER.min_depth = 1 - - @depthcontrol - def COLLATE(self): - current = self.create_node(UnlexerRule(name='COLLATE')) - current += self.unlexer.C() - current += self.unlexer.O() - current += self.unlexer.L() - current += self.unlexer.L() - current += self.unlexer.A() - current += self.unlexer.T() - current += self.unlexer.E() - return current - COLLATE.min_depth = 1 - - @depthcontrol - def CREATE(self): - current = self.create_node(UnlexerRule(name='CREATE')) - current += self.unlexer.C() - current += self.unlexer.R() - current += self.unlexer.E() - current += self.unlexer.A() - current += self.unlexer.T() - current += self.unlexer.E() - return current - CREATE.min_depth = 1 - - @depthcontrol - def CROSS(self): - current = self.create_node(UnlexerRule(name='CROSS')) - current += self.unlexer.C() - current += self.unlexer.R() - current += self.unlexer.O() - current += self.unlexer.S() - current += self.unlexer.S() - return current - CROSS.min_depth = 1 - - @depthcontrol - def DATABASE(self): - current = self.create_node(UnlexerRule(name='DATABASE')) - current += self.unlexer.D() - current += self.unlexer.A() - current += self.unlexer.T() - current += self.unlexer.A() - current += self.unlexer.B() - current += self.unlexer.A() - current += self.unlexer.S() - current += self.unlexer.E() - return current - DATABASE.min_depth = 1 - - @depthcontrol - def DAY(self): - current = self.create_node(UnlexerRule(name='DAY')) - current += self.unlexer.D() - current += self.unlexer.A() - current += self.unlexer.Y() - return current - DAY.min_depth = 1 - - @depthcontrol - def DEFAULT(self): - current = self.create_node(UnlexerRule(name='DEFAULT')) - current += self.unlexer.D() - current += self.unlexer.E() - current += self.unlexer.F() - current += self.unlexer.A() - current += self.unlexer.U() - current += self.unlexer.L() - current += self.unlexer.T() - return current - DEFAULT.min_depth = 1 - - @depthcontrol - def DELETE(self): - current = self.create_node(UnlexerRule(name='DELETE')) - current += self.unlexer.D() - current += self.unlexer.E() - current += self.unlexer.L() - current += self.unlexer.E() - current += self.unlexer.T() - current += self.unlexer.E() - return current - DELETE.min_depth = 1 - - @depthcontrol - def DESCENDING(self): - current = self.create_node(UnlexerRule(name='DESCENDING')) - choice = self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_12', i), 1) for 
i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_12', choice)] = self.unlexer.weights.get(('alt_12', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.D() - current += self.unlexer.E() - current += self.unlexer.S() - current += self.unlexer.C() - elif choice == 1: - current += self.unlexer.D() - current += self.unlexer.E() - current += self.unlexer.S() - current += self.unlexer.C() - current += self.unlexer.E() - current += self.unlexer.N() - current += self.unlexer.D() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.G() - return current - DESCENDING.min_depth = 1 - - @depthcontrol - def DISK(self): - current = self.create_node(UnlexerRule(name='DISK')) - current += self.unlexer.D() - current += self.unlexer.I() - current += self.unlexer.S() - current += self.unlexer.K() - return current - DISK.min_depth = 1 - - @depthcontrol - def DISTINCT(self): - current = self.create_node(UnlexerRule(name='DISTINCT')) - current += self.unlexer.D() - current += self.unlexer.I() - current += self.unlexer.S() - current += self.unlexer.T() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.C() - current += self.unlexer.T() - return current - DISTINCT.min_depth = 1 - - @depthcontrol - def DROP(self): - current = self.create_node(UnlexerRule(name='DROP')) - current += self.unlexer.D() - current += self.unlexer.R() - current += self.unlexer.O() - current += self.unlexer.P() - return current - DROP.min_depth = 1 - - @depthcontrol - def ELSE(self): - current = self.create_node(UnlexerRule(name='ELSE')) - current += self.unlexer.E() - current += self.unlexer.L() - current += self.unlexer.S() - current += self.unlexer.E() - return current - ELSE.min_depth = 1 - - @depthcontrol - def END(self): - current = self.create_node(UnlexerRule(name='END')) - current += self.unlexer.E() - current += self.unlexer.N() - current += self.unlexer.D() - return current - END.min_depth = 1 - - @depthcontrol - def ENGINE(self): - current = self.create_node(UnlexerRule(name='ENGINE')) - current += self.unlexer.E() - current += self.unlexer.N() - current += self.unlexer.G() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.E() - return current - ENGINE.min_depth = 1 - - @depthcontrol - def EXISTS(self): - current = self.create_node(UnlexerRule(name='EXISTS')) - current += self.unlexer.E() - current += self.unlexer.X() - current += self.unlexer.I() - current += self.unlexer.S() - current += self.unlexer.T() - current += self.unlexer.S() - return current - EXISTS.min_depth = 1 - - @depthcontrol - def EXTRACT(self): - current = self.create_node(UnlexerRule(name='EXTRACT')) - current += self.unlexer.E() - current += self.unlexer.X() - current += self.unlexer.T() - current += self.unlexer.R() - current += self.unlexer.A() - current += self.unlexer.C() - current += self.unlexer.T() - return current - EXTRACT.min_depth = 1 - - @depthcontrol - def FINAL(self): - current = self.create_node(UnlexerRule(name='FINAL')) - current += self.unlexer.F() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.A() - current += self.unlexer.L() - return current - FINAL.min_depth = 1 - - @depthcontrol - def FIRST(self): - current = self.create_node(UnlexerRule(name='FIRST')) - current += self.unlexer.F() - current += self.unlexer.I() - current += self.unlexer.R() - current += self.unlexer.S() - current += self.unlexer.T() - return current - FIRST.min_depth = 1 - - @depthcontrol - 
def FORMAT(self): - current = self.create_node(UnlexerRule(name='FORMAT')) - current += self.unlexer.F() - current += self.unlexer.O() - current += self.unlexer.R() - current += self.unlexer.M() - current += self.unlexer.A() - current += self.unlexer.T() - return current - FORMAT.min_depth = 1 - - @depthcontrol - def FROM(self): - current = self.create_node(UnlexerRule(name='FROM')) - current += self.unlexer.F() - current += self.unlexer.R() - current += self.unlexer.O() - current += self.unlexer.M() - return current - FROM.min_depth = 1 - - @depthcontrol - def FULL(self): - current = self.create_node(UnlexerRule(name='FULL')) - current += self.unlexer.F() - current += self.unlexer.U() - current += self.unlexer.L() - current += self.unlexer.L() - return current - FULL.min_depth = 1 - - @depthcontrol - def GLOBAL(self): - current = self.create_node(UnlexerRule(name='GLOBAL')) - current += self.unlexer.G() - current += self.unlexer.L() - current += self.unlexer.O() - current += self.unlexer.B() - current += self.unlexer.A() - current += self.unlexer.L() - return current - GLOBAL.min_depth = 1 - - @depthcontrol - def GROUP(self): - current = self.create_node(UnlexerRule(name='GROUP')) - current += self.unlexer.G() - current += self.unlexer.R() - current += self.unlexer.O() - current += self.unlexer.U() - current += self.unlexer.P() - return current - GROUP.min_depth = 1 - - @depthcontrol - def HAVING(self): - current = self.create_node(UnlexerRule(name='HAVING')) - current += self.unlexer.H() - current += self.unlexer.A() - current += self.unlexer.V() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.G() - return current - HAVING.min_depth = 1 - - @depthcontrol - def HOUR(self): - current = self.create_node(UnlexerRule(name='HOUR')) - current += self.unlexer.H() - current += self.unlexer.O() - current += self.unlexer.U() - current += self.unlexer.R() - return current - HOUR.min_depth = 1 - - @depthcontrol - def IF(self): - current = self.create_node(UnlexerRule(name='IF')) - current += self.unlexer.I() - current += self.unlexer.F() - return current - IF.min_depth = 1 - - @depthcontrol - def IN(self): - current = self.create_node(UnlexerRule(name='IN')) - current += self.unlexer.I() - current += self.unlexer.N() - return current - IN.min_depth = 1 - - @depthcontrol - def INF(self): - current = self.create_node(UnlexerRule(name='INF')) - choice = self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_15', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_15', choice)] = self.unlexer.weights.get(('alt_15', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.F() - elif choice == 1: - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.F() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.I() - current += self.unlexer.T() - current += self.unlexer.Y() - return current - INF.min_depth = 1 - - @depthcontrol - def INNER(self): - current = self.create_node(UnlexerRule(name='INNER')) - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.N() - current += self.unlexer.E() - current += self.unlexer.R() - return current - INNER.min_depth = 1 - - @depthcontrol - def INSERT(self): - current = self.create_node(UnlexerRule(name='INSERT')) - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.S() - 
current += self.unlexer.E() - current += self.unlexer.R() - current += self.unlexer.T() - return current - INSERT.min_depth = 1 - - @depthcontrol - def INTERVAL(self): - current = self.create_node(UnlexerRule(name='INTERVAL')) - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.T() - current += self.unlexer.E() - current += self.unlexer.R() - current += self.unlexer.V() - current += self.unlexer.A() - current += self.unlexer.L() - return current - INTERVAL.min_depth = 1 - - @depthcontrol - def INTO(self): - current = self.create_node(UnlexerRule(name='INTO')) - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.T() - current += self.unlexer.O() - return current - INTO.min_depth = 1 - - @depthcontrol - def IS(self): - current = self.create_node(UnlexerRule(name='IS')) - current += self.unlexer.I() - current += self.unlexer.S() - return current - IS.min_depth = 1 - - @depthcontrol - def JOIN(self): - current = self.create_node(UnlexerRule(name='JOIN')) - current += self.unlexer.J() - current += self.unlexer.O() - current += self.unlexer.I() - current += self.unlexer.N() - return current - JOIN.min_depth = 1 - - @depthcontrol - def KEY(self): - current = self.create_node(UnlexerRule(name='KEY')) - current += self.unlexer.K() - current += self.unlexer.E() - current += self.unlexer.Y() - return current - KEY.min_depth = 1 - - @depthcontrol - def LAST(self): - current = self.create_node(UnlexerRule(name='LAST')) - current += self.unlexer.L() - current += self.unlexer.A() - current += self.unlexer.S() - current += self.unlexer.T() - return current - LAST.min_depth = 1 - - @depthcontrol - def LEADING(self): - current = self.create_node(UnlexerRule(name='LEADING')) - current += self.unlexer.L() - current += self.unlexer.E() - current += self.unlexer.A() - current += self.unlexer.D() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.G() - return current - LEADING.min_depth = 1 - - @depthcontrol - def LEFT(self): - current = self.create_node(UnlexerRule(name='LEFT')) - current += self.unlexer.L() - current += self.unlexer.E() - current += self.unlexer.F() - current += self.unlexer.T() - return current - LEFT.min_depth = 1 - - @depthcontrol - def LIKE(self): - current = self.create_node(UnlexerRule(name='LIKE')) - current += self.unlexer.L() - current += self.unlexer.I() - current += self.unlexer.K() - current += self.unlexer.E() - return current - LIKE.min_depth = 1 - - @depthcontrol - def LIMIT(self): - current = self.create_node(UnlexerRule(name='LIMIT')) - current += self.unlexer.L() - current += self.unlexer.I() - current += self.unlexer.M() - current += self.unlexer.I() - current += self.unlexer.T() - return current - LIMIT.min_depth = 1 - - @depthcontrol - def LOCAL(self): - current = self.create_node(UnlexerRule(name='LOCAL')) - current += self.unlexer.L() - current += self.unlexer.O() - current += self.unlexer.C() - current += self.unlexer.A() - current += self.unlexer.L() - return current - LOCAL.min_depth = 1 - - @depthcontrol - def MATERIALIZED(self): - current = self.create_node(UnlexerRule(name='MATERIALIZED')) - current += self.unlexer.M() - current += self.unlexer.A() - current += self.unlexer.T() - current += self.unlexer.E() - current += self.unlexer.R() - current += self.unlexer.I() - current += self.unlexer.A() - current += self.unlexer.L() - current += self.unlexer.I() - current += self.unlexer.Z() - current += self.unlexer.E() - current += self.unlexer.D() - return current - 
MATERIALIZED.min_depth = 1 - - @depthcontrol - def MINUTE(self): - current = self.create_node(UnlexerRule(name='MINUTE')) - current += self.unlexer.M() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.U() - current += self.unlexer.T() - current += self.unlexer.E() - return current - MINUTE.min_depth = 1 - - @depthcontrol - def MONTH(self): - current = self.create_node(UnlexerRule(name='MONTH')) - current += self.unlexer.M() - current += self.unlexer.O() - current += self.unlexer.N() - current += self.unlexer.T() - current += self.unlexer.H() - return current - MONTH.min_depth = 1 - - @depthcontrol - def NAN_SQL(self): - current = self.create_node(UnlexerRule(name='NAN_SQL')) - current += self.unlexer.N() - current += self.unlexer.A() - current += self.unlexer.N() - return current - NAN_SQL.min_depth = 1 - - @depthcontrol - def NOT(self): - current = self.create_node(UnlexerRule(name='NOT')) - current += self.unlexer.N() - current += self.unlexer.O() - current += self.unlexer.T() - return current - NOT.min_depth = 1 - - @depthcontrol - def NULL_SQL(self): - current = self.create_node(UnlexerRule(name='NULL_SQL')) - current += self.unlexer.N() - current += self.unlexer.U() - current += self.unlexer.L() - current += self.unlexer.L() - return current - NULL_SQL.min_depth = 1 - - @depthcontrol - def NULLS(self): - current = self.create_node(UnlexerRule(name='NULLS')) - current += self.unlexer.N() - current += self.unlexer.U() - current += self.unlexer.L() - current += self.unlexer.L() - current += self.unlexer.S() - return current - NULLS.min_depth = 1 - - @depthcontrol - def OFFSET(self): - current = self.create_node(UnlexerRule(name='OFFSET')) - current += self.unlexer.O() - current += self.unlexer.F() - current += self.unlexer.F() - current += self.unlexer.S() - current += self.unlexer.E() - current += self.unlexer.T() - return current - OFFSET.min_depth = 1 - - @depthcontrol - def ON(self): - current = self.create_node(UnlexerRule(name='ON')) - current += self.unlexer.O() - current += self.unlexer.N() - return current - ON.min_depth = 1 - - @depthcontrol - def OR(self): - current = self.create_node(UnlexerRule(name='OR')) - current += self.unlexer.O() - current += self.unlexer.R() - return current - OR.min_depth = 1 - - @depthcontrol - def ORDER(self): - current = self.create_node(UnlexerRule(name='ORDER')) - current += self.unlexer.O() - current += self.unlexer.R() - current += self.unlexer.D() - current += self.unlexer.E() - current += self.unlexer.R() - return current - ORDER.min_depth = 1 - - @depthcontrol - def OUTER(self): - current = self.create_node(UnlexerRule(name='OUTER')) - current += self.unlexer.O() - current += self.unlexer.U() - current += self.unlexer.T() - current += self.unlexer.E() - current += self.unlexer.R() - return current - OUTER.min_depth = 1 - - @depthcontrol - def OUTFILE(self): - current = self.create_node(UnlexerRule(name='OUTFILE')) - current += self.unlexer.O() - current += self.unlexer.U() - current += self.unlexer.T() - current += self.unlexer.F() - current += self.unlexer.I() - current += self.unlexer.L() - current += self.unlexer.E() - return current - OUTFILE.min_depth = 1 - - @depthcontrol - def PARTITION(self): - current = self.create_node(UnlexerRule(name='PARTITION')) - current += self.unlexer.P() - current += self.unlexer.A() - current += self.unlexer.R() - current += self.unlexer.T() - current += self.unlexer.I() - current += self.unlexer.T() - current += self.unlexer.I() - current += self.unlexer.O() - current 
+= self.unlexer.N() - return current - PARTITION.min_depth = 1 - - @depthcontrol - def PREWHERE(self): - current = self.create_node(UnlexerRule(name='PREWHERE')) - current += self.unlexer.P() - current += self.unlexer.R() - current += self.unlexer.E() - current += self.unlexer.W() - current += self.unlexer.H() - current += self.unlexer.E() - current += self.unlexer.R() - current += self.unlexer.E() - return current - PREWHERE.min_depth = 1 - - @depthcontrol - def PRIMARY(self): - current = self.create_node(UnlexerRule(name='PRIMARY')) - current += self.unlexer.P() - current += self.unlexer.R() - current += self.unlexer.I() - current += self.unlexer.M() - current += self.unlexer.A() - current += self.unlexer.R() - current += self.unlexer.Y() - return current - PRIMARY.min_depth = 1 - - @depthcontrol - def QUARTER(self): - current = self.create_node(UnlexerRule(name='QUARTER')) - current += self.unlexer.Q() - current += self.unlexer.U() - current += self.unlexer.A() - current += self.unlexer.R() - current += self.unlexer.T() - current += self.unlexer.E() - current += self.unlexer.R() - return current - QUARTER.min_depth = 1 - - @depthcontrol - def RIGHT(self): - current = self.create_node(UnlexerRule(name='RIGHT')) - current += self.unlexer.R() - current += self.unlexer.I() - current += self.unlexer.G() - current += self.unlexer.H() - current += self.unlexer.T() - return current - RIGHT.min_depth = 1 - - @depthcontrol - def SAMPLE(self): - current = self.create_node(UnlexerRule(name='SAMPLE')) - current += self.unlexer.S() - current += self.unlexer.A() - current += self.unlexer.M() - current += self.unlexer.P() - current += self.unlexer.L() - current += self.unlexer.E() - return current - SAMPLE.min_depth = 1 - - @depthcontrol - def SECOND(self): - current = self.create_node(UnlexerRule(name='SECOND')) - current += self.unlexer.S() - current += self.unlexer.E() - current += self.unlexer.C() - current += self.unlexer.O() - current += self.unlexer.N() - current += self.unlexer.D() - return current - SECOND.min_depth = 1 - - @depthcontrol - def SELECT(self): - current = self.create_node(UnlexerRule(name='SELECT')) - current += self.unlexer.S() - current += self.unlexer.E() - current += self.unlexer.L() - current += self.unlexer.E() - current += self.unlexer.C() - current += self.unlexer.T() - return current - SELECT.min_depth = 1 - - @depthcontrol - def SEMI(self): - current = self.create_node(UnlexerRule(name='SEMI')) - current += self.unlexer.S() - current += self.unlexer.E() - current += self.unlexer.M() - current += self.unlexer.I() - return current - SEMI.min_depth = 1 - - @depthcontrol - def SET(self): - current = self.create_node(UnlexerRule(name='SET')) - current += self.unlexer.S() - current += self.unlexer.E() - current += self.unlexer.T() - return current - SET.min_depth = 1 - - @depthcontrol - def SETTINGS(self): - current = self.create_node(UnlexerRule(name='SETTINGS')) - current += self.unlexer.S() - current += self.unlexer.E() - current += self.unlexer.T() - current += self.unlexer.T() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.G() - current += self.unlexer.S() - return current - SETTINGS.min_depth = 1 - - @depthcontrol - def TABLE(self): - current = self.create_node(UnlexerRule(name='TABLE')) - current += self.unlexer.T() - current += self.unlexer.A() - current += self.unlexer.B() - current += self.unlexer.L() - current += self.unlexer.E() - return current - TABLE.min_depth = 1 - - @depthcontrol - def TEMPORARY(self): - current = 
self.create_node(UnlexerRule(name='TEMPORARY')) - current += self.unlexer.T() - current += self.unlexer.E() - current += self.unlexer.M() - current += self.unlexer.P() - current += self.unlexer.O() - current += self.unlexer.R() - current += self.unlexer.A() - current += self.unlexer.R() - current += self.unlexer.Y() - return current - TEMPORARY.min_depth = 1 - - @depthcontrol - def THEN(self): - current = self.create_node(UnlexerRule(name='THEN')) - current += self.unlexer.T() - current += self.unlexer.H() - current += self.unlexer.E() - current += self.unlexer.N() - return current - THEN.min_depth = 1 - - @depthcontrol - def TO(self): - current = self.create_node(UnlexerRule(name='TO')) - current += self.unlexer.T() - current += self.unlexer.O() - return current - TO.min_depth = 1 - - @depthcontrol - def TOTALS(self): - current = self.create_node(UnlexerRule(name='TOTALS')) - current += self.unlexer.T() - current += self.unlexer.O() - current += self.unlexer.T() - current += self.unlexer.A() - current += self.unlexer.L() - current += self.unlexer.S() - return current - TOTALS.min_depth = 1 - - @depthcontrol - def TRAILING(self): - current = self.create_node(UnlexerRule(name='TRAILING')) - current += self.unlexer.T() - current += self.unlexer.R() - current += self.unlexer.A() - current += self.unlexer.I() - current += self.unlexer.L() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.G() - return current - TRAILING.min_depth = 1 - - @depthcontrol - def TRIM(self): - current = self.create_node(UnlexerRule(name='TRIM')) - current += self.unlexer.T() - current += self.unlexer.R() - current += self.unlexer.I() - current += self.unlexer.M() - return current - TRIM.min_depth = 1 - - @depthcontrol - def TTL(self): - current = self.create_node(UnlexerRule(name='TTL')) - current += self.unlexer.T() - current += self.unlexer.T() - current += self.unlexer.L() - return current - TTL.min_depth = 1 - - @depthcontrol - def UNION(self): - current = self.create_node(UnlexerRule(name='UNION')) - current += self.unlexer.U() - current += self.unlexer.N() - current += self.unlexer.I() - current += self.unlexer.O() - current += self.unlexer.N() - return current - UNION.min_depth = 1 - - @depthcontrol - def USING(self): - current = self.create_node(UnlexerRule(name='USING')) - current += self.unlexer.U() - current += self.unlexer.S() - current += self.unlexer.I() - current += self.unlexer.N() - current += self.unlexer.G() - return current - USING.min_depth = 1 - - @depthcontrol - def VALUES(self): - current = self.create_node(UnlexerRule(name='VALUES')) - current += self.unlexer.V() - current += self.unlexer.A() - current += self.unlexer.L() - current += self.unlexer.U() - current += self.unlexer.E() - current += self.unlexer.S() - return current - VALUES.min_depth = 1 - - @depthcontrol - def VOLUME(self): - current = self.create_node(UnlexerRule(name='VOLUME')) - current += self.unlexer.V() - current += self.unlexer.O() - current += self.unlexer.L() - current += self.unlexer.U() - current += self.unlexer.M() - current += self.unlexer.E() - return current - VOLUME.min_depth = 1 - - @depthcontrol - def WEEK(self): - current = self.create_node(UnlexerRule(name='WEEK')) - current += self.unlexer.W() - current += self.unlexer.E() - current += self.unlexer.E() - current += self.unlexer.K() - return current - WEEK.min_depth = 1 - - @depthcontrol - def WHEN(self): - current = self.create_node(UnlexerRule(name='WHEN')) - current += self.unlexer.W() - current += self.unlexer.H() - 
current += self.unlexer.E() - current += self.unlexer.N() - return current - WHEN.min_depth = 1 - - @depthcontrol - def WHERE(self): - current = self.create_node(UnlexerRule(name='WHERE')) - current += self.unlexer.W() - current += self.unlexer.H() - current += self.unlexer.E() - current += self.unlexer.R() - current += self.unlexer.E() - return current - WHERE.min_depth = 1 - - @depthcontrol - def WITH(self): - current = self.create_node(UnlexerRule(name='WITH')) - current += self.unlexer.W() - current += self.unlexer.I() - current += self.unlexer.T() - current += self.unlexer.H() - return current - WITH.min_depth = 1 - - @depthcontrol - def YEAR(self): - current = self.create_node(UnlexerRule(name='YEAR')) - current += self.unlexer.Y() - current += self.unlexer.E() - current += self.unlexer.A() - current += self.unlexer.R() - return current - YEAR.min_depth = 1 - - @depthcontrol - def IDENTIFIER(self): - current = self.create_node(UnlexerRule(name='IDENTIFIER')) - choice = self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_18', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_18', choice)] = self.unlexer.weights.get(('alt_18', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.LETTER() - elif choice == 1: - current += self.unlexer.UNDERSCORE() - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_more(): - choice = self.choice([0 if [1, 1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_22', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_22', choice)] = self.unlexer.weights.get(('alt_22', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.LETTER() - elif choice == 1: - current += self.unlexer.UNDERSCORE() - elif choice == 2: - current += self.unlexer.DEC_DIGIT() - - return current - IDENTIFIER.min_depth = 1 - - @depthcontrol - def FLOATING_LITERAL(self): - current = self.create_node(UnlexerRule(name='FLOATING_LITERAL')) - choice = self.choice([0 if [2, 2, 2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_26', i), 1) for i, w in enumerate([1, 1, 1, 1])]) - self.unlexer.weights[('alt_26', choice)] = self.unlexer.weights.get(('alt_26', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.HEXADECIMAL_LITERAL() - current += self.unlexer.DOT() - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_more(): - current += self.unlexer.HEX_DIGIT() - - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - choice = self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_33', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_33', choice)] = self.unlexer.weights.get(('alt_33', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.P() - elif choice == 1: - current += self.unlexer.E() - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - choice = self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_37', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_37', choice)] = self.unlexer.weights.get(('alt_37', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.PLUS() - elif choice == 1: - current += self.unlexer.DASH() - - if self.unlexer.max_depth >= 0: - for _ in self.one_or_more(): - current += self.unlexer.DEC_DIGIT() - - - elif choice == 1: - current += self.unlexer.HEXADECIMAL_LITERAL() - choice = 
self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_40', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_40', choice)] = self.unlexer.weights.get(('alt_40', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.P() - elif choice == 1: - current += self.unlexer.E() - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - choice = self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_44', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_44', choice)] = self.unlexer.weights.get(('alt_44', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.PLUS() - elif choice == 1: - current += self.unlexer.DASH() - - if self.unlexer.max_depth >= 0: - for _ in self.one_or_more(): - current += self.unlexer.DEC_DIGIT() - - elif choice == 2: - current += self.unlexer.INTEGER_LITERAL() - current += self.unlexer.DOT() - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_more(): - current += self.unlexer.DEC_DIGIT() - - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - current += self.unlexer.E() - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - choice = self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_50', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_50', choice)] = self.unlexer.weights.get(('alt_50', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.PLUS() - elif choice == 1: - current += self.unlexer.DASH() - - if self.unlexer.max_depth >= 0: - for _ in self.one_or_more(): - current += self.unlexer.DEC_DIGIT() - - - elif choice == 3: - current += self.unlexer.INTEGER_LITERAL() - current += self.unlexer.E() - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - choice = self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_54', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_54', choice)] = self.unlexer.weights.get(('alt_54', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.PLUS() - elif choice == 1: - current += self.unlexer.DASH() - - if self.unlexer.max_depth >= 0: - for _ in self.one_or_more(): - current += self.unlexer.DEC_DIGIT() - - return current - FLOATING_LITERAL.min_depth = 2 - - @depthcontrol - def HEXADECIMAL_LITERAL(self): - current = self.create_node(UnlexerRule(name='HEXADECIMAL_LITERAL')) - current += self.create_node(UnlexerRule(src='0')) - current += self.unlexer.X() - if self.unlexer.max_depth >= 0: - for _ in self.one_or_more(): - current += self.unlexer.HEX_DIGIT() - - return current - HEXADECIMAL_LITERAL.min_depth = 1 - - @depthcontrol - def INTEGER_LITERAL(self): - current = self.create_node(UnlexerRule(name='INTEGER_LITERAL')) - if self.unlexer.max_depth >= 0: - for _ in self.one_or_more(): - current += self.unlexer.DEC_DIGIT() - - return current - INTEGER_LITERAL.min_depth = 1 - - @depthcontrol - def STRING_LITERAL(self): - current = self.create_node(UnlexerRule(name='STRING_LITERAL')) - current += self.unlexer.QUOTE_SINGLE() - if self.unlexer.max_depth >= 0: - for _ in self.zero_or_more(): - choice = self.choice([0 if [0, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_59', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_59', choice)] = self.unlexer.weights.get(('alt_59', choice), 1) * self.unlexer.cooldown - if choice == 0: - 
current += UnlexerRule(src=self.char_from_list(charset_0)) - elif choice == 1: - current += self.unlexer.BACKSLASH() - current += UnlexerRule(src=self.any_char()) - - current += self.unlexer.QUOTE_SINGLE() - return current - STRING_LITERAL.min_depth = 1 - - @depthcontrol - def A(self): - current = self.create_node(UnlexerRule(name='A')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_1))) - return current - A.min_depth = 0 - - @depthcontrol - def B(self): - current = self.create_node(UnlexerRule(name='B')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_2))) - return current - B.min_depth = 0 - - @depthcontrol - def C(self): - current = self.create_node(UnlexerRule(name='C')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_3))) - return current - C.min_depth = 0 - - @depthcontrol - def D(self): - current = self.create_node(UnlexerRule(name='D')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_4))) - return current - D.min_depth = 0 - - @depthcontrol - def E(self): - current = self.create_node(UnlexerRule(name='E')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_5))) - return current - E.min_depth = 0 - - @depthcontrol - def F(self): - current = self.create_node(UnlexerRule(name='F')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_6))) - return current - F.min_depth = 0 - - @depthcontrol - def G(self): - current = self.create_node(UnlexerRule(name='G')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_7))) - return current - G.min_depth = 0 - - @depthcontrol - def H(self): - current = self.create_node(UnlexerRule(name='H')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_8))) - return current - H.min_depth = 0 - - @depthcontrol - def I(self): - current = self.create_node(UnlexerRule(name='I')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_9))) - return current - I.min_depth = 0 - - @depthcontrol - def J(self): - current = self.create_node(UnlexerRule(name='J')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_10))) - return current - J.min_depth = 0 - - @depthcontrol - def K(self): - current = self.create_node(UnlexerRule(name='K')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_11))) - return current - K.min_depth = 0 - - @depthcontrol - def L(self): - current = self.create_node(UnlexerRule(name='L')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_12))) - return current - L.min_depth = 0 - - @depthcontrol - def M(self): - current = self.create_node(UnlexerRule(name='M')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_13))) - return current - M.min_depth = 0 - - @depthcontrol - def N(self): - current = self.create_node(UnlexerRule(name='N')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_14))) - return current - N.min_depth = 0 - - @depthcontrol - def O(self): - current = self.create_node(UnlexerRule(name='O')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_15))) - return current - O.min_depth = 0 - - @depthcontrol - def P(self): - current = self.create_node(UnlexerRule(name='P')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_16))) - return current - P.min_depth = 0 - - @depthcontrol - def Q(self): - current = self.create_node(UnlexerRule(name='Q')) - current += 
self.create_node(UnlexerRule(src=self.char_from_list(charset_17))) - return current - Q.min_depth = 0 - - @depthcontrol - def R(self): - current = self.create_node(UnlexerRule(name='R')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_18))) - return current - R.min_depth = 0 - - @depthcontrol - def S(self): - current = self.create_node(UnlexerRule(name='S')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_19))) - return current - S.min_depth = 0 - - @depthcontrol - def T(self): - current = self.create_node(UnlexerRule(name='T')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_20))) - return current - T.min_depth = 0 - - @depthcontrol - def U(self): - current = self.create_node(UnlexerRule(name='U')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_21))) - return current - U.min_depth = 0 - - @depthcontrol - def V(self): - current = self.create_node(UnlexerRule(name='V')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_22))) - return current - V.min_depth = 0 - - @depthcontrol - def W(self): - current = self.create_node(UnlexerRule(name='W')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_23))) - return current - W.min_depth = 0 - - @depthcontrol - def X(self): - current = self.create_node(UnlexerRule(name='X')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_24))) - return current - X.min_depth = 0 - - @depthcontrol - def Y(self): - current = self.create_node(UnlexerRule(name='Y')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_25))) - return current - Y.min_depth = 0 - - @depthcontrol - def Z(self): - current = self.create_node(UnlexerRule(name='Z')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_26))) - return current - Z.min_depth = 0 - - @depthcontrol - def LETTER(self): - current = self.create_node(UnlexerRule(name='LETTER')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_27))) - return current - LETTER.min_depth = 0 - - @depthcontrol - def DEC_DIGIT(self): - current = self.create_node(UnlexerRule(name='DEC_DIGIT')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_28))) - return current - DEC_DIGIT.min_depth = 0 - - @depthcontrol - def HEX_DIGIT(self): - current = self.create_node(UnlexerRule(name='HEX_DIGIT')) - current += self.create_node(UnlexerRule(src=self.char_from_list(charset_29))) - return current - HEX_DIGIT.min_depth = 0 - - @depthcontrol - def ARROW(self): - current = self.create_node(UnlexerRule(name='ARROW')) - current += self.create_node(UnlexerRule(src='->')) - return current - ARROW.min_depth = 0 - - @depthcontrol - def ASTERISK(self): - current = self.create_node(UnlexerRule(name='ASTERISK')) - current += self.create_node(UnlexerRule(src='*')) - return current - ASTERISK.min_depth = 0 - - @depthcontrol - def BACKQUOTE(self): - current = self.create_node(UnlexerRule(name='BACKQUOTE')) - current += self.create_node(UnlexerRule(src='`')) - return current - BACKQUOTE.min_depth = 0 - - @depthcontrol - def BACKSLASH(self): - current = self.create_node(UnlexerRule(name='BACKSLASH')) - current += self.create_node(UnlexerRule(src='\\')) - return current - BACKSLASH.min_depth = 0 - - @depthcontrol - def COLON(self): - current = self.create_node(UnlexerRule(name='COLON')) - current += self.create_node(UnlexerRule(src=':')) - return current - COLON.min_depth = 0 - - @depthcontrol - def 
COMMA(self): - current = self.create_node(UnlexerRule(name='COMMA')) - current += self.create_node(UnlexerRule(src=',')) - return current - COMMA.min_depth = 0 - - @depthcontrol - def CONCAT(self): - current = self.create_node(UnlexerRule(name='CONCAT')) - current += self.create_node(UnlexerRule(src='||')) - return current - CONCAT.min_depth = 0 - - @depthcontrol - def DASH(self): - current = self.create_node(UnlexerRule(name='DASH')) - current += self.create_node(UnlexerRule(src='-')) - return current - DASH.min_depth = 0 - - @depthcontrol - def DOT(self): - current = self.create_node(UnlexerRule(name='DOT')) - current += self.create_node(UnlexerRule(src='.')) - return current - DOT.min_depth = 0 - - @depthcontrol - def EQ_DOUBLE(self): - current = self.create_node(UnlexerRule(name='EQ_DOUBLE')) - current += self.create_node(UnlexerRule(src='==')) - return current - EQ_DOUBLE.min_depth = 0 - - @depthcontrol - def EQ_SINGLE(self): - current = self.create_node(UnlexerRule(name='EQ_SINGLE')) - current += self.create_node(UnlexerRule(src='=')) - return current - EQ_SINGLE.min_depth = 0 - - @depthcontrol - def GE(self): - current = self.create_node(UnlexerRule(name='GE')) - current += self.create_node(UnlexerRule(src='>=')) - return current - GE.min_depth = 0 - - @depthcontrol - def GT(self): - current = self.create_node(UnlexerRule(name='GT')) - current += self.create_node(UnlexerRule(src='>')) - return current - GT.min_depth = 0 - - @depthcontrol - def LBRACKET(self): - current = self.create_node(UnlexerRule(name='LBRACKET')) - current += self.create_node(UnlexerRule(src='[')) - return current - LBRACKET.min_depth = 0 - - @depthcontrol - def LE(self): - current = self.create_node(UnlexerRule(name='LE')) - current += self.create_node(UnlexerRule(src='<=')) - return current - LE.min_depth = 0 - - @depthcontrol - def LPAREN(self): - current = self.create_node(UnlexerRule(name='LPAREN')) - current += self.create_node(UnlexerRule(src='(')) - return current - LPAREN.min_depth = 0 - - @depthcontrol - def LT(self): - current = self.create_node(UnlexerRule(name='LT')) - current += self.create_node(UnlexerRule(src='<')) - return current - LT.min_depth = 0 - - @depthcontrol - def NOT_EQ(self): - current = self.create_node(UnlexerRule(name='NOT_EQ')) - choice = self.choice([0 if [0, 0][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_79', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_79', choice)] = self.unlexer.weights.get(('alt_79', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.create_node(UnlexerRule(src='!=')) - elif choice == 1: - current += self.create_node(UnlexerRule(src='<>')) - return current - NOT_EQ.min_depth = 0 - - @depthcontrol - def PERCENT(self): - current = self.create_node(UnlexerRule(name='PERCENT')) - current += self.create_node(UnlexerRule(src='%')) - return current - PERCENT.min_depth = 0 - - @depthcontrol - def PLUS(self): - current = self.create_node(UnlexerRule(name='PLUS')) - current += self.create_node(UnlexerRule(src='+')) - return current - PLUS.min_depth = 0 - - @depthcontrol - def QUERY(self): - current = self.create_node(UnlexerRule(name='QUERY')) - current += self.create_node(UnlexerRule(src='?')) - return current - QUERY.min_depth = 0 - - @depthcontrol - def QUOTE_SINGLE(self): - current = self.create_node(UnlexerRule(name='QUOTE_SINGLE')) - current += self.create_node(UnlexerRule(src='\'')) - return current - QUOTE_SINGLE.min_depth = 0 - - @depthcontrol - def RBRACKET(self): - current = 
self.create_node(UnlexerRule(name='RBRACKET'))
-        current += self.create_node(UnlexerRule(src=']'))
-        return current
-    RBRACKET.min_depth = 0
-
-    @depthcontrol
-    def RPAREN(self):
-        current = self.create_node(UnlexerRule(name='RPAREN'))
-        current += self.create_node(UnlexerRule(src=')'))
-        return current
-    RPAREN.min_depth = 0
-
-    @depthcontrol
-    def SEMICOLON(self):
-        current = self.create_node(UnlexerRule(name='SEMICOLON'))
-        current += self.create_node(UnlexerRule(src=';'))
-        return current
-    SEMICOLON.min_depth = 0
-
-    @depthcontrol
-    def SLASH(self):
-        current = self.create_node(UnlexerRule(name='SLASH'))
-        current += self.create_node(UnlexerRule(src='/'))
-        return current
-    SLASH.min_depth = 0
-
-    @depthcontrol
-    def UNDERSCORE(self):
-        current = self.create_node(UnlexerRule(name='UNDERSCORE'))
-        current += self.create_node(UnlexerRule(src='_'))
-        return current
-    UNDERSCORE.min_depth = 0
-
-    @depthcontrol
-    def SINGLE_LINE_COMMENT(self):
-        current = self.create_node(UnlexerRule(name='SINGLE_LINE_COMMENT'))
-        current += self.create_node(UnlexerRule(src='--'))
-        if self.unlexer.max_depth >= 0:
-            for _ in self.zero_or_more():
-                current += UnlexerRule(src=self.char_from_list(charset_30))
-
-        choice = self.choice([0 if [0, 0, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_95', i), 1) for i, w in enumerate([1, 1, 1])])
-        self.unlexer.weights[('alt_95', choice)] = self.unlexer.weights.get(('alt_95', choice), 1) * self.unlexer.cooldown
-        if choice == 0:
-            current += self.create_node(UnlexerRule(src='\n'))
-        elif choice == 1:
-            current += self.create_node(UnlexerRule(src='\r'))
-        elif choice == 2:
-            current += self.unlexer.EOF()
-        return current
-    SINGLE_LINE_COMMENT.min_depth = 0
-
-    @depthcontrol
-    def MULTI_LINE_COMMENT(self):
-        current = self.create_node(UnlexerRule(name='MULTI_LINE_COMMENT'))
-        current += self.create_node(UnlexerRule(src='/*'))
-        if self.unlexer.max_depth >= 0:
-            for _ in self.zero_or_more():
-                current += UnlexerRule(src=self.any_char())
-
-        current += self.create_node(UnlexerRule(src='*/'))
-        return current
-    MULTI_LINE_COMMENT.min_depth = 0
-
-    @depthcontrol
-    def WHITESPACE(self):
-        current = self.create_node(UnlexerRule(name='WHITESPACE'))
-        current += self.create_node(UnlexerRule(src=self.char_from_list(charset_31)))
-        return current
-    WHITESPACE.min_depth = 0
-
diff --git a/utils/grammar-fuzzer/ClickHouseUnparser.py b/utils/grammar-fuzzer/ClickHouseUnparser.py
deleted file mode 100644
index 7fa5eb96d31..00000000000
--- a/utils/grammar-fuzzer/ClickHouseUnparser.py
+++ /dev/null
@@ -1,1815 +0,0 @@
-# Generated by Grammarinator 19.3
-
-from itertools import chain
-from grammarinator.runtime import *
-
-import ClickHouseUnlexer
-
-
-class ClickHouseUnparser(Grammarinator):
-
-    def __init__(self, unlexer):
-        super(ClickHouseUnparser, self).__init__()
-        self.unlexer = unlexer
-    @depthcontrol
-    def queryList(self):
-        current = self.create_node(UnparserRule(name='queryList'))
-        current += self.queryStmt()
-        if self.unlexer.max_depth >= 8:
-            for _ in self.zero_or_more():
-                current += self.unlexer.SEMICOLON()
-                current += self.queryStmt()
-
-        if self.unlexer.max_depth >= 1:
-            for _ in self.zero_or_one():
-                current += self.unlexer.SEMICOLON()
-
-        current += self.unlexer.EOF()
-        return current
-    queryList.min_depth = 8
-
-    @depthcontrol
-    def queryStmt(self):
-        current = self.create_node(UnparserRule(name='queryStmt'))
-        current += self.query()
-        if self.unlexer.max_depth >= 2:
-            for _ in self.zero_or_one():
-                current += self.unlexer.INTO()
-                current += 
self.unlexer.OUTFILE() - current += self.unlexer.STRING_LITERAL() - - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_one(): - current += self.unlexer.FORMAT() - current += self.identifier() - - return current - queryStmt.min_depth = 7 - - @depthcontrol - def query(self): - current = self.create_node(UnparserRule(name='query')) - choice = self.choice([0 if [6, 7, 6, 6][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_108', i), 1) for i, w in enumerate([1, 1, 1, 1])]) - self.unlexer.weights[('alt_108', choice)] = self.unlexer.weights.get(('alt_108', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.distributedStmt() - elif choice == 1: - current += self.insertStmt() - elif choice == 2: - current += self.selectUnionStmt() - elif choice == 3: - current += self.setStmt() - return current - query.min_depth = 6 - - @depthcontrol - def distributedStmt(self): - current = self.create_node(UnparserRule(name='distributedStmt')) - choice = self.choice([0 if [5, 6, 6][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_113', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_113', choice)] = self.unlexer.weights.get(('alt_113', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.createDatabaseStmt() - elif choice == 1: - current += self.createTableStmt() - elif choice == 2: - current += self.dropStmt() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_one(): - current += self.unlexer.ON() - current += self.unlexer.CLUSTER() - current += self.identifier() - - return current - distributedStmt.min_depth = 5 - - @depthcontrol - def createDatabaseStmt(self): - current = self.create_node(UnparserRule(name='createDatabaseStmt')) - current += self.unlexer.CREATE() - current += self.unlexer.DATABASE() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.IF() - current += self.unlexer.NOT() - current += self.unlexer.EXISTS() - - current += self.databaseIdentifier() - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.engineExpr() - - return current - createDatabaseStmt.min_depth = 4 - - @depthcontrol - def createTableStmt(self): - current = self.create_node(UnparserRule(name='createTableStmt')) - current += self.unlexer.CREATE() - current += self.unlexer.TABLE() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.IF() - current += self.unlexer.NOT() - current += self.unlexer.EXISTS() - - current += self.tableIdentifier() - current += self.schemaClause() - return current - createTableStmt.min_depth = 5 - - @depthcontrol - def schemaClause(self): - current = self.create_node(UnparserRule(name='schemaClause')) - choice = self.choice([0 if [8, 7, 5, 4][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_121', i), 1) for i, w in enumerate([1, 1, 1, 1])]) - self.unlexer.weights[('alt_121', choice)] = self.unlexer.weights.get(('alt_121', choice), 1) * self.unlexer.cooldown - if choice == 0: - current = self.schemaClause_SchemaDescriptionClause() - elif choice == 1: - current = self.schemaClause_SchemaAsSubqueryClause() - elif choice == 2: - current = self.schemaClause_SchemaAsTableClause() - elif choice == 3: - current = self.schemaClause_SchemaAsFunctionClause() - return current - schemaClause.min_depth = 4 - - @depthcontrol - def schemaClause_SchemaDescriptionClause(self): - current = self.create_node(UnparserRule(name='schemaClause_SchemaDescriptionClause')) - current += 
self.unlexer.LPAREN() - current += self.tableElementExpr() - if self.unlexer.max_depth >= 7: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.tableElementExpr() - - current += self.unlexer.RPAREN() - current += self.engineClause() - return current - schemaClause_SchemaDescriptionClause.min_depth = 7 - - @depthcontrol - def schemaClause_SchemaAsSubqueryClause(self): - current = self.create_node(UnparserRule(name='schemaClause_SchemaAsSubqueryClause')) - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += self.engineClause() - - current += self.unlexer.AS() - current += self.selectUnionStmt() - return current - schemaClause_SchemaAsSubqueryClause.min_depth = 6 - - @depthcontrol - def schemaClause_SchemaAsTableClause(self): - current = self.create_node(UnparserRule(name='schemaClause_SchemaAsTableClause')) - current += self.unlexer.AS() - current += self.tableIdentifier() - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += self.engineClause() - - return current - schemaClause_SchemaAsTableClause.min_depth = 4 - - @depthcontrol - def schemaClause_SchemaAsFunctionClause(self): - current = self.create_node(UnparserRule(name='schemaClause_SchemaAsFunctionClause')) - current += self.unlexer.AS() - current += self.identifier() - current += self.unlexer.LPAREN() - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += self.tableArgList() - - current += self.unlexer.RPAREN() - return current - schemaClause_SchemaAsFunctionClause.min_depth = 3 - - @depthcontrol - def engineClause(self): - current = self.create_node(UnparserRule(name='engineClause')) - current += self.engineExpr() - if self.unlexer.max_depth >= 6: - for _ in self.zero_or_one(): - current += self.orderByClause() - - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.partitionByClause() - - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.primaryKeyClause() - - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.sampleByClause() - - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += self.ttlClause() - - if self.unlexer.max_depth >= 6: - for _ in self.zero_or_one(): - current += self.settingsClause() - - return current - engineClause.min_depth = 4 - - @depthcontrol - def partitionByClause(self): - current = self.create_node(UnparserRule(name='partitionByClause')) - current += self.unlexer.PARTITION() - current += self.unlexer.BY() - current += self.columnExpr() - return current - partitionByClause.min_depth = 3 - - @depthcontrol - def primaryKeyClause(self): - current = self.create_node(UnparserRule(name='primaryKeyClause')) - current += self.unlexer.PRIMARY() - current += self.unlexer.KEY() - current += self.columnExpr() - return current - primaryKeyClause.min_depth = 3 - - @depthcontrol - def sampleByClause(self): - current = self.create_node(UnparserRule(name='sampleByClause')) - current += self.unlexer.SAMPLE() - current += self.unlexer.BY() - current += self.columnExpr() - return current - sampleByClause.min_depth = 3 - - @depthcontrol - def ttlClause(self): - current = self.create_node(UnparserRule(name='ttlClause')) - current += self.unlexer.TTL() - current += self.ttlExpr() - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.ttlExpr() - - return current - ttlClause.min_depth = 4 - - @depthcontrol - def engineExpr(self): - current = 
self.create_node(UnparserRule(name='engineExpr')) - current += self.unlexer.ENGINE() - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - current += self.unlexer.EQ_SINGLE() - - current += self.identifier() - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - current += self.unlexer.LPAREN() - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += self.tableArgList() - - current += self.unlexer.RPAREN() - - return current - engineExpr.min_depth = 3 - - @depthcontrol - def tableElementExpr(self): - current = self.create_node(UnparserRule(name='tableElementExpr')) - current = self.tableElementExpr_TableElementColumn() - return current - tableElementExpr.min_depth = 6 - - @depthcontrol - def tableElementExpr_TableElementColumn(self): - current = self.create_node(UnparserRule(name='tableElementExpr_TableElementColumn')) - current += self.identifier() - current += self.columnTypeExpr() - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.tableColumnPropertyExpr() - - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_one(): - current += self.unlexer.TTL() - current += self.columnExpr() - - return current - tableElementExpr_TableElementColumn.min_depth = 5 - - @depthcontrol - def tableColumnPropertyExpr(self): - current = self.create_node(UnparserRule(name='tableColumnPropertyExpr')) - choice = self.choice([0 if [2, 2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_142', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_142', choice)] = self.unlexer.weights.get(('alt_142', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.DEFAULT() - elif choice == 1: - current += self.unlexer.MATERIALIZED() - elif choice == 2: - current += self.unlexer.ALIAS() - current += self.columnExpr() - return current - tableColumnPropertyExpr.min_depth = 3 - - @depthcontrol - def ttlExpr(self): - current = self.create_node(UnparserRule(name='ttlExpr')) - current += self.columnExpr() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - choice = self.choice([0 if [2, 2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_147', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_147', choice)] = self.unlexer.weights.get(('alt_147', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.DELETE() - elif choice == 1: - current += self.unlexer.TO() - current += self.unlexer.DISK() - current += self.unlexer.STRING_LITERAL() - elif choice == 2: - current += self.unlexer.TO() - current += self.unlexer.VOLUME() - current += self.unlexer.STRING_LITERAL() - - return current - ttlExpr.min_depth = 3 - - @depthcontrol - def dropStmt(self): - current = self.create_node(UnparserRule(name='dropStmt')) - choice = self.choice([0 if [5, 5][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_151', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_151', choice)] = self.unlexer.weights.get(('alt_151', choice), 1) * self.unlexer.cooldown - if choice == 0: - current = self.dropStmt_DropDatabaseStmt() - elif choice == 1: - current = self.dropStmt_DropTableStmt() - return current - dropStmt.min_depth = 5 - - @depthcontrol - def dropStmt_DropDatabaseStmt(self): - current = self.create_node(UnparserRule(name='dropStmt_DropDatabaseStmt')) - current += self.unlexer.DROP() - current += self.unlexer.DATABASE() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += 
self.unlexer.IF() - current += self.unlexer.EXISTS() - - current += self.databaseIdentifier() - return current - dropStmt_DropDatabaseStmt.min_depth = 4 - - @depthcontrol - def dropStmt_DropTableStmt(self): - current = self.create_node(UnparserRule(name='dropStmt_DropTableStmt')) - current += self.unlexer.DROP() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.TEMPORARY() - - current += self.unlexer.TABLE() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.IF() - current += self.unlexer.EXISTS() - - current += self.tableIdentifier() - return current - dropStmt_DropTableStmt.min_depth = 4 - - @depthcontrol - def insertStmt(self): - current = self.create_node(UnparserRule(name='insertStmt')) - current += self.unlexer.INSERT() - current += self.unlexer.INTO() - current += self.tableIdentifier() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_one(): - current += self.unlexer.LPAREN() - current += self.identifier() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.identifier() - - current += self.unlexer.RPAREN() - - current += self.valuesClause() - return current - insertStmt.min_depth = 6 - - @depthcontrol - def valuesClause(self): - current = self.create_node(UnparserRule(name='valuesClause')) - choice = self.choice([0 if [5, 6][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_159', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_159', choice)] = self.unlexer.weights.get(('alt_159', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.VALUES() - current += self.valueTupleExpr() - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.valueTupleExpr() - - elif choice == 1: - current += self.selectUnionStmt() - return current - valuesClause.min_depth = 5 - - @depthcontrol - def valueTupleExpr(self): - current = self.create_node(UnparserRule(name='valueTupleExpr')) - current += self.unlexer.LPAREN() - current += self.valueExprList() - current += self.unlexer.RPAREN() - return current - valueTupleExpr.min_depth = 4 - - @depthcontrol - def selectUnionStmt(self): - current = self.create_node(UnparserRule(name='selectUnionStmt')) - current += self.selectStmt() - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_more(): - current += self.unlexer.UNION() - current += self.unlexer.ALL() - current += self.selectStmt() - - return current - selectUnionStmt.min_depth = 5 - - @depthcontrol - def selectStmt(self): - current = self.create_node(UnparserRule(name='selectStmt')) - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += self.withClause() - - current += self.unlexer.SELECT() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.DISTINCT() - - current += self.columnExprList() - if self.unlexer.max_depth >= 8: - for _ in self.zero_or_one(): - current += self.fromClause() - - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.sampleClause() - - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += self.arrayJoinClause() - - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.prewhereClause() - - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.whereClause() - - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += 
self.groupByClause() - - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.havingClause() - - if self.unlexer.max_depth >= 6: - for _ in self.zero_or_one(): - current += self.orderByClause() - - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += self.limitByClause() - - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.limitClause() - - if self.unlexer.max_depth >= 6: - for _ in self.zero_or_one(): - current += self.settingsClause() - - return current - selectStmt.min_depth = 4 - - @depthcontrol - def withClause(self): - current = self.create_node(UnparserRule(name='withClause')) - current += self.unlexer.WITH() - current += self.columnExprList() - return current - withClause.min_depth = 4 - - @depthcontrol - def fromClause(self): - current = self.create_node(UnparserRule(name='fromClause')) - current += self.unlexer.FROM() - current += self.joinExpr() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.FINAL() - - return current - fromClause.min_depth = 7 - - @depthcontrol - def sampleClause(self): - current = self.create_node(UnparserRule(name='sampleClause')) - current += self.unlexer.SAMPLE() - current += self.ratioExpr() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_one(): - current += self.unlexer.OFFSET() - current += self.ratioExpr() - - return current - sampleClause.min_depth = 3 - - @depthcontrol - def arrayJoinClause(self): - current = self.create_node(UnparserRule(name='arrayJoinClause')) - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.LEFT() - - current += self.unlexer.ARRAY() - current += self.unlexer.JOIN() - current += self.columnExprList() - return current - arrayJoinClause.min_depth = 4 - - @depthcontrol - def prewhereClause(self): - current = self.create_node(UnparserRule(name='prewhereClause')) - current += self.unlexer.PREWHERE() - current += self.columnExpr() - return current - prewhereClause.min_depth = 3 - - @depthcontrol - def whereClause(self): - current = self.create_node(UnparserRule(name='whereClause')) - current += self.unlexer.WHERE() - current += self.columnExpr() - return current - whereClause.min_depth = 3 - - @depthcontrol - def groupByClause(self): - current = self.create_node(UnparserRule(name='groupByClause')) - current += self.unlexer.GROUP() - current += self.unlexer.BY() - current += self.columnExprList() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.WITH() - current += self.unlexer.TOTALS() - - return current - groupByClause.min_depth = 4 - - @depthcontrol - def havingClause(self): - current = self.create_node(UnparserRule(name='havingClause')) - current += self.unlexer.HAVING() - current += self.columnExpr() - return current - havingClause.min_depth = 3 - - @depthcontrol - def orderByClause(self): - current = self.create_node(UnparserRule(name='orderByClause')) - current += self.unlexer.ORDER() - current += self.unlexer.BY() - current += self.orderExprList() - return current - orderByClause.min_depth = 5 - - @depthcontrol - def limitByClause(self): - current = self.create_node(UnparserRule(name='limitByClause')) - current += self.unlexer.LIMIT() - current += self.limitExpr() - current += self.unlexer.BY() - current += self.columnExprList() - return current - limitByClause.min_depth = 4 - - @depthcontrol - def limitClause(self): - current = self.create_node(UnparserRule(name='limitClause')) - current += self.unlexer.LIMIT() - 
current += self.limitExpr() - return current - limitClause.min_depth = 3 - - @depthcontrol - def settingsClause(self): - current = self.create_node(UnparserRule(name='settingsClause')) - current += self.unlexer.SETTINGS() - current += self.settingExprList() - return current - settingsClause.min_depth = 5 - - @depthcontrol - def joinExpr(self): - current = self.create_node(UnparserRule(name='joinExpr')) - choice = self.choice([0 if [6, 8, 8, 8][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_181', i), 1) for i, w in enumerate([1, 1, 1, 1])]) - self.unlexer.weights[('alt_181', choice)] = self.unlexer.weights.get(('alt_181', choice), 1) * self.unlexer.cooldown - if choice == 0: - current = self.joinExpr_JoinExprTable() - elif choice == 1: - current = self.joinExpr_JoinExprParens() - elif choice == 2: - current = self.joinExpr_JoinExprOp() - elif choice == 3: - current = self.joinExpr_JoinExprCrossOp() - return current - joinExpr.min_depth = 6 - - @depthcontrol - def joinExpr_JoinExprTable(self): - current = self.create_node(UnparserRule(name='joinExpr_JoinExprTable')) - current += self.tableExpr() - return current - joinExpr_JoinExprTable.min_depth = 5 - - @depthcontrol - def joinExpr_JoinExprParens(self): - current = self.create_node(UnparserRule(name='joinExpr_JoinExprParens')) - current += self.unlexer.LPAREN() - current += self.joinExpr() - current += self.unlexer.RPAREN() - return current - joinExpr_JoinExprParens.min_depth = 7 - - @depthcontrol - def joinExpr_JoinExprOp(self): - current = self.create_node(UnparserRule(name='joinExpr_JoinExprOp')) - current += self.joinExpr() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_187', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_187', choice)] = self.unlexer.weights.get(('alt_187', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.GLOBAL() - elif choice == 1: - current += self.unlexer.LOCAL() - - current += self.joinOp() - current += self.unlexer.JOIN() - current += self.joinExpr() - current += self.joinConstraintClause() - return current - joinExpr_JoinExprOp.min_depth = 7 - - @depthcontrol - def joinExpr_JoinExprCrossOp(self): - current = self.create_node(UnparserRule(name='joinExpr_JoinExprCrossOp')) - current += self.joinExpr() - current += self.joinOpCross() - current += self.joinExpr() - return current - joinExpr_JoinExprCrossOp.min_depth = 7 - - @depthcontrol - def joinOp(self): - current = self.create_node(UnparserRule(name='joinOp')) - choice = self.choice([0 if [3, 3, 3][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_190', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_190', choice)] = self.unlexer.weights.get(('alt_190', choice), 1) * self.unlexer.cooldown - if choice == 0: - current = self.joinOp_JoinOpInner() - elif choice == 1: - current = self.joinOp_JoinOpLeftRight() - elif choice == 2: - current = self.joinOp_JoinOpFull() - return current - joinOp.min_depth = 3 - - @depthcontrol - def joinOp_JoinOpInner(self): - current = self.create_node(UnparserRule(name='joinOp_JoinOpInner')) - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_194', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_194', choice)] = self.unlexer.weights.get(('alt_194', choice), 1) * self.unlexer.cooldown - if choice == 0: - if self.unlexer.max_depth >= 2: 
- for _ in self.zero_or_one(): - current += self.unlexer.ANY() - - current += self.unlexer.INNER() - elif choice == 1: - current += self.unlexer.INNER() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.ANY() - - return current - joinOp_JoinOpInner.min_depth = 2 - - @depthcontrol - def joinOp_JoinOpLeftRight(self): - current = self.create_node(UnparserRule(name='joinOp_JoinOpLeftRight')) - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_199', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_199', choice)] = self.unlexer.weights.get(('alt_199', choice), 1) * self.unlexer.cooldown - if choice == 0: - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - choice = self.choice([0 if [2, 2, 2, 2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_203', i), 1) for i, w in enumerate([1, 1, 1, 1, 1])]) - self.unlexer.weights[('alt_203', choice)] = self.unlexer.weights.get(('alt_203', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.OUTER() - elif choice == 1: - current += self.unlexer.SEMI() - elif choice == 2: - current += self.unlexer.ANTI() - elif choice == 3: - current += self.unlexer.ANY() - elif choice == 4: - current += self.unlexer.ASOF() - - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_209', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_209', choice)] = self.unlexer.weights.get(('alt_209', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.LEFT() - elif choice == 1: - current += self.unlexer.RIGHT() - elif choice == 1: - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_212', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_212', choice)] = self.unlexer.weights.get(('alt_212', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.LEFT() - elif choice == 1: - current += self.unlexer.RIGHT() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - choice = self.choice([0 if [2, 2, 2, 2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_216', i), 1) for i, w in enumerate([1, 1, 1, 1, 1])]) - self.unlexer.weights[('alt_216', choice)] = self.unlexer.weights.get(('alt_216', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.OUTER() - elif choice == 1: - current += self.unlexer.SEMI() - elif choice == 2: - current += self.unlexer.ANTI() - elif choice == 3: - current += self.unlexer.ANY() - elif choice == 4: - current += self.unlexer.ASOF() - - return current - joinOp_JoinOpLeftRight.min_depth = 2 - - @depthcontrol - def joinOp_JoinOpFull(self): - current = self.create_node(UnparserRule(name='joinOp_JoinOpFull')) - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_222', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_222', choice)] = self.unlexer.weights.get(('alt_222', choice), 1) * self.unlexer.cooldown - if choice == 0: - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_226', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_226', choice)] = self.unlexer.weights.get(('alt_226', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += 
self.unlexer.OUTER() - elif choice == 1: - current += self.unlexer.ANY() - - current += self.unlexer.FULL() - elif choice == 1: - current += self.unlexer.FULL() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_230', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_230', choice)] = self.unlexer.weights.get(('alt_230', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.OUTER() - elif choice == 1: - current += self.unlexer.ANY() - - return current - joinOp_JoinOpFull.min_depth = 2 - - @depthcontrol - def joinOpCross(self): - current = self.create_node(UnparserRule(name='joinOpCross')) - choice = self.choice([0 if [2, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_233', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_233', choice)] = self.unlexer.weights.get(('alt_233', choice), 1) * self.unlexer.cooldown - if choice == 0: - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_237', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_237', choice)] = self.unlexer.weights.get(('alt_237', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.GLOBAL() - elif choice == 1: - current += self.unlexer.LOCAL() - - current += self.unlexer.CROSS() - current += self.unlexer.JOIN() - elif choice == 1: - current += self.unlexer.COMMA() - return current - joinOpCross.min_depth = 1 - - @depthcontrol - def joinConstraintClause(self): - current = self.create_node(UnparserRule(name='joinConstraintClause')) - choice = self.choice([0 if [4, 4, 4][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_240', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_240', choice)] = self.unlexer.weights.get(('alt_240', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.ON() - current += self.columnExprList() - elif choice == 1: - current += self.unlexer.USING() - current += self.unlexer.LPAREN() - current += self.columnExprList() - current += self.unlexer.RPAREN() - elif choice == 2: - current += self.unlexer.USING() - current += self.columnExprList() - return current - joinConstraintClause.min_depth = 4 - - @depthcontrol - def limitExpr(self): - current = self.create_node(UnparserRule(name='limitExpr')) - current += self.unlexer.INTEGER_LITERAL() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - choice = self.choice([0 if [1, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_245', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_245', choice)] = self.unlexer.weights.get(('alt_245', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.COMMA() - elif choice == 1: - current += self.unlexer.OFFSET() - current += self.unlexer.INTEGER_LITERAL() - - return current - limitExpr.min_depth = 2 - - @depthcontrol - def orderExprList(self): - current = self.create_node(UnparserRule(name='orderExprList')) - current += self.orderExpr() - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.orderExpr() - - return current - orderExprList.min_depth = 4 - - @depthcontrol - def orderExpr(self): - current = self.create_node(UnparserRule(name='orderExpr')) - current += 
self.columnExpr() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_250', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_250', choice)] = self.unlexer.weights.get(('alt_250', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.ASCENDING() - elif choice == 1: - current += self.unlexer.DESCENDING() - - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.NULLS() - choice = self.choice([0 if [2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_254', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_254', choice)] = self.unlexer.weights.get(('alt_254', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.FIRST() - elif choice == 1: - current += self.unlexer.LAST() - - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.COLLATE() - current += self.unlexer.STRING_LITERAL() - - return current - orderExpr.min_depth = 3 - - @depthcontrol - def ratioExpr(self): - current = self.create_node(UnparserRule(name='ratioExpr')) - current += self.unlexer.INTEGER_LITERAL() - current += self.unlexer.SLASH() - current += self.unlexer.INTEGER_LITERAL() - return current - ratioExpr.min_depth = 2 - - @depthcontrol - def settingExprList(self): - current = self.create_node(UnparserRule(name='settingExprList')) - current += self.settingExpr() - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.settingExpr() - - return current - settingExprList.min_depth = 4 - - @depthcontrol - def settingExpr(self): - current = self.create_node(UnparserRule(name='settingExpr')) - current += self.identifier() - current += self.unlexer.EQ_SINGLE() - current += self.literal() - return current - settingExpr.min_depth = 3 - - @depthcontrol - def setStmt(self): - current = self.create_node(UnparserRule(name='setStmt')) - current += self.unlexer.SET() - current += self.settingExprList() - return current - setStmt.min_depth = 5 - - @depthcontrol - def valueExprList(self): - current = self.create_node(UnparserRule(name='valueExprList')) - current += self.valueExpr() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.valueExpr() - - return current - valueExprList.min_depth = 3 - - @depthcontrol - def valueExpr(self): - current = self.create_node(UnparserRule(name='valueExpr')) - choice = self.choice([0 if [4, 6, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_260', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_260', choice)] = self.unlexer.weights.get(('alt_260', choice), 1) * self.unlexer.cooldown - if choice == 0: - current = self.valueExpr_ValueExprLiteral() - elif choice == 1: - current = self.valueExpr_ValueExprTuple() - elif choice == 2: - current = self.valueExpr_ValueExprArray() - return current - valueExpr.min_depth = 2 - - @depthcontrol - def valueExpr_ValueExprLiteral(self): - current = self.create_node(UnparserRule(name='valueExpr_ValueExprLiteral')) - current += self.literal() - return current - valueExpr_ValueExprLiteral.min_depth = 3 - - @depthcontrol - def valueExpr_ValueExprTuple(self): - current = self.create_node(UnparserRule(name='valueExpr_ValueExprTuple')) - current += self.valueTupleExpr() - return current - 
valueExpr_ValueExprTuple.min_depth = 5 - - @depthcontrol - def valueExpr_ValueExprArray(self): - current = self.create_node(UnparserRule(name='valueExpr_ValueExprArray')) - current += self.unlexer.LBRACKET() - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.valueExprList() - - current += self.unlexer.RBRACKET() - return current - valueExpr_ValueExprArray.min_depth = 1 - - @depthcontrol - def columnTypeExpr(self): - current = self.create_node(UnparserRule(name='columnTypeExpr')) - choice = self.choice([0 if [4, 5, 4, 6][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_265', i), 1) for i, w in enumerate([1, 1, 1, 1])]) - self.unlexer.weights[('alt_265', choice)] = self.unlexer.weights.get(('alt_265', choice), 1) * self.unlexer.cooldown - if choice == 0: - current = self.columnTypeExpr_ColumnTypeExprSimple() - elif choice == 1: - current = self.columnTypeExpr_ColumnTypeExprParam() - elif choice == 2: - current = self.columnTypeExpr_ColumnTypeExprEnum() - elif choice == 3: - current = self.columnTypeExpr_ColumnTypeExprComplex() - return current - columnTypeExpr.min_depth = 4 - - @depthcontrol - def columnTypeExpr_ColumnTypeExprSimple(self): - current = self.create_node(UnparserRule(name='columnTypeExpr_ColumnTypeExprSimple')) - current += self.identifier() - return current - columnTypeExpr_ColumnTypeExprSimple.min_depth = 3 - - @depthcontrol - def columnTypeExpr_ColumnTypeExprParam(self): - current = self.create_node(UnparserRule(name='columnTypeExpr_ColumnTypeExprParam')) - current += self.identifier() - current += self.unlexer.LPAREN() - current += self.columnParamList() - current += self.unlexer.RPAREN() - return current - columnTypeExpr_ColumnTypeExprParam.min_depth = 4 - - @depthcontrol - def columnTypeExpr_ColumnTypeExprEnum(self): - current = self.create_node(UnparserRule(name='columnTypeExpr_ColumnTypeExprEnum')) - current += self.identifier() - current += self.unlexer.LPAREN() - current += self.enumValue() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.enumValue() - - current += self.unlexer.RPAREN() - return current - columnTypeExpr_ColumnTypeExprEnum.min_depth = 3 - - @depthcontrol - def columnTypeExpr_ColumnTypeExprComplex(self): - current = self.create_node(UnparserRule(name='columnTypeExpr_ColumnTypeExprComplex')) - current += self.identifier() - current += self.unlexer.LPAREN() - current += self.columnTypeExpr() - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.columnTypeExpr() - - current += self.unlexer.RPAREN() - return current - columnTypeExpr_ColumnTypeExprComplex.min_depth = 5 - - @depthcontrol - def columnExprList(self): - current = self.create_node(UnparserRule(name='columnExprList')) - current += self.columnExpr() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.columnExpr() - - return current - columnExprList.min_depth = 3 - - @depthcontrol - def columnExpr(self): - current = self.create_node(UnparserRule(name='columnExpr')) - choice = self.choice([0 if [4, 2, 5, 2, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_273', i), 1) for i, w in enumerate([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])]) - self.unlexer.weights[('alt_273', choice)] = self.unlexer.weights.get(('alt_273', choice), 1) * self.unlexer.cooldown - if choice == 0: - current = 
self.columnExpr_ColumnExprLiteral() - elif choice == 1: - current = self.columnExpr_ColumnExprAsterisk() - elif choice == 2: - current = self.columnExpr_ColumnExprTuple() - elif choice == 3: - current = self.columnExpr_ColumnExprArray() - elif choice == 4: - current = self.columnExpr_ColumnExprCase() - elif choice == 5: - current = self.columnExpr_ColumnExprExtract() - elif choice == 6: - current = self.columnExpr_ColumnExprTrim() - elif choice == 7: - current = self.columnExpr_ColumnExprInterval() - elif choice == 8: - current = self.columnExpr_ColumnExprIdentifier() - elif choice == 9: - current = self.columnExpr_ColumnExprFunction() - elif choice == 10: - current = self.columnExpr_ColumnExprArrayAccess() - elif choice == 11: - current = self.columnExpr_ColumnExprTupleAccess() - elif choice == 12: - current = self.columnExpr_ColumnExprUnaryOp() - elif choice == 13: - current = self.columnExpr_ColumnExprIsNull() - elif choice == 14: - current = self.columnExpr_ColumnExprBinaryOp() - elif choice == 15: - current = self.columnExpr_ColumnExprTernaryOp() - elif choice == 16: - current = self.columnExpr_ColumnExprBetween() - elif choice == 17: - current = self.columnExpr_ColumnExprAlias() - return current - columnExpr.min_depth = 2 - - @depthcontrol - def columnExpr_ColumnExprLiteral(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprLiteral')) - current += self.literal() - return current - columnExpr_ColumnExprLiteral.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprAsterisk(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprAsterisk')) - current += self.unlexer.ASTERISK() - return current - columnExpr_ColumnExprAsterisk.min_depth = 1 - - @depthcontrol - def columnExpr_ColumnExprTuple(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprTuple')) - current += self.unlexer.LPAREN() - current += self.columnExprList() - current += self.unlexer.RPAREN() - return current - columnExpr_ColumnExprTuple.min_depth = 4 - - @depthcontrol - def columnExpr_ColumnExprArray(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprArray')) - current += self.unlexer.LBRACKET() - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.columnExprList() - - current += self.unlexer.RBRACKET() - return current - columnExpr_ColumnExprArray.min_depth = 1 - - @depthcontrol - def columnExpr_ColumnExprCase(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprCase')) - current += self.unlexer.CASE() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_one(): - current += self.columnExpr() - - if self.unlexer.max_depth >= 0: - for _ in self.one_or_more(): - current += self.unlexer.WHEN() - current += self.columnExpr() - current += self.unlexer.THEN() - current += self.columnExpr() - - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_one(): - current += self.unlexer.ELSE() - current += self.columnExpr() - - current += self.unlexer.END() - return current - columnExpr_ColumnExprCase.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprExtract(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprExtract')) - current += self.unlexer.EXTRACT() - current += self.unlexer.LPAREN() - current += self.unlexer.INTERVAL_TYPE() - current += self.unlexer.FROM() - current += self.columnExpr() - current += self.unlexer.RPAREN() - return current - columnExpr_ColumnExprExtract.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprTrim(self): - 
current = self.create_node(UnparserRule(name='columnExpr_ColumnExprTrim')) - current += self.unlexer.TRIM() - current += self.unlexer.LPAREN() - choice = self.choice([0 if [2, 2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_295', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_295', choice)] = self.unlexer.weights.get(('alt_295', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.BOTH() - elif choice == 1: - current += self.unlexer.LEADING() - elif choice == 2: - current += self.unlexer.TRAILING() - current += self.unlexer.STRING_LITERAL() - current += self.unlexer.FROM() - current += self.columnExpr() - current += self.unlexer.RPAREN() - return current - columnExpr_ColumnExprTrim.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprInterval(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprInterval')) - current += self.unlexer.INTERVAL() - current += self.columnExpr() - current += self.unlexer.INTERVAL_TYPE() - return current - columnExpr_ColumnExprInterval.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprIdentifier(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprIdentifier')) - current += self.columnIdentifier() - return current - columnExpr_ColumnExprIdentifier.min_depth = 4 - - @depthcontrol - def columnExpr_ColumnExprFunction(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprFunction')) - current += self.identifier() - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - current += self.unlexer.LPAREN() - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.columnParamList() - - current += self.unlexer.RPAREN() - - current += self.unlexer.LPAREN() - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += self.columnArgList() - - current += self.unlexer.RPAREN() - return current - columnExpr_ColumnExprFunction.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprArrayAccess(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprArrayAccess')) - current += self.columnExpr() - current += self.unlexer.LBRACKET() - current += self.columnExpr() - current += self.unlexer.RBRACKET() - return current - columnExpr_ColumnExprArrayAccess.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprTupleAccess(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprTupleAccess')) - current += self.columnExpr() - current += self.unlexer.DOT() - current += self.unlexer.INTEGER_LITERAL() - return current - columnExpr_ColumnExprTupleAccess.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprUnaryOp(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprUnaryOp')) - current += self.unaryOp() - current += self.columnExpr() - return current - columnExpr_ColumnExprUnaryOp.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprIsNull(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprIsNull')) - current += self.columnExpr() - current += self.unlexer.IS() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.NOT() - - current += self.unlexer.NULL_SQL() - return current - columnExpr_ColumnExprIsNull.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprBinaryOp(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprBinaryOp')) - current += self.columnExpr() - current += self.binaryOp() - current += self.columnExpr() 
- return current - columnExpr_ColumnExprBinaryOp.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprTernaryOp(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprTernaryOp')) - current += self.columnExpr() - current += self.unlexer.QUERY() - current += self.columnExpr() - current += self.unlexer.COLON() - current += self.columnExpr() - return current - columnExpr_ColumnExprTernaryOp.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprBetween(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprBetween')) - current += self.columnExpr() - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.NOT() - - current += self.unlexer.BETWEEN() - current += self.columnExpr() - current += self.unlexer.AND() - current += self.columnExpr() - return current - columnExpr_ColumnExprBetween.min_depth = 3 - - @depthcontrol - def columnExpr_ColumnExprAlias(self): - current = self.create_node(UnparserRule(name='columnExpr_ColumnExprAlias')) - current += self.columnExpr() - current += self.unlexer.AS() - current += self.identifier() - return current - columnExpr_ColumnExprAlias.min_depth = 3 - - @depthcontrol - def columnParamList(self): - current = self.create_node(UnparserRule(name='columnParamList')) - current += self.literal() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.literal() - - return current - columnParamList.min_depth = 3 - - @depthcontrol - def columnArgList(self): - current = self.create_node(UnparserRule(name='columnArgList')) - current += self.columnArgExpr() - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.columnArgExpr() - - return current - columnArgList.min_depth = 4 - - @depthcontrol - def columnArgExpr(self): - current = self.create_node(UnparserRule(name='columnArgExpr')) - choice = self.choice([0 if [4, 3][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_306', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_306', choice)] = self.unlexer.weights.get(('alt_306', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.columnLambdaExpr() - elif choice == 1: - current += self.columnExpr() - return current - columnArgExpr.min_depth = 3 - - @depthcontrol - def columnLambdaExpr(self): - current = self.create_node(UnparserRule(name='columnLambdaExpr')) - choice = self.choice([0 if [3, 3][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_309', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_309', choice)] = self.unlexer.weights.get(('alt_309', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.LPAREN() - current += self.identifier() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.identifier() - - current += self.unlexer.RPAREN() - elif choice == 1: - current += self.identifier() - if self.unlexer.max_depth >= 3: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.identifier() - - current += self.unlexer.ARROW() - current += self.columnExpr() - return current - columnLambdaExpr.min_depth = 3 - - @depthcontrol - def columnIdentifier(self): - current = self.create_node(UnparserRule(name='columnIdentifier')) - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.tableIdentifier() - current += self.unlexer.DOT() - - 
current += self.identifier() - return current - columnIdentifier.min_depth = 3 - - @depthcontrol - def tableExpr(self): - current = self.create_node(UnparserRule(name='tableExpr')) - choice = self.choice([0 if [5, 4, 7, 6][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_315', i), 1) for i, w in enumerate([1, 1, 1, 1])]) - self.unlexer.weights[('alt_315', choice)] = self.unlexer.weights.get(('alt_315', choice), 1) * self.unlexer.cooldown - if choice == 0: - current = self.tableExpr_TableExprIdentifier() - elif choice == 1: - current = self.tableExpr_TableExprFunction() - elif choice == 2: - current = self.tableExpr_TableExprSubquery() - elif choice == 3: - current = self.tableExpr_TableExprAlias() - return current - tableExpr.min_depth = 4 - - @depthcontrol - def tableExpr_TableExprIdentifier(self): - current = self.create_node(UnparserRule(name='tableExpr_TableExprIdentifier')) - current += self.tableIdentifier() - return current - tableExpr_TableExprIdentifier.min_depth = 4 - - @depthcontrol - def tableExpr_TableExprFunction(self): - current = self.create_node(UnparserRule(name='tableExpr_TableExprFunction')) - current += self.identifier() - current += self.unlexer.LPAREN() - if self.unlexer.max_depth >= 5: - for _ in self.zero_or_one(): - current += self.tableArgList() - - current += self.unlexer.RPAREN() - return current - tableExpr_TableExprFunction.min_depth = 3 - - @depthcontrol - def tableExpr_TableExprSubquery(self): - current = self.create_node(UnparserRule(name='tableExpr_TableExprSubquery')) - current += self.unlexer.LPAREN() - current += self.selectUnionStmt() - current += self.unlexer.RPAREN() - return current - tableExpr_TableExprSubquery.min_depth = 6 - - @depthcontrol - def tableExpr_TableExprAlias(self): - current = self.create_node(UnparserRule(name='tableExpr_TableExprAlias')) - current += self.tableExpr() - current += self.unlexer.AS() - current += self.identifier() - return current - tableExpr_TableExprAlias.min_depth = 5 - - @depthcontrol - def tableIdentifier(self): - current = self.create_node(UnparserRule(name='tableIdentifier')) - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_one(): - current += self.databaseIdentifier() - current += self.unlexer.DOT() - - current += self.identifier() - return current - tableIdentifier.min_depth = 3 - - @depthcontrol - def tableArgList(self): - current = self.create_node(UnparserRule(name='tableArgList')) - current += self.tableArgExpr() - if self.unlexer.max_depth >= 4: - for _ in self.zero_or_more(): - current += self.unlexer.COMMA() - current += self.tableArgExpr() - - return current - tableArgList.min_depth = 4 - - @depthcontrol - def tableArgExpr(self): - current = self.create_node(UnparserRule(name='tableArgExpr')) - choice = self.choice([0 if [3, 4][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_323', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_323', choice)] = self.unlexer.weights.get(('alt_323', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.literal() - elif choice == 1: - current += self.tableIdentifier() - return current - tableArgExpr.min_depth = 3 - - @depthcontrol - def databaseIdentifier(self): - current = self.create_node(UnparserRule(name='databaseIdentifier')) - current += self.identifier() - return current - databaseIdentifier.min_depth = 3 - - @depthcontrol - def literal(self): - current = self.create_node(UnparserRule(name='literal')) - choice = self.choice([0 if [2, 2, 2][i] > self.unlexer.max_depth else w * 
self.unlexer.weights.get(('alt_326', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_326', choice)] = self.unlexer.weights.get(('alt_326', choice), 1) * self.unlexer.cooldown - if choice == 0: - if self.unlexer.max_depth >= 1: - for _ in self.zero_or_one(): - choice = self.choice([0 if [1, 1][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_331', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_331', choice)] = self.unlexer.weights.get(('alt_331', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.PLUS() - elif choice == 1: - current += self.unlexer.DASH() - - choice = self.choice([0 if [3, 2, 2, 2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_334', i), 1) for i, w in enumerate([1, 1, 1, 1, 1])]) - self.unlexer.weights[('alt_334', choice)] = self.unlexer.weights.get(('alt_334', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.FLOATING_LITERAL() - elif choice == 1: - current += self.unlexer.HEXADECIMAL_LITERAL() - elif choice == 2: - current += self.unlexer.INTEGER_LITERAL() - elif choice == 3: - current += self.unlexer.INF() - elif choice == 4: - current += self.unlexer.NAN_SQL() - elif choice == 1: - current += self.unlexer.STRING_LITERAL() - elif choice == 2: - current += self.unlexer.NULL_SQL() - return current - literal.min_depth = 2 - - @depthcontrol - def keyword(self): - current = self.create_node(UnparserRule(name='keyword')) - choice = self.choice([0 if [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_340', i), 1) for i, w in enumerate([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])]) - self.unlexer.weights[('alt_340', choice)] = self.unlexer.weights.get(('alt_340', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.ALIAS() - elif choice == 1: - current += self.unlexer.ALL() - elif choice == 2: - current += self.unlexer.AND() - elif choice == 3: - current += self.unlexer.ANTI() - elif choice == 4: - current += self.unlexer.ANY() - elif choice == 5: - current += self.unlexer.ARRAY() - elif choice == 6: - current += self.unlexer.AS() - elif choice == 7: - current += self.unlexer.ASCENDING() - elif choice == 8: - current += self.unlexer.ASOF() - elif choice == 9: - current += self.unlexer.BETWEEN() - elif choice == 10: - current += self.unlexer.BOTH() - elif choice == 11: - current += self.unlexer.BY() - elif choice == 12: - current += self.unlexer.CASE() - elif choice == 13: - current += self.unlexer.CAST() - elif choice == 14: - current += self.unlexer.CLUSTER() - elif choice == 15: - current += self.unlexer.COLLATE() - elif choice == 16: - current += self.unlexer.CREATE() - elif choice == 17: - current += self.unlexer.CROSS() - elif choice == 18: - current += self.unlexer.DAY() - elif choice == 19: - current += self.unlexer.DATABASE() - elif choice == 20: - current += self.unlexer.DEFAULT() - elif choice == 21: - current += self.unlexer.DELETE() - elif choice == 22: - current += 
self.unlexer.DESCENDING() - elif choice == 23: - current += self.unlexer.DISK() - elif choice == 24: - current += self.unlexer.DISTINCT() - elif choice == 25: - current += self.unlexer.DROP() - elif choice == 26: - current += self.unlexer.ELSE() - elif choice == 27: - current += self.unlexer.END() - elif choice == 28: - current += self.unlexer.ENGINE() - elif choice == 29: - current += self.unlexer.EXISTS() - elif choice == 30: - current += self.unlexer.EXTRACT() - elif choice == 31: - current += self.unlexer.FINAL() - elif choice == 32: - current += self.unlexer.FIRST() - elif choice == 33: - current += self.unlexer.FORMAT() - elif choice == 34: - current += self.unlexer.FROM() - elif choice == 35: - current += self.unlexer.FULL() - elif choice == 36: - current += self.unlexer.GLOBAL() - elif choice == 37: - current += self.unlexer.GROUP() - elif choice == 38: - current += self.unlexer.HAVING() - elif choice == 39: - current += self.unlexer.HOUR() - elif choice == 40: - current += self.unlexer.IF() - elif choice == 41: - current += self.unlexer.IN() - elif choice == 42: - current += self.unlexer.INNER() - elif choice == 43: - current += self.unlexer.INSERT() - elif choice == 44: - current += self.unlexer.INTERVAL() - elif choice == 45: - current += self.unlexer.INTO() - elif choice == 46: - current += self.unlexer.IS() - elif choice == 47: - current += self.unlexer.JOIN() - elif choice == 48: - current += self.unlexer.KEY() - elif choice == 49: - current += self.unlexer.LAST() - elif choice == 50: - current += self.unlexer.LEADING() - elif choice == 51: - current += self.unlexer.LEFT() - elif choice == 52: - current += self.unlexer.LIKE() - elif choice == 53: - current += self.unlexer.LIMIT() - elif choice == 54: - current += self.unlexer.LOCAL() - elif choice == 55: - current += self.unlexer.MATERIALIZED() - elif choice == 56: - current += self.unlexer.MINUTE() - elif choice == 57: - current += self.unlexer.MONTH() - elif choice == 58: - current += self.unlexer.NOT() - elif choice == 59: - current += self.unlexer.NULLS() - elif choice == 60: - current += self.unlexer.OFFSET() - elif choice == 61: - current += self.unlexer.ON() - elif choice == 62: - current += self.unlexer.OR() - elif choice == 63: - current += self.unlexer.ORDER() - elif choice == 64: - current += self.unlexer.OUTER() - elif choice == 65: - current += self.unlexer.OUTFILE() - elif choice == 66: - current += self.unlexer.PARTITION() - elif choice == 67: - current += self.unlexer.PREWHERE() - elif choice == 68: - current += self.unlexer.PRIMARY() - elif choice == 69: - current += self.unlexer.QUARTER() - elif choice == 70: - current += self.unlexer.RIGHT() - elif choice == 71: - current += self.unlexer.SAMPLE() - elif choice == 72: - current += self.unlexer.SECOND() - elif choice == 73: - current += self.unlexer.SEMI() - elif choice == 74: - current += self.unlexer.SET() - elif choice == 75: - current += self.unlexer.SETTINGS() - elif choice == 76: - current += self.unlexer.TABLE() - elif choice == 77: - current += self.unlexer.TEMPORARY() - elif choice == 78: - current += self.unlexer.THEN() - elif choice == 79: - current += self.unlexer.TOTALS() - elif choice == 80: - current += self.unlexer.TRAILING() - elif choice == 81: - current += self.unlexer.TRIM() - elif choice == 82: - current += self.unlexer.TO() - elif choice == 83: - current += self.unlexer.TTL() - elif choice == 84: - current += self.unlexer.UNION() - elif choice == 85: - current += self.unlexer.USING() - elif choice == 86: - current += 
self.unlexer.VALUES() - elif choice == 87: - current += self.unlexer.VOLUME() - elif choice == 88: - current += self.unlexer.WEEK() - elif choice == 89: - current += self.unlexer.WHEN() - elif choice == 90: - current += self.unlexer.WHERE() - elif choice == 91: - current += self.unlexer.WITH() - elif choice == 92: - current += self.unlexer.YEAR() - return current - keyword.min_depth = 2 - - @depthcontrol - def identifier(self): - current = self.create_node(UnparserRule(name='identifier')) - choice = self.choice([0 if [2, 3, 3][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_434', i), 1) for i, w in enumerate([1, 1, 1])]) - self.unlexer.weights[('alt_434', choice)] = self.unlexer.weights.get(('alt_434', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.IDENTIFIER() - elif choice == 1: - current += self.unlexer.INTERVAL_TYPE() - elif choice == 2: - current += self.keyword() - return current - identifier.min_depth = 2 - - @depthcontrol - def unaryOp(self): - current = self.create_node(UnparserRule(name='unaryOp')) - choice = self.choice([0 if [1, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_438', i), 1) for i, w in enumerate([1, 1])]) - self.unlexer.weights[('alt_438', choice)] = self.unlexer.weights.get(('alt_438', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.DASH() - elif choice == 1: - current += self.unlexer.NOT() - return current - unaryOp.min_depth = 1 - - @depthcontrol - def binaryOp(self): - current = self.create_node(UnparserRule(name='binaryOp')) - choice = self.choice([0 if [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2][i] > self.unlexer.max_depth else w * self.unlexer.weights.get(('alt_441', i), 1) for i, w in enumerate([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])]) - self.unlexer.weights[('alt_441', choice)] = self.unlexer.weights.get(('alt_441', choice), 1) * self.unlexer.cooldown - if choice == 0: - current += self.unlexer.CONCAT() - elif choice == 1: - current += self.unlexer.ASTERISK() - elif choice == 2: - current += self.unlexer.SLASH() - elif choice == 3: - current += self.unlexer.PLUS() - elif choice == 4: - current += self.unlexer.DASH() - elif choice == 5: - current += self.unlexer.PERCENT() - elif choice == 6: - current += self.unlexer.EQ_DOUBLE() - elif choice == 7: - current += self.unlexer.EQ_SINGLE() - elif choice == 8: - current += self.unlexer.NOT_EQ() - elif choice == 9: - current += self.unlexer.LE() - elif choice == 10: - current += self.unlexer.GE() - elif choice == 11: - current += self.unlexer.LT() - elif choice == 12: - current += self.unlexer.GT() - elif choice == 13: - current += self.unlexer.AND() - elif choice == 14: - current += self.unlexer.OR() - elif choice == 15: - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.NOT() - - current += self.unlexer.LIKE() - elif choice == 16: - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.GLOBAL() - - if self.unlexer.max_depth >= 2: - for _ in self.zero_or_one(): - current += self.unlexer.NOT() - - current += self.unlexer.IN() - return current - binaryOp.min_depth = 1 - - @depthcontrol - def enumValue(self): - current = self.create_node(UnparserRule(name='enumValue')) - current += self.unlexer.STRING_LITERAL() - current += self.unlexer.EQ_SINGLE() - current += self.unlexer.INTEGER_LITERAL() - return current - enumValue.min_depth = 2 - - default_rule = queryList - diff --git a/utils/grammar-fuzzer/README.md 
b/utils/grammar-fuzzer/README.md deleted file mode 100644 index b3f233c8648..00000000000 --- a/utils/grammar-fuzzer/README.md +++ /dev/null @@ -1,41 +0,0 @@ -How to use Fuzzer -=== - -The fuzzer consists of auto-generated files: - - ClickHouseUnlexer.py - ClickHouseUnparser.py - -They are generated from grammar files (.g4) using Grammarinator: - - pip3 install grammarinator - grammarinator-process ClickHouseLexer.g4 ClickHouseParser.g4 -o fuzzer/ - -Then you can generate test input for ClickHouse client: - - cd fuzzer - grammarinator-generate \ - -r query_list \ # top-level rule - -o /tmp/sql_test_%d.sql \ # template for output test names - -n 10 \ # number of tests - -c 0.3 \ - -d 20 \ # depth of recursion - -p ClickHouseUnparser.py -l ClickHouseUnlexer.py \ # auto-generated unparser and unlexer - --test-transformers SpaceTransformer.single_line_whitespace \ # transform function to insert whitespace - -For more details see `grammarinator-generate --help`. As a test-transformer function also can be used `SpaceTransformer.multi_line_transformer` - both functions reside in `fuzzer/SpaceTransformer.py` file. - - -Parsing steps -=== - -1. Replace all operators with corresponding functions. -2. Replace all asterisks with columns - if it's inside function call, then expand it as multiple arguments. Warn about undeterministic invocations when functions have positional arguments. - -Old vs. new parser -=== - -- `a as b [c]` - accessing aliased array expression is not possible. -- `a as b . 1` - accessing aliased tuple expression is not possible. -- `between a is not null and b` - `between` operator should have lower priority than `is null`. -- `*.1` - accessing asterisk tuple expression is not possible. diff --git a/utils/grammar-fuzzer/SpaceTransformer.py b/utils/grammar-fuzzer/SpaceTransformer.py deleted file mode 100644 index ad96845c7e2..00000000000 --- a/utils/grammar-fuzzer/SpaceTransformer.py +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- - -from grammarinator.runtime.tree import * - -from itertools import tee, islice, zip_longest -import random - - -def single_line_whitespace(node): - return _whitespace(node, ' \t') - - -def multi_line_whitespace(node): - return _whitespace(node, ' \t\r\n') - - -def _whitespace(node, symbols): - for child in node.children: - _whitespace(child, symbols) - - # helper function to look ahead one child - def with_next(iterable): - items, nexts = tee(iterable, 2) - nexts = islice(nexts, 1, None) - return zip_longest(items, nexts) - - if isinstance(node, UnparserRule): - new_children = [] - for child, next_child in with_next(node.children): - if (not next_child or - next_child and isinstance(next_child, UnlexerRule) and next_child.name == 'DOT' or - isinstance(child, UnlexerRule) and child.name == 'DOT'): - new_children.append(child) - else: - new_children.extend([child, UnlexerRule(src=random.choice(symbols))]) - node.children = new_children - - return node diff --git a/utils/grammar-fuzzer/__init__.py b/utils/grammar-fuzzer/__init__.py deleted file mode 100644 index 40a96afc6ff..00000000000 --- a/utils/grammar-fuzzer/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# -*- coding: utf-8 -*- diff --git a/utils/junit_to_html/junit-noframes.xsl b/utils/junit_to_html/junit-noframes.xsl deleted file mode 100644 index ae70e230ef6..00000000000 --- a/utils/junit_to_html/junit-noframes.xsl +++ /dev/null @@ -1,390 +0,0 @@ - - - - - - - - Test Results - - - - - - - - -
[junit-noframes.xsl — a 390-line XSLT stylesheet, deleted here, that rendered JUnit XML as a single self-contained "Test Results" HTML page: a summary table (Tests, Failures, Errors, Success rate, Time), per-package and per-test tables (Name, Status, Type, Time(s)), failure/error details with line and column information, "Back to top" navigation, and the note "failures are anticipated and checked for with assertions while errors are unanticipated".]
diff --git a/utils/junit_to_html/junit_to_html b/utils/junit_to_html/junit_to_html deleted file mode 100755 index 132763c7d4c..00000000000 --- a/utils/junit_to_html/junit_to_html +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import os -import lxml.etree as etree -import json -import argparse - -def export_testcases_json(report, path): - with open(os.path.join(path, "cases.jer"), "w") as testcases_file: - for testsuite in report.getroot(): - for testcase in testsuite: - row = {} - row["hostname"] = testsuite.get("hostname") - row["suite"] = testsuite.get("name") - row["suite_duration"] = testsuite.get("time") - row["timestamp"] = testsuite.get("timestamp") - row["testname"] = testcase.get("name") - row["classname"] = testcase.get("classname") - row["file"] = testcase.get("file") - row["line"] = testcase.get("line") - row["duration"] = testcase.get("time") - for el in testcase: - if el.tag == "system-err": - row["stderr"] = el.text - else: - row["stderr"] = "" - - if el.tag == "system-out": - row["stdout"] = el.text - else: - row["stdout"] = "" - - json.dump(row, testcases_file) - testcases_file.write("\n") - -def export_testsuites_json(report, path): - with open(os.path.join(path, "suites.jer"), "w") as testsuites_file: - for testsuite in report.getroot(): - row = {} - row["suite"] = testsuite.get("name") - row["errors"] = testsuite.get("errors") - row["failures"] = testsuite.get("failures") - row["hostname"] = testsuite.get("hostname") - row["skipped"] = testsuite.get("skipped") - row["duration"] = testsuite.get("time") - row["timestamp"] = testsuite.get("timestamp") - json.dump(row, testsuites_file) - testsuites_file.write("\n") - - -def _convert_junit_to_html(junit_path, result_path, export_cases, export_suites): - with open(os.path.join(os.path.dirname(__file__), "junit-noframes.xsl")) as xslt_file: - junit_to_html_xslt = etree.parse(xslt_file) - if not os.path.exists(result_path): - os.makedirs(result_path) - - with open(junit_path) as junit_file: - junit_xml = etree.parse(junit_file) - - if export_suites: - export_testsuites_json(junit_xml, result_path) - if export_cases: - export_testcases_json(junit_xml, result_path) - transform = etree.XSLT(junit_to_html_xslt) - html = etree.tostring(transform(junit_xml), encoding="utf-8") - - with open(os.path.join(result_path, "result.html"), "w") as html_file: - html_file.write(html) - -if __name__ == "__main__": - - parser = argparse.ArgumentParser(description='Convert JUnit XML.') - parser.add_argument('junit', help='path to junit.xml report') - parser.add_argument('result_dir', nargs='?', help='directory for result files. 
Default to junit.xml directory') - parser.add_argument('--export-cases', help='Export JSONEachRow result for testcases to upload in CI', action='store_true') - parser.add_argument('--export-suites', help='Export JSONEachRow result for testsuites to upload in CI', action='store_true') - - args = parser.parse_args() - - junit_path = args.junit - if args.result_dir: - result_path = args.result_dir - else: - result_path = os.path.dirname(junit_path) - print("junit_path: {}, result_path: {}, export cases:{}, export suites: {}".format(junit_path, result_path, args.export_cases, args.export_suites)) - _convert_junit_to_html(junit_path, result_path, args.export_cases, args.export_suites) diff --git a/utils/link-validate/link-validate.sh b/utils/link-validate/link-validate.sh deleted file mode 100755 index 2d8d57b95fc..00000000000 --- a/utils/link-validate/link-validate.sh +++ /dev/null @@ -1,42 +0,0 @@ -#/bin/sh -# -# This script is used to validate the shared libraries -# -# Authors: FoundationDB team, https://github.com/apple/foundationdb/blame/master/build/link-validate.sh -# License: Apache License 2.0 - -verlte() { - [ "$1" = "`echo -e "$1\n$2" | sort -V | head -n1`" ] -} - -ALLOWED_SHARED_LIBS=("libdl.so.2" "libpthread.so.0" "librt.so.1" "libm.so.6" "libc.so.6" "ld-linux-x86-64.so.2") - -if [ "$#" -lt 1 ]; then - echo "USAGE: link-validate.sh BINNAME GLIBC_VERSION" - exit 1 -fi - -# Step 1: glibc version - -for i in $(objdump -T "$1" | awk '{print $5}' | grep GLIBC | sed 's/ *$//g' | sed 's/GLIBC_//' | sort | uniq); do - if ! verlte "$i" "${2:-2.10}"; then - echo "Dependency on newer libc detected: $i" - exit 1 - fi -done - -# Step 2: Other dynamic dependencies - -for j in $(objdump -p "$1" | grep NEEDED | awk '{print $2}'); do - PRESENT=0 - for k in ${ALLOWED_SHARED_LIBS[@]}; do - if [[ "$k" == "$j" ]]; then - PRESENT=1 - break - fi - done - if ! [[ $PRESENT == 1 ]]; then - echo "Unexpected shared object dependency detected: $j" - exit 1 - fi -done diff --git a/utils/tests-visualizer/index.html b/utils/tests-visualizer/index.html index 690c42e486e..13f8daaa151 100644 --- a/utils/tests-visualizer/index.html +++ b/utils/tests-visualizer/index.html @@ -1,16 +1,85 @@ - + + - -

[utils/tests-visualizer/index.html — the page grows from 16 to 85 lines of markup and JavaScript: the loading message changes from "Loading (10 seconds, 20 MB)..." to "Loading (~10 seconds, ~20 MB)", and an error message is added for when the data fails to load.]
+ diff --git a/utils/upload_test_results/README.md b/utils/upload_test_results/README.md deleted file mode 100644 index e6b361081a2..00000000000 --- a/utils/upload_test_results/README.md +++ /dev/null @@ -1,34 +0,0 @@ -## Tool to upload results to CI ClickHouse - -Currently allows to upload results from `junit_to_html` tool to ClickHouse CI - -``` -usage: upload_test_results [-h] --sha SHA --pr PR --file FILE --type - {suites,cases} [--user USER] --password PASSWORD - [--ca-cert CA_CERT] [--host HOST] [--db DB] - -Upload test result to CI ClickHouse. - -optional arguments: - -h, --help show this help message and exit - --sha SHA sha of current commit - --pr PR pr of current commit. 0 for master - --file FILE file to upload - --type {suites,cases} - Export type - --user USER user name - --password PASSWORD password - --ca-cert CA_CERT CA certificate path - --host HOST CI ClickHouse host - --db DB CI ClickHouse database name -``` - -$ ./upload_test_results --sha "cf7eaee3301d4634acdacbfa308ddbe0cc6a061d" --pr "0" --file xyz/cases.jer --type cases --password $PASSWD - -CI checks has single commit sha and pr identifier. -While uploading your local results for testing purposes try to use correct sha and pr. - -CA Certificate for ClickHouse CI can be obtained from Yandex.Cloud where CI database is hosted -``` bash -wget "https://storage.yandexcloud.net/cloud-certs/CA.pem" -O YandexInternalRootCA.crt -``` \ No newline at end of file diff --git a/utils/upload_test_results/upload_test_results b/utils/upload_test_results/upload_test_results deleted file mode 100755 index 5916d0d85e8..00000000000 --- a/utils/upload_test_results/upload_test_results +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env python3 -import requests -import argparse - -# CREATE TABLE test_suites -# ( -# sha String, -# pr UInt16, -# suite String, -# errors UInt16, -# failures UInt16, -# hostname String, -# skipped UInt16, -# duration Double, -# timestamp DateTime -# ) ENGINE = MergeTree ORDER BY tuple(timestamp, suite); - -QUERY_SUITES="INSERT INTO test_suites "\ - "SELECT '{sha}' AS sha, "\ - "{pr} AS pr, "\ - "suite, "\ - "errors, "\ - "failures, "\ - "hostname, "\ - "skipped, "\ - "duration, "\ - "timestamp "\ - "FROM input('"\ - "suite String, "\ - "errors UInt16, "\ - "failures UInt16, "\ - "hostname String, "\ - "skipped UInt16, "\ - "duration Double, "\ - "timestamp DateTime"\ - "') FORMAT JSONEachRow" - -# CREATE TABLE test_cases -# ( -# sha String, -# pr UInt16, -# hostname String, -# suite String, -# timestamp DateTime, -# testname String, -# classname String, -# file String, -# line UInt16, -# duration Double, -# suite_duration Double, -# stderr String, -# stdout String -# ) ENGINE = MergeTree ORDER BY tuple(timestamp, testname); - -QUERY_CASES="INSERT INTO test_cases "\ - "SELECT '{sha}' AS sha, "\ - "{pr} AS pr, "\ - "hostname, "\ - "suite, "\ - "timestamp, "\ - "testname, "\ - "classname, "\ - "file, "\ - "line, "\ - "duration, "\ - "suite_duration, "\ - "stderr,"\ - "stdout "\ - "FROM input('"\ - "hostname String, "\ - "suite String, "\ - "timestamp DateTime, "\ - "testname String, "\ - "classname String, "\ - "file String, "\ - "line UInt16, "\ - "duration Double, "\ - "suite_duration Double, "\ - "stderr String, "\ - "stdout String"\ - "') FORMAT JSONEachRow" - - -def upload_request(sha, pr, file, q_type, user, password, ca_cert, host, db): - with open(file) as upload_f: - query = QUERY_SUITES if q_type=="suites" else QUERY_CASES - query = query.format(sha=sha, pr=pr) - url = 
'https://{host}:8443/?database={db}&query={query}&date_time_input_format=best_effort'.format( - host=host, - db=db, - query=query - ) - data=upload_f - auth = { - 'X-ClickHouse-User': user, - 'X-ClickHouse-Key': password, - } - - print(query); - - res = requests.post( - url, - data=data, - headers=auth, - verify=ca_cert) - res.raise_for_status() - return res.text - -if __name__ == "__main__": - - parser = argparse.ArgumentParser(description='Upload test result to CI ClickHouse.') - parser.add_argument('--sha', help='sha of current commit', type=str, required=True) - parser.add_argument('--pr', help='pr of current commit. 0 for master', type=int, required=True) - parser.add_argument('--file', help='file to upload', required=True) - parser.add_argument('--type', help='Export type', choices=['suites', 'cases'] , required=True) - parser.add_argument('--user', help='user name', type=str, default="clickhouse-ci") - parser.add_argument('--password', help='password', type=str, required=True) - parser.add_argument('--ca-cert', help='CA certificate path', type=str, default="/usr/local/share/ca-certificates/YandexInternalRootCA.crt") - parser.add_argument('--host', help='CI ClickHouse host', type=str, default="c1a-ity5agjmuhyu6nu9.mdb.yandexcloud.net") - parser.add_argument('--db', help='CI ClickHouse database name', type=str, default="clickhouse-ci") - - args = parser.parse_args() - - print((upload_request(args.sha, args.pr, args.file, args.type, args.user, args.password, args.ca_cert, args.host, args.db))) - - - diff --git a/utils/zero_copy/zero_copy_schema_converter.py b/utils/zero_copy/zero_copy_schema_converter.py new file mode 100755 index 00000000000..6fdd03add5a --- /dev/null +++ b/utils/zero_copy/zero_copy_schema_converter.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +import argparse +import socket +import uuid +from kazoo.client import KazooClient + +def parse_args(): + """ + Parse command-line arguments. 
+ """ + parser = argparse.ArgumentParser() + parser.add_argument('--hosts', default=socket.getfqdn() + ':2181', help='ZooKeeper hosts (host:port,host:port,...)') + parser.add_argument('-s', '--secure', default=False, action='store_true', help='Use secure connection') + parser.add_argument('--cert', default='', help='Client TLS certificate file') + parser.add_argument('--key', default='', help='Client TLS key file') + parser.add_argument('--ca', default='', help='Client TLS ca file') + parser.add_argument('-u', '--user', default='', help='ZooKeeper ACL user') + parser.add_argument('-p', '--password', default='', help='ZooKeeper ACL password') + parser.add_argument('-r', '--root', default='/clickhouse', help='ZooKeeper root path for ClickHouse') + parser.add_argument('-z', '--zcroot', default='zero_copy', help='ZooKeeper node for new zero-copy data') + parser.add_argument('--dryrun', default=False, action='store_true', help='Do not perform any actions') + parser.add_argument('--cleanup', default=False, action='store_true', help='Clean old nodes') + parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Verbose mode') + + return parser.parse_args() + + +# Several folders to heuristic that zookeepr node is folder node +# May be false positive when someone creates set of tables with same paths +table_nodes = ['alter_partition_version', 'block_numbers', 'blocks', 'columns', 'leader_election'] +zc_nodes = ['zero_copy_s3', 'zero_copy_hdfs'] + + +def convert_node(client, args, path, zc_node): + base_path = f'{path}/{zc_node}/shared' + parts = client.get_children(base_path) + table_id_path = f'{path}/table_id' + table_id = '' + if client.exists(table_id_path): + table_id = client.get(table_id_path)[0].decode('UTF-8') + else: + table_id = str(uuid.uuid4()) + if args.verbose: + print(f'Make table_id "{table_id_path}" = "{table_id}"') + if not args.dryrun: + client.create(table_id_path, bytes(table_id, 'UTF-8')) + for part in parts: + part_path = f'{base_path}/{part}' + uniq_ids = client.get_children(part_path) + for uniq_id in uniq_ids: + uniq_path = f'{part_path}/{uniq_id}' + replicas = client.get_children(uniq_path) + for replica in replicas: + replica_path = f'{uniq_path}/{replica}' + new_path = f'{args.root}/{args.zcroot}/{zc_node}/{table_id}/{part}/{uniq_id}/{replica}' + if not client.exists(new_path): + if args.verbose: + print(f'Make node "{new_path}"') + if not args.dryrun: + client.ensure_path(f'{args.root}/{args.zcroot}/{zc_node}/{table_id}/{part}/{uniq_id}') + client.create(new_path, value=b'lock') + if args.cleanup: + if args.verbose: + print(f'Remove node "{replica_path}"') + if not args.dryrun: + client.delete(replica_path) + if args.cleanup and not args.dryrun: + client.delete(uniq_path) + if args.cleanup and not args.dryrun: + client.delete(part_path) + if args.cleanup and not args.dryrun: + client.delete(base_path) + client.delete(f'{path}/{zc_node}') + + +def convert_table(client, args, path, nodes): + print(f'Convert table nodes by path "{path}"') + for zc_node in zc_nodes: + if zc_node in nodes: + convert_node(client, args, path, zc_node) + + +def is_like_a_table(nodes): + for tn in table_nodes: + if tn not in nodes: + return False + return True + + +def scan_recursive(client, args, path): + nodes = client.get_children(path) + if is_like_a_table(nodes): + convert_table(client, args, path, nodes) + else: + for node in nodes: + scan_recursive(client, args, f'{path}/{node}') + + +def scan(client, args): + nodes = client.get_children(args.root) + for node in 
nodes: + if node != args.zcroot: + scan_recursive(client, args, f'{args.root}/{node}') + + +def get_client(args): + client = KazooClient(connection_retry=3, + command_retry=3, + timeout=1, + hosts=args.hosts, + use_ssl=args.secure, + certfile=args.cert, + keyfile=args.key, + ca=args.ca + ) + client.start() + if (args.user and args.password): + client.add_auth('digest', f'{args.user}:{args.password}') + return client + + +def main(): + args = parse_args() + client = get_client(args) + scan(client, args) + + +if __name__ == '__main__': + main() diff --git a/website/benchmark/hardware/index.html b/website/benchmark/hardware/index.html index 260a928184d..c6b1e2be275 100644 --- a/website/benchmark/hardware/index.html +++ b/website/benchmark/hardware/index.html @@ -82,6 +82,9 @@ Comparison of EBS and EFS is from Ramazan Polat.
Results for Hetzner and Scaleway are from Anthony Najjar Simon (Panelbear).
Results for GCP are from Vy Nguyen Tan.
Results for ThinkPad P15 are from Mikhail Shiryaev.
+Results for Rock Pi 4 are from Kirill Zholnay.
+Results for Xeon 6266C are from David in Shanghai.
+Results for SSDNodes and Cavium are from Lorenzo QXIP.

diff --git a/website/benchmark/hardware/results/cavium_4core.json b/website/benchmark/hardware/results/cavium_4core.json new file mode 100644 index 00000000000..a7cb96b2cd3 --- /dev/null +++ b/website/benchmark/hardware/results/cavium_4core.json @@ -0,0 +1,54 @@ +[ + { + "system": "Cavium ARM64 CPU (4 Core, 1.5 GHz, NVMe SSD)", + "system_full": "Cavium ARM64 CPU (4 Corem 1.5 GHz, NVMe SSD), 16 GiB", + "time": "2021-12-27 00:00:00", + "kind": "server", + "result": + [ +[0.004, 0.004, 0.004], +[0.196, 0.178, 0.180], +[0.495, 0.437, 0.426], +[0.715, 0.499, 0.499], +[0.992, 0.798, 0.795], +[3.958, 3.750, 3.751], +[0.288, 0.274, 0.273], +[0.236, 0.231, 0.239], +[3.129, 2.936, 2.918], +[4.221, 3.924, 3.934], +[2.395, 2.285, 2.226], +[2.832, 2.703, 2.644], +[6.510, 6.301, 6.262], +[7.933, 7.669, 7.704], +[7.397, 7.122, 7.146], +[4.692, 4.537, 4.540], +[15.194, 14.835, 15.051], +[10.446, 10.036, 10.072], +[26.472, 25.655, 25.809], +[0.879, 0.669, 0.694], +[14.614, 13.755, 13.726], +[16.876, 15.675, 15.703], +[34.715, 33.204, 33.250], +[18.850, 15.387, 15.332], +[4.455, 4.025, 4.016], +[3.667, 3.415, 3.457], +[4.507, 4.057, 4.049], +[14.344, 13.394, 13.390], +[17.519, 17.052, 17.067], +[8.606, 8.611, 8.545], +[6.936, 6.491, 6.496], +[10.020, 9.260, 9.233], +[39.793, 39.631, 39.553], +[30.310, 29.604, 29.572], +[30.485, 29.557, 29.649], +[8.539, 8.337, 8.342], +[0.931, 0.912, 0.912], +[0.523, 0.516, 0.507], +[0.460, 0.448, 0.450], +[1.880, 1.817, 1.884], +[0.141, 0.119, 0.117], +[0.116, 0.095, 0.092], +[0.021, 0.017, 0.014] + ] + } +] diff --git a/website/benchmark/hardware/results/rock_pi.json b/website/benchmark/hardware/results/rock_pi.json new file mode 100644 index 00000000000..210dc213a49 --- /dev/null +++ b/website/benchmark/hardware/results/rock_pi.json @@ -0,0 +1,54 @@ +[ + { + "system": "Rock Pi 4, 4GiB, NVMe", + "system_full": "Rock Pi 4, 4GiB C, NVMe", + "time": "2021-12-23 00:00:00", + "kind": "desktop", + "result": + [ +[0.007, 0.014, 0.005], +[0.229, 0.132, 0.215], +[0.489, 0.351, 0.306], +[0.879, 0.774, 0.768], +[1.034, 0.966, 0.879], +[2.491, 2.249, 2.493], +[0.379, 0.212, 0.213], +[0.227, 0.140, 0.152], +[3.944, 3.823, 3.805], +[5.272, 4.985, 5.069], +[2.356, 2.193, 2.254], +[2.819, 2.595, 2.568], +[9.124, 8.306, 8.529], +[11.857, 11.412, 11.290], +[9.796, 9.477, 9.610], +[8.846, 8.867, 8.909], +[null, null, null], +[null, null, null], +[null, null, null], +[1.293, 0.887, 0.980], +[15.018, 14.928, 14.748], +[19.179, 17.889, 18.021], +[45.524, 46.927, 46.909], +[23.904, 23.197, 23.511], +[5.264, 4.891, 4.936], +[4.211, 3.940, 4.047], +[5.113, 4.615, 4.783], +[17.910, 16.800, 16.410], +[23.537, 22.249, 22.172], +[16.549, 16.388, 16.337], +[9.562, 9.006, 9.260], +[17.097, 17.676, 17.585], +[null, null, null], +[null, null, null], +[null, null, null], +[null, null, null], +[1.668, 1.469, 1.342], +[0.463, 0.442, 0.353], +[0.486, 0.410, 0.346], +[2.190, 2.014, 1.878], +[0.263, 0.097, 0.201], +[0.173, 0.082, 0.139], +[0.188, 0.024, 0.016] + ] + } +] diff --git a/website/benchmark/hardware/results/ssdnodes.json b/website/benchmark/hardware/results/ssdnodes.json new file mode 100644 index 00000000000..623f4b49687 --- /dev/null +++ b/website/benchmark/hardware/results/ssdnodes.json @@ -0,0 +1,54 @@ +[ + { + "system": "SSDNodes G6", + "system_full": "G6 Performance+ 48GB RAM, 720GB NVMe, 12x Intel Silver vCPU, KVM", + "time": "2021-12-27 00:00:00", + "kind": "cloud", + "result": + [ +[0.002, 0.002, 0.002], +[0.021, 0.017, 0.017], +[0.053, 0.034, 0.039], +[0.090, 0.053, 0.047], +[0.146, 0.123, 
0.117], +[0.358, 0.325, 0.323], +[0.025, 0.020, 0.021], +[0.042, 0.015, 0.014], +[0.566, 0.511, 0.524], +[0.704, 0.626, 0.591], +[0.229, 0.174, 0.194], +[0.255, 0.210, 0.206], +[0.849, 0.725, 0.701], +[0.984, 0.907, 0.948], +[0.952, 0.886, 0.899], +[0.772, 0.741, 0.738], +[2.945, 2.667, 2.703], +[1.645, 1.646, 1.576], +[5.342, 5.042, 5.306], +[0.088, 0.052, 0.051], +[1.176, 0.825, 0.839], +[1.261, 1.001, 0.933], +[2.977, 2.190, 2.193], +[1.872, 0.991, 0.956], +[0.368, 0.264, 0.275], +[0.300, 0.247, 0.241], +[0.329, 0.272, 0.277], +[1.124, 0.870, 0.824], +[1.545, 1.270, 1.281], +[1.478, 1.399, 1.463], +[0.809, 0.696, 0.677], +[1.095, 0.875, 0.832], +[5.164, 4.841, 4.613], +[3.859, 3.435, 3.396], +[4.054, 3.479, 3.496], +[1.325, 1.274, 1.294], +[0.261, 0.248, 0.266], +[0.102, 0.096, 0.104], +[0.102, 0.090, 0.094], +[0.600, 0.550, 0.566], +[0.041, 0.031, 0.028], +[0.029, 0.021, 0.025], +[0.007, 0.006, 0.005] + ] + } +] diff --git a/website/benchmark/hardware/results/xeon_gold_6266.json b/website/benchmark/hardware/results/xeon_gold_6266.json new file mode 100644 index 00000000000..0e68466a633 --- /dev/null +++ b/website/benchmark/hardware/results/xeon_gold_6266.json @@ -0,0 +1,56 @@ +[ + { + "system": "Huawei Cloud c6.xlarge.4, 4vCPUs, 16 GiB", + "system_full": "Huawei Cloud c6.xlarge.4, Xeon Gold 6266C, 3GHz, 4vCPU, 16GiB RAM, vda1 40GB", + "cpu_vendor": "Intel", + "cpu_model": "Xeon Gold 6266C", + "time": "2021-12-23 00:00:00", + "kind": "cloud", + "result": + [ +[0.001, 0.001, 0.001], +[0.034, 0.023, 0.023], +[0.168, 0.105, 0.104], +[0.745, 0.162, 0.160], +[1.512, 0.328, 0.327], +[2.408, 1.162, 1.155], +[0.069, 0.052, 0.051], +[0.074, 0.027, 0.026], +[2.314, 1.833, 1.796], +[2.749, 2.014, 2.011], +[1.424, 0.618, 0.579], +[1.494, 0.681, 0.677], +[3.208, 2.457, 2.529], +[5.071, 3.329, 3.411], +[3.968, 3.289, 3.330], +[3.142, 2.925, 2.827], +[9.473, 9.034, 8.850], +[6.768, 6.256, 6.115], +[18.388, 17.790, 17.892], +[1.105, 0.195, 0.194], +[20.310, 3.459, 3.416], +[22.772, 3.811, 3.773], +[42.554, 8.738, 8.640], +[30.747, 4.013, 3.967], +[4.707, 0.973, 0.965], +[2.003, 0.845, 0.839], +[4.978, 0.991, 0.974], +[19.726, 3.293, 3.264], +[17.151, 5.171, 5.134], +[3.620, 3.600, 3.600], +[4.693, 2.172, 2.115], +[10.842, 2.686, 2.750], +[17.857, 17.086, 16.907], +[22.926, 13.070, 12.808], +[22.803, 12.727, 12.867], +[4.189, 3.888, 3.893], +[0.227, 0.176, 0.177], +[0.085, 0.068, 0.067], +[0.101, 0.064, 0.067], +[0.493, 0.438, 0.399], +[0.042, 0.022, 0.021], +[0.029, 0.017, 0.015], +[0.007, 0.005, 0.003] + ] + } +] diff --git a/website/blog/en/2021/tests-visualization.md b/website/blog/en/2021/tests-visualization.md new file mode 100644 index 00000000000..259cb4d8e34 --- /dev/null +++ b/website/blog/en/2021/tests-visualization.md @@ -0,0 +1,45 @@ +--- +title: 'Decorating a Christmas Tree With the Help Of Flaky Tests' +image: 'https://blog-images.clickhouse.com/en/2021/tests-visualization/tests.png' +date: '2021-12-27' +author: '[Alexey Milovidov](https://github.com/alexey-milovidov)' +tags: ['tests', 'ci', 'flaky', 'christmas', 'visualization'] +--- + +Test suites and testing infrastructure are one of the main assets of ClickHouse. We have tons of functional, integration, unit, performance, stress and fuzz tests. Tests are run on a per commit basis and results are publicly available. + +We also save the results of all test runs into the database in ClickHouse. We started collecting results in June 2020, and we have 1 777 608 240 records so far. Now we run around 5 to 9 million tests every day. 
+ +Tests are good (in general). A good test suite allows for fast development iterations, stable releases, and accepting more contributions from the community. We love tests. If there's something strange in ClickHouse, what are we gonna do? Write more tests. + +Some tests can be flaky. The reasons for flakiness are countless - most of them are simple timing issues in the test script itself, but sometimes a test that fails one time in a thousand can uncover subtle logic errors in the code. + +The problem is how to deal with flaky tests. Some people suggest automatically muting the "annoying" flaky tests, or adding automatic retries in case of failure. We believe that this is all wrong. Instead of trying to ignore flaky tests, we do the opposite: we put maximum effort into making the tests even more flaky! + +Our recipes for flaky tests: +— never mute or restart them; if the test failed once, always look and investigate the cause; +— randomize the environment for every test run so the test will have more possible reasons to fail; +— if new tests are added, run them 100 times and if at least one fails, do not merge the pull request; +— if new tests are added, use them as a corpus for fuzzing - it will uncover corner cases even if the author did not write tests for them; +— [randomize thread scheduling](https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/ThreadFuzzer.h) and add random sleeps and switching between CPU cores at random places and before and after mutex locks/unlocks; +— run everything in parallel on slow machines; + +Key point: to prevent flaky tests, we make our tests as flaky as possible. + +## Nice Way To Visualize Flaky Tests + +There is a test suite named "[functional stateless tests](https://github.com/ClickHouse/ClickHouse/tree/master/tests/queries/0_stateless)" that has 3772 tests. For every day since 2020-06-13 (561 days) and every test (3772 tests), I drew a picture of size 561x3772 where a pixel is green if all test runs finished successfully in the master branch during that day (for all commits and all combinations: release, debug+assertions, ASan, MSan, TSan, UBSan), and a pixel is red if at least one run failed. The pixel will be transparent if the test did not exist that day. + +This visualization is a toy that I've made for fun: + +![Visualization](https://blog-images.clickhouse.com/en/2021/tests-visualization/tree_half.png) + +It looks like a Christmas Tree (you need a bit of imagination). If you have a different kind of imagination, you can see it as a green field with flowers. + +The time is from left to right. The tests are numbered with non-unique numbers (new tests usually get larger numbers), and these numbers are on the vertical axis (newer tests on top). + +If you see red dots in a horizontal line - it is a flaky test. If you see red dots in a vertical line - it means that one day we accidentally broke the master branch. If you see black horizontal lines or cuts in the tree - it means that the tests were added with some old numbers, most likely because some long-living feature branch was merged. If you see black vertical lines - it means that some days tests were not run. + +The velocity of adding new tests is represented by how tall and narrow the Christmas tree is. When we add a large number of tests, the tree grows with an almost vertical slope. + +The image is prepared by an [HTML page](https://github.com/ClickHouse/ClickHouse/pull/33185) with some JavaScript that queries a ClickHouse database directly and writes to a canvas.
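To make that concrete, here is a rough sketch of the kind of aggregation such a page can run - not the actual query behind the picture. It assumes a hypothetical `checks` table with `check_start_time`, `test_name` and `test_status` columns and talks to the ClickHouse HTTP interface; the real table, column names and query may differ.

```python
#!/usr/bin/env python3
# Hypothetical sketch: compute one pixel per (test, day) pair.
# The `checks` table and its columns are assumptions, not the real schema.
import requests

QUERY = """
SELECT
    test_name,
    toDate(check_start_time) AS day,
    countIf(test_status != 'OK') > 0 AS failed  -- 1 = red pixel, 0 = green pixel
FROM checks
WHERE check_start_time >= toDate('2020-06-13')
GROUP BY test_name, day
ORDER BY test_name, day
FORMAT JSONEachRow
"""

def fetch_pixels(host='localhost', port=8123):
    # ClickHouse HTTP interface: the query goes in the POST body,
    # and the result comes back as one JSON object per line.
    response = requests.post(f'http://{host}:{port}/', data=QUERY)
    response.raise_for_status()
    return response.text.splitlines()

if __name__ == '__main__':
    for row in fetch_pixels():
        print(row)
```

The drawing part is then trivial: map the day to the x coordinate, the test to the y coordinate, and `failed` to the pixel color.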
It took around ten seconds to build this picture. I also prepared an [interactive version](https://blog-images.clickhouse.com/en/2021/tests-visualization/demo.html) with already-saved data where you can play and find your favorite tests. diff --git a/website/templates/index/success.html b/website/templates/index/success.html index e09274c3a6f..7d70f4367b2 100644 --- a/website/templates/index/success.html +++ b/website/templates/index/success.html @@ -62,7 +62,7 @@
-

{{ _('Uber moved it’s logging platform to ClickHouse increasing developer productivity and overall reliability of the platform while seeing 3x data compression, 10x performance increase, and ½ the reduction in hardware cost.') }}

+

{{ _('Uber moved its logging platform to ClickHouse increasing developer productivity and overall reliability of the platform while seeing 3x data compression, 10x performance increase, and ½ the reduction in hardware cost.') }}

{{ _('Read the Case Study') }}