diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index be66f21b838..8b137891791 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,3 +1 @@ -docs/* @ClickHouse/docs -docs/zh/* @ClickHouse/docs-zh -website/* @ClickHouse/docs + diff --git a/SECURITY.md b/SECURITY.md index 1872d67a529..f002dd53ca9 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -21,9 +21,10 @@ The following versions of ClickHouse server are currently being supported with s | 21.6 | :x: | | 21.7 | :x: | | 21.8 | ✅ | -| 21.9 | ✅ | +| 21.9 | :x: | | 21.10 | ✅ | | 21.11 | ✅ | +| 21.12 | ✅ | ## Reporting a Vulnerability diff --git a/base/base/StringRef.h b/base/base/StringRef.h index d0184dbc24c..eefc87121fc 100644 --- a/base/base/StringRef.h +++ b/base/base/StringRef.h @@ -48,7 +48,9 @@ struct StringRef std::string toString() const { return std::string(data, size); } explicit operator std::string() const { return toString(); } - constexpr explicit operator std::string_view() const { return {data, size}; } + std::string_view toView() const { return std::string_view(data, size); } + + constexpr explicit operator std::string_view() const { return std::string_view(data, size); } }; /// Here constexpr doesn't implicate inline, see https://www.viva64.com/en/w/v1043/ diff --git a/base/base/wide_integer_impl.h b/base/base/wide_integer_impl.h index b8de5efb859..cffffcc213f 100644 --- a/base/base/wide_integer_impl.h +++ b/base/base/wide_integer_impl.h @@ -827,7 +827,7 @@ public: CompilerUInt128 a = (CompilerUInt128(numerator.items[1]) << 64) + numerator.items[0]; CompilerUInt128 b = (CompilerUInt128(denominator.items[1]) << 64) + denominator.items[0]; - CompilerUInt128 c = a / b; + CompilerUInt128 c = a / b; // NOLINT integer res; res.items[0] = c; @@ -1020,8 +1020,15 @@ constexpr integer::integer(std::initializer_list il) noexcept { auto it = il.begin(); for (size_t i = 0; i < _impl::item_count; ++i) + { if (it < il.end()) + { items[i] = *it; + ++it; + } + else + items[i] = 0; + } } } diff --git a/cmake/find/blob_storage.cmake b/cmake/find/blob_storage.cmake index 74a907da7db..4ad7296e95e 100644 --- a/cmake/find/blob_storage.cmake +++ b/cmake/find/blob_storage.cmake @@ -1,30 +1,29 @@ option (ENABLE_AZURE_BLOB_STORAGE "Enable Azure blob storage" ${ENABLE_LIBRARIES}) -option(USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY - "Set to FALSE to use system Azure SDK instead of bundled (OFF currently not implemented)" - ON) - if (ENABLE_AZURE_BLOB_STORAGE) + option(USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY + "Set to FALSE to use system Azure SDK instead of bundled (OFF currently not implemented)" + ON) + set(USE_AZURE_BLOB_STORAGE 1) set(AZURE_BLOB_STORAGE_LIBRARY azure_sdk) + + if ((NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/azure/sdk" + OR NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/azure/cmake-modules") + AND USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY) + message (WARNING "submodule contrib/azure is missing. 
to fix try run: \n git submodule update --init") + set(USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY OFF) + set(USE_AZURE_BLOB_STORAGE 0) + endif () + + if (NOT USE_INTERNAL_SSL_LIBRARY AND USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY) + message (FATAL_ERROR "Currently Blob Storage support can be built only with internal SSL library") + endif() + + if (NOT USE_INTERNAL_CURL AND USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY) + message (FATAL_ERROR "Currently Blob Storage support can be built only with internal curl library") + endif() + endif() -if ((NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/azure/sdk" - OR NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/azure/cmake-modules") - AND USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY) - message (WARNING "submodule contrib/azure is missing. to fix try run: \n git submodule update --init") - set(USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY OFF) - set(USE_AZURE_BLOB_STORAGE 0) -endif () - -if (NOT USE_INTERNAL_SSL_LIBRARY AND USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY) - message (FATAL_ERROR "Currently Blob Storage support can be built only with internal SSL library") -endif() - -if (NOT USE_INTERNAL_CURL AND USE_INTERNAL_AZURE_BLOB_STORAGE_LIBRARY) - message (FATAL_ERROR "Currently Blob Storage support can be built only with internal curl library") -endif() - -if (USE_AZURE_BLOB_STORAGE) - message (STATUS "Using Azure Blob Storage - ${USE_AZURE_BLOB_STORAGE}") -endif() +message (STATUS "Using Azure Blob Storage - ${USE_AZURE_BLOB_STORAGE}") diff --git a/cmake/find/ccache.cmake b/cmake/find/ccache.cmake index 95ec3d8a034..9acc0423f67 100644 --- a/cmake/find/ccache.cmake +++ b/cmake/find/ccache.cmake @@ -31,6 +31,7 @@ if (CCACHE_FOUND AND NOT COMPILER_MATCHES_CCACHE) if (CCACHE_VERSION VERSION_GREATER "3.2.0" OR NOT CMAKE_CXX_COMPILER_ID STREQUAL "Clang") message(STATUS "Using ${CCACHE_FOUND} ${CCACHE_VERSION}") + set(LAUNCHER ${CCACHE_FOUND}) # debian (debhelpers) set SOURCE_DATE_EPOCH environment variable, that is # filled from the debian/changelog or current time. @@ -39,13 +40,8 @@ if (CCACHE_FOUND AND NOT COMPILER_MATCHES_CCACHE) # of the manifest, which do not allow to use previous cache, # - 4.2+ ccache ignores SOURCE_DATE_EPOCH for every file w/o __DATE__/__TIME__ # - # So for: - # - 4.2+ does not require any sloppiness - # - 4.0+ will ignore SOURCE_DATE_EPOCH environment variable. - if (CCACHE_VERSION VERSION_GREATER_EQUAL "4.2") - message(STATUS "ccache is 4.2+ no quirks for SOURCE_DATE_EPOCH required") - set(LAUNCHER ${CCACHE_FOUND}) - elseif (CCACHE_VERSION VERSION_GREATER_EQUAL "4.0") + # Exclude SOURCE_DATE_EPOCH env for ccache versions between [4.0, 4.2). 
+ if (CCACHE_VERSION VERSION_GREATER_EQUAL "4.0" AND CCACHE_VERSION VERSION_LESS "4.2") message(STATUS "Ignore SOURCE_DATE_EPOCH for ccache") set(LAUNCHER env -u SOURCE_DATE_EPOCH ${CCACHE_FOUND}) endif() diff --git a/contrib/NuRaft b/contrib/NuRaft index ff100a87131..c2043aa250e 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit ff100a8713146e1ca4b4158dd6cc4eef9af47fc3 +Subproject commit c2043aa250e53ad5cf75e596e319d587af4dcb3c diff --git a/docker/builder/Dockerfile b/docker/builder/Dockerfile deleted file mode 100644 index 49c40d576e7..00000000000 --- a/docker/builder/Dockerfile +++ /dev/null @@ -1,46 +0,0 @@ -FROM ubuntu:20.04 - -# ARG for quick switch to a given ubuntu mirror -ARG apt_archive="http://archive.ubuntu.com" -RUN sed -i "s|http://archive.ubuntu.com|$apt_archive|g" /etc/apt/sources.list - -ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=13 - -RUN apt-get update \ - && apt-get install ca-certificates lsb-release wget gnupg apt-transport-https \ - --yes --no-install-recommends --verbose-versions \ - && export LLVM_PUBKEY_HASH="bda960a8da687a275a2078d43c111d66b1c6a893a3275271beedf266c1ff4a0cdecb429c7a5cccf9f486ea7aa43fd27f" \ - && wget -nv -O /tmp/llvm-snapshot.gpg.key https://apt.llvm.org/llvm-snapshot.gpg.key \ - && echo "${LLVM_PUBKEY_HASH} /tmp/llvm-snapshot.gpg.key" | sha384sum -c \ - && apt-key add /tmp/llvm-snapshot.gpg.key \ - && export CODENAME="$(lsb_release --codename --short | tr 'A-Z' 'a-z')" \ - && echo "deb [trusted=yes] http://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-${LLVM_VERSION} main" >> \ - /etc/apt/sources.list - -RUN apt-get update \ - && apt-get install \ - bash \ - ccache \ - cmake \ - curl \ - expect \ - g++ \ - gcc \ - ninja-build \ - perl \ - pkg-config \ - python3 \ - python3-lxml \ - python3-requests \ - python3-termcolor \ - tzdata \ - llvm-${LLVM_VERSION} \ - clang-${LLVM_VERSION} \ - clang-tidy-${LLVM_VERSION} \ - lld-${LLVM_VERSION} \ - lldb-${LLVM_VERSION} \ - --yes --no-install-recommends - -COPY build.sh / - -CMD ["/bin/bash", "/build.sh"] diff --git a/docker/builder/Makefile b/docker/builder/Makefile deleted file mode 100644 index a9a7cddf3f2..00000000000 --- a/docker/builder/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -build: image - mkdir -p $(HOME)/.ccache - docker run --network=host --rm --workdir /server --volume $(realpath ../..):/server --cap-add=SYS_PTRACE --mount=type=bind,source=$(HOME)/.ccache,destination=/ccache -e CCACHE_DIR=/ccache -it yandex/clickhouse-builder - -pull: - docker pull yandex/clickhouse-builder - -image: - docker build --network=host -t yandex/clickhouse-builder . - -image_push: - docker push yandex/clickhouse-builder diff --git a/docker/builder/README.md b/docker/builder/README.md deleted file mode 100644 index cb9fb7d1b77..00000000000 --- a/docker/builder/README.md +++ /dev/null @@ -1,33 +0,0 @@ -Allows to build ClickHouse in Docker. -This is useful if you have an old OS distribution and you don't want to build fresh gcc or clang from sources. - -Usage: - -Prepare image: -``` -make image -``` - -Run build: -``` -make build -``` - -Before run, ensure that your user has access to docker: -To check, that you have access to Docker, run `docker ps`. -If not, you must add this user to `docker` group: `sudo usermod -aG docker $USER` and relogin. -(You must close all your sessions. For example, restart your computer.) - -Build results are available in `build_docker` directory at top level of your working copy. -It builds only binaries, not packages. 
- -For example, run server: -``` -cd $(git rev-parse --show-toplevel)/src/Server -$(git rev-parse --show-toplevel)/docker/builder/programs/clickhouse server --config-file $(git rev-parse --show-toplevel)/programs/server/config.xml -``` - -Run client: -``` -$(git rev-parse --show-toplevel)/docker/builder/programs/clickhouse client -``` diff --git a/docker/builder/build.sh b/docker/builder/build.sh deleted file mode 100755 index 1025af3f96e..00000000000 --- a/docker/builder/build.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash -set -e - -#ccache -s # uncomment to display CCache statistics -mkdir -p /server/build_docker -cd /server/build_docker -cmake -G Ninja /server "-DCMAKE_C_COMPILER=$(command -v clang-13)" "-DCMAKE_CXX_COMPILER=$(command -v clang++-13)" - -# Set the number of build jobs to the half of number of virtual CPU cores (rounded up). -# By default, ninja use all virtual CPU cores, that leads to very high memory consumption without much improvement in build time. -# Note that modern x86_64 CPUs use two-way hyper-threading (as of 2018). -# Without this option my laptop with 16 GiB RAM failed to execute build due to full system freeze. -NUM_JOBS=$(( ($(nproc || grep -c ^processor /proc/cpuinfo) + 1) / 2 )) - -ninja -j $NUM_JOBS && env TEST_OPT="--skip long compile $TEST_OPT" ctest -V -j $NUM_JOBS diff --git a/docker/images.json b/docker/images.json index dc7126a3f5a..354bdaa8728 100644 --- a/docker/images.json +++ b/docker/images.json @@ -103,6 +103,10 @@ "name": "clickhouse/mysql-golang-client", "dependent": [] }, + "docker/test/integration/dotnet_client": { + "name": "clickhouse/dotnet-client", + "dependent": [] + }, "docker/test/integration/mysql_java_client": { "name": "clickhouse/mysql-java-client", "dependent": [] diff --git a/docker/test/integration/dotnet_client/.gitignore b/docker/test/integration/dotnet_client/.gitignore new file mode 100644 index 00000000000..cd42ee34e87 --- /dev/null +++ b/docker/test/integration/dotnet_client/.gitignore @@ -0,0 +1,2 @@ +bin/ +obj/ diff --git a/docker/test/integration/dotnet_client/Dockerfile b/docker/test/integration/dotnet_client/Dockerfile new file mode 100644 index 00000000000..f8d33415175 --- /dev/null +++ b/docker/test/integration/dotnet_client/Dockerfile @@ -0,0 +1,10 @@ +# docker build . 
+# docker run -it --rm --network=host 14f23e59669c dotnet run --host localhost --port 8123 --user default --database default + +FROM mcr.microsoft.com/dotnet/sdk:3.1 + +WORKDIR /client +COPY *.cs *.csproj /client/ + +ARG VERSION=4.1.0 +RUN dotnet add package ClickHouse.Client -v ${VERSION} diff --git a/docker/test/integration/dotnet_client/Program.cs b/docker/test/integration/dotnet_client/Program.cs new file mode 100644 index 00000000000..3f640d15e86 --- /dev/null +++ b/docker/test/integration/dotnet_client/Program.cs @@ -0,0 +1,90 @@ +using System; +using System.Threading.Tasks; +using ClickHouse.Client.ADO; +using ClickHouse.Client.Utility; + +namespace clickhouse.test +{ + class Program + { + static async Task Main(string[] args) + { + try + { + using var connection = new ClickHouseConnection(GetConnectionString(args)); + + await connection.ExecuteStatementAsync("CREATE DATABASE IF NOT EXISTS test"); + await connection.ExecuteStatementAsync("TRUNCATE TABLE IF EXISTS test.dotnet_test"); + await connection.ExecuteStatementAsync("CREATE TABLE IF NOT EXISTS test.dotnet_test (`age` Int32, `name` String) Engine = Memory"); + + using var command = connection.CreateCommand(); + command.AddParameter("name", "Linus Torvalds"); + command.AddParameter("age", 51); + command.CommandText = "INSERT INTO test.dotnet_test VALUES({age:Int32}, {name:String})"; + await command.ExecuteNonQueryAsync(); + + using var result1 = await connection.ExecuteReaderAsync("SELECT * FROM test.dotnet_test"); + while (result1.Read()) + { + var values = new object[result1.FieldCount]; + result1.GetValues(values); + + foreach (var row in values) + { + Console.WriteLine(row); + } + } + + using var result2 = await connection.ExecuteReaderAsync(selectSql); + while (result2.Read()) + { + var values = new object[result2.FieldCount]; + result2.GetValues(values); + + foreach (var row in values) + { + Console.WriteLine(row); + } + } + } + catch (Exception e) + { + Console.Error.WriteLine(e); + Environment.ExitCode = 1; + } + } + + private static string GetConnectionString(string[] args) + { + var builder = new ClickHouseConnectionStringBuilder(); + int i = 0; + while (i < args.Length) + { + switch (args[i]) + { + case "--host": + builder.Host = args[++i]; + break; + case "--port": + builder.Port = UInt16.Parse(args[++i]); + break; + case "--user": + builder.Username = args[++i]; + break; + case "--password": + builder.Password = args[++i]; + break; + case "--database": + builder.Database = args[++i]; + break; + default: + i++; + break; + } + } + return builder.ToString(); + } + + private static string selectSql = @"SELECT NULL, toInt8(-8), toUInt8(8), toInt16(-16), toUInt16(16), toInt16(-32), toUInt16(32), toInt64(-64), toUInt64(64), toFloat32(32e6), toFloat32(-32e6), toFloat64(64e6), toFloat64(-64e6), 'TestString', toFixedString('ASD',3), toFixedString('ASD',5), toUUID('00000000-0000-0000-0000-000000000000'), toUUID('61f0c404-5cb3-11e7-907b-a6006ad3dba0'), toIPv4('1.2.3.4'), toIPv4('255.255.255.255'), CAST('a', 'Enum(\'a\' = 1, \'b\' = 2)'), CAST('a', 'Enum8(\'a\' = -1, \'b\' = 127)'), CAST('a', 'Enum16(\'a\' = -32768, \'b\' = 32767)'), array(1, 2, 3), array('a', 'b', 'c'), array(1, 2, NULL), toInt32OrNull('123'), toInt32OrNull(NULL), CAST(NULL AS Nullable(DateTime)), CAST(NULL AS LowCardinality(Nullable(String))), toLowCardinality('lowcardinality'), tuple(1, 'a', 8), tuple(123, tuple(5, 'a', 7)), toDateOrNull('1999-11-12'), toDateTime('1988-08-28 11:22:33'), toDateTime64('2043-03-01 18:34:04.4444444', 9), toDecimal32(123.45, 3), 
toDecimal32(-123.45, 3), toDecimal64(1.2345, 7), toDecimal64(-1.2345, 7), toDecimal128(12.34, 9), toDecimal128(-12.34, 9), toIPv6('2001:0db8:85a3:0000:0000:8a2e:0370:7334')"; + } +} diff --git a/docker/test/integration/dotnet_client/clickhouse.test.csproj b/docker/test/integration/dotnet_client/clickhouse.test.csproj new file mode 100644 index 00000000000..11704487bf6 --- /dev/null +++ b/docker/test/integration/dotnet_client/clickhouse.test.csproj @@ -0,0 +1,13 @@ + + + + Exe + netcoreapp3.1 + + + + + + + + diff --git a/docker/test/integration/runner/compose/docker_compose_dotnet_client.yml b/docker/test/integration/runner/compose/docker_compose_dotnet_client.yml new file mode 100644 index 00000000000..b63dac51522 --- /dev/null +++ b/docker/test/integration/runner/compose/docker_compose_dotnet_client.yml @@ -0,0 +1,6 @@ +version: '2.3' +services: + dotnet1: + image: clickhouse/dotnet-client:${DOCKER_DOTNET_CLIENT_TAG:-latest} + # to keep container running + command: sleep infinity diff --git a/docker/test/integration/runner/dockerd-entrypoint.sh b/docker/test/integration/runner/dockerd-entrypoint.sh index ad8a8e4eb84..8109ef7ae64 100755 --- a/docker/test/integration/runner/dockerd-entrypoint.sh +++ b/docker/test/integration/runner/dockerd-entrypoint.sh @@ -39,6 +39,7 @@ export CLICKHOUSE_ODBC_BRIDGE_BINARY_PATH=/clickhouse-odbc-bridge export CLICKHOUSE_LIBRARY_BRIDGE_BINARY_PATH=/clickhouse-library-bridge export DOCKER_MYSQL_GOLANG_CLIENT_TAG=${DOCKER_MYSQL_GOLANG_CLIENT_TAG:=latest} +export DOCKER_DOTNET_CLIENT_TAG=${DOCKER_DOTNET_CLIENT_TAG:=latest} export DOCKER_MYSQL_JAVA_CLIENT_TAG=${DOCKER_MYSQL_JAVA_CLIENT_TAG:=latest} export DOCKER_MYSQL_JS_CLIENT_TAG=${DOCKER_MYSQL_JS_CLIENT_TAG:=latest} export DOCKER_MYSQL_PHP_CLIENT_TAG=${DOCKER_MYSQL_PHP_CLIENT_TAG:=latest} diff --git a/docs/_includes/install/freebsd.sh b/docs/_includes/install/freebsd.sh index 50e3bc02cb7..2a715a1795f 100644 --- a/docs/_includes/install/freebsd.sh +++ b/docs/_includes/install/freebsd.sh @@ -1,3 +1,3 @@ -wget 'https://builds.clickhouse.com/master/freebsd/clickhouse' +fetch 'https://builds.clickhouse.com/master/freebsd/clickhouse' chmod a+x ./clickhouse -sudo ./clickhouse install +su -m root -c './clickhouse install' diff --git a/docs/en/development/developer-instruction.md b/docs/en/development/developer-instruction.md index 52fa307333c..ccf6da355b9 100644 --- a/docs/en/development/developer-instruction.md +++ b/docs/en/development/developer-instruction.md @@ -158,6 +158,8 @@ While inside the `build` directory, configure your build by running CMake. Befor export CC=clang CXX=clang++ cmake .. +If you installed clang using the automatic installation script above, also specify the version of clang installed in the first command, e.g. `export CC=clang-13 CXX=clang++-13`. The clang version will be in the script output. + The `CC` variable specifies the compiler for C (short for C Compiler), and `CXX` variable instructs which C++ compiler is to be used for building. For a faster build, you can resort to the `debug` build type - a build with no optimizations. 
For that supply the following parameter `-D CMAKE_BUILD_TYPE=Debug`: diff --git a/docs/en/engines/database-engines/materialized-postgresql.md b/docs/en/engines/database-engines/materialized-postgresql.md index 4dea156f32e..43f61201946 100644 --- a/docs/en/engines/database-engines/materialized-postgresql.md +++ b/docs/en/engines/database-engines/materialized-postgresql.md @@ -7,7 +7,7 @@ toc_title: MaterializedPostgreSQL Creates a ClickHouse database with tables from PostgreSQL database. Firstly, database with engine `MaterializedPostgreSQL` creates a snapshot of PostgreSQL database and loads required tables. Required tables can include any subset of tables from any subset of schemas from specified database. Along with the snapshot database engine acquires LSN and once initial dump of tables is performed - it starts pulling updates from WAL. After database is created, newly added tables to PostgreSQL database are not automatically added to replication. They have to be added manually with `ATTACH TABLE db.table` query. -Replication is implemented with PostgreSQL Logical Replication Protocol, which does not allow to replicate DDL, but allows to know whether replication breaking changes happened (column type changes, adding/removing columns). Such changes are detected and according tables stop receiving updates. Such tables can be automatically reloaded in the background in case required setting is turned on. Safest way for now is to use `ATTACH`/ `DETACH` queries to reload table completely. If DDL does not break replication (for example, renaming a column) table will still receive updates (insertion is done by position). +Replication is implemented with PostgreSQL Logical Replication Protocol, which does not allow to replicate DDL, but allows to know whether replication breaking changes happened (column type changes, adding/removing columns). Such changes are detected and according tables stop receiving updates. Such tables can be automatically reloaded in the background in case required setting is turned on (can be used starting from 22.1). Safest way for now is to use `ATTACH`/ `DETACH` queries to reload table completely. If DDL does not break replication (for example, renaming a column) table will still receive updates (insertion is done by position). ## Creating a Database {#creating-a-database} @@ -46,7 +46,7 @@ After `MaterializedPostgreSQL` database is created, it does not automatically de ATTACH TABLE postgres_database.new_table; ``` -Warning: before version 21.13 adding table to replication left unremoved temprorary replication slot (named `{db_name}_ch_replication_slot_tmp`). If attaching tables in clickhouse version before 21.13, make sure to delete it manually (`SELECT pg_drop_replication_slot('{db_name}_ch_replication_slot_tmp')`). Otherwise disk usage will grow. Issue is fixed in 21.13. +Warning: before version 22.1 adding table to replication left unremoved temprorary replication slot (named `{db_name}_ch_replication_slot_tmp`). If attaching tables in clickhouse version before 22.1, make sure to delete it manually (`SELECT pg_drop_replication_slot('{db_name}_ch_replication_slot_tmp')`). Otherwise disk usage will grow. Issue is fixed in 22.1. 
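As a minimal sketch of the manual cleanup described in the warning above (assuming the PostgreSQL database is named `postgres_database`, so the leftover slot follows the `{db_name}_ch_replication_slot_tmp` pattern), the slot can be located and removed on the PostgreSQL side:

``` sql
-- Run against PostgreSQL, not ClickHouse.
-- Find temporary replication slots left behind by older ClickHouse versions.
SELECT slot_name, active
FROM pg_replication_slots
WHERE slot_name LIKE '%_ch_replication_slot_tmp';

-- Drop the leftover slot so it stops retaining WAL and growing disk usage.
SELECT pg_drop_replication_slot('postgres_database_ch_replication_slot_tmp');
```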
## Dynamically removing tables from replication {#dynamically-removing-table-from-replication} @@ -77,7 +77,7 @@ Tables are accessed via schema name and table name at the same time: ``` sql CREATE DATABASE database1 ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password') -SETTINGS materialized_postgresql_tables_list = 'schema1.table1,schema2.table2,schema1.table3'; +SETTINGS materialized_postgresql_tables_list = 'schema1.table1,schema2.table2,schema1.table3', materialized_postgresql_tables_list_with_schema = 1; SELECT * FROM database1.`schema1.table1`; @@ -156,6 +156,8 @@ Default value: empty list. (Default schema is used) 4. materialized_postgresql_allow_automatic_update {#materialized-postgresql-allow-automatic-update} +Do not use this setting before 22.1 version. + Allows reloading table in the background, when schema changes are detected. DDL queries on the PostgreSQL side are not replicated via ClickHouse [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) engine, because it is not allowed with PostgreSQL logical replication protocol, but the fact of DDL changes is detected transactionally. In this case, the default behaviour is to stop replicating those tables once DDL is detected. However, if this setting is enabled, then, instead of stopping the replication of those tables, they will be reloaded in the background via database snapshot without data losses and replication will continue for them. Possible values: diff --git a/docs/en/engines/table-engines/integrations/hdfs.md b/docs/en/engines/table-engines/integrations/hdfs.md index 82227215da2..0d6d90f9d31 100644 --- a/docs/en/engines/table-engines/integrations/hdfs.md +++ b/docs/en/engines/table-engines/integrations/hdfs.md @@ -189,7 +189,7 @@ Similar to GraphiteMergeTree, the HDFS engine supports extended configuration us |libhdfs3\_conf | "" | ### Limitations {#limitations} - * `hadoop_security_kerberos_ticket_cache_path` and `libhdfs3_conf` can be global only, not user specific +* `hadoop_security_kerberos_ticket_cache_path` and `libhdfs3_conf` can be global only, not user specific ## Kerberos support {#kerberos-support} diff --git a/docs/en/engines/table-engines/integrations/mongodb.md b/docs/en/engines/table-engines/integrations/mongodb.md index 52876674475..475416ffb94 100644 --- a/docs/en/engines/table-engines/integrations/mongodb.md +++ b/docs/en/engines/table-engines/integrations/mongodb.md @@ -66,4 +66,14 @@ SELECT COUNT() FROM mongo_table; └─────────┘ ``` +You can also adjust connection timeout: + +``` sql +CREATE TABLE mongo_table +( + key UInt64, + data String +) ENGINE = MongoDB('mongo2:27017', 'test', 'simple_table', 'testuser', 'clickhouse', 'connectTimeoutMS=100000'); +``` + [Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/mongodb/) diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 4b7473f76ad..6769f48a466 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -339,7 +339,7 @@ SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234 For `Map` data type client can specify if index should be created for keys or values using [mapKeys](../../../sql-reference/functions/tuple-map-functions.md#mapkeys) or [mapValues](../../../sql-reference/functions/tuple-map-functions.md#mapvalues) function. 
- The following functions can use the filter: [equals](../../../sql-reference/functions/comparison-functions.md), [notEquals](../../../sql-reference/functions/comparison-functions.md), [in](../../../sql-reference/functions/in-functions.md), [notIn](../../../sql-reference/functions/in-functions.md), [has](../../../sql-reference/functions/array-functions.md#hasarr-elem). + The following functions can use the filter: [equals](../../../sql-reference/functions/comparison-functions.md), [notEquals](../../../sql-reference/functions/comparison-functions.md), [in](../../../sql-reference/functions/in-functions.md), [notIn](../../../sql-reference/functions/in-functions.md), [has](../../../sql-reference/functions/array-functions.md#hasarr-elem), [hasAny](../../../sql-reference/functions/array-functions.md#hasany), [hasAll](../../../sql-reference/functions/array-functions.md#hasall). Example of index creation for `Map` data type diff --git a/docs/en/faq/operations/index.md b/docs/en/faq/operations/index.md index c0a6d85b66d..81aec18b9cf 100644 --- a/docs/en/faq/operations/index.md +++ b/docs/en/faq/operations/index.md @@ -11,6 +11,7 @@ Questions: - [Which ClickHouse version to use in production?](../../faq/operations/production.md) - [Is it possible to delete old records from a ClickHouse table?](../../faq/operations/delete-old-data.md) +- [Does ClickHouse support multi-region replication?](../../faq/operations/multi-region-replication.md) !!! info "Don’t see what you were looking for?" Check out [other F.A.Q. categories](../../faq/index.md) or browse around main documentation articles found in the left sidebar. diff --git a/docs/en/faq/operations/multi-region-replication.md b/docs/en/faq/operations/multi-region-replication.md new file mode 100644 index 00000000000..7d78737544a --- /dev/null +++ b/docs/en/faq/operations/multi-region-replication.md @@ -0,0 +1,13 @@ +--- +title: Does ClickHouse support multi-region replication? +toc_hidden: true +toc_priority: 30 +--- + +# Does ClickHouse support multi-region replication? {#does-clickhouse-support-multi-region-replication} + +The short answer is "yes". However, we recommend keeping latency between all regions/datacenters in two-digit range, otherwise write performance will suffer as it goes through distributed consensus protocol. For example, replication between US coasts will likely work fine, but between the US and Europe won't. + +Configuration-wise there's no difference compared to single-region replication, simply use hosts that are located in different locations for replicas. + +For more information, see [full article on data replication](../../engines/table-engines/mergetree-family/replication.md). diff --git a/docs/en/interfaces/http.md b/docs/en/interfaces/http.md index 313c6508b55..f8f6f26d208 100644 --- a/docs/en/interfaces/http.md +++ b/docs/en/interfaces/http.md @@ -9,6 +9,8 @@ The HTTP interface lets you use ClickHouse on any platform from any programming By default, `clickhouse-server` listens for HTTP on port 8123 (this can be changed in the config). +Sometimes, `curl` command is not available on user operating systems. On Ubuntu or Debian, run `sudo apt install curl`. Please refer this [documentation](https://curl.se/download.html) to install it before running the examples. 
+ If you make a `GET /` request without parameters, it returns 200 response code and the string which defined in [http_server_default_response](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-http_server_default_response) default value “Ok.” (with a line feed at the end) ``` bash @@ -186,7 +188,7 @@ $ echo "SELECT 1" | gzip -c | \ ``` ``` bash -# Receiving compressed data from the server +# Receiving compressed data archive from the server $ curl -vsS "http://localhost:8123/?enable_http_compression=1" \ -H 'Accept-Encoding: gzip' --output result.gz -d 'SELECT number FROM system.numbers LIMIT 3' $ zcat result.gz @@ -195,6 +197,15 @@ $ zcat result.gz 2 ``` +```bash +# Receiving compressed data from the server and using the gunzip to receive decompressed data +$ curl -sS "http://localhost:8123/?enable_http_compression=1" \ + -H 'Accept-Encoding: gzip' -d 'SELECT number FROM system.numbers LIMIT 3' | gunzip - +0 +1 +2 +``` + ## Default Database {#default-database} You can use the ‘database’ URL parameter or the ‘X-ClickHouse-Database’ header to specify the default database. @@ -424,10 +435,10 @@ Next are the configuration methods for different `type`. `query` value is a predefined query of `predefined_query_handler`, which is executed by ClickHouse when an HTTP request is matched and the result of the query is returned. It is a must configuration. -The following example defines the values of [max_threads](../operations/settings/settings.md#settings-max_threads) and `max_alter_threads` settings, then queries the system table to check whether these settings were set successfully. +The following example defines the values of [max_threads](../operations/settings/settings.md#settings-max_threads) and `max_final_threads` settings, then queries the system table to check whether these settings were set successfully. !!! note "Warning" - To keep the default `handlers` such as` query`, `play`,` ping`, use the `` rule. + To keep the default `handlers` such as` query`, `play`,` ping`, use the `` rule. Example: @@ -451,9 +462,9 @@ Example: ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_alter_threads?max_threads=1&max_alter_threads=2' +$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_final_threads?max_threads=1&max_final_threads=2' 1 -max_alter_threads 2 +max_final_threads 2 ``` !!! note "caution" @@ -465,7 +476,7 @@ In `dynamic_query_handler`, the query is written in the form of param of the HTT ClickHouse extracts and executes the value corresponding to the `query_param_name` value in the URL of the HTTP request. The default value of `query_param_name` is `/query` . It is an optional configuration. If there is no definition in the configuration file, the param is not passed in. -To experiment with this functionality, the example defines the values of [max_threads](../operations/settings/settings.md#settings-max_threads) and `max_alter_threads` and `queries` whether the settings were set successfully. +To experiment with this functionality, the example defines the values of [max_threads](../operations/settings/settings.md#settings-max_threads) and `max_final_threads` and `queries` whether the settings were set successfully. 
Example: @@ -484,9 +495,9 @@ Example: ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_alter_threads=2¶m_name_1=max_threads¶m_name_2=max_alter_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' +$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_final_threads=2¶m_name_1=max_threads¶m_name_2=max_final_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' max_threads 1 -max_alter_threads 2 +max_final_threads 2 ``` ### static {#static} diff --git a/docs/en/interfaces/mysql.md b/docs/en/interfaces/mysql.md index 38bcc2b68f8..9932e6b6cb3 100644 --- a/docs/en/interfaces/mysql.md +++ b/docs/en/interfaces/mysql.md @@ -36,7 +36,7 @@ mysql> ``` For compatibility with all MySQL clients, it is recommended to specify user password with [double SHA1](../operations/settings/settings-users.md#password_double_sha1_hex) in configuration file. -If user password is specified using [SHA256](../operations/settings/settings-users.md#password_sha256_hex), some clients won’t be able to authenticate (mysqljs and old versions of command-line tool mysql). +If user password is specified using [SHA256](../operations/settings/settings-users.md#password_sha256_hex), some clients won’t be able to authenticate (mysqljs and old versions of command-line tool MySQL and MariaDB). Restrictions: diff --git a/docs/en/introduction/adopters.md b/docs/en/introduction/adopters.md index 87c5a6f7aec..c2660653907 100644 --- a/docs/en/introduction/adopters.md +++ b/docs/en/introduction/adopters.md @@ -60,8 +60,10 @@ toc_title: Adopters | Exness | Trading | Metrics, Logging | — | — | [Talk in Russian, May 2019](https://youtu.be/_rpU-TvSfZ8?t=3215) | | EventBunker.io | Serverless Data Processing | — | — | — | [Tweet, April 2021](https://twitter.com/Halil_D_/status/1379839133472985091) | | FastNetMon | DDoS Protection | Main Product | | — | [Official website](https://fastnetmon.com/docs-fnm-advanced/fastnetmon-advanced-traffic-persistency/) | +| Firebolt | Analytics | Main product | - | - | [YouTube Tech Talk](https://www.youtube.com/watch?v=9rW9uEJ15tU) | | Flipkart | e-Commerce | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=239) | | FunCorp | Games | | — | 14 bn records/day as of Jan 2021 | [Article](https://www.altinity.com/blog/migrating-from-redshift-to-clickhouse) | +| Futurra Group | Analytics | — | — | — | [Article in Russian, December 2021](https://dou.ua/forums/topic/35587/) | | Geniee | Ad network | Main product | — | — | [Blog post in Japanese, July 2017](https://tech.geniee.co.jp/entry/2017/07/20/160100) | | Genotek | Bioinformatics | Main product | — | — | [Video, August 2020](https://youtu.be/v3KyZbz9lEE) | | Gigapipe | Managed ClickHouse | Main product | — | — | [Official website](https://gigapipe.com/) | @@ -70,6 +72,7 @@ toc_title: Adopters | Grouparoo | Data Warehouse Integrations | Main product | — | — | [Official Website, November 2021](https://www.grouparoo.com/integrations) | | HUYA | Video Streaming | Analytics | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/7.%20ClickHouse万亿数据分析实践%20李本旺(sundy-li)%20虎牙.pdf) | | Hydrolix | Cloud data platform | Main product | — | — | [Documentation](https://docs.hydrolix.io/guide/query) | +| Hystax | Cloud Operations | 
Observability Analytics | - | - | [Blog](https://hystax.com/clickhouse-for-real-time-cost-saving-analytics-how-to-stop-hammering-screws-and-use-an-electric-screwdriver/) | | ICA | FinTech | Risk Management | — | — | [Blog Post in English, Sep 2020](https://altinity.com/blog/clickhouse-vs-redshift-performance-for-fintech-risk-management?utm_campaign=ClickHouse%20vs%20RedShift&utm_content=143520807&utm_medium=social&utm_source=twitter&hss_channel=tw-3894792263) | | Idealista | Real Estate | Analytics | — | — | [Blog Post in English, April 2019](https://clickhouse.com/blog/en/clickhouse-meetup-in-madrid-on-april-2-2019) | | Infobaleen | AI markting tool | Analytics | — | — | [Official site](https://infobaleen.com) | @@ -81,14 +84,18 @@ toc_title: Adopters | Ippon Technologies | Technology Consulting | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=205) | | Ivi | Online Cinema | Analytics, Monitoring | — | — | [Article in Russian, Jan 2018](https://habr.com/en/company/ivi/blog/347408/) | | Jinshuju 金数据 | BI Analytics | Main product | — | — | [Slides in Chinese, October 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup24/3.%20金数据数据架构调整方案Public.pdf) | -| Jitsu | Cloud Software | Data Pipeline | — | — | [Documentation](https://jitsu.com/docs/destinations-configuration/clickhouse-destination), [Hacker News](https://news.ycombinator.com/item?id=29106082) | +| Jitsu | Cloud Software | Data Pipeline | — | — | [Documentation](https://jitsu.com/docs/destinations-configuration/clickhouse-destination), [Hacker News post](https://news.ycombinator.com/item?id=29106082) | +| JuiceFS | Storage | Shopping Cart | - | - | [Blog](https://juicefs.com/blog/en/posts/shopee-clickhouse-with-juicefs/) | | kakaocorp | Internet company | — | — | — | [if(kakao)2020](https://tv.kakao.com/channel/3693125/cliplink/414129353), [if(kakao)2021](https://if.kakao.com/session/24) | | Kodiak Data | Clouds | Main product | — | — | [Slides in Engish, April 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup13/kodiak_data.pdf) | | Kontur | Software Development | Metrics | — | — | [Talk in Russian, November 2018](https://www.youtube.com/watch?v=U4u4Bd0FtrY) | | Kuaishou | Video | — | — | — | [ClickHouse Meetup, October 2018](https://clickhouse.com/blog/en/2018/clickhouse-community-meetup-in-beijing-on-october-28-2018/) | | KGK Global | Vehicle monitoring | — | — | — | [Press release, June 2021](https://zoom.cnews.ru/news/item/530921) | +| LANCOM Systems | Network Solutions | Traffic analysis | - | - | [ClickHouse Operator for Kubernetes](https://www.lancom-systems.com/), [Hacker News post] (https://news.ycombinator.com/item?id=29413660) | | Lawrence Berkeley National Laboratory | Research | Traffic analysis | 5 servers | 55 TiB | [Slides in English, April 2019](https://www.smitasin.com/presentations/2019-04-17_DOE-NSM.pdf) | +| Lever | Talent Management | Recruiting | - | - | [Hacker News post](https://news.ycombinator.com/item?id=29558544) | | LifeStreet | Ad network | Main product | 75 servers (3 replicas) | 5.27 PiB | [Blog post in Russian, February 2017](https://habr.com/en/post/322620/) | +| Lookforsale | E-Commerce | — | — | — | [Job Posting, December 2021](https://telegram.me/javascript_jobs/587318) | | Mail.ru Cloud Solutions | Cloud services | Main product | — | — | [Article in Russian](https://mcs.mail.ru/help/db-create/clickhouse#) | | MAXILECT | Ad Tech, Blockchain, ML, AI | — | — | — | [Job advertisement, 
2021](https://www.linkedin.com/feed/update/urn:li:activity:6780842017229430784/) | | Marilyn | Advertising | Statistics | — | — | [Talk in Russian, June 2017](https://www.youtube.com/watch?v=iXlIgx2khwc) | @@ -106,6 +113,7 @@ toc_title: Adopters | Ok.ru | Social Network | — | 72 servers | 810 TB compressed, 50bn rows/day, 1.5 TB/day | [SmartData conference, October 2021](https://assets.ctfassets.net/oxjq45e8ilak/4JPHkbJenLgZhBGGyyonFP/57472ec6987003ec4078d0941740703b/____________________ClickHouse_______________________.pdf) | | Omnicomm | Transportation Monitoring | — | — | — | [Facebook post, October 2021](https://www.facebook.com/OmnicommTeam/posts/2824479777774500) | | OneAPM | Monitoring and Data Analysis | Main product | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/8.%20clickhouse在OneAPM的应用%20杜龙.pdf) | +| Opensee | Financial Analytics | Main product | - | - | [Blog](https://opensee.io/news/from-moscow-to-wall-street-the-remarkable-journey-of-clickhouse/) | | Open Targets | Genome Research | Genome Search | — | — | [Tweet, October 2021](https://twitter.com/OpenTargets/status/1452570865342758913?s=20), [Blog](https://blog.opentargets.org/graphql/) | | OZON | E-commerce | — | — | — | [Official website](https://job.ozon.ru/vacancy/razrabotchik-clickhouse-ekspluatatsiya-40991870/) | | Panelbear | Analytics | Monitoring and Analytics | — | — | [Tech Stack, November 2020](https://panelbear.com/blog/tech-stack/) | @@ -118,6 +126,7 @@ toc_title: Adopters | PRANA | Industrial predictive analytics | Main product | — | — | [News (russian), Feb 2021](https://habr.com/en/news/t/541392/) | | QINGCLOUD | Cloud services | Main product | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/4.%20Cloud%20%2B%20TSDB%20for%20ClickHouse%20张健%20QingCloud.pdf) | | Qrator | DDoS protection | Main product | — | — | [Blog Post, March 2019](https://blog.qrator.net/en/clickhouse-ddos-mitigation_37/) | +| R-Vision | Information Security | — | — | — | [Article in Russian, December 2021](https://www.anti-malware.ru/reviews/R-Vision-SENSE-15) | | Raiffeisenbank | Banking | Analytics | — | — | [Lecture in Russian, December 2020](https://cs.hse.ru/announcements/421965599.html) | | Rambler | Internet services | Analytics | — | — | [Talk in Russian, April 2018](https://medium.com/@ramblertop/разработка-api-clickhouse-для-рамблер-топ-100-f4c7e56f3141) | | Replica | Urban Planning | Analytics | — | — | [Job advertisement](https://boards.greenhouse.io/replica/jobs/5547732002?gh_jid=5547732002) | @@ -153,6 +162,7 @@ toc_title: Adopters | Tinybird | Real-time Data Products | Data processing | — | — | [Official website](https://www.tinybird.co/) | | Traffic Stars | AD network | — | 300 servers in Europe/US | 1.8 PiB, 700 000 insert rps (as of 2021) | [Slides in Russian, May 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup15/lightning/ninja.pdf) | | Uber | Taxi | Logging | — | — | [Slides, February 2020](https://presentations.clickhouse.com/meetup40/uber.pdf) | +| UseTech | Software Development | — | — | — | [Job Posting, December 2021](https://vk.com/wall136266658_2418) | | UTMSTAT | Analytics | Main product | — | — | [Blog post, June 2020](https://vc.ru/tribuna/133956-striming-dannyh-iz-servisa-skvoznoy-analitiki-v-clickhouse) | | Vercel | Traffic and Performance Analytics | — | — | — | Direct reference, October 2021 | | VKontakte | Social Network | 
Statistics, Logging | — | — | [Slides in Russian, August 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/3_vk.pdf) | @@ -168,7 +178,8 @@ toc_title: Adopters | Yandex Cloud | Public Cloud | Main product | — | — | [Talk in Russian, December 2019](https://www.youtube.com/watch?v=pgnak9e_E0o) | | Yandex DataLens | Business Intelligence | Main product | — | — | [Slides in Russian, December 2019](https://presentations.clickhouse.com/meetup38/datalens.pdf) | | Yandex Market | e-Commerce | Metrics, Logging | — | — | [Talk in Russian, January 2019](https://youtu.be/_l1qP0DyBcA?t=478) | -| Yandex Metrica | Web analytics | Main product | 630 servers in one cluster, 360 servers in another cluster, 1862 servers in one department | 133 PiB / 8.31 PiB / 120 trillion records | [Slides, February 2020](https://presentations.clickhouse.com/meetup40/introduction/#13) | +| Yandex Metrica | Web analytics | Macin product | 630 servers in one cluster, 360 servers in another cluster, 1862 servers in one department | 133 PiB / 8.31 PiB / 120 trillion records | [Slides, February 2020](https://presentations.clickhouse.com/meetup40/introduction/#13) | +| | Analytics | Main product | - | - | [Integration](https://www.yellowfinbi.com/campaign/yellowfin-9-whats-new#el-30219e0e) | | Yotascale | Cloud | Data pipeline | — | 2 bn records/day | [LinkedIn (Accomplishments)](https://www.linkedin.com/in/adilsaleem/) | | Your Analytics | Product Analytics | Main Product | — | - | [Tweet, November 2021](https://twitter.com/mikenikles/status/1459737241165565953) | | Zagrava Trading | — | — | — | — | [Job offer, May 2021](https://twitter.com/datastackjobs/status/1394707267082063874) | @@ -178,9 +189,5 @@ toc_title: Adopters | Цифровой Рабочий | Industrial IoT, Analytics | — | — | — | [Blog post in Russian, March 2021](https://habr.com/en/company/croc/blog/548018/) | | ООО «МПЗ Богородский» | Agriculture | — | — | — | [Article in Russian, November 2020](https://cloud.yandex.ru/cases/okraina) | | ДомКлик | Real Estate | — | — | — | [Article in Russian, October 2021](https://habr.com/ru/company/domclick/blog/585936/) | -| Futurra Group | Analytics | — | — | — | [Article in Russian, December 2021](https://dou.ua/forums/topic/35587/) | -| UseTech | Software Development | — | — | — | [Job Posting, December 2021](https://vk.com/wall136266658_2418) | -| Lookforsale | E-Commerce | — | — | — | [Job Posting, December 2021](https://telegram.me/javascript_jobs/587318) | -| R-Vision | Information Security | — | — | — | [Article in Russian, December 2021](https://www.anti-malware.ru/reviews/R-Vision-SENSE-15) | [Original article](https://clickhouse.com/docs/en/introduction/adopters/) diff --git a/docs/en/operations/clickhouse-keeper.md b/docs/en/operations/clickhouse-keeper.md index 4e50752b1f9..fcfc675f9d7 100644 --- a/docs/en/operations/clickhouse-keeper.md +++ b/docs/en/operations/clickhouse-keeper.md @@ -3,14 +3,14 @@ toc_priority: 66 toc_title: ClickHouse Keeper --- -# [pre-production] ClickHouse Keeper +# [pre-production] ClickHouse Keeper {#clickHouse-keeper} ClickHouse server uses [ZooKeeper](https://zookeeper.apache.org/) coordination system for data [replication](../engines/table-engines/mergetree-family/replication.md) and [distributed DDL](../sql-reference/distributed-ddl.md) queries execution. ClickHouse Keeper is an alternative coordination system compatible with ZooKeeper. !!! warning "Warning" This feature is currently in the pre-production stage. 
We test it in our CI and on small internal installations. -## Implementation details +## Implementation details {#implementation-details} ZooKeeper is one of the first well-known open-source coordination systems. It's implemented in Java, has quite a simple and powerful data model. ZooKeeper's coordination algorithm called ZAB (ZooKeeper Atomic Broadcast) doesn't provide linearizability guarantees for reads, because each ZooKeeper node serves reads locally. Unlike ZooKeeper ClickHouse Keeper is written in C++ and uses [RAFT algorithm](https://raft.github.io/) [implementation](https://github.com/eBay/NuRaft). This algorithm allows to have linearizability for reads and writes, has several open-source implementations in different languages. @@ -21,7 +21,7 @@ ClickHouse Keeper supports Access Control List (ACL) the same way as [ZooKeeper] !!! info "Note" External integrations are not supported. -## Configuration +## Configuration {#configuration} ClickHouse Keeper can be used as a standalone replacement for ZooKeeper or as an internal part of the ClickHouse server, but in both cases configuration is almost the same `.xml` file. The main ClickHouse Keeper configuration tag is ``. Keeper configuration has the following parameters: @@ -103,7 +103,7 @@ Examples of configuration for quorum with three nodes can be found in [integrati ``` -## How to run +## How to run {#how-to-run} ClickHouse Keeper is bundled into the ClickHouse server package, just add configuration of `` and start ClickHouse server as always. If you want to run standalone ClickHouse Keeper you can start it in a similar way with: @@ -111,13 +111,14 @@ ClickHouse Keeper is bundled into the ClickHouse server package, just add config clickhouse-keeper --config /etc/your_path_to_config/config.xml --daemon ``` -## Four Letter Word Commands +## Four Letter Word Commands {#four-letter-word-commands} ClickHouse Keeper also provides 4lw commands which are almost the same with Zookeeper. Each command is composed of four letters such as `mntr`, `stat` etc. There are some more interesting commands: `stat` gives some general information about the server and connected clients, while `srvr` and `cons` give extended details on server and connections respectively. The 4lw commands has a white list configuration `four_letter_word_white_list` which has default value "conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro". You can issue the commands to ClickHouse Keeper via telnet or nc, at the client port. + ``` echo mntr | nc localhost 9181 ``` @@ -297,7 +298,7 @@ Sessions with Ephemerals (1): /clickhouse/task_queue/ddl ``` -## [experimental] Migration from ZooKeeper +## [experimental] Migration from ZooKeeper {#migration-from-zookeeper} Seamlessly migration from ZooKeeper to ClickHouse Keeper is impossible you have to stop your ZooKeeper cluster, convert data and start ClickHouse Keeper. `clickhouse-keeper-converter` tool allows converting ZooKeeper logs and snapshots to ClickHouse Keeper snapshot. It works only with ZooKeeper > 3.4. 
Steps for migration: diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index e8099ef0ac6..78f6c71c65f 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -672,7 +672,8 @@ On hosts with low RAM and swap, you possibly need setting `max_server_memory_usa ## max_concurrent_queries {#max-concurrent-queries} -The maximum number of simultaneously processed queries related to MergeTree table. Queries may be limited by other settings: [max_concurrent_queries_for_user](#max-concurrent-queries-for-user), [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries). +The maximum number of simultaneously processed queries related to MergeTree table. +Queries may be limited by other settings: [max_concurrent_insert_queries](#max-concurrent-insert-queries), [max_concurrent_select_queries](#max-concurrent-select-queries), [max_concurrent_queries_for_user](#max-concurrent-queries-for-user), [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries). !!! info "Note" These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. @@ -680,7 +681,9 @@ The maximum number of simultaneously processed queries related to MergeTree tabl Possible values: - Positive integer. -- 0 — Disabled. +- 0 — No limit. + +Default value: `100`. **Example** @@ -688,6 +691,46 @@ Possible values: 100 ``` +## max_concurrent_insert_queries {#max-concurrent-insert-queries} + +The maximum number of simultaneously processed `INSERT` queries. + +!!! info "Note" + These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. + +Possible values: + +- Positive integer. +- 0 — No limit. + +Default value: `0`. + +**Example** + +``` xml +100 +``` + +## max_concurrent_select_queries {#max-concurrent-select-queries} + +The maximum number of simultaneously processed `SELECT` queries. + +!!! info "Note" + These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. + +Possible values: + +- Positive integer. +- 0 — No limit. + +Default value: `0`. + +**Example** + +``` xml +100 +``` + ## max_concurrent_queries_for_user {#max-concurrent-queries-for-user} The maximum number of simultaneously processed queries related to MergeTree table per user. @@ -695,7 +738,9 @@ The maximum number of simultaneously processed queries related to MergeTree tabl Possible values: - Positive integer. -- 0 — Disabled. +- 0 — No limit. + +Default value: `0`. **Example** @@ -711,7 +756,12 @@ Example: `max_concurrent_queries_for_all_users` can be set to 99 for all users a Modifying the setting for one query or user does not affect other queries. -Default value: `0` that means no limit. +Possible values: + +- Positive integer. +- 0 — No limit. + +Default value: `0`. **Example** @@ -1238,6 +1288,20 @@ Example 9004 ``` +## postgresql_port {#server_configuration_parameters-postgresql_port} + +Port for communicating with clients over PostgreSQL protocol. + +**Possible values** + +Positive integer. 
+ +Example + +``` xml +9005 +``` + ## tmp_path {#tmp-path} Path to temporary data for processing large queries. diff --git a/docs/en/operations/settings/merge-tree-settings.md b/docs/en/operations/settings/merge-tree-settings.md index af75d130ed3..a7bba76a05a 100644 --- a/docs/en/operations/settings/merge-tree-settings.md +++ b/docs/en/operations/settings/merge-tree-settings.md @@ -27,6 +27,10 @@ An example of changing the settings for a specific table with the `ALTER TABLE . ``` sql ALTER TABLE foo MODIFY SETTING max_suspicious_broken_parts = 100; + +-- reset to default (use value from system.merge_tree_settings) +ALTER TABLE foo + RESET SETTING max_suspicious_broken_parts; ``` ## parts_to_throw_insert {#parts-to-throw-insert} diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 510047f4353..8a0fd618d32 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -4155,3 +4155,20 @@ Default value: `''`. Sets the character that is interpreted as a suffix after the result set for [CustomSeparated](../../interfaces/formats.md#format-customseparated) data format. Default value: `''`. + +## shutdown_wait_unfinished_queries + +Enables or disables waiting unfinished queries when shutdown server. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. The wait time equal shutdown_wait_unfinished config. + +Default value: 0. + +## shutdown_wait_unfinished + +The waiting time in seconds for currently handled connections when shutdown server. + +Default Value: 5. diff --git a/docs/en/operations/utilities/clickhouse-copier.md b/docs/en/operations/utilities/clickhouse-copier.md index de3443fb845..6587d45abd9 100644 --- a/docs/en/operations/utilities/clickhouse-copier.md +++ b/docs/en/operations/utilities/clickhouse-copier.md @@ -174,7 +174,7 @@ Parameters: - + ... ... diff --git a/docs/en/operations/utilities/index.md b/docs/en/operations/utilities/index.md index 4adbb299b1d..e307f9fde0c 100644 --- a/docs/en/operations/utilities/index.md +++ b/docs/en/operations/utilities/index.md @@ -6,7 +6,7 @@ toc_title: Overview # ClickHouse Utility {#clickhouse-utility} -- [clickhouse-local](../../operations/utilities/clickhouse-local.md) — Allows running SQL queries on data without stopping the ClickHouse server, similar to how `awk` does this. +- [clickhouse-local](../../operations/utilities/clickhouse-local.md) — Allows running SQL queries on data without starting the ClickHouse server, similar to how `awk` does this. - [clickhouse-copier](../../operations/utilities/clickhouse-copier.md) — Copies (and reshards) data from one cluster to another cluster. - [clickhouse-benchmark](../../operations/utilities/clickhouse-benchmark.md) — Loads server with the custom queries and settings. - [clickhouse-format](../../operations/utilities/clickhouse-format.md) — Enables formatting input queries. diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md index 095f059513c..c3c4bbc6493 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md @@ -122,7 +122,12 @@ Setting fields: - `command` — The absolute path to the executable file, or the file name (if the program directory is written to `PATH`). - `format` — The file format. 
All the formats described in [Formats](../../../interfaces/formats.md#formats) are supported. +- `command_termination_timeout` — executable script should contain main read-write loop. After dictionary is destroyed, pipe is closed, and executable file will have `command_termination_timeout` seconds to shutdown, before ClickHouse will send SIGTERM signal to child process. Specified in seconds. Default value is 10. Optional parameter. +- `command_read_timeout` - timeout for reading data from command stdout in milliseconds. Default value 10000. Optional parameter. +- `command_write_timeout` - timeout for writing data to command stdin in milliseconds. Default value 10000. Optional parameter. - `implicit_key` — The executable source file can return only values, and the correspondence to the requested keys is determined implicitly — by the order of rows in the result. Default value is false. +- `execute_direct` - If `execute_direct` = `1`, then `command` will be searched inside user_scripts folder. Additional script arguments can be specified using whitespace separator. Example: `script_name arg1 arg2`. If `execute_direct` = `0`, `command` is passed as argument for `bin/sh -c`. Default value is `0`. Optional parameter. +- `send_chunk_header` - controls whether to send row count before sending a chunk of data to process. Optional. Default value is `false`. That dictionary source can be configured only via XML configuration. Creating dictionaries with executable source via DDL is disabled, otherwise, the DB user would be able to execute arbitrary binary on ClickHouse node. @@ -150,10 +155,14 @@ Setting fields: - `command` — The absolute path to the executable file, or the file name (if the program directory is written to `PATH`). - `format` — The file format. All the formats described in “[Formats](../../../interfaces/formats.md#formats)” are supported. -- `pool_size` — Size of pool. If 0 is specified as `pool_size` then there is no pool size restrictions. -- `command_termination_timeout` — Executable pool script should contain main read-write loop. After dictionary is destroyed, pipe is closed, and executable file will have `command_termination_timeout` seconds to shutdown, before ClickHouse will send SIGTERM signal to child process. Specified in seconds. Default value is 10. Optional parameter. +- `pool_size` — Size of pool. If 0 is specified as `pool_size` then there is no pool size restrictions. Default value is `16`. +- `command_termination_timeout` — executable script should contain main read-write loop. After dictionary is destroyed, pipe is closed, and executable file will have `command_termination_timeout` seconds to shutdown, before ClickHouse will send SIGTERM signal to child process. Specified in seconds. Default value is 10. Optional parameter. - `max_command_execution_time` — Maximum executable script command execution time for processing block of data. Specified in seconds. Default value is 10. Optional parameter. +- `command_read_timeout` - timeout for reading data from command stdout in milliseconds. Default value 10000. Optional parameter. +- `command_write_timeout` - timeout for writing data to command stdin in milliseconds. Default value 10000. Optional parameter. - `implicit_key` — The executable source file can return only values, and the correspondence to the requested keys is determined implicitly — by the order of rows in the result. Default value is false. Optional parameter. 
+- `execute_direct` - If `execute_direct` = `1`, then `command` will be searched inside user_scripts folder. Additional script arguments can be specified using whitespace separator. Example: `script_name arg1 arg2`. If `execute_direct` = `0`, `command` is passed as argument for `bin/sh -c`. Default value is `1`. Optional parameter. +- `send_chunk_header` - controls whether to send row count before sending a chunk of data to process. Optional. Default value is `false`. That dictionary source can be configured only via XML configuration. Creating dictionaries with executable source via DDL is disabled, otherwise, the DB user would be able to execute arbitrary binary on ClickHouse node. diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index d85092d683a..8231cda4b77 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1392,12 +1392,24 @@ Returns the first element in the `arr1` array for which `func` returns something Note that the `arrayFirst` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. +## arrayLast(func, arr1, …) {#array-last} + +Returns the last element in the `arr1` array for which `func` returns something other than 0. + +Note that the `arrayLast` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. + ## arrayFirstIndex(func, arr1, …) {#array-first-index} Returns the index of the first element in the `arr1` array for which `func` returns something other than 0. Note that the `arrayFirstIndex` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. +## arrayLastIndex(func, arr1, …) {#array-last-index} + +Returns the index of the last element in the `arr1` array for which `func` returns something other than 0. + +Note that the `arrayLastIndex` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. + ## arrayMin {#array-min} Returns the minimum of elements in the source array. diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index b85f105758b..7ded7e72d8c 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -57,7 +57,7 @@ Alias: `toTimezone`. **Arguments** - `value` — Time or date and time. [DateTime64](../../sql-reference/data-types/datetime64.md). -- `timezone` — Timezone for the returned value. [String](../../sql-reference/data-types/string.md). +- `timezone` — Timezone for the returned value. [String](../../sql-reference/data-types/string.md). This argument is a constant, because `toTimezone` changes the timezone of a column (timezone is an attribute of `DateTime*` types). 
**Returned value** diff --git a/docs/en/sql-reference/functions/ext-dict-functions.md b/docs/en/sql-reference/functions/ext-dict-functions.md index 0e8352d2d1e..84e1e5eca3b 100644 --- a/docs/en/sql-reference/functions/ext-dict-functions.md +++ b/docs/en/sql-reference/functions/ext-dict-functions.md @@ -217,8 +217,8 @@ Result: ``` text (0,'2019-05-20') 0 \N \N (NULL,NULL) (1,'2019-05-20') 1 First First ('First','First') -(2,'2019-05-20') 0 \N \N (NULL,NULL) -(3,'2019-05-20') 0 \N \N (NULL,NULL) +(2,'2019-05-20') 1 Second \N ('Second',NULL) +(3,'2019-05-20') 1 Third Third ('Third','Third') (4,'2019-05-20') 0 \N \N (NULL,NULL) ``` diff --git a/docs/en/sql-reference/functions/geo/h3.md b/docs/en/sql-reference/functions/geo/h3.md index 048834806d1..2efe980a4cf 100644 --- a/docs/en/sql-reference/functions/geo/h3.md +++ b/docs/en/sql-reference/functions/geo/h3.md @@ -380,6 +380,42 @@ Result: └──────┘ ``` +## h3HexAreaKm2 {#h3hexareakm2} + +Returns average hexagon area in square kilometers at the given resolution. + +**Syntax** + +``` sql +h3HexAreaKm2(resolution) +``` + +**Parameter** + +- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Area in square kilometers. + +Type: [Float64](../../../sql-reference/data-types/float.md). + +**Example** + +Query: + +``` sql +SELECT h3HexAreaKm2(13) AS area; +``` + +Result: + +``` text +┌──────area─┐ +│ 0.0000439 │ +└───────────┘ +``` + ## h3IndexesAreNeighbors {#h3indexesareneighbors} Returns whether or not the provided [H3](#h3index) indexes are neighbors. @@ -704,4 +740,144 @@ Result: └───────┘ ``` +## h3DegsToRads {#h3degstorads} + +Converts degrees to radians. + +**Syntax** + +``` sql +h3DegsToRads(degrees) +``` + +**Parameter** + +- `degrees` — Input in degrees. Type: [Float64](../../../sql-reference/data-types/float.md). + +**Returned values** + +- Radians. Type: [Float64](../../../sql-reference/data-types/float.md). + +**Example** + +Query: + +``` sql +SELECT h3DegsToRads(180.0) AS radians; +``` + +Result: + +``` text +┌───────────radians─┐ +│ 3.141592653589793 │ +└───────────────────┘ +``` + +## h3RadsToDegs {#h3radstodegs} + +Converts radians to degrees. + +**Syntax** + +``` sql +h3RadsToDegs(radians) +``` + +**Parameter** + +- `radians` — Input in radians. Type: [Float64](../../../sql-reference/data-types/float.md). + +**Returned values** + +- Degrees. Type: [Float64](../../../sql-reference/data-types/float.md). + +**Example** + +Query: + +``` sql +SELECT h3RadsToDegs(3.141592653589793) AS degrees; +``` + +Result: + +``` text +┌─degrees─┐ +│ 180 │ +└─────────┘ +``` + +## h3CellAreaM2 {#h3cellaream2} + +Returns the exact area of a specific cell in square meters corresponding to the given input H3 index. + +**Syntax** + +``` sql +h3CellAreaM2(index) +``` + +**Parameter** + +- `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Cell area in square meters. + +Type: [Float64](../../../sql-reference/data-types/float.md). + +**Example** + +Query: + +``` sql +SELECT h3CellAreaM2(579205133326352383) AS area; +``` + +Result: + +``` text +┌───────────────area─┐ +│ 4106166334463.9233 │ +└────────────────────┘ +``` + +## h3CellAreaRads2 {#h3cellarearads2} + +Returns the exact area of a specific cell in square radians corresponding to the given input H3 index. + +**Syntax** + +``` sql +h3CellAreaRads2(index) +``` + +**Parameter** + +- `index` — Hexagon index number. 
Type: [UInt64](../../../sql-reference/data-types/int-uint.md). + +**Returned value** + +- Cell area in square radians. + +Type: [Float64](../../../sql-reference/data-types/float.md). + +**Example** + +Query: + +``` sql +SELECT h3CellAreaRads2(579205133326352383) AS area; +``` + +Result: + +``` text +┌────────────────area─┐ +│ 0.10116268528089567 │ +└─────────────────────┘ +``` + [Original article](https://clickhouse.com/docs/en/sql-reference/functions/geo/h3) diff --git a/docs/en/sql-reference/functions/index.md b/docs/en/sql-reference/functions/index.md index e86e6b37998..ddc113d31f9 100644 --- a/docs/en/sql-reference/functions/index.md +++ b/docs/en/sql-reference/functions/index.md @@ -73,26 +73,74 @@ User defined function configurations are searched relative to the path specified A function configuration contains the following settings: - `name` - a function name. -- `command` - a command or a script to execute. +- `command` - script name to execute or command if `execute_direct` is false. - `argument` - argument description with the `type` of an argument. Each argument is described in a separate setting. - `format` - a [format](../../interfaces/formats.md) in which arguments are passed to the command. - `return_type` - the type of a returned value. - `type` - an executable type. If `type` is set to `executable` then single command is started. If it is set to `executable_pool` then a pool of commands is created. - `max_command_execution_time` - maximum execution time in seconds for processing block of data. This setting is valid for `executable_pool` commands only. Optional. Default value is `10`. -- `command_termination_timeout` - time in seconds during which a command should finish after its pipe is closed. After that time `SIGTERM` is sent to the process executing the command. This setting is valid for `executable_pool` commands only. Optional. Default value is `10`. +- `command_termination_timeout` - time in seconds during which a command should finish after its pipe is closed. After that time `SIGTERM` is sent to the process executing the command. Optional. Default value is `10`. +- `command_read_timeout` - timeout for reading data from command stdout in milliseconds. Default value 10000. Optional parameter. +- `command_write_timeout` - timeout for writing data to command stdin in milliseconds. Default value 10000. Optional parameter. - `pool_size` - the size of a command pool. Optional. Default value is `16`. -- `lifetime` - the reload interval of a function in seconds. If it is set to `0` then the function is not reloaded. - `send_chunk_header` - controls whether to send row count before sending a chunk of data to process. Optional. Default value is `false`. +- `execute_direct` - If `execute_direct` = `1`, then `command` will be searched inside user_scripts folder. Additional script arguments can be specified using whitespace separator. Example: `script_name arg1 arg2`. If `execute_direct` = `0`, `command` is passed as argument for `bin/sh -c`. Default value is `1`. Optional parameter. +- `lifetime` - the reload interval of a function in seconds. If it is set to `0` then the function is not reloaded. Default value is `0`. Optional parameter. The command must read arguments from `STDIN` and must output the result to `STDOUT`. The command must process arguments iteratively. That is after processing a chunk of arguments it must wait for the next chunk. **Example** -Creating `test_function` using XML configuration: -``` +Creating `test_function` using XML configuration. 
+File test_function.xml. +```xml executable - test_function + test_function_python + String + + UInt64 + + TabSeparated + test_function.py + + +``` + +Script file inside `user_scripts` folder `test_function.py`. + +```python +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for line in sys.stdin: + print("Value " + line, end='') + sys.stdout.flush() +``` + +Query: + +``` sql +SELECT test_function_python(toUInt64(2)); +``` + +Result: + +``` text +┌─test_function_python(2)─┐ +│ Value 2 │ +└─────────────────────────┘ +``` + +Creating `test_function_sum` manually specifying `execute_direct` to `0` using XML configuration. +File test_function.xml. +```xml + + + executable + test_function_sum UInt64 UInt64 @@ -102,7 +150,7 @@ Creating `test_function` using XML configuration: TabSeparated cd /; clickhouse-local --input-format TabSeparated --output-format TabSeparated --structure 'x UInt64, y UInt64' --query "SELECT x + y FROM table" - 0 + 0 ``` @@ -110,15 +158,15 @@ Creating `test_function` using XML configuration: Query: ``` sql -SELECT test_function(toUInt64(2), toUInt64(2)); +SELECT test_function_sum(2, 2); ``` Result: ``` text -┌─test_function(toUInt64(2), toUInt64(2))─┐ -│ 4 │ -└─────────────────────────────────────────┘ +┌─test_function_sum(2, 2)─┐ +│ 4 │ +└─────────────────────────┘ ``` diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index c62603a50b9..a0c0116a058 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -351,8 +351,6 @@ Checks whether the string matches the `pattern` regular expression. A `re2` regu Returns 0 if it does not match, or 1 if it matches. -Note that the backslash symbol (`\`) is used for escaping in the regular expression. The same symbol is used for escaping in string literals. So in order to escape the symbol in a regular expression, you must write two backslashes (\\) in a string literal. - The regular expression works with the string as if it is a set of bytes. The regular expression can’t contain null bytes. For patterns to search for substrings in a string, it is better to use LIKE or ‘position’, since they work much faster. 
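Editor's aside (illustration only, not part of the patch above): a minimal sketch of how `match` behaves, assuming a recent ClickHouse server; note that backslashes in the re2 pattern must be doubled inside a SQL string literal.

``` sql
-- Returns 1 because the pattern \d+\.\d+ finds a version-like substring.
SELECT match('ClickHouse 21.12', '\\d+\\.\\d+') AS is_match;
```

When only a fixed substring is needed, `LIKE '%21.12%'` or `position()` would be the faster choice, as the surrounding text notes.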
diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 88e1cf47592..160e7be156e 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -344,9 +344,9 @@ SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val); Result: ``` text -┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ -│ -1.11100 │ Nullable(Decimal(9, 5)) │ -└──────────┴────────────────────────────────────────────────────┘ +┌────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ +│ -1.111 │ Nullable(Decimal(9, 5)) │ +└────────┴────────────────────────────────────────────────────┘ ``` Query: @@ -451,9 +451,9 @@ SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val); Result: ``` text -┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ -│ -1.11100 │ Decimal(9, 5) │ -└──────────┴────────────────────────────────────────────────────┘ +┌────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ +│ -1.111 │ Decimal(9, 5) │ +└────────┴────────────────────────────────────────────────────┘ ``` Query: diff --git a/docs/en/sql-reference/functions/url-functions.md b/docs/en/sql-reference/functions/url-functions.md index ae2113a2b64..98c3135f2b4 100644 --- a/docs/en/sql-reference/functions/url-functions.md +++ b/docs/en/sql-reference/functions/url-functions.md @@ -360,6 +360,21 @@ SELECT decodeURLComponent('http://127.0.0.1:8123/?query=SELECT%201%3B') AS Decod └────────────────────────────────────────┘ ``` +### decodeURLFormComponent(URL) {#decodeurlformcomponenturl} + +Returns the decoded URL. Follows rfc-1866, plain plus(`+`) is decoded as space(` `). +Example: + +``` sql +SELECT decodeURLFormComponent('http://127.0.0.1:8123/?query=SELECT%201+2%2B3') AS DecodedURL; +``` + +``` text +┌─DecodedURL────────────────────────────────┐ +│ http://127.0.0.1:8123/?query=SELECT 1 2+3 │ +└───────────────────────────────────────────┘ +``` + ### netloc {#netloc} Extracts network locality (`username:password@host:port`) from a URL. diff --git a/docs/en/sql-reference/statements/alter/projection.md b/docs/en/sql-reference/statements/alter/projection.md index 96cd8f5d607..c7ebc83c496 100644 --- a/docs/en/sql-reference/statements/alter/projection.md +++ b/docs/en/sql-reference/statements/alter/projection.md @@ -9,11 +9,12 @@ The following operations with [projections](../../../engines/table-engines/merge - `ALTER TABLE [db].name ADD PROJECTION name ( SELECT [GROUP BY] [ORDER BY] )` - Adds projection description to tables metadata. -- `ALTER TABLE [db].name DROP PROJECTION name` - Removes projection description from tables metadata and deletes projection files from disk. +- `ALTER TABLE [db].name DROP PROJECTION name` - Removes projection description from tables metadata and deletes projection files from disk. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). - `ALTER TABLE [db.]table MATERIALIZE PROJECTION name IN PARTITION partition_name` - The query rebuilds the projection `name` in the partition `partition_name`. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). -- `ALTER TABLE [db.]table CLEAR PROJECTION name IN PARTITION partition_name` - Deletes projection files from disk without removing description. 
+- `ALTER TABLE [db.]table CLEAR PROJECTION name IN PARTITION partition_name` - Deletes projection files from disk without removing description. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). + The commands `ADD`, `DROP` and `CLEAR` are lightweight in a sense that they only change metadata or remove files. diff --git a/docs/en/sql-reference/statements/create/dictionary.md b/docs/en/sql-reference/statements/create/dictionary.md index 61428cce126..86ab8f977b0 100644 --- a/docs/en/sql-reference/statements/create/dictionary.md +++ b/docs/en/sql-reference/statements/create/dictionary.md @@ -10,7 +10,7 @@ Creates a new [external dictionary](../../../sql-reference/dictionaries/external **Syntax** ``` sql -CREATE DICTIONARY [OR REPLACE][IF NOT EXISTS] [db.]dictionary_name [ON CLUSTER cluster] +CREATE [OR REPLACE] DICTIONARY [IF NOT EXISTS] [db.]dictionary_name [ON CLUSTER cluster] ( key1 type1 [DEFAULT|EXPRESSION expr1] [IS_OBJECT_ID], key2 type2 [DEFAULT|EXPRESSION expr2], diff --git a/docs/en/sql-reference/statements/explain.md b/docs/en/sql-reference/statements/explain.md index 2d129f1bc60..9c74c069f02 100644 --- a/docs/en/sql-reference/statements/explain.md +++ b/docs/en/sql-reference/statements/explain.md @@ -10,7 +10,12 @@ Shows the execution plan of a statement. Syntax: ```sql -EXPLAIN [AST | SYNTAX | PLAN | PIPELINE] [setting = value, ...] SELECT ... [FORMAT ...] +EXPLAIN [AST | SYNTAX | PLAN | PIPELINE | TABLE OVERRIDE] [setting = value, ...] + [ + SELECT ... | + tableFunction(...) [COLUMNS (...)] [ORDER BY ...] [PARTITION BY ...] [PRIMARY KEY] [SAMPLE BY ...] [TTL ...] + ] + [FORMAT ...] ``` Example: @@ -412,4 +417,37 @@ Result: └──────────┴───────┴───────┴──────┴───────┘ ``` +### EXPLAIN TABLE OVERRIDE {#explain-table-override} + +Shows the result of a table override on a table schema accessed through a table function. +Also does some validation, throwing an exception if the override would have caused some kind of failure. + +**Example** + +Assume you have a remote MySQL table like this: + +```sql +CREATE TABLE db.tbl ( + id INT PRIMARY KEY, + created DATETIME DEFAULT now() +) +``` + +```sql +EXPLAIN TABLE OVERRIDE mysql('127.0.0.1:3306', 'db', 'tbl', 'root', 'clickhouse') +PARTITION BY toYYYYMM(assumeNotNull(created)) +``` + +Result: + +```text +┌─explain─────────────────────────────────────────────────┐ +│ PARTITION BY uses columns: `created` Nullable(DateTime) │ +└─────────────────────────────────────────────────────────┘ +``` + +!!! note "Note" + The validation is not complete, so a successfull query does not guarantee that the override would + not cause issues. 
+ [Оriginal article](https://clickhouse.com/docs/en/sql-reference/statements/explain/) diff --git a/docs/ja/faq/operations/multi-region-replication.md b/docs/ja/faq/operations/multi-region-replication.md new file mode 120000 index 00000000000..dbc985ee1fb --- /dev/null +++ b/docs/ja/faq/operations/multi-region-replication.md @@ -0,0 +1 @@ +../../../en/faq/operations/multi-region-replication.md \ No newline at end of file diff --git a/docs/ja/interfaces/http.md b/docs/ja/interfaces/http.md index 4ac9cd9e472..210e3f46d24 100644 --- a/docs/ja/interfaces/http.md +++ b/docs/ja/interfaces/http.md @@ -397,7 +397,7 @@ $ curl -v 'http://localhost:8123/predefined_query' `` 値は以下の定義済みクエリです `` これは、Http要求が一致し、クエリの結果が返されたときにClickHouseによって実行されます。 これは必須構成です。 -次の例では、次の値を定義します `max_threads` と `max_alter_threads` 設定、そしてクエリのテーブルから設定設定します。 +次の例では、次の値を定義します `max_threads` と `max_final_threads` 設定、そしてクエリのテーブルから設定設定します。 例: @@ -420,9 +420,9 @@ $ curl -v 'http://localhost:8123/predefined_query' ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_alter_threads?max_threads=1&max_alter_threads=2' +$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_final_threads?max_threads=1&max_final_threads=2' 1 -max_alter_threads 2 +max_final_threads 2 ``` !!! note "注意" @@ -434,7 +434,7 @@ max_alter_threads 2 クリックハウスは、 `` HTTP要求のurlの値。 のデフォルト値 `` は `/query` . これはオプションの構成です。 設定ファイルに定義がない場合、paramは渡されません。 -この機能を試すために、この例ではmax_threadsとmax_alter_threadsの値を定義し、設定が正常に設定されたかどうかを照会します。 +この機能を試すために、この例ではmax_threadsとmax_final_threadsの値を定義し、設定が正常に設定されたかどうかを照会します。 例: @@ -452,9 +452,9 @@ max_alter_threads 2 ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_alter_threads=2¶m_name_1=max_threads¶m_name_2=max_alter_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' +$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_final_threads=2¶m_name_1=max_threads¶m_name_2=max_final_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' max_threads 1 -max_alter_threads 2 +max_final_threads 2 ``` ## 静的 {#static} diff --git a/docs/ja/operations/utilities/clickhouse-copier.md b/docs/ja/operations/utilities/clickhouse-copier.md index 614984af42b..3de925f6488 100644 --- a/docs/ja/operations/utilities/clickhouse-copier.md +++ b/docs/ja/operations/utilities/clickhouse-copier.md @@ -163,7 +163,7 @@ $ clickhouse-copier copier --daemon --config zookeeper.xml --task-path /task/pat - + ... ... 
diff --git a/docs/ja/sql-reference/functions/type-conversion-functions.md b/docs/ja/sql-reference/functions/type-conversion-functions.md index fd935c23d5f..a16bca0c1f9 100644 --- a/docs/ja/sql-reference/functions/type-conversion-functions.md +++ b/docs/ja/sql-reference/functions/type-conversion-functions.md @@ -170,9 +170,9 @@ SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) ``` ``` text -┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ -│ -1.11100 │ Nullable(Decimal(9, 5)) │ -└──────────┴────────────────────────────────────────────────────┘ +┌────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ +│ -1.111 │ Nullable(Decimal(9, 5)) │ +└────────┴────────────────────────────────────────────────────┘ ``` ``` sql @@ -214,9 +214,9 @@ SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val) ``` ``` text -┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ -│ -1.11100 │ Decimal(9, 5) │ -└──────────┴────────────────────────────────────────────────────┘ +┌────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ +│ -1.111 │ Decimal(9, 5) │ +└────────┴────────────────────────────────────────────────────┘ ``` ``` sql diff --git a/docs/ru/development/developer-instruction.md b/docs/ru/development/developer-instruction.md index 8466c709ad1..964d39163d8 100644 --- a/docs/ru/development/developer-instruction.md +++ b/docs/ru/development/developer-instruction.md @@ -40,10 +40,10 @@ ClickHouse не работает и не собирается на 32-битны Выполните в терминале: - git clone git@github.com:ClickHouse/ClickHouse.git + git clone git@github.com:your_github_username/ClickHouse.git --recursive cd ClickHouse -Замените первое вхождение слова `ClickHouse` в команде для git на имя вашего аккаунта на GitHub. +Замените слово `your_github_username` в команде для git на имя вашего аккаунта на GitHub. Эта команда создаст директорию ClickHouse, содержащую рабочую копию проекта. diff --git a/docs/ru/engines/table-engines/integrations/hdfs.md b/docs/ru/engines/table-engines/integrations/hdfs.md index 5949cc8a0d7..78a82955cd2 100644 --- a/docs/ru/engines/table-engines/integrations/hdfs.md +++ b/docs/ru/engines/table-engines/integrations/hdfs.md @@ -5,7 +5,7 @@ toc_title: HDFS # HDFS {#table_engines-hdfs} -Управляет данными в HDFS. Данный движок похож на движки [File](../special/file.md#table_engines-file) и [URL](../special/url.md#table_engines-url). +Этот движок обеспечивает интеграцию с экосистемой [Apache Hadoop](https://ru.wikipedia.org/wiki/Hadoop), позволяя управлять данными в HDFS посредством ClickHouse. Данный движок похож на движки [File](../special/file.md#table_engines-file) и [URL](../special/url.md#table_engines-url), но предоставляет возможности, характерные для Hadoop. ## Использование движка {#usage} @@ -13,9 +13,11 @@ toc_title: HDFS ENGINE = HDFS(URI, format) ``` -В параметр `URI` нужно передавать полный URI файла в HDFS. +**Параметры движка** + +В параметр `URI` нужно передавать полный URI файла в HDFS. Часть URI с путем файла может содержать шаблоны. В этом случае таблица может использоваться только для чтения. Параметр `format` должен быть таким, который ClickHouse может использовать и в запросах `INSERT`, и в запросах `SELECT`. Полный список поддерживаемых форматов смотрите в разделе [Форматы](../../../interfaces/formats.md#formats). -Часть URI с путем файла может содержать шаблоны. В этом случае таблица может использоваться только для чтения. + **Пример:** @@ -67,12 +69,12 @@ SELECT * FROM hdfs_engine_table LIMIT 2 1. 
Предположим, у нас есть несколько файлов со следующими URI в HDFS: -- 'hdfs://hdfs1:9000/some_dir/some_file_1' -- 'hdfs://hdfs1:9000/some_dir/some_file_2' -- 'hdfs://hdfs1:9000/some_dir/some_file_3' -- 'hdfs://hdfs1:9000/another_dir/some_file_1' -- 'hdfs://hdfs1:9000/another_dir/some_file_2' -- 'hdfs://hdfs1:9000/another_dir/some_file_3' + - 'hdfs://hdfs1:9000/some_dir/some_file_1' + - 'hdfs://hdfs1:9000/some_dir/some_file_2' + - 'hdfs://hdfs1:9000/some_dir/some_file_3' + - 'hdfs://hdfs1:9000/another_dir/some_file_1' + - 'hdfs://hdfs1:9000/another_dir/some_file_2' + - 'hdfs://hdfs1:9000/another_dir/some_file_3' 1. Есть несколько возможностей создать таблицу, состояющую из этих шести файлов: @@ -128,6 +130,7 @@ CREATE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9 | **параметр** | **по умолчанию** | +| - | - | | rpc\_client\_connect\_tcpnodelay | true | | dfs\_client\_read\_shortcircuit | true | | output\_replace-datanode-on-failure | true | @@ -177,22 +180,23 @@ CREATE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9 #### Расширенные параметры для ClickHouse {#clickhouse-extras} | **параметр** | **по умолчанию** | +| - | - | |hadoop\_kerberos\_keytab | "" | |hadoop\_kerberos\_principal | "" | |hadoop\_kerberos\_kinit\_command | kinit | ### Ограничения {#limitations} - * hadoop\_security\_kerberos\_ticket\_cache\_path могут быть определены только на глобальном уровне + * `hadoop_security_kerberos_ticket_cache_path` и `libhdfs3_conf` могут быть определены только на глобальном, а не на пользовательском уровне ## Поддержка Kerberos {#kerberos-support} -Если hadoop\_security\_authentication параметр имеет значение 'kerberos', ClickHouse аутентифицируется с помощью Kerberos. -[Расширенные параметры](#clickhouse-extras) и hadoop\_security\_kerberos\_ticket\_cache\_path помогают сделать это. +Если параметр `hadoop_security_authentication` имеет значение `kerberos`, ClickHouse аутентифицируется с помощью Kerberos. +[Расширенные параметры](#clickhouse-extras) и `hadoop_security_kerberos_ticket_cache_path` помогают сделать это. Обратите внимание что из-за ограничений libhdfs3 поддерживается только устаревший метод аутентификации, -коммуникация с узлами данных не защищена SASL (HADOOP\_SECURE\_DN\_USER надежный показатель такого -подхода к безопасности). Используйте tests/integration/test\_storage\_kerberized\_hdfs/hdfs_configs/bootstrap.sh для примера настроек. +коммуникация с узлами данных не защищена SASL (`HADOOP_SECURE_DN_USER` надежный показатель такого +подхода к безопасности). Используйте `tests/integration/test_storage_kerberized_hdfs/hdfs_configs/bootstrap.sh` для примера настроек. -Если hadoop\_kerberos\_keytab, hadoop\_kerberos\_principal или hadoop\_kerberos\_kinit\_command указаны в настройках, kinit будет вызван. hadoop\_kerberos\_keytab и hadoop\_kerberos\_principal обязательны в этом случае. Необходимо также будет установить kinit и файлы конфигурации krb5. +Если `hadoop_kerberos_keytab`, `hadoop_kerberos_principal` или `hadoop_kerberos_kinit_command` указаны в настройках, `kinit` будет вызван. `hadoop_kerberos_keytab` и `hadoop_kerberos_principal` обязательны в этом случае. Необходимо также будет установить `kinit` и файлы конфигурации krb5. 
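Editor's aside (illustration only, not part of the patch above): a minimal sketch of how the Kerberos parameters described here might be combined, assuming they are placed in the global `hdfs` section of the server configuration; the keytab path and the principal are placeholders.

``` xml
<hdfs>
    <!-- Placeholders: adjust the keytab path and principal to the actual realm. -->
    <hadoop_security_authentication>kerberos</hadoop_security_authentication>
    <hadoop_kerberos_keytab>/etc/clickhouse-server/hdfs.keytab</hadoop_kerberos_keytab>
    <hadoop_kerberos_principal>clickhouse@EXAMPLE.COM</hadoop_kerberos_principal>
</hdfs>
```

With both `hadoop_kerberos_keytab` and `hadoop_kerberos_principal` set, `kinit` is invoked, as the paragraph above explains.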
## Виртуальные столбцы {#virtual-columns} diff --git a/docs/ru/engines/table-engines/integrations/kafka.md b/docs/ru/engines/table-engines/integrations/kafka.md index 19e2850dd51..7ea3d124ab3 100644 --- a/docs/ru/engines/table-engines/integrations/kafka.md +++ b/docs/ru/engines/table-engines/integrations/kafka.md @@ -191,5 +191,5 @@ ClickHouse может поддерживать учетные данные Kerbe **Смотрите также** - [Виртуальные столбцы](index.md#table_engines-virtual_columns) -- [background_schedule_pool_size](../../../operations/settings/settings.md#background_schedule_pool_size) +- [background_message_broker_schedule_pool_size](../../../operations/settings/settings.md#background_message_broker_schedule_pool_size) diff --git a/docs/ru/faq/operations/multi-region-replication.md b/docs/ru/faq/operations/multi-region-replication.md new file mode 120000 index 00000000000..dbc985ee1fb --- /dev/null +++ b/docs/ru/faq/operations/multi-region-replication.md @@ -0,0 +1 @@ +../../../en/faq/operations/multi-region-replication.md \ No newline at end of file diff --git a/docs/ru/interfaces/http.md b/docs/ru/interfaces/http.md index 27a70a5c26d..8687201e1c9 100644 --- a/docs/ru/interfaces/http.md +++ b/docs/ru/interfaces/http.md @@ -422,7 +422,7 @@ $ curl -v 'http://localhost:8123/predefined_query' Значение `query` — это предопределенный запрос `predefined_query_handler`, который выполняется ClickHouse при совпадении HTTP-запроса и возврате результата запроса. Это обязательная настройка. -В следующем примере определяются настройки [max_threads](../operations/settings/settings.md#settings-max_threads) и `max_alter_threads`, а затем запрашивается системная таблица, чтобы проверить, были ли эти параметры успешно установлены. +В следующем примере определяются настройки [max_threads](../operations/settings/settings.md#settings-max_threads) и `max_final_threads`, а затем запрашивается системная таблица, чтобы проверить, были ли эти параметры успешно установлены. !!! note "Предупреждение" Чтобы сохранить стандартные `handlers` такие как `query`, `play`, `ping`, используйте правило ``. @@ -449,9 +449,9 @@ $ curl -v 'http://localhost:8123/predefined_query' ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_alter_threads?max_threads=1&max_alter_threads=2' +$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_final_threads?max_threads=1&max_final_threads=2' 1 -max_alter_threads 2 +max_final_threads 2 ``` !!! note "Предупреждение" @@ -463,7 +463,7 @@ max_alter_threads 2 ClickHouse извлекает и выполняет значение, соответствующее значению `query_param_name` URL-адресе HTTP-запроса. Значение по умолчанию `query_param_name` — это `/query` . Это необязательная настройка. Если в файле конфигурации нет определения, параметр не передается. -Чтобы поэкспериментировать с этой функциональностью, в примере определяются значения [max_threads](../operations/settings/settings.md#settings-max_threads) и `max_alter_threads` и запрашивается, успешно ли были установлены настройки. +Чтобы поэкспериментировать с этой функциональностью, в примере определяются значения [max_threads](../operations/settings/settings.md#settings-max_threads) и `max_final_threads` и запрашивается, успешно ли были установлены настройки. 
Пример: @@ -482,9 +482,9 @@ ClickHouse извлекает и выполняет значение, соотв ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_alter_threads=2¶m_name_1=max_threads¶m_name_2=max_alter_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' +$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_final_threads=2¶m_name_1=max_threads¶m_name_2=max_final_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' max_threads 1 -max_alter_threads 2 +max_final_threads 2 ``` ### static {#static} diff --git a/docs/ru/operations/clickhouse-keeper.md b/docs/ru/operations/clickhouse-keeper.md index 9d6c4799008..2f3f3c0f63c 100644 --- a/docs/ru/operations/clickhouse-keeper.md +++ b/docs/ru/operations/clickhouse-keeper.md @@ -3,14 +3,14 @@ toc_priority: 66 toc_title: ClickHouse Keeper --- -# [пре-продакшн] ClickHouse Keeper +# [пре-продакшн] ClickHouse Keeper {#clickHouse-keeper} Сервер ClickHouse использует сервис координации [ZooKeeper](https://zookeeper.apache.org/) для [репликации](../engines/table-engines/mergetree-family/replication.md) данных и выполнения [распределенных DDL запросов](../sql-reference/distributed-ddl.md). ClickHouse Keeper — это альтернативный сервис координации, совместимый с ZooKeeper. !!! warning "Предупреждение" ClickHouse Keeper находится в стадии пре-продакшн и тестируется в CI ClickHouse и на нескольких внутренних инсталляциях. -## Детали реализации +## Детали реализации {#implementation-details} ZooKeeper — один из первых широко известных сервисов координации с открытым исходным кодом. Он реализован на языке программирования Java, имеет достаточно простую и мощную модель данных. Алгоритм координации Zookeeper называется ZAB (ZooKeeper Atomic Broadcast). Он не гарантирует линеаризуемость операций чтения, поскольку каждый узел ZooKeeper обслуживает чтения локально. В отличие от ZooKeeper, ClickHouse Keeper реализован на C++ и использует алгоритм [RAFT](https://raft.github.io/), [реализация](https://github.com/eBay/NuRaft). Этот алгоритм позволяет достичь линеаризуемости чтения и записи, имеет несколько реализаций с открытым исходным кодом на разных языках. @@ -21,7 +21,7 @@ ZooKeeper — один из первых широко известных сер !!! info "Примечание" Внешние интеграции не поддерживаются. -## Конфигурация +## Конфигурация {#configuration} ClickHouse Keeper может использоваться как равноценная замена ZooKeeper или как внутренняя часть сервера ClickHouse, но в обоих случаях конфигурация представлена файлом `.xml`. Главный тег конфигурации ClickHouse Keeper — это ``. Параметры конфигурации: @@ -54,6 +54,7 @@ ClickHouse Keeper может использоваться как равноце - `auto_forwarding` — разрешить пересылку запросов на запись от последователей лидеру (по умолчанию: true). - `shutdown_timeout` — время ожидания завершения внутренних подключений и выключения, в миллисекундах (по умолчанию: 5000). - `startup_timeout` — время отключения сервера, если он не подключается к другим участникам кворума, в миллисекундах (по умолчанию: 30000). +- `four_letter_word_white_list` — список разрешенных 4-х буквенных команд (по умолчанию: "conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro"). Конфигурация кворума находится в `.` и содержит описание серверов. 
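Editor's aside (illustration only, not part of the patch above): a minimal sketch of how the newly documented `four_letter_word_white_list` parameter could be set; it is assumed here to sit next to the other coordination settings inside the `keeper_server` section, and the port, server id and command list are placeholders.

``` xml
<keeper_server>
    <tcp_port>9181</tcp_port>
    <server_id>1</server_id>
    <coordination_settings>
        <!-- Assumption: allow only a monitoring-friendly subset of the 4-letter commands. -->
        <four_letter_word_white_list>conf,ruok,mntr,stat,srvr</four_letter_word_white_list>
    </coordination_settings>
</keeper_server>
```

The default list quoted above already covers the common monitoring commands, so overriding it is mainly useful to narrow access.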
@@ -101,7 +102,7 @@ ClickHouse Keeper может использоваться как равноце ``` -## Как запустить +## Как запустить {#how-to-run} ClickHouse Keeper входит в пакет `clickhouse-server`, просто добавьте кофигурацию `` и запустите сервер ClickHouse как обычно. Если вы хотите запустить ClickHouse Keeper автономно, сделайте это аналогичным способом: @@ -109,7 +110,195 @@ ClickHouse Keeper входит в пакет `clickhouse-server`, просто clickhouse-keeper --config /etc/your_path_to_config/config.xml --daemon ``` -## [экспериментально] Переход с ZooKeeper +## 4-х буквенные команды {#four-letter-word-commands} + +ClickHouse Keeper также поддерживает 4-х буквенные команды, почти такие же, как у Zookeeper. Каждая команда состоит из 4-х символов, например, `mntr`, `stat` и т. д. Несколько интересных команд: `stat` предоставляет общую информацию о сервере и подключенных клиентах, а `srvr` и `cons` предоставляют расширенные сведения о сервере и подключениях соответственно. + +У 4-х буквенных команд есть параметр для настройки разрешенного списка `four_letter_word_white_list`, который имеет значение по умолчанию "conf,cons,crst,envi,ruok,srst,srvr,stat, wchc,wchs,dirs,mntr,isro". + +Вы можете отправлять команды в ClickHouse Keeper через telnet или nc на порт для клиента. + +``` +echo mntr | nc localhost 9181 +``` + +Ниже приведен подробный список 4-х буквенных команд: + +- `ruok`: Проверяет, что сервер запущен без ошибок. В этом случае сервер ответит `imok`. В противном случае он не ответит. Ответ `imok` не обязательно означает, что сервер присоединился к кворуму, а указывает, что процесс сервера активен и привязан к указанному клиентскому порту. Используйте команду `stat` для получения подробной информации о состоянии кворума и клиентском подключении. + +``` +imok +``` + +- `mntr`: Выводит список переменных, которые используются для мониторинга работоспособности кластера. + +``` +zk_version v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 +zk_avg_latency 0 +zk_max_latency 0 +zk_min_latency 0 +zk_packets_received 68 +zk_packets_sent 68 +zk_num_alive_connections 1 +zk_outstanding_requests 0 +zk_server_state leader +zk_znode_count 4 +zk_watch_count 1 +zk_ephemerals_count 0 +zk_approximate_data_size 723 +zk_open_file_descriptor_count 310 +zk_max_file_descriptor_count 10240 +zk_followers 0 +zk_synced_followers 0 +``` + +- `srvr`: Выводит информацию о сервере: его версию, роль участника кворума и т.п. + +``` +ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 +Latency min/avg/max: 0/0/0 +Received: 2 +Sent : 2 +Connections: 1 +Outstanding: 0 +Zxid: 34 +Mode: leader +Node count: 4 +``` + +- `stat`: Выводит краткие сведения о сервере и подключенных клиентах. + +``` +ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 +Clients: + 192.168.1.1:52852(recved=0,sent=0) + 192.168.1.1:52042(recved=24,sent=48) +Latency min/avg/max: 0/0/0 +Received: 4 +Sent : 4 +Connections: 1 +Outstanding: 0 +Zxid: 36 +Mode: leader +Node count: 4 +``` + +- `srst`: Сбрасывает статистику сервера. Команда влияет на результат вывода `srvr`, `mntr` и `stat`. + +``` +Server stats reset. +``` + +- `conf`: Выводит подробную информацию о серверной конфигурации. 
+ +``` +server_id=1 +tcp_port=2181 +four_letter_word_white_list=* +log_storage_path=./coordination/logs +snapshot_storage_path=./coordination/snapshots +max_requests_batch_size=100 +session_timeout_ms=30000 +operation_timeout_ms=10000 +dead_session_check_period_ms=500 +heart_beat_interval_ms=500 +election_timeout_lower_bound_ms=1000 +election_timeout_upper_bound_ms=2000 +reserved_log_items=1000000000000000 +snapshot_distance=10000 +auto_forwarding=true +shutdown_timeout=5000 +startup_timeout=240000 +raft_logs_level=information +snapshots_to_keep=3 +rotate_log_storage_interval=100000 +stale_log_gap=10000 +fresh_log_gap=200 +max_requests_batch_size=100 +quorum_reads=false +force_sync=false +compress_logs=true +compress_snapshots_with_zstd_format=true +configuration_change_tries_count=20 +``` + +- `cons`: Выводит полную информацию о подключениях/сессиях для всех клиентов, подключенных к этому серверу. Включает информацию о количестве принятых/отправленных пакетов, идентификаторе сессии, задержках операций, последней выполненной операции и т. д. + +``` + 192.168.1.1:52163(recved=0,sent=0,sid=0xffffffffffffffff,lop=NA,est=1636454787393,to=30000,lzxid=0xffffffffffffffff,lresp=0,llat=0,minlat=0,avglat=0,maxlat=0) + 192.168.1.1:52042(recved=9,sent=18,sid=0x0000000000000001,lop=List,est=1636454739887,to=30000,lcxid=0x0000000000000005,lzxid=0x0000000000000005,lresp=1636454739892,llat=0,minlat=0,avglat=0,maxlat=0) +``` + +- `crst`: Сбрасывает статистику подключений/сессий для всех подключений. + +``` +Connection stats reset. +``` + +- `envi`: Выводит подробную информацию о серверном окружении. + +``` +Environment: +clickhouse.keeper.version=v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 +host.name=ZBMAC-C02D4054M.local +os.name=Darwin +os.arch=x86_64 +os.version=19.6.0 +cpu.count=12 +user.name=root +user.home=/Users/JackyWoo/ +user.dir=/Users/JackyWoo/project/jd/clickhouse/cmake-build-debug/programs/ +user.tmp=/var/folders/b4/smbq5mfj7578f2jzwn602tt40000gn/T/ +``` + + +- `dirs`: Показывает общий размер файлов снэпшотов и журналов в байтах. + +``` +snapshot_dir_size: 0 +log_dir_size: 3875 +``` + +- `isro`: Проверяет, что сервер работает в режиме только для чтения. Сервер ответит `ro`, если он находится в режиме только для чтения, или `rw`, если нет. + +``` +rw +``` + +- `wchs`: Показывает краткую информацию о количестве отслеживаемых путей (watches) на сервере. + +``` +1 connections watching 1 paths +Total watches:1 +``` + +- `wchc`: Показывает подробную информацию об отслеживаемых путях (watches) на сервере в разбивке по сессиям. При этом выводится список сессий (подключений) с соответствующими отслеживаемыми путями. Обратите внимание, что в зависимости от количества отслеживаемых путей эта операция может быть дорогостоящей (т. е. повлиять на производительность сервера), используйте ее осторожно. + +``` +0x0000000000000001 + /clickhouse/task_queue/ddl +``` + +- `wchp`: Показывает подробную информацию об отслеживаемых путях (watches) на сервере в разбивке по пути. При этом выводится список путей (узлов) с соответствующими сессиями. Обратите внимание, что в зависимости от количества отселживаемых путей (watches) эта операция может быть дорогостоящей (т. е. повлиять на производительность сервера), используйте ее осторожно. + +``` +/clickhouse/task_queue/ddl + 0x0000000000000001 +``` + +- `dump`: Выводит список незавершенных сеансов и эфемерных узлов. Команда работает только на лидере. 
+ +``` +Sessions dump (2): +0x0000000000000001 +0x0000000000000002 +Sessions with Ephemerals (1): +0x0000000000000001 + /clickhouse/task_queue/ddl +``` + + +## [экспериментально] Переход с ZooKeeper {#migration-from-zookeeper} Плавный переход с ZooKeeper на ClickHouse Keeper невозможен, необходимо остановить кластер ZooKeeper, преобразовать данные и запустить ClickHouse Keeper. Утилита `clickhouse-keeper-converter` конвертирует журналы и снэпшоты ZooKeeper в снэпшот ClickHouse Keeper. Работа утилиты проверена только для версий ZooKeeper выше 3.4. Для миграции необходимо выполнить следующие шаги: diff --git a/docs/ru/operations/server-configuration-parameters/settings.md b/docs/ru/operations/server-configuration-parameters/settings.md index 4a2da778a06..d2cc133e0c9 100644 --- a/docs/ru/operations/server-configuration-parameters/settings.md +++ b/docs/ru/operations/server-configuration-parameters/settings.md @@ -673,7 +673,7 @@ ClickHouse поддерживает динамическое изменение ## max_concurrent_queries {#max-concurrent-queries} -Определяет максимальное количество одновременно обрабатываемых запросов, связанных с таблицей семейства `MergeTree`. Запросы также могут быть ограничены настройками: [max_concurrent_queries_for_user](#max-concurrent-queries-for-user), [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries). +Определяет максимальное количество одновременно обрабатываемых запросов, связанных с таблицей семейства `MergeTree`. Запросы также могут быть ограничены настройками: [max_concurrent_insert_queries](#max-concurrent-insert-queries), [max_concurrent_select_queries](#max-concurrent-select-queries), [max_concurrent_queries_for_user](#max-concurrent-queries-for-user), [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries). !!! info "Примечание" Параметры этих настроек могут быть изменены во время выполнения запросов и вступят в силу немедленно. Запросы, которые уже запущены, выполнятся без изменений. @@ -681,7 +681,9 @@ ClickHouse поддерживает динамическое изменение Возможные значения: - Положительное целое число. -- 0 — выключена. +- 0 — нет лимита. + +Значение по умолчанию: `100`. **Пример** @@ -689,6 +691,46 @@ ClickHouse поддерживает динамическое изменение 100 ``` +## max_concurrent_insert_queries {#max-concurrent-insert-queries} + +Определяет максимальное количество одновременных `INSERT` запросов. + +!!! info "Примечание" + Параметры этих настроек могут быть изменены во время выполнения запросов и вступят в силу немедленно. Запросы, которые уже запущены, выполнятся без изменений. + +Возможные значения: + +- Положительное целое число. +- 0 — нет лимита. + +Значение по умолчанию: `0`. + +**Example** + +``` xml +100 +``` + +## max_concurrent_select_queries {#max-concurrent-select-queries} + +Определяет максимальное количество одновременных `SELECT` запросов. + +!!! info "Примечание" + Параметры этих настроек могут быть изменены во время выполнения запросов и вступят в силу немедленно. Запросы, которые уже запущены, выполнятся без изменений. + +Возможные значения: + +- Положительное целое число. +- 0 — нет лимита. + +Значение по умолчанию: `0`. 
+ +**Example** + +``` xml +100 +``` + ## max_concurrent_queries_for_user {#max-concurrent-queries-for-user} Определяет максимальное количество одновременно обрабатываемых запросов, связанных с таблицей семейства `MergeTree`, для пользователя. @@ -696,7 +738,9 @@ ClickHouse поддерживает динамическое изменение Возможные значения: - Положительное целое число. -- 0 — выключена. +- 0 — нет лимита. + +Значение по умолчанию: `0`. **Пример** @@ -712,7 +756,12 @@ ClickHouse поддерживает динамическое изменение Изменение настройки для одного запроса или пользователя не влияет на другие запросы. -Значение по умолчанию: `0` — отсутствие ограничений. +Возможные значения: + +- Положительное целое число. +- 0 — нет лимита. + +Значение по умолчанию: `0`. **Пример** diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index 933060482e3..affa90d9840 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -1641,18 +1641,19 @@ SELECT * FROM table_with_enum_column_for_csv_insert; `INSERT` завершается успешно только в том случае, когда ClickHouse смог без ошибки записать данные в `insert_quorum` реплик за время `insert_quorum_timeout`. Если по любой причине количество реплик с успешной записью не достигнет `insert_quorum`, то запись считается не состоявшейся и ClickHouse удалит вставленный блок из всех реплик, куда уже успел записать данные. -Все реплики в кворуме консистентны, т.е. содержат данные всех более ранних запросов `INSERT`. Последовательность `INSERT` линеаризуется. +Когда `insert_quorum_parallel` выключена, все реплики кворума консистентны, то есть содержат данные всех предыдущих запросов `INSERT` (последовательность `INSERT` линеаризуется). При чтении с диска данных, записанных с помощью `insert_quorum` и при выключенной `insert_quorum_parallel`, можно включить последовательную консистентность для запросов `SELECT` с помощью [select_sequential_consistency](#settings-select_sequential_consistency). -При чтении данных, записанных с `insert_quorum` можно использовать настройку [select_sequential_consistency](#settings-select_sequential_consistency). - -ClickHouse генерирует исключение +ClickHouse генерирует исключение: - Если количество доступных реплик на момент запроса меньше `insert_quorum`. - При попытке записать данные в момент, когда предыдущий блок ещё не вставлен в `insert_quorum` реплик. Эта ситуация может возникнуть, если пользователь вызвал `INSERT` прежде, чем завершился предыдущий с `insert_quorum`. +- При выключенной `insert_quorum_parallel` и при попытке записать данные в момент, когда предыдущий блок еще не вставлен в `insert_quorum` реплик (несколько параллельных `INSERT`-запросов). Эта ситуация может возникнуть при попытке пользователя выполнить очередной запрос `INSERT` к той же таблице, прежде чем завершится предыдущий с `insert_quorum`. + См. также: - [insert_quorum_timeout](#settings-insert_quorum_timeout) +- [insert_quorum_parallel](#settings-insert_quorum_parallel) - [select_sequential_consistency](#settings-select_sequential_consistency) ## insert_quorum_timeout {#settings-insert_quorum_timeout} @@ -1664,11 +1665,29 @@ ClickHouse генерирует исключение См. также: - [insert_quorum](#settings-insert_quorum) +- [insert_quorum_parallel](#settings-insert_quorum_parallel) +- [select_sequential_consistency](#settings-select_sequential_consistency) + +## insert_quorum_parallel {#settings-insert_quorum_parallel} + +Включает и выключает параллелизм для кворумных вставок (`INSERT`-запросы). 
Когда опция включена, возможно выполнять несколько кворумных `INSERT`-запросов одновременно, при этом запросы не дожидаются окончания друг друга . Когда опция выключена, одновременные записи с кворумом в одну и ту же таблицу будут отклонены (будет выполнена только одна из них). + +Возможные значения: + +- 0 — Выключена. +- 1 — Включена. + +Значение по умолчанию: 1. + +См. также: + +- [insert_quorum](#settings-insert_quorum) +- [insert_quorum_timeout](#settings-insert_quorum_timeout) - [select_sequential_consistency](#settings-select_sequential_consistency) ## select_sequential_consistency {#settings-select_sequential_consistency} -Включает или выключает последовательную консистентность для запросов `SELECT`. +Включает или выключает последовательную консистентность для запросов `SELECT`. Необходимо, чтобы `insert_quorum_parallel` была выключена (по умолчанию включена), а опция `insert_quorum` включена. Возможные значения: @@ -1681,10 +1700,13 @@ ClickHouse генерирует исключение Когда последовательная консистентность включена, то ClickHouse позволит клиенту выполнить запрос `SELECT` только к тем репликам, которые содержат данные всех предыдущих запросов `INSERT`, выполненных с `insert_quorum`. Если клиент обратится к неполной реплике, то ClickHouse сгенерирует исключение. В запросе SELECT не будут участвовать данные, которые ещё не были записаны на кворум реплик. +Если `insert_quorum_parallel` включена (по умолчанию это так), тогда `select_sequential_consistency` не будет работать. Причина в том, что параллельные запросы `INSERT` можно записать в разные наборы реплик кворума, поэтому нет гарантии того, что в отдельно взятую реплику будут сделаны все записи. + См. также: - [insert_quorum](#settings-insert_quorum) - [insert_quorum_timeout](#settings-insert_quorum_timeout) +- [insert_quorum_parallel](#settings-insert_quorum_parallel) ## insert_deduplicate {#settings-insert-deduplicate} diff --git a/docs/ru/operations/utilities/clickhouse-copier.md b/docs/ru/operations/utilities/clickhouse-copier.md index 07467c3e5da..cdcd275e10f 100644 --- a/docs/ru/operations/utilities/clickhouse-copier.md +++ b/docs/ru/operations/utilities/clickhouse-copier.md @@ -171,7 +171,7 @@ $ clickhouse-copier --daemon --config zookeeper.xml --task-path /task/path --bas - + ... ... diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index fc5533e75b1..b41defdc92d 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -57,7 +57,7 @@ toTimezone(value, timezone) **Аргументы** - `value` — время или дата с временем. [DateTime64](../../sql-reference/data-types/datetime64.md). -- `timezone` — часовой пояс для возвращаемого значения. [String](../../sql-reference/data-types/string.md). +- `timezone` — часовой пояс для возвращаемого значения. [String](../../sql-reference/data-types/string.md). Этот аргумент является константой, потому что `toTimezone` изменяет часовой пояс столбца (часовой пояс является атрибутом типов `DateTime*`). 
**Возвращаемое значение** diff --git a/docs/ru/sql-reference/functions/nlp-functions.md b/docs/ru/sql-reference/functions/nlp-functions.md index 250403ab127..992a7d6ccf3 100644 --- a/docs/ru/sql-reference/functions/nlp-functions.md +++ b/docs/ru/sql-reference/functions/nlp-functions.md @@ -3,10 +3,10 @@ toc_priority: 67 toc_title: NLP --- -# [экспериментально] Функции для работы с ествественным языком {#nlp-functions} +# [экспериментально] Функции для работы с естественным языком {#nlp-functions} !!! warning "Предупреждение" - Сейчас использование функций для работы с ествественным языком является экспериментальной возможностью. Чтобы использовать данные функции, включите настройку `allow_experimental_nlp_functions = 1`. + Сейчас использование функций для работы с естественным языком является экспериментальной возможностью. Чтобы использовать данные функции, включите настройку `allow_experimental_nlp_functions = 1`. ## stem {#stem} @@ -84,7 +84,7 @@ SELECT lemmatize('en', 'wolves'); Находит синонимы к заданному слову. Представлены два типа расширений словарей: `plain` и `wordnet`. -Для работы расширения типа `plain` необходимо указать путь до простого текстового файла, где каждая строка соотвествует одному набору синонимов. Слова в данной строке должны быть разделены с помощью пробела или знака табуляции. +Для работы расширения типа `plain` необходимо указать путь до простого текстового файла, где каждая строка соответствует одному набору синонимов. Слова в данной строке должны быть разделены с помощью пробела или знака табуляции. Для работы расширения типа `plain` необходимо указать путь до WordNet тезауруса. Тезаурус должен содержать WordNet sense index. diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 1b4ea4ef609..50a458bb453 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -343,9 +343,9 @@ SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val); Результат: ``` text -┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ -│ -1.11100 │ Nullable(Decimal(9, 5)) │ -└──────────┴────────────────────────────────────────────────────┘ +┌────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ +│ -1.111 │ Nullable(Decimal(9, 5)) │ +└────────┴────────────────────────────────────────────────────┘ ``` Запрос: @@ -449,9 +449,9 @@ SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val); Результат: ``` text -┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ -│ -1.11100 │ Decimal(9, 5) │ -└──────────┴────────────────────────────────────────────────────┘ +┌────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ +│ -1.111 │ Decimal(9, 5) │ +└────────┴────────────────────────────────────────────────────┘ ``` Запрос: diff --git a/docs/tools/build.py b/docs/tools/build.py index 785928cf4ab..75278075996 100755 --- a/docs/tools/build.py +++ b/docs/tools/build.py @@ -95,7 +95,7 @@ def build_for_lang(lang, args): site_dir=site_dir, strict=True, theme=theme_cfg, - copyright='©2016–2021 ClickHouse, Inc.', + copyright='©2016–2022 ClickHouse, Inc.', use_directory_urls=True, repo_name='ClickHouse/ClickHouse', repo_url='https://github.com/ClickHouse/ClickHouse/', diff --git a/docs/zh/engines/table-engines/integrations/kafka.md b/docs/zh/engines/table-engines/integrations/kafka.md index 6784b366e85..ee6bbbe67fc 100644 --- 
a/docs/zh/engines/table-engines/integrations/kafka.md +++ b/docs/zh/engines/table-engines/integrations/kafka.md @@ -132,4 +132,33 @@ Kafka 特性: 有关详细配置选项列表,请参阅 [librdkafka配置参考](https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md)。在 ClickHouse 配置中使用下划线 (`_`) ,并不是使用点 (`.`)。例如,`check.crcs=true` 将是 `true`。 +### Kerberos 支持 {#kafka-kerberos-zhi-chi} + +对于使用了kerberos的kafka, 将security_protocol 设置为sasl_plaintext就够了,如果kerberos的ticket是由操作系统获取和缓存的。 +clickhouse也支持自己使用keyfile的方式来维护kerbros的凭证。配置sasl_kerberos_service_name、sasl_kerberos_keytab、sasl_kerberos_principal三个子元素就可以。 + +示例: + +``` xml + + + SASL_PLAINTEXT + /home/kafkauser/kafkauser.keytab + kafkauser/kafkahost@EXAMPLE.COM + +``` + +## 虚拟列 + +- `_topic` – Kafka 主题。 +- `_key` – 信息的键。 +- `_offset` – 消息的偏移量。 +- `_timestamp ` – 消息的时间戳。 +- `_partition ` – Kafka 主题的分区。 + +**另请参阅** + +- [虚拟列](../../../engines/table-engines/index.md#table_engines-virtual_columns) +- [后台消息代理调度池大小](../../../operations/settings/settings.md#background_message_broker_schedule_pool_size) + [原始文章](https://clickhouse.com/docs/zh/operations/table_engines/kafka/) diff --git a/docs/zh/faq/operations/multi-region-replication.md b/docs/zh/faq/operations/multi-region-replication.md new file mode 120000 index 00000000000..dbc985ee1fb --- /dev/null +++ b/docs/zh/faq/operations/multi-region-replication.md @@ -0,0 +1 @@ +../../../en/faq/operations/multi-region-replication.md \ No newline at end of file diff --git a/docs/zh/interfaces/http.md b/docs/zh/interfaces/http.md index cdce4f2f2e7..738b0365f46 100644 --- a/docs/zh/interfaces/http.md +++ b/docs/zh/interfaces/http.md @@ -407,7 +407,7 @@ $ curl -v 'http://localhost:8123/predefined_query' `query` 是一个预定义的`predefined_query_handler`查询,它由ClickHouse在匹配HTTP请求并返回查询结果时执行。这是一个必须的配置。 -以下是定义的[max_threads](../operations/settings/settings.md#settings-max_threads)和`max_alter_threads`设置, 然后查询系统表以检查这些设置是否设置成功。 +以下是定义的[max_threads](../operations/settings/settings.md#settings-max_threads)和`max_final_threads`设置, 然后查询系统表以检查这些设置是否设置成功。 示例: @@ -430,9 +430,9 @@ $ curl -v 'http://localhost:8123/predefined_query' ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_alter_threads?max_threads=1&max_alter_threads=2' +$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_final_threads?max_threads=1&max_final_threads=2' 1 -max_alter_threads 2 +max_final_threads 2 ``` !!! 
note "警告" @@ -444,7 +444,7 @@ max_alter_threads 2 ClickHouse提取并执行与HTTP请求URL中的`query_param_name`值对应的值。`query_param_name`的默认值是`/query`。这是一个可选的配置。如果配置文件中没有定义,则不会传入参数。 -为了试验这个功能,示例定义了[max_threads](../operations/settings/settings.md#settings-max_threads)和`max_alter_threads`,`queries`设置是否成功的值。 +为了试验这个功能,示例定义了[max_threads](../operations/settings/settings.md#settings-max_threads)和`max_final_threads`,`queries`设置是否成功的值。 示例: @@ -462,9 +462,9 @@ ClickHouse提取并执行与HTTP请求URL中的`query_param_name`值对应的值 ``` ``` bash -$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_alter_threads=2¶m_name_1=max_threads¶m_name_2=max_alter_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' +$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_final_threads=2¶m_name_1=max_threads¶m_name_2=max_final_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' max_threads 1 -max_alter_threads 2 +max_final_threads 2 ``` ### static {#static} diff --git a/docs/zh/operations/system-tables/asynchronous_metric_log.md b/docs/zh/operations/system-tables/asynchronous_metric_log.md index 592fb99c5ef..ba37713ac44 100644 --- a/docs/zh/operations/system-tables/asynchronous_metric_log.md +++ b/docs/zh/operations/system-tables/asynchronous_metric_log.md @@ -1,18 +1,13 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- - ## system.asynchronous_metric_log {#system-tables-async-log} -包含每分钟记录一次的 `system.asynchronous_metrics`历史值. 默认开启. +包含每分钟记录一次的 `system.asynchronous_metrics`历史值。默认开启。 列: -- `event_date` ([Date](../../sql-reference/data-types/date.md)) — 事件日期. -- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — 事件时间. -- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — 事件时间(微秒). -- `name` ([String](../../sql-reference/data-types/string.md)) — 指标名. -- `value` ([Float64](../../sql-reference/data-types/float.md)) — 指标值. 
+- `event_date` ([Date](../../sql-reference/data-types/date.md)) — 事件日期。 +- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — 事件时间。 +- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — 事件时间(微秒)。 +- `name` ([String](../../sql-reference/data-types/string.md)) — 指标名。 +- `value` ([Float64](../../sql-reference/data-types/float.md)) — 指标值。 **示例** ``` sql diff --git a/docs/zh/operations/system-tables/asynchronous_metrics.md b/docs/zh/operations/system-tables/asynchronous_metrics.md index d6d2682c9a1..0303c408497 100644 --- a/docs/zh/operations/system-tables/asynchronous_metrics.md +++ b/docs/zh/operations/system-tables/asynchronous_metrics.md @@ -1,8 +1,3 @@ ---- -machine_translated: true -machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 ---- - # system.asynchronous_metrics {#system_tables-asynchronous_metrics} 包含在后台定期计算的指标。 例如,在使用的RAM量。 @@ -33,8 +28,8 @@ SELECT * FROM system.asynchronous_metrics LIMIT 10 └─────────────────────────────────────────┴────────────┘ ``` -**另请参阅** -- [监测](../../operations/monitoring.md) — ClickHouse监控的基本概念。 -- [系统。指标](../../operations/system-tables/metrics.md#system_tables-metrics) — 包含即时计算的指标。 -- [系统。活动](../../operations/system-tables/events.md#system_tables-events) — 包含出现的事件的次数。 -- [系统。metric\_log](../../operations/system-tables/metric_log.md#system_tables-metric_log) — 包含`system.metrics` 和 `system.events`表中的指标的历史值。 +**参见** +- [监控](../../operations/monitoring.md) — ClickHouse监控的基本概念。 +- [system.metrics](../../operations/system-tables/metrics.md#system_tables-metrics) — 包含即时计算的指标。 +- [system.events](../../operations/system-tables/events.md#system_tables-events) — 包含已发生的事件数。 +- [system.metric_log](../../operations/system-tables/metric_log.md#system_tables-metric_log) — 包含 `system.metrics` 和 `system.events` 表中的指标的历史值。 diff --git a/docs/zh/operations/utilities/clickhouse-copier.md b/docs/zh/operations/utilities/clickhouse-copier.md index 537006ecf0d..f6a4f11f81c 100644 --- a/docs/zh/operations/utilities/clickhouse-copier.md +++ b/docs/zh/operations/utilities/clickhouse-copier.md @@ -158,7 +158,7 @@ clickhouse-copier --daemon --config zookeeper.xml --task-path /task/path --base- - + ... ... diff --git a/docs/zh/sql-reference/aggregate-functions/reference/corr.md b/docs/zh/sql-reference/aggregate-functions/reference/corr.md index 5ab49f75023..5352aed5fc4 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/corr.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/corr.md @@ -12,4 +12,4 @@ toc_priority: 107 计算Pearson相关系数: `Σ((x - x̅)(y - y̅)) / sqrt(Σ((x - x̅)^2) * Σ((y - y̅)^2))`。 !!! note "注" -该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `corrStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 + 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `corrStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md b/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md index c6f43c6b9e9..4b961a22795 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md @@ -12,4 +12,4 @@ covarPop(x, y) 计算 `Σ((x - x̅)(y - y̅)) / n` 的值。 !!! 
note "注" -该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `covarPopStable` 函数。 它的工作速度较慢,但提供了较低的计算错误。 + 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `covarPopStable` 函数。 它的工作速度较慢,但提供了较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md index 5ef5104504b..bed522bbbfa 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md @@ -14,4 +14,4 @@ covarSamp(x, y) 返回Float64。 当 `n <= 1`, 返回 +∞。 !!! note "注" -该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `covarSampStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 + 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `covarSampStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md index a193b60338a..72bd797279f 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md @@ -46,7 +46,7 @@ quantileTiming(level)(expr) 类型: `Float32`。 !!! note "注" -如果没有值传递给函数(当使用 `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf)被返回。 这样做的目的是将这些案例与导致零的案例区分开来。 参见 [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) 对于 `NaN` 值排序注意事项。 + 如果没有值传递给函数(当使用 `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf)被返回。 这样做的目的是将这些案例与导致零的案例区分开来。 参见 [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) 对于 `NaN` 值排序注意事项。 **示例** diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md index 7b130dbddbd..3ae1124c9c0 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md @@ -48,7 +48,7 @@ quantileTimingWeighted(level)(expr, weight) 类型: `Float32`。 !!! note "注" -如果没有值传递给函数(当使用 `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf)被返回。 这样做的目的是将这些案例与导致零的案例区分开来。 参见 [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) 对于 `NaN` 值排序注意事项。 + 如果没有值传递给函数(当使用 `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf)被返回。 这样做的目的是将这些案例与导致零的案例区分开来。 参见 [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) 对于 `NaN` 值排序注意事项。 **示例** diff --git a/docs/zh/sql-reference/aggregate-functions/reference/stddevpop.md b/docs/zh/sql-reference/aggregate-functions/reference/stddevpop.md index 378ef4ae7e4..03478bae900 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/stddevpop.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/stddevpop.md @@ -4,7 +4,7 @@ toc_priority: 30 # stddevPop {#stddevpop} -结果等于 [varPop] (../../../sql-reference/aggregate-functions/reference/varpop.md)的平方根。 +结果等于 [varPop](../../../sql-reference/aggregate-functions/reference/varpop.md)的平方根。 !!! 
note "注" -该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `stddevPopStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 + 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `stddevPopStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/stddevsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/stddevsamp.md index 68a348146a9..d49b9d89fd9 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/stddevsamp.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/stddevsamp.md @@ -7,4 +7,4 @@ toc_priority: 31 结果等于 [varSamp] (../../../sql-reference/aggregate-functions/reference/varsamp.md)的平方根。 !!! note "注" -该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `stddevSampStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 + 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `stddevSampStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/varpop.md b/docs/zh/sql-reference/aggregate-functions/reference/varpop.md index 4dca8efde38..502c1887e38 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/varpop.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/varpop.md @@ -9,4 +9,4 @@ toc_priority: 32 换句话说,计算一组数据的离差。 返回 `Float64`。 !!! note "注" -该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `varPopStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 + 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `varPopStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/varsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/varsamp.md index c83ee7e24d2..73481c329e4 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/varsamp.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/varsamp.md @@ -11,5 +11,5 @@ toc_priority: 33 返回 `Float64`。 当 `n <= 1`,返回 `+∞`。 !!! 
note "注" -该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `varSampStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 + 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `varSampStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/functions/type-conversion-functions.md b/docs/zh/sql-reference/functions/type-conversion-functions.md index 6c8843fe2bd..c1d1e66664e 100644 --- a/docs/zh/sql-reference/functions/type-conversion-functions.md +++ b/docs/zh/sql-reference/functions/type-conversion-functions.md @@ -167,9 +167,9 @@ SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) ``` ``` text -┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ -│ -1.11100 │ Nullable(Decimal(9, 5)) │ -└──────────┴────────────────────────────────────────────────────┘ +┌────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ +│ -1.111 │ Nullable(Decimal(9, 5)) │ +└────────┴────────────────────────────────────────────────────┘ ``` ``` sql @@ -210,9 +210,9 @@ SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val) ``` ``` text -┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ -│ -1.11100 │ Decimal(9, 5) │ -└──────────┴────────────────────────────────────────────────────┘ +┌────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ +│ -1.111 │ Decimal(9, 5) │ +└────────┴────────────────────────────────────────────────────┘ ``` ``` sql diff --git a/docs/zh/sql-reference/statements/create/function.md b/docs/zh/sql-reference/statements/create/function.md deleted file mode 120000 index d41429cb260..00000000000 --- a/docs/zh/sql-reference/statements/create/function.md +++ /dev/null @@ -1 +0,0 @@ -../../../../en/sql-reference/statements/create/function.md \ No newline at end of file diff --git a/docs/zh/sql-reference/statements/create/function.md b/docs/zh/sql-reference/statements/create/function.md new file mode 100644 index 00000000000..d57810ac91b --- /dev/null +++ b/docs/zh/sql-reference/statements/create/function.md @@ -0,0 +1,60 @@ +--- +toc_priority: 38 +toc_title: FUNCTION +--- + +# CREATE FUNCTION {#create-function} + +用一个lambda表达式创建用户自定义函数。该表达式必须由函数参数、常数、运算符或其他函数调用组成。 + +**语法** + +```sql +CREATE FUNCTION name AS (parameter0, ...) -> expression +``` + +一个函数可以有任意数量的参数。 + +存在一些限制如下: + +- 函数名在用户自定义函数和系统函数中必须是唯一的。 +- 递归函数是不允许的。 +- 函数所使用的所有变量必须在其参数列表中指定。 + +如果违反了任何限制,就会产生异常。 + +**示例** + +查询: + +```sql +CREATE FUNCTION linear_equation AS (x, k, b) -> k*x + b; +SELECT number, linear_equation(number, 2, 1) FROM numbers(3); +``` + +结果: + +``` text +┌─number─┬─plus(multiply(2, number), 1)─┐ +│ 0 │ 1 │ +│ 1 │ 3 │ +│ 2 │ 5 │ +└────────┴──────────────────────────────┘ +``` + +在下面的查询中,[conditional function](../../../sql-reference/functions/conditional-functions.md)在用户自定义函数中被调用: + +```sql +CREATE FUNCTION parity_str AS (n) -> if(n % 2, 'odd', 'even'); +SELECT number, parity_str(number) FROM numbers(3); +``` + +结果: + +``` text +┌─number─┬─if(modulo(number, 2), 'odd', 'even')─┐ +│ 0 │ even │ +│ 1 │ odd │ +│ 2 │ even │ +└────────┴──────────────────────────────────────┘ +``` diff --git a/programs/benchmark/Benchmark.cpp b/programs/benchmark/Benchmark.cpp index 1c276a83768..35ffb97b8e2 100644 --- a/programs/benchmark/Benchmark.cpp +++ b/programs/benchmark/Benchmark.cpp @@ -342,6 +342,9 @@ private: } } + /// Now we don't block the Ctrl+C signal and second signal will terminate the program without waiting. 
+ interrupt_listener.unblock(); + pool.wait(); total_watch.stop(); @@ -586,7 +589,6 @@ public: #ifndef __clang__ #pragma GCC optimize("-fno-var-tracking-assignments") #endif -#pragma GCC diagnostic ignored "-Wmissing-declarations" int mainEntryClickHouseBenchmark(int argc, char ** argv) { diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index e01677aaac6..b1e1345cf71 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -25,7 +25,6 @@ #include #include #include -#include "Common/MemoryTracker.h" #include #include @@ -56,11 +55,6 @@ #pragma GCC optimize("-fno-var-tracking-assignments") #endif -namespace CurrentMetrics -{ - extern const Metric MemoryTracking; -} - namespace fs = std::filesystem; @@ -410,16 +404,6 @@ try std::cout << std::fixed << std::setprecision(3); std::cerr << std::fixed << std::setprecision(3); - /// Limit on total memory usage - size_t max_client_memory_usage = config().getInt64("max_memory_usage_in_client", 0 /*default value*/); - - if (max_client_memory_usage != 0) - { - total_memory_tracker.setHardLimit(max_client_memory_usage); - total_memory_tracker.setDescription("(total)"); - total_memory_tracker.setMetric(CurrentMetrics::MemoryTracking); - } - registerFormats(); registerFunctions(); registerAggregateFunctions(); @@ -1014,7 +998,6 @@ void Client::addOptions(OptionsDescription & options_description) ("opentelemetry-tracestate", po::value(), "OpenTelemetry tracestate header as described by W3C Trace Context recommendation") ("no-warnings", "disable warnings when client connects to server") - ("max_memory_usage_in_client", po::value(), "sets memory limit in client") ; /// Commandline options related to external tables. diff --git a/programs/install/Install.cpp b/programs/install/Install.cpp index 706e273e2b4..dd93e0b49ab 100644 --- a/programs/install/Install.cpp +++ b/programs/install/Install.cpp @@ -153,10 +153,12 @@ static void createGroup(const String & group_name) if (!group_name.empty()) { #if defined(OS_DARWIN) - // TODO: implement. - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unable to create a group in macOS"); +#elif defined(OS_FREEBSD) + std::string command = fmt::format("pw groupadd {}", group_name); + fmt::print(" {}\n", command); + executeScript(command); #else std::string command = fmt::format("groupadd -r {}", group_name); fmt::print(" {}\n", command); @@ -170,10 +172,14 @@ static void createUser(const String & user_name, [[maybe_unused]] const String & if (!user_name.empty()) { #if defined(OS_DARWIN) - // TODO: implement. - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unable to create a user in macOS"); +#elif defined(OS_FREEBSD) + std::string command = group_name.empty() + ? fmt::format("pw useradd -s /bin/false -d /nonexistent -n {}", user_name) + : fmt::format("pw useradd -s /bin/false -d /nonexistent -g {} -n {}", group_name, user_name); + fmt::print(" {}\n", command); + executeScript(command); #else std::string command = group_name.empty() ? fmt::format("useradd -r --shell /bin/false --home-dir /nonexistent --user-group {}", user_name) @@ -185,6 +191,20 @@ static void createUser(const String & user_name, [[maybe_unused]] const String & } +static std::string formatWithSudo(std::string command, bool needed = true) +{ + if (!needed) + return command; + +#if defined(OS_FREEBSD) + /// FreeBSD does not have 'sudo' installed. 
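/// Editorial worked example for the formatWithSudo() helper defined here (not part of the original patch):
///   formatWithSudo("clickhouse restart")                      -> "su -m root -c 'clickhouse restart'" on FreeBSD
///                                                             -> "sudo clickhouse restart"            elsewhere
///   formatWithSudo("clickhouse install", /* needed = */ false) -> "clickhouse install" (already root, no wrapper)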
+ return fmt::format("su -m root -c '{}'", command); +#else + return fmt::format("sudo {}", command); +#endif +} + + int mainEntryClickHouseInstall(int argc, char ** argv) { try @@ -207,10 +227,7 @@ int mainEntryClickHouseInstall(int argc, char ** argv) if (options.count("help")) { - std::cout << "Usage: " - << (getuid() == 0 ? "" : "sudo ") - << argv[0] - << " install [options]\n"; + std::cout << "Usage: " << formatWithSudo(std::string(argv[0]) + " install [options]", getuid() != 0) << '\n'; std::cout << desc << '\n'; return 1; } @@ -233,6 +250,9 @@ int mainEntryClickHouseInstall(int argc, char ** argv) path.pop_back(); fs::path binary_self_path(path); +#elif defined(OS_FREEBSD) + /// https://stackoverflow.com/questions/1023306/finding-current-executables-path-without-proc-self-exe + fs::path binary_self_path = argc >= 1 ? argv[0] : "/proc/curproc/file"; #else fs::path binary_self_path = "/proc/self/exe"; #endif @@ -314,7 +334,7 @@ int mainEntryClickHouseInstall(int argc, char ** argv) catch (const Exception & e) { if (e.code() == ErrorCodes::CANNOT_OPEN_FILE && geteuid() != 0) - std::cerr << "Install must be run as root: sudo ./clickhouse install\n"; + std::cerr << "Install must be run as root: " << formatWithSudo("./clickhouse install") << '\n'; throw; } @@ -824,9 +844,10 @@ int mainEntryClickHouseInstall(int argc, char ** argv) fmt::print( "\nClickHouse has been successfully installed.\n" "\nRestart clickhouse-server with:\n" - " sudo clickhouse restart\n" + " {}\n" "\nStart clickhouse-client with:\n" " clickhouse-client{}\n\n", + formatWithSudo("clickhouse restart"), maybe_password); } else @@ -834,9 +855,10 @@ int mainEntryClickHouseInstall(int argc, char ** argv) fmt::print( "\nClickHouse has been successfully installed.\n" "\nStart clickhouse-server with:\n" - " sudo clickhouse start\n" + " {}\n" "\nStart clickhouse-client with:\n" " clickhouse-client{}\n\n", + formatWithSudo("clickhouse start"), maybe_password); } } @@ -845,7 +867,7 @@ int mainEntryClickHouseInstall(int argc, char ** argv) std::cerr << getCurrentExceptionMessage(false) << '\n'; if (getuid() != 0) - std::cerr << "\nRun with sudo.\n"; + std::cerr << "\nRun with " << formatWithSudo("...") << "\n"; return getCurrentExceptionCode(); } @@ -901,6 +923,9 @@ namespace if (!user.empty()) { +#if defined(OS_FREEBSD) + command = fmt::format("su -m '{}' -c '{}'", user, command); +#else bool may_need_sudo = geteuid() != 0; if (may_need_sudo) { @@ -910,7 +935,10 @@ namespace command = fmt::format("sudo -u '{}' {}", user, command); } else + { command = fmt::format("su -s /bin/sh '{}' -c '{}'", user, command); + } +#endif } fmt::print("Will run {}\n", command); @@ -1114,10 +1142,7 @@ int mainEntryClickHouseStart(int argc, char ** argv) if (options.count("help")) { - std::cout << "Usage: " - << (getuid() == 0 ? "" : "sudo ") - << argv[0] - << " start\n"; + std::cout << "Usage: " << formatWithSudo(std::string(argv[0]) + " start", getuid() != 0) << '\n'; return 1; } @@ -1155,10 +1180,7 @@ int mainEntryClickHouseStop(int argc, char ** argv) if (options.count("help")) { - std::cout << "Usage: " - << (getuid() == 0 ? "" : "sudo ") - << argv[0] - << " stop\n"; + std::cout << "Usage: " << formatWithSudo(std::string(argv[0]) + " stop", getuid() != 0) << '\n'; return 1; } @@ -1191,10 +1213,7 @@ int mainEntryClickHouseStatus(int argc, char ** argv) if (options.count("help")) { - std::cout << "Usage: " - << (getuid() == 0 ? 
"" : "sudo ") - << argv[0] - << " status\n"; + std::cout << "Usage: " << formatWithSudo(std::string(argv[0]) + " status", getuid() != 0) << '\n'; return 1; } @@ -1233,10 +1252,7 @@ int mainEntryClickHouseRestart(int argc, char ** argv) if (options.count("help")) { - std::cout << "Usage: " - << (getuid() == 0 ? "" : "sudo ") - << argv[0] - << " restart\n"; + std::cout << "Usage: " << formatWithSudo(std::string(argv[0]) + " restart", getuid() != 0) << '\n'; return 1; } diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 53e295b7fbb..aa4747636c9 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -313,11 +313,11 @@ void LocalServer::cleanup() std::string LocalServer::getInitialCreateTableQuery() { - if (!config().has("table-structure")) + if (!config().has("table-structure") && !config().has("table-file")) return {}; auto table_name = backQuoteIfNeed(config().getString("table-name", "table")); - auto table_structure = config().getString("table-structure"); + auto table_structure = config().getString("table-structure", "auto"); auto data_format = backQuoteIfNeed(config().getString("table-data-format", "TSV")); String table_file; @@ -332,7 +332,12 @@ std::string LocalServer::getInitialCreateTableQuery() table_file = quoteString(config().getString("table-file")); } - return fmt::format("CREATE TABLE {} ({}) ENGINE = File({}, {});", + if (table_structure == "auto") + table_structure = ""; + else + table_structure = "(" + table_structure + ")"; + + return fmt::format("CREATE TABLE {} {} ENGINE = File({}, {});", table_name, table_structure, data_format, table_file); } @@ -422,7 +427,7 @@ try #else is_interactive = stdin_is_a_tty && (config().hasOption("interactive") - || (!config().has("query") && !config().has("table-structure") && queries_files.empty())); + || (!config().has("query") && !config().has("table-structure") && queries_files.empty() && !config().has("table-file"))); #endif if (!is_interactive) { diff --git a/programs/odbc-bridge/ODBCConnectionFactory.h b/programs/odbc-bridge/ODBCConnectionFactory.h index 13396206a46..38f78916107 100644 --- a/programs/odbc-bridge/ODBCConnectionFactory.h +++ b/programs/odbc-bridge/ODBCConnectionFactory.h @@ -91,6 +91,25 @@ T execute(nanodbc::ConnectionHolderPtr connection_holder, std::functionupdateConnection(); return query_func(connection_holder->get()); } + + /// psqlodbc driver error handling is incomplete and under some scenarious + /// it doesn't propagate correct errors to the caller. + /// As a quick workaround we run a quick "ping" query over the connection + /// on generic errors. + /// If "ping" fails, recycle the connection and try the query once more. + if (e.state().starts_with("HY00")) + { + try + { + just_execute(connection_holder->get(), "SELECT 1"); + } + catch (...) 
+ { + connection_holder->updateConnection(); + return query_func(connection_holder->get()); + } + } + throw; } } diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index ebe1ebf44e8..67c754495d1 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -924,6 +924,12 @@ if (ThreadFuzzer::instance().isEffective()) if (config->has("max_concurrent_queries")) global_context->getProcessList().setMaxSize(config->getInt("max_concurrent_queries", 0)); + if (config->has("max_concurrent_insert_queries")) + global_context->getProcessList().setMaxInsertQueriesAmount(config->getInt("max_concurrent_insert_queries", 0)); + + if (config->has("max_concurrent_select_queries")) + global_context->getProcessList().setMaxSelectQueriesAmount(config->getInt("max_concurrent_select_queries", 0)); + if (config->has("keeper_server")) global_context->updateKeeperConfiguration(*config); diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index cb6c326cb84..4472e975878 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -160,6 +160,7 @@ enum class AccessType M(SYSTEM_FLUSH_DISTRIBUTED, "FLUSH DISTRIBUTED", TABLE, SYSTEM_FLUSH) \ M(SYSTEM_FLUSH_LOGS, "FLUSH LOGS", GLOBAL, SYSTEM_FLUSH) \ M(SYSTEM_FLUSH, "", GROUP, SYSTEM) \ + M(SYSTEM_THREAD_FUZZER, "SYSTEM START THREAD FUZZER, SYSTEM STOP THREAD FUZZER, START THREAD FUZZER, STOP THREAD FUZZER", GLOBAL, SYSTEM) \ M(SYSTEM, "", GROUP, ALL) /* allows to execute SYSTEM {SHUTDOWN|RELOAD CONFIG|...} */ \ \ M(dictGet, "dictHas, dictGetHierarchy, dictIsIn", DICTIONARY, ALL) /* allows to execute functions dictGet(), dictHas(), dictGetHierarchy(), dictIsIn() */\ diff --git a/src/AggregateFunctions/AggregateFunctionContingencyCoefficient.cpp b/src/AggregateFunctions/AggregateFunctionContingencyCoefficient.cpp new file mode 100644 index 00000000000..619abbb8a61 --- /dev/null +++ b/src/AggregateFunctions/AggregateFunctionContingencyCoefficient.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace +{ + +struct ContingencyData : CrossTabData +{ + static const char * getName() + { + return "contingency"; + } + + Float64 getResult() const + { + if (count < 2) + return std::numeric_limits::quiet_NaN(); + + Float64 phi = getPhiSquared(); + return sqrt(phi / (phi + count)); + } +}; + +} + +void registerAggregateFunctionContingency(AggregateFunctionFactory & factory) +{ + factory.registerFunction(ContingencyData::getName(), + [](const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *) + { + assertBinary(name, argument_types); + assertNoParameters(name, parameters); + return std::make_shared>(argument_types); + }); +} + +} diff --git a/src/AggregateFunctions/AggregateFunctionCramersV.cpp b/src/AggregateFunctions/AggregateFunctionCramersV.cpp new file mode 100644 index 00000000000..07b691141bc --- /dev/null +++ b/src/AggregateFunctions/AggregateFunctionCramersV.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace +{ + +struct CramersVData : CrossTabData +{ + static const char * getName() + { + return "cramersV"; + } + + Float64 getResult() const + { + if (count < 2) + return std::numeric_limits::quiet_NaN(); + + UInt64 q = std::min(count_a.size(), count_b.size()); + return sqrt(getPhiSquared() / (q - 1)); + } +}; + +} + +void registerAggregateFunctionCramersV(AggregateFunctionFactory & factory) +{ + factory.registerFunction(CramersVData::getName(), + 
[](const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *) + { + assertBinary(name, argument_types); + assertNoParameters(name, parameters); + return std::make_shared>(argument_types); + }); +} + +} diff --git a/src/AggregateFunctions/AggregateFunctionCramersVBiasCorrected.cpp b/src/AggregateFunctions/AggregateFunctionCramersVBiasCorrected.cpp new file mode 100644 index 00000000000..917869dcd9f --- /dev/null +++ b/src/AggregateFunctions/AggregateFunctionCramersVBiasCorrected.cpp @@ -0,0 +1,54 @@ +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace +{ + +struct CramersVBiasCorrectedData : CrossTabData +{ + static const char * getName() + { + return "cramersVBiasCorrected"; + } + + Float64 getResult() const + { + if (count < 2) + return std::numeric_limits::quiet_NaN(); + + Float64 phi = getPhiSquared(); + + Float64 a_size_adjusted = count_a.size() - 1; + Float64 b_size_adjusted = count_b.size() - 1; + Float64 count_adjusted = count - 1; + + Float64 res = std::max(0.0, phi - a_size_adjusted * b_size_adjusted / count_adjusted); + Float64 correction_a = count_a.size() - a_size_adjusted * a_size_adjusted / count_adjusted; + Float64 correction_b = count_b.size() - b_size_adjusted * b_size_adjusted / count_adjusted; + + res /= std::min(correction_a, correction_b) - 1; + return sqrt(res); + } +}; + +} + +void registerAggregateFunctionCramersVBiasCorrected(AggregateFunctionFactory & factory) +{ + factory.registerFunction(CramersVBiasCorrectedData::getName(), + [](const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *) + { + assertBinary(name, argument_types); + assertNoParameters(name, parameters); + return std::make_shared>(argument_types); + }); +} + +} diff --git a/src/AggregateFunctions/AggregateFunctionIntervalLengthSum.h b/src/AggregateFunctions/AggregateFunctionIntervalLengthSum.h index 5969fca9dcf..443d76f47cb 100644 --- a/src/AggregateFunctions/AggregateFunctionIntervalLengthSum.h +++ b/src/AggregateFunctions/AggregateFunctionIntervalLengthSum.h @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -15,6 +16,7 @@ #include + namespace DB { @@ -23,12 +25,11 @@ namespace ErrorCodes extern const int TOO_LARGE_ARRAY_SIZE; } -/** - * Calculate total length of intervals without intersections. Each interval is the pair of numbers [begin, end]; - * Return UInt64 for integral types (UInt/Int*, Date/DateTime) and return Float64 for Float*. - * - * Implementation simply stores intervals sorted by beginning and sums lengths at final. - */ +/** Calculate total length of intervals without intersections. Each interval is the pair of numbers [begin, end]; + * Returns UInt64 for integral types (UInt/Int*, Date/DateTime) and returns Float64 for Float*. + * + * Implementation simply stores intervals sorted by beginning and sums lengths at final. + */ template struct AggregateFunctionIntervalLengthSumData { @@ -43,10 +44,14 @@ struct AggregateFunctionIntervalLengthSumData void add(T begin, T end) { + /// Reversed intervals are counted by absolute value of their length. 
+ if (unlikely(end < begin)) + std::swap(begin, end); + else if (unlikely(begin == end)) + return; + if (sorted && !segments.empty()) - { sorted = segments.back().first <= begin; - } segments.emplace_back(begin, end); } @@ -130,6 +135,11 @@ template class AggregateFunctionIntervalLengthSum final : public IAggregateFunctionDataHelper> { private: + static auto NO_SANITIZE_UNDEFINED length(typename Data::Segment segment) + { + return segment.second - segment.first; + } + template TResult getIntervalLengthSum(Data & data) const { @@ -140,21 +150,24 @@ private: TResult res = 0; - typename Data::Segment cur_segment = data.segments[0]; + typename Data::Segment curr_segment = data.segments[0]; - for (size_t i = 1, sz = data.segments.size(); i < sz; ++i) + for (size_t i = 1, size = data.segments.size(); i < size; ++i) { - /// Check if current interval intersect with next one then add length, otherwise advance interval end - if (cur_segment.second < data.segments[i].first) - { - res += cur_segment.second - cur_segment.first; - cur_segment = data.segments[i]; - } - else - cur_segment.second = std::max(cur_segment.second, data.segments[i].second); - } + const typename Data::Segment & next_segment = data.segments[i]; - res += cur_segment.second - cur_segment.first; + /// Check if current interval intersects with next one then add length, otherwise advance interval end. + if (curr_segment.second < next_segment.first) + { + res += length(curr_segment); + curr_segment = next_segment; + } + else if (next_segment.second > curr_segment.second) + { + curr_segment.second = next_segment.second; + } + } + res += length(curr_segment); return res; } diff --git a/src/AggregateFunctions/AggregateFunctionNothing.cpp b/src/AggregateFunctions/AggregateFunctionNothing.cpp new file mode 100644 index 00000000000..b476806da08 --- /dev/null +++ b/src/AggregateFunctions/AggregateFunctionNothing.cpp @@ -0,0 +1,20 @@ +#include +#include +#include + + +namespace DB +{ + +struct Settings; + +void registerAggregateFunctionNothing(AggregateFunctionFactory & factory) +{ + factory.registerFunction("nothing", [](const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *) + { + assertNoParameters(name, parameters); + return std::make_shared(argument_types, parameters); + }); +} + +} diff --git a/src/AggregateFunctions/AggregateFunctionNothing.h b/src/AggregateFunctions/AggregateFunctionNothing.h index 4374ecf85c3..645ea7c3f8a 100644 --- a/src/AggregateFunctions/AggregateFunctionNothing.h +++ b/src/AggregateFunctions/AggregateFunctionNothing.h @@ -4,6 +4,8 @@ #include #include #include +#include +#include namespace DB @@ -26,7 +28,7 @@ public: DataTypePtr getReturnType() const override { - return argument_types.front(); + return argument_types.empty() ? 
std::make_shared(std::make_shared()) : argument_types.front(); } bool allocatesMemoryInArena() const override { return false; } @@ -62,12 +64,16 @@ public: { } - void serialize(ConstAggregateDataPtr __restrict, WriteBuffer &, std::optional) const override + void serialize(ConstAggregateDataPtr __restrict, WriteBuffer & buf, std::optional) const override { + writeChar('\0', buf); } - void deserialize(AggregateDataPtr, ReadBuffer &, std::optional, Arena *) const override + void deserialize(AggregateDataPtr, ReadBuffer & buf, std::optional, Arena *) const override { + [[maybe_unused]] char symbol; + readChar(symbol, buf); + assert(symbol == '\0'); } void insertResultInto(AggregateDataPtr, IColumn & to, Arena *) const override diff --git a/src/AggregateFunctions/AggregateFunctionTheilsU.cpp b/src/AggregateFunctions/AggregateFunctionTheilsU.cpp new file mode 100644 index 00000000000..96772a0daa8 --- /dev/null +++ b/src/AggregateFunctions/AggregateFunctionTheilsU.cpp @@ -0,0 +1,61 @@ +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace +{ + +struct TheilsUData : CrossTabData +{ + static const char * getName() + { + return "theilsU"; + } + + Float64 getResult() const + { + if (count < 2) + return std::numeric_limits::quiet_NaN(); + + Float64 h_a = 0.0; + for (const auto & [key, value] : count_a) + { + Float64 value_float = value; + h_a += (value_float / count) * log(value_float / count); + } + + Float64 dep = 0.0; + for (const auto & [key, value] : count_ab) + { + Float64 value_ab = value; + Float64 value_b = count_b.at(key.items[1]); + + dep += (value_ab / count) * log(value_ab / value_b); + } + + dep -= h_a; + dep /= h_a; + return dep; + } +}; + +} + +void registerAggregateFunctionTheilsU(AggregateFunctionFactory & factory) +{ + factory.registerFunction(TheilsUData::getName(), + [](const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *) + { + assertBinary(name, argument_types); + assertNoParameters(name, parameters); + return std::make_shared>(argument_types); + }); +} + +} diff --git a/src/AggregateFunctions/CrossTab.h b/src/AggregateFunctions/CrossTab.h new file mode 100644 index 00000000000..e01ebcf71ed --- /dev/null +++ b/src/AggregateFunctions/CrossTab.h @@ -0,0 +1,175 @@ +#pragma once + +#include +#include +#include +#include +#include + + +/** Aggregate function that calculates statistics on top of cross-tab: + * - histogram of every argument and every pair of elements. + * These statistics include: + * - Cramer's V; + * - Theil's U; + * - contingency coefficient; + * It can be interpreted as interdependency coefficient between arguments; + * or non-parametric correlation coefficient. + */ +namespace DB +{ + +struct CrossTabData +{ + /// Total count. + UInt64 count = 0; + + /// Count of every value of the first and second argument (values are pre-hashed). + /// Note: non-cryptographic 64bit hash is used, it means that the calculation is approximate. + HashMapWithStackMemory count_a; + HashMapWithStackMemory count_b; + + /// Count of every pair of values. We pack two hashes into UInt128. 
+ HashMapWithStackMemory count_ab; + + + void add(UInt64 hash1, UInt64 hash2) + { + ++count; + ++count_a[hash1]; + ++count_b[hash2]; + + UInt128 hash_pair{hash1, hash2}; + ++count_ab[hash_pair]; + } + + void merge(const CrossTabData & other) + { + count += other.count; + for (const auto & [key, value] : other.count_a) + count_a[key] += value; + for (const auto & [key, value] : other.count_b) + count_b[key] += value; + for (const auto & [key, value] : other.count_ab) + count_ab[key] += value; + } + + void serialize(WriteBuffer & buf) const + { + writeBinary(count, buf); + count_a.write(buf); + count_b.write(buf); + count_ab.write(buf); + } + + void deserialize(ReadBuffer & buf) + { + readBinary(count, buf); + count_a.read(buf); + count_b.read(buf); + count_ab.read(buf); + } + + /** See https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V + * + * φ² is χ² divided by the sample size (count). + * χ² is the sum of squares of the normalized differences between the "expected" and "observed" statistics. + * ("Expected" in the case when one of the hypotheses is true). + * Something resembling the L2 distance. + * + * Note: statisticians use the name χ² for every statistic that has χ² distribution in many various contexts. + * + * Let's suppose that there is no association between the values a and b. + * Then the frequency (e.g. probability) of (a, b) pair is equal to the multiplied frequencies of a and b: + * count_ab / count = (count_a / count) * (count_b / count) + * count_ab = count_a * count_b / count + * + * Let's calculate the difference between the values that are supposed to be equal if there is no association between a and b: + * count_ab - count_a * count_b / count + * + * Let's sum the squares of the differences across all (a, b) pairs. + * Then divide by the second term for normalization: (count_a * count_b / count) + * + * This will be the χ² statistics. + * This statistics is used as a base for many other statistics. 
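 * Editorial restatement of the above in formula form (not part of the original patch;
 * n denotes `count`, n_a, n_b and n_ab the per-value and per-pair counts):
 *
 *     chi^2 = sum over (a, b) of (n_ab - n_a * n_b / n)^2 / (n_a * n_b / n)
 *     phi^2 = chi^2 / n
 *
 * and, as used by the aggregate functions registered in this patch,
 *
 *     cramersV    = sqrt(phi^2 / (min(|A|, |B|) - 1))
 *     contingency = sqrt(phi^2 / (phi^2 + n))
 *
 * where |A| and |B| are the numbers of distinct values of the two arguments.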
+ */ + Float64 getPhiSquared() const + { + Float64 chi_squared = 0; + for (const auto & [key, value_ab] : count_ab) + { + Float64 value_a = count_a.at(key.items[0]); + Float64 value_b = count_b.at(key.items[1]); + + Float64 expected_value_ab = (value_a * value_b) / count; + + Float64 chi_squared_elem = value_ab - expected_value_ab; + chi_squared_elem = chi_squared_elem * chi_squared_elem / expected_value_ab; + + chi_squared += chi_squared_elem; + } + return chi_squared / count; + } +}; + + +template +class AggregateFunctionCrossTab : public IAggregateFunctionDataHelper> +{ +public: + AggregateFunctionCrossTab(const DataTypes & arguments) + : IAggregateFunctionDataHelper>({arguments}, {}) + { + } + + String getName() const override + { + return Data::getName(); + } + + bool allocatesMemoryInArena() const override + { + return false; + } + + DataTypePtr getReturnType() const override + { + return std::make_shared>(); + } + + void add( + AggregateDataPtr __restrict place, + const IColumn ** columns, + size_t row_num, + Arena *) const override + { + UInt64 hash1 = UniqVariadicHash::apply(1, &columns[0], row_num); + UInt64 hash2 = UniqVariadicHash::apply(1, &columns[1], row_num); + + this->data(place).add(hash1, hash2); + } + + void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override + { + this->data(place).merge(this->data(rhs)); + } + + void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional) const override + { + this->data(place).serialize(buf); + } + + void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional, Arena *) const override + { + this->data(place).deserialize(buf); + } + + void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override + { + Float64 result = this->data(place).getResult(); + auto & column = static_cast &>(to); + column.getData().push_back(result); + } +}; + +} diff --git a/src/AggregateFunctions/registerAggregateFunctions.cpp b/src/AggregateFunctions/registerAggregateFunctions.cpp index 79a418ac69f..33f6a532224 100644 --- a/src/AggregateFunctions/registerAggregateFunctions.cpp +++ b/src/AggregateFunctions/registerAggregateFunctions.cpp @@ -48,8 +48,13 @@ void registerAggregateFunctionRankCorrelation(AggregateFunctionFactory &); void registerAggregateFunctionMannWhitney(AggregateFunctionFactory &); void registerAggregateFunctionWelchTTest(AggregateFunctionFactory &); void registerAggregateFunctionStudentTTest(AggregateFunctionFactory &); +void registerAggregateFunctionCramersV(AggregateFunctionFactory &); +void registerAggregateFunctionTheilsU(AggregateFunctionFactory &); +void registerAggregateFunctionContingency(AggregateFunctionFactory &); +void registerAggregateFunctionCramersVBiasCorrected(AggregateFunctionFactory &); void registerAggregateFunctionSingleValueOrNull(AggregateFunctionFactory &); void registerAggregateFunctionSequenceNextNode(AggregateFunctionFactory &); +void registerAggregateFunctionNothing(AggregateFunctionFactory &); void registerAggregateFunctionExponentialMovingAverage(AggregateFunctionFactory &); void registerAggregateFunctionSparkbar(AggregateFunctionFactory &); void registerAggregateFunctionIntervalLengthSum(AggregateFunctionFactory &); @@ -99,6 +104,10 @@ void registerAggregateFunctions() registerAggregateFunctionUniqUpTo(factory); registerAggregateFunctionTopK(factory); registerAggregateFunctionsBitwise(factory); + registerAggregateFunctionCramersV(factory); + registerAggregateFunctionTheilsU(factory); + 
registerAggregateFunctionContingency(factory); + registerAggregateFunctionCramersVBiasCorrected(factory); registerAggregateFunctionsBitmap(factory); registerAggregateFunctionsMaxIntersections(factory); registerAggregateFunctionHistogram(factory); @@ -114,6 +123,7 @@ void registerAggregateFunctions() registerAggregateFunctionSequenceNextNode(factory); registerAggregateFunctionWelchTTest(factory); registerAggregateFunctionStudentTTest(factory); + registerAggregateFunctionNothing(factory); registerAggregateFunctionSingleValueOrNull(factory); registerAggregateFunctionIntervalLengthSum(factory); registerAggregateFunctionExponentialMovingAverage(factory); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7124961821e..c9e9f736e0d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -526,6 +526,14 @@ if (USE_BZIP2) target_include_directories (clickhouse_common_io SYSTEM BEFORE PRIVATE ${BZIP2_INCLUDE_DIR}) endif() +if(USE_SIMDJSON) + dbms_target_link_libraries(PRIVATE simdjson) +endif() + +if(USE_RAPIDJSON) + dbms_target_include_directories(SYSTEM PRIVATE ${RAPIDJSON_INCLUDE_DIR}) +endif() + dbms_target_link_libraries(PUBLIC consistent-hashing) include ("${ClickHouse_SOURCE_DIR}/cmake/add_check.cmake") @@ -558,7 +566,9 @@ if (ENABLE_TESTS AND USE_GTEST) clickhouse_parsers clickhouse_storages_system dbms + clickhouse_common_config clickhouse_common_zookeeper + clickhouse_common_config string_utils) add_check(unit_tests_dbms) diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 75e0588f786..747603d0e6d 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include "Common/Exception.h" @@ -65,6 +66,11 @@ namespace fs = std::filesystem; using namespace std::literals; +namespace CurrentMetrics +{ + extern const Metric MemoryTracking; +} + namespace DB { @@ -457,12 +463,13 @@ void ClientBase::initBlockOutputStream(const Block & block, ASTPtr parsed_query) /// The query can specify output format or output file. 
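/// (Editorial note, not part of the original patch: the hunk below makes the client deduce the output
/// format from the OUTFILE file extension via FormatFactory::getFormatFromFileName() when the query
/// has no explicit FORMAT clause; e.g. a name ending in ".csv" would presumably select CSV. The same
/// deduction is applied to INSERT ... FROM INFILE in sendData() further down.)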
if (const auto * query_with_output = dynamic_cast(parsed_query.get())) { + String out_file; if (query_with_output->out_file) { select_into_file = true; const auto & out_file_node = query_with_output->out_file->as(); - const auto & out_file = out_file_node.value.safeGet(); + out_file = out_file_node.value.safeGet(); std::string compression_method; if (query_with_output->compression) @@ -488,6 +495,12 @@ void ClientBase::initBlockOutputStream(const Block & block, ASTPtr parsed_query) const auto & id = query_with_output->format->as(); current_format = id.name(); } + else if (query_with_output->out_file) + { + const auto & format_name = FormatFactory::instance().getFormatFromFileName(out_file); + if (!format_name.empty()) + current_format = format_name; + } } if (has_vertical_output_suffix) @@ -800,7 +813,7 @@ void ClientBase::onProfileEvents(Block & block) if (rows == 0) return; - if (progress_indication.print_hardware_utilization) + if (server_revision >= DBMS_MIN_PROTOCOL_VERSION_WITH_INCREMENTAL_PROFILE_EVENTS) { const auto & array_thread_id = typeid_cast(*block.getByName("thread_id").column).getData(); const auto & names = typeid_cast(*block.getByName("name").column); @@ -834,25 +847,25 @@ void ClientBase::onProfileEvents(Block & block) } auto elapsed_time = profile_events.watch.elapsedMicroseconds(); progress_indication.updateThreadEventData(thread_times, elapsed_time); - } - if (profile_events.print) - { - if (profile_events.watch.elapsedMilliseconds() >= profile_events.delay_ms) + if (profile_events.print) { - initLogsOutputStream(); - progress_indication.clearProgressOutput(); - logs_out_stream->writeProfileEvents(block); - logs_out_stream->flush(); + if (profile_events.watch.elapsedMilliseconds() >= profile_events.delay_ms) + { + initLogsOutputStream(); + progress_indication.clearProgressOutput(); + logs_out_stream->writeProfileEvents(block); + logs_out_stream->flush(); - profile_events.last_block = {}; - } - else - { - incrementProfileEventsBlock(profile_events.last_block, block); + profile_events.last_block = {}; + } + else + { + incrementProfileEventsBlock(profile_events.last_block, block); + } } + profile_events.watch.restart(); } - profile_events.watch.restart(); } @@ -1002,11 +1015,15 @@ void ClientBase::sendData(Block & sample, const ColumnsDescription & columns_des compression_method = compression_method_node.value.safeGet(); } + String current_format = parsed_insert_query->format; + if (current_format.empty()) + current_format = FormatFactory::instance().getFormatFromFileName(in_file); + /// Create temporary storage file, to support globs and parallel reading StorageFile::CommonArguments args{ WithContext(global_context), parsed_insert_query->table_id, - parsed_insert_query->format, + current_format, getFormatSettings(global_context), compression_method, columns_description_for_query, @@ -1812,6 +1829,7 @@ void ClientBase::init(int argc, char ** argv) ("interactive", "Process queries-file or --query query and start interactive mode") ("pager", po::value(), "Pipe all output into this command (less or similar)") + ("max_memory_usage_in_client", po::value(), "Set memory limit in client/local server") ; addOptions(options_description); @@ -1909,8 +1927,6 @@ void ClientBase::init(int argc, char ** argv) Poco::Logger::root().setLevel(options["log-level"].as()); if (options.count("server_logs_file")) server_logs_file = options["server_logs_file"].as(); - if (options.count("hardware-utilization")) - progress_indication.print_hardware_utilization = true; query_processing_stage = 
QueryProcessingStage::fromString(options["stage"].as()); profile_events.print = options.count("print-profile-events"); @@ -1919,6 +1935,15 @@ void ClientBase::init(int argc, char ** argv) processOptions(options_description, options, external_tables_arguments); argsToConfig(common_arguments, config(), 100); clearPasswordFromCommandLine(argc, argv); + + /// Limit on total memory usage + size_t max_client_memory_usage = config().getInt64("max_memory_usage_in_client", 0 /*default value*/); + if (max_client_memory_usage != 0) + { + total_memory_tracker.setHardLimit(max_client_memory_usage); + total_memory_tracker.setDescription("(total)"); + total_memory_tracker.setMetric(CurrentMetrics::MemoryTracking); + } } } diff --git a/src/Client/LocalConnection.cpp b/src/Client/LocalConnection.cpp index 8ad853950b2..528c38f9b76 100644 --- a/src/Client/LocalConnection.cpp +++ b/src/Client/LocalConnection.cpp @@ -214,6 +214,12 @@ bool LocalConnection::poll(size_t) if (next_packet_type) return true; + if (state->exception) + { + next_packet_type = Protocol::Server::Exception; + return true; + } + if (!state->is_finished) { if (send_progress && (state->after_send_progress.elapsedMicroseconds() >= query_context->getSettingsRef().interactive_delay)) diff --git a/src/Common/CombinedCardinalityEstimator.h b/src/Common/CombinedCardinalityEstimator.h index 55afb028247..8cf35436840 100644 --- a/src/Common/CombinedCardinalityEstimator.h +++ b/src/Common/CombinedCardinalityEstimator.h @@ -323,7 +323,7 @@ private: UInt64 address = 0; }; static const UInt64 mask = 0xFFFFFFFFFFFFFFFC; - static const UInt32 medium_set_size_max = 1UL << medium_set_power2_max; + static const UInt32 medium_set_size_max = 1ULL << medium_set_power2_max; }; } diff --git a/src/Common/Config/CMakeLists.txt b/src/Common/Config/CMakeLists.txt index 3da44be2af6..4d72960f727 100644 --- a/src/Common/Config/CMakeLists.txt +++ b/src/Common/Config/CMakeLists.txt @@ -4,6 +4,7 @@ set (SRCS configReadClient.cpp ConfigReloader.cpp YAMLParser.cpp + ConfigHelper.cpp ) add_library(clickhouse_common_config ${SRCS}) diff --git a/src/Common/Config/ConfigHelper.cpp b/src/Common/Config/ConfigHelper.cpp new file mode 100644 index 00000000000..69fe42de86c --- /dev/null +++ b/src/Common/Config/ConfigHelper.cpp @@ -0,0 +1,23 @@ +#include +#include + +namespace DB +{ + +namespace ConfigHelper +{ + +bool getBool(const Poco::Util::AbstractConfiguration & config, const std::string & key, bool default_, bool empty_as) +{ + if (!config.has(key)) + return default_; + Poco::Util::AbstractConfiguration::Keys sub_keys; + config.keys(key, sub_keys); + if (sub_keys.empty() && config.getString(key).empty()) + return empty_as; + return config.getBool(key, default_); +} + +} + +} diff --git a/src/Common/Config/ConfigHelper.h b/src/Common/Config/ConfigHelper.h new file mode 100644 index 00000000000..62271bbaf0a --- /dev/null +++ b/src/Common/Config/ConfigHelper.h @@ -0,0 +1,18 @@ +#pragma once + +namespace Poco +{ + namespace Util + { + class AbstractConfiguration; + } +} + +namespace DB::ConfigHelper +{ + +/// The behavior is like `config.getBool(key, default_)`, +/// except when the tag is empty (aka. self-closing), `empty_as` will be used instead of throwing Poco::Exception. 
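/// Editorial examples (not part of the original patch; `some_flag` is a hypothetical tag name):
///   <some_flag>true</some_flag>   -> true      (falls through to config.getBool)
///   <some_flag/>                  -> empty_as  (empty tag, would otherwise throw in Poco)
///   tag absent from the config    -> default_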
+bool getBool(const Poco::Util::AbstractConfiguration & config, const std::string & key, bool default_, bool empty_as); + +} diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 5c9ba177b78..896168253cf 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -63,8 +63,10 @@ M(MaxDDLEntryID, "Max processed DDL entry of DDLWorker.") \ M(MaxPushedDDLEntryID, "Max DDL entry of DDLWorker that pushed to zookeeper.") \ M(PartsTemporary, "The part is generating now, it is not in data_parts list.") \ - M(PartsPreCommitted, "The part is in data_parts, but not used for SELECTs.") \ - M(PartsCommitted, "Active data part, used by current and upcoming SELECTs.") \ + M(PartsPreCommitted, "Deprecated. See PartsPreActive.") \ + M(PartsCommitted, "Deprecated. See PartsActive.") \ + M(PartsPreActive, "The part is in data_parts, but not used for SELECTs.") \ + M(PartsActive, "Active data part, used by current and upcoming SELECTs.") \ M(PartsOutdated, "Not active data part, but could be used by only current SELECTs, could be deleted after SELECTs finishes.") \ M(PartsDeleting, "Not active data part with identity refcounter, it is deleting right now by a cleaner.") \ M(PartsDeleteOnDestroy, "Part was moved to another disk and should be deleted in own destructor.") \ diff --git a/src/Common/DateLUTImpl.cpp b/src/Common/DateLUTImpl.cpp index ebf32c4dbd9..869954bb2ae 100644 --- a/src/Common/DateLUTImpl.cpp +++ b/src/Common/DateLUTImpl.cpp @@ -174,6 +174,20 @@ DateLUTImpl::DateLUTImpl(const std::string & time_zone_) { years_months_lut[year_months_lut_index] = first_day_of_last_month; } + + /// Fill saturated LUT. + { + ssize_t day = DATE_LUT_SIZE - 1; + for (; day >= 0; --day) + { + if (lut[day].date >= 0) + lut_saturated[day] = lut[day]; + else + break; + } + for (; day >= 0; --day) + lut_saturated[day] = lut_saturated[day + 1]; + } } diff --git a/src/Common/DateLUTImpl.h b/src/Common/DateLUTImpl.h index e52e6547fa2..c178dc58854 100644 --- a/src/Common/DateLUTImpl.h +++ b/src/Common/DateLUTImpl.h @@ -61,6 +61,8 @@ private: // has to be a separate type to support overloading // TODO: make sure that any arithmetic on LUTIndex actually results in valid LUTIndex. STRONG_TYPEDEF(UInt32, LUTIndex) + // Same as above but select different function overloads for zero saturation. + STRONG_TYPEDEF(UInt32, LUTIndexWithSaturation) template friend inline LUTIndex operator+(const LUTIndex & index, const T v) @@ -182,6 +184,9 @@ private: /// In comparison to std::vector, plain array is cheaper by one indirection. Values lut[DATE_LUT_SIZE + 1]; + /// Same as above but with dates < 1970-01-01 saturated to 1970-01-01. + Values lut_saturated[DATE_LUT_SIZE + 1]; + /// Year number after DATE_LUT_MIN_YEAR -> LUTIndex in lut for start of year. LUTIndex years_lut[DATE_LUT_YEARS]; @@ -278,19 +283,39 @@ public: auto getOffsetAtStartOfEpoch() const { return offset_at_start_of_epoch; } auto getTimeOffsetAtStartOfLUT() const { return offset_at_start_of_lut; } - auto getDayNumOffsetEpoch() const { return daynum_offset_epoch; } + static auto getDayNumOffsetEpoch() { return daynum_offset_epoch; } /// All functions below are thread-safe; arguments are not checked. 
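As an editorial aside (not part of the original patch): the `saturateMinus` helper introduced in the DateLUTImpl.h hunk below computes `max(x - y, 0)` for unsigned 32-bit values without a branch, relying on the well-defined modulo-2^32 wraparound of unsigned arithmetic. A minimal standalone sketch of the same trick (names here are illustrative, not ClickHouse API):

``` cpp
#include <cassert>
#include <cstdint>

/// Branchless max(x - y, 0) for unsigned 32-bit values, mirroring DateLUTImpl::saturateMinus.
static uint32_t saturating_sub(uint32_t x, uint32_t y)
{
    uint32_t res = x - y;                              /// wraps to a value > x when y > x
    res &= static_cast<uint32_t>(-int32_t(res <= x));  /// all-ones mask if no wrap happened, zero otherwise
    return res;
}

int main()
{
    assert(saturating_sub(10, 3) == 7);   /// ordinary subtraction
    assert(saturating_sub(3, 10) == 0);   /// would underflow, saturates to 0
    assert(saturating_sub(0, 0) == 0);
}
```

The patch uses this, together with the `lut_saturated` table, to clamp DayNum/Date arithmetic at the 1970-01-01 epoch instead of letting it wrap.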
- inline ExtendedDayNum toDayNum(ExtendedDayNum d) const + static ExtendedDayNum toDayNum(ExtendedDayNum d) { return d; } - template - inline ExtendedDayNum toDayNum(DateOrTime v) const + static UInt32 saturateMinus(UInt32 x, UInt32 y) { - return ExtendedDayNum{static_cast(toLUTIndex(v).toUnderType() - daynum_offset_epoch)}; + UInt32 res = x - y; + res &= -Int32(res <= x); + return res; + } + + static ExtendedDayNum toDayNum(LUTIndex d) + { + return ExtendedDayNum{static_cast(d.toUnderType() - daynum_offset_epoch)}; + } + + static DayNum toDayNum(LUTIndexWithSaturation d) + { + return DayNum{static_cast(saturateMinus(d.toUnderType(), daynum_offset_epoch))}; + } + + template + inline auto toDayNum(DateOrTime v) const + { + if constexpr (std::is_unsigned_v || std::is_same_v) + return DayNum{static_cast(saturateMinus(toLUTIndex(v).toUnderType(), daynum_offset_epoch))}; + else + return ExtendedDayNum{static_cast(toLUTIndex(v).toUnderType() - daynum_offset_epoch)}; } /// Round down to start of monday. @@ -298,14 +323,20 @@ public: inline Time toFirstDayOfWeek(DateOrTime v) const { const LUTIndex i = toLUTIndex(v); - return lut[i - (lut[i].day_of_week - 1)].date; + if constexpr (std::is_unsigned_v || std::is_same_v) + return lut_saturated[i - (lut[i].day_of_week - 1)].date; + else + return lut[i - (lut[i].day_of_week - 1)].date; } template - inline ExtendedDayNum toFirstDayNumOfWeek(DateOrTime v) const + inline auto toFirstDayNumOfWeek(DateOrTime v) const { const LUTIndex i = toLUTIndex(v); - return toDayNum(i - (lut[i].day_of_week - 1)); + if constexpr (std::is_unsigned_v || std::is_same_v) + return toDayNum(LUTIndexWithSaturation(i - (lut[i].day_of_week - 1))); + else + return toDayNum(LUTIndex(i - (lut[i].day_of_week - 1))); } /// Round down to start of month. @@ -313,21 +344,30 @@ public: inline Time toFirstDayOfMonth(DateOrTime v) const { const LUTIndex i = toLUTIndex(v); - return lut[i - (lut[i].day_of_month - 1)].date; + if constexpr (std::is_unsigned_v || std::is_same_v) + return lut_saturated[i - (lut[i].day_of_month - 1)].date; + else + return lut[i - (lut[i].day_of_month - 1)].date; } template - inline ExtendedDayNum toFirstDayNumOfMonth(DateOrTime v) const + inline auto toFirstDayNumOfMonth(DateOrTime v) const { const LUTIndex i = toLUTIndex(v); - return toDayNum(i - (lut[i].day_of_month - 1)); + if constexpr (std::is_unsigned_v || std::is_same_v) + return toDayNum(LUTIndexWithSaturation(i - (lut[i].day_of_month - 1))); + else + return toDayNum(LUTIndex(i - (lut[i].day_of_month - 1))); } /// Round down to start of quarter. 
template - inline ExtendedDayNum toFirstDayNumOfQuarter(DateOrTime v) const + inline auto toFirstDayNumOfQuarter(DateOrTime v) const { - return toDayNum(toFirstDayOfQuarterIndex(v)); + if constexpr (std::is_unsigned_v || std::is_same_v) + return toDayNum(LUTIndexWithSaturation(toFirstDayOfQuarterIndex(v))); + else + return toDayNum(LUTIndex(toFirstDayOfQuarterIndex(v))); } template @@ -365,9 +405,12 @@ public: } template - inline ExtendedDayNum toFirstDayNumOfYear(DateOrTime v) const + inline auto toFirstDayNumOfYear(DateOrTime v) const { - return toDayNum(toFirstDayNumOfYearIndex(v)); + if constexpr (std::is_unsigned_v || std::is_same_v) + return toDayNum(LUTIndexWithSaturation(toFirstDayNumOfYearIndex(v))); + else + return toDayNum(LUTIndex(toFirstDayNumOfYearIndex(v))); } inline Time toFirstDayOfNextMonth(Time t) const @@ -514,11 +557,17 @@ public: * because the same calendar day starts/ends at different timestamps in different time zones) */ - inline Time fromDayNum(DayNum d) const { return lut[toLUTIndex(d)].date; } + inline Time fromDayNum(DayNum d) const { return lut_saturated[toLUTIndex(d)].date; } inline Time fromDayNum(ExtendedDayNum d) const { return lut[toLUTIndex(d)].date; } template - inline Time toDate(DateOrTime v) const { return lut[toLUTIndex(v)].date; } + inline Time toDate(DateOrTime v) const + { + if constexpr (std::is_unsigned_v || std::is_same_v) + return lut_saturated[toLUTIndex(v)].date; + else + return lut[toLUTIndex(v)].date; + } template inline unsigned toMonth(DateOrTime v) const { return lut[toLUTIndex(v)].month; } @@ -581,9 +630,12 @@ public: } template - inline ExtendedDayNum toFirstDayNumOfISOYear(DateOrTime v) const + inline auto toFirstDayNumOfISOYear(DateOrTime v) const { - return toDayNum(toFirstDayNumOfISOYearIndex(v)); + if constexpr (std::is_unsigned_v || std::is_same_v) + return toDayNum(LUTIndexWithSaturation(toFirstDayNumOfISOYearIndex(v))); + else + return toDayNum(LUTIndex(toFirstDayNumOfISOYearIndex(v))); } inline Time toFirstDayOfISOYear(Time t) const @@ -596,7 +648,7 @@ public: template inline unsigned toISOWeek(DateOrTime v) const { - return 1 + (toFirstDayNumOfWeek(v) - toFirstDayNumOfISOYear(v)) / 7; + return 1 + (toFirstDayNumOfWeek(v) - toDayNum(toFirstDayNumOfISOYearIndex(v))) / 7; } /* @@ -662,7 +714,7 @@ public: { if (!week_year_mode && ((first_weekday_mode && weekday != 0) || (!first_weekday_mode && weekday >= 4))) return yw; - week_year_mode = 1; + week_year_mode = true; (yw.first)--; first_daynr -= (days = calc_days_in_year(yw.first)); weekday = (weekday + 53 * 7 - days) % 7; @@ -724,7 +776,7 @@ public: /// Get first day of week with week_mode, return Sunday or Monday template - inline ExtendedDayNum toFirstDayNumOfWeek(DateOrTime v, UInt8 week_mode) const + inline auto toFirstDayNumOfWeek(DateOrTime v, UInt8 week_mode) const { bool monday_first_mode = week_mode & static_cast(WeekModeFlag::MONDAY_FIRST); if (monday_first_mode) @@ -733,7 +785,10 @@ public: } else { - return (toDayOfWeek(v) != 7) ? ExtendedDayNum(v - toDayOfWeek(v)) : toDayNum(v); + if constexpr (std::is_unsigned_v || std::is_same_v) + return (toDayOfWeek(v) != 7) ? DayNum(saturateMinus(v, toDayOfWeek(v))) : toDayNum(v); + else + return (toDayOfWeek(v) != 7) ? 
ExtendedDayNum(v - toDayOfWeek(v)) : toDayNum(v); } } @@ -809,7 +864,7 @@ public: } template - inline ExtendedDayNum toStartOfYearInterval(DateOrTime v, UInt64 years) const + inline auto toStartOfYearInterval(DateOrTime v, UInt64 years) const { if (years == 1) return toFirstDayNumOfYear(v); @@ -822,39 +877,59 @@ public: if (unlikely(year < DATE_LUT_MIN_YEAR)) year = DATE_LUT_MIN_YEAR; - return toDayNum(years_lut[year - DATE_LUT_MIN_YEAR]); + if constexpr (std::is_unsigned_v || std::is_same_v) + return toDayNum(LUTIndexWithSaturation(years_lut[year - DATE_LUT_MIN_YEAR])); + else + return toDayNum(years_lut[year - DATE_LUT_MIN_YEAR]); } - inline ExtendedDayNum toStartOfQuarterInterval(ExtendedDayNum d, UInt64 quarters) const + template || std::is_same_v>> + inline auto toStartOfQuarterInterval(Date d, UInt64 quarters) const { if (quarters == 1) return toFirstDayNumOfQuarter(d); return toStartOfMonthInterval(d, quarters * 3); } - inline ExtendedDayNum toStartOfMonthInterval(ExtendedDayNum d, UInt64 months) const + template || std::is_same_v>> + inline auto toStartOfMonthInterval(Date d, UInt64 months) const { if (months == 1) return toFirstDayNumOfMonth(d); const Values & values = lut[toLUTIndex(d)]; UInt32 month_total_index = (values.year - DATE_LUT_MIN_YEAR) * 12 + values.month - 1; - return toDayNum(years_months_lut[month_total_index / months * months]); + if constexpr (std::is_same_v) + return toDayNum(LUTIndexWithSaturation(years_months_lut[month_total_index / months * months])); + else + return toDayNum(years_months_lut[month_total_index / months * months]); } - inline ExtendedDayNum toStartOfWeekInterval(ExtendedDayNum d, UInt64 weeks) const + template || std::is_same_v>> + inline auto toStartOfWeekInterval(Date d, UInt64 weeks) const { if (weeks == 1) return toFirstDayNumOfWeek(d); UInt64 days = weeks * 7; // January 1st 1970 was Thursday so we need this 4-days offset to make weeks start on Monday. - return ExtendedDayNum(4 + (d - 4) / days * days); + if constexpr (std::is_same_v) + return DayNum(4 + (d - 4) / days * days); + else + return ExtendedDayNum(4 + (d - 4) / days * days); } - inline Time toStartOfDayInterval(ExtendedDayNum d, UInt64 days) const + template || std::is_same_v>> + inline Time toStartOfDayInterval(Date d, UInt64 days) const { if (days == 1) return toDate(d); - return lut[toLUTIndex(ExtendedDayNum(d / days * days))].date; + if constexpr (std::is_same_v) + return lut_saturated[toLUTIndex(ExtendedDayNum(d / days * days))].date; + else + return lut[toLUTIndex(ExtendedDayNum(d / days * days))].date; } inline Time toStartOfHourInterval(Time t, UInt64 hours) const @@ -1140,7 +1215,11 @@ public: /// If resulting month has less deys than source month, then saturation can happen. /// Example: 31 Aug + 1 month = 30 Sep. - inline Time NO_SANITIZE_UNDEFINED addMonths(Time t, Int64 delta) const + template < + typename DateTime, + typename + = std::enable_if_t || std::is_same_v || std::is_same_v>> + inline Time NO_SANITIZE_UNDEFINED addMonths(DateTime t, Int64 delta) const { const auto result_day = addMonthsIndex(t, delta); @@ -1154,20 +1233,28 @@ public: if (time >= lut[result_day].time_at_offset_change()) time -= lut[result_day].amount_of_offset_change(); - return lut[result_day].date + time; + auto res = lut[result_day].date + time; + if constexpr (std::is_same_v) + { + /// Common compiler should generate branchless code for this saturation operation. + return res <= 0 ? 
0 : res; + } + else + return res; } - inline ExtendedDayNum NO_SANITIZE_UNDEFINED addMonths(ExtendedDayNum d, Int64 delta) const + template || std::is_same_v>> + inline auto NO_SANITIZE_UNDEFINED addMonths(Date d, Int64 delta) const { - return toDayNum(addMonthsIndex(d, delta)); + if constexpr (std::is_same_v) + return toDayNum(LUTIndexWithSaturation(addMonthsIndex(d, delta))); + else + return toDayNum(addMonthsIndex(d, delta)); } - inline Time NO_SANITIZE_UNDEFINED addQuarters(Time t, Int32 delta) const - { - return addMonths(t, static_cast(delta) * 3); - } - - inline ExtendedDayNum addQuarters(ExtendedDayNum d, Int32 delta) const + template + inline auto addQuarters(DateOrTime d, Int32 delta) const { return addMonths(d, static_cast(delta) * 3); } @@ -1189,7 +1276,11 @@ public: } /// Saturation can occur if 29 Feb is mapped to non-leap year. - inline Time addYears(Time t, Int64 delta) const + template < + typename DateTime, + typename + = std::enable_if_t || std::is_same_v || std::is_same_v>> + inline Time addYears(DateTime t, Int64 delta) const { auto result_day = addYearsIndex(t, delta); @@ -1203,12 +1294,24 @@ public: if (time >= lut[result_day].time_at_offset_change()) time -= lut[result_day].amount_of_offset_change(); - return lut[result_day].date + time; + auto res = lut[result_day].date + time; + if constexpr (std::is_same_v) + { + /// Common compiler should generate branchless code for this saturation operation. + return res <= 0 ? 0 : res; + } + else + return res; } - inline ExtendedDayNum addYears(ExtendedDayNum d, Int64 delta) const + template || std::is_same_v>> + inline auto addYears(Date d, Int64 delta) const { - return toDayNum(addYearsIndex(d, delta)); + if constexpr (std::is_same_v) + return toDayNum(LUTIndexWithSaturation(addYearsIndex(d, delta))); + else + return toDayNum(addYearsIndex(d, delta)); } diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 70d85433513..ef2be3b2164 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -603,6 +603,9 @@ M(632, UNEXPECTED_DATA_AFTER_PARSED_VALUE) \ M(633, QUERY_IS_NOT_SUPPORTED_IN_WINDOW_VIEW) \ M(634, MONGODB_ERROR) \ + M(635, CANNOT_POLL) \ + M(636, CANNOT_EXTRACT_TABLE_STRUCTURE) \ + M(637, INVALID_TABLE_OVERRIDE) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Common/HashTable/HashMap.h b/src/Common/HashTable/HashMap.h index c5675d4d7c9..236a6d65707 100644 --- a/src/Common/HashTable/HashMap.h +++ b/src/Common/HashTable/HashMap.h @@ -10,6 +10,13 @@ * Also, key in hash table must be of type, that zero bytes is compared equals to zero key. 
*/ +namespace DB +{ +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} +} struct NoInitTag { @@ -262,6 +269,13 @@ public: return it->getMapped(); } + + const typename Cell::Mapped & ALWAYS_INLINE at(const Key & x) const + { + if (auto it = this->find(x); it != this->end()) + return it->getMapped(); + throw DB::Exception("Cannot find element in HashMap::at method", DB::ErrorCodes::LOGICAL_ERROR); + } }; namespace std diff --git a/src/Common/HashTable/StringHashTable.h b/src/Common/HashTable/StringHashTable.h index d30271d65db..7d704e4bdc7 100644 --- a/src/Common/HashTable/StringHashTable.h +++ b/src/Common/HashTable/StringHashTable.h @@ -280,7 +280,7 @@ public: if ((reinterpret_cast(p) & 2048) == 0) { memcpy(&n[0], p, 8); - n[0] &= -1ul >> s; + n[0] &= -1ULL >> s; } else { diff --git a/src/Common/HashTable/TwoLevelStringHashTable.h b/src/Common/HashTable/TwoLevelStringHashTable.h index 93bbcb2835d..871becc86a4 100644 --- a/src/Common/HashTable/TwoLevelStringHashTable.h +++ b/src/Common/HashTable/TwoLevelStringHashTable.h @@ -114,7 +114,7 @@ public: if ((reinterpret_cast(p) & 2048) == 0) { memcpy(&n[0], p, 8); - n[0] &= -1ul >> s; + n[0] &= -1ULL >> s; } else { diff --git a/src/Common/LRUCache.h b/src/Common/LRUCache.h index bbc09fd3aff..480a03ab399 100644 --- a/src/Common/LRUCache.h +++ b/src/Common/LRUCache.h @@ -64,6 +64,18 @@ public: setImpl(key, mapped, lock); } + void remove(const Key & key) + { + std::lock_guard lock(mutex); + auto it = cells.find(key); + if (it == cells.end()) + return; + auto & cell = it->second; + current_size -= cell.size; + queue.erase(cell.queue_iterator); + cells.erase(it); + } + /// If the value for the key is in the cache, returns it. If it is not, calls load_func() to /// produce it, saves the result in the cache and returns it. /// Only one of several concurrent threads calling getOrSet() will call load_func(), diff --git a/src/Common/LRUResourceCache.h b/src/Common/LRUResourceCache.h new file mode 100644 index 00000000000..e1a28e7ab60 --- /dev/null +++ b/src/Common/LRUResourceCache.h @@ -0,0 +1,392 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +template +struct TrivailLRUResourceCacheWeightFunction +{ + size_t operator()(const T &) const { return 1; } +}; + +/** + * Similar to implementation in LRUCache.h, but with the difference that keys can + * only be evicted when they are releasable. Release state is controlled by this implementation. + * get() and getOrSet() methods return a Holder to actual value, which does release() in destructor. + * + * Warning (!): This implementation is in development, not to be used. 
+ */ +template , + typename HashFunction = std::hash> +class LRUResourceCache +{ +public: + using Key = TKey; + using Mapped = TMapped; + using MappedPtr = std::shared_ptr; + + class MappedHolder + { + public: + MappedHolder(LRUResourceCache * cache_, const Key & key_, MappedPtr value_) + : cache(cache_), key(key_), val(value_) {} + + ~MappedHolder() { cache->release(key); } + + Mapped & value() { return *val; } + + protected: + LRUResourceCache * cache; + Key key; + MappedPtr val; + }; + + using MappedHolderPtr = std::unique_ptr; + + explicit LRUResourceCache(size_t max_weight_, size_t max_element_size_ = 0) + : max_weight(max_weight_), max_element_size(max_element_size_) {} + + MappedHolderPtr get(const Key & key) + { + auto mapped_ptr = getImpl(key); + if (!mapped_ptr) + return nullptr; + return std::make_unique(this, key, mapped_ptr); + } + + template + MappedHolderPtr getOrSet(const Key & key, LoadFunc && load_func) + { + auto mapped_ptr = getImpl(key, load_func); + if (!mapped_ptr) + return nullptr; + return std::make_unique(this, key, mapped_ptr); + } + + // If the key's reference_count = 0, delete it immediately. + // Otherwise, mark it expired (not visible to get()), and delete when refcount is 0. + void tryRemove(const Key & key) + { + std::lock_guard lock(mutex); + auto it = cells.find(key); + if (it == cells.end()) + return; + auto & cell = it->second; + if (cell.reference_count == 0) + { + queue.erase(cell.queue_iterator); + current_weight -= cell.weight; + cells.erase(it); + } + else + cell.expired = true; + } + + size_t weight() + { + std::lock_guard lock(mutex); + return current_weight; + } + + size_t size() + { + std::lock_guard lock(mutex); + return cells.size(); + } + + void getStats(size_t & out_hits, size_t & out_misses, size_t & out_evict_count) const + { + out_hits = hits; + out_misses = misses; + out_evict_count = evict_count; + } + +private: + mutable std::mutex mutex; + + using LRUQueue = std::list; + using LRUQueueIterator = typename LRUQueue::iterator; + + struct Cell + { + MappedPtr value; + size_t weight = 0; + LRUQueueIterator queue_iterator; + size_t reference_count = 0; + bool expired = false; + }; + + using Cells = std::unordered_map; + Cells cells; + LRUQueue queue; + size_t current_weight = 0; + size_t max_weight = 0; + size_t max_element_size = 0; + + /// Represents pending insertion attempt. + struct InsertToken + { + explicit InsertToken(LRUResourceCache & cache_) : cache(cache_) { } + + std::mutex mutex; + bool cleaned_up = false; /// Protected by the token mutex + MappedPtr value; /// Protected by the token mutex + + LRUResourceCache & cache; + size_t refcount = 0; /// Protected by the cache mutex + }; + + using InsertTokenById = std::unordered_map, HashFunction>; + + /// This class is responsible for removing used insert tokens from the insert_tokens map. + /// Among several concurrent threads the first successful one is responsible for removal. But if they all + /// fail, then the last one is responsible. 
+ struct InsertTokenHolder + { + const Key * key = nullptr; + std::shared_ptr token; + bool cleaned_up = false; + + InsertTokenHolder() = default; + + void + acquire(const Key * key_, const std::shared_ptr & token_, [[maybe_unused]] std::lock_guard & cache_lock) + { + key = key_; + token = token_; + ++token->refcount; + } + + void cleanup([[maybe_unused]] std::lock_guard & token_lock, [[maybe_unused]] std::lock_guard & cache_lock) + { + token->cache.insert_tokens.erase(*key); + token->cleaned_up = true; + cleaned_up = true; + } + + ~InsertTokenHolder() + { + if (!token) + return; + + if (cleaned_up) + return; + + std::lock_guard token_lock(token->mutex); + + if (token->cleaned_up) + return; + + std::lock_guard cache_lock(token->cache.mutex); + + --token->refcount; + if (token->refcount == 0) + cleanup(token_lock, cache_lock); + } + }; + + friend struct InsertTokenHolder; + InsertTokenById insert_tokens; + WeightFunction weight_function; + std::atomic hits{0}; + std::atomic misses{0}; + std::atomic evict_count{0}; + + /// Returns nullptr when there is no more space for the new value or the old value is in used. + template + MappedPtr getImpl(const Key & key, LoadFunc && load_func) + { + InsertTokenHolder token_holder; + { + std::lock_guard lock(mutex); + auto it = cells.find(key); + if (it != cells.end() && !it->second.expired) + { + if (!it->second.expired) + { + ++hits; + it->second.reference_count += 1; + queue.splice(queue.end(), queue, it->second.queue_iterator); + return it->second.value; + } + else if (it->second.reference_count > 0) + return nullptr; + else + { + // should not reach here + LOG_ERROR(&Poco::Logger::get("LRUResourceCache"), "element is in invalid status."); + abort(); + } + } + ++misses; + auto & token = insert_tokens[key]; + if (!token) + token = std::make_shared(*this); + token_holder.acquire(&key, token, lock); + } + + auto * token = token_holder.token.get(); + std::lock_guard token_lock(token->mutex); + token_holder.cleaned_up = token->cleaned_up; + + if (!token->value) + token->value = load_func(); + + std::lock_guard lock(mutex); + auto token_it = insert_tokens.find(key); + Cell * cell_ptr = nullptr; + if (token_it != insert_tokens.end() && token_it->second.get() == token) + { + cell_ptr = set(key, token->value); + } + else + { + auto cell_it = cells.find(key); + if (cell_it != cells.end() && !cell_it->second.expired) + { + cell_ptr = &cell_it->second; + } + } + + if (!token->cleaned_up) + token_holder.cleanup(token_lock, lock); + + if (cell_ptr) + { + queue.splice(queue.end(), queue, cell_ptr->queue_iterator); + cell_ptr->reference_count++; + return cell_ptr->value; + } + return nullptr; + } + + MappedPtr getImpl(const Key & key) + { + std::lock_guard lock(mutex); + + auto it = cells.find(key); + if (it == cells.end() || it->second.expired) + { + ++misses; + return nullptr; + } + + ++hits; + it->second.reference_count += 1; + queue.splice(queue.end(), queue, it->second.queue_iterator); + return it->second.value; + } + + // mark a reference is released + void release(const Key & key) + { + std::lock_guard lock(mutex); + + auto it = cells.find(key); + if (it == cells.end() || it->second.reference_count == 0) + { + LOG_ERROR(&Poco::Logger::get("LRUResourceCache"), "try to release an invalid element"); + abort(); + } + + auto & cell = it->second; + cell.reference_count -= 1; + if (cell.expired && cell.reference_count == 0) + { + queue.erase(cell.queue_iterator); + current_weight -= cell.weight; + cells.erase(it); + } + } + + InsertToken * acquireInsertToken(const 
Key & key) + { + auto & token = insert_tokens[key]; + token.reference_count += 1; + return &token; + } + + void releaseInsertToken(const Key & key) + { + auto it = insert_tokens.find(key); + if (it != insert_tokens.end()) + { + it->second.reference_count -= 1; + if (it->second.reference_count == 0) + insert_tokens.erase(it); + } + } + + // key mustn't be in the cache + Cell * set(const Key & insert_key, MappedPtr value) + { + auto weight = value ? weight_function(*value) : 0; + auto queue_size = cells.size() + 1; + auto loss_weight = 0; + + auto is_overflow = [&] { + return current_weight + weight - loss_weight > max_weight || (max_element_size != 0 && queue_size > max_element_size); + }; + + auto key_it = queue.begin(); + std::unordered_set to_release_keys; + + while (is_overflow() && queue_size > 1 && key_it != queue.end()) + { + const Key & key = *key_it; + + auto cell_it = cells.find(key); + if (cell_it == cells.end()) + { + LOG_ERROR(&Poco::Logger::get("LRUResourceCache"), "LRUResourceCache became inconsistent. There must be a bug in it."); + abort(); + } + + auto & cell = cell_it->second; + if (cell.reference_count == 0) + { + loss_weight += cell.weight; + queue_size -= 1; + to_release_keys.insert(key); + } + + ++key_it; + } + + if (is_overflow()) + return nullptr; + + if (loss_weight > current_weight + weight) + { + LOG_ERROR(&Poco::Logger::get("LRUResourceCache"), "LRUResourceCache became inconsistent. There must be a bug in it."); + abort(); + } + + for (auto & key : to_release_keys) + { + auto & cell = cells[key]; + queue.erase(cell.queue_iterator); + cells.erase(key); + ++evict_count; + } + + current_weight = current_weight + weight - loss_weight; + + auto & new_cell = cells[insert_key]; + new_cell.value = value; + new_cell.weight = weight; + new_cell.queue_iterator = queue.insert(queue.end(), insert_key); + return &new_cell; + } +}; +} diff --git a/src/Common/ProgressIndication.cpp b/src/Common/ProgressIndication.cpp index 33508f5ad5f..b9a8bc923f7 100644 --- a/src/Common/ProgressIndication.cpp +++ b/src/Common/ProgressIndication.cpp @@ -16,7 +16,7 @@ namespace { constexpr UInt64 ALL_THREADS = 0; - UInt64 calculateCoresNumber(DB::ThreadIdToTimeMap times, UInt64 elapsed) + double calculateCPUUsage(DB::ThreadIdToTimeMap times, UInt64 elapsed) { auto accumulated = std::accumulate(times.begin(), times.end(), 0, [](Int64 acc, const auto & elem) @@ -25,7 +25,7 @@ namespace return acc; return acc + elem.second.time(); }); - return (static_cast(accumulated) + elapsed - 1) / elapsed; + return static_cast(accumulated) / elapsed; } } @@ -53,7 +53,7 @@ void ProgressIndication::resetProgress() show_progress_bar = false; written_progress_chars = 0; write_progress_on_update = false; - host_active_cores.clear(); + host_cpu_usage.clear(); thread_data.clear(); } @@ -81,8 +81,7 @@ void ProgressIndication::updateThreadEventData(HostToThreadTimesMap & new_thread { for (auto & new_host_map : new_thread_data) { - auto new_cores = calculateCoresNumber(new_host_map.second, elapsed_time); - host_active_cores[new_host_map.first] = new_cores; + host_cpu_usage[new_host_map.first] = calculateCPUUsage(new_host_map.second, elapsed_time); thread_data[new_host_map.first] = std::move(new_host_map.second); } } @@ -96,13 +95,12 @@ size_t ProgressIndication::getUsedThreadsCount() const }); } -UInt64 ProgressIndication::getApproximateCoresNumber() const +double ProgressIndication::getCPUUsage() const { - return std::accumulate(host_active_cores.cbegin(), host_active_cores.cend(), 0, - [](UInt64 acc, auto const & 
elem) - { - return acc + elem.second; - }); + double res = 0; + for (const auto & elem : host_cpu_usage) + res += elem.second; + return res; } ProgressIndication::MemoryUsage ProgressIndication::getMemoryUsage() const @@ -116,6 +114,7 @@ ProgressIndication::MemoryUsage ProgressIndication::getMemoryUsage() const // memory consumption it's enough to look for data with thread id 0. if (auto it = host_data.second.find(ALL_THREADS); it != host_data.second.end()) host_usage = it->second.memory_usage; + return MemoryUsage{.total = acc.total + host_usage, .max = std::max(acc.max, host_usage)}; }); } @@ -183,27 +182,29 @@ void ProgressIndication::writeProgress() written_progress_chars = message.count() - prefix_size - (strlen(indicator) - 2); /// Don't count invisible output (escape sequences). - // If approximate cores number is known, display it. - auto cores_number = getApproximateCoresNumber(); + /// Display resource usage if possible. std::string profiling_msg; - if (cores_number != 0 && print_hardware_utilization) + + double cpu_usage = getCPUUsage(); + auto [memory_usage, max_host_usage] = getMemoryUsage(); + + if (cpu_usage > 0 || memory_usage > 0) { WriteBufferFromOwnString profiling_msg_builder; - // Calculated cores number may be not accurate - // so it's better to print min(threads, cores). - UInt64 threads_number = getUsedThreadsCount(); - profiling_msg_builder << " Running " << threads_number << " threads on " - << std::min(cores_number, threads_number) << " cores"; - auto [memory_usage, max_host_usage] = getMemoryUsage(); - if (memory_usage != 0) - profiling_msg_builder << " with " << formatReadableSizeWithDecimalSuffix(memory_usage) << " RAM used"; - if (thread_data.size() > 1 && max_host_usage) - profiling_msg_builder << " total (per host max: " << formatReadableSizeWithDecimalSuffix(max_host_usage) << ")"; - profiling_msg_builder << "."; + profiling_msg_builder << "(" << fmt::format("{:.1f}", cpu_usage) << " CPU"; + + if (memory_usage > 0) + profiling_msg_builder << ", " << formatReadableSizeWithDecimalSuffix(memory_usage) << " RAM"; + if (max_host_usage < memory_usage) + profiling_msg_builder << ", " << formatReadableSizeWithDecimalSuffix(max_host_usage) << " max/host"; + + profiling_msg_builder << ")"; profiling_msg = profiling_msg_builder.str(); } + int64_t remaining_space = static_cast(terminal_width) - written_progress_chars; + /// If the approximate number of rows to process is known, we can display a progress bar and percentage. if (progress.total_rows_to_read || progress.total_raw_bytes_to_read) { @@ -230,14 +231,35 @@ void ProgressIndication::writeProgress() if (show_progress_bar) { - ssize_t width_of_progress_bar = static_cast(terminal_width) - written_progress_chars - strlen(" 99%") - profiling_msg.length(); + /// We will display profiling info only if there is enough space for it. + int64_t width_of_progress_bar = remaining_space - strlen(" 99%"); + + /// We need at least twice the space, because it will be displayed either + /// at right after progress bar or at left on top of the progress bar. 
+ if (width_of_progress_bar <= 1 + 2 * static_cast(profiling_msg.size())) + profiling_msg.clear(); + else + width_of_progress_bar -= profiling_msg.size(); + if (width_of_progress_bar > 0) { - std::string bar - = UnicodeBar::render(UnicodeBar::getWidth(current_count, 0, max_count, width_of_progress_bar)); + size_t bar_width = UnicodeBar::getWidth(current_count, 0, max_count, width_of_progress_bar); + std::string bar = UnicodeBar::render(bar_width); + + /// Render profiling_msg at left on top of the progress bar. + bool render_profiling_msg_at_left = current_count * 2 >= max_count; + if (!profiling_msg.empty() && render_profiling_msg_at_left) + message << "\033[30;42m" << profiling_msg << "\033[0m"; + message << "\033[0;32m" << bar << "\033[0m"; - if (width_of_progress_bar > static_cast(bar.size() / UNICODE_BAR_CHAR_SIZE)) + + /// Whitespaces after the progress bar. + if (width_of_progress_bar > static_cast(bar.size() / UNICODE_BAR_CHAR_SIZE)) message << std::string(width_of_progress_bar - bar.size() / UNICODE_BAR_CHAR_SIZE, ' '); + + /// Render profiling_msg at right after the progress bar. + if (!profiling_msg.empty() && !render_profiling_msg_at_left) + message << "\033[2m" << profiling_msg << "\033[0m"; } } } @@ -245,8 +267,17 @@ void ProgressIndication::writeProgress() /// Underestimate percentage a bit to avoid displaying 100%. message << ' ' << (99 * current_count / max_count) << '%'; } + else + { + /// We can still display profiling info. + if (remaining_space >= static_cast(profiling_msg.size())) + { + if (remaining_space > static_cast(profiling_msg.size())) + message << std::string(remaining_space - profiling_msg.size(), ' '); + message << "\033[2m" << profiling_msg << "\033[0m"; + } + } - message << profiling_msg; message << CLEAR_TO_END_OF_LINE; ++increment; diff --git a/src/Common/ProgressIndication.h b/src/Common/ProgressIndication.h index b775279f73b..aad4a8c18e5 100644 --- a/src/Common/ProgressIndication.h +++ b/src/Common/ProgressIndication.h @@ -60,13 +60,10 @@ public: void updateThreadEventData(HostToThreadTimesMap & new_thread_data, UInt64 elapsed_time); - bool print_hardware_utilization = false; - private: - size_t getUsedThreadsCount() const; - UInt64 getApproximateCoresNumber() const; + double getCPUUsage() const; struct MemoryUsage { @@ -93,7 +90,7 @@ private: bool write_progress_on_update = false; - std::unordered_map host_active_cores; + std::unordered_map host_cpu_usage; HostToThreadTimesMap thread_data; }; diff --git a/src/Common/ShellCommand.cpp b/src/Common/ShellCommand.cpp index 99461862ef9..0093d72e766 100644 --- a/src/Common/ShellCommand.cpp +++ b/src/Common/ShellCommand.cpp @@ -65,14 +65,14 @@ ShellCommand::~ShellCommand() size_t try_wait_timeout = config.terminate_in_destructor_strategy.wait_for_normal_exit_before_termination_seconds; bool process_terminated_normally = tryWaitProcessWithTimeout(try_wait_timeout); - if (!process_terminated_normally) - { - LOG_TRACE(getLogger(), "Will kill shell command pid {} with SIGTERM", pid); + if (process_terminated_normally) + return; - int retcode = kill(pid, SIGTERM); - if (retcode != 0) - LOG_WARNING(getLogger(), "Cannot kill shell command pid {} errno '{}'", pid, errnoToString(retcode)); - } + LOG_TRACE(getLogger(), "Will kill shell command pid {} with SIGTERM", pid); + + int retcode = kill(pid, SIGTERM); + if (retcode != 0) + LOG_WARNING(getLogger(), "Cannot kill shell command pid {} errno '{}'", pid, errnoToString(retcode)); } else { @@ -91,7 +91,7 @@ bool ShellCommand::tryWaitProcessWithTimeout(size_t 
timeout_in_seconds) { int status = 0; - LOG_TRACE(getLogger(), "Try wait for shell command pid ({}) with timeout ({})", pid, timeout_in_seconds); + LOG_TRACE(getLogger(), "Try wait for shell command pid {} with timeout {}", pid, timeout_in_seconds); wait_called = true; struct timespec interval {.tv_sec = 1, .tv_nsec = 0}; @@ -119,7 +119,9 @@ bool ShellCommand::tryWaitProcessWithTimeout(size_t timeout_in_seconds) bool process_terminated_normally = (waitpid_res == pid); if (process_terminated_normally) + { return true; + } else if (waitpid_res == 0) { --timeout_in_seconds; @@ -128,7 +130,9 @@ bool ShellCommand::tryWaitProcessWithTimeout(size_t timeout_in_seconds) continue; } else if (waitpid_res == -1 && errno != EINTR) + { return false; + } } return false; diff --git a/src/Common/SymbolIndex.cpp b/src/Common/SymbolIndex.cpp index 568f633975b..32c1a15337c 100644 --- a/src/Common/SymbolIndex.cpp +++ b/src/Common/SymbolIndex.cpp @@ -86,7 +86,7 @@ namespace /// https://stackoverflow.com/questions/32088140/multiple-string-tables-in-elf-object -void updateResources(std::string_view name, const void * address, SymbolIndex::Resources & resources) +void updateResources(ElfW(Addr) base_address, std::string_view object_name, std::string_view name, const void * address, SymbolIndex::Resources & resources) { const char * char_address = static_cast(address); @@ -97,18 +97,23 @@ void updateResources(std::string_view name, const void * address, SymbolIndex::R name = name.substr((name[0] == '_') + strlen("binary_")); name = name.substr(0, name.size() - strlen("_start")); - resources.emplace(name, std::string_view{char_address, 0}); // NOLINT + resources.emplace(name, SymbolIndex::ResourcesBlob{ + base_address, + object_name, + std::string_view{char_address, 0}, // NOLINT + }); } else if (name.ends_with("_end")) { name = name.substr((name[0] == '_') + strlen("binary_")); name = name.substr(0, name.size() - strlen("_end")); - if (auto it = resources.find(name); it != resources.end() && it->second.empty()) + auto it = resources.find(name); + if (it != resources.end() && it->second.base_address == base_address && it->second.data.empty()) { - const char * start = it->second.data(); + const char * start = it->second.data.data(); assert(char_address >= start); - it->second = std::string_view{start, static_cast(char_address - start)}; + it->second.data = std::string_view{start, static_cast(char_address - start)}; } } } @@ -153,10 +158,12 @@ void collectSymbolsFromProgramHeaders( size_t sym_cnt = 0; for (const auto * it = dyn_begin; it->d_tag != DT_NULL; ++it) { + ElfW(Addr) base_address = correct_address(info->dlpi_addr, it->d_un.d_ptr); + // TODO: this branch leads to invalid address of the hash table. Need further investigation. 
// if (it->d_tag == DT_HASH) // { - // const ElfW(Word) * hash = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); + // const ElfW(Word) * hash = reinterpret_cast(base_address); // sym_cnt = hash[1]; // break; // } @@ -167,7 +174,7 @@ void collectSymbolsFromProgramHeaders( const uint32_t * buckets = nullptr; const uint32_t * hashval = nullptr; - const ElfW(Word) * hash = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); + const ElfW(Word) * hash = reinterpret_cast(base_address); buckets = hash + 4 + (hash[2] * sizeof(size_t) / 4); @@ -196,9 +203,11 @@ void collectSymbolsFromProgramHeaders( const char * strtab = nullptr; for (const auto * it = dyn_begin; it->d_tag != DT_NULL; ++it) { + ElfW(Addr) base_address = correct_address(info->dlpi_addr, it->d_un.d_ptr); + if (it->d_tag == DT_STRTAB) { - strtab = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); + strtab = reinterpret_cast(base_address); break; } } @@ -208,10 +217,12 @@ void collectSymbolsFromProgramHeaders( for (const auto * it = dyn_begin; it->d_tag != DT_NULL; ++it) { + ElfW(Addr) base_address = correct_address(info->dlpi_addr, it->d_un.d_ptr); + if (it->d_tag == DT_SYMTAB) { /* Get the pointer to the first entry of the symbol table */ - const ElfW(Sym) * elf_sym = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); + const ElfW(Sym) * elf_sym = reinterpret_cast(base_address); /* Iterate over the symbol table */ for (ElfW(Word) sym_index = 0; sym_index < ElfW(Word)(sym_cnt); ++sym_index) @@ -236,7 +247,7 @@ void collectSymbolsFromProgramHeaders( symbols.push_back(symbol); /// But resources can be represented by a pair of empty symbols (indicating their boundaries). - updateResources(symbol.name, symbol.address_begin, resources); + updateResources(base_address, info->dlpi_name, symbol.name, symbol.address_begin, resources); } break; @@ -299,7 +310,7 @@ void collectSymbolsFromELFSymbolTable( if (symbol_table_entry->st_size) symbols.push_back(symbol); - updateResources(symbol.name, symbol.address_begin, resources); + updateResources(info->dlpi_addr, info->dlpi_name, symbol.name, symbol.address_begin, resources); } } diff --git a/src/Common/SymbolIndex.h b/src/Common/SymbolIndex.h index 7c542980099..1331cf81cf7 100644 --- a/src/Common/SymbolIndex.h +++ b/src/Common/SymbolIndex.h @@ -51,7 +51,7 @@ public: std::string_view getResource(String name) const { if (auto it = data.resources.find(name); it != data.resources.end()) - return it->second; + return it->second.data; return {}; } @@ -59,7 +59,17 @@ public: String getBuildID() const { return data.build_id; } String getBuildIDHex() const; - using Resources = std::unordered_map; + struct ResourcesBlob + { + /// Symbol can be presented in multiple shared objects, + /// base_address will be used to compare only symbols from the same SO. + ElfW(Addr) base_address; + /// Just a human name of the SO. + std::string_view object_name; + /// Data blob. + std::string_view data; + }; + using Resources = std::unordered_map; struct Data { diff --git a/src/Common/Throttler.cpp b/src/Common/Throttler.cpp index f02001e338a..95baf40f2c0 100644 --- a/src/Common/Throttler.cpp +++ b/src/Common/Throttler.cpp @@ -23,7 +23,7 @@ static constexpr auto NS = 1000000000UL; /// Tracking window. Actually the size is not really important. We just want to avoid /// throttles when there are no actions for a long period time. 
-static const double window_ns = 1UL * NS; +static const double window_ns = 1ULL * NS; void Throttler::add(size_t amount) { diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index f05a10b8815..c8753c8edaf 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -26,6 +26,7 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int NOT_IMPLEMENTED; + extern const int BAD_ARGUMENTS; } } @@ -1133,4 +1134,54 @@ Coordination::RequestPtr makeCheckRequest(const std::string & path, int version) return request; } +std::string normalizeZooKeeperPath(std::string zookeeper_path, bool check_starts_with_slash, Poco::Logger * log) +{ + if (!zookeeper_path.empty() && zookeeper_path.back() == '/') + zookeeper_path.resize(zookeeper_path.size() - 1); + /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. + if (!zookeeper_path.empty() && zookeeper_path.front() != '/') + { + /// Do not allow this for new tables, print warning for tables created in old versions + if (check_starts_with_slash) + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "ZooKeeper path must starts with '/', got '{}'", zookeeper_path); + if (log) + LOG_WARNING(log, "ZooKeeper path ('{}') does not start with '/'. It will not be supported in future releases"); + zookeeper_path = "/" + zookeeper_path; + } + + return zookeeper_path; +} + +String extractZooKeeperName(const String & path) +{ + static constexpr auto default_zookeeper_name = "default"; + if (path.empty()) + throw DB::Exception("ZooKeeper path should not be empty", DB::ErrorCodes::BAD_ARGUMENTS); + if (path[0] == '/') + return default_zookeeper_name; + auto pos = path.find(":/"); + if (pos != String::npos && pos < path.find('/')) + { + auto zookeeper_name = path.substr(0, pos); + if (zookeeper_name.empty()) + throw DB::Exception("Zookeeper path should start with '/' or ':/'", DB::ErrorCodes::BAD_ARGUMENTS); + return zookeeper_name; + } + return default_zookeeper_name; +} + +String extractZooKeeperPath(const String & path, bool check_starts_with_slash, Poco::Logger * log) +{ + if (path.empty()) + throw DB::Exception("ZooKeeper path should not be empty", DB::ErrorCodes::BAD_ARGUMENTS); + if (path[0] == '/') + return normalizeZooKeeperPath(path, check_starts_with_slash, log); + auto pos = path.find(":/"); + if (pos != String::npos && pos < path.find('/')) + { + return normalizeZooKeeperPath(path.substr(pos + 1, String::npos), check_starts_with_slash, log); + } + return normalizeZooKeeperPath(path, check_starts_with_slash, log); +} + } diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index 8e015b1f331..371f93f6df3 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -379,4 +379,11 @@ private: }; using EphemeralNodeHolderPtr = EphemeralNodeHolder::Ptr; + +String normalizeZooKeeperPath(std::string zookeeper_path, bool check_starts_with_slash, Poco::Logger * log = nullptr); + +String extractZooKeeperName(const String & path); + +String extractZooKeeperPath(const String & path, bool check_starts_with_slash, Poco::Logger * log = nullptr); + } diff --git a/src/Common/tests/gtest_config_helper.cpp b/src/Common/tests/gtest_config_helper.cpp new file mode 100644 index 00000000000..59a6cfa0ae0 --- /dev/null +++ b/src/Common/tests/gtest_config_helper.cpp @@ -0,0 +1,37 @@ +#include +#include +#include +#include + +#include + + +using namespace DB; + +TEST(Common, ConfigHelperGetBool) +{ + 
std::string xml(R"CONFIG( + 0 + 1 + Yes + + + + 1 + 1 + Yes1 +)CONFIG"); + + Poco::XML::DOMParser dom_parser; + Poco::AutoPtr document = dom_parser.parseString(xml); + Poco::AutoPtr config = new Poco::Util::XMLConfiguration(document); + EXPECT_EQ(ConfigHelper::getBool(*config, "zero_as_false", false, true), false); + EXPECT_EQ(ConfigHelper::getBool(*config, "one_as_true", false, true), true); + EXPECT_EQ(ConfigHelper::getBool(*config, "yes_as_true", false, true), true); + EXPECT_EQ(ConfigHelper::getBool(*config, "empty_as_true_1", false, true), true); + EXPECT_EQ(ConfigHelper::getBool(*config, "empty_as_true_2", false, true), true); + ASSERT_THROW(ConfigHelper::getBool(*config, "has_empty_child_1", false, true), Poco::Exception); + EXPECT_EQ(ConfigHelper::getBool(*config, "has_empty_child_2", false, true), true); + EXPECT_EQ(ConfigHelper::getBool(*config, "has_child_1", false, true), true); + ASSERT_THROW(ConfigHelper::getBool(*config, "has_child_2", false, true), Poco::Exception); +} diff --git a/src/Common/tests/gtest_global_context.cpp b/src/Common/tests/gtest_global_context.cpp new file mode 100644 index 00000000000..19ba3cdc269 --- /dev/null +++ b/src/Common/tests/gtest_global_context.cpp @@ -0,0 +1,7 @@ +#include "gtest_global_context.h" + +const ContextHolder & getContext() +{ + static ContextHolder holder; + return holder; +} diff --git a/src/Common/tests/gtest_global_context.h b/src/Common/tests/gtest_global_context.h index 9bd7c2490d6..7756be7ce9b 100644 --- a/src/Common/tests/gtest_global_context.h +++ b/src/Common/tests/gtest_global_context.h @@ -18,8 +18,4 @@ struct ContextHolder ContextHolder(ContextHolder &&) = default; }; -inline const ContextHolder & getContext() -{ - static ContextHolder holder; - return holder; -} +const ContextHolder & getContext(); diff --git a/src/Common/tests/gtest_lru_cache.cpp b/src/Common/tests/gtest_lru_cache.cpp new file mode 100644 index 00000000000..7694a76ea72 --- /dev/null +++ b/src/Common/tests/gtest_lru_cache.cpp @@ -0,0 +1,97 @@ +#include +#include +#include +#include + +TEST(LRUCache, set) +{ + using SimpleLRUCache = DB::LRUCache; + auto lru_cache = SimpleLRUCache(10, 10); + lru_cache.set(1, std::make_shared(2)); + lru_cache.set(2, std::make_shared(3)); + + auto w = lru_cache.weight(); + auto n = lru_cache.count(); + ASSERT_EQ(w, 2); + ASSERT_EQ(n, 2); +} + +TEST(LRUCache, update) +{ + using SimpleLRUCache = DB::LRUCache; + auto lru_cache = SimpleLRUCache(10, 10); + lru_cache.set(1, std::make_shared(2)); + lru_cache.set(1, std::make_shared(3)); + auto val = lru_cache.get(1); + ASSERT_TRUE(val != nullptr); + ASSERT_TRUE(*val == 3); +} + +TEST(LRUCache, get) +{ + using SimpleLRUCache = DB::LRUCache; + auto lru_cache = SimpleLRUCache(10, 10); + lru_cache.set(1, std::make_shared(2)); + lru_cache.set(2, std::make_shared(3)); + SimpleLRUCache::MappedPtr value = lru_cache.get(1); + ASSERT_TRUE(value != nullptr); + ASSERT_EQ(*value, 2); + + value = lru_cache.get(2); + ASSERT_TRUE(value != nullptr); + ASSERT_EQ(*value, 3); +} + +struct ValueWeight +{ + size_t operator()(const size_t & x) const { return x; } +}; + +TEST(LRUCache, evictOnSize) +{ + using SimpleLRUCache = DB::LRUCache; + auto lru_cache = SimpleLRUCache(20, 3); + lru_cache.set(1, std::make_shared(2)); + lru_cache.set(2, std::make_shared(3)); + lru_cache.set(3, std::make_shared(4)); + lru_cache.set(4, std::make_shared(5)); + + auto n = lru_cache.count(); + ASSERT_EQ(n, 3); + + auto value = lru_cache.get(1); + ASSERT_TRUE(value == nullptr); +} + +TEST(LRUCache, evictOnWeight) +{ + 
using SimpleLRUCache = DB::LRUCache, ValueWeight>; + auto lru_cache = SimpleLRUCache(10, 10); + lru_cache.set(1, std::make_shared(2)); + lru_cache.set(2, std::make_shared(3)); + lru_cache.set(3, std::make_shared(4)); + lru_cache.set(4, std::make_shared(5)); + + auto n = lru_cache.count(); + ASSERT_EQ(n, 2); + + auto w = lru_cache.weight(); + ASSERT_EQ(w, 9); + + auto value = lru_cache.get(1); + ASSERT_TRUE(value == nullptr); + value = lru_cache.get(2); + ASSERT_TRUE(value == nullptr); +} + +TEST(LRUCache, getOrSet) +{ + using SimpleLRUCache = DB::LRUCache, ValueWeight>; + auto lru_cache = SimpleLRUCache(10, 10); + size_t x = 10; + auto load_func = [&] { return std::make_shared(x); }; + auto [value, loaded] = lru_cache.getOrSet(1, load_func); + ASSERT_TRUE(value != nullptr); + ASSERT_TRUE(*value == 10); +} + diff --git a/src/Common/tests/gtest_lru_resource_cache.cpp b/src/Common/tests/gtest_lru_resource_cache.cpp new file mode 100644 index 00000000000..f88eded531e --- /dev/null +++ b/src/Common/tests/gtest_lru_resource_cache.cpp @@ -0,0 +1,270 @@ +#include +#include +#include +#include + +TEST(LRUResourceCache, get) +{ + using MyCache = DB::LRUResourceCache; + auto mcache = MyCache(10, 10); + int x = 10; + auto load_int = [&] { return std::make_shared(x); }; + auto holder1 = mcache.getOrSet(1, load_int); + x = 11; + auto holder2 = mcache.getOrSet(2, load_int); + ASSERT_TRUE(holder2 != nullptr); + ASSERT_TRUE(holder2->value() == 11); + + auto holder3 = mcache.get(1); + ASSERT_TRUE(holder3 != nullptr); + ASSERT_TRUE(holder3->value() == 10); +} + +TEST(LRUResourceCache, remove) +{ + using MyCache = DB::LRUResourceCache; + auto mcache = MyCache(10, 10); + int x = 10; + auto load_int = [&] { return std::make_shared(x); }; + auto holder0 = mcache.getOrSet(1, load_int); + auto holder1 = mcache.getOrSet(1, load_int); + + mcache.tryRemove(1); + holder0 = mcache.get(1); + ASSERT_TRUE(holder0 == nullptr); + auto n = mcache.size(); + ASSERT_TRUE(n == 1); + + holder0.reset(); + holder1.reset(); + n = mcache.size(); + ASSERT_TRUE(n == 0); +} + +struct MyWeight +{ + size_t operator()(const int & x) const { return static_cast(x); } +}; + +TEST(LRUResourceCache, evictOnWweight) +{ + using MyCache = DB::LRUResourceCache; + auto mcache = MyCache(5, 10); + int x = 2; + auto load_int = [&] { return std::make_shared(x); }; + auto holder1 = mcache.getOrSet(1, load_int); + holder1.reset(); + + auto holder2 = mcache.getOrSet(2, load_int); + holder2.reset(); + + x = 3; + auto holder3 = mcache.getOrSet(3, load_int); + ASSERT_TRUE(holder3 != nullptr); + + auto w = mcache.weight(); + ASSERT_EQ(w, 5); + auto n = mcache.size(); + ASSERT_EQ(n, 2); + + holder1 = mcache.get(1); + ASSERT_TRUE(holder1 == nullptr); + holder2 = mcache.get(2); + ASSERT_TRUE(holder2 != nullptr); + holder3 = mcache.get(3); + ASSERT_TRUE(holder3 != nullptr); +} + +TEST(LRUResourceCache, evictOnWeightV2) +{ + using MyCache = DB::LRUResourceCache; + auto mcache = MyCache(5, 10); + int x = 2; + auto load_int = [&] { return std::make_shared(x); }; + auto holder1 = mcache.getOrSet(1, load_int); + holder1.reset(); + + auto holder2 = mcache.getOrSet(2, load_int); + holder2.reset(); + + holder1 = mcache.get(1); + holder1.reset(); + + x = 3; + auto holder3 = mcache.getOrSet(3, load_int); + ASSERT_TRUE(holder3 != nullptr); + + auto w = mcache.weight(); + ASSERT_EQ(w, 5); + auto n = mcache.size(); + ASSERT_EQ(n, 2); + + holder1 = mcache.get(1); + ASSERT_TRUE(holder1 != nullptr); + holder2 = mcache.get(2); + ASSERT_TRUE(holder2 == nullptr); + holder3 = 
mcache.get(3); + ASSERT_TRUE(holder3 != nullptr); +} + +TEST(LRUResourceCache, evictOnWeightV3) +{ + using MyCache = DB::LRUResourceCache; + auto mcache = MyCache(5, 10); + int x = 2; + auto load_int = [&] { return std::make_shared(x); }; + auto holder1 = mcache.getOrSet(1, load_int); + holder1.reset(); + + auto holder2 = mcache.getOrSet(2, load_int); + holder2.reset(); + + holder1 = mcache.getOrSet(1, load_int); + holder1.reset(); + + x = 3; + auto holder3 = mcache.getOrSet(3, load_int); + ASSERT_TRUE(holder3 != nullptr); + + auto w = mcache.weight(); + ASSERT_EQ(w, 5); + auto n = mcache.size(); + ASSERT_EQ(n, 2); + + holder1 = mcache.get(1); + ASSERT_TRUE(holder1 != nullptr); + holder2 = mcache.get(2); + ASSERT_TRUE(holder2 == nullptr); + holder3 = mcache.get(3); + ASSERT_TRUE(holder3 != nullptr); +} + +TEST(LRUResourceCache, evictOnSize) +{ + using MyCache = DB::LRUResourceCache; + auto mcache = MyCache(5, 2); + int x = 2; + auto load_int = [&] { return std::make_shared(x); }; + auto holder1 = mcache.getOrSet(1, load_int); + holder1.reset(); + + auto holder2 = mcache.getOrSet(2, load_int); + holder2.reset(); + + x = 3; + auto holder3 = mcache.getOrSet(3, load_int); + ASSERT_TRUE(holder3 != nullptr); + + auto n = mcache.size(); + ASSERT_EQ(n, 2); + auto w = mcache.weight(); + ASSERT_EQ(w, 2); + + holder1 = mcache.get(1); + ASSERT_TRUE(holder1 == nullptr); + holder2 = mcache.get(2); + ASSERT_TRUE(holder2 != nullptr); + holder3 = mcache.get(3); + ASSERT_TRUE(holder3 != nullptr); +} + +TEST(LRUResourceCache, notEvictUsedElement) +{ + using MyCache = DB::LRUResourceCache; + auto mcache = MyCache(7, 10); + int x = 2; + auto load_int = [&] { return std::make_shared(x); }; + auto holder1 = mcache.getOrSet(1, load_int); + + auto holder2 = mcache.getOrSet(2, load_int); + holder2.reset(); + + auto holder3 = mcache.getOrSet(3, load_int); + holder3.reset(); + + x = 3; + auto holder4 = mcache.getOrSet(4, load_int); + ASSERT_TRUE(holder4 != nullptr); + + auto n = mcache.size(); + ASSERT_EQ(n, 3); + auto w = mcache.weight(); + ASSERT_EQ(w, 7); + + holder1 = mcache.get(1); + ASSERT_TRUE(holder1 != nullptr); + holder2 = mcache.get(2); + ASSERT_TRUE(holder2 == nullptr); + holder3 = mcache.get(3); + ASSERT_TRUE(holder3 != nullptr); + holder4 = mcache.get(4); + ASSERT_TRUE(holder4 != nullptr); +} + +TEST(LRUResourceCache, getFail) +{ + using MyCache = DB::LRUResourceCache; + auto mcache = MyCache(5, 10); + int x = 2; + auto load_int = [&] { return std::make_shared(x); }; + auto holder1 = mcache.getOrSet(1, load_int); + auto holder2 = mcache.getOrSet(2, load_int); + auto holder3 = mcache.getOrSet(3, load_int); + ASSERT_TRUE(holder3 == nullptr); + + auto n = mcache.size(); + ASSERT_EQ(n, 2); + auto w = mcache.weight(); + ASSERT_EQ(w, 4); + holder1 = mcache.get(1); + ASSERT_TRUE(holder1 != nullptr); + holder2 = mcache.get(2); + ASSERT_TRUE(holder2 != nullptr); + holder3 = mcache.get(3); + ASSERT_TRUE(holder3 == nullptr); +} + +TEST(LRUResourceCache, dupGet) +{ + using MyCache = DB::LRUResourceCache; + auto mcache = MyCache(20, 10); + int x = 2; + auto load_int = [&] { return std::make_shared(x); }; + auto holder1 = mcache.getOrSet(1, load_int); + holder1.reset(); + x = 11; + holder1 = mcache.getOrSet(1, load_int); + ASSERT_TRUE(holder1 != nullptr); + + auto n = mcache.size(); + ASSERT_EQ(n, 1); + auto w = mcache.weight(); + ASSERT_EQ(w, 2); + holder1 = mcache.get(1); + ASSERT_TRUE(holder1 != nullptr); + ASSERT_TRUE(holder1->value() == 2); +} + +TEST(LRUResourceCache, reGet) +{ + using MyCache = 
DB::LRUResourceCache; + auto mcache = MyCache(20, 10); + int x = 2; + auto load_int = [&] { return std::make_shared(x); }; + auto holder1 = mcache.getOrSet(1, load_int); + mcache.tryRemove(1); + + x = 11; + holder1.reset(); + holder1 = mcache.getOrSet(1, load_int); + ASSERT_TRUE(holder1 != nullptr); + + auto n = mcache.size(); + ASSERT_EQ(n, 1); + auto w = mcache.weight(); + ASSERT_EQ(w, 11); + holder1 = mcache.get(1); + ASSERT_TRUE(holder1 != nullptr); + ASSERT_TRUE(holder1->value() == 11); +} + diff --git a/src/Coordination/ACLMap.cpp b/src/Coordination/ACLMap.cpp index 863dfdec281..41b759531cf 100644 --- a/src/Coordination/ACLMap.cpp +++ b/src/Coordination/ACLMap.cpp @@ -42,11 +42,14 @@ bool ACLMap::ACLsComparator::operator()(const Coordination::ACLs & left, const C uint64_t ACLMap::convertACLs(const Coordination::ACLs & acls) { + if (acls.empty()) + return 0; + if (acl_to_num.count(acls)) return acl_to_num[acls]; /// Start from one - auto index = acl_to_num.size() + 1; + auto index = max_acl_id++; acl_to_num[acls] = index; num_to_acl[index] = acls; @@ -69,6 +72,7 @@ void ACLMap::addMapping(uint64_t acls_id, const Coordination::ACLs & acls) { num_to_acl[acls_id] = acls; acl_to_num[acls] = acls_id; + max_acl_id = std::max(acls_id + 1, max_acl_id); /// max_acl_id pointer next slot } void ACLMap::addUsage(uint64_t acl_id) diff --git a/src/Coordination/ACLMap.h b/src/Coordination/ACLMap.h index 2313b3e7cd3..e1b2ce1eff6 100644 --- a/src/Coordination/ACLMap.h +++ b/src/Coordination/ACLMap.h @@ -31,6 +31,7 @@ private: ACLToNumMap acl_to_num; NumToACLMap num_to_acl; UsageCounter usage_counter; + uint64_t max_acl_id{1}; public: /// Convert ACL to number. If it's new ACL than adds it to map @@ -43,7 +44,7 @@ public: /// Mapping from numbers to ACLs vectors. Used during serialization. const NumToACLMap & getMapping() const { return num_to_acl; } - /// Add mapping to ACLMap. Used during deserialization. + /// Add mapping to ACLMap. Used during deserialization from snapshot. void addMapping(uint64_t acls_id, const Coordination::ACLs & acls); /// Add/remove usage of some id. Used to remove unused ACLs. 
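Editorial note on the ACLMap change above: allocating new ACL ids as `acl_to_num.size() + 1` can hand out an id that already exists once `addMapping` has restored sparse ids from a snapshot, which is why the diff switches to a monotonically increasing `max_acl_id`. The following standalone sketch illustrates the collision with a simplified allocator; the member names mirror the diff, but the `IdAllocator` struct, the string key, and the `main` harness are illustrative assumptions, not ClickHouse code.

#include <cassert>
#include <cstdint>
#include <map>
#include <string>

/// Simplified stand-in for ACLMap's id bookkeeping, keyed by a string instead of Coordination::ACLs.
struct IdAllocator
{
    std::map<std::string, uint64_t> acl_to_num;
    uint64_t max_acl_id = 1;

    /// Mirrors ACLMap::addMapping: invoked while deserializing a snapshot, so ids may be sparse.
    void addMapping(const std::string & acls, uint64_t id)
    {
        acl_to_num[acls] = id;
        max_acl_id = std::max(id + 1, max_acl_id); /// keep the counter ahead of every restored id
    }

    /// Pre-fix behaviour: the next id is derived from the current map size.
    uint64_t convertOld(const std::string & acls)
    {
        if (auto it = acl_to_num.find(acls); it != acl_to_num.end())
            return it->second;
        uint64_t id = acl_to_num.size() + 1;
        acl_to_num[acls] = id;
        return id;
    }

    /// Post-fix behaviour: the next id is taken from the monotonic counter.
    uint64_t convertNew(const std::string & acls)
    {
        if (auto it = acl_to_num.find(acls); it != acl_to_num.end())
            return it->second;
        uint64_t id = max_acl_id++;
        acl_to_num[acls] = id;
        return id;
    }
};

int main()
{
    IdAllocator old_scheme;
    old_scheme.addMapping("acl_a", 3);
    old_scheme.addMapping("acl_b", 5);
    /// size() == 2, so the size-based scheme hands out id 3, which is already taken by "acl_a".
    assert(old_scheme.convertOld("acl_c") == 3);

    IdAllocator fixed;
    fixed.addMapping("acl_a", 3);
    fixed.addMapping("acl_b", 5);
    /// addMapping pushed max_acl_id to 6, so a collision with a restored id is impossible.
    assert(fixed.convertNew("acl_c") == 6);
}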
diff --git a/src/Coordination/CoordinationSettings.cpp b/src/Coordination/CoordinationSettings.cpp index 2d2ae4409a3..b93420133fa 100644 --- a/src/Coordination/CoordinationSettings.cpp +++ b/src/Coordination/CoordinationSettings.cpp @@ -41,6 +41,7 @@ const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "con KeeperConfigurationAndSettings::KeeperConfigurationAndSettings() : server_id(NOT_EXIST) + , enable_ipv6(true) , tcp_port(NOT_EXIST) , tcp_port_secure(NOT_EXIST) , standalone_keeper(false) @@ -67,6 +68,9 @@ void KeeperConfigurationAndSettings::dump(WriteBufferFromOwnString & buf) const writeText("server_id=", buf); write_int(server_id); + writeText("enable_ipv6=", buf); + write_bool(enable_ipv6); + if (tcp_port != NOT_EXIST) { writeText("tcp_port=", buf); @@ -158,6 +162,8 @@ KeeperConfigurationAndSettings::loadFromConfig(const Poco::Util::AbstractConfigu ret->server_id = config.getInt("keeper_server.server_id"); ret->standalone_keeper = standalone_keeper_; + ret->enable_ipv6 = config.getBool("keeper_server.enable_ipv6", true); + if (config.has("keeper_server.tcp_port")) { ret->tcp_port = config.getInt("keeper_server.tcp_port"); diff --git a/src/Coordination/CoordinationSettings.h b/src/Coordination/CoordinationSettings.h index 4159c2ad994..38a04043b38 100644 --- a/src/Coordination/CoordinationSettings.h +++ b/src/Coordination/CoordinationSettings.h @@ -64,6 +64,7 @@ struct KeeperConfigurationAndSettings KeeperConfigurationAndSettings(); int server_id; + bool enable_ipv6; int tcp_port; int tcp_port_secure; diff --git a/src/Coordination/KeeperDispatcher.cpp b/src/Coordination/KeeperDispatcher.cpp index 438e337b64f..8423f10f3a6 100644 --- a/src/Coordination/KeeperDispatcher.cpp +++ b/src/Coordination/KeeperDispatcher.cpp @@ -276,7 +276,7 @@ void KeeperDispatcher::initialize(const Poco::Util::AbstractConfiguration & conf try { LOG_DEBUG(log, "Waiting server to initialize"); - server->startup(); + server->startup(configuration_and_settings->enable_ipv6); LOG_DEBUG(log, "Server initialized, waiting for quorum"); if (!start_async) diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 82ea100bccb..25d57e64e0a 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -107,7 +107,7 @@ KeeperServer::KeeperServer( LOG_WARNING(log, "Quorum reads enabled, Keeper will work slower."); } -void KeeperServer::startup() +void KeeperServer::startup(bool enable_ipv6) { state_machine->init(); @@ -171,13 +171,14 @@ void KeeperServer::startup() #endif } - launchRaftServer(params, asio_opts); + launchRaftServer(enable_ipv6, params, asio_opts); if (!raft_instance) throw Exception(ErrorCodes::RAFT_ERROR, "Cannot allocate RAFT instance"); } void KeeperServer::launchRaftServer( + bool enable_ipv6, const nuraft::raft_params & params, const nuraft::asio_service::options & asio_opts) { @@ -192,7 +193,7 @@ void KeeperServer::launchRaftServer( nuraft::ptr logger = nuraft::cs_new("RaftInstance", coordination_settings->raft_logs_level); asio_service = nuraft::cs_new(asio_opts, logger); - asio_listener = asio_service->create_rpc_listener(state_manager->getPort(), logger); + asio_listener = asio_service->create_rpc_listener(state_manager->getPort(), logger, enable_ipv6); if (!asio_listener) return; diff --git a/src/Coordination/KeeperServer.h b/src/Coordination/KeeperServer.h index 376fe111f15..1fb02bb0987 100644 --- a/src/Coordination/KeeperServer.h +++ b/src/Coordination/KeeperServer.h @@ -44,6 +44,7 @@ private: /// Almost copy-paste 
from nuraft::launcher, but with separated server init and start /// Allows to avoid race conditions. void launchRaftServer( + bool enable_ipv6, const nuraft::raft_params & params, const nuraft::asio_service::options & asio_opts); @@ -57,7 +58,7 @@ public: SnapshotsQueue & snapshots_queue_); /// Load state machine from the latest snapshot and load log storage. Start NuRaft with required settings. - void startup(); + void startup(bool enable_ipv6 = true); /// Put local read request and execute in state machine directly and response into /// responses queue diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index a770451a733..a64a7d425f6 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -91,8 +91,7 @@ static bool checkACL(int32_t permission, const Coordination::ACLs & node_acls, c static bool fixupACL( const std::vector & request_acls, const std::vector & current_ids, - std::vector & result_acls, - bool hash_acls) + std::vector & result_acls) { if (request_acls.empty()) return true; @@ -125,8 +124,6 @@ static bool fixupACL( return false; valid_found = true; - if (hash_acls) - new_acl.id = generateDigest(new_acl.id); result_acls.push_back(new_acl); } } @@ -310,7 +307,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr KeeperStorage::Node created_node; Coordination::ACLs node_acls; - if (!fixupACL(request.acls, session_auth_ids, node_acls, !request.restored_from_zookeeper_log)) + if (!fixupACL(request.acls, session_auth_ids, node_acls)) { response.error = Coordination::Error::ZINVALIDACL; return {response_ptr, {}}; @@ -778,7 +775,7 @@ struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr auto & session_auth_ids = storage.session_and_auth[session_id]; Coordination::ACLs node_acls; - if (!fixupACL(request.acls, session_auth_ids, node_acls, !request.restored_from_zookeeper_log)) + if (!fixupACL(request.acls, session_auth_ids, node_acls)) { response.error = Coordination::Error::ZINVALIDACL; return {response_ptr, {}}; diff --git a/src/Core/ExternalTable.cpp b/src/Core/ExternalTable.cpp index b4adbcc0662..3b515fab5c9 100644 --- a/src/Core/ExternalTable.cpp +++ b/src/Core/ExternalTable.cpp @@ -169,7 +169,7 @@ void ExternalTablesHandler::handlePart(const Poco::Net::MessageHeader & header, processors.push_back(std::move(sink)); processors.push_back(std::move(exception_handling)); - auto executor = std::make_shared(processors); + auto executor = std::make_shared(processors, getContext()->getProcessListElement()); executor->execute(/*num_threads = */ 1); /// We are ready to receive the next file, for this we clear all the information received diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 70fb5604997..6e53fa4342c 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -46,7 +46,6 @@ class IColumn; M(UInt64, max_insert_threads, 0, "The maximum number of threads to execute the INSERT SELECT query. Values 0 or 1 means that INSERT SELECT is not run in parallel. Higher values will lead to higher memory usage. Parallel INSERT SELECT has effect only if the SELECT part is run on parallel, see 'max_threads' setting.", 0) \ M(UInt64, max_final_threads, 16, "The maximum number of threads to read from table with FINAL.", 0) \ M(MaxThreads, max_threads, 0, "The maximum number of threads to execute the request. By default, it is determined automatically.", 0) \ - M(MaxThreads, max_alter_threads, 0, "The maximum number of threads to execute the ALTER requests. 
By default, it is determined automatically.", 0) \ M(UInt64, max_read_buffer_size, DBMS_DEFAULT_BUFFER_SIZE, "The maximum size of the buffer to read from the filesystem.", 0) \ M(UInt64, max_distributed_connections, 1024, "The maximum number of connections for distributed processing of one query (should be greater than max_threads).", 0) \ M(UInt64, max_query_size, DBMS_DEFAULT_MAX_QUERY_SIZE, "Which part of the query can be read into RAM for parsing (the remaining data for INSERT, if any, is read later)", 0) \ @@ -572,7 +571,7 @@ class IColumn; MAKE_OBSOLETE(M, UInt64, merge_tree_clear_old_temporary_directories_interval_seconds, 60) \ MAKE_OBSOLETE(M, UInt64, merge_tree_clear_old_parts_interval_seconds, 1) \ MAKE_OBSOLETE(M, UInt64, partial_merge_join_optimizations, 0) \ - + MAKE_OBSOLETE(M, MaxThreads, max_alter_threads, 0) \ /** The section above is for obsolete settings. Do not add anything there. */ @@ -597,6 +596,8 @@ class IColumn; M(Int64, input_format_orc_row_batch_size, 100'000, "Batch size when reading ORC stripes.", 0) \ M(Bool, input_format_parquet_import_nested, false, "Allow to insert array of structs into Nested table in Parquet input format.", 0) \ M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \ + M(UInt64, input_format_msgpack_number_of_columns, 0, "The number of columns in inserted MsgPack data. Used for automatic schema inference from data.", 0) \ + M(UInt64, input_format_max_rows_to_read_for_schema_inference, 100, "The maximum rows of data to read for automatic schema inference", 0) \ \ M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic' and 'best_effort'.", 0) \ M(DateTimeOutputFormat, date_time_output_format, FormatSettings::DateTimeOutputFormat::Simple, "Method to write DateTime to text output. Possible values: 'simple', 'iso', 'unix_timestamp'.", 0) \ @@ -662,6 +663,7 @@ class IColumn; M(Bool, output_format_arrow_low_cardinality_as_dictionary, false, "Enable output LowCardinality type as Dictionary Arrow type", 0) \ \ M(EnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::EnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0)\ + // End of FORMAT_FACTORY_SETTINGS // Please add settings non-related to formats into the COMMON_SETTINGS above. 
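Editorial note: many of the DateLUTImpl.h hunks earlier in this diff route Date arithmetic through a branchless saturating subtraction (saturateMinus) so that DayNum results clamp at 0 instead of wrapping, in line with the "Common compiler should generate branchless code for this saturation operation" comments. Below is a minimal standalone sketch of that trick; only saturateMinus itself mirrors the diff, while the reference function and the test harness are illustrative assumptions.

#include <cassert>
#include <cstdint>

using UInt32 = uint32_t;
using Int32 = int32_t;

/// Mirrors DateLUTImpl::saturateMinus from the diff. If y > x the unsigned subtraction wraps,
/// so res > x and -Int32(res <= x) is 0: the mask clears the result to 0. When there is no
/// wrap-around the mask is all ones and res is returned unchanged, with no branch needed.
static UInt32 saturateMinus(UInt32 x, UInt32 y)
{
    UInt32 res = x - y;
    res &= -Int32(res <= x);
    return res;
}

/// Naive branching reference for the same operation.
static UInt32 saturateMinusReference(UInt32 x, UInt32 y)
{
    return y > x ? 0 : x - y;
}

int main()
{
    assert(saturateMinus(10, 3) == 7);
    assert(saturateMinus(3, 10) == 0); /// would wrap to 4294967289 without the mask

    const UInt32 samples[] = {0, 1, 7, 1000, 4294967295u};
    for (UInt32 x : samples)
        for (UInt32 y : samples)
            assert(saturateMinus(x, y) == saturateMinusReference(x, y));
}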
diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index e74df5c327a..85644b6f6ca 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -377,6 +377,8 @@ struct WhichDataType constexpr bool isNullable() const { return idx == TypeIndex::Nullable; } constexpr bool isFunction() const { return idx == TypeIndex::Function; } constexpr bool isAggregateFunction() const { return idx == TypeIndex::AggregateFunction; } + + constexpr bool isLowCarnality() const { return idx == TypeIndex::LowCardinality; } }; /// IDataType helpers (alternative for IDataType virtual methods with single point of truth) diff --git a/src/DataTypes/hasNullable.cpp b/src/DataTypes/hasNullable.cpp new file mode 100644 index 00000000000..2c699806874 --- /dev/null +++ b/src/DataTypes/hasNullable.cpp @@ -0,0 +1,33 @@ +#include +#include +#include +#include + +namespace DB +{ + +bool hasNullable(const DataTypePtr & type) +{ + if (type->isNullable() || type->isLowCardinalityNullable()) + return true; + + if (const DataTypeArray * type_array = typeid_cast(type.get())) + return hasNullable(type_array->getNestedType()); + else if (const DataTypeTuple * type_tuple = typeid_cast(type.get())) + { + for (const auto & subtype : type_tuple->getElements()) + { + if (hasNullable(subtype)) + return true; + } + return false; + } + else if (const DataTypeMap * type_map = typeid_cast(type.get())) + { + // Key type cannot be nullable. We only check value type. + return hasNullable(type_map->getValueType()); + } + return false; +} + +} diff --git a/src/DataTypes/hasNullable.h b/src/DataTypes/hasNullable.h new file mode 100644 index 00000000000..271803496f1 --- /dev/null +++ b/src/DataTypes/hasNullable.h @@ -0,0 +1,10 @@ +#pragma once + +#include + +namespace DB +{ + +bool hasNullable(const DataTypePtr & type); + +} diff --git a/src/Databases/DatabaseDictionary.cpp b/src/Databases/DatabaseDictionary.cpp index db7da95fb27..82766c1e384 100644 --- a/src/Databases/DatabaseDictionary.cpp +++ b/src/Databases/DatabaseDictionary.cpp @@ -29,10 +29,13 @@ namespace return nullptr; DictionaryStructure dictionary_structure = ExternalDictionariesLoader::getDictionaryStructure(*load_result.config); + auto comment = load_result.config->config->getString("dictionary.comment", ""); + return StorageDictionary::create( StorageID(database_name, load_result.name), load_result.name, dictionary_structure, + comment, StorageDictionary::Location::DictionaryDatabase, context); } diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index 3f6cb49fda7..5cc334eaad4 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -156,13 +156,15 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String StorageMySQLConfiguration configuration; ASTs & arguments = engine->arguments->children; + MySQLSettings mysql_settings; - if (auto named_collection = getExternalDataSourceConfiguration(arguments, context, true)) + if (auto named_collection = getExternalDataSourceConfiguration(arguments, context, true, true, mysql_settings)) { - auto [common_configuration, storage_specific_args] = named_collection.value(); + auto [common_configuration, storage_specific_args, settings_changes] = named_collection.value(); configuration.set(common_configuration); configuration.addresses = {std::make_pair(configuration.host, configuration.port)}; + mysql_settings.applyChanges(settings_changes); if (!storage_specific_args.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, @@ -200,7 +202,6 
@@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String if (engine_name == "MySQL") { auto mysql_database_settings = std::make_unique(); - MySQLSettings mysql_settings; auto mysql_pool = createMySQLPoolWithFailover(configuration, mysql_settings); mysql_database_settings->loadFromQueryContext(context); @@ -299,7 +300,7 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String if (auto named_collection = getExternalDataSourceConfiguration(engine_args, context, true)) { - auto [common_configuration, storage_specific_args] = named_collection.value(); + auto [common_configuration, storage_specific_args, _] = named_collection.value(); configuration.set(common_configuration); configuration.addresses = {std::make_pair(configuration.host, configuration.port)}; @@ -358,7 +359,7 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String if (auto named_collection = getExternalDataSourceConfiguration(engine_args, context, true)) { - auto [common_configuration, storage_specific_args] = named_collection.value(); + auto [common_configuration, storage_specific_args, _] = named_collection.value(); configuration.set(common_configuration); if (!storage_specific_args.empty()) diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index e9944b592ed..165bad950f5 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -76,10 +76,16 @@ std::pair createTableFromAST( /// - the database has not been loaded yet; /// - the code is simpler, since the query is already brought to a suitable form. if (!ast_create_query.columns_list || !ast_create_query.columns_list->columns) - throw Exception("Missing definition of columns.", ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED); - - columns = InterpreterCreateQuery::getColumnsDescription(*ast_create_query.columns_list->columns, context, true); - constraints = InterpreterCreateQuery::getConstraintsDescription(ast_create_query.columns_list->constraints); + { + if (!StorageFactory::instance().checkIfStorageSupportsSchemaInterface(ast_create_query.storage->engine->name)) + throw Exception("Missing definition of columns.", ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED); + /// Leave columns empty. 
+ } + else + { + columns = InterpreterCreateQuery::getColumnsDescription(*ast_create_query.columns_list->columns, context, true); + constraints = InterpreterCreateQuery::getConstraintsDescription(ast_create_query.columns_list->constraints); + } } return diff --git a/src/Databases/DatabasesCommon.cpp b/src/Databases/DatabasesCommon.cpp index ffb39f5b113..1c3f417b431 100644 --- a/src/Databases/DatabasesCommon.cpp +++ b/src/Databases/DatabasesCommon.cpp @@ -30,27 +30,33 @@ void applyMetadataChangesToCreateQuery(const ASTPtr & query, const StorageInMemo auto & ast_create_query = query->as(); bool has_structure = ast_create_query.columns_list && ast_create_query.columns_list->columns; + if (ast_create_query.as_table_function && !has_structure) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot alter table {} because it was created AS table function" " and doesn't have structure in metadata", backQuote(ast_create_query.getTable())); - assert(has_structure); - ASTPtr new_columns = InterpreterCreateQuery::formatColumns(metadata.columns); - ASTPtr new_indices = InterpreterCreateQuery::formatIndices(metadata.secondary_indices); - ASTPtr new_constraints = InterpreterCreateQuery::formatConstraints(metadata.constraints); - ASTPtr new_projections = InterpreterCreateQuery::formatProjections(metadata.projections); + if (!has_structure && !ast_create_query.is_dictionary) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot alter table {} because its metadata doesn't have structure", backQuote(ast_create_query.getTable())); - ast_create_query.columns_list->replace(ast_create_query.columns_list->columns, new_columns); - ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->indices, new_indices); - ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->constraints, new_constraints); - ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->projections, new_projections); + if (!ast_create_query.is_dictionary) + { + ASTPtr new_columns = InterpreterCreateQuery::formatColumns(metadata.columns); + ASTPtr new_indices = InterpreterCreateQuery::formatIndices(metadata.secondary_indices); + ASTPtr new_constraints = InterpreterCreateQuery::formatConstraints(metadata.constraints); + ASTPtr new_projections = InterpreterCreateQuery::formatProjections(metadata.projections); + + ast_create_query.columns_list->replace(ast_create_query.columns_list->columns, new_columns); + ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->indices, new_indices); + ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->constraints, new_constraints); + ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->projections, new_projections); + } if (metadata.select.select_query) { query->replace(ast_create_query.select, metadata.select.select_query); } - /// MaterializedView is one type of CREATE query without storage. + /// MaterializedView and Dictionary are types of CREATE query without storage. 
if (ast_create_query.storage) { ASTStorage & storage_ast = *ast_create_query.storage; diff --git a/src/Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.cpp b/src/Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.cpp index 5d699955ee5..dba8bf64798 100644 --- a/src/Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.cpp +++ b/src/Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.cpp @@ -50,12 +50,17 @@ DatabaseMaterializedPostgreSQL::DatabaseMaterializedPostgreSQL( , remote_database_name(postgres_database_name) , connection_info(connection_info_) , settings(std::move(settings_)) + , startup_task(getContext()->getSchedulePool().createTask("MaterializedPostgreSQLDatabaseStartup", [this]{ startSynchronization(); })) { } void DatabaseMaterializedPostgreSQL::startSynchronization() { + std::lock_guard lock(handler_mutex); + if (shutdown_called) + return; + replication_handler = std::make_unique( /* replication_identifier */database_name, remote_database_name, @@ -104,24 +109,14 @@ void DatabaseMaterializedPostgreSQL::startSynchronization() } LOG_TRACE(log, "Loaded {} tables. Starting synchronization", materialized_tables.size()); - replication_handler->startup(); + replication_handler->startup(/* delayed */false); } void DatabaseMaterializedPostgreSQL::startupTables(ThreadPool & thread_pool, bool force_restore, bool force_attach) { DatabaseAtomic::startupTables(thread_pool, force_restore, force_attach); - try - { - startSynchronization(); - } - catch (...) - { - tryLogCurrentException(log, "Cannot load nested database objects for PostgreSQL database engine."); - - if (!force_attach) - throw; - } + startup_task->activateAndSchedule(); } @@ -376,6 +371,7 @@ StoragePtr DatabaseMaterializedPostgreSQL::detachTable(ContextPtr context_, cons void DatabaseMaterializedPostgreSQL::shutdown() { + startup_task->deactivate(); stopReplication(); DatabaseAtomic::shutdown(); } @@ -387,6 +383,7 @@ void DatabaseMaterializedPostgreSQL::stopReplication() if (replication_handler) replication_handler->shutdown(); + shutdown_called = true; /// Clear wrappers over nested, all access is not done to nested tables directly. 
materialized_tables.clear(); } diff --git a/src/Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.h b/src/Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.h index 3b7f0f9d29d..40ff0d9262d 100644 --- a/src/Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.h +++ b/src/Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.h @@ -86,6 +86,9 @@ private: std::map materialized_tables; mutable std::mutex tables_mutex; mutable std::mutex handler_mutex; + + BackgroundSchedulePool::TaskHolder startup_task; + bool shutdown_called = false; }; } diff --git a/src/Databases/PostgreSQL/DatabasePostgreSQL.cpp b/src/Databases/PostgreSQL/DatabasePostgreSQL.cpp index fede4319230..d43bde0b886 100644 --- a/src/Databases/PostgreSQL/DatabasePostgreSQL.cpp +++ b/src/Databases/PostgreSQL/DatabasePostgreSQL.cpp @@ -182,19 +182,19 @@ StoragePtr DatabasePostgreSQL::fetchTable(const String & table_name, ContextPtr, return StoragePtr{}; auto connection_holder = pool->get(); - auto columns = fetchPostgreSQLTableStructure(connection_holder->get(), table_name, configuration.schema).columns; + auto columns_info = fetchPostgreSQLTableStructure(connection_holder->get(), table_name, configuration.schema).physical_columns; - if (!columns) + if (!columns_info) return StoragePtr{}; auto storage = StoragePostgreSQL::create( StorageID(database_name, table_name), pool, table_name, - ColumnsDescription{*columns}, ConstraintsDescription{}, String{}, configuration.schema, configuration.on_conflict); + ColumnsDescription{columns_info->columns}, ConstraintsDescription{}, String{}, configuration.schema, configuration.on_conflict); if (cache_tables) cached_tables[table_name] = storage; - return storage; + return std::move(storage); } if (table_checked || checkPostgresTable(table_name)) @@ -414,7 +414,7 @@ ASTPtr DatabasePostgreSQL::getCreateTableQueryImpl(const String & table_name, Co assert(storage_engine_arguments->children.size() >= 2); storage_engine_arguments->children.insert(storage_engine_arguments->children.begin() + 2, std::make_shared(table_id.table_name)); - return create_table_query; + return std::move(create_table_query); } diff --git a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp index dd6d1dd2e52..67d328db00b 100644 --- a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp +++ b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp @@ -15,7 +15,7 @@ #include #include #include - +#include namespace DB { @@ -155,10 +155,11 @@ static DataTypePtr convertPostgreSQLDataType(String & type, Fn auto && r template -std::shared_ptr readNamesAndTypesList( - T & tx, const String & postgres_table, const String & query, bool use_nulls, bool only_names_and_types) +PostgreSQLTableStructure::ColumnsInfoPtr readNamesAndTypesList( + T & tx, const String & postgres_table, const String & query, bool use_nulls, bool only_names_and_types) { auto columns = NamesAndTypes(); + PostgreSQLTableStructure::Attributes attributes; try { @@ -180,14 +181,22 @@ std::shared_ptr readNamesAndTypesList( } else { - std::tuple row; + std::tuple row; while (stream >> row) { - auto data_type = convertPostgreSQLDataType(std::get<1>(row), - recheck_array, - use_nulls && (std::get<2>(row) == "f"), /// 'f' means that postgres `not_null` is false, i.e. 
value is nullable - std::get<3>(row)); + auto data_type = convertPostgreSQLDataType( + std::get<1>(row), recheck_array, + use_nulls && (std::get<2>(row) == /* not nullable */"f"), + std::get<3>(row)); + columns.push_back(NameAndTypePair(std::get<0>(row), data_type)); + + attributes.emplace_back( + PostgreSQLTableStructure::PGAttribute{ + .atttypid = parse(std::get<4>(row)), + .atttypmod = parse(std::get<5>(row)), + }); + ++i; } } @@ -226,7 +235,9 @@ std::shared_ptr readNamesAndTypesList( throw; } - return !columns.empty() ? std::make_shared(columns.begin(), columns.end()) : nullptr; + return !columns.empty() + ? std::make_shared(NamesAndTypesList(columns.begin(), columns.end()), std::move(attributes)) + : nullptr; } @@ -244,14 +255,14 @@ PostgreSQLTableStructure fetchPostgreSQLTableStructure( std::string query = fmt::format( "SELECT attname AS name, format_type(atttypid, atttypmod) AS type, " - "attnotnull AS not_null, attndims AS dims " + "attnotnull AS not_null, attndims AS dims, atttypid as type_id, atttypmod as type_modifier " "FROM pg_attribute " "WHERE attrelid = (SELECT oid FROM pg_class WHERE {}) " "AND NOT attisdropped AND attnum > 0", where); - table.columns = readNamesAndTypesList(tx, postgres_table, query, use_nulls, false); + table.physical_columns = readNamesAndTypesList(tx, postgres_table, query, use_nulls, false); - if (!table.columns) + if (!table.physical_columns) throw Exception(ErrorCodes::UNKNOWN_TABLE, "PostgreSQL table {} does not exist", postgres_table); if (with_primary_key) diff --git a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.h b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.h index 279c88c4571..3be3aa79078 100644 --- a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.h +++ b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.h @@ -12,9 +12,24 @@ namespace DB struct PostgreSQLTableStructure { - std::shared_ptr columns = nullptr; - std::shared_ptr primary_key_columns = nullptr; - std::shared_ptr replica_identity_columns = nullptr; + struct PGAttribute + { + Int32 atttypid; + Int32 atttypmod; + }; + using Attributes = std::vector; + + struct ColumnsInfo + { + NamesAndTypesList columns; + Attributes attributes; + ColumnsInfo(NamesAndTypesList && columns_, Attributes && attributes_) : columns(columns_), attributes(attributes_) {} + }; + using ColumnsInfoPtr = std::shared_ptr; + + ColumnsInfoPtr physical_columns; + ColumnsInfoPtr primary_key_columns; + ColumnsInfoPtr replica_identity_columns; }; using PostgreSQLTableStructurePtr = std::unique_ptr; diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index 2c7e9ad7092..5fd1bd420c6 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -13,6 +13,7 @@ #include #include + namespace DB { @@ -308,7 +309,7 @@ private: if (was_inserted) { if constexpr (std::is_same_v) - cell.key = copyStringInArena(key); + cell.key = copyStringInArena(arena, key); else cell.key = key; @@ -332,8 +333,7 @@ private: else if constexpr (std::is_same_v) { const String & string_value = column_value.get(); - StringRef string_value_ref = StringRef {string_value.data(), string_value.size()}; - StringRef inserted_value = copyStringInArena(string_value_ref); + StringRef inserted_value = copyStringInArena(arena, string_value); container.back() = inserted_value; } else @@ -353,7 +353,7 @@ private: { char * data = const_cast(cell.key.data); arena.free(data, cell.key.size); - cell.key = copyStringInArena(key); + cell.key = 
copyStringInArena(arena, key); } else cell.key = key; @@ -379,8 +379,7 @@ private: else if constexpr (std::is_same_v) { const String & string_value = column_value.get(); - StringRef string_ref_value = StringRef {string_value.data(), string_value.size()}; - StringRef inserted_value = copyStringInArena(string_ref_value); + StringRef inserted_value = copyStringInArena(arena, string_value); if (!cell_was_default) { @@ -423,7 +422,7 @@ private: if (was_inserted) { if constexpr (std::is_same_v) - cell.key = copyStringInArena(key); + cell.key = copyStringInArena(arena, key); else cell.key = key; @@ -463,7 +462,7 @@ private: { char * data = const_cast(cell.key.data); arena.free(data, cell.key.size); - cell.key = copyStringInArena(key); + cell.key = copyStringInArena(arena, key); } else cell.key = key; @@ -526,16 +525,6 @@ private: return const_cast *>(this)->template getAttributeContainer(attribute_index, std::forward(func)); } - StringRef copyStringInArena(StringRef value_to_copy) - { - size_t value_to_copy_size = value_to_copy.size; - char * place_for_key = arena.alloc(value_to_copy_size); - memcpy(reinterpret_cast(place_for_key), reinterpret_cast(value_to_copy.data), value_to_copy_size); - StringRef updated_value{place_for_key, value_to_copy_size}; - - return updated_value; - } - template using ContainerType = std::conditional_t< std::is_same_v || std::is_same_v, diff --git a/src/Dictionaries/ClickHouseDictionarySource.cpp b/src/Dictionaries/ClickHouseDictionarySource.cpp index 6abd5f317e2..bd9a1f7776e 100644 --- a/src/Dictionaries/ClickHouseDictionarySource.cpp +++ b/src/Dictionaries/ClickHouseDictionarySource.cpp @@ -28,6 +28,10 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } +static const std::unordered_set dictionary_allowed_keys = { + "host", "port", "user", "password", "db", "database", "table", + "update_field", "update_tag", "invalidate_query", "query", "where", "name", "secure"}; + namespace { constexpr size_t MAX_CONNECTIONS = 16; @@ -235,18 +239,21 @@ void registerDictionarySourceClickHouse(DictionarySourceFactory & factory) std::string db = config.getString(settings_config_prefix + ".db", default_database); std::string table = config.getString(settings_config_prefix + ".table", ""); UInt16 port = static_cast(config.getUInt(settings_config_prefix + ".port", default_port)); + auto has_config_key = [](const String & key) { return dictionary_allowed_keys.contains(key); }; - auto named_collection = created_from_ddl ? - getExternalDataSourceConfiguration(config, settings_config_prefix, global_context) : std::nullopt; + auto named_collection = created_from_ddl + ? 
getExternalDataSourceConfiguration(config, settings_config_prefix, global_context, has_config_key) + : std::nullopt; if (named_collection) { - host = named_collection->host; - user = named_collection->username; - password = named_collection->password; - db = named_collection->database; - table = named_collection->table; - port = named_collection->port; + const auto & configuration = named_collection->configuration; + host = configuration.host; + user = configuration.username; + password = configuration.password; + db = configuration.database; + table = configuration.table; + port = configuration.port; } ClickHouseDictionarySource::Configuration configuration{ diff --git a/src/Dictionaries/DictionaryHelpers.h b/src/Dictionaries/DictionaryHelpers.h index b59e29c327e..1e6a4a5fb44 100644 --- a/src/Dictionaries/DictionaryHelpers.h +++ b/src/Dictionaries/DictionaryHelpers.h @@ -623,6 +623,17 @@ void mergeBlockWithPipe( } } +template +static StringRef copyStringInArena(Arena & arena, StringRef value) +{ + size_t key_size = value.size; + char * place_for_key = arena.alloc(key_size); + memcpy(reinterpret_cast(place_for_key), reinterpret_cast(value.data), key_size); + StringRef result{place_for_key, key_size}; + + return result; +} + /** * Returns ColumnVector data as PaddedPodArray. diff --git a/src/Dictionaries/DictionaryStructure.cpp b/src/Dictionaries/DictionaryStructure.cpp index 21d43031204..6955b3ddfdc 100644 --- a/src/Dictionaries/DictionaryStructure.cpp +++ b/src/Dictionaries/DictionaryStructure.cpp @@ -25,6 +25,7 @@ namespace ErrorCodes namespace { + DictionaryTypedSpecialAttribute makeDictionaryTypedSpecialAttribute( const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, const std::string & default_type) { @@ -38,7 +39,7 @@ DictionaryTypedSpecialAttribute makeDictionaryTypedSpecialAttribute( return DictionaryTypedSpecialAttribute{std::move(name), std::move(expression), DataTypeFactory::instance().get(type_name)}; } -std::optional maybeGetAttributeUnderlyingType(TypeIndex index) +std::optional tryGetAttributeUnderlyingType(TypeIndex index) { switch (index) /// Special cases which do not map TypeIndex::T -> AttributeUnderlyingType::T { @@ -65,14 +66,16 @@ DictionaryStructure::DictionaryStructure(const Poco::Util::AbstractConfiguration { std::string structure_prefix = config_prefix + ".structure"; - const auto has_id = config.has(structure_prefix + ".id"); - const auto has_key = config.has(structure_prefix + ".key"); + const bool has_id = config.has(structure_prefix + ".id"); + const bool has_key = config.has(structure_prefix + ".key"); if (has_key && has_id) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Only one of 'id' and 'key' should be specified"); if (has_id) + { id.emplace(config, structure_prefix + ".id"); + } else if (has_key) { key.emplace(getAttributes(config, structure_prefix + ".key", /*complex_key_attributes =*/ true)); @@ -80,7 +83,9 @@ DictionaryStructure::DictionaryStructure(const Poco::Util::AbstractConfiguration throw Exception(ErrorCodes::BAD_ARGUMENTS, "Empty 'key' supplied"); } else + { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Dictionary structure should specify either 'id' or 'key'"); + } if (id) { @@ -94,7 +99,8 @@ DictionaryStructure::DictionaryStructure(const Poco::Util::AbstractConfiguration parseRangeConfiguration(config, structure_prefix); attributes = getAttributes(config, structure_prefix, /*complex_key_attributes =*/ false); - for (size_t i = 0; i < attributes.size(); ++i) + size_t attributes_size = attributes.size(); + for 
(size_t i = 0; i < attributes_size; ++i) { const auto & attribute = attributes[i]; const auto & attribute_name = attribute.name; @@ -106,7 +112,6 @@ DictionaryStructure::DictionaryStructure(const Poco::Util::AbstractConfiguration throw Exception(ErrorCodes::TYPE_MISMATCH, "Hierarchical attribute type for dictionary with simple key must be UInt64. Actual {}", attribute.underlying_type); - else if (key) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Dictionary with complex key does not support hierarchy"); @@ -121,17 +126,27 @@ DictionaryStructure::DictionaryStructure(const Poco::Util::AbstractConfiguration void DictionaryStructure::validateKeyTypes(const DataTypes & key_types) const { - if (key_types.size() != key->size()) + size_t key_types_size = key_types.size(); + if (key_types_size != getKeysSize()) throw Exception(ErrorCodes::TYPE_MISMATCH, "Key structure does not match, expected {}", getKeyDescription()); - for (size_t i = 0; i < key_types.size(); ++i) + if (id && !isUInt64(key_types[0])) + { + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Key type for simple key at position {} does not match, expected {}, found {}", + std::to_string(0), + "UInt64", + key_types[0]->getName()); + } + + for (size_t i = 0; i < key_types_size; ++i) { const auto & expected_type = (*key)[i].type; const auto & actual_type = key_types[i]; if (!areTypesEqual(expected_type, actual_type)) throw Exception(ErrorCodes::TYPE_MISMATCH, - "Key type at position {} does not match, expected {}, found {}", + "Key type for complex key at position {} does not match, expected {}, found {}", std::to_string(i), expected_type->getName(), actual_type->getName()); @@ -204,19 +219,6 @@ std::string DictionaryStructure::getKeyDescription() const return out.str(); } - -bool DictionaryStructure::isKeySizeFixed() const -{ - if (!key) - return true; - - for (const auto & key_i : *key) - if (key_i.underlying_type == AttributeUnderlyingType::String) - return false; - - return true; -} - Strings DictionaryStructure::getKeysNames() const { if (id) @@ -235,7 +237,7 @@ Strings DictionaryStructure::getKeysNames() const static void checkAttributeKeys(const Poco::Util::AbstractConfiguration::Keys & keys) { - static const std::unordered_set valid_keys + static const std::unordered_set valid_keys = {"name", "type", "expression", "null_value", "hierarchical", "injective", "is_object_id"}; for (const auto & key : keys) @@ -256,7 +258,7 @@ std::vector DictionaryStructure::getAttributes( Poco::Util::AbstractConfiguration::Keys config_elems; config.keys(config_prefix, config_elems); - auto has_hierarchy = false; + bool has_hierarchy = false; std::unordered_set attribute_names; std::vector res_attributes; @@ -296,7 +298,7 @@ std::vector DictionaryStructure::getAttributes( auto non_nullable_type = removeNullable(initial_type); - const auto underlying_type_opt = maybeGetAttributeUnderlyingType(non_nullable_type->getTypeId()); + const auto underlying_type_opt = tryGetAttributeUnderlyingType(non_nullable_type->getTypeId()); if (!underlying_type_opt) throw Exception(ErrorCodes::UNKNOWN_TYPE, @@ -336,6 +338,7 @@ std::vector DictionaryStructure::getAttributes( const auto hierarchical = config.getBool(prefix + "hierarchical", false); const auto injective = config.getBool(prefix + "injective", false); const auto is_object_id = config.getBool(prefix + "is_object_id", false); + if (name.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Properties 'name' and 'type' of an attribute cannot be empty"); @@ -388,13 +391,12 @@ void DictionaryStructure::parseRangeConfiguration(const 
Poco::Util::AbstractConf range_max->type->getName()); } - if (range_min) + if (range_min && !range_min->type->isValueRepresentedByInteger()) { - if (!range_min->type->isValueRepresentedByInteger()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Dictionary structure type of 'range_min' and 'range_max' should be an integer, Date, DateTime, or Enum." - " Actual 'range_min' and 'range_max' type is {}", - range_min->type->getName()); + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Dictionary structure type of 'range_min' and 'range_max' should be an integer, Date, DateTime, or Enum." + " Actual 'range_min' and 'range_max' type is {}", + range_min->type->getName()); } if ((range_min && !range_min->expression.empty()) || (range_max && !range_max->expression.empty())) diff --git a/src/Dictionaries/DictionaryStructure.h b/src/Dictionaries/DictionaryStructure.h index 4de00ddd259..817bc8d7824 100644 --- a/src/Dictionaries/DictionaryStructure.h +++ b/src/Dictionaries/DictionaryStructure.h @@ -129,7 +129,6 @@ struct DictionaryStructure final size_t getKeysSize() const; std::string getKeyDescription() const; - bool isKeySizeFixed() const; private: /// range_min and range_max have to be parsed before this function call diff --git a/src/Dictionaries/DirectDictionary.cpp b/src/Dictionaries/DirectDictionary.cpp index 12c624a6859..19bbcb6ca98 100644 --- a/src/Dictionaries/DirectDictionary.cpp +++ b/src/Dictionaries/DirectDictionary.cpp @@ -2,7 +2,6 @@ #include #include -#include #include #include diff --git a/src/Dictionaries/DirectDictionary.h b/src/Dictionaries/DirectDictionary.h index 4bf24e6ae98..de18e9486e6 100644 --- a/src/Dictionaries/DirectDictionary.h +++ b/src/Dictionaries/DirectDictionary.h @@ -3,15 +3,12 @@ #include #include #include -#include -#include -#include -#include -#include -#include "DictionaryStructure.h" -#include "IDictionary.h" -#include "IDictionarySource.h" -#include "DictionaryHelpers.h" + +#include +#include +#include +#include + namespace DB { diff --git a/src/Dictionaries/ExecutableDictionarySource.cpp b/src/Dictionaries/ExecutableDictionarySource.cpp index 8d10a6665cf..7a3550e7284 100644 --- a/src/Dictionaries/ExecutableDictionarySource.cpp +++ b/src/Dictionaries/ExecutableDictionarySource.cpp @@ -1,10 +1,16 @@ #include "ExecutableDictionarySource.h" +#include + +#include + #include #include +#include #include #include +#include #include #include @@ -27,15 +33,46 @@ namespace ErrorCodes extern const int UNSUPPORTED_METHOD; } +namespace +{ + + void updateCommandIfNeeded(String & command, bool execute_direct, ContextPtr context) + { + if (!execute_direct) + return; + + auto global_context = context->getGlobalContext(); + auto user_scripts_path = global_context->getUserScriptsPath(); + auto script_path = user_scripts_path + '/' + command; + + if (!fileOrSymlinkPathStartsWith(script_path, user_scripts_path)) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Executable file {} must be inside user scripts folder {}", + command, + user_scripts_path); + + if (!std::filesystem::exists(std::filesystem::path(script_path))) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Executable file {} does not exist inside user scripts folder {}", + command, + user_scripts_path); + + command = std::move(script_path); + } + +} + ExecutableDictionarySource::ExecutableDictionarySource( const DictionaryStructure & dict_struct_, const Configuration & configuration_, Block & sample_block_, + std::shared_ptr coordinator_, ContextPtr context_) : log(&Poco::Logger::get("ExecutableDictionarySource")) 
, dict_struct(dict_struct_) , configuration(configuration_) - , sample_block{sample_block_} + , sample_block(sample_block_) + , coordinator(std::move(coordinator_)) , context(context_) { /// Remove keys from sample_block for implicit_key dictionary because @@ -58,6 +95,7 @@ ExecutableDictionarySource::ExecutableDictionarySource(const ExecutableDictionar , dict_struct(other.dict_struct) , configuration(other.configuration) , sample_block(other.sample_block) + , coordinator(other.coordinator) , context(Context::createCopy(other.context)) { } @@ -69,11 +107,11 @@ Pipe ExecutableDictionarySource::loadAll() LOG_TRACE(log, "loadAll {}", toString()); - ShellCommand::Config config(configuration.command); - auto process = ShellCommand::execute(config); + const auto & coordinator_configuration = coordinator->getConfiguration(); + auto command = configuration.command; + updateCommandIfNeeded(command, coordinator_configuration.execute_direct, context); - Pipe pipe(std::make_unique(context, configuration.format, sample_block, std::move(process))); - return pipe; + return coordinator->createPipe(command, configuration.command_arguments, sample_block, context); } Pipe ExecutableDictionarySource::loadUpdatedAll() @@ -82,17 +120,32 @@ Pipe ExecutableDictionarySource::loadUpdatedAll() throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "ExecutableDictionarySource with implicit_key does not support loadUpdatedAll method"); time_t new_update_time = time(nullptr); - SCOPE_EXIT(update_time = new_update_time); - std::string command_with_update_field = configuration.command; + const auto & coordinator_configuration = coordinator->getConfiguration(); + auto command = configuration.command; + updateCommandIfNeeded(command, coordinator_configuration.execute_direct, context); + + auto command_arguments = configuration.command_arguments; + if (update_time) - command_with_update_field += " " + configuration.update_field + " " + DB::toString(LocalDateTime(update_time - configuration.update_lag)); + { + auto update_difference = DB::toString(LocalDateTime(update_time - configuration.update_lag)); - LOG_TRACE(log, "loadUpdatedAll {}", command_with_update_field); - ShellCommand::Config config(command_with_update_field); - auto process = ShellCommand::execute(config); - Pipe pipe(std::make_unique(context, configuration.format, sample_block, std::move(process))); - return pipe; + if (coordinator_configuration.execute_direct) + { + command_arguments.emplace_back(configuration.update_field); + command_arguments.emplace_back(std::move(update_difference)); + } + else + { + command += ' ' + configuration.update_field + ' ' + update_difference; + } + } + + update_time = new_update_time; + + LOG_TRACE(log, "loadUpdatedAll {}", command); + return coordinator->createPipe(command, command_arguments, sample_block, context); } Pipe ExecutableDictionarySource::loadIds(const std::vector & ids) @@ -113,27 +166,17 @@ Pipe ExecutableDictionarySource::loadKeys(const Columns & key_columns, const std Pipe ExecutableDictionarySource::getStreamForBlock(const Block & block) { - ShellCommand::Config config(configuration.command); - auto process = ShellCommand::execute(config); - auto * process_in = &process->in; + const auto & coordinator_configuration = coordinator->getConfiguration(); + String command = configuration.command; + updateCommandIfNeeded(command, coordinator_configuration.execute_direct, context); - ShellCommandSource::SendDataTask task = {[process_in, block, this]() - { - auto & out = *process_in; + auto source = std::make_shared(block); 
+ auto shell_input_pipe = Pipe(std::move(source)); - if (configuration.send_chunk_header) - { - writeText(block.rows(), out); - writeChar('\n', out); - } + Pipes shell_input_pipes; + shell_input_pipes.emplace_back(std::move(shell_input_pipe)); - auto output_format = context->getOutputFormat(configuration.format, out, block.cloneEmpty()); - formatBlock(output_format, block); - out.close(); - }}; - std::vector tasks = {std::move(task)}; - - Pipe pipe(std::make_unique(context, configuration.format, sample_block, std::move(process), std::move(tasks))); + auto pipe = coordinator->createPipe(command, configuration.command_arguments, std::move(shell_input_pipes), sample_block, context); if (configuration.implicit_key) pipe.addTransform(std::make_shared(block, pipe.getHeader())); @@ -189,17 +232,40 @@ void registerDictionarySourceExecutable(DictionarySourceFactory & factory) std::string settings_config_prefix = config_prefix + ".executable"; + bool execute_direct = config.getBool(settings_config_prefix + ".execute_direct", false); + std::string command_value = config.getString(settings_config_prefix + ".command"); + std::vector command_arguments; + + if (execute_direct) + { + boost::split(command_arguments, command_value, [](char c) { return c == ' '; }); + + command_value = std::move(command_arguments[0]); + command_arguments.erase(command_arguments.begin()); + } + ExecutableDictionarySource::Configuration configuration { - .command = config.getString(settings_config_prefix + ".command"), - .format = config.getString(settings_config_prefix + ".format"), + .command = std::move(command_value), + .command_arguments = std::move(command_arguments), .update_field = config.getString(settings_config_prefix + ".update_field", ""), .update_lag = config.getUInt64(settings_config_prefix + ".update_lag", 1), .implicit_key = config.getBool(settings_config_prefix + ".implicit_key", false), - .send_chunk_header = config.getBool(settings_config_prefix + ".send_chunk_header", false) }; - return std::make_unique(dict_struct, configuration, sample_block, context); + ShellCommandSourceCoordinator::Configuration shell_command_coordinator_configration + { + .format = config.getString(settings_config_prefix + ".format"), + .command_termination_timeout_seconds = config.getUInt64(settings_config_prefix + ".command_termination_timeout", 10), + .command_read_timeout_milliseconds = config.getUInt64(settings_config_prefix + ".command_read_timeout", 10000), + .command_write_timeout_milliseconds = config.getUInt64(settings_config_prefix + ".command_write_timeout", 10000), + .is_executable_pool = false, + .send_chunk_header = config.getBool(settings_config_prefix + ".send_chunk_header", false), + .execute_direct = config.getBool(settings_config_prefix + ".execute_direct", false) + }; + + auto coordinator = std::make_shared(shell_command_coordinator_configration); + return std::make_unique(dict_struct, configuration, sample_block, std::move(coordinator), context); }; factory.registerSource("executable", create_table_source); diff --git a/src/Dictionaries/ExecutableDictionarySource.h b/src/Dictionaries/ExecutableDictionarySource.h index a7ffc8bebcb..6c5d2de3714 100644 --- a/src/Dictionaries/ExecutableDictionarySource.h +++ b/src/Dictionaries/ExecutableDictionarySource.h @@ -7,6 +7,7 @@ #include #include +#include namespace DB @@ -20,20 +21,19 @@ public: struct Configuration { std::string command; - std::string format; + std::vector command_arguments; std::string update_field; UInt64 update_lag; /// Implicit key means that the 
source script will return only values, /// and the correspondence to the requested keys is determined implicitly - by the order of rows in the result. bool implicit_key; - /// Send number_of_rows\n before sending chunk to process - bool send_chunk_header; }; ExecutableDictionarySource( const DictionaryStructure & dict_struct_, const Configuration & configuration_, Block & sample_block_, + std::shared_ptr coordinator_, ContextPtr context_); ExecutableDictionarySource(const ExecutableDictionarySource & other); @@ -69,6 +69,7 @@ private: const DictionaryStructure dict_struct; const Configuration configuration; Block sample_block; + std::shared_ptr coordinator; ContextPtr context; }; diff --git a/src/Dictionaries/ExecutablePoolDictionarySource.cpp b/src/Dictionaries/ExecutablePoolDictionarySource.cpp index a0eb3435a11..48ddeed7fa6 100644 --- a/src/Dictionaries/ExecutablePoolDictionarySource.cpp +++ b/src/Dictionaries/ExecutablePoolDictionarySource.cpp @@ -1,14 +1,20 @@ #include "ExecutablePoolDictionarySource.h" +#include + +#include + #include #include +#include #include +#include +#include +#include #include #include -#include -#include #include #include @@ -23,20 +29,19 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; extern const int DICTIONARY_ACCESS_DENIED; extern const int UNSUPPORTED_METHOD; - extern const int TIMEOUT_EXCEEDED; } ExecutablePoolDictionarySource::ExecutablePoolDictionarySource( const DictionaryStructure & dict_struct_, const Configuration & configuration_, Block & sample_block_, + std::shared_ptr coordinator_, ContextPtr context_) : dict_struct(dict_struct_) , configuration(configuration_) , sample_block(sample_block_) + , coordinator(std::move(coordinator_)) , context(context_) - /// If pool size == 0 then there is no size restrictions. Poco max size of semaphore is integer type. - , process_pool(std::make_shared(configuration.pool_size == 0 ? 
std::numeric_limits::max() : configuration.pool_size)) , log(&Poco::Logger::get("ExecutablePoolDictionarySource")) { /// Remove keys from sample_block for implicit_key dictionary because @@ -59,8 +64,8 @@ ExecutablePoolDictionarySource::ExecutablePoolDictionarySource(const ExecutableP : dict_struct(other.dict_struct) , configuration(other.configuration) , sample_block(other.sample_block) + , coordinator(other.coordinator) , context(Context::createCopy(other.context)) - , process_pool(std::make_shared(configuration.pool_size)) , log(&Poco::Logger::get("ExecutablePoolDictionarySource")) { } @@ -93,41 +98,47 @@ Pipe ExecutablePoolDictionarySource::loadKeys(const Columns & key_columns, const Pipe ExecutablePoolDictionarySource::getStreamForBlock(const Block & block) { - std::unique_ptr process; - bool result = process_pool->tryBorrowObject(process, [this]() + String command = configuration.command; + const auto & coordinator_configuration = coordinator->getConfiguration(); + + if (coordinator_configuration.execute_direct) { - ShellCommand::Config config(configuration.command); - config.terminate_in_destructor_strategy = ShellCommand::DestructorStrategy{ true /*terminate_in_destructor*/, configuration.command_termination_timeout }; - auto shell_command = ShellCommand::execute(config); - return shell_command; - }, configuration.max_command_execution_time * 10000); + auto global_context = context->getGlobalContext(); + auto user_scripts_path = global_context->getUserScriptsPath(); + auto script_path = user_scripts_path + '/' + command; - if (!result) - throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, - "Could not get process from pool, max command execution timeout exceeded {} seconds", - configuration.max_command_execution_time); + if (!fileOrSymlinkPathStartsWith(script_path, user_scripts_path)) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Executable file {} must be inside user scripts folder {}", + command, + user_scripts_path); - size_t rows_to_read = block.rows(); - auto * process_in = &process->in; - ShellCommandSource::SendDataTask task = [process_in, block, this]() mutable - { - auto & out = *process_in; + if (!std::filesystem::exists(std::filesystem::path(script_path))) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Executable file {} does not exist inside user scripts folder {}", + command, + user_scripts_path); - if (configuration.send_chunk_header) - { - writeText(block.rows(), out); - writeChar('\n', out); - } + command = std::move(script_path); + } - auto output_format = context->getOutputFormat(configuration.format, out, block.cloneEmpty()); - formatBlock(output_format, block); - }; - std::vector tasks = {std::move(task)}; + auto source = std::make_shared(block); + auto shell_input_pipe = Pipe(std::move(source)); ShellCommandSourceConfiguration command_configuration; command_configuration.read_fixed_number_of_rows = true; - command_configuration.number_of_rows_to_read = rows_to_read; - Pipe pipe(std::make_unique(context, configuration.format, sample_block, std::move(process), std::move(tasks), command_configuration, process_pool)); + command_configuration.number_of_rows_to_read = block.rows(); + + Pipes shell_input_pipes; + shell_input_pipes.emplace_back(std::move(shell_input_pipe)); + + auto pipe = coordinator->createPipe( + command, + configuration.command_arguments, + std::move(shell_input_pipes), + sample_block, + context, + command_configuration); if (configuration.implicit_key) pipe.addTransform(std::make_shared(block, pipe.getHeader())); @@ -157,7 +168,8 @@ 
DictionarySourcePtr ExecutablePoolDictionarySource::clone() const std::string ExecutablePoolDictionarySource::toString() const { - return "ExecutablePool size: " + std::to_string(configuration.pool_size) + " command: " + configuration.command; + size_t pool_size = coordinator->getConfiguration().pool_size; + return "ExecutablePool size: " + std::to_string(pool_size) + " command: " + configuration.command; } void registerDictionarySourceExecutablePool(DictionarySourceFactory & factory) @@ -189,18 +201,40 @@ void registerDictionarySourceExecutablePool(DictionarySourceFactory & factory) if (max_execution_time_seconds != 0 && max_command_execution_time > max_execution_time_seconds) max_command_execution_time = max_execution_time_seconds; + bool execute_direct = config.getBool(settings_config_prefix + ".execute_direct", false); + std::string command_value = config.getString(settings_config_prefix + ".command"); + std::vector command_arguments; + + if (execute_direct) + { + boost::split(command_arguments, command_value, [](char c) { return c == ' '; }); + + command_value = std::move(command_arguments[0]); + command_arguments.erase(command_arguments.begin()); + } + ExecutablePoolDictionarySource::Configuration configuration { - .command = config.getString(settings_config_prefix + ".command"), - .format = config.getString(settings_config_prefix + ".format"), - .pool_size = config.getUInt64(settings_config_prefix + ".size"), - .command_termination_timeout = config.getUInt64(settings_config_prefix + ".command_termination_timeout", 10), - .max_command_execution_time = max_command_execution_time, + .command = std::move(command_value), + .command_arguments = std::move(command_arguments), .implicit_key = config.getBool(settings_config_prefix + ".implicit_key", false), - .send_chunk_header = config.getBool(settings_config_prefix + ".send_chunk_header", false) }; - return std::make_unique(dict_struct, configuration, sample_block, context); + ShellCommandSourceCoordinator::Configuration shell_command_coordinator_configration + { + .format = config.getString(settings_config_prefix + ".format"), + .command_termination_timeout_seconds = config.getUInt64(settings_config_prefix + ".command_termination_timeout", 10), + .command_read_timeout_milliseconds = config.getUInt64(settings_config_prefix + ".command_read_timeout", 10000), + .command_write_timeout_milliseconds = config.getUInt64(settings_config_prefix + ".command_write_timeout", 10000), + .pool_size = config.getUInt64(settings_config_prefix + ".pool_size", 16), + .max_command_execution_time_seconds = max_command_execution_time, + .is_executable_pool = true, + .send_chunk_header = config.getBool(settings_config_prefix + ".send_chunk_header", false), + .execute_direct = execute_direct + }; + + auto coordinator = std::make_shared(shell_command_coordinator_configration); + return std::make_unique(dict_struct, configuration, sample_block, std::move(coordinator), context); }; factory.registerSource("executable_pool", create_table_source); diff --git a/src/Dictionaries/ExecutablePoolDictionarySource.h b/src/Dictionaries/ExecutablePoolDictionarySource.h index 51215b6311b..b9b3b8efb1b 100644 --- a/src/Dictionaries/ExecutablePoolDictionarySource.h +++ b/src/Dictionaries/ExecutablePoolDictionarySource.h @@ -28,21 +28,15 @@ public: struct Configuration { String command; - String format; - size_t pool_size; - size_t command_termination_timeout; - size_t max_command_execution_time; - /// Implicit key means that the source script will return only values, - /// and the 
correspondence to the requested keys is determined implicitly - by the order of rows in the result. + std::vector command_arguments; bool implicit_key; - /// Send number_of_rows\n before sending chunk to process - bool send_chunk_header; }; ExecutablePoolDictionarySource( const DictionaryStructure & dict_struct_, const Configuration & configuration_, Block & sample_block_, + std::shared_ptr coordinator_, ContextPtr context_); ExecutablePoolDictionarySource(const ExecutablePoolDictionarySource & other); @@ -77,8 +71,8 @@ private: const Configuration configuration; Block sample_block; + std::shared_ptr coordinator; ContextPtr context; - std::shared_ptr process_pool; Poco::Logger * log; }; diff --git a/src/Dictionaries/FlatDictionary.cpp b/src/Dictionaries/FlatDictionary.cpp index de4ae66300a..c260924a82b 100644 --- a/src/Dictionaries/FlatDictionary.cpp +++ b/src/Dictionaries/FlatDictionary.cpp @@ -399,9 +399,6 @@ void FlatDictionary::calculateBytesAllocated() } bucket_count = container.capacity(); - - if constexpr (std::is_same_v) - bytes_allocated += sizeof(Arena) + attribute.string_arena->size(); }; callOnDictionaryAttributeType(attribute.type, type_call); @@ -414,12 +411,14 @@ void FlatDictionary::calculateBytesAllocated() if (update_field_loaded_block) bytes_allocated += update_field_loaded_block->allocatedBytes(); + + bytes_allocated += string_arena.size(); } FlatDictionary::Attribute FlatDictionary::createAttribute(const DictionaryAttribute & dictionary_attribute) { auto is_nullable_set = dictionary_attribute.is_nullable ? std::make_optional() : std::optional{}; - Attribute attribute{dictionary_attribute.underlying_type, std::move(is_nullable_set), {}, {}}; + Attribute attribute{dictionary_attribute.underlying_type, std::move(is_nullable_set), {}}; auto type_call = [&](const auto & dictionary_attribute_type) { @@ -427,9 +426,6 @@ FlatDictionary::Attribute FlatDictionary::createAttribute(const DictionaryAttrib using AttributeType = typename Type::AttributeType; using ValueType = DictionaryValueType; - if constexpr (std::is_same_v) - attribute.string_arena = std::make_unique(); - attribute.container.emplace>(configuration.initial_array_size, ValueType()); }; @@ -510,8 +506,8 @@ void FlatDictionary::setAttributeValueImpl(Attribute & attribute, UInt64 key, co template <> void FlatDictionary::setAttributeValueImpl(Attribute & attribute, UInt64 key, const String & value) { - const auto * string_in_arena = attribute.string_arena->insert(value.data(), value.size()); - setAttributeValueImpl(attribute, key, StringRef{string_in_arena, value.size()}); + auto arena_value = copyStringInArena(string_arena, value); + setAttributeValueImpl(attribute, key, arena_value); } void FlatDictionary::setAttributeValue(Attribute & attribute, const UInt64 key, const Field & value) diff --git a/src/Dictionaries/FlatDictionary.h b/src/Dictionaries/FlatDictionary.h index 308cd72d55b..e8f40ea1d66 100644 --- a/src/Dictionaries/FlatDictionary.h +++ b/src/Dictionaries/FlatDictionary.h @@ -133,8 +133,6 @@ private: ContainerType, ContainerType> container; - - std::unique_ptr string_arena; }; void createAttributes(); @@ -176,6 +174,7 @@ private: mutable std::atomic found_count{0}; BlockPtr update_field_loaded_block; + Arena string_arena; }; } diff --git a/src/Dictionaries/HashedArrayDictionary.cpp b/src/Dictionaries/HashedArrayDictionary.cpp index a92f8bc1191..062620fb25b 100644 --- a/src/Dictionaries/HashedArrayDictionary.cpp +++ b/src/Dictionaries/HashedArrayDictionary.cpp @@ -352,8 +352,7 @@ void 
HashedArrayDictionary::createAttributes() using ValueType = DictionaryValueType; auto is_index_null = dictionary_attribute.is_nullable ? std::make_optional>() : std::optional>{}; - std::unique_ptr string_arena = std::is_same_v ? std::make_unique() : nullptr; - Attribute attribute{dictionary_attribute.underlying_type, AttributeContainerType(), std::move(is_index_null), std::move(string_arena)}; + Attribute attribute{dictionary_attribute.underlying_type, AttributeContainerType(), std::move(is_index_null)}; attributes.emplace_back(std::move(attribute)); }; @@ -431,7 +430,7 @@ void HashedArrayDictionary::blockToAttributes(const Block & } if constexpr (std::is_same_v) - key = copyKeyInArena(key); + key = copyStringInArena(string_arena, key); key_attribute.container.insert({key, element_count}); @@ -466,11 +465,7 @@ void HashedArrayDictionary::blockToAttributes(const Block & if constexpr (std::is_same_v) { String & value_to_insert = column_value_to_insert.get(); - size_t value_to_insert_size = value_to_insert.size(); - - const char * string_in_arena = attribute.string_arena->insert(value_to_insert.data(), value_to_insert_size); - - StringRef string_in_arena_reference = StringRef{string_in_arena, value_to_insert_size}; + StringRef string_in_arena_reference = copyStringInArena(string_arena, value_to_insert); attribute_container.back() = string_in_arena_reference; } else @@ -676,16 +671,6 @@ void HashedArrayDictionary::getItemsImpl( } } -template -StringRef HashedArrayDictionary::copyKeyInArena(StringRef key) -{ - size_t key_size = key.size; - char * place_for_key = complex_key_arena.alloc(key_size); - memcpy(reinterpret_cast(place_for_key), reinterpret_cast(key.data), key_size); - StringRef updated_key{place_for_key, key_size}; - return updated_key; -} - template void HashedArrayDictionary::loadData() { @@ -742,21 +727,15 @@ void HashedArrayDictionary::calculateBytesAllocated() } bucket_count = container.capacity(); - - if constexpr (std::is_same_v) - bytes_allocated += sizeof(Arena) + attribute.string_arena->size(); }; callOnDictionaryAttributeType(attribute.type, type_call); - if (attribute.string_arena) - bytes_allocated += attribute.string_arena->size(); - if (attribute.is_index_null.has_value()) bytes_allocated += (*attribute.is_index_null).size(); } - bytes_allocated += complex_key_arena.size(); + bytes_allocated += string_arena.size(); if (update_field_loaded_block) bytes_allocated += update_field_loaded_block->allocatedBytes(); diff --git a/src/Dictionaries/HashedArrayDictionary.h b/src/Dictionaries/HashedArrayDictionary.h index 0d07c43477a..80436a3d044 100644 --- a/src/Dictionaries/HashedArrayDictionary.h +++ b/src/Dictionaries/HashedArrayDictionary.h @@ -155,7 +155,6 @@ private: container; std::optional> is_index_null; - std::unique_ptr string_arena; }; struct KeyAttribute final @@ -205,8 +204,6 @@ private: void resize(size_t added_rows); - StringRef copyKeyInArena(StringRef key); - const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; const HashedArrayDictionaryStorageConfiguration configuration; @@ -222,7 +219,7 @@ private: mutable std::atomic found_count{0}; BlockPtr update_field_loaded_block; - Arena complex_key_arena; + Arena string_arena; }; extern template class HashedArrayDictionary; diff --git a/src/Dictionaries/HashedDictionary.cpp b/src/Dictionaries/HashedDictionary.cpp index f7627d5817f..b4f3aece174 100644 --- a/src/Dictionaries/HashedDictionary.cpp +++ b/src/Dictionaries/HashedDictionary.cpp @@ -239,7 +239,7 @@ ColumnPtr 
HashedDictionary::getHierarchy(ColumnPtr if (it != parent_keys_map.end()) result = getValueFromCell(it); - keys_found +=result.has_value(); + keys_found += result.has_value(); return result; }; @@ -354,8 +354,7 @@ void HashedDictionary::createAttributes() using ValueType = DictionaryValueType; auto is_nullable_set = dictionary_attribute.is_nullable ? std::make_optional() : std::optional{}; - std::unique_ptr string_arena = std::is_same_v ? std::make_unique() : nullptr; - Attribute attribute{dictionary_attribute.underlying_type, std::move(is_nullable_set), CollectionType(), std::move(string_arena)}; + Attribute attribute{dictionary_attribute.underlying_type, std::move(is_nullable_set), CollectionType()}; attributes.emplace_back(std::move(attribute)); }; @@ -449,7 +448,7 @@ void HashedDictionary::blockToAttributes(const Bloc } if constexpr (std::is_same_v) - key = copyKeyInArena(key); + key = copyStringInArena(string_arena, key); attribute_column.get(key_index, column_value_to_insert); @@ -463,12 +462,8 @@ void HashedDictionary::blockToAttributes(const Bloc if constexpr (std::is_same_v) { String & value_to_insert = column_value_to_insert.get(); - size_t value_to_insert_size = value_to_insert.size(); - - const char * string_in_arena = attribute.string_arena->insert(value_to_insert.data(), value_to_insert_size); - - StringRef string_in_arena_reference = StringRef{string_in_arena, value_to_insert_size}; - container.insert({key, string_in_arena_reference}); + StringRef arena_value = copyStringInArena(string_arena, value_to_insert); + container.insert({key, arena_value}); } else { @@ -548,16 +543,6 @@ void HashedDictionary::getItemsImpl( found_count.fetch_add(keys_found, std::memory_order_relaxed); } -template -StringRef HashedDictionary::copyKeyInArena(StringRef key) -{ - size_t key_size = key.size; - char * place_for_key = complex_key_arena.alloc(key_size); - memcpy(reinterpret_cast(place_for_key), reinterpret_cast(key.data), key_size); - StringRef updated_key{place_for_key, key_size}; - return updated_key; -} - template void HashedDictionary::loadData() { @@ -591,7 +576,9 @@ void HashedDictionary::loadData() } } else + { updateData(); + } if (configuration.require_nonempty && 0 == element_count) throw Exception(ErrorCodes::DICTIONARY_IS_EMPTY, @@ -629,16 +616,13 @@ void HashedDictionary::calculateBytesAllocated() } }); - if (attributes[i].string_arena) - bytes_allocated += attributes[i].string_arena->size(); - bytes_allocated += sizeof(attributes[i].is_nullable_set); if (attributes[i].is_nullable_set.has_value()) bytes_allocated = attributes[i].is_nullable_set->getBufferSizeInBytes(); } - bytes_allocated += complex_key_arena.size(); + bytes_allocated += string_arena.size(); if (update_field_loaded_block) bytes_allocated += update_field_loaded_block->allocatedBytes(); diff --git a/src/Dictionaries/HashedDictionary.h b/src/Dictionaries/HashedDictionary.h index 6f63c5ec546..c1761944b14 100644 --- a/src/Dictionaries/HashedDictionary.h +++ b/src/Dictionaries/HashedDictionary.h @@ -173,8 +173,6 @@ private: CollectionType, CollectionType> container; - - std::unique_ptr string_arena; }; void createAttributes(); @@ -202,8 +200,6 @@ private: void resize(size_t added_rows); - StringRef copyKeyInArena(StringRef key); - const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; const HashedDictionaryStorageConfiguration configuration; @@ -217,7 +213,7 @@ private: mutable std::atomic found_count{0}; BlockPtr update_field_loaded_block; - Arena complex_key_arena; + Arena string_arena; }; 
extern template class HashedDictionary; diff --git a/src/Dictionaries/MongoDBDictionarySource.cpp b/src/Dictionaries/MongoDBDictionarySource.cpp index 1e8be726941..fb637263cf4 100644 --- a/src/Dictionaries/MongoDBDictionarySource.cpp +++ b/src/Dictionaries/MongoDBDictionarySource.cpp @@ -8,6 +8,9 @@ namespace DB { +static const std::unordered_set dictionary_allowed_keys = { + "host", "port", "user", "password", "db", "database", "uri", "collection", "name", "method"}; + void registerDictionarySourceMongoDB(DictionarySourceFactory & factory) { auto create_mongo_db_dictionary = []( @@ -21,10 +24,11 @@ void registerDictionarySourceMongoDB(DictionarySourceFactory & factory) { const auto config_prefix = root_config_prefix + ".mongodb"; ExternalDataSourceConfiguration configuration; - auto named_collection = getExternalDataSourceConfiguration(config, config_prefix, context); + auto has_config_key = [](const String & key) { return dictionary_allowed_keys.contains(key); }; + auto named_collection = getExternalDataSourceConfiguration(config, config_prefix, context, has_config_key); if (named_collection) { - configuration = *named_collection; + configuration = named_collection->configuration; } else { diff --git a/src/Dictionaries/MySQLDictionarySource.cpp b/src/Dictionaries/MySQLDictionarySource.cpp index 0bf5cc3cae0..a291fcea47f 100644 --- a/src/Dictionaries/MySQLDictionarySource.cpp +++ b/src/Dictionaries/MySQLDictionarySource.cpp @@ -30,6 +30,18 @@ namespace ErrorCodes extern const int UNSUPPORTED_METHOD; } +static const std::unordered_set dictionary_allowed_keys = { + "host", "port", "user", "password", + "db", "database", "table", "schema", + "update_field", "invalidate_query", "priority", + "update_tag", "dont_check_update_time", + "query", "where", "name" /* name_collection */, "socket", + "share_connection", "fail_on_connection_loss", "close_connection", + "ssl_ca", "ssl_cert", "ssl_key", + "enable_local_infile", "opt_reconnect", + "connect_timeout", "mysql_connect_timeout", + "mysql_rw_timeout", "rw_timeout"}; + void registerDictionarySourceMysql(DictionarySourceFactory & factory) { auto create_table_source = [=]([[maybe_unused]] const DictionaryStructure & dict_struct, @@ -48,16 +60,25 @@ void registerDictionarySourceMysql(DictionarySourceFactory & factory) auto settings_config_prefix = config_prefix + ".mysql"; std::shared_ptr pool; + MySQLSettings mysql_settings; + auto has_config_key = [&](const String & key) + { + return dictionary_allowed_keys.contains(key) || key.starts_with("replica") || mysql_settings.has(key); + }; StorageMySQLConfiguration configuration; - auto named_collection = created_from_ddl ? getExternalDataSourceConfiguration(config, settings_config_prefix, global_context) : std::nullopt; + auto named_collection = created_from_ddl + ? 
getExternalDataSourceConfiguration(config, settings_config_prefix, global_context, has_config_key, mysql_settings) + : std::nullopt; if (named_collection) { - configuration.set(*named_collection); + mysql_settings.applyChanges(named_collection->settings_changes); + configuration.set(named_collection->configuration); configuration.addresses = {std::make_pair(configuration.host, configuration.port)}; - MySQLSettings mysql_settings; const auto & settings = global_context->getSettingsRef(); - mysql_settings.connect_timeout = settings.external_storage_connect_timeout_sec; - mysql_settings.read_write_timeout = settings.external_storage_rw_timeout_sec; + if (!mysql_settings.isChanged("connect_timeout")) + mysql_settings.connect_timeout = settings.external_storage_connect_timeout_sec; + if (!mysql_settings.isChanged("read_write_timeout")) + mysql_settings.read_write_timeout = settings.external_storage_rw_timeout_sec; pool = std::make_shared(createMySQLPoolWithFailover(configuration, mysql_settings)); } else diff --git a/src/Dictionaries/PolygonDictionary.h b/src/Dictionaries/PolygonDictionary.h index 762c136b8e0..50810e250cb 100644 --- a/src/Dictionaries/PolygonDictionary.h +++ b/src/Dictionaries/PolygonDictionary.h @@ -3,16 +3,14 @@ #include #include #include -#include -#include -#include #include #include -#include "DictionaryStructure.h" -#include "IDictionary.h" -#include "IDictionarySource.h" -#include "DictionaryHelpers.h" +#include +#include +#include +#include + namespace DB { diff --git a/src/Dictionaries/PostgreSQLDictionarySource.cpp b/src/Dictionaries/PostgreSQLDictionarySource.cpp index 0ac84b35048..9af3ea06838 100644 --- a/src/Dictionaries/PostgreSQLDictionarySource.cpp +++ b/src/Dictionaries/PostgreSQLDictionarySource.cpp @@ -28,6 +28,10 @@ namespace ErrorCodes static const UInt64 max_block_size = 8192; +static const std::unordered_set dictionary_allowed_keys = { + "host", "port", "user", "password", "db", "database", "table", "schema", + "update_field", "update_tag", "invalidate_query", "query", "where", "name", "priority"}; + namespace { ExternalQueryBuilder makeExternalQueryBuilder(const DictionaryStructure & dict_struct, const String & schema, const String & table, const String & query, const String & where) @@ -185,8 +189,8 @@ void registerDictionarySourcePostgreSQL(DictionarySourceFactory & factory) { #if USE_LIBPQXX const auto settings_config_prefix = config_prefix + ".postgresql"; - - auto configuration = getExternalDataSourceConfigurationByPriority(config, settings_config_prefix, context); + auto has_config_key = [](const String & key) { return dictionary_allowed_keys.contains(key) || key.starts_with("replica"); }; + auto configuration = getExternalDataSourceConfigurationByPriority(config, settings_config_prefix, context, has_config_key); auto pool = std::make_shared( configuration.replicas_configurations, context->getSettingsRef().postgresql_connection_pool_size, diff --git a/src/Dictionaries/RangeHashedDictionary.cpp b/src/Dictionaries/RangeHashedDictionary.cpp index 7dc955eb8f7..9dcc38dc4b2 100644 --- a/src/Dictionaries/RangeHashedDictionary.cpp +++ b/src/Dictionaries/RangeHashedDictionary.cpp @@ -345,9 +345,6 @@ void RangeHashedDictionary::calculateBytesAllocated() const auto & collection = std::get>(attribute.maps); bytes_allocated += sizeof(CollectionType) + collection.getBufferSizeInBytes(); bucket_count = collection.getBufferSizeInCells(); - - if constexpr (std::is_same_v) - bytes_allocated += sizeof(Arena) + attribute.string_arena->size(); }; 
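These dictionary-source changes pass an allow-list predicate (has_config_key) into getExternalDataSourceConfiguration, so only recognised keys, per-replica sub-sections and engine settings are accepted from a named collection. A minimal sketch of that validation idea; the key names below are illustrative, not the real MySQL key set:

    #include <iostream>
    #include <string>
    #include <unordered_set>

    int main()
    {
        // Hypothetical allow-list; each dictionary source defines its own dictionary_allowed_keys.
        static const std::unordered_set<std::string> dictionary_allowed_keys = {
            "host", "port", "user", "password", "db", "table"};

        auto has_config_key = [&](const std::string & key)
        {
            // Accept explicitly allowed keys plus any per-replica sub-sections.
            return dictionary_allowed_keys.contains(key) || key.starts_with("replica");
        };

        for (const std::string key : {"host", "replica1", "unexpected_key"})
            std::cout << key << " -> " << (has_config_key(key) ? "accepted" : "rejected") << '\n';
    }

For MySQL the settings carried by the named collection are applied first, and the connect/read-write timeouts fall back to the server-level settings only when they were not explicitly changed.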
callOnDictionaryAttributeType(attribute.type, type_call); @@ -358,12 +355,14 @@ void RangeHashedDictionary::calculateBytesAllocated() if (update_field_loaded_block) bytes_allocated += update_field_loaded_block->allocatedBytes(); + + bytes_allocated += string_arena.size(); } template typename RangeHashedDictionary::Attribute RangeHashedDictionary::createAttribute(const DictionaryAttribute & dictionary_attribute) { - Attribute attribute{dictionary_attribute.underlying_type, dictionary_attribute.is_nullable, {}, {}}; + Attribute attribute{dictionary_attribute.underlying_type, dictionary_attribute.is_nullable, {}}; auto type_call = [&](const auto &dictionary_attribute_type) { @@ -371,9 +370,6 @@ typename RangeHashedDictionary::Attribute RangeHashedDictio using AttributeType = typename Type::AttributeType; using ValueType = DictionaryValueType; - if constexpr (std::is_same_v) - attribute.string_arena = std::make_unique(); - attribute.maps = CollectionType(); }; @@ -544,7 +540,7 @@ void RangeHashedDictionary::blockToAttributes(const Block & } if constexpr (std::is_same_v) - key = copyKeyInArena(key); + key = copyStringInArena(string_arena, key); setAttributeValue(attribute, key, Range{lower_bound, upper_bound}, attribute_column[key_index]); keys_extractor.rollbackCurrentKey(); @@ -572,8 +568,7 @@ void RangeHashedDictionary::setAttributeValueImpl(Attribute if constexpr (std::is_same_v) { const auto & string = value.get(); - const auto * string_in_arena = attribute.string_arena->insert(string.data(), string.size()); - const StringRef string_ref{string_in_arena, string.size()}; + StringRef string_ref = copyStringInArena(string_arena, string); value_to_insert = Value{ range, { string_ref }}; } else @@ -671,16 +666,6 @@ void RangeHashedDictionary::getKeysAndDates( } } -template -StringRef RangeHashedDictionary::copyKeyInArena(StringRef key) -{ - size_t key_size = key.size; - char * place_for_key = complex_key_arena.alloc(key_size); - memcpy(reinterpret_cast(place_for_key), reinterpret_cast(key.data), key_size); - StringRef updated_key{place_for_key, key_size}; - return updated_key; -} - template template PaddedPODArray RangeHashedDictionary::makeDateKeys( diff --git a/src/Dictionaries/RangeHashedDictionary.h b/src/Dictionaries/RangeHashedDictionary.h index fca72d5d7cc..a9b41a4c4d0 100644 --- a/src/Dictionaries/RangeHashedDictionary.h +++ b/src/Dictionaries/RangeHashedDictionary.h @@ -139,7 +139,6 @@ private: CollectionType, CollectionType> maps; - std::unique_ptr string_arena; }; void createAttributes(); @@ -162,9 +161,9 @@ private: void blockToAttributes(const Block & block); template - static void setAttributeValueImpl(Attribute & attribute, KeyType key, const Range & range, const Field & value); + void setAttributeValueImpl(Attribute & attribute, KeyType key, const Range & range, const Field & value); - static void setAttributeValue(Attribute & attribute, KeyType key, const Range & range, const Field & value); + void setAttributeValue(Attribute & attribute, KeyType key, const Range & range, const Field & value); template void getKeysAndDates( @@ -184,8 +183,6 @@ private: const PaddedPODArray & block_start_dates, const PaddedPODArray & block_end_dates) const; - StringRef copyKeyInArena(StringRef key); - const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; const DictionaryLifetime dict_lifetime; @@ -200,6 +197,7 @@ private: size_t bucket_count = 0; mutable std::atomic query_count{0}; mutable std::atomic found_count{0}; + Arena string_arena; }; } diff --git 
a/src/Dictionaries/SSDCacheDictionaryStorage.h b/src/Dictionaries/SSDCacheDictionaryStorage.h index 7f0ecdb5cb8..e30b0a257d9 100644 --- a/src/Dictionaries/SSDCacheDictionaryStorage.h +++ b/src/Dictionaries/SSDCacheDictionaryStorage.h @@ -1148,10 +1148,7 @@ private: if constexpr (dictionary_key_type == DictionaryKeyType::Complex) { /// Copy complex key into arena and put in cache - size_t key_size = key.size; - char * place_for_key = complex_key_arena.alloc(key_size); - memcpy(reinterpret_cast(place_for_key), reinterpret_cast(key.data), key_size); - KeyType updated_key{place_for_key, key_size}; + KeyType updated_key = copyStringInArena(complex_key_arena, key); ssd_cache_key.key = updated_key; } diff --git a/src/Dictionaries/getDictionaryConfigurationFromAST.cpp b/src/Dictionaries/getDictionaryConfigurationFromAST.cpp index 36a462c533e..d1ce665d002 100644 --- a/src/Dictionaries/getDictionaryConfigurationFromAST.cpp +++ b/src/Dictionaries/getDictionaryConfigurationFromAST.cpp @@ -35,7 +35,13 @@ namespace ErrorCodes namespace { -using NamesToTypeNames = std::unordered_map; +struct AttributeConfiguration +{ + std::string type; + std::string expression; +}; + +using AttributeNameToConfiguration = std::unordered_map; /// Get value from field and convert it to string. /// Also remove quotes from strings. @@ -46,6 +52,21 @@ String getFieldAsString(const Field & field) return applyVisitor(FieldVisitorToString(), field); } +String getAttributeExpression(const ASTDictionaryAttributeDeclaration * dict_attr) +{ + if (!dict_attr->expression) + return {}; + + /// EXPRESSION PROPERTY should be expression or string + String expression_str; + if (const auto * literal = dict_attr->expression->as(); literal && literal->value.getType() == Field::Types::String) + expression_str = getFieldAsString(literal->value); + else + expression_str = queryToString(dict_attr->expression); + + return expression_str; +} + using namespace Poco; using namespace Poco::XML; @@ -63,20 +84,19 @@ void buildLifetimeConfiguration( AutoPtr root, const ASTDictionaryLifetime * lifetime) { + if (!lifetime) + return; - if (lifetime) - { - AutoPtr lifetime_element(doc->createElement("lifetime")); - AutoPtr min_element(doc->createElement("min")); - AutoPtr max_element(doc->createElement("max")); - AutoPtr min_sec(doc->createTextNode(toString(lifetime->min_sec))); - min_element->appendChild(min_sec); - AutoPtr max_sec(doc->createTextNode(toString(lifetime->max_sec))); - max_element->appendChild(max_sec); - lifetime_element->appendChild(min_element); - lifetime_element->appendChild(max_element); - root->appendChild(lifetime_element); - } + AutoPtr lifetime_element(doc->createElement("lifetime")); + AutoPtr min_element(doc->createElement("min")); + AutoPtr max_element(doc->createElement("max")); + AutoPtr min_sec(doc->createTextNode(toString(lifetime->min_sec))); + min_element->appendChild(min_sec); + AutoPtr max_sec(doc->createTextNode(toString(lifetime->max_sec))); + max_element->appendChild(max_sec); + lifetime_element->appendChild(min_element); + lifetime_element->appendChild(max_element); + root->appendChild(lifetime_element); } /* Transforms next definition @@ -105,40 +125,43 @@ void buildLayoutConfiguration( AutoPtr layout_type_element(doc->createElement(layout->layout_type)); layout_element->appendChild(layout_type_element); - if (layout->parameters) + if (!layout->parameters) + return; + + for (const auto & param : layout->parameters->children) { - for (const auto & param : layout->parameters->children) + const ASTPair * pair = 
param->as(); + if (!pair) { - const ASTPair * pair = param->as(); - if (!pair) - { - throw DB::Exception(ErrorCodes::BAD_ARGUMENTS, "Dictionary layout parameters must be key/value pairs, got '{}' instead", - param->formatForErrorMessage()); - } - - const ASTLiteral * value_literal = pair->second->as(); - if (!value_literal) - { - throw DB::Exception(ErrorCodes::BAD_ARGUMENTS, - "Dictionary layout parameter value must be a literal, got '{}' instead", - pair->second->formatForErrorMessage()); - } - - const auto value_field = value_literal->value; - - if (value_field.getType() != Field::Types::UInt64 - && value_field.getType() != Field::Types::String) - { - throw DB::Exception(ErrorCodes::BAD_ARGUMENTS, - "Dictionary layout parameter value must be an UInt64 or String, got '{}' instead", - value_field.getTypeName()); - } - - AutoPtr layout_type_parameter_element(doc->createElement(pair->first)); - AutoPtr value_to_append(doc->createTextNode(toString(value_field))); - layout_type_parameter_element->appendChild(value_to_append); - layout_type_element->appendChild(layout_type_parameter_element); + throw DB::Exception( + ErrorCodes::BAD_ARGUMENTS, + "Dictionary layout parameters must be key/value pairs, got '{}' instead", + param->formatForErrorMessage()); } + + const ASTLiteral * value_literal = pair->second->as(); + if (!value_literal) + { + throw DB::Exception( + ErrorCodes::BAD_ARGUMENTS, + "Dictionary layout parameter value must be a literal, got '{}' instead", + pair->second->formatForErrorMessage()); + } + + const auto value_field = value_literal->value; + + if (value_field.getType() != Field::Types::UInt64 && value_field.getType() != Field::Types::String) + { + throw DB::Exception( + ErrorCodes::BAD_ARGUMENTS, + "Dictionary layout parameter value must be an UInt64 or String, got '{}' instead", + value_field.getTypeName()); + } + + AutoPtr layout_type_parameter_element(doc->createElement(pair->first)); + AutoPtr value_to_append(doc->createTextNode(toString(value_field))); + layout_type_parameter_element->appendChild(value_to_append); + layout_type_element->appendChild(layout_type_parameter_element); } } @@ -149,10 +172,10 @@ void buildLayoutConfiguration( * StartDate * EndDate */ -void buildRangeConfiguration(AutoPtr doc, AutoPtr root, const ASTDictionaryRange * range, const NamesToTypeNames & all_attrs) +void buildRangeConfiguration(AutoPtr doc, AutoPtr root, const ASTDictionaryRange * range, const AttributeNameToConfiguration & all_attrs) { // appends value to root - auto append_element = [&doc, &root](const std::string & key, const std::string & name, const std::string & type) + auto append_element = [&doc, &root](const std::string & key, const std::string & name, const AttributeConfiguration & configuration) { AutoPtr element(doc->createElement(key)); AutoPtr name_node(doc->createElement("name")); @@ -161,22 +184,33 @@ void buildRangeConfiguration(AutoPtr doc, AutoPtr root, const element->appendChild(name_node); AutoPtr type_node(doc->createElement("type")); - AutoPtr type_text(doc->createTextNode(type)); + AutoPtr type_text(doc->createTextNode(configuration.type)); type_node->appendChild(type_text); element->appendChild(type_node); + if (!configuration.expression.empty()) + { + AutoPtr expression_node(doc->createElement("expression")); + AutoPtr expression_text(doc->createTextNode(configuration.expression)); + expression_node->appendChild(expression_text); + element->appendChild(expression_node); + } + root->appendChild(element); }; - if (!all_attrs.count(range->min_attr_name)) + auto 
range_min_attribute_it = all_attrs.find(range->min_attr_name); + if (range_min_attribute_it == all_attrs.end()) throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, - "MIN ({}) attribute is not defined in the dictionary attributes", range->min_attr_name); - if (!all_attrs.count(range->max_attr_name)) - throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, - "MAX ({}) attribute is not defined in the dictionary attributes", range->max_attr_name); + "MIN {} attribute is not defined in the dictionary attributes", range->min_attr_name); - append_element("range_min", range->min_attr_name, all_attrs.at(range->min_attr_name)); - append_element("range_max", range->max_attr_name, all_attrs.at(range->max_attr_name)); + auto range_max_attribute_it = all_attrs.find(range->min_attr_name); + if (range_max_attribute_it == all_attrs.end()) + throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, + "MAX {} attribute is not defined in the dictionary attributes", range->max_attr_name); + + append_element("range_min", range->min_attr_name, range_min_attribute_it->second); + append_element("range_max", range->max_attr_name, range_max_attribute_it->second); } @@ -199,25 +233,14 @@ void buildAttributeExpressionIfNeeded( AutoPtr root, const ASTDictionaryAttributeDeclaration * dict_attr) { - if (dict_attr->expression != nullptr) - { - AutoPtr expression_element(doc->createElement("expression")); + if (!dict_attr->expression) + return; - /// EXPRESSION PROPERTY should be expression or string - String expression_str; - if (const auto * literal = dict_attr->expression->as(); - literal && literal->value.getType() == Field::Types::String) - { - expression_str = getFieldAsString(literal->value); - } - else - expression_str = queryToString(dict_attr->expression); - - - AutoPtr expression(doc->createTextNode(expression_str)); - expression_element->appendChild(expression); - root->appendChild(expression_element); - } + AutoPtr expression_element(doc->createElement("expression")); + String expression_str = getAttributeExpression(dict_attr); + AutoPtr expression(doc->createTextNode(expression_str)); + expression_element->appendChild(expression); + root->appendChild(expression_element); } /** Transofrms single dictionary attribute to configuration @@ -373,25 +396,28 @@ void buildPrimaryKeyConfiguration( /** Transforms list of ASTDictionaryAttributeDeclarations to list of dictionary attributes */ -NamesToTypeNames buildDictionaryAttributesConfiguration( +AttributeNameToConfiguration buildDictionaryAttributesConfiguration( AutoPtr doc, AutoPtr root, const ASTExpressionList * dictionary_attributes, const Names & key_columns) { const auto & children = dictionary_attributes->children; - NamesToTypeNames attributes_names_and_types; + AttributeNameToConfiguration attributes_name_to_configuration; + for (const auto & child : children) { const ASTDictionaryAttributeDeclaration * dict_attr = child->as(); if (!dict_attr->type) throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Dictionary attribute must has type"); - attributes_names_and_types.emplace(dict_attr->name, queryToString(dict_attr->type)); + AttributeConfiguration attribute_configuration {queryToString(dict_attr->type), getAttributeExpression(dict_attr)}; + attributes_name_to_configuration.emplace(dict_attr->name, std::move(attribute_configuration)); + if (std::find(key_columns.begin(), key_columns.end(), dict_attr->name) == key_columns.end()) buildSingleAttribute(doc, root, dict_attr); } - return attributes_names_and_types; + return 
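buildRangeConfiguration now looks up an AttributeConfiguration (type plus optional expression) for each range boundary and serialises the expression alongside the type. A condensed sketch of that lookup-and-validate step, using plain strings instead of the Poco XML writer; the helper name below is illustrative:

    #include <iostream>
    #include <stdexcept>
    #include <string>
    #include <unordered_map>

    struct AttributeConfiguration
    {
        std::string type;
        std::string expression;   // empty when the attribute has no EXPRESSION clause
    };

    using AttributeNameToConfiguration = std::unordered_map<std::string, AttributeConfiguration>;

    // Find a range boundary attribute or report that it is missing from the attribute list.
    const AttributeConfiguration & findRangeAttribute(
        const AttributeNameToConfiguration & all_attrs, const std::string & attr_name, const std::string & boundary)
    {
        auto it = all_attrs.find(attr_name);
        if (it == all_attrs.end())
            throw std::runtime_error(boundary + " (" + attr_name + ") attribute is not defined in the dictionary attributes");
        return it->second;
    }

    int main()
    {
        AttributeNameToConfiguration all_attrs{
            {"start_date", {"Date", ""}},
            {"end_date", {"Date", "toDate(end_ts)"}}};

        const auto & range_min = findRangeAttribute(all_attrs, "start_date", "MIN");
        const auto & range_max = findRangeAttribute(all_attrs, "end_date", "MAX");

        std::cout << "range_min type: " << range_min.type << '\n';
        std::cout << "range_max type: " << range_max.type
                  << (range_max.expression.empty() ? "" : ", expression: " + range_max.expression) << '\n';
    }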
attributes_name_to_configuration; } /** Transform function with key-value arguments to configuration @@ -513,10 +539,10 @@ void checkAST(const ASTCreateQuery & query) throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Cannot create dictionary with empty source"); } -void checkPrimaryKey(const NamesToTypeNames & all_attrs, const Names & key_attrs) +void checkPrimaryKey(const AttributeNameToConfiguration & all_attrs, const Names & key_attrs) { for (const auto & key_attr : key_attrs) - if (all_attrs.count(key_attr) == 0) + if (all_attrs.find(key_attr) == all_attrs.end()) throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Unknown key attribute '{}'", key_attr); } diff --git a/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.cpp b/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.cpp index e2ee6ee0153..31e85442c6a 100644 --- a/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.cpp +++ b/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.cpp @@ -62,7 +62,8 @@ DiskAzureBlobStorage::DiskAzureBlobStorage( std::unique_ptr DiskAzureBlobStorage::readFile( const String & path, const ReadSettings & read_settings, - std::optional /*estimated_size*/) const + std::optional, + std::optional) const { auto settings = current_settings.get(); auto metadata = readMeta(path); diff --git a/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.h b/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.h index f90ede1add9..63c3c735812 100644 --- a/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.h +++ b/src/Disks/AzureBlobStorage/DiskAzureBlobStorage.h @@ -50,7 +50,8 @@ public: std::unique_ptr readFile( const String & path, const ReadSettings & settings, - std::optional estimated_size) const override; + std::optional read_hint, + std::optional file_size) const override; std::unique_ptr writeFile( const String & path, diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp index b09487c17bc..f741b8242f5 100644 --- a/src/Disks/DiskCacheWrapper.cpp +++ b/src/Disks/DiskCacheWrapper.cpp @@ -86,15 +86,16 @@ std::unique_ptr DiskCacheWrapper::readFile( const String & path, const ReadSettings & settings, - std::optional size) const + std::optional read_hint, + std::optional file_size) const { if (!cache_file_predicate(path)) - return DiskDecorator::readFile(path, settings, size); + return DiskDecorator::readFile(path, settings, read_hint, file_size); LOG_TEST(log, "Read file {} from cache", backQuote(path)); if (cache_disk->exists(path)) - return cache_disk->readFile(path, settings, size); + return cache_disk->readFile(path, settings, read_hint, file_size); auto metadata = acquireDownloadMetadata(path); @@ -128,7 +129,7 @@ DiskCacheWrapper::readFile( auto tmp_path = path + ".tmp"; { - auto src_buffer = DiskDecorator::readFile(path, settings, size); + auto src_buffer = DiskDecorator::readFile(path, settings, read_hint, file_size); auto dst_buffer = cache_disk->writeFile(tmp_path, settings.local_fs_buffer_size, WriteMode::Rewrite); copyData(*src_buffer, *dst_buffer); } @@ -152,9 +153,9 @@ DiskCacheWrapper::readFile( } if (metadata->status == DOWNLOADED) - return cache_disk->readFile(path, settings, size); + return cache_disk->readFile(path, settings, read_hint, file_size); - return DiskDecorator::readFile(path, settings, size); + return DiskDecorator::readFile(path, settings, read_hint, file_size); } std::unique_ptr @@ -174,7 +175,7 @@ DiskCacheWrapper::writeFile(const String & path, size_t buf_size, WriteMode mode [this, path, buf_size, mode]() { /// Copy file from cache to actual disk when cached buffer 
is finalized. - auto src_buffer = cache_disk->readFile(path, ReadSettings(), /* size= */ {}); + auto src_buffer = cache_disk->readFile(path, ReadSettings(), /* read_hint= */ {}, /* file_size= */ {}); auto dst_buffer = DiskDecorator::writeFile(path, buf_size, mode); copyData(*src_buffer, *dst_buffer); dst_buffer->finalize(); diff --git a/src/Disks/DiskCacheWrapper.h b/src/Disks/DiskCacheWrapper.h index 8b15a8875be..6eb79114a54 100644 --- a/src/Disks/DiskCacheWrapper.h +++ b/src/Disks/DiskCacheWrapper.h @@ -37,7 +37,8 @@ public: std::unique_ptr readFile( const String & path, const ReadSettings & settings, - std::optional size) const override; + std::optional read_hint, + std::optional file_size) const override; std::unique_ptr writeFile(const String & path, size_t buf_size, WriteMode mode) override; diff --git a/src/Disks/DiskDecorator.cpp b/src/Disks/DiskDecorator.cpp index 263c6c9c0ff..d4acb6fab0d 100644 --- a/src/Disks/DiskDecorator.cpp +++ b/src/Disks/DiskDecorator.cpp @@ -115,9 +115,9 @@ void DiskDecorator::listFiles(const String & path, std::vector & file_na std::unique_ptr DiskDecorator::readFile( - const String & path, const ReadSettings & settings, std::optional size) const + const String & path, const ReadSettings & settings, std::optional read_hint, std::optional file_size) const { - return delegate->readFile(path, settings, size); + return delegate->readFile(path, settings, read_hint, file_size); } std::unique_ptr diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h index 5b88f4a36fa..ff4f16fdf3d 100644 --- a/src/Disks/DiskDecorator.h +++ b/src/Disks/DiskDecorator.h @@ -38,7 +38,8 @@ public: std::unique_ptr readFile( const String & path, const ReadSettings & settings, - std::optional size) const override; + std::optional read_hint, + std::optional file_size) const override; std::unique_ptr writeFile( const String & path, @@ -70,6 +71,20 @@ public: void startup() override; void applyNewSettings(const Poco::Util::AbstractConfiguration & config, ContextPtr context, const String & config_prefix, const DisksMap & map) override; + std::unique_ptr readMetaFile( + const String & path, + const ReadSettings & settings, + std::optional size) const override { return delegate->readMetaFile(path, settings, size); } + + std::unique_ptr writeMetaFile( + const String & path, + size_t buf_size, + WriteMode mode) override { return delegate->writeMetaFile(path, buf_size, mode); } + + void removeMetaFileIfExists(const String & path) override { delegate->removeMetaFileIfExists(path); } + + UInt32 getRefCount(const String & path) const override { return delegate->getRefCount(path); } + protected: Executor & getExecutor() override; diff --git a/src/Disks/DiskEncrypted.cpp b/src/Disks/DiskEncrypted.cpp index de569d82c60..714264b7720 100644 --- a/src/Disks/DiskEncrypted.cpp +++ b/src/Disks/DiskEncrypted.cpp @@ -252,10 +252,11 @@ void DiskEncrypted::copy(const String & from_path, const std::shared_ptr std::unique_ptr DiskEncrypted::readFile( const String & path, const ReadSettings & settings, - std::optional size) const + std::optional read_hint, + std::optional file_size) const { auto wrapped_path = wrappedPath(path); - auto buffer = delegate->readFile(wrapped_path, settings, size); + auto buffer = delegate->readFile(wrapped_path, settings, read_hint, file_size); if (buffer->eof()) { /// File is empty, that's a normal case, see DiskEncrypted::truncateFile(). 
diff --git a/src/Disks/DiskEncrypted.h b/src/Disks/DiskEncrypted.h index 5b1bd7c5c6d..d99fe17457d 100644 --- a/src/Disks/DiskEncrypted.h +++ b/src/Disks/DiskEncrypted.h @@ -120,7 +120,8 @@ public: std::unique_ptr readFile( const String & path, const ReadSettings & settings, - std::optional size) const override; + std::optional read_hint, + std::optional file_size) const override; std::unique_ptr writeFile( const String & path, diff --git a/src/Disks/DiskLocal.cpp b/src/Disks/DiskLocal.cpp index 0a0764d41b1..3428a9aef54 100644 --- a/src/Disks/DiskLocal.cpp +++ b/src/Disks/DiskLocal.cpp @@ -86,6 +86,22 @@ static void loadDiskLocalConfig(const String & name, } } +std::optional fileSizeSafe(const fs::path & path) +{ + std::error_code ec; + + size_t size = fs::file_size(path, ec); + if (!ec) + return size; + + if (ec == std::errc::no_such_file_or_directory) + return std::nullopt; + if (ec == std::errc::operation_not_supported) + return std::nullopt; + + throw fs::filesystem_error("DiskLocal", path, ec); +} + class DiskLocalReservation : public IReservation { public: @@ -269,9 +285,11 @@ void DiskLocal::replaceFile(const String & from_path, const String & to_path) fs::rename(from_file, to_file); } -std::unique_ptr DiskLocal::readFile(const String & path, const ReadSettings & settings, std::optional size) const +std::unique_ptr DiskLocal::readFile(const String & path, const ReadSettings & settings, std::optional read_hint, std::optional file_size) const { - return createReadBufferFromFileBase(fs::path(disk_path) / path, settings, size); + if (!file_size.has_value()) + file_size = fileSizeSafe(fs::path(disk_path) / path); + return createReadBufferFromFileBase(fs::path(disk_path) / path, settings, read_hint, file_size); } std::unique_ptr diff --git a/src/Disks/DiskLocal.h b/src/Disks/DiskLocal.h index 37855327578..f16497ae432 100644 --- a/src/Disks/DiskLocal.h +++ b/src/Disks/DiskLocal.h @@ -74,7 +74,8 @@ public: std::unique_ptr readFile( const String & path, const ReadSettings & settings, - std::optional size) const override; + std::optional read_hint, + std::optional file_size) const override; std::unique_ptr writeFile( const String & path, diff --git a/src/Disks/DiskMemory.cpp b/src/Disks/DiskMemory.cpp index 834ed3e0c65..abaea0846a5 100644 --- a/src/Disks/DiskMemory.cpp +++ b/src/Disks/DiskMemory.cpp @@ -315,7 +315,7 @@ void DiskMemory::replaceFileImpl(const String & from_path, const String & to_pat files.insert(std::move(node)); } -std::unique_ptr DiskMemory::readFile(const String & path, const ReadSettings &, std::optional) const +std::unique_ptr DiskMemory::readFile(const String & path, const ReadSettings &, std::optional, std::optional) const { std::lock_guard lock(mutex); diff --git a/src/Disks/DiskMemory.h b/src/Disks/DiskMemory.h index d77161d898e..eef7b78502d 100644 --- a/src/Disks/DiskMemory.h +++ b/src/Disks/DiskMemory.h @@ -65,7 +65,8 @@ public: std::unique_ptr readFile( const String & path, const ReadSettings & settings, - std::optional size) const override; + std::optional read_hint, + std::optional file_size) const override; std::unique_ptr writeFile( const String & path, diff --git a/src/Disks/DiskRestartProxy.cpp b/src/Disks/DiskRestartProxy.cpp index a8edd15ba79..9bd59513040 100644 --- a/src/Disks/DiskRestartProxy.cpp +++ b/src/Disks/DiskRestartProxy.cpp @@ -190,10 +190,10 @@ void DiskRestartProxy::listFiles(const String & path, std::vector & file } std::unique_ptr DiskRestartProxy::readFile( - const String & path, const ReadSettings & settings, std::optional size) const + 
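Across the disk implementations the single estimated_size argument of readFile is split into read_hint and file_size, and DiskLocal resolves the size up front when the caller did not supply one. A self-contained sketch of that size probe, in the spirit of fileSizeSafe, with std::optional<size_t> spelled out since the template arguments are elided in the diff text above:

    #include <filesystem>
    #include <iostream>
    #include <optional>
    #include <string>
    #include <system_error>

    namespace fs = std::filesystem;

    // Return the size if it can be queried, nullopt for the two benign error cases,
    // and throw for anything else.
    std::optional<size_t> fileSizeSafe(const fs::path & path)
    {
        std::error_code ec;
        size_t size = fs::file_size(path, ec);
        if (!ec)
            return size;
        if (ec == std::errc::no_such_file_or_directory || ec == std::errc::operation_not_supported)
            return std::nullopt;
        throw fs::filesystem_error("fileSizeSafe", path, ec);
    }

    int main()
    {
        std::optional<size_t> read_hint;                       // caller's guess at how much will be read
        std::optional<size_t> file_size = fileSizeSafe("/etc/hostname");

        std::cout << "read_hint: " << (read_hint ? std::to_string(*read_hint) : "none") << '\n';
        std::cout << "file_size: " << (file_size ? std::to_string(*file_size) : "unknown") << '\n';
    }

Decorators such as DiskCacheWrapper and DiskRestartProxy simply forward both optionals to the wrapped disk.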
const String & path, const ReadSettings & settings, std::optional read_hint, std::optional file_size) const { ReadLock lock (mutex); - auto impl = DiskDecorator::readFile(path, settings, size); + auto impl = DiskDecorator::readFile(path, settings, read_hint, file_size); return std::make_unique(*this, std::move(impl)); } diff --git a/src/Disks/DiskRestartProxy.h b/src/Disks/DiskRestartProxy.h index e8b5891947a..3644539e941 100644 --- a/src/Disks/DiskRestartProxy.h +++ b/src/Disks/DiskRestartProxy.h @@ -46,7 +46,8 @@ public: std::unique_ptr readFile( const String & path, const ReadSettings & settings, - std::optional size) const override; + std::optional read_hint, + std::optional file_size) const override; std::unique_ptr writeFile(const String & path, size_t buf_size, WriteMode mode) override; void removeFile(const String & path) override; void removeFileIfExists(const String & path) override; diff --git a/src/Disks/DiskWebServer.cpp b/src/Disks/DiskWebServer.cpp index 63e1cc0e6c5..7c94a5b98b1 100644 --- a/src/Disks/DiskWebServer.cpp +++ b/src/Disks/DiskWebServer.cpp @@ -154,7 +154,7 @@ bool DiskWebServer::exists(const String & path) const } -std::unique_ptr DiskWebServer::readFile(const String & path, const ReadSettings & read_settings, std::optional) const +std::unique_ptr DiskWebServer::readFile(const String & path, const ReadSettings & read_settings, std::optional, std::optional) const { LOG_TRACE(log, "Read from path: {}", path); auto iter = files.find(path); diff --git a/src/Disks/DiskWebServer.h b/src/Disks/DiskWebServer.h index 1a193d91adf..bda8c8adaad 100644 --- a/src/Disks/DiskWebServer.h +++ b/src/Disks/DiskWebServer.h @@ -63,7 +63,8 @@ public: std::unique_ptr readFile(const String & path, const ReadSettings & settings, - std::optional size) const override; + std::optional read_hint, + std::optional file_size) const override; /// Disk info diff --git a/src/Disks/HDFS/DiskHDFS.cpp b/src/Disks/HDFS/DiskHDFS.cpp index 41c407c10ee..572c908768b 100644 --- a/src/Disks/HDFS/DiskHDFS.cpp +++ b/src/Disks/HDFS/DiskHDFS.cpp @@ -71,7 +71,7 @@ DiskHDFS::DiskHDFS( } -std::unique_ptr DiskHDFS::readFile(const String & path, const ReadSettings & read_settings, std::optional) const +std::unique_ptr DiskHDFS::readFile(const String & path, const ReadSettings & read_settings, std::optional, std::optional) const { auto metadata = readMeta(path); diff --git a/src/Disks/HDFS/DiskHDFS.h b/src/Disks/HDFS/DiskHDFS.h index 47150f1cfd8..de373d8d6ee 100644 --- a/src/Disks/HDFS/DiskHDFS.h +++ b/src/Disks/HDFS/DiskHDFS.h @@ -53,7 +53,8 @@ public: std::unique_ptr readFile( const String & path, const ReadSettings & settings, - std::optional size) const override; + std::optional read_hint, + std::optional file_size) const override; std::unique_ptr writeFile(const String & path, size_t buf_size, WriteMode mode) override; diff --git a/src/Disks/IDisk.cpp b/src/Disks/IDisk.cpp index 42d5f5fce10..b1d7b33fec3 100644 --- a/src/Disks/IDisk.cpp +++ b/src/Disks/IDisk.cpp @@ -86,4 +86,28 @@ SyncGuardPtr IDisk::getDirectorySyncGuard(const String & /* path */) const return nullptr; } +std::unique_ptr IDisk::readMetaFile( + const String & path, + const ReadSettings & settings, + std::optional size) const +{ + LOG_TRACE(&Poco::Logger::get("IDisk"), "Read local metafile: {}", path); + return readFile(path, settings, size); +} + +std::unique_ptr IDisk::writeMetaFile( + const String & path, + size_t buf_size, + WriteMode mode) +{ + LOG_TRACE(&Poco::Logger::get("IDisk"), "Write local metafile: {}", path); + return writeFile(path, 
buf_size, mode); +} + +void IDisk::removeMetaFileIfExists(const String & path) +{ + LOG_TRACE(&Poco::Logger::get("IDisk"), "Remove local metafile: {}", path); + removeFileIfExists(path); +} + } diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 0a63421ae5c..665a35459c7 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -161,7 +161,8 @@ public: virtual std::unique_ptr readFile( const String & path, const ReadSettings & settings = ReadSettings{}, - std::optional size = {}) const = 0; + std::optional read_hint = {}, + std::optional file_size = {}) const = 0; /// Open the file for write and return WriteBufferFromFileBase object. virtual std::unique_ptr writeFile( @@ -247,6 +248,28 @@ public: /// Applies new settings for disk in runtime. virtual void applyNewSettings(const Poco::Util::AbstractConfiguration &, ContextPtr, const String &, const DisksMap &) {} + /// Open the local file for read and return ReadBufferFromFileBase object. + /// Overridden in IDiskRemote. + /// Used for work with custom metadata. + virtual std::unique_ptr readMetaFile( + const String & path, + const ReadSettings & settings, + std::optional size) const; + + /// Open the local file for write and return WriteBufferFromFileBase object. + /// Overridden in IDiskRemote. + /// Used for work with custom metadata. + virtual std::unique_ptr writeMetaFile( + const String & path, + size_t buf_size, + WriteMode mode); + + virtual void removeMetaFileIfExists(const String & path); + + /// Return reference count for remote FS. + /// Overridden in IDiskRemote. + virtual UInt32 getRefCount(const String &) const { return 0; } + protected: friend class DiskDecorator; diff --git a/src/Disks/IDiskRemote.cpp b/src/Disks/IDiskRemote.cpp index 848726f957d..706f0f84f32 100644 --- a/src/Disks/IDiskRemote.cpp +++ b/src/Disks/IDiskRemote.cpp @@ -484,6 +484,7 @@ bool IDiskRemote::tryReserve(UInt64 bytes) String IDiskRemote::getUniqueId(const String & path) const { + LOG_TRACE(log, "Remote path: {}, Path: {}", remote_fs_root_path, path); Metadata metadata(remote_fs_root_path, metadata_disk, path); String id; if (!metadata.remote_fs_objects.empty()) @@ -500,4 +501,34 @@ AsynchronousReaderPtr IDiskRemote::getThreadPoolReader() return reader; } +std::unique_ptr IDiskRemote::readMetaFile( + const String & path, + const ReadSettings & settings, + std::optional size) const +{ + LOG_TRACE(log, "Read metafile: {}", path); + return metadata_disk->readFile(path, settings, size); +} + +std::unique_ptr IDiskRemote::writeMetaFile( + const String & path, + size_t buf_size, + WriteMode mode) +{ + LOG_TRACE(log, "Write metafile: {}", path); + return metadata_disk->writeFile(path, buf_size, mode); +} + +void IDiskRemote::removeMetaFileIfExists(const String & path) +{ + LOG_TRACE(log, "Remove metafile: {}", path); + return metadata_disk->removeFileIfExists(path); +} + +UInt32 IDiskRemote::getRefCount(const String & path) const +{ + auto meta = readMeta(path); + return meta.ref_count; +} + } diff --git a/src/Disks/IDiskRemote.h b/src/Disks/IDiskRemote.h index c6a904020de..c4f475f5b3e 100644 --- a/src/Disks/IDiskRemote.h +++ b/src/Disks/IDiskRemote.h @@ -136,6 +136,21 @@ public: static AsynchronousReaderPtr getThreadPoolReader(); + virtual std::unique_ptr readMetaFile( + const String & path, + const ReadSettings & settings, + std::optional size) const override; + + virtual std::unique_ptr writeMetaFile( + const String & path, + size_t buf_size, + WriteMode mode) override; + + virtual void removeMetaFileIfExists( + const String & path) override; + + UInt32 
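IDisk gains readMetaFile/writeMetaFile/removeMetaFileIfExists with local-disk defaults that just delegate to the regular file methods, while IDiskRemote overrides them to hit its metadata disk and reads the reference count from the metadata. A stripped-down sketch of that default-plus-override layout, using toy interfaces rather than the real IDisk API:

    #include <iostream>
    #include <memory>
    #include <string>

    // Toy interface: the real IDisk returns buffer objects, here we just trace the call path.
    struct Disk
    {
        virtual ~Disk() = default;
        virtual void readFile(const std::string & path) { std::cout << "read data file " << path << '\n'; }

        // Default: metadata lives next to the data (local disks).
        virtual void readMetaFile(const std::string & path) { readFile(path); }

        // Default: no remote references to count.
        virtual unsigned getRefCount(const std::string &) const { return 0; }
    };

    struct RemoteDisk : Disk
    {
        std::shared_ptr<Disk> metadata_disk = std::make_shared<Disk>();

        // Remote disks keep small metadata files on a separate (local) disk.
        void readMetaFile(const std::string & path) override { metadata_disk->readFile(path); }
        unsigned getRefCount(const std::string &) const override { return 1; /* would come from the metadata file */ }
    };

    int main()
    {
        RemoteDisk disk;
        disk.readMetaFile("store/abc/checksums.txt");
        std::cout << "ref count: " << disk.getRefCount("store/abc/checksums.txt") << '\n';
    }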
getRefCount(const String & path) const override; + protected: Poco::Logger * log; const String name; diff --git a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h index 1b0cc17cb41..c9b81c98e61 100644 --- a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h +++ b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h @@ -32,7 +32,7 @@ public: explicit AsynchronousReadIndirectBufferFromRemoteFS( AsynchronousReaderPtr reader_, const ReadSettings & settings_, std::shared_ptr impl_, - size_t min_bytes_for_seek = 1024 * 1024); + size_t min_bytes_for_seek = DBMS_DEFAULT_BUFFER_SIZE); ~AsynchronousReadIndirectBufferFromRemoteFS() override; diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index 201334cbd12..0bebf91df97 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -214,7 +214,7 @@ void DiskS3::moveFile(const String & from_path, const String & to_path, bool sen metadata_disk->moveFile(from_path, to_path); } -std::unique_ptr DiskS3::readFile(const String & path, const ReadSettings & read_settings, std::optional) const +std::unique_ptr DiskS3::readFile(const String & path, const ReadSettings & read_settings, std::optional, std::optional) const { auto settings = current_settings.get(); auto metadata = readMeta(path); diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index 18ed733ff01..c5d0722c6c2 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -76,7 +76,8 @@ public: std::unique_ptr readFile( const String & path, const ReadSettings & settings, - std::optional size) const override; + std::optional read_hint, + std::optional file_size) const override; std::unique_ptr writeFile( const String & path, diff --git a/src/Disks/tests/gtest_disk_encrypted.cpp b/src/Disks/tests/gtest_disk_encrypted.cpp index 85dd8eb78b8..d03128a6b33 100644 --- a/src/Disks/tests/gtest_disk_encrypted.cpp +++ b/src/Disks/tests/gtest_disk_encrypted.cpp @@ -57,7 +57,7 @@ protected: String getFileContents(const String & file_name) { - auto buf = encrypted_disk->readFile(file_name, /* settings= */ {}, /* size= */ {}); + auto buf = encrypted_disk->readFile(file_name, /* settings= */ {}, /* read_hint= */ {}, /* file_size= */ {}); String str; readStringUntilEOF(str, *buf); return str; diff --git a/src/Disks/tests/gtest_disk_hdfs.cpp b/src/Disks/tests/gtest_disk_hdfs.cpp index 2864797aae3..4b5ff182256 100644 --- a/src/Disks/tests/gtest_disk_hdfs.cpp +++ b/src/Disks/tests/gtest_disk_hdfs.cpp @@ -53,7 +53,7 @@ TEST(DiskTestHDFS, WriteReadHDFS) { DB::String result; - auto in = disk.readFile(file_name, {}, 1024); + auto in = disk.readFile(file_name, {}, 1024, 1024); readString(result, *in); EXPECT_EQ("Test write to file", result); } @@ -76,7 +76,7 @@ TEST(DiskTestHDFS, RewriteFileHDFS) { String result; - auto in = disk.readFile(file_name, {}, 1024); + auto in = disk.readFile(file_name, {}, 1024, 1024); readString(result, *in); EXPECT_EQ("Text10", result); readString(result, *in); @@ -104,7 +104,7 @@ TEST(DiskTestHDFS, AppendFileHDFS) { String result, expected; - auto in = disk.readFile(file_name, {}, 1024); + auto in = disk.readFile(file_name, {}, 1024, 1024); readString(result, *in); EXPECT_EQ("Text0123456789", result); @@ -131,7 +131,7 @@ TEST(DiskTestHDFS, SeekHDFS) /// Test SEEK_SET { String buf(4, '0'); - std::unique_ptr in = disk.readFile(file_name, {}, 1024); + std::unique_ptr in = disk.readFile(file_name, {}, 1024, 1024); in->seek(5, SEEK_SET); @@ -141,7 +141,7 @@ TEST(DiskTestHDFS, SeekHDFS) 
/// Test SEEK_CUR { - std::unique_ptr in = disk.readFile(file_name, {}, 1024); + std::unique_ptr in = disk.readFile(file_name, {}, 1024, 1024); String buf(4, '0'); in->readStrict(buf.data(), 4); diff --git a/src/Formats/CapnProtoUtils.cpp b/src/Formats/CapnProtoUtils.cpp index ecfa5df8351..bed46a97c1b 100644 --- a/src/Formats/CapnProtoUtils.cpp +++ b/src/Formats/CapnProtoUtils.cpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include #include @@ -26,6 +28,7 @@ namespace ErrorCodes extern const int FILE_DOESNT_EXIST; extern const int UNKNOWN_EXCEPTION; extern const int INCORRECT_DATA; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } capnp::StructSchema CapnProtoSchemaParser::getMessageSchema(const FormatSchemaInfo & schema_info) @@ -427,6 +430,113 @@ void checkCapnProtoSchemaStructure(const capnp::StructSchema & schema, const Blo } } +template +static DataTypePtr getEnumDataTypeFromEnumerants(const capnp::EnumSchema::EnumerantList & enumerants) +{ + std::vector> values; + for (auto enumerant : enumerants) + values.emplace_back(enumerant.getProto().getName(), ValueType(enumerant.getOrdinal())); + return std::make_shared>(std::move(values)); +} + +static DataTypePtr getEnumDataTypeFromEnumSchema(const capnp::EnumSchema & enum_schema) +{ + auto enumerants = enum_schema.getEnumerants(); + if (enumerants.size() < 128) + return getEnumDataTypeFromEnumerants(enumerants); + if (enumerants.size() < 32768) + return getEnumDataTypeFromEnumerants(enumerants); + + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "ClickHouse supports only 8 and 16-but Enums"); +} + +static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type) +{ + switch (capnp_type.which()) + { + case capnp::schema::Type::INT8: + return std::make_shared(); + case capnp::schema::Type::INT16: + return std::make_shared(); + case capnp::schema::Type::INT32: + return std::make_shared(); + case capnp::schema::Type::INT64: + return std::make_shared(); + case capnp::schema::Type::BOOL: [[fallthrough]]; + case capnp::schema::Type::UINT8: + return std::make_shared(); + case capnp::schema::Type::UINT16: + return std::make_shared(); + case capnp::schema::Type::UINT32: + return std::make_shared(); + case capnp::schema::Type::UINT64: + return std::make_shared(); + case capnp::schema::Type::FLOAT32: + return std::make_shared(); + case capnp::schema::Type::FLOAT64: + return std::make_shared(); + case capnp::schema::Type::DATA: [[fallthrough]]; + case capnp::schema::Type::TEXT: + return std::make_shared(); + case capnp::schema::Type::ENUM: + return getEnumDataTypeFromEnumSchema(capnp_type.asEnum()); + case capnp::schema::Type::LIST: + { + auto list_schema = capnp_type.asList(); + auto nested_type = getDataTypeFromCapnProtoType(list_schema.getElementType()); + return std::make_shared(nested_type); + } + case capnp::schema::Type::STRUCT: + { + auto struct_schema = capnp_type.asStruct(); + + /// Check if it can be Nullable. + if (checkIfStructIsNamedUnion(struct_schema)) + { + auto fields = struct_schema.getUnionFields(); + if (fields.size() != 2 || (!fields[0].getType().isVoid() && !fields[1].getType().isVoid())) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Unions are not supported"); + auto value_type = fields[0].getType().isVoid() ? 
fields[1].getType() : fields[0].getType(); + if (value_type.isStruct() || value_type.isList()) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Tuples and Lists cannot be inside Nullable"); + + auto nested_type = getDataTypeFromCapnProtoType(value_type); + return std::make_shared(nested_type); + } + + if (checkIfStructContainsUnnamedUnion(struct_schema)) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Unnamed union is not supported"); + + /// Treat Struct as Tuple. + DataTypes nested_types; + Names nested_names; + for (auto field : struct_schema.getNonUnionFields()) + { + nested_names.push_back(field.getProto().getName()); + nested_types.push_back(getDataTypeFromCapnProtoType(field.getType())); + } + return std::make_shared(std::move(nested_types), std::move(nested_names)); + } + default: + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Unsupported CapnProtoType: {}", getCapnProtoFullTypeName(capnp_type)); + } +} + +NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema) +{ + if (checkIfStructContainsUnnamedUnion(schema)) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Unnamed union is not supported"); + + NamesAndTypesList names_and_types; + for (auto field : schema.getNonUnionFields()) + { + auto name = field.getProto().getName(); + auto type = getDataTypeFromCapnProtoType(field.getType()); + names_and_types.emplace_back(name, type); + } + return names_and_types; +} + } #endif diff --git a/src/Formats/CapnProtoUtils.h b/src/Formats/CapnProtoUtils.h index 93ca0a5e616..51c152de17f 100644 --- a/src/Formats/CapnProtoUtils.h +++ b/src/Formats/CapnProtoUtils.h @@ -38,6 +38,7 @@ capnp::DynamicValue::Reader getReaderByColumnName(const capnp::DynamicStruct::Re void checkCapnProtoSchemaStructure(const capnp::StructSchema & schema, const Block & header, FormatSettings::EnumComparingMode mode); +NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema); } #endif diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index d956d9e6bfb..0a7747fc864 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -1,7 +1,16 @@ #include +#include +#include #include +#include +#include +#include #include #include +#include +#include +#include +#include namespace DB { @@ -9,6 +18,7 @@ namespace DB namespace ErrorCodes { extern const int BAD_ARGUMENTS; + extern const int LOGICAL_ERROR; } FormatSettings::EscapingRule stringToEscapingRule(const String & escaping_rule) @@ -193,30 +203,145 @@ void writeStringByEscapingRule(const String & value, WriteBuffer & out, FormatSe } } -String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) +template +String readByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { String result; switch (escaping_rule) { case FormatSettings::EscapingRule::Quoted: - readQuotedString(result, buf); + if constexpr (read_string) + readQuotedString(result, buf); + else + readQuotedFieldIntoString(result, buf); break; case FormatSettings::EscapingRule::JSON: - readJSONString(result, buf); + if constexpr (read_string) + readJSONString(result, buf); + else + readJSONFieldIntoString(result, buf); break; case FormatSettings::EscapingRule::Raw: readString(result, buf); break; case FormatSettings::EscapingRule::CSV: - readCSVString(result, buf, format_settings.csv); + if constexpr (read_string) + 
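capnProtoSchemaToCHSchema maps Cap'n Proto types onto ClickHouse types: scalars map directly, a two-branch union with a Void member becomes Nullable, structs become Tuple, and enums become Enum8 or Enum16 depending on how many enumerants they carry. A toy illustration of the enum-width rule only, standalone and without a capnp dependency:

    #include <iostream>
    #include <stdexcept>
    #include <string>

    // Pick the ClickHouse enum width the same way the schema reader does:
    // Enum8 for fewer than 128 enumerants, Enum16 for fewer than 32768, otherwise unsupported.
    std::string enumTypeForEnumerantCount(size_t enumerant_count)
    {
        if (enumerant_count < 128)
            return "Enum8";
        if (enumerant_count < 32768)
            return "Enum16";
        throw std::runtime_error("ClickHouse supports only 8 and 16-bit Enums");
    }

    int main()
    {
        for (size_t count : {3u, 500u})
            std::cout << count << " enumerants -> " << enumTypeForEnumerantCount(count) << '\n';
    }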
readCSVString(result, buf, format_settings.csv); + else + readCSVField(result, buf, format_settings.csv); break; case FormatSettings::EscapingRule::Escaped: readEscapedString(result, buf); break; default: - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot read string with {} escaping rule", escapingRuleToString(escaping_rule)); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot read value with {} escaping rule", escapingRuleToString(escaping_rule)); } return result; } +String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) +{ + return readByEscapingRule(buf, escaping_rule, format_settings); +} + +String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) +{ + return readByEscapingRule(buf, escaping_rule, format_settings); +} + +static bool evaluateConstantExpressionFromString(const StringRef & field, DataTypePtr & type, ContextPtr context) +{ + if (!context) + throw Exception(ErrorCodes::LOGICAL_ERROR, "You must provide context to evaluate constant expression"); + + ParserExpression parser; + Expected expected; + Tokens tokens(field.data, field.data + field.size); + IParser::Pos token_iterator(tokens, context->getSettingsRef().max_parser_depth); + ASTPtr ast; + + /// FIXME: Our parser cannot parse maps in the form of '{key : value}' that is used in text formats. + bool parsed = parser.parse(token_iterator, ast, expected); + if (!parsed) + return false; + + try + { + std::pair result = evaluateConstantExpression(ast, context); + type = generalizeDataType(result.second); + return true; + } + catch (...) + { + return false; + } +} + +DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context) +{ + switch (escaping_rule) + { + case FormatSettings::EscapingRule::Quoted: + { + DataTypePtr type; + bool parsed = evaluateConstantExpressionFromString(field, type, context); + return parsed ? type : nullptr; + } + case FormatSettings::EscapingRule::JSON: + return getDataTypeFromJSONField(field); + case FormatSettings::EscapingRule::CSV: + { + if (field.empty() || field == format_settings.csv.null_representation) + return nullptr; + + if (field == format_settings.bool_false_representation || field == format_settings.bool_true_representation) + return std::make_shared(); + + DataTypePtr type; + bool parsed; + if (field[0] == '\'' || field[0] == '"') + { + /// Try to evaluate expression inside quotes. + parsed = evaluateConstantExpressionFromString(StringRef(field.data() + 1, field.size() - 2), type, context); + /// If it's a number in quotes we determine it as a string. + if (parsed && type && isNumber(removeNullable(type))) + return makeNullable(std::make_shared()); + } + else + parsed = evaluateConstantExpressionFromString(field, type, context); + + /// If we couldn't parse an expression, determine it as a string. + return parsed ? type : makeNullable(std::make_shared()); + } + case FormatSettings::EscapingRule::Raw: [[fallthrough]]; + case FormatSettings::EscapingRule::Escaped: + /// TODO: Try to use some heuristics here to determine the type of data. + return field.empty() ? 
nullptr : makeNullable(std::make_shared()); + default: + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the type for value with {} escaping rule", escapingRuleToString(escaping_rule)); + } +} + +DataTypes determineDataTypesByEscapingRule(const std::vector & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context) +{ + DataTypes data_types; + data_types.reserve(fields.size()); + for (const auto & field : fields) + data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, escaping_rule, context)); + return data_types; +} + +DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule) +{ + switch (escaping_rule) + { + case FormatSettings::EscapingRule::CSV: [[fallthrough]]; + case FormatSettings::EscapingRule::Escaped: [[fallthrough]]; + case FormatSettings::EscapingRule::Raw: + return makeNullable(std::make_shared()); + default: + return nullptr; + } +} + } diff --git a/src/Formats/EscapingRuleUtils.h b/src/Formats/EscapingRuleUtils.h index 02f027db74d..10147b29ad6 100644 --- a/src/Formats/EscapingRuleUtils.h +++ b/src/Formats/EscapingRuleUtils.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB { @@ -33,5 +34,24 @@ void serializeFieldByEscapingRule( void writeStringByEscapingRule(const String & value, WriteBuffer & out, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings); String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings); +String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings); + +/// Try to determine the type of the field written by a specific escaping rule. +/// If cannot, return nullptr. +/// - For Quoted escaping rule we can interpret a single field as a constant +/// expression and get it's type by evaluation this expression. +/// - For JSON escaping rule we can use JSON parser to parse a single field +/// and then convert JSON type of this field to ClickHouse type. +/// - For CSV escaping rule we can do the next: +/// - If the field is an unquoted string, then we could try to evaluate it +/// as a constant expression, and if it fails, treat it as a String. +/// - If the field is a string in quotes, then we can try to evaluate +/// expression inside quotes as a constant expression, and if it fails or +/// the result is a number (we don't parse numbers in quotes) we treat it as a String. 
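determineDataTypeByEscapingRule infers a column type from a single raw field: Quoted fields are evaluated as constant expressions, JSON fields go through the JSON parser, CSV fields are treated as expressions unless they are quoted values, and Raw/Escaped fields default to Nullable(String). A simplified, dependency-free sketch of the CSV branch; type names are returned as strings and the expression evaluator is faked with a digit check:

    #include <cctype>
    #include <iostream>
    #include <optional>
    #include <string>

    // Fake "constant expression" probe: a field counts as a number if every character is a digit.
    // The real code evaluates the field as an expression and generalizes the resulting type.
    std::optional<std::string> tryEvaluateType(const std::string & field)
    {
        if (field.empty())
            return std::nullopt;
        for (char c : field)
            if (!std::isdigit(static_cast<unsigned char>(c)))
                return std::nullopt;
        return std::string("Nullable(Int64)");
    }

    std::optional<std::string> determineCSVType(const std::string & field)
    {
        const std::string null_representation = "\\N";

        if (field.empty() || field == null_representation)
            return std::nullopt;                               // NULLs tell us nothing about the type
        if (field == "true" || field == "false")
            return std::string("Bool");
        if (field.front() == '\'' || field.front() == '"')
            return std::string("Nullable(String)");            // quoted values (incl. quoted numbers) stay String
        if (auto type = tryEvaluateType(field))
            return type;                                       // unquoted value parsed as an expression
        return std::string("Nullable(String)");                // fall back to String
    }

    int main()
    {
        for (const std::string field : {"42", "\"42\"", "true", "", "hello"})
        {
            auto type = determineCSVType(field);
            std::cout << '"' << field << "\" -> " << (type ? *type : "unknown") << '\n';
        }
    }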
+/// - For TSV and TSVRaw we treat each field as a String (TODO: try to use some tweaks and heuristics here) +DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context = nullptr); +DataTypes determineDataTypesByEscapingRule(const std::vector & fields, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, ContextPtr context = nullptr); + +DataTypePtr getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule escaping_rule); } diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index d292bbf551c..ae554e62651 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -1,20 +1,20 @@ #include #include -#include -#include #include #include +#include +#include #include #include -#include #include -#include #include +#include +#include #include +#include -#include -#include +#include namespace DB { @@ -119,6 +119,8 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields; format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode; format_settings.seekable_read = settings.input_format_allow_seeks; + format_settings.msgpack.number_of_columns = settings.input_format_msgpack_number_of_columns; + format_settings.max_rows_to_read_for_schema_inference = settings.input_format_max_rows_to_read_for_schema_inference; /// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context if (format_settings.schema.is_server) @@ -200,7 +202,6 @@ InputFormatPtr FormatFactory::getInput( return format; } - InputFormatPtr FormatFactory::getInputFormat( const String & name, ReadBuffer & buf, @@ -235,6 +236,18 @@ InputFormatPtr FormatFactory::getInputFormat( return format; } +static void addExistingProgressToOutputFormat(OutputFormatPtr format, ContextPtr context) +{ + auto * element_id = context->getProcessListElement(); + if (element_id) + { + /// While preparing the query there might have been progress (for example in subscalar subqueries) so add it here + auto current_progress = element_id->getProgressIn(); + Progress read_progress{current_progress.read_rows, current_progress.read_bytes, current_progress.total_rows_to_read}; + format->onProgress(read_progress); + } +} + OutputFormatPtr FormatFactory::getOutputFormatParallelIfPossible( const String & name, WriteBuffer & buf, @@ -263,7 +276,9 @@ OutputFormatPtr FormatFactory::getOutputFormatParallelIfPossible( if (context->hasQueryContext() && settings.log_queries) context->getQueryContext()->addQueryFactoriesInfo(Context::QueryLogFactories::Format, name); - return std::make_shared(builder); + auto format = std::make_shared(builder); + addExistingProgressToOutputFormat(format, context); + return format; } return getOutputFormat(name, buf, sample, context, callback, _format_settings); @@ -303,6 +318,8 @@ OutputFormatPtr FormatFactory::getOutputFormat( if (auto * mysql = typeid_cast(format.get())) mysql->setContext(context); + addExistingProgressToOutputFormat(format, context); + return format; } @@ -325,6 +342,32 @@ String FormatFactory::getContentType( return format->getContentType(); } +SchemaReaderPtr FormatFactory::getSchemaReader( + const String & name, + ReadBuffer & buf, + ContextPtr context, + const std::optional & _format_settings) const +{ + const auto & schema_reader_creator = 
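addExistingProgressToOutputFormat seeds a freshly created output format with whatever progress was already accumulated while the query was being prepared (for example in scalar subqueries), so reported progress does not restart from zero. A toy sketch of that hand-off, with hypothetical minimal Progress and OutputFormat types:

    #include <cstdint>
    #include <iostream>

    struct Progress
    {
        uint64_t read_rows = 0;
        uint64_t read_bytes = 0;
        uint64_t total_rows_to_read = 0;
    };

    struct OutputFormat
    {
        Progress progress;
        void onProgress(const Progress & value)
        {
            progress.read_rows += value.read_rows;
            progress.read_bytes += value.read_bytes;
            progress.total_rows_to_read += value.total_rows_to_read;
        }
    };

    int main()
    {
        // Progress gathered before the output format existed.
        Progress already_done{1000, 65536, 10000};

        OutputFormat format;
        format.onProgress(already_done);   // seed the format so later increments add on top

        std::cout << "rows so far: " << format.progress.read_rows
                  << " / " << format.progress.total_rows_to_read << '\n';
    }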
dict.at(name).schema_reader_creator; + if (!schema_reader_creator) + throw Exception("FormatFactory: Format " + name + " doesn't support schema inference.", ErrorCodes::LOGICAL_ERROR); + + auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context); + return schema_reader_creator(buf, format_settings, context); +} + +ExternalSchemaReaderPtr FormatFactory::getExternalSchemaReader( + const String & name, + ContextPtr context, + const std::optional & _format_settings) const +{ + const auto & external_schema_reader_creator = dict.at(name).external_schema_reader_creator; + if (!external_schema_reader_creator) + throw Exception("FormatFactory: Format " + name + " doesn't support schema inference.", ErrorCodes::LOGICAL_ERROR); + + auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context); + return external_schema_reader_creator(format_settings); +} void FormatFactory::registerInputFormat(const String & name, InputCreator input_creator) { @@ -350,6 +393,30 @@ void FormatFactory::registerOutputFormat(const String & name, OutputCreator outp target = std::move(output_creator); } +void FormatFactory::registerFileExtension(const String & extension, const String & format_name) +{ + file_extension_formats[extension] = format_name; +} + +String FormatFactory::getFormatFromFileName(String file_name) +{ + CompressionMethod compression_method = chooseCompressionMethod(file_name, ""); + if (CompressionMethod::None != compression_method) + { + auto pos = file_name.find_last_of('.'); + if (pos != String::npos) + file_name = file_name.substr(0, pos); + } + + auto pos = file_name.find_last_of('.'); + if (pos == String::npos) + return ""; + + String file_extension = file_name.substr(pos + 1, String::npos); + boost::algorithm::to_lower(file_extension); + return file_extension_formats[file_extension]; +} + void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine) { auto & target = dict[name].file_segmentation_engine; @@ -358,6 +425,21 @@ void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegm target = std::move(file_segmentation_engine); } +void FormatFactory::registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator) +{ + auto & target = dict[name].schema_reader_creator; + if (target) + throw Exception("FormatFactory: Schema reader " + name + " is already registered", ErrorCodes::LOGICAL_ERROR); + target = std::move(schema_reader_creator); +} + +void FormatFactory::registerExternalSchemaReader(const String & name, ExternalSchemaReaderCreator external_schema_reader_creator) +{ + auto & target = dict[name].external_schema_reader_creator; + if (target) + throw Exception("FormatFactory: Schema reader " + name + " is already registered", ErrorCodes::LOGICAL_ERROR); + target = std::move(external_schema_reader_creator); +} void FormatFactory::markOutputFormatSupportsParallelFormatting(const String & name) { @@ -395,6 +477,23 @@ bool FormatFactory::isOutputFormat(const String & name) const return it != dict.end() && it->second.output_creator; } +bool FormatFactory::checkIfFormatHasSchemaReader(const String & name) +{ + const auto & target = getCreators(name); + return bool(target.schema_reader_creator); +} + +bool FormatFactory::checkIfFormatHasExternalSchemaReader(const String & name) +{ + const auto & target = getCreators(name); + return bool(target.external_schema_reader_creator); +} + +bool FormatFactory::checkIfFormatHasAnySchemaReader(const 
String & name) +{ + return checkIfFormatHasSchemaReader(name) || checkIfFormatHasExternalSchemaReader(name); +} + FormatFactory & FormatFactory::instance() { static FormatFactory ret; diff --git a/src/Formats/FormatFactory.h b/src/Formats/FormatFactory.h index ea285c47996..229adbbb263 100644 --- a/src/Formats/FormatFactory.h +++ b/src/Formats/FormatFactory.h @@ -4,7 +4,10 @@ #include #include #include +#include +#include #include +#include #include @@ -31,6 +34,11 @@ class IOutputFormat; struct RowInputFormatParams; struct RowOutputFormatParams; +class ISchemaReader; +class IExternalSchemaReader; +using SchemaReaderPtr = std::shared_ptr; +using ExternalSchemaReaderPtr = std::shared_ptr; + using InputFormatPtr = std::shared_ptr; using OutputFormatPtr = std::shared_ptr; @@ -85,17 +93,23 @@ private: /// The checker should return true if parallel parsing should be disabled. using NonTrivialPrefixAndSuffixChecker = std::function; + using SchemaReaderCreator = std::function; + using ExternalSchemaReaderCreator = std::function; + struct Creators { InputCreator input_creator; OutputCreator output_creator; FileSegmentationEngine file_segmentation_engine; + SchemaReaderCreator schema_reader_creator; + ExternalSchemaReaderCreator external_schema_reader_creator; bool supports_parallel_formatting{false}; bool is_column_oriented{false}; NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker; }; using FormatsDictionary = std::unordered_map; + using FileExtensionFormats = std::unordered_map; public: static FormatFactory & instance(); @@ -138,6 +152,17 @@ public: ContextPtr context, const std::optional & format_settings = std::nullopt) const; + SchemaReaderPtr getSchemaReader( + const String & name, + ReadBuffer & buf, + ContextPtr context, + const std::optional & format_settings = std::nullopt) const; + + ExternalSchemaReaderPtr getExternalSchemaReader( + const String & name, + ContextPtr context, + const std::optional & format_settings = std::nullopt) const; + void registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine); void registerNonTrivialPrefixAndSuffixChecker(const String & name, NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker); @@ -146,11 +171,23 @@ public: void registerInputFormat(const String & name, InputCreator input_creator); void registerOutputFormat(const String & name, OutputCreator output_creator); + /// Register file extension for format + void registerFileExtension(const String & extension, const String & format_name); + String getFormatFromFileName(String file_name); + + /// Register schema readers for format its name. 
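/// Illustrative sketch, not part of this patch: a format would typically register its schema
/// reader from its registration function. The creator signature is inferred from how
/// getSchemaReader() invokes it above; "MySchemaReader" is a hypothetical class.
///     void registerMyFormatSchemaReader(FormatFactory & factory)
///     {
///         factory.registerSchemaReader("MyFormat", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr)
///         {
///             return std::make_shared<MySchemaReader>(buf, settings);
///         });
///     }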
+ void registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator); + void registerExternalSchemaReader(const String & name, ExternalSchemaReaderCreator external_schema_reader_creator); + void markOutputFormatSupportsParallelFormatting(const String & name); void markFormatAsColumnOriented(const String & name); bool checkIfFormatIsColumnOriented(const String & name); + bool checkIfFormatHasSchemaReader(const String & name); + bool checkIfFormatHasExternalSchemaReader(const String & name); + bool checkIfFormatHasAnySchemaReader(const String & name); + const FormatsDictionary & getAllFormats() const { return dict; @@ -161,8 +198,10 @@ public: private: FormatsDictionary dict; + FileExtensionFormats file_extension_formats; const Creators & getCreators(const String & name) const; + }; } diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index d9af07fdc9c..6298e959c3e 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -33,6 +33,7 @@ struct FormatSettings bool defaults_for_omitted_fields = true; bool seekable_read = true; + UInt64 max_rows_to_read_for_schema_inference = 100; enum class DateTimeInputFormat { @@ -217,6 +218,11 @@ struct FormatSettings { EnumComparingMode enum_comparing_mode = EnumComparingMode::BY_VALUES; } capn_proto; + + struct + { + UInt64 number_of_columns = 0; + } msgpack; }; } diff --git a/src/Formats/JSONEachRowUtils.cpp b/src/Formats/JSONEachRowUtils.cpp index b55e9f59cc7..c63b8453634 100644 --- a/src/Formats/JSONEachRowUtils.cpp +++ b/src/Formats/JSONEachRowUtils.cpp @@ -1,7 +1,17 @@ #include #include +#include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include @@ -26,7 +36,7 @@ static std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer while (loadAtPosition(in, memory, pos) && (balance || memory.size() + static_cast(pos - in.position()) < min_chunk_size || number_of_rows < min_rows)) { const auto current_object_size = memory.size() + static_cast(pos - in.position()); - if (current_object_size > 10 * min_chunk_size) + if (min_chunk_size != 0 && current_object_size > 10 * min_chunk_size) throw ParsingException("Size of JSON object is extremely large. Expected not greater than " + std::to_string(min_chunk_size) + " bytes, but current is " + std::to_string(current_object_size) + " bytes per row. Increase the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually, most likely JSON is malformed", ErrorCodes::INCORRECT_DATA); @@ -92,6 +102,122 @@ static std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer return {loadAtPosition(in, memory, pos), number_of_rows}; } +template +static String readJSONEachRowLineIntoStringImpl(ReadBuffer & in) +{ + Memory memory; + fileSegmentationEngineJSONEachRowImpl(in, memory, 0, 1); + return String(memory.data(), memory.size()); +} + +template +DataTypePtr getDataTypeFromJSONFieldImpl(const Element & field) +{ + if (field.isNull()) + return nullptr; + + if (field.isBool()) + return makeNullable(std::make_shared()); + + if (field.isInt64() || field.isUInt64() || field.isDouble()) + return makeNullable(std::make_shared()); + + if (field.isString()) + return makeNullable(std::make_shared()); + + if (field.isArray()) + { + auto array = field.getArray(); + + /// Return nullptr in case of empty array because we cannot determine nested type. 
+ if (array.size() == 0) + return nullptr; + + DataTypes nested_data_types; + /// If this array contains fields with different types we will treat it as Tuple. + bool is_tuple = false; + for (const auto element : array) + { + auto type = getDataTypeFromJSONFieldImpl(element); + if (!type) + return nullptr; + + if (!nested_data_types.empty() && type->getName() != nested_data_types.back()->getName()) + is_tuple = true; + + nested_data_types.push_back(std::move(type)); + } + + if (is_tuple) + return std::make_shared(nested_data_types); + + return std::make_shared(nested_data_types.back()); + } + + if (field.isObject()) + { + auto object = field.getObject(); + DataTypePtr value_type; + for (const auto key_value_pair : object) + { + auto type = getDataTypeFromJSONFieldImpl(key_value_pair.second); + if (!type) + return nullptr; + + if (value_type && value_type->getName() != type->getName()) + return nullptr; + + value_type = type; + } + return std::make_shared(std::make_shared(), value_type); + } + + throw Exception{ErrorCodes::INCORRECT_DATA, "Unexpected JSON type"}; +} + +auto getJSONParserAndElement() +{ +#if USE_SIMDJSON + return std::pair(); +#elif USE_RAPIDJSON + return std::pair(); +#else + return std::pair(); +#endif +} + +DataTypePtr getDataTypeFromJSONField(const String & field) +{ + auto [parser, element] = getJSONParserAndElement(); + bool parsed = parser.parse(field, element); + if (!parsed) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object"); + + return getDataTypeFromJSONFieldImpl(element); +} + +template +static DataTypes determineColumnDataTypesFromJSONEachRowDataImpl(ReadBuffer & in, bool /*json_strings*/, Extractor & extractor) +{ + String line = readJSONEachRowLineIntoStringImpl(in); + auto [parser, element] = getJSONParserAndElement(); + bool parsed = parser.parse(line, element); + if (!parsed) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object"); + + auto fields = extractor.extract(element); + + DataTypes data_types; + data_types.reserve(fields.size()); + for (const auto & field : fields) + data_types.push_back(getDataTypeFromJSONFieldImpl(field)); + + /// TODO: For JSONStringsEachRow/JSONCompactStringsEach all types will be strings. + /// Should we try to parse data inside strings somehow in this case? 
+ + return data_types; +} + std::pair fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size) { return fileSegmentationEngineJSONEachRowImpl<'{', '}'>(in, memory, min_chunk_size, 1); @@ -102,6 +228,60 @@ std::pair fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in return fileSegmentationEngineJSONEachRowImpl<'[', ']'>(in, memory, min_chunk_size, min_rows); } +struct JSONEachRowFieldsExtractor +{ + template + std::vector extract(const Element & element) + { + /// {..., "" : , ...} + auto object = element.getObject(); + std::vector fields; + fields.reserve(object.size()); + column_names.reserve(object.size()); + for (const auto & key_value_pair : object) + { + column_names.emplace_back(key_value_pair.first); + fields.push_back(key_value_pair.second); + } + + return fields; + } + + std::vector column_names; +}; + +std::unordered_map readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings) +{ + JSONEachRowFieldsExtractor extractor; + auto data_types = determineColumnDataTypesFromJSONEachRowDataImpl(in, json_strings, extractor); + std::unordered_map result; + for (size_t i = 0; i != extractor.column_names.size(); ++i) + result[extractor.column_names[i]] = data_types[i]; + return result; +} + +struct JSONCompactEachRowFieldsExtractor +{ + template + std::vector extract(const Element & element) + { + /// [..., , ...] + auto array = element.getArray(); + std::vector fields; + fields.reserve(array.size()); + for (size_t i = 0; i != array.size(); ++i) + fields.push_back(array[i]); + return fields; + } +}; + +DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, bool json_strings) +{ + JSONCompactEachRowFieldsExtractor extractor; + return determineColumnDataTypesFromJSONEachRowDataImpl(in, json_strings, extractor); +} + + bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf) { /// For JSONEachRow we can safely skip whitespace characters diff --git a/src/Formats/JSONEachRowUtils.h b/src/Formats/JSONEachRowUtils.h index 4a049aa1abd..6f71baa8b40 100644 --- a/src/Formats/JSONEachRowUtils.h +++ b/src/Formats/JSONEachRowUtils.h @@ -11,6 +11,21 @@ namespace DB std::pair fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size); std::pair fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, size_t min_rows); + +/// Parse JSON from string and convert it's type to ClickHouse type. Make the result type always Nullable. +/// JSON array with different nested types is treated as Tuple. +/// If cannot convert (for example when field contains null), return nullptr. +DataTypePtr getDataTypeFromJSONField(const String & field); + +/// Read row in JSONEachRow format and try to determine type for each field. +/// Return map {column_name : type}. +/// If cannot determine the type of some field, return nullptr for it. +std::unordered_map readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings); + +/// Read row in JSONCompactEachRow format and try to determine type for each field. +/// If cannot determine the type of some field, return nullptr for it. 
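/// Illustrative sketch, not part of this patch; the resulting types are my reading of the
/// (elided) template arguments in getDataTypeFromJSONField above:
///     ReadBufferFromString buf(R"([1, "abc", ["x", "y"]])");
///     DataTypes types = readRowAndGetDataTypesForJSONCompactEachRow(buf, /* json_strings = */ false);
///     /// Likely result, given the rules above: Nullable(Float64), Nullable(String), Array(Nullable(String));
///     /// a null element would yield nullptr for that position.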
+DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, bool json_strings); + bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf); bool readFieldImpl(ReadBuffer & in, IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, const String & column_name, const FormatSettings & format_settings, bool yield_strings); diff --git a/src/Formats/ParsedTemplateFormatString.cpp b/src/Formats/ParsedTemplateFormatString.cpp index 4966420f05b..8d1b987d01a 100644 --- a/src/Formats/ParsedTemplateFormatString.cpp +++ b/src/Formats/ParsedTemplateFormatString.cpp @@ -14,14 +14,14 @@ namespace ErrorCodes extern const int INVALID_TEMPLATE_FORMAT; } -ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name) +ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name, bool allow_indexes) { ReadBufferFromFile schema_file(schema.absoluteSchemaPath(), 4096); String format_string; readStringUntilEOF(format_string, schema_file); try { - parse(format_string, idx_by_name); + parse(format_string, idx_by_name, allow_indexes); } catch (DB::Exception & e) { @@ -33,7 +33,7 @@ ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & } -void ParsedTemplateFormatString::parse(const String & format_string, const ColumnIdxGetter & idx_by_name) +void ParsedTemplateFormatString::parse(const String & format_string, const ColumnIdxGetter & idx_by_name, bool allow_indexes) { enum ParserState { @@ -100,6 +100,8 @@ void ParsedTemplateFormatString::parse(const String & format_string, const Colum column_idx = strtoull(column_names.back().c_str(), &col_idx_end, 10); if (col_idx_end != column_names.back().c_str() + column_names.back().size() || errno) column_idx = idx_by_name(column_names.back()); + else if (!allow_indexes) + throw Exception(ErrorCodes::INVALID_TEMPLATE_FORMAT, "Indexes instead of names are not allowed"); } format_idx_to_column_idx.emplace_back(column_idx); break; diff --git a/src/Formats/ParsedTemplateFormatString.h b/src/Formats/ParsedTemplateFormatString.h index ba0ebdf5aa8..c5617d0f0ef 100644 --- a/src/Formats/ParsedTemplateFormatString.h +++ b/src/Formats/ParsedTemplateFormatString.h @@ -31,9 +31,9 @@ struct ParsedTemplateFormatString typedef std::function(const String &)> ColumnIdxGetter; ParsedTemplateFormatString() = default; - ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name); + ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name, bool allow_indexes = true); - void parse(const String & format_string, const ColumnIdxGetter & idx_by_name); + void parse(const String & format_string, const ColumnIdxGetter & idx_by_name, bool allow_indexes = true); static const char * readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s); size_t columnsCount() const; diff --git a/src/Formats/ProtobufSerializer.cpp b/src/Formats/ProtobufSerializer.cpp index 5232b76b7fe..b59db12a16c 100644 --- a/src/Formats/ProtobufSerializer.cpp +++ b/src/Formats/ProtobufSerializer.cpp @@ -24,6 +24,7 @@ # include # include # include +# include # include # include # include @@ -56,6 +57,7 @@ namespace ErrorCodes extern const int PROTOBUF_FIELD_NOT_REPEATED; extern const int PROTOBUF_BAD_CAST; extern const int LOGICAL_ERROR; + extern const int BAD_ARGUMENTS; } namespace @@ -3017,10 +3019,8 @@ namespace { std::vector column_names_used; 
column_names_used.reserve(used_column_indices_in_nested.size()); - for (size_t i : used_column_indices_in_nested) column_names_used.emplace_back(nested_column_names[i]); - auto field_serializer = std::make_unique( std::move(column_names_used), field_descriptor, std::move(nested_message_serializer), get_root_desc_function); transformColumnIndices(used_column_indices_in_nested, nested_column_indices); @@ -3230,8 +3230,105 @@ namespace std::function get_root_desc_function; std::shared_ptr root_serializer_ptr; }; -} + template + DataTypePtr getEnumDataType(const google::protobuf::EnumDescriptor * enum_descriptor) + { + std::vector> values; + for (int i = 0; i != enum_descriptor->value_count(); ++i) + { + const auto * enum_value_descriptor = enum_descriptor->value(i); + values.emplace_back(enum_value_descriptor->name(), enum_value_descriptor->number()); + } + return std::make_shared>(std::move(values)); + } + + NameAndTypePair getNameAndDataTypeFromField(const google::protobuf::FieldDescriptor * field_descriptor, bool allow_repeat = true) + { + if (allow_repeat && field_descriptor->is_map()) + { + auto name_and_type = getNameAndDataTypeFromField(field_descriptor, false); + const auto * tuple_type = assert_cast(name_and_type.type.get()); + return {name_and_type.name, std::make_shared(tuple_type->getElements())}; + } + + if (allow_repeat && field_descriptor->is_repeated()) + { + auto name_and_type = getNameAndDataTypeFromField(field_descriptor, false); + return {name_and_type.name, std::make_shared(name_and_type.type)}; + } + + switch (field_descriptor->type()) + { + case FieldTypeId::TYPE_SFIXED32: [[fallthrough]]; + case FieldTypeId::TYPE_SINT32: [[fallthrough]]; + case FieldTypeId::TYPE_INT32: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_SFIXED64: [[fallthrough]]; + case FieldTypeId::TYPE_SINT64: [[fallthrough]]; + case FieldTypeId::TYPE_INT64: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_BOOL: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_FLOAT: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_DOUBLE: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_UINT32: [[fallthrough]]; + case FieldTypeId::TYPE_FIXED32: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_UINT64: [[fallthrough]]; + case FieldTypeId::TYPE_FIXED64: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_BYTES: [[fallthrough]]; + case FieldTypeId::TYPE_STRING: + return {field_descriptor->name(), std::make_shared()}; + case FieldTypeId::TYPE_ENUM: + { + const auto * enum_descriptor = field_descriptor->enum_type(); + if (enum_descriptor->value_count() == 0) + throw Exception("Empty enum field", ErrorCodes::BAD_ARGUMENTS); + int max_abs = std::abs(enum_descriptor->value(0)->number()); + for (int i = 1; i != enum_descriptor->value_count(); ++i) + { + if (std::abs(enum_descriptor->value(i)->number()) > max_abs) + max_abs = std::abs(enum_descriptor->value(i)->number()); + } + if (max_abs < 128) + return {field_descriptor->name(), getEnumDataType(enum_descriptor)}; + else if (max_abs < 32768) + return {field_descriptor->name(), getEnumDataType(enum_descriptor)}; + else + throw Exception("ClickHouse supports only 8-bit and 16-bit enums", ErrorCodes::BAD_ARGUMENTS); + } + case FieldTypeId::TYPE_GROUP: [[fallthrough]]; + case FieldTypeId::TYPE_MESSAGE: + { + const auto * message_descriptor = 
field_descriptor->message_type(); + if (message_descriptor->field_count() == 1) + { + const auto * nested_field_descriptor = message_descriptor->field(0); + auto nested_name_and_type = getNameAndDataTypeFromField(nested_field_descriptor); + return {field_descriptor->name() + "_" + nested_name_and_type.name, nested_name_and_type.type}; + } + else + { + DataTypes nested_types; + Strings nested_names; + for (int i = 0; i != message_descriptor->field_count(); ++i) + { + auto nested_name_and_type = getNameAndDataTypeFromField(message_descriptor->field(i)); + nested_types.push_back(nested_name_and_type.type); + nested_names.push_back(nested_name_and_type.name); + } + return {field_descriptor->name(), std::make_shared(std::move(nested_types), std::move(nested_names))}; + } + } + } + + __builtin_unreachable(); + } +} std::unique_ptr ProtobufSerializer::create( const Strings & column_names, @@ -3254,5 +3351,14 @@ std::unique_ptr ProtobufSerializer::create( std::vector missing_column_indices; return ProtobufSerializerBuilder(writer).buildMessageSerializer(column_names, data_types, missing_column_indices, message_descriptor, with_length_delimiter); } + +NamesAndTypesList protobufSchemaToCHSchema(const google::protobuf::Descriptor * message_descriptor) +{ + NamesAndTypesList schema; + for (int i = 0; i != message_descriptor->field_count(); ++i) + schema.push_back(getNameAndDataTypeFromField(message_descriptor->field(i))); + return schema; +} + } #endif diff --git a/src/Formats/ProtobufSerializer.h b/src/Formats/ProtobufSerializer.h index 3eaca6a18d6..d9bed913517 100644 --- a/src/Formats/ProtobufSerializer.h +++ b/src/Formats/ProtobufSerializer.h @@ -4,6 +4,7 @@ #if USE_PROTOBUF # include +#include namespace google::protobuf { class Descriptor; } @@ -48,5 +49,7 @@ public: ProtobufWriter & writer); }; +NamesAndTypesList protobufSchemaToCHSchema(const google::protobuf::Descriptor * message_descriptor); + } #endif diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp new file mode 100644 index 00000000000..37067eae64f --- /dev/null +++ b/src/Formats/ReadSchemaUtils.cpp @@ -0,0 +1,112 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + extern const int BAD_ARGUMENTS; +} + +ColumnsDescription readSchemaFromFormat(const String & format_name, const std::optional & format_settings, ReadBufferCreator read_buffer_creator, ContextPtr context) +{ + NamesAndTypesList names_and_types; + if (FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format_name)) + { + auto external_schema_reader = FormatFactory::instance().getExternalSchemaReader(format_name, context, format_settings); + try + { + names_and_types = external_schema_reader->readSchema(); + } + catch (const DB::Exception & e) + { + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file. 
Error: {}", format_name, e.message()); + } + } + else if (FormatFactory::instance().checkIfFormatHasSchemaReader(format_name)) + { + auto read_buf = read_buffer_creator(); + if (read_buf->eof()) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file, file is empty", format_name); + + auto schema_reader = FormatFactory::instance().getSchemaReader(format_name, *read_buf, context, format_settings); + try + { + names_and_types = schema_reader->readSchema(); + } + catch (const DB::Exception & e) + { + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file. Error: {}", format_name, e.message()); + } + } + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "{} file format doesn't support schema inference", format_name); + + return ColumnsDescription(names_and_types); +} + +DataTypePtr generalizeDataType(DataTypePtr type) +{ + WhichDataType which(type); + + if (which.isNothing()) + return nullptr; + + if (which.isNullable()) + { + const auto * nullable_type = assert_cast(type.get()); + return generalizeDataType(nullable_type->getNestedType()); + } + + if (isNumber(type)) + return makeNullable(std::make_shared()); + + if (which.isArray()) + { + const auto * array_type = assert_cast(type.get()); + auto nested_type = generalizeDataType(array_type->getNestedType()); + return nested_type ? std::make_shared(nested_type) : nullptr; + } + + if (which.isTuple()) + { + const auto * tuple_type = assert_cast(type.get()); + DataTypes nested_types; + for (const auto & element : tuple_type->getElements()) + { + auto nested_type = generalizeDataType(element); + if (!nested_type) + return nullptr; + nested_types.push_back(nested_type); + } + return std::make_shared(std::move(nested_types)); + } + + if (which.isMap()) + { + const auto * map_type = assert_cast(type.get()); + auto key_type = removeNullable(generalizeDataType(map_type->getKeyType())); + auto value_type = generalizeDataType(map_type->getValueType()); + return key_type && value_type ? std::make_shared(key_type, value_type) : nullptr; + } + + if (which.isLowCarnality()) + { + const auto * lc_type = assert_cast(type.get()); + auto nested_type = generalizeDataType(lc_type->getDictionaryType()); + return nested_type ? std::make_shared(nested_type) : nullptr; + } + + return makeNullable(type); +} + +} diff --git a/src/Formats/ReadSchemaUtils.h b/src/Formats/ReadSchemaUtils.h new file mode 100644 index 00000000000..fb43acc3cd6 --- /dev/null +++ b/src/Formats/ReadSchemaUtils.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include + +namespace DB +{ + +/// Try to determine the schema of the data in specifying format. +/// For formats that have an external schema reader, it will +/// use it and won't create a read buffer. +/// For formats that have a schema reader from the data, +/// read buffer will be created by the provided creator and +/// the schema will be extracted from the data. +/// If format doesn't have any schema reader or a schema reader +/// couldn't determine the schema, an exception will be thrown. 
+using ReadBufferCreator = std::function()>; +ColumnsDescription readSchemaFromFormat(const String & format_name, const std::optional & format_settings, ReadBufferCreator read_buffer_creator, ContextPtr context); + +/// Convert type to the most general type: +/// - IntN, UIntN, FloatN, Decimal -> Float64 +/// - Type -> Nullable(type) +/// - Array(Type) -> Array(Nullable(Type)) +/// - Tuple(Type1, ..., TypeN) -> Tuple(Nullable(Type1), ..., Nullable(TypeN)) +/// - Map(KeyType, ValueType) -> Map(KeyType, Nullable(ValueType)) +/// - LowCardinality(Type) -> LowCardinality(Nullable(Type)) +/// If type is Nothing or one of the nested types is Nothing, return nullptr. +DataTypePtr generalizeDataType(DataTypePtr type); + +} diff --git a/src/Formats/config_formats.h.in b/src/Formats/config_formats.h.in index f6497b4830b..427abc7d1ce 100644 --- a/src/Formats/config_formats.h.in +++ b/src/Formats/config_formats.h.in @@ -10,4 +10,3 @@ #cmakedefine01 USE_ARROW #cmakedefine01 USE_PROTOBUF #cmakedefine01 USE_MSGPACK - diff --git a/src/Formats/registerFormats.cpp b/src/Formats/registerFormats.cpp index 7425c6898de..289b5965455 100644 --- a/src/Formats/registerFormats.cpp +++ b/src/Formats/registerFormats.cpp @@ -81,6 +81,28 @@ void registerInputFormatCapnProto(FormatFactory & factory); void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory); void registerNonTrivialPrefixAndSuffixCheckerJSONAsString(FormatFactory & factory); +void registerArrowSchemaReader(FormatFactory & factory); +void registerParquetSchemaReader(FormatFactory & factory); +void registerORCSchemaReader(FormatFactory & factory); +void registerTSVSchemaReader(FormatFactory & factory); +void registerCSVSchemaReader(FormatFactory & factory); +void registerJSONCompactEachRowSchemaReader(FormatFactory & factory); +void registerJSONEachRowSchemaReader(FormatFactory & factory); +void registerNativeSchemaReader(FormatFactory & factory); +void registerRowBinaryWithNamesAndTypesSchemaReader(FormatFactory & factory); +void registerAvroSchemaReader(FormatFactory & factory); +void registerProtobufSchemaReader(FormatFactory & factory); +void registerLineAsStringSchemaReader(FormatFactory & factory); +void registerJSONAsStringSchemaReader(FormatFactory & factory); +void registerRawBLOBSchemaReader(FormatFactory & factory); +void registerMsgPackSchemaReader(FormatFactory & factory); +void registerCapnProtoSchemaReader(FormatFactory & factory); +void registerCustomSeparatedSchemaReader(FormatFactory & factory); +void registerRegexpSchemaReader(FormatFactory & factory); +void registerTSKVSchemaReader(FormatFactory & factory); +void registerValuesSchemaReader(FormatFactory & factory); +void registerTemplateSchemaReader(FormatFactory & factory); + void registerFormats() { auto & factory = FormatFactory::instance(); @@ -152,6 +174,38 @@ void registerFormats() registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(factory); registerNonTrivialPrefixAndSuffixCheckerJSONAsString(factory); + + registerArrowSchemaReader(factory); + registerParquetSchemaReader(factory); + registerORCSchemaReader(factory); + registerTSVSchemaReader(factory); + registerCSVSchemaReader(factory); + registerJSONCompactEachRowSchemaReader(factory); + registerJSONEachRowSchemaReader(factory); + registerNativeSchemaReader(factory); + registerRowBinaryWithNamesAndTypesSchemaReader(factory); + registerAvroSchemaReader(factory); + registerProtobufSchemaReader(factory); + registerLineAsStringSchemaReader(factory); + registerJSONAsStringSchemaReader(factory); + 
registerRawBLOBSchemaReader(factory); + registerMsgPackSchemaReader(factory); + registerCapnProtoSchemaReader(factory); + registerCustomSeparatedSchemaReader(factory); + registerRegexpSchemaReader(factory); + registerTSKVSchemaReader(factory); + registerValuesSchemaReader(factory); + registerTemplateSchemaReader(factory); + + factory.registerFileExtension("csv", "CSV"); + factory.registerFileExtension("tsv", "TSV"); + factory.registerFileExtension("parquet", "Parquet"); + factory.registerFileExtension("orc", "ORC"); + factory.registerFileExtension("native", "Native"); + factory.registerFileExtension("json", "JSON"); + factory.registerFileExtension("ndjson", "JSONEachRow"); + factory.registerFileExtension("xml", "XML"); + factory.registerFileExtension("avro", "Avro"); } } diff --git a/src/Functions/CustomWeekTransforms.h b/src/Functions/CustomWeekTransforms.h index 5ccb2e06c44..8656f9da927 100644 --- a/src/Functions/CustomWeekTransforms.h +++ b/src/Functions/CustomWeekTransforms.h @@ -76,7 +76,7 @@ struct ToStartOfWeekImpl } static inline UInt16 execute(UInt16 d, UInt8 week_mode, const DateLUTImpl & time_zone) { - return time_zone.toFirstDayNumOfWeek(ExtendedDayNum(d), week_mode); + return time_zone.toFirstDayNumOfWeek(DayNum(d), week_mode); } using FactorTransform = ZeroTransform; diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h index 08dac9c2ba0..a7f06689820 100644 --- a/src/Functions/DateTimeTransforms.h +++ b/src/Functions/DateTimeTransforms.h @@ -84,7 +84,8 @@ struct ToDate32Impl } static inline Int32 execute(UInt32 t, const DateLUTImpl & time_zone) { - return Int32(time_zone.toDayNum(t)); + /// Don't saturate. + return Int32(time_zone.toDayNum(t)); } static inline Int32 execute(Int32 d, const DateLUTImpl &) { @@ -117,7 +118,7 @@ struct ToStartOfDayImpl } static inline UInt32 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toDate(ExtendedDayNum(d)); + return time_zone.toDate(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -143,7 +144,7 @@ struct ToMondayImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toFirstDayNumOfWeek(ExtendedDayNum(d)); + return time_zone.toFirstDayNumOfWeek(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -167,7 +168,7 @@ struct ToStartOfMonthImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toFirstDayNumOfMonth(ExtendedDayNum(d)); + return time_zone.toFirstDayNumOfMonth(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -191,7 +192,7 @@ struct ToStartOfQuarterImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toFirstDayNumOfQuarter(ExtendedDayNum(d)); + return time_zone.toFirstDayNumOfQuarter(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -215,7 +216,7 @@ struct ToStartOfYearImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toFirstDayNumOfYear(ExtendedDayNum(d)); + return time_zone.toFirstDayNumOfYear(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -224,7 +225,7 @@ struct ToStartOfYearImpl struct ToTimeImpl { - /// When transforming to time, the date will be equated to 1970-01-01. + /// When transforming to time, the date will be equated to 1970-01-02. 
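/// For example (assuming the usual toTime behaviour): toTime(toDateTime('2021-12-31 11:22:33'))
/// is expected to return '1970-01-02 11:22:33' in the same time zone.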
static constexpr auto name = "toTime"; static UInt32 execute(const DecimalUtils::DecimalComponents & t, const DateLUTImpl & time_zone) @@ -456,7 +457,7 @@ struct ToYearImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toYear(ExtendedDayNum(d)); + return time_zone.toYear(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -480,7 +481,7 @@ struct ToQuarterImpl } static inline UInt8 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toQuarter(ExtendedDayNum(d)); + return time_zone.toQuarter(DayNum(d)); } using FactorTransform = ToStartOfYearImpl; @@ -504,7 +505,7 @@ struct ToMonthImpl } static inline UInt8 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toMonth(ExtendedDayNum(d)); + return time_zone.toMonth(DayNum(d)); } using FactorTransform = ToStartOfYearImpl; @@ -528,7 +529,7 @@ struct ToDayOfMonthImpl } static inline UInt8 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toDayOfMonth(ExtendedDayNum(d)); + return time_zone.toDayOfMonth(DayNum(d)); } using FactorTransform = ToStartOfMonthImpl; @@ -552,7 +553,7 @@ struct ToDayOfWeekImpl } static inline UInt8 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toDayOfWeek(ExtendedDayNum(d)); + return time_zone.toDayOfWeek(DayNum(d)); } using FactorTransform = ToMondayImpl; @@ -576,7 +577,7 @@ struct ToDayOfYearImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toDayOfYear(ExtendedDayNum(d)); + return time_zone.toDayOfYear(DayNum(d)); } using FactorTransform = ToStartOfYearImpl; @@ -699,7 +700,7 @@ struct ToISOYearImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toISOYear(ExtendedDayNum(d)); + return time_zone.toISOYear(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -723,7 +724,7 @@ struct ToStartOfISOYearImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toFirstDayNumOfISOYear(ExtendedDayNum(d)); + return time_zone.toFirstDayNumOfISOYear(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -747,7 +748,7 @@ struct ToISOWeekImpl } static inline UInt8 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toISOWeek(ExtendedDayNum(d)); + return time_zone.toISOWeek(DayNum(d)); } using FactorTransform = ToISOYearImpl; @@ -771,7 +772,7 @@ struct ToRelativeYearNumImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toYear(ExtendedDayNum(d)); + return time_zone.toYear(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -795,7 +796,7 @@ struct ToRelativeQuarterNumImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toRelativeQuarterNum(ExtendedDayNum(d)); + return time_zone.toRelativeQuarterNum(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -819,7 +820,7 @@ struct ToRelativeMonthNumImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toRelativeMonthNum(ExtendedDayNum(d)); + return time_zone.toRelativeMonthNum(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -843,7 +844,7 @@ struct ToRelativeWeekNumImpl } static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toRelativeWeekNum(ExtendedDayNum(d)); + return time_zone.toRelativeWeekNum(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -892,7 +893,7 @@ struct ToRelativeHourNumImpl } static inline UInt32 
execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toRelativeHourNum(ExtendedDayNum(d)); + return time_zone.toRelativeHourNum(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -916,7 +917,7 @@ struct ToRelativeMinuteNumImpl } static inline UInt32 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toRelativeMinuteNum(ExtendedDayNum(d)); + return time_zone.toRelativeMinuteNum(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -940,7 +941,7 @@ struct ToRelativeSecondNumImpl } static inline UInt32 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.fromDayNum(ExtendedDayNum(d)); + return time_zone.fromDayNum(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -960,11 +961,11 @@ struct ToYYYYMMImpl } static inline UInt32 execute(Int32 d, const DateLUTImpl & time_zone) { - return time_zone.toNumYYYYMM(static_cast(d)); + return time_zone.toNumYYYYMM(ExtendedDayNum(d)); } static inline UInt32 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toNumYYYYMM(static_cast(d)); + return time_zone.toNumYYYYMM(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -984,11 +985,11 @@ struct ToYYYYMMDDImpl } static inline UInt32 execute(Int32 d, const DateLUTImpl & time_zone) { - return time_zone.toNumYYYYMMDD(static_cast(d)); + return time_zone.toNumYYYYMMDD(ExtendedDayNum(d)); } static inline UInt32 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toNumYYYYMMDD(static_cast(d)); + return time_zone.toNumYYYYMMDD(DayNum(d)); } using FactorTransform = ZeroTransform; @@ -1008,11 +1009,11 @@ struct ToYYYYMMDDhhmmssImpl } static inline UInt64 execute(Int32 d, const DateLUTImpl & time_zone) { - return time_zone.toNumYYYYMMDDhhmmss(time_zone.toDate(static_cast(d))); + return time_zone.toNumYYYYMMDDhhmmss(time_zone.toDate(ExtendedDayNum(d))); } static inline UInt64 execute(UInt16 d, const DateLUTImpl & time_zone) { - return time_zone.toNumYYYYMMDDhhmmss(time_zone.toDate(static_cast(d))); + return time_zone.toNumYYYYMMDDhhmmss(time_zone.toDate(DayNum(d))); } using FactorTransform = ZeroTransform; diff --git a/src/Functions/FunctionDateOrDateTimeAddInterval.h b/src/Functions/FunctionDateOrDateTimeAddInterval.h index 4224a74ae8e..8f6b1370935 100644 --- a/src/Functions/FunctionDateOrDateTimeAddInterval.h +++ b/src/Functions/FunctionDateOrDateTimeAddInterval.h @@ -58,7 +58,7 @@ struct AddSecondsImpl } static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt16 d, Int64 delta, const DateLUTImpl & time_zone) { - return time_zone.fromDayNum(ExtendedDayNum(d)) + delta; + return time_zone.fromDayNum(DayNum(d)) + delta; } }; @@ -83,7 +83,7 @@ struct AddMinutesImpl } static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt16 d, Int64 delta, const DateLUTImpl & time_zone) { - return time_zone.fromDayNum(ExtendedDayNum(d)) + delta * 60; + return time_zone.fromDayNum(DayNum(d)) + delta * 60; } }; @@ -107,7 +107,7 @@ struct AddHoursImpl } static inline NO_SANITIZE_UNDEFINED UInt32 execute(UInt16 d, Int64 delta, const DateLUTImpl & time_zone) { - return time_zone.fromDayNum(ExtendedDayNum(d)) + delta * 3600; + return time_zone.fromDayNum(DayNum(d)) + delta * 3600; } }; @@ -180,7 +180,7 @@ struct AddMonthsImpl static inline UInt16 execute(UInt16 d, Int64 delta, const DateLUTImpl & time_zone) { - return time_zone.addMonths(ExtendedDayNum(d), delta); + return time_zone.addMonths(DayNum(d), delta); } static inline Int32 execute(Int32 d, Int64 delta, const DateLUTImpl & time_zone) @@ -206,7 +206,7 @@ struct AddQuartersImpl static 
inline UInt16 execute(UInt16 d, Int32 delta, const DateLUTImpl & time_zone) { - return time_zone.addQuarters(ExtendedDayNum(d), delta); + return time_zone.addQuarters(DayNum(d), delta); } static inline Int32 execute(Int32 d, Int32 delta, const DateLUTImpl & time_zone) @@ -232,7 +232,7 @@ struct AddYearsImpl static inline UInt16 execute(UInt16 d, Int64 delta, const DateLUTImpl & time_zone) { - return time_zone.addYears(ExtendedDayNum(d), delta); + return time_zone.addYears(DayNum(d), delta); } static inline Int32 execute(Int32 d, Int64 delta, const DateLUTImpl & time_zone) diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 8018fa8e726..62e62b5f5dc 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -1835,6 +1835,8 @@ public: size_t getNumberOfArguments() const override { return 0; } bool useDefaultImplementationForConstants() const override { return true; } + bool canBeExecutedOnDefaultArguments() const override { return false; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override diff --git a/src/Functions/FunctionsTimeWindow.h b/src/Functions/FunctionsTimeWindow.h index 6e5d79fd062..313de10702d 100644 --- a/src/Functions/FunctionsTimeWindow.h +++ b/src/Functions/FunctionsTimeWindow.h @@ -48,7 +48,7 @@ struct ToStartOfTransform; template <> \ struct ToStartOfTransform \ { \ - static ExtendedDayNum execute(UInt32 t, UInt64 delta, const DateLUTImpl & time_zone) \ + static auto execute(UInt32 t, UInt64 delta, const DateLUTImpl & time_zone) \ { \ return time_zone.toStartOf##INTERVAL_KIND##Interval(time_zone.toDayNum(t), delta); \ } \ @@ -89,7 +89,7 @@ struct ToStartOfTransform; template <> \ struct AddTime \ { \ - static inline ExtendedDayNum execute(UInt16 d, UInt64 delta, const DateLUTImpl & time_zone) \ + static inline auto execute(UInt16 d, UInt64 delta, const DateLUTImpl & time_zone) \ { \ return time_zone.add##INTERVAL_KIND##s(ExtendedDayNum(d), delta); \ } \ diff --git a/src/Functions/GatherUtils/Algorithms.h b/src/Functions/GatherUtils/Algorithms.h index fc54eaf88ab..046e2dcf70f 100644 --- a/src/Functions/GatherUtils/Algorithms.h +++ b/src/Functions/GatherUtils/Algorithms.h @@ -347,18 +347,31 @@ void NO_INLINE sliceDynamicOffsetUnbounded(Source && src, Sink && sink, const IC } } -template -void NO_INLINE sliceDynamicOffsetBounded(Source && src, Sink && sink, const IColumn & offset_column, const IColumn & length_column) -{ - const bool is_offset_null = offset_column.onlyNull(); - const auto * offset_nullable = typeid_cast(&offset_column); - const ColumnUInt8::Container * offset_null_map = offset_nullable ? &offset_nullable->getNullMapData() : nullptr; - const IColumn * offset_nested_column = offset_nullable ? &offset_nullable->getNestedColumn() : &offset_column; - const bool is_length_null = length_column.onlyNull(); - const auto * length_nullable = typeid_cast(&length_column); - const ColumnUInt8::Container * length_null_map = length_nullable ? &length_nullable->getNullMapData() : nullptr; - const IColumn * length_nested_column = length_nullable ? 
&length_nullable->getNestedColumn() : &length_column; +template +static void sliceDynamicOffsetBoundedImpl(Source && src, Sink && sink, const IColumn * offset_column, const IColumn * length_column) +{ + const bool is_offset_null = !offset_column || offset_column->onlyNull(); + const ColumnUInt8::Container * offset_null_map = nullptr; + const IColumn * offset_nested_column = nullptr; + + if (!is_offset_null) + { + const auto * offset_nullable = typeid_cast(offset_column); + offset_null_map = offset_nullable ? &offset_nullable->getNullMapData() : nullptr; + offset_nested_column = offset_nullable ? &offset_nullable->getNestedColumn() : offset_column; + } + + const bool is_length_null = !length_column || length_column->onlyNull(); + const ColumnUInt8::Container * length_null_map = nullptr; + const IColumn * length_nested_column = nullptr; + + if (!is_length_null) + { + const auto * length_nullable = typeid_cast(length_column); + length_null_map = length_nullable ? &length_nullable->getNullMapData() : nullptr; + length_nested_column = length_nullable ? &length_nullable->getNestedColumn() : length_column; + } while (!src.isEnd()) { @@ -376,9 +389,19 @@ void NO_INLINE sliceDynamicOffsetBounded(Source && src, Sink && sink, const ICol typename std::decay_t::Slice slice; if (offset > 0) - slice = src.getSliceFromLeft(offset - 1, size); + { + if constexpr (inverse) + slice = src.getSliceFromRight(UInt64(size) + UInt64(offset) - 1, size); + else + slice = src.getSliceFromLeft(UInt64(offset) - 1, size); + } else - slice = src.getSliceFromRight(-UInt64(offset), size); + { + if constexpr (inverse) + slice = src.getSliceFromLeft(-UInt64(offset), size); + else + slice = src.getSliceFromRight(-UInt64(offset), size); + } writeSlice(slice, sink); } @@ -389,6 +412,26 @@ void NO_INLINE sliceDynamicOffsetBounded(Source && src, Sink && sink, const ICol } +template +void NO_INLINE sliceDynamicOffsetBounded(Source && src, Sink && sink, const IColumn & offset_column, const IColumn & length_column) +{ + sliceDynamicOffsetBoundedImpl(std::forward(src), std::forward(sink), &offset_column, &length_column); +} + +/// Similar to above, but with no offset. 
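/// Illustrative semantics sketch, not part of this patch: these helpers back the left()/right()
/// functions added later in this change, so for a StringSource over 'abcdef' and a length column
/// holding 2, sliceFromLeftDynamicLength is expected to write 'ab' to the sink and
/// sliceFromRightDynamicLength 'ef'; null and negative lengths follow the offset-bounded logic above.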
+template +void NO_INLINE sliceFromLeftDynamicLength(Source && src, Sink && sink, const IColumn & length_column) +{ + sliceDynamicOffsetBoundedImpl(std::forward(src), std::forward(sink), nullptr, &length_column); +} + +template +void NO_INLINE sliceFromRightDynamicLength(Source && src, Sink && sink, const IColumn & length_column) +{ + sliceDynamicOffsetBoundedImpl(std::forward(src), std::forward(sink), nullptr, &length_column); +} + + template void NO_INLINE conditional(SourceA && src_a, SourceB && src_b, Sink && sink, const PaddedPODArray & condition) { @@ -593,6 +636,7 @@ bool insliceEqualElements(const NumericArraySlice & first [[maybe_unused]], else return accurate::equalsOp(first.data[first_ind], first.data[second_ind]); } + inline ALWAYS_INLINE bool insliceEqualElements(const GenericArraySlice & first, size_t first_ind, size_t second_ind) { return first.elements->compareAt(first_ind + first.begin, second_ind + first.begin, *first.elements, -1) == 0; diff --git a/src/Functions/GatherUtils/GatherUtils.h b/src/Functions/GatherUtils/GatherUtils.h index c2513214a79..8a623caa297 100644 --- a/src/Functions/GatherUtils/GatherUtils.h +++ b/src/Functions/GatherUtils/GatherUtils.h @@ -32,9 +32,9 @@ namespace DB::GatherUtils enum class ArraySearchType { - Any, // Corresponds to the hasAny array function - All, // Corresponds to the hasAll array function - Substr // Corresponds to the hasSubstr array function + Any, // Corresponds to the hasAny array function + All, // Corresponds to the hasAll array function + Substr // Corresponds to the hasSubstr array function }; std::unique_ptr createArraySource(const ColumnArray & col, bool is_const, size_t total_rows); @@ -52,6 +52,9 @@ ColumnArray::MutablePtr sliceFromRightConstantOffsetBounded(IArraySource & src, ColumnArray::MutablePtr sliceDynamicOffsetUnbounded(IArraySource & src, const IColumn & offset_column); ColumnArray::MutablePtr sliceDynamicOffsetBounded(IArraySource & src, const IColumn & offset_column, const IColumn & length_column); +ColumnArray::MutablePtr sliceFromLeftDynamicLength(IArraySource & src, const IColumn & length_column); +ColumnArray::MutablePtr sliceFromRightDynamicLength(IArraySource & src, const IColumn & length_column); + void sliceHasAny(IArraySource & first, IArraySource & second, ColumnUInt8 & result); void sliceHasAll(IArraySource & first, IArraySource & second, ColumnUInt8 & result); void sliceHasSubstr(IArraySource & first, IArraySource & second, ColumnUInt8 & result); diff --git a/src/Functions/GatherUtils/Sources.h b/src/Functions/GatherUtils/Sources.h index c8014d3e855..7d1241be7d1 100644 --- a/src/Functions/GatherUtils/Sources.h +++ b/src/Functions/GatherUtils/Sources.h @@ -358,6 +358,11 @@ struct UTF8StringSource : public StringSource return pos; } + size_t getElementSize() const + { + return UTF8::countCodePoints(&elements[prev_offset], StringSource::getElementSize()); + } + Slice getSliceFromLeft(size_t offset) const { const auto * begin = &elements[prev_offset]; diff --git a/src/Functions/GatherUtils/sliceFromLeftDynamicLength.cpp b/src/Functions/GatherUtils/sliceFromLeftDynamicLength.cpp new file mode 100644 index 00000000000..b704f7ada7d --- /dev/null +++ b/src/Functions/GatherUtils/sliceFromLeftDynamicLength.cpp @@ -0,0 +1,60 @@ +#ifndef __clang_analyzer__ // It's too hard to analyze. 
+ +#include "GatherUtils.h" +#include "Selectors.h" +#include "Algorithms.h" + +namespace DB::GatherUtils +{ + +namespace +{ + +struct Selector : public ArraySourceSelector +{ + template + static void selectSource(bool is_const, bool is_nullable, Source && source, + const IColumn & length_column, ColumnArray::MutablePtr & result) + { + using SourceType = typename std::decay::type; + using Sink = typename SourceType::SinkType; + + if (is_nullable) + { + using NullableSource = NullableArraySource; + using NullableSink = typename NullableSource::SinkType; + + auto & nullable_source = static_cast(source); + + result = ColumnArray::create(nullable_source.createValuesColumn()); + NullableSink sink(result->getData(), result->getOffsets(), source.getColumnSize()); + + if (is_const) + sliceFromLeftDynamicLength(static_cast &>(source), sink, length_column); + else + sliceFromLeftDynamicLength(static_cast(source), sink, length_column); + } + else + { + result = ColumnArray::create(source.createValuesColumn()); + Sink sink(result->getData(), result->getOffsets(), source.getColumnSize()); + + if (is_const) + sliceFromLeftDynamicLength(static_cast &>(source), sink, length_column); + else + sliceFromLeftDynamicLength(source, sink, length_column); + } + } +}; + +} + +ColumnArray::MutablePtr sliceFromLeftDynamicLength(IArraySource & src, const IColumn & length_column) +{ + ColumnArray::MutablePtr res; + Selector::select(src, length_column, res); + return res; +} +} + +#endif diff --git a/src/Functions/GatherUtils/sliceFromRightDynamicLength.cpp b/src/Functions/GatherUtils/sliceFromRightDynamicLength.cpp new file mode 100644 index 00000000000..1db86b4fda9 --- /dev/null +++ b/src/Functions/GatherUtils/sliceFromRightDynamicLength.cpp @@ -0,0 +1,60 @@ +#ifndef __clang_analyzer__ // It's too hard to analyze. 
+ +#include "GatherUtils.h" +#include "Selectors.h" +#include "Algorithms.h" + +namespace DB::GatherUtils +{ + +namespace +{ + +struct Selector : public ArraySourceSelector +{ + template + static void selectSource(bool is_const, bool is_nullable, Source && source, + const IColumn & length_column, ColumnArray::MutablePtr & result) + { + using SourceType = typename std::decay::type; + using Sink = typename SourceType::SinkType; + + if (is_nullable) + { + using NullableSource = NullableArraySource; + using NullableSink = typename NullableSource::SinkType; + + auto & nullable_source = static_cast(source); + + result = ColumnArray::create(nullable_source.createValuesColumn()); + NullableSink sink(result->getData(), result->getOffsets(), source.getColumnSize()); + + if (is_const) + sliceFromRightDynamicLength(static_cast &>(source), sink, length_column); + else + sliceFromRightDynamicLength(static_cast(source), sink, length_column); + } + else + { + result = ColumnArray::create(source.createValuesColumn()); + Sink sink(result->getData(), result->getOffsets(), source.getColumnSize()); + + if (is_const) + sliceFromRightDynamicLength(static_cast &>(source), sink, length_column); + else + sliceFromRightDynamicLength(source, sink, length_column); + } + } +}; + +} + +ColumnArray::MutablePtr sliceFromRightDynamicLength(IArraySource & src, const IColumn & length_column) +{ + ColumnArray::MutablePtr res; + Selector::select(src, length_column, res); + return res; +} +} + +#endif diff --git a/src/Functions/LeftRight.h b/src/Functions/LeftRight.h new file mode 100644 index 00000000000..054e76b7792 --- /dev/null +++ b/src/Functions/LeftRight.h @@ -0,0 +1,145 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +using namespace GatherUtils; + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +enum class SubstringDirection +{ + Left, + Right +}; + +template +class FunctionLeftRight : public IFunction +{ +public: + static constexpr auto name = direction == SubstringDirection::Left + ? (is_utf8 ? "leftUTF8" : "left") + : (is_utf8 ? 
"rightUTF8" : "right"); + + static FunctionPtr create(ContextPtr) + { + return std::make_shared(); + } + + String getName() const override + { + return name; + } + + bool isVariadic() const override { return false; } + size_t getNumberOfArguments() const override { return 2; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + bool useDefaultImplementationForConstants() const override { return true; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if ((is_utf8 && !isString(arguments[0])) || !isStringOrFixedString(arguments[0])) + throw Exception("Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + if (!isNativeNumber(arguments[1])) + throw Exception("Illegal type " + arguments[1]->getName() + + " of second argument of function " + + getName(), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + return std::make_shared(); + } + + template + ColumnPtr executeForSource(const ColumnPtr & column_length, + const ColumnConst * column_length_const, + Int64 length_value, Source && source, + size_t input_rows_count) const + { + auto col_res = ColumnString::create(); + + if constexpr (direction == SubstringDirection::Left) + { + if (column_length_const) + sliceFromLeftConstantOffsetBounded(source, StringSink(*col_res, input_rows_count), 0, length_value); + else + sliceFromLeftDynamicLength(source, StringSink(*col_res, input_rows_count), *column_length); + } + else + { + if (column_length_const) + sliceFromRightConstantOffsetUnbounded(source, StringSink(*col_res, input_rows_count), length_value); + else + sliceFromRightDynamicLength(source, StringSink(*col_res, input_rows_count), *column_length); + } + + return col_res; + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + ColumnPtr column_string = arguments[0].column; + ColumnPtr column_length = arguments[1].column; + + const ColumnConst * column_length_const = checkAndGetColumn(column_length.get()); + + Int64 length_value = 0; + + if (column_length_const) + length_value = column_length_const->getInt(0); + + if constexpr (is_utf8) + { + if (const ColumnString * col = checkAndGetColumn(column_string.get())) + return executeForSource(column_length, column_length_const, + length_value, UTF8StringSource(*col), input_rows_count); + else if (const ColumnConst * col_const = checkAndGetColumnConst(column_string.get())) + return executeForSource(column_length, column_length_const, + length_value, ConstSource(*col_const), input_rows_count); + else + throw Exception( + "Illegal column " + arguments[0].column->getName() + " of first argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + else + { + if (const ColumnString * col = checkAndGetColumn(column_string.get())) + return executeForSource(column_length, column_length_const, + length_value, StringSource(*col), input_rows_count); + else if (const ColumnFixedString * col_fixed = checkAndGetColumn(column_string.get())) + return executeForSource(column_length, column_length_const, + length_value, FixedStringSource(*col_fixed), input_rows_count); + else if (const ColumnConst * col_const = checkAndGetColumnConst(column_string.get())) + return executeForSource(column_length, column_length_const, + length_value, ConstSource(*col_const), input_rows_count); + else if (const ColumnConst * col_const_fixed = 
checkAndGetColumnConst(column_string.get())) + return executeForSource(column_length, column_length_const, + length_value, ConstSource(*col_const_fixed), input_rows_count); + else + throw Exception( + "Illegal column " + arguments[0].column->getName() + " of first argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + } +}; + +} diff --git a/src/Functions/URL/decodeURLComponent.cpp b/src/Functions/URL/decodeURLComponent.cpp index b6abaab515e..9ed290b1832 100644 --- a/src/Functions/URL/decodeURLComponent.cpp +++ b/src/Functions/URL/decodeURLComponent.cpp @@ -12,7 +12,7 @@ namespace ErrorCodes } /// We assume that size of the dst buf isn't less than src_size. -static size_t decodeURL(const char * src, size_t src_size, char * dst) +static size_t decodeURL(const char * src, size_t src_size, char * dst, bool plus_as_space) { const char * src_prev_pos = src; const char * src_curr_pos = src; @@ -21,12 +21,28 @@ static size_t decodeURL(const char * src, size_t src_size, char * dst) while (true) { - src_curr_pos = find_first_symbols<'%'>(src_curr_pos, src_end); + src_curr_pos = find_first_symbols<'%', '+'>(src_curr_pos, src_end); if (src_curr_pos == src_end) { break; } + else if (*src_curr_pos == '+') + { + if (!plus_as_space) + { + ++src_curr_pos; + continue; + } + size_t bytes_to_copy = src_curr_pos - src_prev_pos; + memcpySmallAllowReadWriteOverflow15(dst_pos, src_prev_pos, bytes_to_copy); + dst_pos += bytes_to_copy; + + ++src_curr_pos; + src_prev_pos = src_curr_pos; + *dst_pos = ' '; + ++dst_pos; + } else if (src_end - src_curr_pos < 3) { src_curr_pos = src_end; @@ -67,6 +83,7 @@ static size_t decodeURL(const char * src, size_t src_size, char * dst) /// Percent decode of URL data. +template struct DecodeURLComponentImpl { static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, @@ -83,7 +100,7 @@ struct DecodeURLComponentImpl { const char * src_data = reinterpret_cast(&data[prev_offset]); size_t src_size = offsets[i] - prev_offset; - size_t dst_size = decodeURL(src_data, src_size, reinterpret_cast(res_data.data() + res_offset)); + size_t dst_size = decodeURL(src_data, src_size, reinterpret_cast(res_data.data() + res_offset), plus_as_space); res_offset += dst_size; res_offsets[i] = res_offset; @@ -101,11 +118,14 @@ struct DecodeURLComponentImpl struct NameDecodeURLComponent { static constexpr auto name = "decodeURLComponent"; }; -using FunctionDecodeURLComponent = FunctionStringToString; +struct NameDecodeURLFormComponent { static constexpr auto name = "decodeURLFormComponent"; }; +using FunctionDecodeURLComponent = FunctionStringToString, NameDecodeURLComponent>; +using FunctionDecodeURLFormComponent = FunctionStringToString, NameDecodeURLFormComponent>; void registerFunctionDecodeURLComponent(FunctionFactory & factory) { factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/Functions/array/arrayFirst.cpp b/src/Functions/array/arrayFirst.cpp index dbe545ea387..edbf7ef6269 100644 --- a/src/Functions/array/arrayFirst.cpp +++ b/src/Functions/array/arrayFirst.cpp @@ -11,7 +11,14 @@ namespace ErrorCodes extern const int ILLEGAL_COLUMN; } -struct ArrayFirstImpl +enum class ArrayFirstLastStrategy +{ + First, + Last +}; + +template +struct ArrayFirstLastImpl { static bool needBoolean() { return false; } static bool needExpression() { return true; } @@ -40,15 +47,23 @@ struct ArrayFirstImpl auto out = data.cloneEmpty(); out->reserve(data.size()); - size_t pos{}; - for (auto offset : offsets) + size_t offsets_size = 
offsets.size(); + for (size_t offset_index = 0; offset_index < offsets_size; ++offset_index) { - if (offset - pos > 0) - out->insert(data[pos]); - else - out->insertDefault(); + size_t start_offset = offsets[offset_index - 1]; + size_t end_offset = offsets[offset_index]; - pos = offset; + if (end_offset > start_offset) + { + if constexpr (strategy == ArrayFirstLastStrategy::First) + out->insert(data[start_offset]); + else + out->insert(data[end_offset - 1]); + } + else + { + out->insertDefault(); + } } return out; @@ -67,18 +82,36 @@ struct ArrayFirstImpl auto out = data.cloneEmpty(); out->reserve(data.size()); - size_t pos{}; - for (auto offset : offsets) + size_t offsets_size = offsets.size(); + for (size_t offset_index = 0; offset_index < offsets_size; ++offset_index) { - auto exists = false; - for (; pos < offset; ++pos) + size_t start_offset = offsets[offset_index - 1]; + size_t end_offset = offsets[offset_index]; + + bool exists = false; + + if constexpr (strategy == ArrayFirstLastStrategy::First) { - if (filter[pos]) + for (; start_offset != end_offset; ++start_offset) { - out->insert(data[pos]); - exists = true; - pos = offset; - break; + if (filter[start_offset]) + { + out->insert(data[start_offset]); + exists = true; + break; + } + } + } + else + { + for (; end_offset != start_offset; --end_offset) + { + if (filter[end_offset - 1]) + { + out->insert(data[end_offset - 1]); + exists = true; + break; + } } } @@ -91,11 +124,17 @@ struct ArrayFirstImpl }; struct NameArrayFirst { static constexpr auto name = "arrayFirst"; }; +using ArrayFirstImpl = ArrayFirstLastImpl; using FunctionArrayFirst = FunctionArrayMapped; +struct NameArrayLast { static constexpr auto name = "arrayLast"; }; +using ArrayLastImpl = ArrayFirstLastImpl; +using FunctionArrayLast = FunctionArrayMapped; + void registerFunctionArrayFirst(FunctionFactory & factory) { factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/Functions/array/arrayFirstIndex.cpp b/src/Functions/array/arrayFirstIndex.cpp deleted file mode 100644 index d229687774e..00000000000 --- a/src/Functions/array/arrayFirstIndex.cpp +++ /dev/null @@ -1,90 +0,0 @@ -#include -#include -#include "FunctionArrayMapped.h" -#include - - -namespace DB -{ -namespace ErrorCodes -{ - extern const int ILLEGAL_COLUMN; -} - -struct ArrayFirstIndexImpl -{ - static bool needBoolean() { return false; } - static bool needExpression() { return true; } - static bool needOneArray() { return false; } - - static DataTypePtr getReturnType(const DataTypePtr & /*expression_return*/, const DataTypePtr & /*array_element*/) - { - return std::make_shared(); - } - - static ColumnPtr execute(const ColumnArray & array, ColumnPtr mapped) - { - const auto * column_filter = typeid_cast(&*mapped); - - if (!column_filter) - { - const auto * column_filter_const = checkAndGetColumnConst(&*mapped); - - if (!column_filter_const) - throw Exception("Unexpected type of filter column", ErrorCodes::ILLEGAL_COLUMN); - - if (column_filter_const->getValue()) - { - const auto & offsets = array.getOffsets(); - auto out_column = ColumnUInt32::create(offsets.size()); - auto & out_index = out_column->getData(); - - size_t pos{}; - for (size_t i = 0; i < offsets.size(); ++i) - { - out_index[i] = offsets[i] - pos > 0; - pos = offsets[i]; - } - - return out_column; - } - else - return DataTypeUInt32().createColumnConst(array.size(), 0u); - } - - const auto & filter = column_filter->getData(); - const auto & offsets = array.getOffsets(); - auto out_column = 
ColumnUInt32::create(offsets.size()); - auto & out_index = out_column->getData(); - - size_t pos{}; - for (size_t i = 0; i < offsets.size(); ++i) - { - UInt32 index{}; - for (size_t idx{1}; pos < offsets[i]; ++pos, ++idx) - { - if (filter[pos]) - { - index = idx; - pos = offsets[i]; - break; - } - } - - out_index[i] = index; - } - - return out_column; - } -}; - -struct NameArrayFirstIndex { static constexpr auto name = "arrayFirstIndex"; }; -using FunctionArrayFirstIndex = FunctionArrayMapped; - -void registerFunctionArrayFirstIndex(FunctionFactory & factory) -{ - factory.registerFunction(); -} - -} - diff --git a/src/Functions/array/arrayFirstLastIndex.cpp b/src/Functions/array/arrayFirstLastIndex.cpp new file mode 100644 index 00000000000..467678f3faa --- /dev/null +++ b/src/Functions/array/arrayFirstLastIndex.cpp @@ -0,0 +1,134 @@ +#include +#include +#include "FunctionArrayMapped.h" +#include + + +namespace DB +{ +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; +} + +enum class ArrayFirstLastIndexStrategy +{ + First, + Last +}; + +template +struct ArrayFirstLastIndexImpl +{ + static bool needBoolean() { return false; } + static bool needExpression() { return true; } + static bool needOneArray() { return false; } + + static DataTypePtr getReturnType(const DataTypePtr & /*expression_return*/, const DataTypePtr & /*array_element*/) + { + return std::make_shared(); + } + + static ColumnPtr execute(const ColumnArray & array, ColumnPtr mapped) + { + const auto * column_filter = typeid_cast(&*mapped); + + if (!column_filter) + { + const auto * column_filter_const = checkAndGetColumnConst(&*mapped); + + if (!column_filter_const) + throw Exception("Unexpected type of filter column", ErrorCodes::ILLEGAL_COLUMN); + + if (column_filter_const->getValue()) + { + const auto & offsets = array.getOffsets(); + auto out_column = ColumnUInt32::create(offsets.size()); + auto & out_index = out_column->getData(); + + size_t offsets_size = offsets.size(); + for (size_t offset_index = 0; offset_index < offsets_size; ++offset_index) + { + size_t start_offset = offsets[offset_index - 1]; + size_t end_offset = offsets[offset_index]; + + if (end_offset > start_offset) + { + if constexpr (strategy == ArrayFirstLastIndexStrategy::First) + out_index[offset_index] = 1; + else + out_index[offset_index] = end_offset - start_offset; + } + else + { + out_index[offset_index] = 0; + } + } + + return out_column; + } + else + { + return DataTypeUInt32().createColumnConst(array.size(), 0u); + } + } + + const auto & filter = column_filter->getData(); + const auto & offsets = array.getOffsets(); + + size_t offsets_size = offsets.size(); + auto out_column = ColumnUInt32::create(offsets_size); + auto & out_index = out_column->getData(); + + for (size_t offset_index = 0; offset_index < offsets_size; ++offset_index) + { + size_t start_offset = offsets[offset_index - 1]; + size_t end_offset = offsets[offset_index]; + size_t result_index = 0; + + if constexpr (strategy == ArrayFirstLastIndexStrategy::First) + { + for (size_t index = 1; start_offset != end_offset; ++start_offset, ++index) + { + if (filter[start_offset]) + { + result_index = index; + break; + } + } + } + else + { + for (size_t index = end_offset - start_offset; end_offset != start_offset; --end_offset, --index) + { + if (filter[end_offset - 1]) + { + result_index = index; + break; + } + } + } + + out_index[offset_index] = result_index; + } + + return out_column; + } +}; + +struct NameArrayFirstIndex { static constexpr auto name = "arrayFirstIndex"; }; +using 
ArrayFirstIndexImpl = ArrayFirstLastIndexImpl; +using FunctionArrayFirstIndex = FunctionArrayMapped; + +struct NameArrayLastIndex { static constexpr auto name = "arrayLastIndex"; }; +using ArrayLastIndexImpl = ArrayFirstLastIndexImpl; +using FunctionArrayLastIndex = FunctionArrayMapped; + +void registerFunctionArrayFirstIndex(FunctionFactory & factory) +{ + factory.registerFunction(); + factory.registerFunction(); +} + +} + diff --git a/src/Functions/h3CellAreaM2.cpp b/src/Functions/h3CellAreaM2.cpp new file mode 100644 index 00000000000..d110d0d92f9 --- /dev/null +++ b/src/Functions/h3CellAreaM2.cpp @@ -0,0 +1,90 @@ +#include "config_functions.h" + +#if USE_H3 + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +namespace DB +{ +namespace ErrorCodes +{ +extern const int ILLEGAL_TYPE_OF_ARGUMENT; +extern const int ILLEGAL_COLUMN; +} + +namespace +{ + +class FunctionH3CellAreaM2 final : public IFunction +{ +public: + static constexpr auto name = "h3CellAreaM2"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + std::string getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 1; } + bool useDefaultImplementationForConstants() const override { return true; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + const auto * arg = arguments[0].get(); + if (!WhichDataType(arg).isUInt64()) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of argument {} of function {}. Must be UInt64", + arg->getName(), 1, getName()); + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + const auto * column = checkAndGetColumn(arguments[0].column.get()); + if (!column) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. 
Must be UInt64.", + arguments[0].type->getName(), + 1, + getName()); + + const auto & data = column->getData(); + + auto dst = ColumnVector::create(); + auto & dst_data = dst->getData(); + dst_data.resize(input_rows_count); + + for (size_t row = 0; row < input_rows_count; ++row) + { + const UInt64 index = data[row]; + Float64 res = cellAreaM2(index); + dst_data[row] = res; + } + + return dst; + } +}; + +} + +void registerFunctionH3CellAreaM2(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} + +#endif diff --git a/src/Functions/h3CellAreaRads2.cpp b/src/Functions/h3CellAreaRads2.cpp new file mode 100644 index 00000000000..1a257b0d9d3 --- /dev/null +++ b/src/Functions/h3CellAreaRads2.cpp @@ -0,0 +1,90 @@ +#include "config_functions.h" + +#if USE_H3 + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +namespace DB +{ +namespace ErrorCodes +{ +extern const int ILLEGAL_TYPE_OF_ARGUMENT; +extern const int ILLEGAL_COLUMN; +} + +namespace +{ + +class FunctionH3CellAreaRads2 final : public IFunction +{ +public: + static constexpr auto name = "h3CellAreaRads2"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + std::string getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 1; } + bool useDefaultImplementationForConstants() const override { return true; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + const auto * arg = arguments[0].get(); + if (!WhichDataType(arg).isUInt64()) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of argument {} of function {}. Must be UInt64", + arg->getName(), 1, getName()); + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + const auto * column = checkAndGetColumn(arguments[0].column.get()); + if (!column) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. 
Must be UInt64", + arguments[0].type->getName(), + 1, + getName()); + + const auto & data = column->getData(); + + auto dst = ColumnVector::create(); + auto & dst_data = dst->getData(); + dst_data.resize(input_rows_count); + + for (size_t row = 0; row < input_rows_count; ++row) + { + const UInt64 index = data[row]; + Float64 res = cellAreaRads2(index); + dst_data[row] = res; + } + + return dst; + } +}; + +} + +void registerFunctionH3CellAreaRads2(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} + +#endif diff --git a/src/Functions/h3DegsToRads.cpp b/src/Functions/h3DegsToRads.cpp new file mode 100644 index 00000000000..b3afc28f5a2 --- /dev/null +++ b/src/Functions/h3DegsToRads.cpp @@ -0,0 +1,90 @@ +#include "config_functions.h" + +#if USE_H3 + +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ +namespace ErrorCodes +{ +extern const int ILLEGAL_TYPE_OF_ARGUMENT; +extern const int ILLEGAL_COLUMN; +} + +namespace +{ + +class FunctionH3DegsToRads final : public IFunction +{ +public: + static constexpr auto name = "h3DegsToRads"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + std::string getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 1; } + + bool useDefaultImplementationForConstants() const override { return true; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + const auto * arg = arguments[0].get(); + if (!WhichDataType(arg).isFloat64()) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of argument {} of function {}. Must be Float64", + arg->getName(), 1, getName()); + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + const auto * column = checkAndGetColumn(arguments[0].column.get()); + + if (!column) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. 
Must be Float64", + arguments[0].type->getName(), + 1, + getName()); + + const auto & data = column->getData(); + + auto dst = ColumnVector::create(); + auto & dst_data = dst->getData(); + dst_data.resize(input_rows_count); + + for (size_t row = 0; row < input_rows_count; ++row) + { + const Float64 degrees = data[row]; + auto res = degsToRads(degrees); + dst_data[row] = res; + } + + return dst; + } +}; + +} + +void registerFunctionH3DegsToRads(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} + +#endif diff --git a/src/Functions/h3HexAreaKm2.cpp b/src/Functions/h3HexAreaKm2.cpp new file mode 100644 index 00000000000..933fcf21424 --- /dev/null +++ b/src/Functions/h3HexAreaKm2.cpp @@ -0,0 +1,99 @@ +#include "config_functions.h" + +#if USE_H3 + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +namespace DB +{ +namespace ErrorCodes +{ +extern const int ILLEGAL_TYPE_OF_ARGUMENT; +extern const int ARGUMENT_OUT_OF_BOUND; +extern const int ILLEGAL_COLUMN; +} + +namespace +{ + +class FunctionH3HexAreaKm2 final : public IFunction +{ +public: + static constexpr auto name = "h3HexAreaKm2"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + std::string getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 1; } + bool useDefaultImplementationForConstants() const override { return true; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + const auto * arg = arguments[0].get(); + if (!WhichDataType(arg).isUInt8()) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of argument {} of function {}. Must be UInt8", + arg->getName(), 1, getName()); + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + const auto * column = checkAndGetColumn(arguments[0].column.get()); + if (!column) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. 
Must be UInt8", + arguments[0].column->getName(), + 1, + getName()); + + const auto & data = column->getData(); + + auto dst = ColumnVector::create(); + auto & dst_data = dst->getData(); + dst_data.resize(input_rows_count); + + for (size_t row = 0; row < input_rows_count; ++row) + { + const UInt64 resolution = data[row]; + if (resolution > MAX_H3_RES) + throw Exception( + ErrorCodes::ARGUMENT_OUT_OF_BOUND, + "The argument 'resolution' ({}) of function {} is out of bounds because the maximum resolution in H3 library is ", + resolution, + getName(), + MAX_H3_RES); + + Float64 res = getHexagonAreaAvgKm2(resolution); + dst_data[row] = res; + } + + return dst; + } +}; + +} + +void registerFunctionH3HexAreaKm2(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} + +#endif diff --git a/src/Functions/h3RadsToDegs.cpp b/src/Functions/h3RadsToDegs.cpp new file mode 100644 index 00000000000..99b8969e13f --- /dev/null +++ b/src/Functions/h3RadsToDegs.cpp @@ -0,0 +1,88 @@ +#include "config_functions.h" + +#if USE_H3 + +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ +namespace ErrorCodes +{ +extern const int ILLEGAL_TYPE_OF_ARGUMENT; +extern const int ILLEGAL_COLUMN; +} + +namespace +{ + +class FunctionH3RadsToDegs final : public IFunction +{ +public: + static constexpr auto name = "h3RadsToDegs"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + std::string getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 1; } + + bool useDefaultImplementationForConstants() const override { return true; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + const auto * arg = arguments[0].get(); + if (!WhichDataType(arg).isFloat64()) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of argument {} of function {}. Must be Float64", + arg->getName(), 1, getName()); + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + const auto * column = checkAndGetColumn(arguments[0].column.get()); + if (!column) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal type {} of argument {} of function {}. 
Must be Float64", + arguments[0].type->getName(), + 1, + getName()); + + const auto & col_rads = column->getData(); + + auto dst = ColumnVector::create(); + auto & dst_data = dst->getData(); + dst_data.resize(input_rows_count); + + for (size_t row = 0; row < input_rows_count; ++row) + { + const Float64 rads = col_rads[row]; + auto res = radsToDegs(rads); + dst_data[row] = res; + } + return dst; + } +}; + +} + +void registerFunctionH3RadsToDegs(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} + +#endif diff --git a/src/Functions/left.cpp b/src/Functions/left.cpp new file mode 100644 index 00000000000..aa7a2cdd5a8 --- /dev/null +++ b/src/Functions/left.cpp @@ -0,0 +1,13 @@ +#include +#include + +namespace DB +{ + +void registerFunctionLeft(FunctionFactory & factory) +{ + factory.registerFunction>(FunctionFactory::CaseInsensitive); + factory.registerFunction>(FunctionFactory::CaseSensitive); +} + +} diff --git a/src/Functions/monthName.cpp b/src/Functions/monthName.cpp new file mode 100644 index 00000000000..c397fdffaa5 --- /dev/null +++ b/src/Functions/monthName.cpp @@ -0,0 +1,80 @@ +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +class FunctionMonthName : public IFunction +{ +public: + static constexpr auto name = "monthName"; + + static constexpr auto month_str = "month"; + + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + + explicit FunctionMonthName(ContextPtr context_) + : function_resolver(FunctionFactory::instance().get("dateName", std::move(context_))) + {} + + String getName() const override { return name; } + + bool useDefaultImplementationForConstants() const override { return true; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + + size_t getNumberOfArguments() const override { return 1; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + if (arguments.size() != 1) + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be 1", + getName(), + toString(arguments.size())); + + WhichDataType argument_type(arguments[0].type); + if (!argument_type.isDate() && !argument_type.isDateTime() && !argument_type.isDateTime64()) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of argument of function {}, should be Date, DateTime or DateTime64", + getName()); + + return std::make_shared(); + } + + ColumnPtr executeImpl( + const ColumnsWithTypeAndName & arguments, + const DataTypePtr & result_type, + size_t input_rows_count) const override + { + auto month_column = DataTypeString().createColumnConst(arguments[0].column->size(), month_str); + ColumnsWithTypeAndName temporary_columns + { + ColumnWithTypeAndName(month_column, std::make_shared(), ""), + arguments[0] + }; + + auto date_name_func = function_resolver->build(temporary_columns); + return date_name_func->execute(temporary_columns, result_type, input_rows_count); + } + +private: + FunctionOverloadResolverPtr function_resolver; +}; + +void registerFunctionMonthName(FunctionFactory & factory) +{ + factory.registerFunction(FunctionFactory::CaseInsensitive); +} + +} diff --git a/src/Functions/registerFunctionsDateTime.cpp b/src/Functions/registerFunctionsDateTime.cpp index abbc52c8360..5211a62ff1e 100644 --- 
a/src/Functions/registerFunctionsDateTime.cpp +++ b/src/Functions/registerFunctionsDateTime.cpp @@ -65,6 +65,7 @@ void registerFunctionSubtractQuarters(FunctionFactory &); void registerFunctionSubtractYears(FunctionFactory &); void registerFunctionDateDiff(FunctionFactory &); void registerFunctionDateName(FunctionFactory &); +void registerFunctionMonthName(FunctionFactory &); void registerFunctionToTimeZone(FunctionFactory &); void registerFunctionFormatDateTime(FunctionFactory &); void registerFunctionFromModifiedJulianDay(FunctionFactory &); @@ -136,6 +137,7 @@ void registerFunctionsDateTime(FunctionFactory & factory) registerFunctionSubtractYears(factory); registerFunctionDateDiff(factory); registerFunctionDateName(factory); + registerFunctionMonthName(factory); registerFunctionToTimeZone(factory); registerFunctionFormatDateTime(factory); registerFunctionFromModifiedJulianDay(factory); diff --git a/src/Functions/registerFunctionsGeo.cpp b/src/Functions/registerFunctionsGeo.cpp index b24dc4cb9c0..d0bb47ea3d7 100644 --- a/src/Functions/registerFunctionsGeo.cpp +++ b/src/Functions/registerFunctionsGeo.cpp @@ -43,6 +43,12 @@ void registerFunctionH3HexAreaM2(FunctionFactory &); void registerFunctionH3IsResClassIII(FunctionFactory &); void registerFunctionH3IsPentagon(FunctionFactory &); void registerFunctionH3GetFaces(FunctionFactory &); +void registerFunctionH3DegsToRads(FunctionFactory &); +void registerFunctionH3RadsToDegs(FunctionFactory &); +void registerFunctionH3HexAreaKm2(FunctionFactory &); +void registerFunctionH3CellAreaM2(FunctionFactory &); +void registerFunctionH3CellAreaRads2(FunctionFactory &); + #endif #if USE_S2_GEOMETRY @@ -99,6 +105,11 @@ void registerFunctionsGeo(FunctionFactory & factory) registerFunctionH3IsResClassIII(factory); registerFunctionH3IsPentagon(factory); registerFunctionH3GetFaces(factory); + registerFunctionH3DegsToRads(factory); + registerFunctionH3RadsToDegs(factory); + registerFunctionH3HexAreaKm2(factory); + registerFunctionH3CellAreaM2(factory); + registerFunctionH3CellAreaRads2(factory); #endif #if USE_S2_GEOMETRY diff --git a/src/Functions/registerFunctionsString.cpp b/src/Functions/registerFunctionsString.cpp index 79002f0a97d..7d1673aff7c 100644 --- a/src/Functions/registerFunctionsString.cpp +++ b/src/Functions/registerFunctionsString.cpp @@ -23,6 +23,8 @@ void registerFunctionsConcat(FunctionFactory &); void registerFunctionFormat(FunctionFactory &); void registerFunctionFormatRow(FunctionFactory &); void registerFunctionSubstring(FunctionFactory &); +void registerFunctionLeft(FunctionFactory &); +void registerFunctionRight(FunctionFactory &); void registerFunctionCRC(FunctionFactory &); void registerFunctionAppendTrailingCharIfAbsent(FunctionFactory &); void registerFunctionStartsWith(FunctionFactory &); @@ -74,6 +76,8 @@ void registerFunctionsString(FunctionFactory & factory) registerFunctionFormat(factory); registerFunctionFormatRow(factory); registerFunctionSubstring(factory); + registerFunctionLeft(factory); + registerFunctionRight(factory); registerFunctionAppendTrailingCharIfAbsent(factory); registerFunctionStartsWith(factory); registerFunctionEndsWith(factory); diff --git a/src/Functions/replicate.h b/src/Functions/replicate.h index 2455fda39c9..6012207980e 100644 --- a/src/Functions/replicate.h +++ b/src/Functions/replicate.h @@ -34,6 +34,8 @@ public: bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + DataTypePtr 
getReturnTypeImpl(const DataTypes & arguments) const override; ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override; diff --git a/src/Functions/right.cpp b/src/Functions/right.cpp new file mode 100644 index 00000000000..ca0df8b2d98 --- /dev/null +++ b/src/Functions/right.cpp @@ -0,0 +1,13 @@ +#include +#include + +namespace DB +{ + +void registerFunctionRight(FunctionFactory & factory) +{ + factory.registerFunction>(FunctionFactory::CaseInsensitive); + factory.registerFunction>(FunctionFactory::CaseSensitive); +} + +} diff --git a/src/Functions/toStartOfInterval.cpp b/src/Functions/toStartOfInterval.cpp index f8ea44851b6..09b7931de8d 100644 --- a/src/Functions/toStartOfInterval.cpp +++ b/src/Functions/toStartOfInterval.cpp @@ -37,7 +37,7 @@ namespace static UInt16 execute(UInt16 d, UInt64 years, const DateLUTImpl & time_zone) { - return time_zone.toStartOfYearInterval(ExtendedDayNum(d), years); + return time_zone.toStartOfYearInterval(DayNum(d), years); } static UInt16 execute(Int32 d, UInt64 years, const DateLUTImpl & time_zone) @@ -63,7 +63,7 @@ namespace static UInt16 execute(UInt16 d, UInt64 quarters, const DateLUTImpl & time_zone) { - return time_zone.toStartOfQuarterInterval(ExtendedDayNum(d), quarters); + return time_zone.toStartOfQuarterInterval(DayNum(d), quarters); } static UInt16 execute(Int32 d, UInt64 quarters, const DateLUTImpl & time_zone) @@ -89,7 +89,7 @@ namespace static UInt16 execute(UInt16 d, UInt64 months, const DateLUTImpl & time_zone) { - return time_zone.toStartOfMonthInterval(ExtendedDayNum(d), months); + return time_zone.toStartOfMonthInterval(DayNum(d), months); } static UInt16 execute(Int32 d, UInt64 months, const DateLUTImpl & time_zone) @@ -115,7 +115,7 @@ namespace static UInt16 execute(UInt16 d, UInt64 weeks, const DateLUTImpl & time_zone) { - return time_zone.toStartOfWeekInterval(ExtendedDayNum(d), weeks); + return time_zone.toStartOfWeekInterval(DayNum(d), weeks); } static UInt16 execute(Int32 d, UInt64 weeks, const DateLUTImpl & time_zone) diff --git a/src/IO/AsynchronousReadBufferFromFile.cpp b/src/IO/AsynchronousReadBufferFromFile.cpp index 9327b80738d..969384cd91c 100644 --- a/src/IO/AsynchronousReadBufferFromFile.cpp +++ b/src/IO/AsynchronousReadBufferFromFile.cpp @@ -30,8 +30,10 @@ AsynchronousReadBufferFromFile::AsynchronousReadBufferFromFile( size_t buf_size, int flags, char * existing_memory, - size_t alignment) - : AsynchronousReadBufferFromFileDescriptor(std::move(reader_), priority_, -1, buf_size, existing_memory, alignment), file_name(file_name_) + size_t alignment, + std::optional file_size_) + : AsynchronousReadBufferFromFileDescriptor(std::move(reader_), priority_, -1, buf_size, existing_memory, alignment, file_size_) + , file_name(file_name_) { ProfileEvents::increment(ProfileEvents::FileOpen); @@ -62,10 +64,10 @@ AsynchronousReadBufferFromFile::AsynchronousReadBufferFromFile( const std::string & original_file_name, size_t buf_size, char * existing_memory, - size_t alignment) - : - AsynchronousReadBufferFromFileDescriptor(std::move(reader_), priority_, fd_, buf_size, existing_memory, alignment), - file_name(original_file_name.empty() ? "(fd = " + toString(fd_) + ")" : original_file_name) + size_t alignment, + std::optional file_size_) + : AsynchronousReadBufferFromFileDescriptor(std::move(reader_), priority_, fd_, buf_size, existing_memory, alignment, file_size_) + , file_name(original_file_name.empty() ? 
"(fd = " + toString(fd_) + ")" : original_file_name) { fd_ = -1; } diff --git a/src/IO/AsynchronousReadBufferFromFile.h b/src/IO/AsynchronousReadBufferFromFile.h index d9d5e43e0d4..96834350bab 100644 --- a/src/IO/AsynchronousReadBufferFromFile.h +++ b/src/IO/AsynchronousReadBufferFromFile.h @@ -14,17 +14,25 @@ protected: public: explicit AsynchronousReadBufferFromFile( - AsynchronousReaderPtr reader_, Int32 priority_, - const std::string & file_name_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, int flags = -1, - char * existing_memory = nullptr, size_t alignment = 0); + AsynchronousReaderPtr reader_, + Int32 priority_, + const std::string & file_name_, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + int flags = -1, + char * existing_memory = nullptr, + size_t alignment = 0, + std::optional file_size_ = std::nullopt); /// Use pre-opened file descriptor. explicit AsynchronousReadBufferFromFile( - AsynchronousReaderPtr reader_, Int32 priority_, + AsynchronousReaderPtr reader_, + Int32 priority_, int & fd, /// Will be set to -1 if constructor didn't throw and ownership of file descriptor is passed to the object. const std::string & original_file_name = {}, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, - char * existing_memory = nullptr, size_t alignment = 0); + char * existing_memory = nullptr, + size_t alignment = 0, + std::optional file_size_ = std::nullopt); ~AsynchronousReadBufferFromFile() override; @@ -48,11 +56,16 @@ private: public: AsynchronousReadBufferFromFileWithDescriptorsCache( - AsynchronousReaderPtr reader_, Int32 priority_, - const std::string & file_name_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, int flags = -1, - char * existing_memory = nullptr, size_t alignment = 0) - : AsynchronousReadBufferFromFileDescriptor(std::move(reader_), priority_, -1, buf_size, existing_memory, alignment), - file_name(file_name_) + AsynchronousReaderPtr reader_, + Int32 priority_, + const std::string & file_name_, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + int flags = -1, + char * existing_memory = nullptr, + size_t alignment = 0, + std::optional file_size_ = std::nullopt) + : AsynchronousReadBufferFromFileDescriptor(std::move(reader_), priority_, -1, buf_size, existing_memory, alignment, file_size_) + , file_name(file_name_) { file = OpenedFileCache::instance().get(file_name, flags); fd = file->getFD(); diff --git a/src/IO/AsynchronousReadBufferFromFileDescriptor.cpp b/src/IO/AsynchronousReadBufferFromFileDescriptor.cpp index a27c9035c61..9c92201b3a1 100644 --- a/src/IO/AsynchronousReadBufferFromFileDescriptor.cpp +++ b/src/IO/AsynchronousReadBufferFromFileDescriptor.cpp @@ -44,6 +44,15 @@ std::future AsynchronousReadBufferFromFileDescripto request.offset = file_offset_of_buffer_end; request.priority = priority; + /// This is a workaround of a read pass EOF bug in linux kernel with pread() + if (file_size.has_value() && file_offset_of_buffer_end >= *file_size) + { + return std::async(std::launch::deferred, [] + { + return IAsynchronousReader::Result{ .size = 0, .offset = 0 }; + }); + } + return reader->submit(request); } diff --git a/src/IO/AsynchronousReadBufferFromFileDescriptor.h b/src/IO/AsynchronousReadBufferFromFileDescriptor.h index 50d8f5819fe..2a16148812e 100644 --- a/src/IO/AsynchronousReadBufferFromFileDescriptor.h +++ b/src/IO/AsynchronousReadBufferFromFileDescriptor.h @@ -35,10 +35,18 @@ protected: public: AsynchronousReadBufferFromFileDescriptor( - AsynchronousReaderPtr reader_, Int32 priority_, - int fd_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = 
nullptr, size_t alignment = 0) - : ReadBufferFromFileBase(buf_size, existing_memory, alignment), - reader(std::move(reader_)), priority(priority_), required_alignment(alignment), fd(fd_) + AsynchronousReaderPtr reader_, + Int32 priority_, + int fd_, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + char * existing_memory = nullptr, + size_t alignment = 0, + std::optional file_size_ = std::nullopt) + : ReadBufferFromFileBase(buf_size, existing_memory, alignment, file_size_) + , reader(std::move(reader_)) + , priority(priority_) + , required_alignment(alignment) + , fd(fd_) { prefetch_buffer.alignment = alignment; } diff --git a/src/IO/BrotliReadBuffer.cpp b/src/IO/BrotliReadBuffer.cpp index b66bbf45054..77069746153 100644 --- a/src/IO/BrotliReadBuffer.cpp +++ b/src/IO/BrotliReadBuffer.cpp @@ -39,7 +39,7 @@ BrotliReadBuffer::BrotliReadBuffer(std::unique_ptr in_, size_t buf_s , in_data(nullptr) , out_capacity(0) , out_data(nullptr) - , eof(false) + , eof_flag(false) { } @@ -47,7 +47,7 @@ BrotliReadBuffer::~BrotliReadBuffer() = default; bool BrotliReadBuffer::nextImpl() { - if (eof) + if (eof_flag) return false; if (!in_available) @@ -74,7 +74,7 @@ bool BrotliReadBuffer::nextImpl() { if (in->eof()) { - eof = true; + eof_flag = true; return !working_buffer.empty(); } else diff --git a/src/IO/BrotliReadBuffer.h b/src/IO/BrotliReadBuffer.h index 0fa999d1de5..44a7dc7ddbd 100644 --- a/src/IO/BrotliReadBuffer.h +++ b/src/IO/BrotliReadBuffer.h @@ -32,7 +32,7 @@ private: size_t out_capacity; uint8_t * out_data; - bool eof; + bool eof_flag; }; } diff --git a/src/IO/Bzip2ReadBuffer.cpp b/src/IO/Bzip2ReadBuffer.cpp index df9a8d5b369..c2060612757 100644 --- a/src/IO/Bzip2ReadBuffer.cpp +++ b/src/IO/Bzip2ReadBuffer.cpp @@ -42,7 +42,7 @@ Bzip2ReadBuffer::Bzip2ReadBuffer(std::unique_ptr in_, size_t buf_siz : BufferWithOwnMemory(buf_size, existing_memory, alignment) , in(std::move(in_)) , bz(std::make_unique()) - , eof(false) + , eof_flag(false) { } @@ -50,7 +50,7 @@ Bzip2ReadBuffer::~Bzip2ReadBuffer() = default; bool Bzip2ReadBuffer::nextImpl() { - if (eof) + if (eof_flag) return false; if (!bz->stream.avail_in) @@ -72,7 +72,7 @@ bool Bzip2ReadBuffer::nextImpl() { if (in->eof()) { - eof = true; + eof_flag = true; return !working_buffer.empty(); } else @@ -91,7 +91,7 @@ bool Bzip2ReadBuffer::nextImpl() if (in->eof()) { - eof = true; + eof_flag = true; throw Exception(ErrorCodes::UNEXPECTED_END_OF_FILE, "Unexpected end of bzip2 archive"); } diff --git a/src/IO/Bzip2ReadBuffer.h b/src/IO/Bzip2ReadBuffer.h index dc113800683..de1e61ee388 100644 --- a/src/IO/Bzip2ReadBuffer.h +++ b/src/IO/Bzip2ReadBuffer.h @@ -26,7 +26,7 @@ private: class Bzip2StateWrapper; std::unique_ptr bz; - bool eof; + bool eof_flag; }; } diff --git a/src/IO/LZMAInflatingReadBuffer.cpp b/src/IO/LZMAInflatingReadBuffer.cpp index f2df6bdca6a..80da7421fc3 100644 --- a/src/IO/LZMAInflatingReadBuffer.cpp +++ b/src/IO/LZMAInflatingReadBuffer.cpp @@ -7,7 +7,7 @@ namespace ErrorCodes extern const int LZMA_STREAM_DECODER_FAILED; } LZMAInflatingReadBuffer::LZMAInflatingReadBuffer(std::unique_ptr in_, size_t buf_size, char * existing_memory, size_t alignment) - : BufferWithOwnMemory(buf_size, existing_memory, alignment), in(std::move(in_)), eof(false) + : BufferWithOwnMemory(buf_size, existing_memory, alignment), in(std::move(in_)), eof_flag(false) { lstr = LZMA_STREAM_INIT; lstr.allocator = nullptr; @@ -36,7 +36,7 @@ LZMAInflatingReadBuffer::~LZMAInflatingReadBuffer() bool LZMAInflatingReadBuffer::nextImpl() { - if (eof) + if (eof_flag) return false; 
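
The optional file_size threaded through the read-buffer constructors above exists so that a read can be short-circuited once the known end of the file has been reached, which is how the patch works around the pread()-past-EOF kernel issue mentioned in its comments. Below is a minimal standalone sketch of that guard only; the helper name is hypothetical and plain POSIX pread() stands in for ClickHouse's reader interface.

    #include <optional>
    #include <cstddef>
    #include <sys/types.h>
    #include <unistd.h>

    // Hypothetical helper: read `count` bytes at `offset`, but if the file size
    // is already known and the offset is at or past it, report EOF immediately
    // instead of issuing a pread() -- the same short-circuit the patch adds
    // before submitting a read request.
    ssize_t readAtOffsetBounded(int fd, char * buf, size_t count, off_t offset,
                                std::optional<size_t> known_file_size)
    {
        if (known_file_size && static_cast<size_t>(offset) >= *known_file_size)
            return 0;   // treat as end of file, do not touch the kernel

        return ::pread(fd, buf, count, offset);
    }
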
lzma_action action = LZMA_RUN; @@ -64,7 +64,7 @@ bool LZMAInflatingReadBuffer::nextImpl() { if (in->eof()) { - eof = true; + eof_flag = true; return !working_buffer.empty(); } else diff --git a/src/IO/LZMAInflatingReadBuffer.h b/src/IO/LZMAInflatingReadBuffer.h index 18922f64516..2d676eeeeb3 100644 --- a/src/IO/LZMAInflatingReadBuffer.h +++ b/src/IO/LZMAInflatingReadBuffer.h @@ -25,7 +25,7 @@ private: std::unique_ptr in; lzma_stream lstr; - bool eof; + bool eof_flag; }; } diff --git a/src/IO/Lz4InflatingReadBuffer.cpp b/src/IO/Lz4InflatingReadBuffer.cpp index 22bce94cad2..61e912d440c 100644 --- a/src/IO/Lz4InflatingReadBuffer.cpp +++ b/src/IO/Lz4InflatingReadBuffer.cpp @@ -32,7 +32,7 @@ Lz4InflatingReadBuffer::~Lz4InflatingReadBuffer() bool Lz4InflatingReadBuffer::nextImpl() { - if (eof) + if (eof_flag) return false; if (!in_available) @@ -66,7 +66,7 @@ bool Lz4InflatingReadBuffer::nextImpl() if (in->eof()) { - eof = true; + eof_flag = true; return !working_buffer.empty(); } diff --git a/src/IO/Lz4InflatingReadBuffer.h b/src/IO/Lz4InflatingReadBuffer.h index 0462d85adf7..d4d81f8765c 100644 --- a/src/IO/Lz4InflatingReadBuffer.h +++ b/src/IO/Lz4InflatingReadBuffer.h @@ -35,7 +35,7 @@ private: size_t in_available; size_t out_available; - bool eof = false; + bool eof_flag = false; }; } diff --git a/src/IO/ReadBufferFromFile.cpp b/src/IO/ReadBufferFromFile.cpp index d0f94441622..4f601301686 100644 --- a/src/IO/ReadBufferFromFile.cpp +++ b/src/IO/ReadBufferFromFile.cpp @@ -28,8 +28,9 @@ ReadBufferFromFile::ReadBufferFromFile( size_t buf_size, int flags, char * existing_memory, - size_t alignment) - : ReadBufferFromFileDescriptor(-1, buf_size, existing_memory, alignment), file_name(file_name_) + size_t alignment, + std::optional file_size_) + : ReadBufferFromFileDescriptor(-1, buf_size, existing_memory, alignment, file_size_), file_name(file_name_) { ProfileEvents::increment(ProfileEvents::FileOpen); @@ -58,10 +59,10 @@ ReadBufferFromFile::ReadBufferFromFile( const std::string & original_file_name, size_t buf_size, char * existing_memory, - size_t alignment) - : - ReadBufferFromFileDescriptor(fd_, buf_size, existing_memory, alignment), - file_name(original_file_name.empty() ? "(fd = " + toString(fd_) + ")" : original_file_name) + size_t alignment, + std::optional file_size_) + : ReadBufferFromFileDescriptor(fd_, buf_size, existing_memory, alignment, file_size_) + , file_name(original_file_name.empty() ? "(fd = " + toString(fd_) + ")" : original_file_name) { fd_ = -1; } diff --git a/src/IO/ReadBufferFromFile.h b/src/IO/ReadBufferFromFile.h index 1a45e4c1829..ff19fa40fdf 100644 --- a/src/IO/ReadBufferFromFile.h +++ b/src/IO/ReadBufferFromFile.h @@ -23,15 +23,22 @@ protected: CurrentMetrics::Increment metric_increment{CurrentMetrics::OpenFileForRead}; public: - explicit ReadBufferFromFile(const std::string & file_name_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, int flags = -1, - char * existing_memory = nullptr, size_t alignment = 0); + explicit ReadBufferFromFile( + const std::string & file_name_, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + int flags = -1, + char * existing_memory = nullptr, + size_t alignment = 0, + std::optional file_size_ = std::nullopt); /// Use pre-opened file descriptor. explicit ReadBufferFromFile( int & fd, /// Will be set to -1 if constructor didn't throw and ownership of file descriptor is passed to the object. 
const std::string & original_file_name = {}, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, - char * existing_memory = nullptr, size_t alignment = 0); + char * existing_memory = nullptr, + size_t alignment = 0, + std::optional file_size_ = std::nullopt); ~ReadBufferFromFile() override; @@ -50,9 +57,14 @@ public: class ReadBufferFromFilePRead : public ReadBufferFromFile { public: - ReadBufferFromFilePRead(const std::string & file_name_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, int flags = -1, - char * existing_memory = nullptr, size_t alignment = 0) - : ReadBufferFromFile(file_name_, buf_size, flags, existing_memory, alignment) + ReadBufferFromFilePRead( + const std::string & file_name_, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + int flags = -1, + char * existing_memory = nullptr, + size_t alignment = 0, + std::optional file_size_ = std::nullopt) + : ReadBufferFromFile(file_name_, buf_size, flags, existing_memory, alignment, file_size_) { use_pread = true; } @@ -68,10 +80,15 @@ private: OpenedFileCache::OpenedFilePtr file; public: - ReadBufferFromFilePReadWithDescriptorsCache(const std::string & file_name_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, int flags = -1, - char * existing_memory = nullptr, size_t alignment = 0) - : ReadBufferFromFileDescriptorPRead(-1, buf_size, existing_memory, alignment), - file_name(file_name_) + ReadBufferFromFilePReadWithDescriptorsCache( + const std::string & file_name_, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + int flags = -1, + char * existing_memory = nullptr, + size_t alignment = 0, + std::optional file_size_ = std::nullopt) + : ReadBufferFromFileDescriptorPRead(-1, buf_size, existing_memory, alignment, file_size_) + , file_name(file_name_) { file = OpenedFileCache::instance().get(file_name, flags); fd = file->getFD(); diff --git a/src/IO/ReadBufferFromFileBase.cpp b/src/IO/ReadBufferFromFileBase.cpp index b598501a608..4db64755abf 100644 --- a/src/IO/ReadBufferFromFileBase.cpp +++ b/src/IO/ReadBufferFromFileBase.cpp @@ -7,8 +7,13 @@ ReadBufferFromFileBase::ReadBufferFromFileBase() : BufferWithOwnMemory file_size_) : BufferWithOwnMemory(buf_size, existing_memory, alignment) + , file_size(file_size_) { } diff --git a/src/IO/ReadBufferFromFileBase.h b/src/IO/ReadBufferFromFileBase.h index 731fd373e24..a051283b2bb 100644 --- a/src/IO/ReadBufferFromFileBase.h +++ b/src/IO/ReadBufferFromFileBase.h @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -22,7 +23,11 @@ class ReadBufferFromFileBase : public BufferWithOwnMemory { public: ReadBufferFromFileBase(); - ReadBufferFromFileBase(size_t buf_size, char * existing_memory, size_t alignment); + ReadBufferFromFileBase( + size_t buf_size, + char * existing_memory, + size_t alignment, + std::optional file_size_ = std::nullopt); ~ReadBufferFromFileBase() override; virtual std::string getFileName() const = 0; @@ -44,6 +49,7 @@ public: } protected: + std::optional file_size; ProfileCallback profile_callback; clockid_t clock_type{}; }; diff --git a/src/IO/ReadBufferFromFileDescriptor.cpp b/src/IO/ReadBufferFromFileDescriptor.cpp index ed8eba62f04..ed6b1a60181 100644 --- a/src/IO/ReadBufferFromFileDescriptor.cpp +++ b/src/IO/ReadBufferFromFileDescriptor.cpp @@ -54,6 +54,10 @@ bool ReadBufferFromFileDescriptor::nextImpl() /// If internal_buffer size is empty, then read() cannot be distinguished from EOF assert(!internal_buffer.empty()); + /// This is a workaround of a read pass EOF bug in linux kernel with pread() + if (file_size.has_value() && file_offset_of_buffer_end >= *file_size) + return 
false; + size_t bytes_read = 0; while (!bytes_read) { diff --git a/src/IO/ReadBufferFromFileDescriptor.h b/src/IO/ReadBufferFromFileDescriptor.h index 8dbe8707bdb..188cdd709b5 100644 --- a/src/IO/ReadBufferFromFileDescriptor.h +++ b/src/IO/ReadBufferFromFileDescriptor.h @@ -27,8 +27,15 @@ protected: std::string getFileName() const override; public: - ReadBufferFromFileDescriptor(int fd_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0) - : ReadBufferFromFileBase(buf_size, existing_memory, alignment), required_alignment(alignment), fd(fd_) + ReadBufferFromFileDescriptor( + int fd_, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + char * existing_memory = nullptr, + size_t alignment = 0, + std::optional file_size_ = std::nullopt) + : ReadBufferFromFileBase(buf_size, existing_memory, alignment, file_size_) + , required_alignment(alignment) + , fd(fd_) { } @@ -63,8 +70,13 @@ private: class ReadBufferFromFileDescriptorPRead : public ReadBufferFromFileDescriptor { public: - ReadBufferFromFileDescriptorPRead(int fd_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0) - : ReadBufferFromFileDescriptor(fd_, buf_size, existing_memory, alignment) + ReadBufferFromFileDescriptorPRead( + int fd_, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + char * existing_memory = nullptr, + size_t alignment = 0, + std::optional file_size_ = std::nullopt) + : ReadBufferFromFileDescriptor(fd_, buf_size, existing_memory, alignment, file_size_) { use_pread = true; } diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index b0a6838b81e..48811a41edd 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -702,6 +702,25 @@ void readCSVString(String & s, ReadBuffer & buf, const FormatSettings::CSV & set readCSVStringInto(s, buf, settings); } +void readCSVField(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings) +{ + s.clear(); + bool add_quote = false; + char quote = '\''; + + if (!buf.eof() && (*buf.position() == '\'' || *buf.position() == '"')) + { + quote = *buf.position(); + s.push_back(quote); + add_quote = true; + } + + readCSVStringInto(s, buf, settings); + + if (add_quote) + s.push_back(quote); +} + template void readCSVStringInto>(PaddedPODArray & s, ReadBuffer & buf, const FormatSettings::CSV & settings); @@ -1212,6 +1231,19 @@ void skipToNextRowOrEof(PeekableReadBuffer & buf, const String & row_after_delim } } +// Use PeekableReadBuffer to copy field to string after parsing. +template +static void readParsedValueIntoString(String & s, ReadBuffer & buf, ParseFunc parse_func) +{ + PeekableReadBuffer peekable_buf(buf); + peekable_buf.setCheckpoint(); + parse_func(peekable_buf); + peekable_buf.makeContinuousMemoryFromCheckpointToPos(); + auto * end = peekable_buf.position(); + peekable_buf.rollbackToCheckpoint(); + s.append(peekable_buf.position(), end); + peekable_buf.position() = end; +} template static void readQuotedFieldInBrackets(String & s, ReadBuffer & buf) @@ -1266,7 +1298,11 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf) /// - Number: integer, float, decimal. if (*buf.position() == '\'') - readQuotedString(s, buf); + { + s.push_back('\''); + readQuotedStringInto(s, buf); + s.push_back('\''); + } else if (*buf.position() == '[') readQuotedFieldInBrackets<'[', ']'>(s, buf); else if (*buf.position() == '(') @@ -1290,18 +1326,19 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf) else { /// It's an integer, float or decimal. 
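
The readParsedValueIntoString helper introduced above captures the raw text of a value by parsing it once to find where it ends and then copying the consumed bytes verbatim, rather than re-serializing the parsed value. The following is a sketch of the same idea on a plain character buffer, with an assumed name and strtod standing in for readFloatText; it is not the patch's PeekableReadBuffer code.

    #include <cstdlib>
    #include <string>

    // Parse a floating-point token starting at `pos`, but append its original
    // spelling to `out` instead of the re-rendered value; return the position
    // just past the token so the caller can continue from there.
    const char * appendRawFloatToken(const char * pos, std::string & out)
    {
        char * end = nullptr;
        std::strtod(pos, &end);   // only used to locate the end of the token
        out.append(pos, end);     // copy the raw characters that were consumed
        return end;
    }
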
They all can be parsed as float. - /// Use PeekableReadBuffer to copy field to string after parsing. - PeekableReadBuffer peekable_buf(buf); - peekable_buf.setCheckpoint(); - Float64 tmp; - readFloatText(tmp, peekable_buf); - peekable_buf.makeContinuousMemoryFromCheckpointToPos(); - auto * end = peekable_buf.position(); - peekable_buf.rollbackToCheckpoint(); - s.append(peekable_buf.position(), end); - peekable_buf.position() = end; + auto parse_func = [](ReadBuffer & in) + { + Float64 tmp; + readFloatText(tmp, in); + }; + readParsedValueIntoString(s, buf, parse_func); } } +void readJSONFieldIntoString(String & s, ReadBuffer & buf) +{ + auto parse_func = [](ReadBuffer & in) { skipJSONField(in, "json_field"); }; + readParsedValueIntoString(s, buf, parse_func); +} } diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index b2ad4035cdc..6d1023947a5 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -563,6 +563,8 @@ void readStringUntilWhitespace(String & s, ReadBuffer & buf); */ void readCSVString(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings); +/// Differ from readCSVString in that it doesn't remove quotes around field if any. +void readCSVField(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings); /// Read and append result to array of characters. template @@ -1381,4 +1383,7 @@ struct PcgDeserializer void readQuotedFieldIntoString(String & s, ReadBuffer & buf); +void readJSONFieldIntoString(String & s, ReadBuffer & buf); + } + diff --git a/src/IO/ZlibInflatingReadBuffer.cpp b/src/IO/ZlibInflatingReadBuffer.cpp index 472399dea3d..28426e920ef 100644 --- a/src/IO/ZlibInflatingReadBuffer.cpp +++ b/src/IO/ZlibInflatingReadBuffer.cpp @@ -16,7 +16,7 @@ ZlibInflatingReadBuffer::ZlibInflatingReadBuffer( size_t alignment) : BufferWithOwnMemory(buf_size, existing_memory, alignment) , in(std::move(in_)) - , eof(false) + , eof_flag(false) { zstr.zalloc = nullptr; zstr.zfree = nullptr; @@ -54,7 +54,7 @@ bool ZlibInflatingReadBuffer::nextImpl() do { /// if we already found eof, we shouldn't do anything - if (eof) + if (eof_flag) return false; /// if there is no available bytes in zstr, move ptr to next available data @@ -83,7 +83,7 @@ bool ZlibInflatingReadBuffer::nextImpl() /// * false if there is no data in working buffer if (in->eof()) { - eof = true; + eof_flag = true; return !working_buffer.empty(); } /// If it is not end of file, we need to reset zstr and return true, because we still have some data to read diff --git a/src/IO/ZlibInflatingReadBuffer.h b/src/IO/ZlibInflatingReadBuffer.h index b8c141e9b9b..905ab0cd3fc 100644 --- a/src/IO/ZlibInflatingReadBuffer.h +++ b/src/IO/ZlibInflatingReadBuffer.h @@ -33,7 +33,7 @@ private: std::unique_ptr in; z_stream zstr; - bool eof; + bool eof_flag; }; } diff --git a/src/IO/ZstdInflatingReadBuffer.cpp b/src/IO/ZstdInflatingReadBuffer.cpp index ce89f09f955..6f244dc5a75 100644 --- a/src/IO/ZstdInflatingReadBuffer.cpp +++ b/src/IO/ZstdInflatingReadBuffer.cpp @@ -31,7 +31,7 @@ bool ZstdInflatingReadBuffer::nextImpl() do { // If it is known that end of file was reached, return false - if (eof) + if (eof_flag) return false; /// If end was reached, get next part @@ -64,7 +64,7 @@ bool ZstdInflatingReadBuffer::nextImpl() /// If end of file is reached, fill eof variable and return true if there is some data in buffer, otherwise return false if (in->eof()) { - eof = true; + eof_flag = true; return !working_buffer.empty(); } /// It is possible, that input buffer is not at eof yet, but nothing was decompressed in 
current iteration. diff --git a/src/IO/ZstdInflatingReadBuffer.h b/src/IO/ZstdInflatingReadBuffer.h index e6e2dad0ad5..ec80b860e0e 100644 --- a/src/IO/ZstdInflatingReadBuffer.h +++ b/src/IO/ZstdInflatingReadBuffer.h @@ -31,7 +31,7 @@ private: ZSTD_DCtx * dctx; ZSTD_inBuffer input; ZSTD_outBuffer output; - bool eof = false; + bool eof_flag = false; }; } diff --git a/src/IO/createReadBufferFromFileBase.cpp b/src/IO/createReadBufferFromFileBase.cpp index bed97d54ab0..b83bfdbf3a8 100644 --- a/src/IO/createReadBufferFromFileBase.cpp +++ b/src/IO/createReadBufferFromFileBase.cpp @@ -29,14 +29,20 @@ namespace ErrorCodes std::unique_ptr createReadBufferFromFileBase( const std::string & filename, const ReadSettings & settings, - std::optional size, + std::optional read_hint, + std::optional file_size, int flags, char * existing_memory, size_t alignment) { - if (size.has_value() && !*size) + if (file_size.has_value() && !*file_size) return std::make_unique(); - size_t estimated_size = size.has_value() ? *size : 0; + + size_t estimated_size = 0; + if (read_hint.has_value()) + estimated_size = *read_hint; + else if (file_size.has_value()) + estimated_size = file_size.has_value() ? *file_size : 0; if (!existing_memory && settings.local_fs_method == LocalFSReadMethod::mmap @@ -63,23 +69,23 @@ std::unique_ptr createReadBufferFromFileBase( if (settings.local_fs_method == LocalFSReadMethod::read) { - res = std::make_unique(filename, buffer_size, actual_flags, existing_memory, alignment); + res = std::make_unique(filename, buffer_size, actual_flags, existing_memory, alignment, file_size); } else if (settings.local_fs_method == LocalFSReadMethod::pread || settings.local_fs_method == LocalFSReadMethod::mmap) { - res = std::make_unique(filename, buffer_size, actual_flags, existing_memory, alignment); + res = std::make_unique(filename, buffer_size, actual_flags, existing_memory, alignment, file_size); } else if (settings.local_fs_method == LocalFSReadMethod::pread_fake_async) { static AsynchronousReaderPtr reader = std::make_shared(); res = std::make_unique( - reader, settings.priority, filename, buffer_size, actual_flags, existing_memory, alignment); + reader, settings.priority, filename, buffer_size, actual_flags, existing_memory, alignment, file_size); } else if (settings.local_fs_method == LocalFSReadMethod::pread_threadpool) { static AsynchronousReaderPtr reader = std::make_shared(16, 1000000); res = std::make_unique( - reader, settings.priority, filename, buffer_size, actual_flags, existing_memory, alignment); + reader, settings.priority, filename, buffer_size, actual_flags, existing_memory, alignment, file_size); } else throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown read method"); diff --git a/src/IO/createReadBufferFromFileBase.h b/src/IO/createReadBufferFromFileBase.h index 86da469b55d..c2e2040587b 100644 --- a/src/IO/createReadBufferFromFileBase.h +++ b/src/IO/createReadBufferFromFileBase.h @@ -11,12 +11,14 @@ namespace DB /** Create an object to read data from a file. 
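
createReadBufferFromFileBase now distinguishes a read hint (how much the caller expects to read) from the actual file size, and the buffer-size estimate prefers the former. A compact standalone restatement of that fallback, with a hypothetical function name:

    #include <cstddef>
    #include <optional>

    // Prefer the caller's read hint, fall back to the known file size, else 0
    // (meaning "unknown, use the default buffer size").
    size_t estimateReadSize(std::optional<size_t> read_hint, std::optional<size_t> file_size)
    {
        if (read_hint)
            return *read_hint;
        if (file_size)
            return *file_size;
        return 0;
    }
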
* - * @param size - the number of bytes to read + * @param read_hint - the number of bytes to read hint + * @param file_size - size of file */ std::unique_ptr createReadBufferFromFileBase( const std::string & filename, const ReadSettings & settings, - std::optional size = {}, + std::optional read_hint = {}, + std::optional file_size = {}, int flags_ = -1, char * existing_memory = nullptr, size_t alignment = 0); diff --git a/src/IO/examples/write_buffer.cpp b/src/IO/examples/write_buffer.cpp index 5587b8aa1a2..bca0be24b1a 100644 --- a/src/IO/examples/write_buffer.cpp +++ b/src/IO/examples/write_buffer.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include diff --git a/src/IO/parseDateTimeBestEffort.cpp b/src/IO/parseDateTimeBestEffort.cpp index 4f00ec5f96d..3b05d8c76b6 100644 --- a/src/IO/parseDateTimeBestEffort.cpp +++ b/src/IO/parseDateTimeBestEffort.cpp @@ -151,7 +151,18 @@ ReturnType parseDateTimeBestEffortImpl( { num_digits = readDigits(digits, sizeof(digits), in); - if (num_digits == 10 && !year && !has_time) + if (num_digits == 13 && !year && !has_time) + { + /// This is unix timestamp with millisecond. + readDecimalNumber<10>(res, digits); + if (fractional) + { + fractional->digits = 3; + readDecimalNumber<3>(fractional->value, digits + 10); + } + return ReturnType(true); + } + else if (num_digits == 10 && !year && !has_time) { /// This is unix timestamp. readDecimalNumber<10>(res, digits); diff --git a/src/Interpreters/ActionsVisitor.cpp b/src/Interpreters/ActionsVisitor.cpp index d5eb701e2aa..bc937755618 100644 --- a/src/Interpreters/ActionsVisitor.cpp +++ b/src/Interpreters/ActionsVisitor.cpp @@ -697,6 +697,10 @@ ASTs ActionsMatcher::doUntuple(const ASTFunction * function, ActionsMatcher::Dat for (const auto & name [[maybe_unused]] : tuple_type->getElementNames()) { auto tuple_ast = function->arguments->children[0]; + + /// This transformation can lead to exponential growth of AST size, let's check it. + tuple_ast->checkSize(data.getContext()->getSettingsRef().max_ast_elements); + if (tid != 0) tuple_ast = tuple_ast->clone(); diff --git a/src/Interpreters/Aggregator.cpp b/src/Interpreters/Aggregator.cpp index ae5ce117c61..5c9d94d7c45 100644 --- a/src/Interpreters/Aggregator.cpp +++ b/src/Interpreters/Aggregator.cpp @@ -361,7 +361,6 @@ void Aggregator::compileAggregateFunctionsIfNeeded() auto compiled_aggregate_functions = compileAggregateFunctions(getJITInstance(), functions_to_compile, functions_description); return std::make_shared(std::move(compiled_aggregate_functions)); }); - compiled_aggregate_functions_holder = std::static_pointer_cast(compiled_function_cache_entry); } else diff --git a/src/Interpreters/Cluster.cpp b/src/Interpreters/Cluster.cpp index b7b6b84439b..05972f2ee50 100644 --- a/src/Interpreters/Cluster.cpp +++ b/src/Interpreters/Cluster.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -101,7 +102,7 @@ Cluster::Address::Address( user = config.getString(config_prefix + ".user", "default"); password = config.getString(config_prefix + ".password", ""); default_database = config.getString(config_prefix + ".default_database", ""); - secure = config.getBool(config_prefix + ".secure", false) ? Protocol::Secure::Enable : Protocol::Secure::Disable; + secure = ConfigHelper::getBool(config, config_prefix + ".secure", false, /* empty_as */true) ? Protocol::Secure::Enable : Protocol::Secure::Disable; priority = config.getInt(config_prefix + ".priority", 1); const char * port_type = secure == Protocol::Secure::Enable ? 
"tcp_port_secure" : "tcp_port"; is_local = isLocal(config.getInt(port_type, 0)); diff --git a/src/Interpreters/ExternalUserDefinedExecutableFunctionsLoader.cpp b/src/Interpreters/ExternalUserDefinedExecutableFunctionsLoader.cpp index 2de7b4b7846..b266746642f 100644 --- a/src/Interpreters/ExternalUserDefinedExecutableFunctionsLoader.cpp +++ b/src/Interpreters/ExternalUserDefinedExecutableFunctionsLoader.cpp @@ -1,5 +1,7 @@ #include "ExternalUserDefinedExecutableFunctionsLoader.h" +#include + #include #include @@ -54,29 +56,44 @@ ExternalLoader::LoadablePtr ExternalUserDefinedExecutableFunctionsLoader::create throw Exception(ErrorCodes::FUNCTION_ALREADY_EXISTS, "The aggregate function '{}' already exists", name); String type = config.getString(key_in_config + ".type"); - UserDefinedExecutableFunctionType function_type; + + bool is_executable_pool = false; if (type == "executable") - function_type = UserDefinedExecutableFunctionType::executable; + is_executable_pool = false; else if (type == "executable_pool") - function_type = UserDefinedExecutableFunctionType::executable_pool; + is_executable_pool = true; else throw Exception(ErrorCodes::BAD_ARGUMENTS, "Wrong user defined function type expected 'executable' or 'executable_pool' actual {}", - function_type); + type); + + bool execute_direct = config.getBool(key_in_config + ".execute_direct", true); + + String command_value = config.getString(key_in_config + ".command"); + std::vector command_arguments; + + if (execute_direct) + { + boost::split(command_arguments, command_value, [](char c) { return c == ' '; }); + + command_value = std::move(command_arguments[0]); + command_arguments.erase(command_arguments.begin()); + } - String command = config.getString(key_in_config + ".command"); String format = config.getString(key_in_config + ".format"); DataTypePtr result_type = DataTypeFactory::instance().get(config.getString(key_in_config + ".return_type")); bool send_chunk_header = config.getBool(key_in_config + ".send_chunk_header", false); + size_t command_termination_timeout_seconds = config.getUInt64(key_in_config + ".command_termination_timeout", 10); + size_t command_read_timeout_milliseconds = config.getUInt64(key_in_config + ".command_read_timeout", 10000); + size_t command_write_timeout_milliseconds = config.getUInt64(key_in_config + ".command_write_timeout", 10000); size_t pool_size = 0; - size_t command_termination_timeout = 0; size_t max_command_execution_time = 0; - if (function_type == UserDefinedExecutableFunctionType::executable_pool) + + if (is_executable_pool) { pool_size = config.getUInt64(key_in_config + ".pool_size", 16); - command_termination_timeout = config.getUInt64(key_in_config + ".command_termination_timeout", 10); max_command_execution_time = config.getUInt64(key_in_config + ".max_command_execution_time", 10); size_t max_execution_time_seconds = static_cast(getContext()->getSettings().max_execution_time.totalSeconds()); @@ -106,19 +123,28 @@ ExternalLoader::LoadablePtr ExternalUserDefinedExecutableFunctionsLoader::create UserDefinedExecutableFunctionConfiguration function_configuration { - .type = function_type, .name = std::move(name), //-V1030 - .script_path = std::move(command), //-V1030 - .format = std::move(format), //-V1030 + .command = std::move(command_value), //-V1030 + .command_arguments = std::move(command_arguments), //-V1030 .argument_types = std::move(argument_types), //-V1030 .result_type = std::move(result_type), //-V1030 - .pool_size = pool_size, - .command_termination_timeout = 
command_termination_timeout, - .max_command_execution_time = max_command_execution_time, - .send_chunk_header = send_chunk_header }; - return std::make_shared(function_configuration, lifetime); + ShellCommandSourceCoordinator::Configuration shell_command_coordinator_configration + { + .format = std::move(format), //-V1030 + .command_termination_timeout_seconds = command_termination_timeout_seconds, + .command_read_timeout_milliseconds = command_read_timeout_milliseconds, + .command_write_timeout_milliseconds = command_write_timeout_milliseconds, + .pool_size = pool_size, + .max_command_execution_time_seconds = max_command_execution_time, + .is_executable_pool = is_executable_pool, + .send_chunk_header = send_chunk_header, + .execute_direct = execute_direct + }; + + auto coordinator = std::make_shared(shell_command_coordinator_configration); + return std::make_shared(function_configuration, std::move(coordinator), lifetime); } } diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 5f7c54e427f..2475d437acb 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -48,10 +48,15 @@ BlockIO InterpreterAlterQuery::execute() FunctionNameNormalizer().visit(query_ptr.get()); const auto & alter = query_ptr->as(); if (alter.alter_object == ASTAlterQuery::AlterObjectType::DATABASE) + { return executeToDatabase(alter); + } else if (alter.alter_object == ASTAlterQuery::AlterObjectType::TABLE || alter.alter_object == ASTAlterQuery::AlterObjectType::LIVE_VIEW) + { return executeToTable(alter); + } + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown alter object type"); } diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 8f003e75a07..7ddb0c8c26e 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -637,13 +637,14 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti /// Table function without columns list. auto table_function = TableFunctionFactory::instance().get(create.as_table_function, getContext()); properties.columns = table_function->getActualTableStructure(getContext()); - assert(!properties.columns.empty()); } else if (create.is_dictionary) { return {}; } - else + /// We can have queries like "CREATE TABLE ENGINE=" if + /// supports schema inference (will determine table structure in it's constructor). + else if (!StorageFactory::instance().checkIfStorageSupportsSchemaInterface(create.storage->engine->name)) throw Exception("Incorrect CREATE query: required list of column descriptions or AS section or SELECT.", ErrorCodes::INCORRECT_QUERY); /// Even if query has list of columns, canonicalize it (unfold Nested columns). @@ -1083,7 +1084,10 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, { const auto & factory = TableFunctionFactory::instance(); auto table_func = factory.get(create.as_table_function, getContext()); - res = table_func->execute(create.as_table_function, getContext(), create.getTable(), properties.columns); + /// In case of CREATE AS table_function() query we should use global context + /// in storage creation because there will be no query context on server startup + /// and because storage lifetime is bigger than query context lifetime. 
+ res = table_func->execute(create.as_table_function, getContext(), create.getTable(), properties.columns, /*use_global_context=*/true); res->renameInMemory({create.getDatabase(), create.getTable(), create.uuid}); } else diff --git a/src/Interpreters/InterpreterExplainQuery.cpp b/src/Interpreters/InterpreterExplainQuery.cpp index e3a6812124f..fdb35637a9a 100644 --- a/src/Interpreters/InterpreterExplainQuery.cpp +++ b/src/Interpreters/InterpreterExplainQuery.cpp @@ -7,10 +7,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include @@ -250,117 +252,141 @@ QueryPipeline InterpreterExplainQuery::executeImpl() WriteBufferFromOwnString buf; bool single_line = false; + bool insert_buf = true; - if (ast.getKind() == ASTExplainQuery::ParsedAST) + switch (ast.getKind()) { - if (ast.getSettings()) - throw Exception("Settings are not supported for EXPLAIN AST query.", ErrorCodes::UNKNOWN_SETTING); - - dumpAST(*ast.getExplainedQuery(), buf); - } - else if (ast.getKind() == ASTExplainQuery::AnalyzedSyntax) - { - if (ast.getSettings()) - throw Exception("Settings are not supported for EXPLAIN SYNTAX query.", ErrorCodes::UNKNOWN_SETTING); - - ExplainAnalyzedSyntaxVisitor::Data data(getContext()); - ExplainAnalyzedSyntaxVisitor(data).visit(query); - - ast.getExplainedQuery()->format(IAST::FormatSettings(buf, false)); - } - else if (ast.getKind() == ASTExplainQuery::QueryPlan) - { - if (!dynamic_cast(ast.getExplainedQuery().get())) - throw Exception("Only SELECT is supported for EXPLAIN query", ErrorCodes::INCORRECT_QUERY); - - auto settings = checkAndGetSettings(ast.getSettings()); - QueryPlan plan; - - InterpreterSelectWithUnionQuery interpreter(ast.getExplainedQuery(), getContext(), SelectQueryOptions()); - interpreter.buildQueryPlan(plan); - - if (settings.optimize) - plan.optimize(QueryPlanOptimizationSettings::fromContext(getContext())); - - if (settings.json) + case ASTExplainQuery::ParsedAST: { - /// Add extra layers to make plan look more like from postgres. 
- auto plan_map = std::make_unique(); - plan_map->add("Plan", plan.explainPlan(settings.query_plan_options)); - auto plan_array = std::make_unique(); - plan_array->add(std::move(plan_map)); + if (ast.getSettings()) + throw Exception("Settings are not supported for EXPLAIN AST query.", ErrorCodes::UNKNOWN_SETTING); - auto format_settings = getFormatSettings(getContext()); - format_settings.json.quote_64bit_integers = false; - - JSONBuilder::FormatSettings json_format_settings{.settings = format_settings}; - JSONBuilder::FormatContext format_context{.out = buf}; - - plan_array->format(json_format_settings, format_context); - - single_line = true; + dumpAST(*ast.getExplainedQuery(), buf); + break; } - else - plan.explainPlan(buf, settings.query_plan_options); - } - else if (ast.getKind() == ASTExplainQuery::QueryPipeline) - { - if (dynamic_cast(ast.getExplainedQuery().get())) + case ASTExplainQuery::AnalyzedSyntax: { - auto settings = checkAndGetSettings(ast.getSettings()); + if (ast.getSettings()) + throw Exception("Settings are not supported for EXPLAIN SYNTAX query.", ErrorCodes::UNKNOWN_SETTING); + + ExplainAnalyzedSyntaxVisitor::Data data(getContext()); + ExplainAnalyzedSyntaxVisitor(data).visit(query); + + ast.getExplainedQuery()->format(IAST::FormatSettings(buf, false)); + break; + } + case ASTExplainQuery::QueryPlan: + { + if (!dynamic_cast(ast.getExplainedQuery().get())) + throw Exception("Only SELECT is supported for EXPLAIN query", ErrorCodes::INCORRECT_QUERY); + + auto settings = checkAndGetSettings(ast.getSettings()); QueryPlan plan; InterpreterSelectWithUnionQuery interpreter(ast.getExplainedQuery(), getContext(), SelectQueryOptions()); interpreter.buildQueryPlan(plan); - auto pipeline = plan.buildQueryPipeline( + + if (settings.optimize) + plan.optimize(QueryPlanOptimizationSettings::fromContext(getContext())); + + if (settings.json) + { + /// Add extra layers to make plan look more like from postgres. 
+ auto plan_map = std::make_unique(); + plan_map->add("Plan", plan.explainPlan(settings.query_plan_options)); + auto plan_array = std::make_unique(); + plan_array->add(std::move(plan_map)); + + auto format_settings = getFormatSettings(getContext()); + format_settings.json.quote_64bit_integers = false; + + JSONBuilder::FormatSettings json_format_settings{.settings = format_settings}; + JSONBuilder::FormatContext format_context{.out = buf}; + + plan_array->format(json_format_settings, format_context); + + single_line = true; + } + else + plan.explainPlan(buf, settings.query_plan_options); + break; + } + case ASTExplainQuery::QueryPipeline: + { + if (dynamic_cast(ast.getExplainedQuery().get())) + { + auto settings = checkAndGetSettings(ast.getSettings()); + QueryPlan plan; + + InterpreterSelectWithUnionQuery interpreter(ast.getExplainedQuery(), getContext(), SelectQueryOptions()); + interpreter.buildQueryPlan(plan); + auto pipeline = plan.buildQueryPipeline( + QueryPlanOptimizationSettings::fromContext(getContext()), + BuildQueryPipelineSettings::fromContext(getContext())); + + if (settings.graph) + { + /// Pipe holds QueryPlan, should not go out-of-scope + auto pipe = QueryPipelineBuilder::getPipe(std::move(*pipeline)); + const auto & processors = pipe.getProcessors(); + + if (settings.compact) + printPipelineCompact(processors, buf, settings.query_pipeline_options.header); + else + printPipeline(processors, buf); + } + else + { + plan.explainPipeline(buf, settings.query_pipeline_options); + } + } + else if (dynamic_cast(ast.getExplainedQuery().get())) + { + InterpreterInsertQuery insert(ast.getExplainedQuery(), getContext()); + auto io = insert.execute(); + printPipeline(io.pipeline.getProcessors(), buf); + } + else + throw Exception("Only SELECT and INSERT is supported for EXPLAIN PIPELINE query", ErrorCodes::INCORRECT_QUERY); + break; + } + case ASTExplainQuery::QueryEstimates: + { + if (!dynamic_cast(ast.getExplainedQuery().get())) + throw Exception("Only SELECT is supported for EXPLAIN ESTIMATE query", ErrorCodes::INCORRECT_QUERY); + + auto settings = checkAndGetSettings(ast.getSettings()); + QueryPlan plan; + + InterpreterSelectWithUnionQuery interpreter(ast.getExplainedQuery(), getContext(), SelectQueryOptions()); + interpreter.buildQueryPlan(plan); + // collect the selected marks, rows, parts during build query pipeline. 
+ plan.buildQueryPipeline( QueryPlanOptimizationSettings::fromContext(getContext()), BuildQueryPipelineSettings::fromContext(getContext())); - if (settings.graph) - { - /// Pipe holds QueryPlan, should not go out-of-scope - auto pipe = QueryPipelineBuilder::getPipe(std::move(*pipeline)); - const auto & processors = pipe.getProcessors(); - - if (settings.compact) - printPipelineCompact(processors, buf, settings.query_pipeline_options.header); - else - printPipeline(processors, buf); - } - else - { - plan.explainPipeline(buf, settings.query_pipeline_options); - } + if (settings.optimize) + plan.optimize(QueryPlanOptimizationSettings::fromContext(getContext())); + plan.explainEstimate(res_columns); + insert_buf = false; + break; } - else if (dynamic_cast(ast.getExplainedQuery().get())) + case ASTExplainQuery::TableOverride: { - InterpreterInsertQuery insert(ast.getExplainedQuery(), getContext()); - auto io = insert.execute(); - printPipeline(io.pipeline.getProcessors(), buf); + if (auto * table_function = ast.getTableFunction()->as(); !table_function || table_function->name != "mysql") + { + throw Exception(ErrorCodes::INCORRECT_QUERY, "EXPLAIN TABLE OVERRIDE is not supported for the {}() table function", table_function->name); + } + auto storage = getContext()->getQueryContext()->executeTableFunction(ast.getTableFunction()); + auto metadata_snapshot = storage->getInMemoryMetadata(); + TableOverrideAnalyzer::Result override_info; + TableOverrideAnalyzer override_analyzer(ast.getTableOverride()); + override_analyzer.analyze(metadata_snapshot, override_info); + override_info.appendTo(buf); + break; } - else - throw Exception("Only SELECT and INSERT is supported for EXPLAIN PIPELINE query", ErrorCodes::INCORRECT_QUERY); } - else if (ast.getKind() == ASTExplainQuery::QueryEstimates) - { - if (!dynamic_cast(ast.getExplainedQuery().get())) - throw Exception("Only SELECT is supported for EXPLAIN ESTIMATE query", ErrorCodes::INCORRECT_QUERY); - - auto settings = checkAndGetSettings(ast.getSettings()); - QueryPlan plan; - - InterpreterSelectWithUnionQuery interpreter(ast.getExplainedQuery(), getContext(), SelectQueryOptions()); - interpreter.buildQueryPlan(plan); - // collect the selected marks, rows, parts during build query pipeline. 
- plan.buildQueryPipeline( - QueryPlanOptimizationSettings::fromContext(getContext()), - BuildQueryPipelineSettings::fromContext(getContext())); - - if (settings.optimize) - plan.optimize(QueryPlanOptimizationSettings::fromContext(getContext())); - plan.explainEstimate(res_columns); - } - if (ast.getKind() != ASTExplainQuery::QueryEstimates) + if (insert_buf) { if (single_line) res_columns[0]->insertData(buf.str().data(), buf.str().size()); diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index a1f83c81a81..d340308122f 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -263,6 +263,10 @@ BlockIO InterpreterInsertQuery::execute() QueryPipelineBuilder pipeline; StoragePtr table = getTable(query); + StoragePtr inner_table; + if (const auto * mv = dynamic_cast(table.get())) + inner_table = mv->getTargetTable(); + if (query.partition_by && !table->supportsPartitionBy()) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "PARTITION BY clause is not supported by storage"); @@ -450,11 +454,8 @@ BlockIO InterpreterInsertQuery::execute() } res.pipeline.addStorageHolder(table); - if (const auto * mv = dynamic_cast(table.get())) - { - if (auto inner_table = mv->tryGetTargetTable()) - res.pipeline.addStorageHolder(inner_table); - } + if (inner_table) + res.pipeline.addStorageHolder(inner_table); return res; } diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 5c0322ac1d9..8e0f73f0b31 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -401,7 +401,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( view = nullptr; } - if (try_move_to_prewhere && storage && storage->supportsPrewhere() && query.where() && !query.prewhere()) + if (try_move_to_prewhere && storage && storage->canMoveConditionsToPrewhere() && query.where() && !query.prewhere()) { /// PREWHERE optimization: transfer some condition from WHERE to PREWHERE if enabled and viable if (const auto & column_sizes = storage->getColumnSizes(); !column_sizes.empty()) @@ -1977,6 +1977,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc if (!options.ignore_quota && (options.to_stage == QueryProcessingStage::Complete)) quota = context->getQuota(); + query_info.settings_limit_offset_done = options.settings_limit_offset_done; storage->read(query_plan, required_columns, metadata_snapshot, query_info, context, processing_stage, max_block_size, max_streams); if (context->hasQueryContext() && !options.is_internal) diff --git a/src/Interpreters/InterpreterSelectWithUnionQuery.cpp b/src/Interpreters/InterpreterSelectWithUnionQuery.cpp index 6779093a53d..e4b3e62c358 100644 --- a/src/Interpreters/InterpreterSelectWithUnionQuery.cpp +++ b/src/Interpreters/InterpreterSelectWithUnionQuery.cpp @@ -83,7 +83,7 @@ InterpreterSelectWithUnionQuery::InterpreterSelectWithUnionQuery( } } - if (num_children == 1 && settings_limit_offset_needed) + if (num_children == 1 && settings_limit_offset_needed && !options.settings_limit_offset_done) { const ASTPtr first_select_ast = ast->list_of_selects->children.at(0); ASTSelectQuery * select_query = dynamic_cast(first_select_ast.get()); @@ -127,7 +127,7 @@ InterpreterSelectWithUnionQuery::InterpreterSelectWithUnionQuery( select_query->setExpression(ASTSelectQuery::Expression::LIMIT_LENGTH, std::move(new_limit_length_ast)); } - settings_limit_offset_done = true; + 
options.settings_limit_offset_done = true; } } @@ -305,7 +305,7 @@ void InterpreterSelectWithUnionQuery::buildQueryPlan(QueryPlan & query_plan) } } - if (settings_limit_offset_needed && !settings_limit_offset_done) + if (settings_limit_offset_needed && !options.settings_limit_offset_done) { if (settings.limit > 0) { diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index b39ededaa91..123ff6ba2ca 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -250,6 +250,7 @@ BlockIO InterpreterSystemQuery::execute() } case Type::SUSPEND: { + getContext()->checkAccess(AccessType::SYSTEM_SHUTDOWN); auto command = fmt::format("kill -STOP {0} && sleep {1} && kill -CONT {0}", getpid(), query.seconds); LOG_DEBUG(log, "Will run {}", command); auto res = ShellCommand::execute(command); @@ -453,9 +454,11 @@ BlockIO InterpreterSystemQuery::execute() case Type::START_LISTEN_QUERIES: throw Exception(ErrorCodes::NOT_IMPLEMENTED, "{} is not supported yet", query.type); case Type::STOP_THREAD_FUZZER: + getContext()->checkAccess(AccessType::SYSTEM_THREAD_FUZZER); ThreadFuzzer::stop(); break; case Type::START_THREAD_FUZZER: + getContext()->checkAccess(AccessType::SYSTEM_THREAD_FUZZER); ThreadFuzzer::start(); break; default: diff --git a/src/Interpreters/MySQL/tests/gtest_create_rewritten.cpp b/src/Interpreters/MySQL/tests/gtest_create_rewritten.cpp index 02af07bc00c..efa0e921527 100644 --- a/src/Interpreters/MySQL/tests/gtest_create_rewritten.cpp +++ b/src/Interpreters/MySQL/tests/gtest_create_rewritten.cpp @@ -13,7 +13,7 @@ #include #include - +#if USE_MYSQL using namespace DB; static inline ASTPtr tryRewrittenCreateQuery(const String & query, ContextPtr context) @@ -255,3 +255,4 @@ TEST(MySQLCreateRewritten, QueryWithEnum) std::string(MATERIALIZEDMYSQL_TABLE_COLUMNS) + ") ENGINE = ReplacingMergeTree(_version) PARTITION BY intDiv(key, 4294967) ORDER BY tuple(key)"); } +#endif diff --git a/src/Interpreters/OpenTelemetrySpanLog.cpp b/src/Interpreters/OpenTelemetrySpanLog.cpp index 89cce890555..7996d4a538e 100644 --- a/src/Interpreters/OpenTelemetrySpanLog.cpp +++ b/src/Interpreters/OpenTelemetrySpanLog.cpp @@ -200,7 +200,6 @@ bool OpenTelemetryTraceContext::parseTraceparentHeader(const std::string & trace ++data; UInt128 trace_id_128 = readHex(data); - trace_id = trace_id_128; data += 32; if (*data != '-') @@ -210,7 +209,7 @@ bool OpenTelemetryTraceContext::parseTraceparentHeader(const std::string & trace } ++data; - span_id = readHex(data); + UInt64 span_id_64 = readHex(data); data += 16; if (*data != '-') @@ -220,7 +219,9 @@ bool OpenTelemetryTraceContext::parseTraceparentHeader(const std::string & trace } ++data; - trace_flags = readHex(data); + this->trace_flags = readHex(data); + this->trace_id = trace_id_128; + this->span_id = span_id_64; return true; } diff --git a/src/Interpreters/ProcessList.cpp b/src/Interpreters/ProcessList.cpp index a4583685a90..37b2992d657 100644 --- a/src/Interpreters/ProcessList.cpp +++ b/src/Interpreters/ProcessList.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -76,6 +77,7 @@ ProcessList::EntryPtr ProcessList::insert(const String & query_, const IAST * as { std::unique_lock lock(mutex); + IAST::QueryKind query_kind = ast->getQueryKind(); const auto queue_max_wait_ms = settings.queue_max_wait_ms.totalMilliseconds(); if (!is_unlimited_query && max_size && processes.size() >= max_size) @@ -86,6 +88,19 @@ ProcessList::EntryPtr 
ProcessList::insert(const String & query_, const IAST * as throw Exception("Too many simultaneous queries. Maximum: " + toString(max_size), ErrorCodes::TOO_MANY_SIMULTANEOUS_QUERIES); } + if (!is_unlimited_query) + { + QueryAmount amount = getQueryKindAmount(query_kind); + if (max_insert_queries_amount && query_kind == IAST::QueryKind::Insert && amount >= max_insert_queries_amount) + throw Exception(ErrorCodes::TOO_MANY_SIMULTANEOUS_QUERIES, + "Too many simultaneous insert queries. Maximum: {}, current: {}", + max_insert_queries_amount, amount); + if (max_select_queries_amount && query_kind == IAST::QueryKind::Select && amount >= max_select_queries_amount) + throw Exception(ErrorCodes::TOO_MANY_SIMULTANEOUS_QUERIES, + "Too many simultaneous select queries. Maximum: {}, current: {}", + max_select_queries_amount, amount); + } + { /** * `max_size` check above is controlled by `max_concurrent_queries` server setting and is a "hard" limit for how many @@ -176,7 +191,9 @@ ProcessList::EntryPtr ProcessList::insert(const String & query_, const IAST * as } auto process_it = processes.emplace(processes.end(), - query_context, query_, client_info, priorities.insert(settings.priority)); + query_context, query_, client_info, priorities.insert(settings.priority), query_kind); + + increaseQueryKindAmount(query_kind); res = std::make_shared(*this, process_it); @@ -242,6 +259,7 @@ ProcessListEntry::~ProcessListEntry() String user = it->getClientInfo().current_user; String query_id = it->getClientInfo().current_query_id; + IAST::QueryKind query_kind = it->query_kind; const QueryStatus * process_list_element_ptr = &*it; @@ -273,6 +291,9 @@ ProcessListEntry::~ProcessListEntry() LOG_ERROR(&Poco::Logger::get("ProcessList"), "Logical error: cannot find query by query_id and pointer to ProcessListElement in ProcessListForUser"); std::terminate(); } + + parent.decreaseQueryKindAmount(query_kind); + parent.have_space.notify_all(); /// If there are no more queries for the user, then we will reset memory tracker and network throttler. 
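For readers skimming the hunk above: the new admission control is simply a per-kind counter that ProcessList::insert() bumps once the checks pass and ~ProcessListEntry() decrements again, all under the ProcessList mutex. The following self-contained sketch models that accounting outside of ClickHouse; the names QueryKindCounters and admit() are illustrative and do not appear in the patch.

#include <cstdint>
#include <stdexcept>
#include <unordered_map>

// Simplified stand-in for IAST::QueryKind.
enum class QueryKind { None, Select, Insert, Other };

struct QueryKindCounters
{
    using QueryAmount = uint64_t;
    std::unordered_map<QueryKind, QueryAmount> amounts;

    QueryAmount get(QueryKind kind) const
    {
        auto it = amounts.find(kind);
        return it == amounts.end() ? 0 : it->second;
    }

    void increase(QueryKind kind) { ++amounts[kind]; }   // called when a query is admitted

    void decrease(QueryKind kind)                        // called when the process list entry is destroyed
    {
        auto it = amounts.find(kind);
        if (it == amounts.end() || it->second == 0)
            throw std::logic_error("decrease before increase");
        --it->second;
    }
};

// Mirrors the max_insert_queries_amount / max_select_queries_amount checks in ProcessList::insert():
// a limit of 0 means "no limit", otherwise a query is rejected once the running amount reaches the limit.
bool admit(const QueryKindCounters & counters, QueryKind kind, uint64_t max_inserts, uint64_t max_selects)
{
    if (max_inserts && kind == QueryKind::Insert && counters.get(kind) >= max_inserts)
        return false;
    if (max_selects && kind == QueryKind::Select && counters.get(kind) >= max_selects)
        return false;
    return true;
}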
@@ -286,11 +307,12 @@ ProcessListEntry::~ProcessListEntry() QueryStatus::QueryStatus( - ContextPtr context_, const String & query_, const ClientInfo & client_info_, QueryPriorities::Handle && priority_handle_) + ContextPtr context_, const String & query_, const ClientInfo & client_info_, QueryPriorities::Handle && priority_handle_, IAST::QueryKind query_kind_) : WithContext(context_) , query(query_) , client_info(client_info_) , priority_handle(std::move(priority_handle_)) + , query_kind(query_kind_) { auto settings = getContext()->getSettings(); limits.max_execution_time = settings.max_execution_time; @@ -411,9 +433,8 @@ QueryStatusInfo QueryStatus::getInfo(bool get_thread_list, bool get_profile_even res.read_bytes = progress_in.read_bytes; res.total_rows = progress_in.total_rows_to_read; - /// TODO: Use written_rows and written_bytes when real time progress is implemented - res.written_rows = progress_out.read_rows; - res.written_bytes = progress_out.read_bytes; + res.written_rows = progress_out.written_rows; + res.written_bytes = progress_out.written_bytes; if (thread_group) { @@ -485,4 +506,33 @@ ProcessList::UserInfo ProcessList::getUserInfo(bool get_profile_events) const return per_user_infos; } +void ProcessList::increaseQueryKindAmount(const IAST::QueryKind & query_kind) +{ + auto found = query_kind_amounts.find(query_kind); + if (found == query_kind_amounts.end()) + query_kind_amounts[query_kind] = 1; + else + found->second += 1; +} + +void ProcessList::decreaseQueryKindAmount(const IAST::QueryKind & query_kind) +{ + auto found = query_kind_amounts.find(query_kind); + /// TODO: we could just rebuild the map, as we have saved all query_kind. + if (found == query_kind_amounts.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong query kind amount: decrease before increase on '{}'", query_kind); + else if (found->second == 0) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong query kind amount: decrease to negative on '{}'", query_kind, found->second); + else + found->second -= 1; +} + +ProcessList::QueryAmount ProcessList::getQueryKindAmount(const IAST::QueryKind & query_kind) const +{ + auto found = query_kind_amounts.find(query_kind); + if (found == query_kind_amounts.end()) + return 0; + return found->second; +} + } diff --git a/src/Interpreters/ProcessList.h b/src/Interpreters/ProcessList.h index 9c826bde061..545e5b07345 100644 --- a/src/Interpreters/ProcessList.h +++ b/src/Interpreters/ProcessList.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -94,7 +95,7 @@ protected: ExecutionSpeedLimits limits; OverflowMode overflow_mode; - QueryPriorities::Handle priority_handle; + QueryPriorities::Handle priority_handle = nullptr; std::atomic is_killed { false }; @@ -118,13 +119,17 @@ protected: ProcessListForUser * user_process_list = nullptr; + IAST::QueryKind query_kind; + public: QueryStatus( ContextPtr context_, const String & query_, const ClientInfo & client_info_, - QueryPriorities::Handle && priority_handle_); + QueryPriorities::Handle && priority_handle_, + IAST::QueryKind query_kind_ + ); ~QueryStatus(); @@ -256,6 +261,7 @@ class ProcessList public: using Element = QueryStatus; using Entry = ProcessListEntry; + using QueryAmount = UInt64; /// list, for iterators not to invalidate. NOTE: could replace with cyclic buffer, but not worth. 
using Container = std::list; @@ -265,6 +271,8 @@ public: /// User -> queries using UserToQueries = std::unordered_map; + using QueryKindAmounts = std::unordered_map; + protected: friend class ProcessListEntry; @@ -287,6 +295,19 @@ protected: /// Call under lock. Finds process with specified current_user and current_query_id. QueryStatus * tryGetProcessListElement(const String & current_query_id, const String & current_user); + /// limit for insert. 0 means no limit. Otherwise, when limit exceeded, an exception is thrown. + size_t max_insert_queries_amount = 0; + + /// limit for select. 0 means no limit. Otherwise, when limit exceeded, an exception is thrown. + size_t max_select_queries_amount = 0; + + /// amount of queries by query kind. + QueryKindAmounts query_kind_amounts; + + void increaseQueryKindAmount(const IAST::QueryKind & query_kind); + void decreaseQueryKindAmount(const IAST::QueryKind & query_kind); + QueryAmount getQueryKindAmount(const IAST::QueryKind & query_kind) const; + public: using EntryPtr = std::shared_ptr; @@ -312,6 +333,18 @@ public: max_size = max_size_; } + void setMaxInsertQueriesAmount(size_t max_insert_queries_amount_) + { + std::lock_guard lock(mutex); + max_insert_queries_amount = max_insert_queries_amount_; + } + + void setMaxSelectQueriesAmount(size_t max_select_queries_amount_) + { + std::lock_guard lock(mutex); + max_select_queries_amount = max_select_queries_amount_; + } + /// Try call cancel() for input and output streams of query with specified id and user CancellationCode sendCancelToQuery(const String & current_query_id, const String & current_user, bool kill = false); diff --git a/src/Interpreters/RowRefs.h b/src/Interpreters/RowRefs.h index 047146d569c..987fd197d9d 100644 --- a/src/Interpreters/RowRefs.h +++ b/src/Interpreters/RowRefs.h @@ -103,7 +103,7 @@ struct RowRefList : RowRef } } - bool ok() const { return first || (batch && position < batch->size); } + bool ok() const { return first || batch; } private: const RowRefList * root; diff --git a/src/Interpreters/SelectQueryOptions.h b/src/Interpreters/SelectQueryOptions.h index bc95a940c18..ee708b064bd 100644 --- a/src/Interpreters/SelectQueryOptions.h +++ b/src/Interpreters/SelectQueryOptions.h @@ -48,6 +48,7 @@ struct SelectQueryOptions bool is_internal = false; bool is_subquery = false; // non-subquery can also have subquery_depth > 0, e.g. insert select bool with_all_cols = false; /// asterisk include materialized and aliased columns + bool settings_limit_offset_done = false; /// These two fields are used to evaluate shardNum() and shardCount() function when /// prefer_localhost_replica == 1 and local instance is selected. 
They are needed because local @@ -58,8 +59,10 @@ struct SelectQueryOptions SelectQueryOptions( QueryProcessingStage::Enum stage = QueryProcessingStage::Complete, size_t depth = 0, - bool is_subquery_ = false) - : to_stage(stage), subquery_depth(depth), is_subquery(is_subquery_) + bool is_subquery_ = false, + bool settings_limit_offset_done_ = false) + : to_stage(stage), subquery_depth(depth), is_subquery(is_subquery_), + settings_limit_offset_done(settings_limit_offset_done_) {} SelectQueryOptions copy() const { return *this; } diff --git a/src/Interpreters/SystemLog.h b/src/Interpreters/SystemLog.h index d6342e3973e..46254d0c3a2 100644 --- a/src/Interpreters/SystemLog.h +++ b/src/Interpreters/SystemLog.h @@ -168,6 +168,8 @@ public: void shutdown() override { stopFlushThread(); + + auto table = DatabaseCatalog::instance().tryGetTable(table_id, getContext()); if (table) table->flushAndShutdown(); } @@ -186,7 +188,6 @@ private: /* Saving thread data */ const StorageID table_id; const String storage_def; - StoragePtr table; String create_query; String old_create_query; bool is_prepared = false; @@ -525,7 +526,7 @@ void SystemLog::prepareTable() { String description = table_id.getNameForLogs(); - table = DatabaseCatalog::instance().tryGetTable(table_id, getContext()); + auto table = DatabaseCatalog::instance().tryGetTable(table_id, getContext()); if (table) { diff --git a/src/Interpreters/TableOverrideUtils.cpp b/src/Interpreters/TableOverrideUtils.cpp new file mode 100644 index 00000000000..922dd6af25b --- /dev/null +++ b/src/Interpreters/TableOverrideUtils.cpp @@ -0,0 +1,174 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INVALID_TABLE_OVERRIDE; +} + +namespace +{ + +class MaybeNullableColumnsMatcher +{ +public: + using Visitor = ConstInDepthNodeVisitor; + using Data = RequiredSourceColumnsData; + + static bool needChildVisit(const ASTPtr & node, const ASTPtr & child) + { + if (const auto * f = node->as(); f && f->name == "assumeNotNull") + return false; + return RequiredSourceColumnsMatcher::needChildVisit(node, child); + } + + static void visit(const ASTPtr & ast, Data & data) + { + RequiredSourceColumnsMatcher::visit(ast, data); + } +}; + +using MaybeNullableColumnsVisitor = MaybeNullableColumnsMatcher::Visitor; + +} + +static void checkRequiredColumns(const IAST * ast, const NameToTypeMap & existing_types, NamesAndTypes & used_columns, const String & what, bool allow_nulls = false) +{ + if (!ast) + return; + RequiredSourceColumnsData columns_data; + RequiredSourceColumnsVisitor(columns_data).visit(ast->clone()); + auto required_columns = columns_data.requiredColumns(); + for (const auto & column : required_columns) + { + auto type = existing_types.find(column); + if (type == existing_types.end()) + throw Exception(ErrorCodes::INVALID_TABLE_OVERRIDE, "{} override refers to unknown column {}", what, backQuote(column)); + } + if (!allow_nulls) + { + RequiredSourceColumnsData nullable_data; + MaybeNullableColumnsVisitor(nullable_data).visit(ast->clone()); + for (const auto & column : nullable_data.requiredColumns()) + { + if (existing_types.find(column)->second->isNullable()) + throw Exception( + ErrorCodes::INVALID_TABLE_OVERRIDE, + "{} override refers to nullable column {} (use assumeNotNull() if the column does not in fact contain NULL values)", + what, + backQuote(column)); + } + } + for (const auto & col : required_columns) + { 
+ used_columns.push_back({col, existing_types.find(col)->second}); + } +} + +void TableOverrideAnalyzer::analyze(const StorageInMemoryMetadata & metadata, Result & result) const +{ + for (const auto & column : metadata.columns) + result.existing_types[column.name] = column.type; + checkRequiredColumns(override->storage->order_by, result.existing_types, result.order_by_columns, "ORDER BY"); + checkRequiredColumns(override->storage->primary_key, result.existing_types, result.primary_key_columns, "PRIMARY KEY"); + checkRequiredColumns(override->storage->partition_by, result.existing_types, result.partition_by_columns, "PARTITION BY"); + checkRequiredColumns(override->storage->sample_by, result.existing_types, result.sample_by_columns, "SAMPLE BY"); + checkRequiredColumns(override->storage->ttl_table, result.existing_types, result.ttl_columns, "TTL"); + if (override->columns && override->columns->columns) + { + for (const auto & column_ast : override->columns->columns->children) + { + auto * override_column = column_ast->as(); + auto override_type = DataTypeFactory::instance().get(override_column->type); + auto found = metadata.columns.tryGetColumnOrSubcolumn(ColumnsDescription::GetFlags::All, override_column->name); + std::optional override_default_kind; + if (!override_column->default_specifier.empty()) + override_default_kind = columnDefaultKindFromString(override_column->default_specifier); + if (found) + { + std::optional existing_default_kind; + if (auto col_default = metadata.columns.getDefault(found->name)) + existing_default_kind = col_default->kind; + if (existing_default_kind != override_default_kind) + throw Exception(ErrorCodes::INVALID_TABLE_OVERRIDE, "column {}: modifying default specifier is not allowed", backQuote(override_column->name)); + result.modified_columns.push_back({found->name, override_type}); + /// TODO: validate that the original type can be converted to the overridden type + } + else + { + if (override_default_kind && *override_default_kind == ColumnDefaultKind::Alias) + result.added_columns.push_back({override_column->name, override_type}); + else + throw Exception(ErrorCodes::INVALID_TABLE_OVERRIDE, "column {}: can only add ALIAS columns", backQuote(override_column->name)); + } + /// TODO: validate default and materialized expressions (use checkRequiredColumns, allowing nulls) + } + } +} + +void TableOverrideAnalyzer::Result::appendTo(WriteBuffer & ostr) +{ + const auto & format_names = [&](const NamesAndTypes & names) -> String + { + WriteBufferFromOwnString buf; + bool first = true; + for (const auto & name : names) + { + if (!first) + buf << ", "; + first = false; + buf << backQuote(name.name) << " "; + auto old_type = existing_types.find(name.name); + if (old_type != existing_types.end() && old_type->second != name.type) + buf << old_type->second->getName() << " -> "; + buf << name.type->getName(); + } + return buf.str(); + }; + if (!modified_columns.empty()) + { + ostr << "Modified columns: " << format_names(modified_columns) << "\n"; + } + if (!added_columns.empty()) + { + ostr << "Added columns: " << format_names(added_columns) << "\n"; + } + if (!order_by_columns.empty()) + { + ostr << "ORDER BY uses columns: " << format_names(order_by_columns) << "\n"; + } + if (!primary_key_columns.empty()) + { + ostr << "PRIMARY KEY uses columns: " << format_names(primary_key_columns) << "\n"; + } + if (!partition_by_columns.empty()) + { + ostr << "PARTITION BY uses columns: " << format_names(partition_by_columns) << "\n"; + } + if (!sample_by_columns.empty()) + { + 
ostr << "SAMPLE BY uses columns: " << format_names(sample_by_columns) << "\n"; + } + if (!ttl_columns.empty()) + { + ostr << "TTL uses columns: " << format_names(ttl_columns) << "\n"; + } +} + +} diff --git a/src/Interpreters/TableOverrideUtils.h b/src/Interpreters/TableOverrideUtils.h new file mode 100644 index 00000000000..810ffecd573 --- /dev/null +++ b/src/Interpreters/TableOverrideUtils.h @@ -0,0 +1,38 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +struct StorageInMemoryMetadata; + +using NameToTypeMap = std::map; + +struct TableOverrideAnalyzer +{ + struct Result + { + NameToTypeMap existing_types; + NamesAndTypes order_by_columns; + NamesAndTypes primary_key_columns; + NamesAndTypes partition_by_columns; + NamesAndTypes sample_by_columns; + NamesAndTypes ttl_columns; + NamesAndTypes added_columns; + NamesAndTypes modified_columns; + + void appendTo(WriteBuffer &); + }; + + ASTTableOverride * override; + + explicit TableOverrideAnalyzer(ASTPtr ast) : override(assert_cast(ast.get())) { } + + void analyze(const StorageInMemoryMetadata & metadata, Result & result) const; +}; + +} diff --git a/src/Interpreters/ThreadStatusExt.cpp b/src/Interpreters/ThreadStatusExt.cpp index fc6aa15a1e8..b3720b89eaa 100644 --- a/src/Interpreters/ThreadStatusExt.cpp +++ b/src/Interpreters/ThreadStatusExt.cpp @@ -24,12 +24,6 @@ # include #endif -namespace ProfileEvents -{ - extern const Event InsertedRows; - extern const Event InsertedBytes; -} - /// Implement some methods of ThreadStatus and CurrentThread here to avoid extra linking dependencies in clickhouse_common_io /// TODO It doesn't make sense. @@ -447,9 +441,8 @@ void ThreadStatus::logToQueryThreadLog(QueryThreadLog & thread_log, const String elem.read_rows = progress_in.read_rows.load(std::memory_order_relaxed); elem.read_bytes = progress_in.read_bytes.load(std::memory_order_relaxed); - /// TODO: Use written_rows and written_bytes when run time progress is implemented - elem.written_rows = progress_out.read_rows.load(std::memory_order_relaxed); - elem.written_bytes = progress_out.read_bytes.load(std::memory_order_relaxed); + elem.written_rows = progress_out.written_rows.load(std::memory_order_relaxed); + elem.written_bytes = progress_out.written_bytes.load(std::memory_order_relaxed); elem.memory_usage = memory_tracker.get(); elem.peak_memory_usage = memory_tracker.getPeak(); @@ -520,8 +513,8 @@ void ThreadStatus::logToQueryViewsLog(const ViewRuntimeData & vinfo) auto events = std::make_shared(performance_counters.getPartiallyAtomicSnapshot()); element.read_rows = progress_in.read_rows.load(std::memory_order_relaxed); element.read_bytes = progress_in.read_bytes.load(std::memory_order_relaxed); - element.written_rows = (*events)[ProfileEvents::InsertedRows]; - element.written_bytes = (*events)[ProfileEvents::InsertedBytes]; + element.written_rows = progress_out.written_rows.load(std::memory_order_relaxed); + element.written_bytes = progress_out.written_bytes.load(std::memory_order_relaxed); element.peak_memory_usage = memory_tracker.getPeak() > 0 ? 
memory_tracker.getPeak() : 0; if (query_context_ptr->getSettingsRef().log_profile_events != 0) { diff --git a/src/Interpreters/UserDefinedExecutableFunction.cpp b/src/Interpreters/UserDefinedExecutableFunction.cpp index d57978d0fd6..e5a852b0e75 100644 --- a/src/Interpreters/UserDefinedExecutableFunction.cpp +++ b/src/Interpreters/UserDefinedExecutableFunction.cpp @@ -13,14 +13,12 @@ namespace DB UserDefinedExecutableFunction::UserDefinedExecutableFunction( const UserDefinedExecutableFunctionConfiguration & configuration_, - const ExternalLoadableLifetime & lifetime_, - std::shared_ptr process_pool_) + std::shared_ptr coordinator_, + const ExternalLoadableLifetime & lifetime_) : configuration(configuration_) + , coordinator(std::move(coordinator_)) , lifetime(lifetime_) - , process_pool(process_pool_) { - if (!process_pool && configuration.type == UserDefinedExecutableFunctionType::executable_pool) - process_pool = std::make_shared(configuration.pool_size == 0 ? std::numeric_limits::max() : configuration.pool_size); } }; diff --git a/src/Interpreters/UserDefinedExecutableFunction.h b/src/Interpreters/UserDefinedExecutableFunction.h index 1cb1de47578..a4fad8ceb7b 100644 --- a/src/Interpreters/UserDefinedExecutableFunction.h +++ b/src/Interpreters/UserDefinedExecutableFunction.h @@ -10,26 +10,13 @@ namespace DB { -enum class UserDefinedExecutableFunctionType -{ - executable, - executable_pool -}; - struct UserDefinedExecutableFunctionConfiguration { - UserDefinedExecutableFunctionType type = UserDefinedExecutableFunctionType::executable; std::string name; - std::string script_path; - std::string format; + std::string command; + std::vector command_arguments; std::vector argument_types; DataTypePtr result_type; - /// Pool settings - size_t pool_size = 0; - size_t command_termination_timeout = 0; - size_t max_command_execution_time = 0; - /// Send number_of_rows\n before sending chunk to process - bool send_chunk_header = false; }; class UserDefinedExecutableFunction final : public IExternalLoadable @@ -38,8 +25,8 @@ public: UserDefinedExecutableFunction( const UserDefinedExecutableFunctionConfiguration & configuration_, - const ExternalLoadableLifetime & lifetime_, - std::shared_ptr process_pool_ = nullptr); + std::shared_ptr coordinator_, + const ExternalLoadableLifetime & lifetime_); const ExternalLoadableLifetime & getLifetime() const override { @@ -63,7 +50,7 @@ public: std::shared_ptr clone() const override { - return std::make_shared(configuration, lifetime, process_pool); + return std::make_shared(configuration, coordinator, lifetime); } const UserDefinedExecutableFunctionConfiguration & getConfiguration() const @@ -71,9 +58,9 @@ public: return configuration; } - std::shared_ptr getProcessPool() const + std::shared_ptr getCoordinator() const { - return process_pool; + return coordinator; } std::shared_ptr shared_from_this() @@ -87,13 +74,9 @@ public: } private: - UserDefinedExecutableFunction(const UserDefinedExecutableFunctionConfiguration & configuration_, - std::shared_ptr process_pool_, - const ExternalLoadableLifetime & lifetime_); - UserDefinedExecutableFunctionConfiguration configuration; + std::shared_ptr coordinator; ExternalLoadableLifetime lifetime; - std::shared_ptr process_pool; }; } diff --git a/src/Interpreters/UserDefinedExecutableFunctionFactory.cpp b/src/Interpreters/UserDefinedExecutableFunctionFactory.cpp index 4cb3e034b01..10cb806028e 100644 --- a/src/Interpreters/UserDefinedExecutableFunctionFactory.cpp +++ 
b/src/Interpreters/UserDefinedExecutableFunctionFactory.cpp @@ -1,8 +1,13 @@ #include "UserDefinedExecutableFunctionFactory.h" +#include + +#include + #include #include +#include #include #include @@ -19,7 +24,6 @@ namespace DB namespace ErrorCodes { extern const int UNSUPPORTED_METHOD; - extern const int TIMEOUT_EXCEEDED; } class UserDefinedFunction final : public IFunction @@ -52,10 +56,36 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override { + auto coordinator = executable_function->getCoordinator(); + const auto & coordinator_configuration = coordinator->getConfiguration(); const auto & configuration = executable_function->getConfiguration(); + + String command = configuration.command; + + if (coordinator_configuration.execute_direct) + { + auto user_scripts_path = context->getUserScriptsPath(); + auto script_path = user_scripts_path + '/' + command; + + if (!fileOrSymlinkPathStartsWith(script_path, user_scripts_path)) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Executable file {} must be inside user scripts folder {}", + command, + user_scripts_path); + + if (!std::filesystem::exists(std::filesystem::path(script_path))) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, + "Executable file {} does not exist inside user scripts folder {}", + command, + user_scripts_path); + + command = std::move(script_path); + } + + size_t argument_size = arguments.size(); auto arguments_copy = arguments; - for (size_t i = 0; i < arguments.size(); ++i) + for (size_t i = 0; i < argument_size; ++i) { auto & column_with_type = arguments_copy[i]; column_with_type.column = column_with_type.column->convertToFullColumnIfConst(); @@ -68,56 +98,36 @@ public: column_with_type.column = castColumnAccurate(column_to_cast, argument_type); column_with_type.type = argument_type; - column_with_type = column_to_cast; + column_with_type = std::move(column_to_cast); } - std::unique_ptr process = getProcess(); - ColumnWithTypeAndName result(result_type, "result"); Block result_block({result}); Block arguments_block(arguments_copy); - auto * process_in = &process->in; - - auto process_pool = executable_function->getProcessPool(); - bool is_executable_pool_function = (process_pool != nullptr); + auto source = std::make_shared(std::move(arguments_block)); + auto shell_input_pipe = Pipe(std::move(source)); ShellCommandSourceConfiguration shell_command_source_configuration; - if (is_executable_pool_function) + if (coordinator_configuration.is_executable_pool) { shell_command_source_configuration.read_fixed_number_of_rows = true; shell_command_source_configuration.number_of_rows_to_read = input_rows_count; } - ShellCommandSource::SendDataTask task = {[process_in, arguments_block, &configuration, is_executable_pool_function, this]() - { - auto & out = *process_in; + Pipes shell_input_pipes; + shell_input_pipes.emplace_back(std::move(shell_input_pipe)); - if (configuration.send_chunk_header) - { - writeText(arguments_block.rows(), out); - writeChar('\n', out); - } - - auto output_format = context->getOutputFormat(configuration.format, out, arguments_block.cloneEmpty()); - formatBlock(output_format, arguments_block); - if (!is_executable_pool_function) - out.close(); - }}; - std::vector tasks = {std::move(task)}; - - Pipe pipe(std::make_unique( + Pipe pipe = coordinator->createPipe( + command, + configuration.command_arguments, + std::move(shell_input_pipes), + result_block, context, - configuration.format, - 
result_block.cloneEmpty(), - std::move(process), - std::move(tasks), - shell_command_source_configuration, - process_pool)); + shell_command_source_configuration); QueryPipeline pipeline(std::move(pipe)); - PullingPipelineExecutor executor(pipeline); auto result_column = result_type->createColumn(); @@ -133,8 +143,8 @@ size_t result_column_size = result_column->size(); if (result_column_size != input_rows_count) throw Exception(ErrorCodes::UNSUPPORTED_METHOD, - "Function {} wrong result rows count expected {} actual {}", - getName(), + "Function {}: wrong result, expected {} row(s), actual {}", + quoteString(getName()), input_rows_count, result_column_size); @@ -143,36 +153,6 @@ private: - std::unique_ptr getProcess() const - { - auto process_pool = executable_function->getProcessPool(); - auto executable_function_configuration = executable_function->getConfiguration(); - - std::unique_ptr process; - bool is_executable_pool_function = (process_pool != nullptr); - if (is_executable_pool_function) - { - bool result = process_pool->tryBorrowObject(process, [&]() - { - ShellCommand::Config process_config(executable_function_configuration.script_path); - process_config.terminate_in_destructor_strategy = ShellCommand::DestructorStrategy{ true /*terminate_in_destructor*/, executable_function_configuration.command_termination_timeout }; - auto shell_command = ShellCommand::execute(process_config); - return shell_command; - }, executable_function_configuration.max_command_execution_time * 1000); - - if (!result) - throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, - "Could not get process from pool, max command execution timeout exceeded {} seconds", - executable_function_configuration.max_command_execution_time); - } - else - { - process = ShellCommand::execute(executable_function_configuration.script_path); - } - - return process; - } - ExternalUserDefinedExecutableFunctionsLoader::UserDefinedExecutableFunctionPtr executable_function; ContextPtr context; }; diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index bd3c35c12f6..9770d1a988f 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -57,6 +57,8 @@ #include #include +#include + #include @@ -271,7 +273,7 @@ static void onExceptionBeforeStart(const String & query_for_logging, ContextPtr // Try log query_kind if ast is valid if (ast) { - elem.query_kind = ast->getQueryKindString(); + elem.query_kind = magic_enum::enum_name(ast->getQueryKind()); if (settings.log_formatted_queries) elem.formatted_query = queryToString(ast); } @@ -841,8 +843,8 @@ static std::tuple executeQueryImpl( else /// will be used only for ordinary INSERT queries { auto progress_out = process_list_elem->getProgressOut(); - elem.result_rows = progress_out.read_rows; - elem.result_bytes = progress_out.read_bytes; + elem.result_rows = progress_out.written_rows; + elem.result_bytes = progress_out.written_bytes; } if (elem.read_rows != 0) diff --git a/src/Parsers/ASTAlterQuery.cpp b/src/Parsers/ASTAlterQuery.cpp index 2ae494854ec..f53c39b192f 100644 --- a/src/Parsers/ASTAlterQuery.cpp +++ b/src/Parsers/ASTAlterQuery.cpp @@ -11,6 +11,11 @@ namespace ErrorCodes extern const int UNEXPECTED_AST_STRUCTURE; } +String ASTAlterCommand::getID(char delim) const +{ + return String("AlterCommand") + delim + typeToString(type); +} + ASTPtr ASTAlterCommand::clone() const { auto res = std::make_shared(*this); @@ -75,6 +80,53 @@ ASTPtr ASTAlterCommand::clone() const return res; } +const char *
ASTAlterCommand::typeToString(ASTAlterCommand::Type type) +{ + switch (type) + { + case ADD_COLUMN: return "ADD_COLUMN"; + case DROP_COLUMN: return "DROP_COLUMN"; + case MODIFY_COLUMN: return "MODIFY_COLUMN"; + case COMMENT_COLUMN: return "COMMENT_COLUMN"; + case RENAME_COLUMN: return "RENAME_COLUMN"; + case MATERIALIZE_COLUMN: return "MATERIALIZE_COLUMN"; + case MODIFY_ORDER_BY: return "MODIFY_ORDER_BY"; + case MODIFY_SAMPLE_BY: return "MODIFY_SAMPLE_BY"; + case MODIFY_TTL: return "MODIFY_TTL"; + case MATERIALIZE_TTL: return "MATERIALIZE_TTL"; + case MODIFY_SETTING: return "MODIFY_SETTING"; + case RESET_SETTING: return "RESET_SETTING"; + case MODIFY_QUERY: return "MODIFY_QUERY"; + case REMOVE_TTL: return "REMOVE_TTL"; + case REMOVE_SAMPLE_BY: return "REMOVE_SAMPLE_BY"; + case ADD_INDEX: return "ADD_INDEX"; + case DROP_INDEX: return "DROP_INDEX"; + case MATERIALIZE_INDEX: return "MATERIALIZE_INDEX"; + case ADD_CONSTRAINT: return "ADD_CONSTRAINT"; + case DROP_CONSTRAINT: return "DROP_CONSTRAINT"; + case ADD_PROJECTION: return "ADD_PROJECTION"; + case DROP_PROJECTION: return "DROP_PROJECTION"; + case MATERIALIZE_PROJECTION: return "MATERIALIZE_PROJECTION"; + case DROP_PARTITION: return "DROP_PARTITION"; + case DROP_DETACHED_PARTITION: return "DROP_DETACHED_PARTITION"; + case ATTACH_PARTITION: return "ATTACH_PARTITION"; + case MOVE_PARTITION: return "MOVE_PARTITION"; + case REPLACE_PARTITION: return "REPLACE_PARTITION"; + case FETCH_PARTITION: return "FETCH_PARTITION"; + case FREEZE_PARTITION: return "FREEZE_PARTITION"; + case FREEZE_ALL: return "FREEZE_ALL"; + case UNFREEZE_PARTITION: return "UNFREEZE_PARTITION"; + case UNFREEZE_ALL: return "UNFREEZE_ALL"; + case DELETE: return "DELETE"; + case UPDATE: return "UPDATE"; + case NO_TYPE: return "NO_TYPE"; + case LIVE_VIEW_REFRESH: return "LIVE_VIEW_REFRESH"; + case MODIFY_DATABASE_SETTING: return "MODIFY_DATABASE_SETTING"; + case MODIFY_COMMENT: return "MODIFY_COMMENT"; + } + __builtin_unreachable(); +} + void ASTAlterCommand::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const { if (type == ASTAlterCommand::ADD_COLUMN) diff --git a/src/Parsers/ASTAlterQuery.h b/src/Parsers/ASTAlterQuery.h index b0980c20f5e..976ccd1e2bf 100644 --- a/src/Parsers/ASTAlterQuery.h +++ b/src/Parsers/ASTAlterQuery.h @@ -204,10 +204,12 @@ public: /// Which property user want to remove String remove_property; - String getID(char delim) const override { return "AlterCommand" + (delim + std::to_string(static_cast(type))); } + String getID(char delim) const override; ASTPtr clone() const override; + static const char * typeToString(Type type); + protected: void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; }; @@ -246,7 +248,7 @@ public: return removeOnCluster(clone(), new_database); } - const char * getQueryKindString() const override { return "Alter"; } + virtual QueryKind getQueryKind() const override { return QueryKind::Alter; } protected: void formatQueryImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; diff --git a/src/Parsers/ASTCreateQuery.cpp b/src/Parsers/ASTCreateQuery.cpp index 3e77bee19a9..e61a0f55142 100644 --- a/src/Parsers/ASTCreateQuery.cpp +++ b/src/Parsers/ASTCreateQuery.cpp @@ -359,7 +359,7 @@ void ASTCreateQuery::formatQueryImpl(const FormatSettings & settings, FormatStat if (as_table_function) { - if (columns_list) + if (columns_list && !columns_list->empty()) { 
frame.expression_list_always_start_on_new_line = true; settings.ostr << (settings.one_line ? " (" : "\n("); @@ -375,7 +375,7 @@ void ASTCreateQuery::formatQueryImpl(const FormatSettings & settings, FormatStat frame.expression_list_always_start_on_new_line = true; - if (columns_list && !as_table_function) + if (columns_list && !columns_list->empty() && !as_table_function) { settings.ostr << (settings.one_line ? " (" : "\n("); FormatStateStacked frame_nested = frame; diff --git a/src/Parsers/ASTCreateQuery.h b/src/Parsers/ASTCreateQuery.h index 93fced7dba5..fcc4107bb5f 100644 --- a/src/Parsers/ASTCreateQuery.h +++ b/src/Parsers/ASTCreateQuery.h @@ -50,6 +50,12 @@ public: ASTPtr clone() const override; void formatImpl(const FormatSettings & s, FormatState & state, FormatStateStacked frame) const override; + + bool empty() + { + return (!columns || columns->children.empty()) && (!indices || indices->children.empty()) && (!constraints || constraints->children.empty()) + && (!projections || projections->children.empty()); + } }; @@ -113,7 +119,7 @@ public: bool isView() const { return is_ordinary_view || is_materialized_view || is_live_view || is_window_view; } - const char * getQueryKindString() const override { return "Create"; } + virtual QueryKind getQueryKind() const override { return QueryKind::Create; } protected: void formatQueryImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; diff --git a/src/Parsers/ASTDropQuery.h b/src/Parsers/ASTDropQuery.h index 6e5fd5854d8..2e67eaf3692 100644 --- a/src/Parsers/ASTDropQuery.h +++ b/src/Parsers/ASTDropQuery.h @@ -45,7 +45,7 @@ public: return removeOnCluster(clone(), new_database); } - const char * getQueryKindString() const override { return "Drop"; } + virtual QueryKind getQueryKind() const override { return QueryKind::Drop; } protected: void formatQueryImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override; diff --git a/src/Parsers/ASTExplainQuery.h b/src/Parsers/ASTExplainQuery.h index 5c50a8cd82e..abed9803a7b 100644 --- a/src/Parsers/ASTExplainQuery.h +++ b/src/Parsers/ASTExplainQuery.h @@ -18,6 +18,7 @@ public: QueryPlan, /// 'EXPLAIN SELECT ...' QueryPipeline, /// 'EXPLAIN PIPELINE ...' QueryEstimates, /// 'EXPLAIN ESTIMATE ...' + TableOverride, /// 'EXPLAIN TABLE OVERRIDE ...' 
}; explicit ASTExplainQuery(ExplainKind kind_) : kind(kind_) {} @@ -45,8 +46,22 @@ public: ast_settings = std::move(settings_); } + void setTableFunction(ASTPtr table_function_) + { + children.emplace_back(table_function_); + table_function = std::move(table_function_); + } + + void setTableOverride(ASTPtr table_override_) + { + children.emplace_back(table_override_); + table_override = std::move(table_override_); + } + const ASTPtr & getExplainedQuery() const { return query; } const ASTPtr & getSettings() const { return ast_settings; } + const ASTPtr & getTableFunction() const { return table_function; } + const ASTPtr & getTableOverride() const { return table_override; } protected: void formatQueryImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override @@ -59,8 +74,21 @@ protected: ast_settings->formatImpl(settings, state, frame); } - settings.ostr << settings.nl_or_ws; - query->formatImpl(settings, state, frame); + if (query) + { + settings.ostr << settings.nl_or_ws; + query->formatImpl(settings, state, frame); + } + if (table_function) + { + settings.ostr << settings.nl_or_ws; + table_function->formatImpl(settings, state, frame); + } + if (table_override) + { + settings.ostr << settings.nl_or_ws; + table_override->formatImpl(settings, state, frame); + } } private: @@ -69,6 +97,10 @@ private: ASTPtr query; ASTPtr ast_settings; + /// Used by EXPLAIN TABLE OVERRIDE + ASTPtr table_function; + ASTPtr table_override; + static String toString(ExplainKind kind) { switch (kind) @@ -78,6 +110,7 @@ private: case QueryPlan: return "EXPLAIN"; case QueryPipeline: return "EXPLAIN PIPELINE"; case QueryEstimates: return "EXPLAIN ESTIMATE"; + case TableOverride: return "EXPLAIN TABLE OVERRIDE"; } __builtin_unreachable(); diff --git a/src/Parsers/ASTInsertQuery.cpp b/src/Parsers/ASTInsertQuery.cpp index c733398a32b..7e1d48d7f55 100644 --- a/src/Parsers/ASTInsertQuery.cpp +++ b/src/Parsers/ASTInsertQuery.cpp @@ -79,6 +79,13 @@ void ASTInsertQuery::formatImpl(const FormatSettings & settings, FormatState & s settings.ostr << ")"; } + if (infile) + { + settings.ostr << (settings.hilite ? hilite_keyword : "") << " FROM INFILE " << (settings.hilite ? hilite_none : "") << infile->as().value.safeGet(); + if (compression) + settings.ostr << (settings.hilite ? hilite_keyword : "") << " COMPRESSION " << (settings.hilite ? hilite_none : "") << compression->as().value.safeGet(); + } + if (select) { settings.ostr << " "; @@ -91,12 +98,6 @@ void ASTInsertQuery::formatImpl(const FormatSettings & settings, FormatState & s } else { - if (infile) - { - settings.ostr << (settings.hilite ? hilite_keyword : "") << " FROM INFILE " << (settings.hilite ? hilite_none : "") << infile->as().value.safeGet(); - if (compression) - settings.ostr << (settings.hilite ? hilite_keyword : "") << " COMPRESSION " << (settings.hilite ? hilite_none : "") << compression->as().value.safeGet(); - } if (!format.empty()) { settings.ostr << (settings.hilite ? hilite_keyword : "") << " FORMAT " << (settings.hilite ? 
hilite_none : "") << format; diff --git a/src/Parsers/ASTInsertQuery.h b/src/Parsers/ASTInsertQuery.h index d539ad5fdb3..db9262ea794 100644 --- a/src/Parsers/ASTInsertQuery.h +++ b/src/Parsers/ASTInsertQuery.h @@ -66,7 +66,7 @@ public: return res; } - const char * getQueryKindString() const override { return "Insert"; } + virtual QueryKind getQueryKind() const override { return QueryKind::Insert; } protected: void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; diff --git a/src/Parsers/ASTRenameQuery.h b/src/Parsers/ASTRenameQuery.h index dafc166f672..01ab0df9774 100644 --- a/src/Parsers/ASTRenameQuery.h +++ b/src/Parsers/ASTRenameQuery.h @@ -65,7 +65,7 @@ public: return query_ptr; } - const char * getQueryKindString() const override { return "Rename"; } + virtual QueryKind getQueryKind() const override { return QueryKind::Rename; } protected: void formatQueryImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override diff --git a/src/Parsers/ASTSelectIntersectExceptQuery.h b/src/Parsers/ASTSelectIntersectExceptQuery.h index 86475fcba5c..fa574b46c8d 100644 --- a/src/Parsers/ASTSelectIntersectExceptQuery.h +++ b/src/Parsers/ASTSelectIntersectExceptQuery.h @@ -22,7 +22,7 @@ public: void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; - const char * getQueryKindString() const override { return "SelectIntersectExcept"; } + virtual QueryKind getQueryKind() const override { return QueryKind::SelectIntersectExcept; } ASTs getListOfSelects() const; diff --git a/src/Parsers/ASTSelectQuery.h b/src/Parsers/ASTSelectQuery.h index 2b004e9e040..1c631783fdb 100644 --- a/src/Parsers/ASTSelectQuery.h +++ b/src/Parsers/ASTSelectQuery.h @@ -135,7 +135,7 @@ public: void setFinal(); - const char * getQueryKindString() const override { return "Select"; } + virtual QueryKind getQueryKind() const override { return QueryKind::Select; } protected: void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; diff --git a/src/Parsers/ASTSelectWithUnionQuery.h b/src/Parsers/ASTSelectWithUnionQuery.h index 3fc8f9171c0..bd45dd7fc05 100644 --- a/src/Parsers/ASTSelectWithUnionQuery.h +++ b/src/Parsers/ASTSelectWithUnionQuery.h @@ -17,7 +17,7 @@ public: void formatQueryImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; - const char * getQueryKindString() const override { return "Select"; } + virtual QueryKind getQueryKind() const override { return QueryKind::Select; } SelectUnionMode union_mode; diff --git a/src/Parsers/ASTSystemQuery.h b/src/Parsers/ASTSystemQuery.h index 16f8a3c118a..22488e35e12 100644 --- a/src/Parsers/ASTSystemQuery.h +++ b/src/Parsers/ASTSystemQuery.h @@ -107,7 +107,7 @@ public: return removeOnCluster(clone(), new_database); } - const char * getQueryKindString() const override { return "System"; } + virtual QueryKind getQueryKind() const override { return QueryKind::System; } protected: diff --git a/src/Parsers/ASTTableOverrides.cpp b/src/Parsers/ASTTableOverrides.cpp index d2625bf19b4..8fc21db218f 100644 --- a/src/Parsers/ASTTableOverrides.cpp +++ b/src/Parsers/ASTTableOverrides.cpp @@ -31,13 +31,19 @@ void ASTTableOverride::formatImpl(const FormatSettings & settings_, FormatState String hl_keyword = settings.hilite ? hilite_keyword : ""; String hl_none = settings.hilite ? 
hilite_none : ""; - settings.ostr << hl_keyword << "TABLE OVERRIDE " << hl_none; - ASTIdentifier(table_name).formatImpl(settings, state, frame); + if (is_standalone) + { + settings.ostr << hl_keyword << "TABLE OVERRIDE " << hl_none; + ASTIdentifier(table_name).formatImpl(settings, state, frame); + } if (!columns && (!storage || storage->children.empty())) return; auto override_frame = frame; - ++override_frame.indent; - settings.ostr << nl_or_ws << '(' << nl_or_nothing; + if (is_standalone) + { + ++override_frame.indent; + settings.ostr << nl_or_ws << '(' << nl_or_nothing; + } String indent_str = settings.one_line ? "" : String(4 * override_frame.indent, ' '); size_t override_elems = 0; if (columns) @@ -68,7 +74,8 @@ void ASTTableOverride::formatImpl(const FormatSettings & settings_, FormatState format_storage_elem(storage->ttl_table, "TTL"); } - settings.ostr << nl_or_nothing << ')'; + if (is_standalone) + settings.ostr << nl_or_nothing << ')'; } ASTPtr ASTTableOverrideList::clone() const diff --git a/src/Parsers/ASTTableOverrides.h b/src/Parsers/ASTTableOverrides.h index 62e96b16b01..c0603f7a8e0 100644 --- a/src/Parsers/ASTTableOverrides.h +++ b/src/Parsers/ASTTableOverrides.h @@ -15,7 +15,7 @@ class ASTStorage; /// Storage and column overrides for a single table, for example: /// -/// TABLE OVERRIDE `foo` PARTITION BY toYYYYMM(`createtime`) +/// TABLE OVERRIDE `foo` (PARTITION BY toYYYYMM(`createtime`)) /// class ASTTableOverride : public IAST { @@ -23,6 +23,7 @@ public: String table_name; ASTColumns * columns = nullptr; ASTStorage * storage = nullptr; + bool is_standalone = true; String getID(char) const override { return "TableOverride " + table_name; } ASTPtr clone() const override; void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; diff --git a/src/Parsers/Access/ASTGrantQuery.h b/src/Parsers/Access/ASTGrantQuery.h index f8ea9b478fe..934d619fc36 100644 --- a/src/Parsers/Access/ASTGrantQuery.h +++ b/src/Parsers/Access/ASTGrantQuery.h @@ -34,6 +34,6 @@ public: void replaceEmptyDatabase(const String & current_database); void replaceCurrentUserTag(const String & current_user_name) const; ASTPtr getRewrittenASTWithoutOnCluster(const std::string &) const override { return removeOnCluster(clone()); } - const char * getQueryKindString() const override { return is_revoke ? "Revoke" : "Grant"; } + virtual QueryKind getQueryKind() const override { return is_revoke ? 
QueryKind::Revoke : QueryKind::Grant; } }; } diff --git a/src/Parsers/CommonParsers.cpp b/src/Parsers/CommonParsers.cpp index bffba4aa773..275679d61f0 100644 --- a/src/Parsers/CommonParsers.cpp +++ b/src/Parsers/CommonParsers.cpp @@ -3,6 +3,7 @@ namespace DB { + bool ParserKeyword::parseImpl(Pos & pos, [[maybe_unused]] ASTPtr & node, Expected & expected) { if (pos->type != TokenType::BareWord) @@ -36,4 +37,5 @@ bool ParserKeyword::parseImpl(Pos & pos, [[maybe_unused]] ASTPtr & node, Expecte return true; } + } diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index 584c2a32afd..526b3aeb2bd 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -34,10 +35,10 @@ #include #include -#include "ASTColumnsMatcher.h" #include + namespace DB { @@ -273,15 +274,398 @@ bool ParserCompoundIdentifier::parseImpl(Pos & pos, ASTPtr & node, Expected & ex return true; } + +ASTPtr createFunctionCast(const ASTPtr & expr_ast, const ASTPtr & type_ast) +{ + /// Convert to canonical representation in functional form: CAST(expr, 'type') + auto type_literal = std::make_shared(queryToString(type_ast)); + + auto expr_list_args = std::make_shared(); + expr_list_args->children.push_back(expr_ast); + expr_list_args->children.push_back(std::move(type_literal)); + + auto func_node = std::make_shared(); + func_node->name = "CAST"; + func_node->arguments = std::move(expr_list_args); + func_node->children.push_back(func_node->arguments); + + return func_node; +} + + +namespace +{ + bool parseCastAs(IParser::Pos & pos, ASTPtr & node, Expected & expected) + { + /// expr AS type + + ASTPtr expr_node; + ASTPtr type_node; + + if (ParserExpression().parse(pos, expr_node, expected)) + { + if (ParserKeyword("AS").ignore(pos, expected)) + { + if (ParserDataType().parse(pos, type_node, expected)) + { + node = createFunctionCast(expr_node, type_node); + return true; + } + } + else if (ParserToken(TokenType::Comma).ignore(pos, expected)) + { + if (ParserExpression().parse(pos, type_node, expected)) + { + node = makeASTFunction("CAST", expr_node, type_node); + return true; + } + } + } + + return false; + } + + bool parseSubstring(IParser::Pos & pos, ASTPtr & node, Expected & expected) + { + /// Either SUBSTRING(expr FROM start) or SUBSTRING(expr FROM start FOR length) or SUBSTRING(expr, start, length) + /// The latter will be parsed normally as a function later. 
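createFunctionCast() and parseCastAs() above normalize both spellings of CAST into the same AST, and parseSubstring() (continued below) does the same for SUBSTRING. For reference, a short sketch of the canonical nodes they build, using the same helpers as the parser code itself:

/// CAST(x AS Nullable(UInt32))  ==>  CAST(x, 'Nullable(UInt32)')
ASTPtr cast_node = createFunctionCast(expr_node, type_node);

/// SUBSTRING(s FROM 2 FOR 3)    ==>  substring(s, 2, 3)
ASTPtr substr_node = makeASTFunction("substring", expr_node, start_node, length_node);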
+ + ASTPtr expr_node; + ASTPtr start_node; + ASTPtr length_node; + + if (!ParserExpression().parse(pos, expr_node, expected)) + return false; + + if (pos->type != TokenType::Comma) + { + if (!ParserKeyword("FROM").ignore(pos, expected)) + return false; + } + else + { + ++pos; + } + + if (!ParserExpression().parse(pos, start_node, expected)) + return false; + + if (pos->type != TokenType::ClosingRoundBracket) + { + if (pos->type != TokenType::Comma) + { + if (!ParserKeyword("FOR").ignore(pos, expected)) + return false; + } + else + { + ++pos; + } + + if (!ParserExpression().parse(pos, length_node, expected)) + return false; + } + + /// Convert to canonical representation in functional form: SUBSTRING(expr, start, length) + if (length_node) + node = makeASTFunction("substring", expr_node, start_node, length_node); + else + node = makeASTFunction("substring", expr_node, start_node); + + return true; + } + + bool parseTrim(bool trim_left, bool trim_right, IParser::Pos & pos, ASTPtr & node, Expected & expected) + { + /// Handles all possible TRIM/LTRIM/RTRIM call variants + + std::string func_name; + bool char_override = false; + ASTPtr expr_node; + ASTPtr pattern_node; + ASTPtr to_remove; + + if (!trim_left && !trim_right) + { + if (ParserKeyword("BOTH").ignore(pos, expected)) + { + trim_left = true; + trim_right = true; + char_override = true; + } + else if (ParserKeyword("LEADING").ignore(pos, expected)) + { + trim_left = true; + char_override = true; + } + else if (ParserKeyword("TRAILING").ignore(pos, expected)) + { + trim_right = true; + char_override = true; + } + else + { + trim_left = true; + trim_right = true; + } + + if (char_override) + { + if (!ParserExpression().parse(pos, to_remove, expected)) + return false; + if (!ParserKeyword("FROM").ignore(pos, expected)) + return false; + + auto quote_meta_func_node = std::make_shared(); + auto quote_meta_list_args = std::make_shared(); + quote_meta_list_args->children = {to_remove}; + + quote_meta_func_node->name = "regexpQuoteMeta"; + quote_meta_func_node->arguments = std::move(quote_meta_list_args); + quote_meta_func_node->children.push_back(quote_meta_func_node->arguments); + + to_remove = std::move(quote_meta_func_node); + } + } + + if (!ParserExpression().parse(pos, expr_node, expected)) + return false; + + /// Convert to regexp replace function call + + if (char_override) + { + auto pattern_func_node = std::make_shared(); + auto pattern_list_args = std::make_shared(); + if (trim_left && trim_right) + { + pattern_list_args->children = { + std::make_shared("^["), + to_remove, + std::make_shared("]*|["), + to_remove, + std::make_shared("]*$") + }; + func_name = "replaceRegexpAll"; + } + else + { + if (trim_left) + { + pattern_list_args->children = { + std::make_shared("^["), + to_remove, + std::make_shared("]*") + }; + } + else + { + /// trim_right == false not possible + pattern_list_args->children = { + std::make_shared("["), + to_remove, + std::make_shared("]*$") + }; + } + func_name = "replaceRegexpOne"; + } + + pattern_func_node->name = "concat"; + pattern_func_node->arguments = std::move(pattern_list_args); + pattern_func_node->children.push_back(pattern_func_node->arguments); + + pattern_node = std::move(pattern_func_node); + } + else + { + if (trim_left && trim_right) + { + func_name = "trimBoth"; + } + else + { + if (trim_left) + { + func_name = "trimLeft"; + } + else + { + /// trim_right == false not possible + func_name = "trimRight"; + } + } + } + + if (char_override) + node = makeASTFunction(func_name, expr_node, 
pattern_node, std::make_shared("")); + else + node = makeASTFunction(func_name, expr_node); + return true; + } + + bool parseExtract(IParser::Pos & pos, ASTPtr & node, Expected & expected) + { + ASTPtr expr; + + IntervalKind interval_kind; + if (!parseIntervalKind(pos, expected, interval_kind)) + { + ASTPtr expr_list; + if (!ParserExpressionList(false, false).parse(pos, expr_list, expected)) + return false; + + auto res = std::make_shared(); + res->name = "extract"; + res->arguments = expr_list; + res->children.push_back(res->arguments); + node = std::move(res); + return true; + } + + ParserKeyword s_from("FROM"); + if (!s_from.ignore(pos, expected)) + return false; + + ParserExpression elem_parser; + if (!elem_parser.parse(pos, expr, expected)) + return false; + + node = makeASTFunction(interval_kind.toNameOfFunctionExtractTimePart(), expr); + return true; + } + + bool parsePosition(IParser::Pos & pos, ASTPtr & node, Expected & expected) + { + ASTPtr expr_list_node; + if (!ParserExpressionList(false, false).parse(pos, expr_list_node, expected)) + return false; + + ASTExpressionList * expr_list = typeid_cast(expr_list_node.get()); + if (expr_list && expr_list->children.size() == 1) + { + ASTFunction * func_in = typeid_cast(expr_list->children[0].get()); + if (func_in && func_in->name == "in") + { + ASTExpressionList * in_args = typeid_cast(func_in->arguments.get()); + if (in_args && in_args->children.size() == 2) + { + node = makeASTFunction("position", in_args->children[1], in_args->children[0]); + return true; + } + } + } + + auto res = std::make_shared(); + res->name = "position"; + res->arguments = expr_list_node; + res->children.push_back(res->arguments); + node = std::move(res); + return true; + } + + bool parseDateAdd(const char * function_name, IParser::Pos & pos, ASTPtr & node, Expected & expected) + { + ASTPtr timestamp_node; + ASTPtr offset_node; + + IntervalKind interval_kind; + ASTPtr interval_func_node; + if (parseIntervalKind(pos, expected, interval_kind)) + { + /// function(unit, offset, timestamp) + if (pos->type != TokenType::Comma) + return false; + ++pos; + + if (!ParserExpression().parse(pos, offset_node, expected)) + return false; + + if (pos->type != TokenType::Comma) + return false; + ++pos; + + if (!ParserExpression().parse(pos, timestamp_node, expected)) + return false; + auto interval_expr_list_args = std::make_shared(); + interval_expr_list_args->children = {offset_node}; + + interval_func_node = std::make_shared(); + interval_func_node->as().name = interval_kind.toNameOfFunctionToIntervalDataType(); + interval_func_node->as().arguments = std::move(interval_expr_list_args); + interval_func_node->as().children.push_back(interval_func_node->as().arguments); + } + else + { + ASTPtr expr_list; + if (!ParserExpressionList(false, false).parse(pos, expr_list, expected)) + return false; + + auto res = std::make_shared(); + res->name = function_name; + res->arguments = expr_list; + res->children.push_back(res->arguments); + node = std::move(res); + return true; + } + + node = makeASTFunction(function_name, timestamp_node, interval_func_node); + return true; + } + + bool parseDateDiff(IParser::Pos & pos, ASTPtr & node, Expected & expected) + { + ASTPtr left_node; + ASTPtr right_node; + + IntervalKind interval_kind; + if (!parseIntervalKind(pos, expected, interval_kind)) + { + ASTPtr expr_list; + if (!ParserExpressionList(false, false).parse(pos, expr_list, expected)) + return false; + + auto res = std::make_shared(); + res->name = "dateDiff"; + res->arguments = expr_list; 
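parseExtract(), parsePosition() and parseDateAdd() above (and parseDateDiff(), continued below) each rewrite a SQL-standard form into a regular ClickHouse function call. A condensed summary of the mappings, assuming the IntervalKind helpers resolve to the function names their signatures suggest:

/// EXTRACT(MONTH FROM d)         ==>  toMonth(d)
/// POSITION(needle IN haystack)  ==>  position(haystack, needle)
/// DATE_ADD(MINUTE, 5, t)        ==>  plus(t, toIntervalMinute(5))
/// DATE_DIFF(DAY, a, b)          ==>  dateDiff('day', a, b)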
+ res->children.push_back(res->arguments); + node = std::move(res); + return true; + } + + if (pos->type != TokenType::Comma) + return false; + ++pos; + + if (!ParserExpression().parse(pos, left_node, expected)) + return false; + + if (pos->type != TokenType::Comma) + return false; + ++pos; + + if (!ParserExpression().parse(pos, right_node, expected)) + return false; + + node = makeASTFunction("dateDiff", std::make_shared(interval_kind.toDateDiffUnit()), left_node, right_node); + return true; + } + + bool parseExists(IParser::Pos & pos, ASTPtr & node, Expected & expected) + { + if (!ParserSelectWithUnionQuery().parse(pos, node, expected)) + return false; + + auto subquery = std::make_shared(); + subquery->children.push_back(node); + node = makeASTFunction("exists", subquery); + return true; + } +} + + bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { ParserIdentifier id_parser; - ParserKeyword distinct("DISTINCT"); - ParserKeyword all("ALL"); - ParserExpressionList contents(false, is_table_function); - ParserSelectWithUnionQuery select; - ParserKeyword filter("FILTER"); - ParserKeyword over("OVER"); bool has_all = false; bool has_distinct = false; @@ -304,9 +688,73 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) return false; ++pos; + /// Avoid excessive backtracking. + //pos.putBarrier(); + + /// Special cases for expressions that look like functions but contain some syntax sugar: + + /// CAST, EXTRACT, POSITION, EXISTS + /// DATE_ADD, DATEADD, TIMESTAMPADD, DATE_SUB, DATESUB, TIMESTAMPSUB, + /// DATE_DIFF, DATEDIFF, TIMESTAMPDIFF, TIMESTAMP_DIFF, + /// SUBSTRING, TRIM, LTRIM, RTRIM, POSITION + + /// Can be parsed as a composition of functions, but the contents must be unwrapped: + /// POSITION(x IN y) -> POSITION(in(x, y)) -> POSITION(y, x) + + /// Can be parsed as a function, but not always: + /// CAST(x AS type) - alias has to be unwrapped + /// CAST(x AS type(params)) + + /// Can be parsed as a function, but some identifier arguments have special meanings. 
+ /// DATE_ADD(MINUTE, x, y) -> addMinutes(x, y) + /// DATE_DIFF(MINUTE, x, y) + + /// Have keywords that have to processed explicitly: + /// EXTRACT(x FROM y) + /// TRIM(BOTH|LEADING|TRAILING x FROM y) + /// SUBSTRING(x FROM a) + /// SUBSTRING(x FROM a FOR b) + + String function_name = getIdentifierName(identifier); + String function_name_lowercase = Poco::toLower(function_name); + + std::optional parsed_special_function; + + if (function_name_lowercase == "cast") + parsed_special_function = parseCastAs(pos, node, expected); + else if (function_name_lowercase == "extract") + parsed_special_function = parseExtract(pos, node, expected); + else if (function_name_lowercase == "substring") + parsed_special_function = parseSubstring(pos, node, expected); + else if (function_name_lowercase == "position") + parsed_special_function = parsePosition(pos, node, expected); + else if (function_name_lowercase == "exists") + parsed_special_function = parseExists(pos, node, expected); + else if (function_name_lowercase == "trim") + parsed_special_function = parseTrim(false, false, pos, node, expected); + else if (function_name_lowercase == "ltrim") + parsed_special_function = parseTrim(true, false, pos, node, expected); + else if (function_name_lowercase == "rtrim") + parsed_special_function = parseTrim(false, true, pos, node, expected); + else if (function_name_lowercase == "dateadd" || function_name_lowercase == "date_add" + || function_name_lowercase == "timestampadd" || function_name_lowercase == "timestamp_add") + parsed_special_function = parseDateAdd("plus", pos, node, expected); + else if (function_name_lowercase == "datesub" || function_name_lowercase == "date_sub" + || function_name_lowercase == "timestampsub" || function_name_lowercase == "timestamp_sub") + parsed_special_function = parseDateAdd("minus", pos, node, expected); + else if (function_name_lowercase == "datediff" || function_name_lowercase == "date_diff" + || function_name_lowercase == "timestampdiff" || function_name_lowercase == "timestamp_diff") + parsed_special_function = parseDateDiff(pos, node, expected); + + if (parsed_special_function.has_value()) + return parsed_special_function.value() && ParserToken(TokenType::ClosingRoundBracket).ignore(pos); + auto pos_after_bracket = pos; auto old_expected = expected; + ParserKeyword all("ALL"); + ParserKeyword distinct("DISTINCT"); + if (all.ignore(pos, expected)) has_all = true; @@ -331,6 +779,8 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) } } + ParserExpressionList contents(false, is_table_function); + const char * contents_begin = pos->begin; if (!contents.parse(pos, expr_list_args, expected)) return false; @@ -345,7 +795,7 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) * If you do not report that the first option is an error, then the argument will be interpreted as 2014 - 01 - 01 - some number, * and the query silently returns an unexpected result. 
*/ - if (getIdentifierName(identifier) == "toDate" + if (function_name == "toDate" && contents_end - contents_begin == strlen("2014-01-01") && contents_begin[0] >= '2' && contents_begin[0] <= '3' && contents_begin[1] >= '0' && contents_begin[1] <= '9' @@ -362,26 +812,6 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) throw Exception("Argument of function toDate is unquoted: toDate(" + contents_str + "), must be: toDate('" + contents_str + "')" , ErrorCodes::SYNTAX_ERROR); } - else if (Poco::toLower(getIdentifierName(identifier)) == "position") - { - /// POSITION(needle IN haystack) is equivalent to function position(haystack, needle) - if (const auto * list = expr_list_args->as()) - { - if (list->children.size() == 1) - { - if (const auto * in_func = list->children[0]->as()) - { - if (in_func->name == "in") - { - // switch the two arguments - const auto & arg_list = in_func->arguments->as(); - if (arg_list.children.size() == 2) - expr_list_args->children = {arg_list.children[1], arg_list.children[0]}; - } - } - } - } - } /// The parametric aggregate function has two lists (parameters and arguments) in parentheses. Example: quantile(0.9)(x). if (allow_function_parameters && pos->type == TokenType::OpeningRoundBracket) @@ -445,6 +875,9 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) function_node->children.push_back(function_node->parameters); } + ParserKeyword filter("FILTER"); + ParserKeyword over("OVER"); + if (filter.ignore(pos, expected)) { // We are slightly breaking the parser interface by parsing the window @@ -455,9 +888,7 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) ParserFilterClause filter_parser; if (!filter_parser.parse(pos, function_node_as_iast, expected)) - { return false; - } } if (over.ignore(pos, expected)) @@ -468,9 +899,7 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) ParserWindowReference window_reference; if (!window_reference.parse(pos, function_node_as_iast, expected)) - { return false; - } } node = function_node; @@ -877,22 +1306,6 @@ bool ParserCodec::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) return true; } -ASTPtr createFunctionCast(const ASTPtr & expr_ast, const ASTPtr & type_ast) -{ - /// Convert to canonical representation in functional form: CAST(expr, 'type') - auto type_literal = std::make_shared(queryToString(type_ast)); - - auto expr_list_args = std::make_shared(); - expr_list_args->children.push_back(expr_ast); - expr_list_args->children.push_back(std::move(type_literal)); - - auto func_node = std::make_shared(); - func_node->name = "CAST"; - func_node->arguments = std::move(expr_list_args); - func_node->children.push_back(func_node->arguments); - - return func_node; -} template static bool isOneOf(TokenType token) @@ -1005,509 +1418,6 @@ bool ParserCastOperator::parseImpl(Pos & pos, ASTPtr & node, Expected & expected return false; } -bool ParserCastAsExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) -{ - /// Either CAST(expr AS type) or CAST(expr, 'type') - /// The latter will be parsed normally as a function later. 
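The standalone ParserCastAsExpression, ParserSubstringExpression and ParserTrimExpression classes removed below carried the same logic that now lives in the anonymous-namespace helpers inside ParserFunction. For reference, the TRIM family ends up as plain function calls, as can be read off the parseTrim() code above:

/// TRIM(s)                ==>  trimBoth(s)
/// LTRIM(s)               ==>  trimLeft(s)
/// RTRIM(s)               ==>  trimRight(s)
/// TRIM(BOTH 'x' FROM s)  ==>  replaceRegexpAll(s,
///                                 concat('^[', regexpQuoteMeta('x'), ']*|[',
///                                        regexpQuoteMeta('x'), ']*$'), '')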
- - ASTPtr expr_node; - ASTPtr type_node; - - if (ParserKeyword("CAST").ignore(pos, expected) - && ParserToken(TokenType::OpeningRoundBracket).ignore(pos, expected) - && ParserExpression().parse(pos, expr_node, expected) - && ParserKeyword("AS").ignore(pos, expected) - && ParserDataType().parse(pos, type_node, expected) - && ParserToken(TokenType::ClosingRoundBracket).ignore(pos, expected)) - { - node = createFunctionCast(expr_node, type_node); - return true; - } - - return false; -} - -bool ParserSubstringExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) -{ - /// Either SUBSTRING(expr FROM start) or SUBSTRING(expr FROM start FOR length) or SUBSTRING(expr, start, length) - /// The latter will be parsed normally as a function later. - - ASTPtr expr_node; - ASTPtr start_node; - ASTPtr length_node; - - if (!ParserKeyword("SUBSTRING").ignore(pos, expected)) - return false; - - if (pos->type != TokenType::OpeningRoundBracket) - return false; - ++pos; - - if (!ParserExpression().parse(pos, expr_node, expected)) - return false; - - if (pos->type != TokenType::Comma) - { - if (!ParserKeyword("FROM").ignore(pos, expected)) - return false; - } - else - { - ++pos; - } - - if (!ParserExpression().parse(pos, start_node, expected)) - return false; - - if (pos->type == TokenType::ClosingRoundBracket) - { - ++pos; - } - else - { - if (pos->type != TokenType::Comma) - { - if (!ParserKeyword("FOR").ignore(pos, expected)) - return false; - } - else - { - ++pos; - } - - if (!ParserExpression().parse(pos, length_node, expected)) - return false; - - ParserToken(TokenType::ClosingRoundBracket).ignore(pos, expected); - } - - /// Convert to canonical representation in functional form: SUBSTRING(expr, start, length) - - auto expr_list_args = std::make_shared(); - expr_list_args->children = {expr_node, start_node}; - - if (length_node) - expr_list_args->children.push_back(length_node); - - auto func_node = std::make_shared(); - func_node->name = "substring"; - func_node->arguments = std::move(expr_list_args); - func_node->children.push_back(func_node->arguments); - - node = std::move(func_node); - return true; -} - -bool ParserTrimExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) -{ - /// Handles all possible TRIM/LTRIM/RTRIM call variants - - std::string func_name; - bool trim_left = false; - bool trim_right = false; - bool char_override = false; - ASTPtr expr_node; - ASTPtr pattern_node; - ASTPtr to_remove; - - if (ParserKeyword("LTRIM").ignore(pos, expected)) - { - if (pos->type != TokenType::OpeningRoundBracket) - return false; - ++pos; - trim_left = true; - } - else if (ParserKeyword("RTRIM").ignore(pos, expected)) - { - if (pos->type != TokenType::OpeningRoundBracket) - return false; - ++pos; - trim_right = true; - } - else if (ParserKeyword("TRIM").ignore(pos, expected)) - { - if (pos->type != TokenType::OpeningRoundBracket) - return false; - ++pos; - - if (ParserKeyword("BOTH").ignore(pos, expected)) - { - trim_left = true; - trim_right = true; - char_override = true; - } - else if (ParserKeyword("LEADING").ignore(pos, expected)) - { - trim_left = true; - char_override = true; - } - else if (ParserKeyword("TRAILING").ignore(pos, expected)) - { - trim_right = true; - char_override = true; - } - else - { - trim_left = true; - trim_right = true; - } - - if (char_override) - { - if (!ParserExpression().parse(pos, to_remove, expected)) - return false; - if (!ParserKeyword("FROM").ignore(pos, expected)) - return false; - - auto quote_meta_func_node = std::make_shared(); - auto 
quote_meta_list_args = std::make_shared(); - quote_meta_list_args->children = {to_remove}; - - quote_meta_func_node->name = "regexpQuoteMeta"; - quote_meta_func_node->arguments = std::move(quote_meta_list_args); - quote_meta_func_node->children.push_back(quote_meta_func_node->arguments); - - to_remove = std::move(quote_meta_func_node); - } - } - - if (!(trim_left || trim_right)) - return false; - - if (!ParserExpression().parse(pos, expr_node, expected)) - return false; - - if (pos->type != TokenType::ClosingRoundBracket) - return false; - ++pos; - - /// Convert to regexp replace function call - - if (char_override) - { - auto pattern_func_node = std::make_shared(); - auto pattern_list_args = std::make_shared(); - if (trim_left && trim_right) - { - pattern_list_args->children = { - std::make_shared("^["), - to_remove, - std::make_shared("]*|["), - to_remove, - std::make_shared("]*$") - }; - func_name = "replaceRegexpAll"; - } - else - { - if (trim_left) - { - pattern_list_args->children = { - std::make_shared("^["), - to_remove, - std::make_shared("]*") - }; - } - else - { - /// trim_right == false not possible - pattern_list_args->children = { - std::make_shared("["), - to_remove, - std::make_shared("]*$") - }; - } - func_name = "replaceRegexpOne"; - } - - pattern_func_node->name = "concat"; - pattern_func_node->arguments = std::move(pattern_list_args); - pattern_func_node->children.push_back(pattern_func_node->arguments); - - pattern_node = std::move(pattern_func_node); - } - else - { - if (trim_left && trim_right) - { - func_name = "trimBoth"; - } - else - { - if (trim_left) - { - func_name = "trimLeft"; - } - else - { - /// trim_right == false not possible - func_name = "trimRight"; - } - } - } - - auto expr_list_args = std::make_shared(); - if (char_override) - expr_list_args->children = {expr_node, pattern_node, std::make_shared("")}; - else - expr_list_args->children = {expr_node}; - - auto func_node = std::make_shared(); - func_node->name = func_name; - func_node->arguments = std::move(expr_list_args); - func_node->children.push_back(func_node->arguments); - - node = std::move(func_node); - return true; -} - -bool ParserLeftExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) -{ - /// Rewrites left(expr, length) to SUBSTRING(expr, 1, length) - - ASTPtr expr_node; - ASTPtr start_node; - ASTPtr length_node; - - if (!ParserKeyword("LEFT").ignore(pos, expected)) - return false; - - if (pos->type != TokenType::OpeningRoundBracket) - return false; - ++pos; - - if (!ParserExpression().parse(pos, expr_node, expected)) - return false; - - ParserToken(TokenType::Comma).ignore(pos, expected); - - if (!ParserExpression().parse(pos, length_node, expected)) - return false; - - if (pos->type != TokenType::ClosingRoundBracket) - return false; - ++pos; - - auto expr_list_args = std::make_shared(); - start_node = std::make_shared(1); - expr_list_args->children = {expr_node, start_node, length_node}; - - auto func_node = std::make_shared(); - func_node->name = "substring"; - func_node->arguments = std::move(expr_list_args); - func_node->children.push_back(func_node->arguments); - - node = std::move(func_node); - return true; -} - -bool ParserRightExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) -{ - /// Rewrites RIGHT(expr, length) to substring(expr, -length) - - ASTPtr expr_node; - ASTPtr length_node; - - if (!ParserKeyword("RIGHT").ignore(pos, expected)) - return false; - - if (pos->type != TokenType::OpeningRoundBracket) - return false; - ++pos; - - if 
(!ParserExpression().parse(pos, expr_node, expected)) - return false; - - ParserToken(TokenType::Comma).ignore(pos, expected); - - if (!ParserExpression().parse(pos, length_node, expected)) - return false; - - if (pos->type != TokenType::ClosingRoundBracket) - return false; - ++pos; - - auto start_expr_list_args = std::make_shared(); - start_expr_list_args->children = {length_node}; - - auto start_node = std::make_shared(); - start_node->name = "negate"; - start_node->arguments = std::move(start_expr_list_args); - start_node->children.push_back(start_node->arguments); - - auto expr_list_args = std::make_shared(); - expr_list_args->children = {expr_node, start_node}; - - auto func_node = std::make_shared(); - func_node->name = "substring"; - func_node->arguments = std::move(expr_list_args); - func_node->children.push_back(func_node->arguments); - - node = std::move(func_node); - return true; -} - -bool ParserExtractExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) -{ - if (!ParserKeyword("EXTRACT").ignore(pos, expected)) - return false; - - if (pos->type != TokenType::OpeningRoundBracket) - return false; - ++pos; - - ASTPtr expr; - - IntervalKind interval_kind; - if (!parseIntervalKind(pos, expected, interval_kind)) - return false; - - ParserKeyword s_from("FROM"); - if (!s_from.ignore(pos, expected)) - return false; - - ParserExpression elem_parser; - if (!elem_parser.parse(pos, expr, expected)) - return false; - - if (pos->type != TokenType::ClosingRoundBracket) - return false; - ++pos; - - auto function = std::make_shared(); - auto exp_list = std::make_shared(); - function->name = interval_kind.toNameOfFunctionExtractTimePart(); - function->arguments = exp_list; - function->children.push_back(exp_list); - exp_list->children.push_back(expr); - node = function; - - return true; -} - -bool ParserDateAddExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) -{ - const char * function_name = nullptr; - ASTPtr timestamp_node; - ASTPtr offset_node; - - if (ParserKeyword("DATEADD").ignore(pos, expected) || ParserKeyword("DATE_ADD").ignore(pos, expected) - || ParserKeyword("TIMESTAMPADD").ignore(pos, expected) || ParserKeyword("TIMESTAMP_ADD").ignore(pos, expected)) - function_name = "plus"; - else if (ParserKeyword("DATESUB").ignore(pos, expected) || ParserKeyword("DATE_SUB").ignore(pos, expected) - || ParserKeyword("TIMESTAMPSUB").ignore(pos, expected) || ParserKeyword("TIMESTAMP_SUB").ignore(pos, expected)) - function_name = "minus"; - else - return false; - - if (pos->type != TokenType::OpeningRoundBracket) - return false; - ++pos; - - IntervalKind interval_kind; - ASTPtr interval_func_node; - if (parseIntervalKind(pos, expected, interval_kind)) - { - /// function(unit, offset, timestamp) - if (pos->type != TokenType::Comma) - return false; - ++pos; - - if (!ParserExpression().parse(pos, offset_node, expected)) - return false; - - if (pos->type != TokenType::Comma) - return false; - ++pos; - - if (!ParserExpression().parse(pos, timestamp_node, expected)) - return false; - auto interval_expr_list_args = std::make_shared(); - interval_expr_list_args->children = {offset_node}; - - interval_func_node = std::make_shared(); - interval_func_node->as().name = interval_kind.toNameOfFunctionToIntervalDataType(); - interval_func_node->as().arguments = std::move(interval_expr_list_args); - interval_func_node->as().children.push_back(interval_func_node->as().arguments); - } - else - { - /// function(timestamp, INTERVAL offset unit) - if (!ParserExpression().parse(pos, 
timestamp_node, expected)) - return false; - - if (pos->type != TokenType::Comma) - return false; - ++pos; - - if (!ParserIntervalOperatorExpression{}.parse(pos, interval_func_node, expected)) - return false; - } - if (pos->type != TokenType::ClosingRoundBracket) - return false; - ++pos; - - auto expr_list_args = std::make_shared(); - expr_list_args->children = {timestamp_node, interval_func_node}; - - auto func_node = std::make_shared(); - func_node->name = function_name; - func_node->arguments = std::move(expr_list_args); - func_node->children.push_back(func_node->arguments); - - node = std::move(func_node); - - return true; -} - -bool ParserDateDiffExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) -{ - ASTPtr left_node; - ASTPtr right_node; - - if (!(ParserKeyword("DATEDIFF").ignore(pos, expected) || ParserKeyword("DATE_DIFF").ignore(pos, expected) - || ParserKeyword("TIMESTAMPDIFF").ignore(pos, expected) || ParserKeyword("TIMESTAMP_DIFF").ignore(pos, expected))) - return false; - - if (pos->type != TokenType::OpeningRoundBracket) - return false; - ++pos; - - IntervalKind interval_kind; - if (!parseIntervalKind(pos, expected, interval_kind)) - return false; - - if (pos->type != TokenType::Comma) - return false; - ++pos; - - if (!ParserExpression().parse(pos, left_node, expected)) - return false; - - if (pos->type != TokenType::Comma) - return false; - ++pos; - - if (!ParserExpression().parse(pos, right_node, expected)) - return false; - - if (pos->type != TokenType::ClosingRoundBracket) - return false; - ++pos; - - auto expr_list_args = std::make_shared(); - expr_list_args->children = {std::make_shared(interval_kind.toDateDiffUnit()), left_node, right_node}; - - auto func_node = std::make_shared(); - func_node->name = "dateDiff"; - func_node->arguments = std::move(expr_list_args); - func_node->children.push_back(func_node->arguments); - - node = std::move(func_node); - - return true; -} - bool ParserNull::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { @@ -2246,16 +2156,6 @@ bool ParserMySQLGlobalVariable::parseImpl(Pos & pos, ASTPtr & node, Expected & e return true; } -bool ParserExistsExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) -{ - if (ParserKeyword("EXISTS").ignore(pos, expected) && ParserSubquery().parse(pos, node, expected)) - { - node = makeASTFunction("exists", node); - return true; - } - return false; -} - bool ParserExpressionElement::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { @@ -2266,20 +2166,11 @@ bool ParserExpressionElement::parseImpl(Pos & pos, ASTPtr & node, Expected & exp || ParserArrayOfLiterals().parse(pos, node, expected) || ParserArray().parse(pos, node, expected) || ParserLiteral().parse(pos, node, expected) - || ParserCastAsExpression().parse(pos, node, expected) - || ParserExtractExpression().parse(pos, node, expected) - || ParserDateAddExpression().parse(pos, node, expected) - || ParserDateDiffExpression().parse(pos, node, expected) - || ParserSubstringExpression().parse(pos, node, expected) - || ParserTrimExpression().parse(pos, node, expected) - || ParserLeftExpression().parse(pos, node, expected) - || ParserRightExpression().parse(pos, node, expected) || ParserCase().parse(pos, node, expected) || ParserColumnsMatcher().parse(pos, node, expected) /// before ParserFunction because it can be also parsed as a function. 
|| ParserFunction().parse(pos, node, expected) || ParserQualifiedAsterisk().parse(pos, node, expected) || ParserAsterisk().parse(pos, node, expected) - || ParserExistsExpression().parse(pos, node, expected) || ParserCompoundIdentifier(false, true).parse(pos, node, expected) || ParserSubstitution().parse(pos, node, expected) || ParserMySQLGlobalVariable().parse(pos, node, expected); diff --git a/src/Parsers/ExpressionElementParsers.h b/src/Parsers/ExpressionElementParsers.h index 5dff2e026be..c86721dca18 100644 --- a/src/Parsers/ExpressionElementParsers.h +++ b/src/Parsers/ExpressionElementParsers.h @@ -228,63 +228,6 @@ protected: bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; -ASTPtr createFunctionCast(const ASTPtr & expr_ast, const ASTPtr & type_ast); -class ParserCastAsExpression : public IParserBase -{ -protected: - const char * getName() const override { return "CAST AS expression"; } - bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; -}; - -class ParserSubstringExpression : public IParserBase -{ -protected: - const char * getName() const override { return "SUBSTRING expression"; } - bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; -}; - -class ParserTrimExpression : public IParserBase -{ -protected: - const char * getName() const override { return "TRIM expression"; } - bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; -}; - -class ParserLeftExpression : public IParserBase -{ -protected: - const char * getName() const override { return "LEFT expression"; } - bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; -}; - -class ParserRightExpression : public IParserBase -{ -protected: - const char * getName() const override { return "RIGHT expression"; } - bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; -}; - -class ParserExtractExpression : public IParserBase -{ -protected: - const char * getName() const override { return "EXTRACT expression"; } - bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; -}; - -class ParserDateAddExpression : public IParserBase -{ -protected: - const char * getName() const override { return "DATE_ADD expression"; } - bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; -}; - -class ParserDateDiffExpression : public IParserBase -{ -protected: - const char * getName() const override { return "DATE_DIFF expression"; } - bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; -}; - /** NULL literal. */ class ParserNull : public IParserBase @@ -333,17 +276,6 @@ protected: }; -/** - * Parse query with EXISTS expression. - */ -class ParserExistsExpression : public IParserBase -{ -protected: - const char * getName() const override { return "exists expression"; } - bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; -}; - - /** An array or tuple of literals. * Arrays can also be parsed as an application of [] operator and tuples as an application of 'tuple' function. * But parsing the whole array/tuple as a whole constant seriously speeds up the analysis of expressions in the case of very large collection. 
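With the dedicated sugar-parser classes removed from the header above, only the free createFunctionCast() declaration survives (re-added at the end of the header in the next hunk); the sugar is now selected by name inside ParserFunction. Condensed shape of that dispatch, abridged from the earlier hunk and not a complete listing:

String func_lower = Poco::toLower(getIdentifierName(identifier));
std::optional<bool> parsed_special;
if (func_lower == "cast")
    parsed_special = parseCastAs(pos, node, expected);
else if (func_lower == "trim")
    parsed_special = parseTrim(false, false, pos, node, expected);
/// ... extract, substring, position, exists, dateadd, datesub, datediff ...
if (parsed_special.has_value())
    return *parsed_special && ParserToken(TokenType::ClosingRoundBracket).ignore(pos);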
@@ -535,4 +467,6 @@ protected: bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; +ASTPtr createFunctionCast(const ASTPtr & expr_ast, const ASTPtr & type_ast); + } diff --git a/src/Parsers/ExpressionListParsers.cpp b/src/Parsers/ExpressionListParsers.cpp index 680d3f6031b..96c1bad75c2 100644 --- a/src/Parsers/ExpressionListParsers.cpp +++ b/src/Parsers/ExpressionListParsers.cpp @@ -689,7 +689,7 @@ bool ParserUnaryExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expec bool ParserCastExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { ASTPtr expr_ast; - if (!elem_parser.parse(pos, expr_ast, expected)) + if (!ParserExpressionElement().parse(pos, expr_ast, expected)) return false; ASTPtr type_ast; diff --git a/src/Parsers/ExpressionListParsers.h b/src/Parsers/ExpressionListParsers.h index e9389f15bbb..a035d4a2ef0 100644 --- a/src/Parsers/ExpressionListParsers.h +++ b/src/Parsers/ExpressionListParsers.h @@ -203,9 +203,6 @@ protected: /// Example: "[1, 1 + 1, 1 + 2]::Array(UInt8)" class ParserCastExpression : public IParserBase { -private: - ParserExpressionElement elem_parser; - protected: const char * getName() const override { return "CAST expression"; } diff --git a/src/Parsers/IAST.h b/src/Parsers/IAST.h index 2f30a1f7bee..fdf821c4a0b 100644 --- a/src/Parsers/IAST.h +++ b/src/Parsers/IAST.h @@ -245,10 +245,23 @@ public: void cloneChildren(); - // Return query_kind string representation of this AST query. - virtual const char * getQueryKindString() const { return ""; } + enum class QueryKind : uint8_t + { + None = 0, + Alter, + Create, + Drop, + Grant, + Insert, + Rename, + Revoke, + SelectIntersectExcept, + Select, + System, + }; + /// Return QueryKind of this AST query. + virtual QueryKind getQueryKind() const { return QueryKind::None; } -public: /// For syntax highlighting. static const char * hilite_keyword; static const char * hilite_identifier; diff --git a/src/Parsers/IParser.h b/src/Parsers/IParser.h index 64f117c707f..4e6dbca15a6 100644 --- a/src/Parsers/IParser.h +++ b/src/Parsers/IParser.h @@ -60,7 +60,9 @@ public: uint32_t depth = 0; uint32_t max_depth = 0; - Pos(Tokens & tokens_, uint32_t max_depth_) : TokenIterator(tokens_), max_depth(max_depth_) {} + Pos(Tokens & tokens_, uint32_t max_depth_) : TokenIterator(tokens_), max_depth(max_depth_) + { + } ALWAYS_INLINE void increaseDepth() { diff --git a/src/Parsers/IParserBase.h b/src/Parsers/IParserBase.h index ce08bdef790..6fd195da40d 100644 --- a/src/Parsers/IParserBase.h +++ b/src/Parsers/IParserBase.h @@ -17,7 +17,7 @@ public: Pos begin = pos; bool res = func(); if (!res) - pos = begin; + pos = begin; return res; } @@ -31,7 +31,7 @@ public: bool res = func(); pos.decreaseDepth(); if (!res) - pos = begin; + pos = begin; return res; } diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp index dbbea986404..6d295a0d516 100644 --- a/src/Parsers/ParserCreateQuery.cpp +++ b/src/Parsers/ParserCreateQuery.cpp @@ -557,34 +557,43 @@ bool ParserCreateTableQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expe } } } + /** Create queries without list of columns: + * - CREATE|ATTACH TABLE ... AS ... + * - CREATE|ATTACH TABLE ... ENGINE = engine + */ else { storage_p.parse(pos, storage, expected); - if (!s_as.ignore(pos, expected)) - return false; - - if (!select_p.parse(pos, select, expected)) /// AS SELECT ... + /// CREATE|ATTACH TABLE ... AS ... + if (s_as.ignore(pos, expected)) { - /// ENGINE can not be specified for table functions. 
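The IAST.h hunk above replaces the free-form getQueryKindString() with a typed QueryKind enum, which is what all the per-AST overrides earlier in this diff now return. A hedged sketch of how a call site migrates; the counter is hypothetical, and the concrete consumers are outside this excerpt:

/// Before: if (strcmp(ast->getQueryKindString(), "Select") == 0) ...
if (ast->getQueryKind() == IAST::QueryKind::Select)
    ++select_query_count;   /// hypothetical counter, for illustration only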
- if (storage || !table_function_p.parse(pos, as_table_function, expected)) + if (!select_p.parse(pos, select, expected)) /// AS SELECT ... { - /// AS [db.]table - if (!name_p.parse(pos, as_table, expected)) - return false; - - if (s_dot.ignore(pos, expected)) + /// ENGINE can not be specified for table functions. + if (storage || !table_function_p.parse(pos, as_table_function, expected)) { - as_database = as_table; + /// AS [db.]table if (!name_p.parse(pos, as_table, expected)) return false; - } - /// Optional - ENGINE can be specified. - if (!storage) - storage_p.parse(pos, storage, expected); + if (s_dot.ignore(pos, expected)) + { + as_database = as_table; + if (!name_p.parse(pos, as_table, expected)) + return false; + } + + /// Optional - ENGINE can be specified. + if (!storage) + storage_p.parse(pos, storage, expected); + } } } + else if (!storage) + { + return false; + } } auto comment = parseComment(pos, expected); @@ -960,14 +969,15 @@ bool ParserTableOverrideDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expecte ASTPtr sample_by; ASTPtr ttl_table; - if (!s_table_override.ignore(pos, expected)) - return false; - - if (!table_name_p.parse(pos, table_name, expected)) - return false; - - if (!lparen_p.ignore(pos, expected)) - return false; + if (is_standalone) + { + if (!s_table_override.ignore(pos, expected)) + return false; + if (!table_name_p.parse(pos, table_name, expected)) + return false; + if (!lparen_p.ignore(pos, expected)) + return false; + } while (true) { @@ -1025,7 +1035,7 @@ bool ParserTableOverrideDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expecte break; } - if (!rparen_p.ignore(pos, expected)) + if (is_standalone && !rparen_p.ignore(pos, expected)) return false; auto storage = std::make_shared(); @@ -1036,7 +1046,9 @@ bool ParserTableOverrideDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expecte storage->set(storage->ttl_table, ttl_table); auto res = std::make_shared(); - res->table_name = table_name->as()->name(); + if (table_name) + res->table_name = table_name->as()->name(); + res->is_standalone = is_standalone; res->set(res->storage, storage); if (columns) res->set(res->columns, columns); diff --git a/src/Parsers/ParserCreateQuery.h b/src/Parsers/ParserCreateQuery.h index bc1ebd65639..615121eae58 100644 --- a/src/Parsers/ParserCreateQuery.h +++ b/src/Parsers/ParserCreateQuery.h @@ -361,6 +361,8 @@ protected: * Or: * CREATE|ATTACH TABLE [IF NOT EXISTS] [db.]name [UUID 'uuid'] [ON CLUSTER cluster] AS ENGINE = engine SELECT ... 
* + * Or (for engines that supports schema inference): + * CREATE|ATTACH TABLE [IF NOT EXISTS] [db.]name [UUID 'uuid'] [ON CLUSTER cluster] ENGINE = engine */ class ParserCreateTableQuery : public IParserBase { @@ -387,6 +389,10 @@ protected: class ParserTableOverrideDeclaration : public IParserBase { +public: + const bool is_standalone; + ParserTableOverrideDeclaration(bool is_standalone_ = true) : is_standalone(is_standalone_) { } + protected: const char * getName() const override { return "table override declaration"; } bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; diff --git a/src/Parsers/ParserExplainQuery.cpp b/src/Parsers/ParserExplainQuery.cpp index ffaab0f2b6d..e072f6a14d7 100644 --- a/src/Parsers/ParserExplainQuery.cpp +++ b/src/Parsers/ParserExplainQuery.cpp @@ -21,6 +21,7 @@ bool ParserExplainQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected ParserKeyword s_pipeline("PIPELINE"); ParserKeyword s_plan("PLAN"); ParserKeyword s_estimates("ESTIMATE"); + ParserKeyword s_table_override("TABLE OVERRIDE"); if (s_explain.ignore(pos, expected)) { @@ -36,6 +37,8 @@ bool ParserExplainQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected kind = ASTExplainQuery::ExplainKind::QueryPlan; //-V1048 else if (s_estimates.ignore(pos, expected)) kind = ASTExplainQuery::ExplainKind::QueryEstimates; //-V1048 + else if (s_table_override.ignore(pos, expected)) + kind = ASTExplainQuery::ExplainKind::TableOverride; } else return false; @@ -65,6 +68,17 @@ bool ParserExplainQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected else return false; } + else if (kind == ASTExplainQuery::ExplainKind::TableOverride) + { + ASTPtr table_function; + if (!ParserFunction(true, true).parse(pos, table_function, expected)) + return false; + ASTPtr table_override; + if (!ParserTableOverrideDeclaration(false).parse(pos, table_override, expected)) + return false; + explain_query->setTableFunction(table_function); + explain_query->setTableOverride(table_override); + } else if (select_p.parse(pos, query, expected) || create_p.parse(pos, query, expected) || insert_p.parse(pos, query, expected)) diff --git a/src/Parsers/ParserInsertQuery.cpp b/src/Parsers/ParserInsertQuery.cpp index f2f8226c818..568f486a5cf 100644 --- a/src/Parsers/ParserInsertQuery.cpp +++ b/src/Parsers/ParserInsertQuery.cpp @@ -116,7 +116,7 @@ bool ParserInsertQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) /// Check if file is a source of data. if (s_from_infile.ignore(pos, expected)) { - /// Read its name to process it later + /// Read file name to process it later if (!infile_name_p.parse(pos, infile, expected)) return false; @@ -133,7 +133,8 @@ bool ParserInsertQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) Pos before_values = pos; String format_str; - /// VALUES or FROM INFILE or FORMAT or SELECT + /// VALUES or FORMAT or SELECT or WITH or WATCH. + /// After FROM INFILE we expect FORMAT, SELECT, WITH or nothing. if (!infile && s_values.ignore(pos, expected)) { /// If VALUES is defined in query, everything except setting will be parsed as data @@ -162,21 +163,17 @@ bool ParserInsertQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) tryGetIdentifierNameInto(format, format_str); } - else if (s_watch.ignore(pos, expected)) + else if (!infile && s_watch.ignore(pos, expected)) { /// If WATCH is defined, return to position before WATCH and parse /// rest of query as WATCH query. 
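The ParserExplainQuery and ParserInsertQuery hunks above define which statement shapes are accepted: EXPLAIN TABLE OVERRIDE takes a table function followed by a parenthesis-free override, and after FROM INFILE only FORMAT, SELECT, WITH or the end of the query may follow. Hedged examples of the resulting syntax; the host, credentials and file names are invented:

/// EXPLAIN TABLE OVERRIDE mysql('127.0.0.1:3306', 'db', 'table', 'user', 'pass')
///     PARTITION BY toYYYYMM(created)
///
/// INSERT INTO t FROM INFILE 'data.csv.gz' COMPRESSION 'gzip' FORMAT CSV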
pos = before_values; ParserWatchQuery watch_p; watch_p.parse(pos, watch, expected); - - /// FORMAT section is expected if we have input() in SELECT part - if (s_format.ignore(pos, expected) && !name_p.parse(pos, format, expected)) - return false; } - else + else if (!infile) { - /// If all previous conditions were false, query is incorrect + /// If all previous conditions were false and it's not FROM INFILE, query is incorrect return false; } diff --git a/src/Processors/Executors/PipelineExecutor.h b/src/Processors/Executors/PipelineExecutor.h index 12f2bd8b75b..0b1fe5dedf6 100644 --- a/src/Processors/Executors/PipelineExecutor.h +++ b/src/Processors/Executors/PipelineExecutor.h @@ -26,7 +26,7 @@ public: /// During pipeline execution new processors can appear. They will be added to existing set. /// /// Explicit graph representation is built in constructor. Throws if graph is not correct. - explicit PipelineExecutor(Processors & processors, QueryStatus * elem = nullptr); + explicit PipelineExecutor(Processors & processors, QueryStatus * elem); ~PipelineExecutor(); /// Execute pipeline in multiple threads. Must be called once. diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp new file mode 100644 index 00000000000..096e39a2893 --- /dev/null +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -0,0 +1,160 @@ +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; +} + +IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_) + : ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_) +{ +} + +NamesAndTypesList IRowSchemaReader::readSchema() +{ + DataTypes data_types = readRowAndGetDataTypes(); + for (size_t row = 1; row < max_rows_to_read; ++row) + { + DataTypes new_data_types = readRowAndGetDataTypes(); + if (new_data_types.empty()) + /// We reached eof. + break; + + if (new_data_types.size() != data_types.size()) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Rows have different amount of values"); + + for (size_t i = 0; i != data_types.size(); ++i) + { + /// We couldn't determine the type of this column in a new row, just skip it. + if (!new_data_types[i]) + continue; + + /// If we couldn't determine the type of column yet, just set the new type. + if (!data_types[i]) + data_types[i] = new_data_types[i]; + /// If the new type and the previous type for this column are different, + /// we will use default type if we have it or throw an exception. + else if (data_types[i]->getName() != new_data_types[i]->getName()) + { + if (default_type) + data_types[i] = default_type; + else + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}", new_data_types[i]->getName(), i + 1, row, data_types[i]->getName()); + } + } + } + + /// Check that we read at list one column. + if (data_types.empty()) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot read rows from the data"); + + /// If column names weren't set, use default names 'c1', 'c2', ... + if (column_names.empty()) + { + column_names.reserve(data_types.size()); + for (size_t i = 0; i != data_types.size(); ++i) + column_names.push_back("c" + std::to_string(i + 1)); + } + /// If column names were set, check that the number of names match the number of types. 
+ else if (column_names.size() != data_types.size()) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The number of column names {} differs with the number of types {}", column_names.size(), data_types.size()); + + NamesAndTypesList result; + for (size_t i = 0; i != data_types.size(); ++i) + { + /// Check that we could determine the type of this column. + if (!data_types[i]) + { + if (!default_type) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot determine table structure by first {} rows of data, because some columns contain only Nulls. To increase the maximum " + "number of rows to read for structure determination, use setting input_format_max_rows_to_read_for_schema_inference", + max_rows_to_read); + + data_types[i] = default_type; + } + result.emplace_back(column_names[i], data_types[i]); + } + + return result; +} + +IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_) + : ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_) +{ +} + +NamesAndTypesList IRowWithNamesSchemaReader::readSchema() +{ + auto names_and_types = readRowAndGetNamesAndDataTypes(); + for (size_t row = 1; row < max_rows_to_read; ++row) + { + auto new_names_and_types = readRowAndGetNamesAndDataTypes(); + if (new_names_and_types.empty()) + /// We reached eof. + break; + + for (const auto & [name, new_type] : new_names_and_types) + { + auto it = names_and_types.find(name); + /// If we didn't see this column before, just add it. + if (it == names_and_types.end()) + { + names_and_types[name] = new_type; + continue; + } + + auto & type = it->second; + /// If we couldn't determine the type of column yet, just set the new type. + if (!type) + type = new_type; + /// If the new type and the previous type for this column are different, + /// we will use default type if we have it or throw an exception. + else if (new_type && type->getName() != new_type->getName()) + { + if (default_type) + type = default_type; + else + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}", type->getName(), name, row, new_type->getName()); + } + } + } + + /// Check that we read at list one column. + if (names_and_types.empty()) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot read rows from the data"); + + NamesAndTypesList result; + for (auto & [name, type] : names_and_types) + { + /// Check that we could determine the type of this column. + if (!type) + { + if (!default_type) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot determine table structure by first {} rows of data, because some columns contain only Nulls. To increase the maximum " + "number of rows to read for structure determination, use setting input_format_max_rows_to_read_for_schema_inference", + max_rows_to_read); + + type = default_type; + } + result.emplace_back(name, type); + } + + return result; +} + +} diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h new file mode 100644 index 00000000000..67a8eb88d61 --- /dev/null +++ b/src/Processors/Formats/ISchemaReader.h @@ -0,0 +1,87 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +/// Base class for schema inference for the data in some specific format. +/// It reads some data from read buffer and try to determine the schema +/// from read data. 
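IRowWithNamesSchemaReader applies the same reconciliation, but keyed by column name, so a column that is missing from one row simply keeps the type learned from the rows where it did appear. A self-contained sketch with the same std::optional<std::string> stand-in as above (invented names, illustration only):

    #include <map>
    #include <optional>
    #include <stdexcept>
    #include <string>

    using InferredType = std::optional<std::string>;

    // Toy model of IRowWithNamesSchemaReader::readSchema: each row contributes a
    // {column name -> type} map, and conflicts fall back to the default type or
    // raise, exactly as in the positional variant.
    void mergeNamedRow(
        std::map<std::string, InferredType> & accumulated,
        const std::map<std::string, InferredType> & new_row,
        const InferredType & default_type)
    {
        for (const auto & [name, new_type] : new_row)
        {
            auto it = accumulated.find(name);
            if (it == accumulated.end())
            {
                accumulated[name] = new_type;   // a column seen for the first time
                continue;
            }
            auto & type = it->second;
            if (!type)
                type = new_type;
            else if (new_type && *type != *new_type)
            {
                if (default_type)
                    type = default_type;
                else
                    throw std::runtime_error("Type of column '" + name + "' differs between rows");
            }
        }
    }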
+class ISchemaReader +{ +public: + ISchemaReader(ReadBuffer & in_) : in(in_) {} + + virtual NamesAndTypesList readSchema() = 0; + + virtual ~ISchemaReader() = default; + +protected: + ReadBuffer & in; +}; + +/// Base class for schema inference for formats that read data row by row. +/// It reads data row by row (up to max_rows_to_read), determines types of columns +/// for each row and compare them with types from the previous rows. If some column +/// contains values with different types in different rows, the default type will be +/// used for this column or the exception will be thrown (if default type is not set). +class IRowSchemaReader : public ISchemaReader +{ +public: + IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr); + NamesAndTypesList readSchema() override; + +protected: + /// Read one row and determine types of columns in it. + /// Return types in the same order in which the values were in the row. + /// If it's impossible to determine the type for some column, return nullptr for it. + /// Return empty list if can't read more data. + virtual DataTypes readRowAndGetDataTypes() = 0; + + void setColumnNames(const std::vector & names) { column_names = names; } + +private: + size_t max_rows_to_read; + DataTypePtr default_type; + std::vector column_names; +}; + +/// Base class for schema inference for formats that read data row by row and each +/// row contains column names and values (ex: JSONEachRow, TSKV). +/// Differ from IRowSchemaReader in that after reading a row we get +/// a map {column_name : type} and some columns may be missed in a single row +/// (in this case we will use types from the previous rows for missed columns). +class IRowWithNamesSchemaReader : public ISchemaReader +{ +public: + IRowWithNamesSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr); + NamesAndTypesList readSchema() override; + +protected: + /// Read one row and determine types of columns in it. + /// Return map {column_name : type}. + /// If it's impossible to determine the type for some column, return nullptr for it. + /// Return empty map is can't read more data. + virtual std::unordered_map readRowAndGetNamesAndDataTypes() = 0; + +private: + size_t max_rows_to_read; + DataTypePtr default_type; +}; + +/// Base class for schema inference for formats that don't need any data to +/// determine the schema: formats with constant schema (ex: JSONAsString, LineAsString) +/// and formats that use external format schema (ex: Protobuf, CapnProto). 
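A concrete subclass of IRowSchemaReader only has to implement readRowAndGetDataTypes(): split one row of its format into values and return a type (or nothing) per value. The following is a toy, self-contained analogue of that contract for a tab-separated row; it is not ClickHouse's actual inference logic, and all names are invented.

    #include <iostream>
    #include <optional>
    #include <sstream>
    #include <string>
    #include <vector>

    // nullopt plays the role of "type could not be determined from this row",
    // which the merging loops shown earlier resolve against later rows.
    using InferredType = std::optional<std::string>;

    std::vector<InferredType> inferTabSeparatedRow(const std::string & line)
    {
        std::vector<InferredType> types;
        std::istringstream row(line);
        std::string value;
        while (std::getline(row, value, '\t'))
        {
            if (value.empty() || value == "\\N")
                types.push_back(std::nullopt);   // a NULL tells us nothing about the type
            else if (value.find_first_not_of("+-0123456789") == std::string::npos)
                types.emplace_back("Int64");
            else if (value.find_first_not_of("+-0123456789.eE") == std::string::npos)
                types.emplace_back("Float64");
            else
                types.emplace_back("String");
        }
        return types;
    }

    int main()
    {
        for (const auto & t : inferTabSeparatedRow("1\t2.5\tabc\t\\N"))
            std::cout << (t ? *t : "undetermined") << '\n';   // Int64 Float64 String undetermined
    }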
+class IExternalSchemaReader +{ +public: + virtual NamesAndTypesList readSchema() = 0; + + virtual ~IExternalSchemaReader() = default; +}; + +} diff --git a/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp b/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp index 1f6b530d72f..4af2c651c39 100644 --- a/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp @@ -85,31 +85,38 @@ void ArrowBlockInputFormat::resetParser() record_batch_current = 0; } +static std::shared_ptr createStreamReader(ReadBuffer & in) +{ + auto stream_reader_status = arrow::ipc::RecordBatchStreamReader::Open(std::make_unique(in)); + if (!stream_reader_status.ok()) + throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, + "Error while opening a table: {}", stream_reader_status.status().ToString()); + return *stream_reader_status; +} + +static std::shared_ptr createFileReader(ReadBuffer & in, const FormatSettings & format_settings, std::atomic & is_stopped) +{ + auto arrow_file = asArrowFile(in, format_settings, is_stopped); + if (is_stopped) + return nullptr; + + auto file_reader_status = arrow::ipc::RecordBatchFileReader::Open(std::move(arrow_file)); + if (!file_reader_status.ok()) + throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, + "Error while opening a table: {}", file_reader_status.status().ToString()); + return *file_reader_status; +} + + void ArrowBlockInputFormat::prepareReader() { - std::shared_ptr schema; - if (stream) - { - auto stream_reader_status = arrow::ipc::RecordBatchStreamReader::Open(std::make_unique(*in)); - if (!stream_reader_status.ok()) - throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, - "Error while opening a table: {}", stream_reader_status.status().ToString()); - stream_reader = *stream_reader_status; - schema = stream_reader->schema(); - } + stream_reader = createStreamReader(*in); else { - auto arrow_file = asArrowFile(*in, format_settings, is_stopped); - if (is_stopped) + file_reader = createFileReader(*in, format_settings, is_stopped); + if (!file_reader) return; - - auto file_reader_status = arrow::ipc::RecordBatchFileReader::Open(std::move(arrow_file)); - if (!file_reader_status.ok()) - throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, - "Error while opening a table: {}", file_reader_status.status().ToString()); - file_reader = *file_reader_status; - schema = file_reader->schema(); } arrow_column_to_ch_column = std::make_unique(getPort().getHeader(), "Arrow", format_settings.arrow.import_nested); @@ -122,6 +129,27 @@ void ArrowBlockInputFormat::prepareReader() record_batch_current = 0; } +ArrowSchemaReader::ArrowSchemaReader(ReadBuffer & in_, bool stream_, const FormatSettings & format_settings_) + : ISchemaReader(in_), stream(stream_), format_settings(format_settings_) +{ +} + +NamesAndTypesList ArrowSchemaReader::readSchema() +{ + std::shared_ptr schema; + + if (stream) + schema = createStreamReader(in)->schema(); + else + { + std::atomic is_stopped = 0; + schema = createFileReader(in, format_settings, is_stopped)->schema(); + } + + auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, stream ? 
"ArrowStream" : "Arrow"); + return header.getNamesAndTypesList(); +} + void registerInputFormatArrow(FormatFactory & factory) { factory.registerInputFormat( @@ -145,6 +173,20 @@ void registerInputFormatArrow(FormatFactory & factory) }); } +void registerArrowSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader( + "Arrow", + [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, false, settings); + }); + factory.registerSchemaReader( + "ArrowStream", + [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, true, settings); + });} } #else @@ -154,6 +196,8 @@ class FormatFactory; void registerInputFormatArrow(FormatFactory &) { } + +void registerArrowSchemaReader(FormatFactory &) {} } #endif diff --git a/src/Processors/Formats/Impl/ArrowBlockInputFormat.h b/src/Processors/Formats/Impl/ArrowBlockInputFormat.h index bb8a000477c..62cbf949fc2 100644 --- a/src/Processors/Formats/Impl/ArrowBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ArrowBlockInputFormat.h @@ -4,6 +4,7 @@ #if USE_ARROW #include +#include #include namespace arrow { class RecordBatchReader; } @@ -51,6 +52,18 @@ private: std::atomic is_stopped{0}; }; +class ArrowSchemaReader : public ISchemaReader +{ +public: + ArrowSchemaReader(ReadBuffer & in_, bool stream_, const FormatSettings & format_settings_); + + NamesAndTypesList readSchema() override; + +private: + bool stream; + const FormatSettings format_settings; +}; + } #endif diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index 272907022a1..1edf5432c98 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -80,6 +80,9 @@ static ColumnWithTypeAndName readColumnWithNumericData(std::shared_ptr(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) { std::shared_ptr chunk = arrow_column->chunk(chunk_i); + if (chunk->length() == 0) + continue; + /// buffers[0] is a null bitmap and buffers[1] are actual values std::shared_ptr buffer = chunk->data()->buffers[1]; @@ -146,6 +149,9 @@ static ColumnWithTypeAndName readColumnWithBooleanData(std::shared_ptr(arrow_column->num_chunks()); chunk_i < num_chunks; ++chunk_i) { arrow::BooleanArray & chunk = dynamic_cast(*(arrow_column->chunk(chunk_i))); + if (chunk.length() == 0) + continue; + /// buffers[0] is a null bitmap and buffers[1] are actual values std::shared_ptr buffer = chunk.data()->buffers[1]; @@ -239,10 +245,8 @@ static ColumnWithTypeAndName readColumnWithTimestampData(std::shared_ptr -static ColumnWithTypeAndName readColumnWithDecimalData(std::shared_ptr & arrow_column, const String & column_name) +static ColumnWithTypeAndName readColumnWithDecimalDataImpl(std::shared_ptr & arrow_column, const String & column_name, DataTypePtr internal_type) { - const auto * arrow_decimal_type = static_cast(arrow_column->type().get()); - auto internal_type = std::make_shared>(arrow_decimal_type->precision(), arrow_decimal_type->scale()); auto internal_column = internal_type->createColumn(); auto & column = assert_cast &>(*internal_column); auto & column_data = column.getData(); @@ -259,6 +263,21 @@ static ColumnWithTypeAndName readColumnWithDecimalData(std::shared_ptr +static ColumnWithTypeAndName readColumnWithDecimalData(std::shared_ptr & arrow_column, const String & column_name) +{ + const auto * arrow_decimal_type = static_cast(arrow_column->type().get()); + size_t precision = 
arrow_decimal_type->precision(); + auto internal_type = createDecimal(precision, arrow_decimal_type->scale()); + if (precision <= DecimalUtils::max_precision) + return readColumnWithDecimalDataImpl(arrow_column, column_name, internal_type); + else if (precision <= DecimalUtils::max_precision) + return readColumnWithDecimalDataImpl(arrow_column, column_name, internal_type); + else if (precision <= DecimalUtils::max_precision) + return readColumnWithDecimalDataImpl(arrow_column, column_name, internal_type); + return readColumnWithDecimalDataImpl(arrow_column, column_name, internal_type); +} + /// Creates a null bytemap from arrow's null bitmap static ColumnPtr readByteMapFromArrowColumn(std::shared_ptr & arrow_column) { @@ -328,12 +347,13 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( const std::string & column_name, const std::string & format_name, bool is_nullable, - std::unordered_map> & dictionary_values) + std::unordered_map> & dictionary_values, + bool read_ints_as_dates) { if (!is_nullable && arrow_column->null_count() && arrow_column->type()->id() != arrow::Type::LIST && arrow_column->type()->id() != arrow::Type::MAP && arrow_column->type()->id() != arrow::Type::STRUCT) { - auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values); + auto nested_column = readColumnFromArrowColumn(arrow_column, column_name, format_name, true, dictionary_values, read_ints_as_dates); auto nullmap_column = readByteMapFromArrowColumn(arrow_column); auto nullable_type = std::make_shared(std::move(nested_column.type)); auto nullable_column = ColumnNullable::create(std::move(nested_column.column), std::move(nullmap_column)); @@ -358,25 +378,27 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( case arrow::Type::UINT16: { auto column = readColumnWithNumericData(arrow_column, column_name); - column.type = std::make_shared(); + if (read_ints_as_dates) + column.type = std::make_shared(); return column; } case arrow::Type::UINT32: { auto column = readColumnWithNumericData(arrow_column, column_name); - column.type = std::make_shared(); + if (read_ints_as_dates) + column.type = std::make_shared(); return column; } case arrow::Type::TIMESTAMP: return readColumnWithTimestampData(arrow_column, column_name); case arrow::Type::DECIMAL128: - return readColumnWithDecimalData(arrow_column, column_name); + return readColumnWithDecimalData(arrow_column, column_name); case arrow::Type::DECIMAL256: - return readColumnWithDecimalData(arrow_column, column_name); + return readColumnWithDecimalData(arrow_column, column_name); case arrow::Type::MAP: { auto arrow_nested_column = getNestedArrowColumn(arrow_column); - auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values); + auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); const auto * tuple_column = assert_cast(nested_column.column.get()); @@ -388,7 +410,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( case arrow::Type::LIST: { auto arrow_nested_column = getNestedArrowColumn(arrow_column); - auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values); + auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); auto offsets_column = 
readOffsetsFromArrowListColumn(arrow_column); auto array_column = ColumnArray::create(std::move(nested_column.column), std::move(offsets_column)); auto array_type = std::make_shared(nested_column.type); @@ -413,7 +435,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( for (int i = 0; i != arrow_struct_type->num_fields(); ++i) { auto nested_arrow_column = std::make_shared(nested_arrow_columns[i]); - auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values); + auto element = readColumnFromArrowColumn(nested_arrow_column, arrow_struct_type->field(i)->name(), format_name, false, dictionary_values, read_ints_as_dates); tuple_elements.emplace_back(std::move(element.column)); tuple_types.emplace_back(std::move(element.type)); tuple_names.emplace_back(std::move(element.name)); @@ -436,7 +458,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( dict_array.emplace_back(dict_chunk.dictionary()); } auto arrow_dict_column = std::make_shared(dict_array); - auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values); + auto dict_column = readColumnFromArrowColumn(arrow_dict_column, column_name, format_name, false, dictionary_values, read_ints_as_dates); /// We should convert read column to ColumnUnique. auto tmp_lc_column = DataTypeLowCardinality(dict_column.type).createColumn(); @@ -483,7 +505,7 @@ static void checkStatus(const arrow::Status & status, const String & column_name throw Exception{ErrorCodes::UNKNOWN_EXCEPTION, "Error with a {} column '{}': {}.", format_name, column_name, status.ToString()}; } -static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name) +Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name) { ColumnsWithTypeAndName sample_columns; for (const auto & field : schema.fields()) @@ -493,24 +515,21 @@ static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::stri std::unique_ptr array_builder; arrow::Status status = MakeBuilder(pool, field->type(), &array_builder); checkStatus(status, field->name(), format_name); + std::shared_ptr arrow_array; status = array_builder->Finish(&arrow_array); checkStatus(status, field->name(), format_name); + arrow::ArrayVector array_vector = {arrow_array}; auto arrow_column = std::make_shared(array_vector); std::unordered_map> dict_values; - ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values); + ColumnWithTypeAndName sample_column = readColumnFromArrowColumn(arrow_column, field->name(), format_name, false, dict_values, false); + sample_columns.emplace_back(std::move(sample_column)); } return Block(std::move(sample_columns)); } -ArrowColumnToCHColumn::ArrowColumnToCHColumn( - const arrow::Schema & schema, const std::string & format_name_, bool import_nested_) - : header(arrowSchemaToCHHeader(schema, format_name_)), format_name(format_name_), import_nested(import_nested_) -{ -} - ArrowColumnToCHColumn::ArrowColumnToCHColumn( const Block & header_, const std::string & format_name_, bool import_nested_) : header(header_), format_name(format_name_), import_nested(import_nested_) @@ -553,7 +572,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & if (!nested_tables.contains(nested_table_name)) { std::shared_ptr arrow_column = name_to_column_ptr[nested_table_name]; - 
ColumnsWithTypeAndName cols = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values)}; + ColumnsWithTypeAndName cols = {readColumnFromArrowColumn(arrow_column, nested_table_name, format_name, false, dictionary_values, true)}; Block block(cols); nested_tables[nested_table_name] = std::make_shared(Nested::flatten(block)); } @@ -573,7 +592,7 @@ void ArrowColumnToCHColumn::arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & if (read_from_nested) column = nested_tables[nested_table_name]->getByName(header_column.name); else - column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values); + column = readColumnFromArrowColumn(arrow_column, header_column.name, format_name, false, dictionary_values, true); try { diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h index 46976093f0b..58f8f1536b5 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.h @@ -23,16 +23,14 @@ public: ArrowColumnToCHColumn(const Block & header_, const std::string & format_name_, bool import_nested_); - /// Constructor that create header by arrow schema. It will be useful for inserting - /// data from file without knowing table structure. - ArrowColumnToCHColumn(const arrow::Schema & schema, const std::string & format_name, bool import_nested_); - void arrowTableToCHChunk(Chunk & res, std::shared_ptr & table); void arrowColumnsToCHChunk(Chunk & res, NameToColumnPtr & name_to_column_ptr); + static Block arrowSchemaToCHHeader(const arrow::Schema & schema, const std::string & format_name); + private: - const Block header; + const Block & header; const std::string format_name; bool import_nested; diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 11e56ecbe0c..a372df41344 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -815,6 +815,92 @@ const AvroDeserializer & AvroConfluentRowInputFormat::getOrCreateDeserializer(Sc return it->second; } +AvroSchemaReader::AvroSchemaReader(ReadBuffer & in_, bool confluent_, const FormatSettings & format_settings_) + : ISchemaReader(in_), confluent(confluent_), format_settings(format_settings_) +{ +} + +NamesAndTypesList AvroSchemaReader::readSchema() +{ + avro::NodePtr root_node; + if (confluent) + { + UInt32 schema_id = readConfluentSchemaId(in); + root_node = getConfluentSchemaRegistry(format_settings)->getSchema(schema_id).root(); + } + else + { + auto file_reader_ptr = std::make_unique(std::make_unique(in)); + root_node = file_reader_ptr->dataSchema().root(); + } + + if (root_node->type() != avro::Type::AVRO_RECORD) + throw Exception("Root schema must be a record", ErrorCodes::TYPE_MISMATCH); + + NamesAndTypesList names_and_types; + for (size_t i = 0; i != root_node->leaves(); ++i) + names_and_types.emplace_back(root_node->nameAt(i), avroNodeToDataType(root_node->leafAt(i))); + + return names_and_types; +} + +DataTypePtr AvroSchemaReader::avroNodeToDataType(avro::NodePtr node) +{ + switch (node->type()) + { + case avro::Type::AVRO_INT: + return {std::make_shared()}; + case avro::Type::AVRO_LONG: + return std::make_shared(); + case avro::Type::AVRO_BOOL: + return std::make_shared(); + case avro::Type::AVRO_FLOAT: + return std::make_shared(); + case avro::Type::AVRO_DOUBLE: + return std::make_shared(); + case 
avro::Type::AVRO_STRING: + return std::make_shared(); + case avro::Type::AVRO_BYTES: + return std::make_shared(); + case avro::Type::AVRO_ENUM: + { + if (node->names() < 128) + { + EnumValues::Values values; + for (size_t i = 0; i != node->names(); ++i) + values.emplace_back(node->nameAt(i), i); + return std::make_shared(std::move(values)); + } + else if (node->names() < 32768) + { + EnumValues::Values values; + for (size_t i = 0; i != node->names(); ++i) + values.emplace_back(node->nameAt(i), i); + return std::make_shared(std::move(values)); + } + + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ClickHouse supports only 8 and 16-bit Enum."); + } + case avro::Type::AVRO_FIXED: + return std::make_shared(node->fixedSize()); + case avro::Type::AVRO_ARRAY: + return std::make_shared(avroNodeToDataType(node->leafAt(0))); + case avro::Type::AVRO_NULL: + return std::make_shared(); + case avro::Type::AVRO_UNION: + if (node->leaves() == 2 && (node->leafAt(0)->type() == avro::Type::AVRO_NULL || node->leafAt(1)->type() == avro::Type::AVRO_NULL)) + { + size_t nested_leaf_index = node->leafAt(0)->type() == avro::Type::AVRO_NULL ? 1 : 0; + return makeNullable(avroNodeToDataType(node->leafAt(nested_leaf_index))); + } + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Avro type UNION is not supported for inserting."); + case avro::Type::AVRO_SYMBOLIC: + return avroNodeToDataType(avro::resolveSymbol(node)); + default: + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Avro column {} is not supported for inserting."); + } +} + void registerInputFormatAvro(FormatFactory & factory) { factory.registerInputFormat("Avro", []( @@ -836,6 +922,21 @@ void registerInputFormatAvro(FormatFactory & factory) }); } +void registerAvroSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("Avro", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, false, settings); + }); + + factory.registerSchemaReader("AvroConfluent", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, true, settings); + }); + +} + + } #else @@ -846,6 +947,8 @@ class FormatFactory; void registerInputFormatAvro(FormatFactory &) { } + +void registerAvroSchemaReader(FormatFactory &) {} } #endif diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.h b/src/Processors/Formats/Impl/AvroRowInputFormat.h index 73237369e56..46e571d87ec 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.h +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -160,6 +161,20 @@ private: FormatSettings format_settings; }; +class AvroSchemaReader : public ISchemaReader +{ +public: + AvroSchemaReader(ReadBuffer & in_, bool confluent_, const FormatSettings & format_settings_); + + NamesAndTypesList readSchema() override; + +private: + DataTypePtr avroNodeToDataType(avro::NodePtr node); + + bool confluent; + const FormatSettings format_settings; +}; + } #endif diff --git a/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp b/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp index 0506c539c0f..b356967a544 100644 --- a/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/BinaryRowInputFormat.cpp @@ -5,7 +5,6 @@ #include #include - namespace DB { @@ -15,11 +14,23 @@ namespace ErrorCodes } BinaryRowInputFormat::BinaryRowInputFormat(ReadBuffer & in_, Block header, Params params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_) - : 
RowInputFormatWithNamesAndTypes(std::move(header), in_, std::move(params_), with_names_, with_types_, format_settings_) + : RowInputFormatWithNamesAndTypes( + std::move(header), + in_, + std::move(params_), + with_names_, + with_types_, + format_settings_, + std::make_unique(in_, format_settings_)) { } -std::vector BinaryRowInputFormat::readHeaderRow() + +BinaryFormatReader::BinaryFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesReader(in_, format_settings_) +{ +} + +std::vector BinaryFormatReader::readHeaderRow() { std::vector fields; String field; @@ -31,13 +42,13 @@ std::vector BinaryRowInputFormat::readHeaderRow() return fields; } -std::vector BinaryRowInputFormat::readNames() +std::vector BinaryFormatReader::readNames() { readVarUInt(read_columns, *in); return readHeaderRow(); } -std::vector BinaryRowInputFormat::readTypes() +std::vector BinaryFormatReader::readTypes() { auto types = readHeaderRow(); for (const auto & type_name : types) @@ -45,31 +56,37 @@ std::vector BinaryRowInputFormat::readTypes() return types; } -bool BinaryRowInputFormat::readField(IColumn & column, const DataTypePtr & /*type*/, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & /*column_name*/) +bool BinaryFormatReader::readField(IColumn & column, const DataTypePtr & /*type*/, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & /*column_name*/) { serialization->deserializeBinary(column, *in); return true; } -void BinaryRowInputFormat::skipHeaderRow() +void BinaryFormatReader::skipHeaderRow() { String tmp; for (size_t i = 0; i < read_columns; ++i) readStringBinary(tmp, *in); } -void BinaryRowInputFormat::skipNames() +void BinaryFormatReader::skipNames() { readVarUInt(read_columns, *in); skipHeaderRow(); } -void BinaryRowInputFormat::skipTypes() +void BinaryFormatReader::skipTypes() { + if (read_columns == 0) + { + /// It's possible only when with_names = false and with_types = true + readVarUInt(read_columns, *in); + } + skipHeaderRow(); } -void BinaryRowInputFormat::skipField(size_t file_column) +void BinaryFormatReader::skipField(size_t file_column) { if (file_column >= read_data_types.size()) throw Exception(ErrorCodes::CANNOT_SKIP_UNKNOWN_FIELD, "Cannot skip unknown field in RowBinaryWithNames format, because it's type is unknown"); @@ -77,6 +94,11 @@ void BinaryRowInputFormat::skipField(size_t file_column) read_data_types[file_column]->getDefaultSerialization()->deserializeBinary(field, *in); } +BinaryWithNamesAndTypesSchemaReader::BinaryWithNamesAndTypesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) + : FormatWithNamesAndTypesSchemaReader(in_, 0, true, true, &reader), reader(in_, format_settings_) +{ +} + void registerInputFormatRowBinary(FormatFactory & factory) { auto register_func = [&](const String & format_name, bool with_names, bool with_types) @@ -94,4 +116,13 @@ void registerInputFormatRowBinary(FormatFactory & factory) registerWithNamesAndTypes("RowBinary", register_func); } +void registerRowBinaryWithNamesAndTypesSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("RowBinaryWithNamesAndTypes", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, settings); + }); +} + + } diff --git a/src/Processors/Formats/Impl/BinaryRowInputFormat.h b/src/Processors/Formats/Impl/BinaryRowInputFormat.h index 61d6df77522..d98e75bf621 100644 --- a/src/Processors/Formats/Impl/BinaryRowInputFormat.h +++ 
b/src/Processors/Formats/Impl/BinaryRowInputFormat.h @@ -1,15 +1,19 @@ #pragma once #include -#include #include +#include namespace DB { -class ReadBuffer; +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} +class ReadBuffer; /** A stream for inputting data in a binary line-by-line format. */ @@ -24,9 +28,15 @@ public: /// in this format we cannot provide any DiagnosticInfo, because here we have /// just binary data. std::string getDiagnosticInfo() override { return {}; } +}; + +class BinaryFormatReader : public FormatWithNamesAndTypesReader +{ +public: + BinaryFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_); -private: bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override; + void skipField(size_t file_column) override; void skipNames() override; @@ -37,9 +47,24 @@ private: std::vector readTypes() override; std::vector readHeaderRow(); +private: /// Data types read from input data. DataTypes read_data_types; - UInt64 read_columns = 0; + UInt64 read_columns; +}; + +class BinaryWithNamesAndTypesSchemaReader : public FormatWithNamesAndTypesSchemaReader +{ +public: + BinaryWithNamesAndTypesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + +private: + DataTypes readRowAndGetDataTypes() override + { + throw Exception{ErrorCodes::NOT_IMPLEMENTED, "Method readRowAndGetDataTypes is not implemented"}; + } + + BinaryFormatReader reader; }; } diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 9de2b908b1e..735a549d0a6 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -5,13 +5,16 @@ #include #include #include +#include +#include #include #include -#include +#include +#include + namespace DB { - namespace ErrorCodes { extern const int BAD_ARGUMENTS; @@ -26,7 +29,14 @@ CSVRowInputFormat::CSVRowInputFormat( bool with_names_, bool with_types_, const FormatSettings & format_settings_) - : RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_) + : RowInputFormatWithNamesAndTypes( + header_, + in_, + params_, + with_names_, + with_types_, + format_settings_, + std::make_unique(in_, format_settings_)) { const String bad_delimiters = " \t\"'.UL"; if (bad_delimiters.find(format_settings.csv.delimiter) != String::npos) @@ -36,6 +46,11 @@ CSVRowInputFormat::CSVRowInputFormat( ErrorCodes::BAD_ARGUMENTS); } +void CSVRowInputFormat::syncAfterError() +{ + skipToNextLineOrEOF(*in); +} + static void skipEndOfLine(ReadBuffer & in) { /// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic) @@ -52,8 +67,10 @@ static void skipEndOfLine(ReadBuffer & in) if (!in.eof() && *in.position() == '\n') ++in.position(); else - throw Exception("Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)." - " Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.", ErrorCodes::INCORRECT_DATA); + throw Exception( + "Cannot parse CSV format: found \\r (CR) not followed by \\n (LF)." + " Line must end by \\n (LF) or \\r\\n (CR LF) or \\n\\r.", + ErrorCodes::INCORRECT_DATA); } else if (!in.eof()) throw Exception("Expected end of line", ErrorCodes::INCORRECT_DATA); @@ -62,32 +79,38 @@ static void skipEndOfLine(ReadBuffer & in) /// Skip `whitespace` symbols allowed in CSV. 
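The RowBinary and CSV changes in this patch follow one pattern: the per-field reading logic moves out of the input format into a separate *FormatReader class, which the input format and the new schema reader then share. A minimal sketch of that composition, with toy types rather than the real interfaces:

    #include <memory>
    #include <string>
    #include <vector>

    // The low-level reader only knows how to pull names, types and raw field
    // values out of the stream; it does not care whether they become columns
    // or an inferred schema.
    struct ToyFormatReader
    {
        virtual std::vector<std::string> readNames() = 0;
        virtual std::vector<std::string> readTypes() = 0;
        virtual std::vector<std::string> readRow() = 0;
        virtual ~ToyFormatReader() = default;
    };

    // The input format owns a reader and deserializes rows into a known header.
    struct ToyInputFormat
    {
        explicit ToyInputFormat(std::unique_ptr<ToyFormatReader> reader_) : reader(std::move(reader_)) {}
        std::unique_ptr<ToyFormatReader> reader;
    };

    // The schema reader owns the same kind of reader, but uses the header rows
    // (or a handful of data rows) to infer column names and types instead.
    struct ToySchemaReader
    {
        explicit ToySchemaReader(std::unique_ptr<ToyFormatReader> reader_) : reader(std::move(reader_)) {}
        std::unique_ptr<ToyFormatReader> reader;
    };

This is why BinaryRowInputFormat, CSVRowInputFormat and CustomSeparatedRowInputFormat now pass a freshly constructed reader to RowInputFormatWithNamesAndTypes, while the corresponding *SchemaReader classes hold the same reader type as a member.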
static inline void skipWhitespacesAndTabs(ReadBuffer & in) { - while (!in.eof() - && (*in.position() == ' ' - || *in.position() == '\t')) + while (!in.eof() && (*in.position() == ' ' || *in.position() == '\t')) ++in.position(); } -void CSVRowInputFormat::skipFieldDelimiter() +CSVFormatReader::CSVFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesReader(in_, format_settings_) +{ +} + +void CSVFormatReader::skipFieldDelimiter() { skipWhitespacesAndTabs(*in); assertChar(format_settings.csv.delimiter, *in); } -String CSVRowInputFormat::readFieldIntoString() +template +String CSVFormatReader::readCSVFieldIntoString() { skipWhitespacesAndTabs(*in); String field; - readCSVString(field, *in, format_settings.csv); + if constexpr (read_string) + readCSVString(field, *in, format_settings.csv); + else + readCSVField(field, *in, format_settings.csv); return field; } -void CSVRowInputFormat::skipField() +void CSVFormatReader::skipField() { - readFieldIntoString(); + readCSVFieldIntoString(); } -void CSVRowInputFormat::skipRowEndDelimiter() +void CSVFormatReader::skipRowEndDelimiter() { skipWhitespacesAndTabs(*in); @@ -105,33 +128,32 @@ void CSVRowInputFormat::skipRowEndDelimiter() skipEndOfLine(*in); } -void CSVRowInputFormat::skipHeaderRow() +void CSVFormatReader::skipHeaderRow() { do { skipField(); skipWhitespacesAndTabs(*in); - } - while (checkChar(format_settings.csv.delimiter, *in)); + } while (checkChar(format_settings.csv.delimiter, *in)); skipRowEndDelimiter(); } -std::vector CSVRowInputFormat::readHeaderRow() +template +std::vector CSVFormatReader::readRowImpl() { std::vector fields; do { - fields.push_back(readFieldIntoString()); + fields.push_back(readCSVFieldIntoString()); skipWhitespacesAndTabs(*in); - } - while (checkChar(format_settings.csv.delimiter, *in)); + } while (checkChar(format_settings.csv.delimiter, *in)); skipRowEndDelimiter(); return fields; } -bool CSVRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) +bool CSVFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) { const char delimiter = format_settings.csv.delimiter; @@ -144,7 +166,8 @@ bool CSVRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) { if (*in->position() == '\n' || *in->position() == '\r') { - out << "ERROR: Line feed found where delimiter (" << delimiter << ") is expected." + out << "ERROR: Line feed found where delimiter (" << delimiter + << ") is expected." 
" It's like your file has less columns than expected.\n" "And if your file has the right number of columns, maybe it has unescaped quotes in values.\n"; } @@ -160,7 +183,7 @@ bool CSVRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) return true; } -bool CSVRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) +bool CSVFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) { skipWhitespacesAndTabs(*in); @@ -191,23 +214,21 @@ bool CSVRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) return true; } -void CSVRowInputFormat::syncAfterError() -{ - skipToNextLineOrEOF(*in); -} - -bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & /*column_name*/) +bool CSVFormatReader::readField( + IColumn & column, + const DataTypePtr & type, + const SerializationPtr & serialization, + bool is_last_file_column, + const String & /*column_name*/) { skipWhitespacesAndTabs(*in); const bool at_delimiter = !in->eof() && *in->position() == format_settings.csv.delimiter; - const bool at_last_column_line_end = is_last_file_column - && (in->eof() || *in->position() == '\n' || *in->position() == '\r'); + const bool at_last_column_line_end = is_last_file_column && (in->eof() || *in->position() == '\n' || *in->position() == '\r'); /// Note: Tuples are serialized in CSV as separate columns, but with empty_as_default or null_as_default /// only one empty or NULL column will be expected - if (format_settings.csv.empty_as_default - && (at_delimiter || at_last_column_line_end)) + if (format_settings.csv.empty_as_default && (at_delimiter || at_last_column_line_end)) { /// Treat empty unquoted column value as default value, if /// specified in the settings. 
Tuple columns might seem @@ -231,6 +252,31 @@ bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, co } } + +CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_, ContextPtr context_) + : FormatWithNamesAndTypesSchemaReader( + in_, + format_setting_.max_rows_to_read_for_schema_inference, + with_names_, + with_types_, + &reader, + getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::CSV)) + , reader(in_, format_setting_) + , context(context_) +{ +} + + +DataTypes CSVSchemaReader::readRowAndGetDataTypes() +{ + if (in.eof()) + return {}; + + auto fields = reader.readRow(); + return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), FormatSettings::EscapingRule::CSV, context); +} + + void registerInputFormatCSV(FormatFactory & factory) { auto register_func = [&](const String & format_name, bool with_names, bool with_types) @@ -326,4 +372,17 @@ void registerFileSegmentationEngineCSV(FormatFactory & factory) registerWithNamesAndTypes("CSV", register_func); } +void registerCSVSchemaReader(FormatFactory & factory) +{ + auto register_func = [&](const String & format_name, bool with_names, bool with_types) + { + factory.registerSchemaReader(format_name, [with_names, with_types](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + { + return std::make_shared(buf, with_names, with_types, settings, context); + }); + }; + + registerWithNamesAndTypes("CSV", register_func); +} + } diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.h b/src/Processors/Formats/Impl/CSVRowInputFormat.h index d7c557b58d8..d723647595e 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -5,6 +5,7 @@ #include #include +#include #include @@ -28,6 +29,12 @@ public: private: bool allowSyncAfterError() const override { return true; } void syncAfterError() override; +}; + +class CSVFormatReader : public FormatWithNamesAndTypesReader +{ +public: + CSVFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_); bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override; bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override; @@ -42,17 +49,34 @@ private: void skipField(size_t /*file_column*/) override { skipField(); } void skipField(); - void skipHeaderRow() ; + void skipHeaderRow(); void skipNames() override { skipHeaderRow(); } void skipTypes() override { skipHeaderRow(); } void skipFieldDelimiter() override; void skipRowEndDelimiter() override; - std::vector readHeaderRow(); std::vector readNames() override { return readHeaderRow(); } std::vector readTypes() override { return readHeaderRow(); } + std::vector readHeaderRow() { return readRowImpl(); } + std::vector readRow() { return readRowImpl(); } - String readFieldIntoString(); + template + std::vector readRowImpl(); + + template + String readCSVFieldIntoString(); +}; + +class CSVSchemaReader : public FormatWithNamesAndTypesSchemaReader +{ +public: + CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_setting_, ContextPtr context_); + +private: + DataTypes readRowAndGetDataTypes() override; + + CSVFormatReader reader; + ContextPtr context; }; } diff --git a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp index 4d000bb1f35..311f4742335 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp +++ 
b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp @@ -273,6 +273,7 @@ bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension #endif auto root_reader = msg.getRoot(root); + for (size_t i = 0; i != columns.size(); ++i) { auto value = getReaderByColumnName(root_reader, column_names[i]); @@ -282,6 +283,24 @@ bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension return true; } +CapnProtoSchemaReader::CapnProtoSchemaReader(const FormatSettings & format_settings_) : format_settings(format_settings_) +{ +} + +NamesAndTypesList CapnProtoSchemaReader::readSchema() +{ + auto schema_info = FormatSchemaInfo( + format_settings.schema.format_schema, + "CapnProto", + true, + format_settings.schema.is_server, + format_settings.schema.format_schema_path); + + auto schema_parser = CapnProtoSchemaParser(); + auto schema = schema_parser.getMessageSchema(schema_info); + return capnProtoSchemaToCHSchema(schema); +} + void registerInputFormatCapnProto(FormatFactory & factory) { factory.registerInputFormat( @@ -293,6 +312,14 @@ void registerInputFormatCapnProto(FormatFactory & factory) }); } +void registerCapnProtoSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("CapnProto", [](const FormatSettings & settings) + { + return std::make_shared(settings); + }); +} + } #else @@ -301,6 +328,7 @@ namespace DB { class FormatFactory; void registerInputFormatCapnProto(FormatFactory &) {} + void registerCapnProtoSchemaReader(FormatFactory &) {} } #endif // USE_CAPNP diff --git a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h index 4c0f34d70a3..053de14d1a4 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h +++ b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB { @@ -38,6 +39,17 @@ private: Names column_names; }; +class CapnProtoSchemaReader : public IExternalSchemaReader +{ +public: + explicit CapnProtoSchemaReader(const FormatSettings & format_settings_); + + NamesAndTypesList readSchema() override; + +private: + const FormatSettings format_settings; +}; + } #endif // USE_CAPNP diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp index 56aa4345777..d2e0d6e21a9 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp @@ -31,7 +31,7 @@ CustomSeparatedRowInputFormat::CustomSeparatedRowInputFormat( bool ignore_spaces_, const FormatSettings & format_settings_) : CustomSeparatedRowInputFormat( - header_, std::make_unique(in_buf_), params_, with_names_, with_types_, ignore_spaces_, format_settings_) + header_, std::make_unique(in_buf_), params_, with_names_, with_types_, ignore_spaces_, updateFormatSettings(format_settings_)) { } @@ -43,10 +43,15 @@ CustomSeparatedRowInputFormat::CustomSeparatedRowInputFormat( bool with_types_, bool ignore_spaces_, const FormatSettings & format_settings_) - : RowInputFormatWithNamesAndTypes(header_, *buf_, params_, with_names_, with_types_, updateFormatSettings(format_settings_)) + : RowInputFormatWithNamesAndTypes( + header_, + *buf_, + params_, + with_names_, + with_types_, + format_settings_, + std::make_unique(*buf_, ignore_spaces_, format_settings_)) , buf(std::move(buf_)) - , ignore_spaces(ignore_spaces_) - , escaping_rule(format_settings_.custom.escaping_rule) { /// In case of 
CustomSeparatedWithNames(AndTypes) formats and enabled setting input_format_with_names_use_header we don't know /// the exact number of columns in data (because it can contain unknown columns). So, if field_delimiter and row_after_delimiter are @@ -61,43 +66,76 @@ CustomSeparatedRowInputFormat::CustomSeparatedRowInputFormat( } } -void CustomSeparatedRowInputFormat::skipPrefixBeforeHeader() + +bool CustomSeparatedRowInputFormat::allowSyncAfterError() const +{ + return !format_settings.custom.row_after_delimiter.empty() || !format_settings.custom.row_between_delimiter.empty(); +} + +void CustomSeparatedRowInputFormat::syncAfterError() +{ + skipToNextRowOrEof(*buf, format_settings.custom.row_after_delimiter, format_settings.custom.row_between_delimiter, ignore_spaces); + end_of_stream = buf->eof(); + /// It can happen that buf->position() is not at the beginning of row + /// if some delimiters is similar to row_format.delimiters.back() and row_between_delimiter. + /// It will cause another parsing error. +} + +void CustomSeparatedRowInputFormat::setReadBuffer(ReadBuffer & in_) +{ + buf = std::make_unique(in_); + RowInputFormatWithNamesAndTypes::setReadBuffer(*buf); +} + +CustomSeparatedFormatReader::CustomSeparatedFormatReader( + PeekableReadBuffer & buf_, bool ignore_spaces_, const FormatSettings & format_settings_) + : FormatWithNamesAndTypesReader(buf_, format_settings_), buf(&buf_), ignore_spaces(ignore_spaces_) +{ +} + +void CustomSeparatedRowInputFormat::resetParser() +{ + RowInputFormatWithNamesAndTypes::resetParser(); + buf->reset(); +} + +void CustomSeparatedFormatReader::skipPrefixBeforeHeader() { skipSpaces(); assertString(format_settings.custom.result_before_delimiter, *buf); } -void CustomSeparatedRowInputFormat::skipRowStartDelimiter() +void CustomSeparatedFormatReader::skipRowStartDelimiter() { skipSpaces(); assertString(format_settings.custom.row_before_delimiter, *buf); } -void CustomSeparatedRowInputFormat::skipFieldDelimiter() +void CustomSeparatedFormatReader::skipFieldDelimiter() { skipSpaces(); assertString(format_settings.custom.field_delimiter, *buf); } -void CustomSeparatedRowInputFormat::skipRowEndDelimiter() +void CustomSeparatedFormatReader::skipRowEndDelimiter() { skipSpaces(); assertString(format_settings.custom.row_after_delimiter, *buf); } -void CustomSeparatedRowInputFormat::skipRowBetweenDelimiter() +void CustomSeparatedFormatReader::skipRowBetweenDelimiter() { skipSpaces(); assertString(format_settings.custom.row_between_delimiter, *buf); } -void CustomSeparatedRowInputFormat::skipField() +void CustomSeparatedFormatReader::skipField() { skipSpaces(); - skipFieldByEscapingRule(*buf, escaping_rule, format_settings); + skipFieldByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings); } -bool CustomSeparatedRowInputFormat::checkEndOfRow() +bool CustomSeparatedFormatReader::checkEndOfRow() { PeekableReadBufferCheckpoint checkpoint{*buf, true}; @@ -118,43 +156,66 @@ bool CustomSeparatedRowInputFormat::checkEndOfRow() return checkForSuffixImpl(true); } -std::vector CustomSeparatedRowInputFormat::readHeaderRow() +template +String CustomSeparatedFormatReader::readFieldIntoString(bool is_first) +{ + if (!is_first) + skipFieldDelimiter(); + skipSpaces(); + if constexpr (is_header) + return readStringByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings); + else + return readFieldByEscapingRule(*buf, format_settings.custom.escaping_rule, format_settings); +} + +template +std::vector CustomSeparatedFormatReader::readRowImpl() { 
std::vector values; skipRowStartDelimiter(); - do + + if (columns == 0) { - if (!values.empty()) - skipFieldDelimiter(); - skipSpaces(); - values.push_back(readStringByEscapingRule(*buf, escaping_rule, format_settings)); + do + { + values.push_back(readFieldIntoString(values.empty())); + } while (!checkEndOfRow()); + columns = values.size(); + } + else + { + for (size_t i = 0; i != columns; ++i) + values.push_back(readFieldIntoString(i == 0)); } - while (!checkEndOfRow()); skipRowEndDelimiter(); return values; } -void CustomSeparatedRowInputFormat::skipHeaderRow() +void CustomSeparatedFormatReader::skipHeaderRow() { - size_t columns = getPort().getHeader().columns(); skipRowStartDelimiter(); - for (size_t i = 0; i != columns; ++i) + bool first = true; + do { - skipField(); - if (i + 1 != columns) + if (!first) skipFieldDelimiter(); + first = false; + + skipField(); } + while (!checkEndOfRow()); + skipRowEndDelimiter(); } -bool CustomSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool, const String &) +bool CustomSeparatedFormatReader::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool, const String &) { skipSpaces(); - return deserializeFieldByEscapingRule(type, serialization, column, *buf, escaping_rule, format_settings); + return deserializeFieldByEscapingRule(type, serialization, column, *buf, format_settings.custom.escaping_rule, format_settings); } -bool CustomSeparatedRowInputFormat::checkForSuffixImpl(bool check_eof) +bool CustomSeparatedFormatReader::checkForSuffixImpl(bool check_eof) { skipSpaces(); if (format_settings.custom.result_after_delimiter.empty()) @@ -177,7 +238,7 @@ bool CustomSeparatedRowInputFormat::checkForSuffixImpl(bool check_eof) return false; } -bool CustomSeparatedRowInputFormat::tryParseSuffixWithDiagnosticInfo(WriteBuffer & out) +bool CustomSeparatedFormatReader::tryParseSuffixWithDiagnosticInfo(WriteBuffer & out) { PeekableReadBufferCheckpoint checkpoint{*buf}; if (checkForSuffixImpl(false)) @@ -192,7 +253,7 @@ bool CustomSeparatedRowInputFormat::tryParseSuffixWithDiagnosticInfo(WriteBuffer return true; } -bool CustomSeparatedRowInputFormat::checkForSuffix() +bool CustomSeparatedFormatReader::checkForSuffix() { PeekableReadBufferCheckpoint checkpoint{*buf}; if (checkForSuffixImpl(true)) @@ -201,51 +262,60 @@ bool CustomSeparatedRowInputFormat::checkForSuffix() return false; } - -bool CustomSeparatedRowInputFormat::allowSyncAfterError() const -{ - return !format_settings.custom.row_after_delimiter.empty() || !format_settings.custom.row_between_delimiter.empty(); -} - -void CustomSeparatedRowInputFormat::syncAfterError() -{ - skipToNextRowOrEof(*buf, format_settings.custom.row_after_delimiter, format_settings.custom.row_between_delimiter, ignore_spaces); - end_of_stream = buf->eof(); - /// It can happen that buf->position() is not at the beginning of row - /// if some delimiters is similar to row_format.delimiters.back() and row_between_delimiter. - /// It will cause another parsing error. 
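CustomSeparatedFormatReader::readRowImpl above determines the column count from the first row it reads and then reads exactly that many fields for every later row. A toy model of that behaviour, with std::function stand-ins for the escaping-rule-aware field reader and checkEndOfRow() (invented names, illustration only):

    #include <cstddef>
    #include <functional>
    #include <string>
    #include <vector>

    struct ToyRowReader
    {
        std::function<std::string()> read_field;   // read one delimited field
        std::function<bool()> end_of_row;          // true when the row delimiter follows
        std::size_t columns = 0;                   // 0 until the first row fixes it

        std::vector<std::string> readRow()
        {
            std::vector<std::string> values;
            if (columns == 0)
            {
                // First row: read until the end-of-row delimiter and remember
                // how many columns the data has.
                do
                    values.push_back(read_field());
                while (!end_of_row());
                columns = values.size();
            }
            else
            {
                // Later rows: read exactly the known number of fields, without
                // probing for the end-of-row delimiter after every field.
                for (std::size_t i = 0; i < columns; ++i)
                    values.push_back(read_field());
            }
            return values;
        }
    };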
-} - -bool CustomSeparatedRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuffer & out) +bool CustomSeparatedFormatReader::parseRowStartWithDiagnosticInfo(WriteBuffer & out) { return parseDelimiterWithDiagnosticInfo(out, *buf, format_settings.custom.row_before_delimiter, "delimiter before first field", ignore_spaces); } -bool CustomSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) +bool CustomSeparatedFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) { return parseDelimiterWithDiagnosticInfo(out, *buf, format_settings.custom.field_delimiter, "delimiter between fields", ignore_spaces); } -bool CustomSeparatedRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) +bool CustomSeparatedFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) { return parseDelimiterWithDiagnosticInfo(out, *buf, format_settings.custom.row_after_delimiter, "delimiter after last field", ignore_spaces); } -bool CustomSeparatedRowInputFormat::parseRowBetweenDelimiterWithDiagnosticInfo(WriteBuffer & out) +bool CustomSeparatedFormatReader::parseRowBetweenDelimiterWithDiagnosticInfo(WriteBuffer & out) { return parseDelimiterWithDiagnosticInfo(out, *buf, format_settings.custom.row_between_delimiter, "delimiter between rows", ignore_spaces); } -void CustomSeparatedRowInputFormat::resetParser() +void CustomSeparatedFormatReader::setReadBuffer(ReadBuffer & in_) { - RowInputFormatWithNamesAndTypes::resetParser(); - buf->reset(); + buf = assert_cast(&in_); + FormatWithNamesAndTypesReader::setReadBuffer(in_); } -void CustomSeparatedRowInputFormat::setReadBuffer(ReadBuffer & in_) +CustomSeparatedSchemaReader::CustomSeparatedSchemaReader( + ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_, ContextPtr context_) + : FormatWithNamesAndTypesSchemaReader( + buf, + format_setting_.max_rows_to_read_for_schema_inference, + with_names_, + with_types_, + &reader, + getDefaultDataTypeForEscapingRule(format_setting_.custom.escaping_rule)) + , buf(in_) + , reader(buf, ignore_spaces_, updateFormatSettings(format_setting_)) + , context(context_) { - buf = std::make_unique(in_); - IInputFormat::setReadBuffer(*buf); +} + +DataTypes CustomSeparatedSchemaReader::readRowAndGetDataTypes() +{ + if (reader.checkForSuffix()) + return {}; + + if (!first_row || with_names || with_types) + reader.skipRowBetweenDelimiter(); + + if (first_row) + first_row = false; + + auto fields = reader.readRow(); + return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule(), context); } void registerInputFormatCustomSeparated(FormatFactory & factory) @@ -267,4 +337,20 @@ void registerInputFormatCustomSeparated(FormatFactory & factory) } } +void registerCustomSeparatedSchemaReader(FormatFactory & factory) +{ + for (bool ignore_spaces : {false, true}) + { + auto register_func = [&](const String & format_name, bool with_names, bool with_types) + { + factory.registerSchemaReader(format_name, [with_names, with_types, ignore_spaces](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + { + return std::make_shared(buf, with_names, with_types, ignore_spaces, settings, context); + }); + }; + + registerWithNamesAndTypes(ignore_spaces ? 
"CustomSeparatedIgnoreSpaces" : "CustomSeparated", register_func); + } +} + } diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h index 6b572ca1417..d38d5bf0da4 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.h @@ -19,7 +19,6 @@ public: void resetParser() override; String getName() const override { return "CustomSeparatedRowInputFormat"; } - void setReadBuffer(ReadBuffer & in_) override; private: @@ -28,6 +27,19 @@ private: std::unique_ptr in_buf_, const Params & params_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_settings_); + + bool allowSyncAfterError() const override; + void syncAfterError() override; + + std::unique_ptr buf; + bool ignore_spaces; +}; + +class CustomSeparatedFormatReader : public FormatWithNamesAndTypesReader +{ +public: + CustomSeparatedFormatReader(PeekableReadBuffer & buf_, bool ignore_spaces_, const FormatSettings & format_settings_); + using EscapingRule = FormatSettings::EscapingRule; bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override; @@ -46,9 +58,6 @@ private: bool checkForSuffix() override; - bool allowSyncAfterError() const override; - void syncAfterError() override; - bool parseRowStartWithDiagnosticInfo(WriteBuffer & out) override; bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override; bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override; @@ -57,15 +66,41 @@ private: std::vector readNames() override { return readHeaderRow(); } std::vector readTypes() override { return readHeaderRow(); } - std::vector readHeaderRow(); + std::vector readHeaderRow() {return readRowImpl(); } + + std::vector readRow() { return readRowImpl(); } bool checkEndOfRow(); bool checkForSuffixImpl(bool check_eof); inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf); } - std::unique_ptr buf; + EscapingRule getEscapingRule() { return format_settings.custom.escaping_rule; } + + void setReadBuffer(ReadBuffer & in_) override; +private: + template + std::vector readRowImpl(); + + template + String readFieldIntoString(bool is_first); + + PeekableReadBuffer * buf; bool ignore_spaces; - EscapingRule escaping_rule; + size_t columns = 0; +}; + +class CustomSeparatedSchemaReader : public FormatWithNamesAndTypesSchemaReader +{ +public: + CustomSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool ignore_spaces_, const FormatSettings & format_setting_, ContextPtr context_); + +private: + DataTypes readRowAndGetDataTypes() override; + + PeekableReadBuffer buf; + CustomSeparatedFormatReader reader; + ContextPtr context; + bool first_row = true; }; } diff --git a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp index 476985c2509..56ba975dea1 100644 --- a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp @@ -202,4 +202,12 @@ void registerNonTrivialPrefixAndSuffixCheckerJSONAsString(FormatFactory & factor factory.registerNonTrivialPrefixAndSuffixChecker("JSONAsString", nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); } +void registerJSONAsStringSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("JSONAsString", [](const FormatSettings &) + { 
+ return std::make_shared(); + }); +} + } diff --git a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h index d86142af795..ea6e9a1ed2f 100644 --- a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h @@ -1,8 +1,10 @@ #pragma once #include +#include #include #include +#include namespace DB { @@ -39,4 +41,13 @@ private: bool allow_new_rows = true; }; +class JSONAsStringExternalSchemaReader : public IExternalSchemaReader +{ +public: + NamesAndTypesList readSchema() override + { + return {{"json", std::make_shared()}}; + } +}; + } diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp index 88fb411ffbd..263702ad20f 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -8,16 +9,13 @@ #include #include #include +#include +#include +#include namespace DB { -namespace ErrorCodes -{ - extern const int INCORRECT_DATA; -} - - JSONCompactEachRowRowInputFormat::JSONCompactEachRowRowInputFormat( const Block & header_, ReadBuffer & in_, @@ -26,24 +24,40 @@ JSONCompactEachRowRowInputFormat::JSONCompactEachRowRowInputFormat( bool with_types_, bool yield_strings_, const FormatSettings & format_settings_) - : RowInputFormatWithNamesAndTypes(header_, in_, std::move(params_), with_names_, with_types_, format_settings_) - , yield_strings(yield_strings_) + : RowInputFormatWithNamesAndTypes( + header_, + in_, + std::move(params_), + with_names_, + with_types_, + format_settings_, + std::make_unique(in_, yield_strings_, format_settings_)) { } -void JSONCompactEachRowRowInputFormat::skipRowStartDelimiter() +void JSONCompactEachRowRowInputFormat::syncAfterError() +{ + skipToUnescapedNextLineOrEOF(*in); +} + +JSONCompactEachRowFormatReader::JSONCompactEachRowFormatReader(ReadBuffer & in_, bool yield_strings_, const FormatSettings & format_settings_) + : FormatWithNamesAndTypesReader(in_, format_settings_), yield_strings(yield_strings_) +{ +} + +void JSONCompactEachRowFormatReader::skipRowStartDelimiter() { skipWhitespaceIfAny(*in); assertChar('[', *in); } -void JSONCompactEachRowRowInputFormat::skipFieldDelimiter() +void JSONCompactEachRowFormatReader::skipFieldDelimiter() { skipWhitespaceIfAny(*in); assertChar(',', *in); } -void JSONCompactEachRowRowInputFormat::skipRowEndDelimiter() +void JSONCompactEachRowFormatReader::skipRowEndDelimiter() { skipWhitespaceIfAny(*in); assertChar(']', *in); @@ -55,29 +69,18 @@ void JSONCompactEachRowRowInputFormat::skipRowEndDelimiter() skipWhitespaceIfAny(*in); } -String JSONCompactEachRowRowInputFormat::readFieldIntoString() +void JSONCompactEachRowFormatReader::skipField() { skipWhitespaceIfAny(*in); - String field; - readJSONString(field, *in); - return field; + skipJSONField(*in, "skipped_field"); } -void JSONCompactEachRowRowInputFormat::skipField(size_t file_column) -{ - skipWhitespaceIfAny(*in); - skipJSONField(*in, column_mapping->names_of_columns[file_column]); -} - -void JSONCompactEachRowRowInputFormat::skipHeaderRow() +void JSONCompactEachRowFormatReader::skipHeaderRow() { skipRowStartDelimiter(); - size_t i = 0; do { - if (i >= column_mapping->names_of_columns.size()) - throw Exception(ErrorCodes::INCORRECT_DATA, "The number of columns in a row differs from the number of column 
names"); - skipField(i++); + skipField(); skipWhitespaceIfAny(*in); } while (checkChar(',', *in)); @@ -85,13 +88,16 @@ void JSONCompactEachRowRowInputFormat::skipHeaderRow() skipRowEndDelimiter(); } -std::vector JSONCompactEachRowRowInputFormat::readHeaderRow() +std::vector JSONCompactEachRowFormatReader::readHeaderRow() { skipRowStartDelimiter(); std::vector fields; + String field; do { - fields.push_back(readFieldIntoString()); + skipWhitespaceIfAny(*in); + readJSONString(field, *in); + fields.push_back(field); skipWhitespaceIfAny(*in); } while (checkChar(',', *in)); @@ -100,18 +106,13 @@ std::vector JSONCompactEachRowRowInputFormat::readHeaderRow() return fields; } -bool JSONCompactEachRowRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & column_name) +bool JSONCompactEachRowFormatReader::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & column_name) { skipWhitespaceIfAny(*in); return readFieldImpl(*in, column, type, serialization, column_name, format_settings, yield_strings); } -void JSONCompactEachRowRowInputFormat::syncAfterError() -{ - skipToUnescapedNextLineOrEOF(*in); -} - -bool JSONCompactEachRowRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuffer & out) +bool JSONCompactEachRowFormatReader::parseRowStartWithDiagnosticInfo(WriteBuffer & out) { skipWhitespaceIfAny(*in); if (!checkChar('[', *in)) @@ -123,7 +124,7 @@ bool JSONCompactEachRowRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuff return true; } -bool JSONCompactEachRowRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) +bool JSONCompactEachRowFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) { try { @@ -150,7 +151,7 @@ bool JSONCompactEachRowRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(Wri return true; } -bool JSONCompactEachRowRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) +bool JSONCompactEachRowFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) { skipWhitespaceIfAny(*in); @@ -180,6 +181,20 @@ bool JSONCompactEachRowRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer return true; } +JSONCompactEachRowRowSchemaReader::JSONCompactEachRowRowSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_) + : FormatWithNamesAndTypesSchemaReader(in_, format_settings_.max_rows_to_read_for_schema_inference, with_names_, with_types_, &reader), reader(in_, yield_strings_, format_settings_) +{ +} + +DataTypes JSONCompactEachRowRowSchemaReader::readRowAndGetDataTypes() +{ + skipWhitespaceIfAny(in); + if (in.eof()) + return {}; + + return readRowAndGetDataTypesForJSONCompactEachRow(in, reader.yieldStrings()); +} + void registerInputFormatJSONCompactEachRow(FormatFactory & factory) { for (bool yield_strings : {true, false}) @@ -200,6 +215,21 @@ void registerInputFormatJSONCompactEachRow(FormatFactory & factory) } } +void registerJSONCompactEachRowSchemaReader(FormatFactory & factory) +{ + for (bool json_strings : {false, true}) + { + auto register_func = [&](const String & format_name, bool with_names, bool with_types) + { + factory.registerSchemaReader(format_name, [=](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, with_names, with_types, json_strings, settings); + }); + }; + registerWithNamesAndTypes(json_strings ? 
"JSONCompactStringsEachRow" : "JSONCompactEachRow", register_func); + } +} + void registerFileSegmentationEngineJSONCompactEachRow(FormatFactory & factory) { auto register_func = [&](const String & format_name, bool with_names, bool with_types) diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h index e01a4f49b30..0551aa8b64e 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -10,6 +11,7 @@ namespace DB class ReadBuffer; + /** A stream for reading data in a bunch of formats: * - JSONCompactEachRow * - JSONCompactEachRowWithNamesAndTypes @@ -34,6 +36,13 @@ public: private: bool allowSyncAfterError() const override { return true; } void syncAfterError() override; +}; + +class JSONCompactEachRowFormatReader : public FormatWithNamesAndTypesReader +{ +public: + JSONCompactEachRowFormatReader(ReadBuffer & in_, bool yield_strings_, const FormatSettings & format_settings_); + bool parseRowStartWithDiagnosticInfo(WriteBuffer & out) override; bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override; @@ -45,7 +54,8 @@ private: bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override; - void skipField(size_t file_column) override; + void skipField(size_t /*column_index*/) override { skipField(); } + void skipField(); void skipHeaderRow(); void skipNames() override { skipHeaderRow(); } void skipTypes() override { skipHeaderRow(); } @@ -56,9 +66,21 @@ private: std::vector readHeaderRow(); std::vector readNames() override { return readHeaderRow(); } std::vector readTypes() override { return readHeaderRow(); } - String readFieldIntoString(); + bool yieldStrings() const { return yield_strings; } +private: bool yield_strings; }; +class JSONCompactEachRowRowSchemaReader : public FormatWithNamesAndTypesSchemaReader +{ +public: + JSONCompactEachRowRowSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_); + +private: + DataTypes readRowAndGetDataTypes() override; + + JSONCompactEachRowFormatReader reader; +}; + } diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp index 28481313974..75beca955b9 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB { @@ -286,11 +287,7 @@ void JSONEachRowRowInputFormat::readPrefix() skipBOMIfExists(*in); skipWhitespaceIfAny(*in); - if (!in->eof() && *in->position() == '[') - { - ++in->position(); - data_in_square_brackets = true; - } + data_in_square_brackets = checkChar('[', *in); } void JSONEachRowRowInputFormat::readSuffix() @@ -309,6 +306,28 @@ void JSONEachRowRowInputFormat::readSuffix() assertEOF(*in); } +JSONEachRowSchemaReader::JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings_, const FormatSettings & format_settings) + : IRowWithNamesSchemaReader(in_, format_settings.max_rows_to_read_for_schema_inference), json_strings(json_strings_) +{ +} + + +std::unordered_map JSONEachRowSchemaReader::readRowAndGetNamesAndDataTypes() +{ + if (first_row) + { + skipBOMIfExists(in); + skipWhitespaceIfAny(in); + 
checkChar('[', in); + first_row = false; + } + + skipWhitespaceIfAny(in); + if (in.eof()) + return {}; + + return readRowAndGetNamesAndDataTypesForJSONEachRow(in, json_strings); +} void registerInputFormatJSONEachRow(FormatFactory & factory) { @@ -343,4 +362,17 @@ void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory factory.registerNonTrivialPrefixAndSuffixChecker("JSONStringsEachRow", nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); } +void registerJSONEachRowSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("JSONEachRow", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_unique(buf, false, settings); + }); + + factory.registerSchemaReader("JSONStringsEachRow", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_unique(buf, true, settings); + }); +} + } diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h index 9810f2dc765..323909a7730 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -84,4 +85,16 @@ private: bool yield_strings; }; +class JSONEachRowSchemaReader : public IRowWithNamesSchemaReader +{ +public: + JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings, const FormatSettings & format_settings); + +private: + std::unordered_map readRowAndGetNamesAndDataTypes() override; + + bool json_strings; + bool first_row = true; +}; + } diff --git a/src/Processors/Formats/Impl/LineAsStringRowInputFormat.cpp b/src/Processors/Formats/Impl/LineAsStringRowInputFormat.cpp index 1a05f61d36b..5983f3170e5 100644 --- a/src/Processors/Formats/Impl/LineAsStringRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/LineAsStringRowInputFormat.cpp @@ -72,4 +72,13 @@ void registerInputFormatLineAsString(FormatFactory & factory) return std::make_shared(sample, buf, params); }); } + +void registerLineAsStringSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("LineAsString", []( + const FormatSettings &) + { + return std::make_shared(); + }); +} } diff --git a/src/Processors/Formats/Impl/LineAsStringRowInputFormat.h b/src/Processors/Formats/Impl/LineAsStringRowInputFormat.h index 1a6c6247558..c4c17c47dbe 100644 --- a/src/Processors/Formats/Impl/LineAsStringRowInputFormat.h +++ b/src/Processors/Formats/Impl/LineAsStringRowInputFormat.h @@ -1,7 +1,9 @@ #pragma once #include +#include #include +#include namespace DB { @@ -26,4 +28,13 @@ private: void readLineObject(IColumn & column); }; +class LinaAsStringSchemaReader : public IExternalSchemaReader +{ +public: + NamesAndTypesList readSchema() override + { + return {{"line", std::make_shared()}}; + } +}; + } diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp index 60db32d879a..c56af536e15 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -26,6 +27,8 @@ namespace ErrorCodes { extern const int ILLEGAL_COLUMN; extern const int INCORRECT_DATA; + extern const int BAD_ARGUMENTS; + extern const int UNEXPECTED_END_OF_FILE; } MsgPackRowInputFormat::MsgPackRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_) @@ -369,7 +372,108 @@ bool 
MsgPackRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & void MsgPackRowInputFormat::setReadBuffer(ReadBuffer & in_) { buf = std::make_unique(in_); - IInputFormat::setReadBuffer(*buf); + IInputFormat::setReadBuffer(in_); +} + +MsgPackSchemaReader::MsgPackSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) + : IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference), buf(in_), number_of_columns(format_settings_.msgpack.number_of_columns) +{ + if (!number_of_columns) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "You must specify setting input_format_msgpack_number_of_columns to extract table schema from MsgPack data"); +} + + +msgpack::object_handle MsgPackSchemaReader::readObject() +{ + if (buf.eof()) + throw Exception(ErrorCodes::UNEXPECTED_END_OF_FILE, "Unexpected eof while parsing msgpack object"); + + PeekableReadBufferCheckpoint checkpoint{buf}; + size_t offset = 0; + bool need_more_data = true; + msgpack::object_handle object_handle; + while (need_more_data) + { + offset = 0; + try + { + object_handle = msgpack::unpack(buf.position(), buf.buffer().end() - buf.position(), offset); + need_more_data = false; + } + catch (msgpack::insufficient_bytes &) + { + buf.position() = buf.buffer().end(); + if (buf.eof()) + throw Exception("Unexpected end of file while parsing msgpack object", ErrorCodes::UNEXPECTED_END_OF_FILE); + buf.position() = buf.buffer().end(); + buf.makeContinuousMemoryFromCheckpointToPos(); + buf.rollbackToCheckpoint(); + } + } + buf.position() += offset; + return object_handle; +} + +DataTypePtr MsgPackSchemaReader::getDataType(const msgpack::object & object) +{ + switch (object.type) + { + case msgpack::type::object_type::POSITIVE_INTEGER: [[fallthrough]]; + case msgpack::type::object_type::NEGATIVE_INTEGER: + return makeNullable(std::make_shared()); + case msgpack::type::object_type::FLOAT32: + return makeNullable(std::make_shared()); + case msgpack::type::object_type::FLOAT64: + return makeNullable(std::make_shared()); + case msgpack::type::object_type::BOOLEAN: + return makeNullable(std::make_shared()); + case msgpack::type::object_type::BIN: [[fallthrough]]; + case msgpack::type::object_type::STR: + return makeNullable(std::make_shared()); + case msgpack::type::object_type::ARRAY: + { + msgpack::object_array object_array = object.via.array; + if (object_array.size) + { + auto nested_type = getDataType(object_array.ptr[0]); + if (nested_type) + return std::make_shared(getDataType(object_array.ptr[0])); + } + return nullptr; + } + case msgpack::type::object_type::MAP: + { + msgpack::object_map object_map = object.via.map; + if (object_map.size) + { + auto key_type = removeNullable(getDataType(object_map.ptr[0].key)); + auto value_type = getDataType(object_map.ptr[0].val); + if (key_type && value_type) + return std::make_shared(key_type, value_type); + } + return nullptr; + } + case msgpack::type::object_type::NIL: + return nullptr; + default: + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Msgpack type is not supported"); + } +} + +DataTypes MsgPackSchemaReader::readRowAndGetDataTypes() +{ + if (buf.eof()) + return {}; + + DataTypes data_types; + data_types.reserve(number_of_columns); + for (size_t i = 0; i != number_of_columns; ++i) + { + auto object_handle = readObject(); + data_types.push_back(getDataType(object_handle.get())); + } + + return data_types; } void registerInputFormatMsgPack(FormatFactory & factory) @@ -384,6 +488,14 @@ void registerInputFormatMsgPack(FormatFactory & factory) }); } +void 
registerMsgPackSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("MsgPack", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, settings); + }); +} + } #else @@ -394,6 +506,10 @@ class FormatFactory; void registerInputFormatMsgPack(FormatFactory &) { } + +void registerMsgPackSchemaReader(FormatFactory &) +{ +} } #endif diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.h b/src/Processors/Formats/Impl/MsgPackRowInputFormat.h index bb3887695eb..dd5655c80fc 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.h +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.h @@ -6,6 +6,7 @@ #if USE_MSGPACK #include +#include #include #include #include @@ -76,6 +77,20 @@ private: const DataTypes data_types; }; +class MsgPackSchemaReader : public IRowSchemaReader +{ +public: + MsgPackSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + +private: + msgpack::object_handle readObject(); + DataTypePtr getDataType(const msgpack::object & object); + DataTypes readRowAndGetDataTypes() override; + + PeekableReadBuffer buf; + UInt64 number_of_columns; +}; + } #endif diff --git a/src/Processors/Formats/Impl/NativeFormat.cpp b/src/Processors/Formats/Impl/NativeFormat.cpp index 07cf4670981..19e2ede6b65 100644 --- a/src/Processors/Formats/Impl/NativeFormat.cpp +++ b/src/Processors/Formats/Impl/NativeFormat.cpp @@ -1,8 +1,10 @@ #include #include + #include #include #include +#include #include @@ -82,6 +84,20 @@ private: NativeWriter writer; }; +class NativeSchemaReader : public ISchemaReader +{ +public: + explicit NativeSchemaReader(ReadBuffer & in_) : ISchemaReader(in_) {} + + NamesAndTypesList readSchema() override + { + auto reader = NativeReader(in, 0); + auto block = reader.read(); + return block.getNamesAndTypesList(); + } +}; + + void registerInputFormatNative(FormatFactory & factory) { factory.registerInputFormat("Native", []( @@ -106,4 +122,14 @@ void registerOutputFormatNative(FormatFactory & factory) }); } + +void registerNativeSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("Native", [](ReadBuffer & buf, const FormatSettings &, ContextPtr) + { + return std::make_shared(buf); + }); +} + + } diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp index 87eec459aa3..9a787e5a614 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp @@ -87,9 +87,14 @@ static size_t countIndicesForType(std::shared_ptr type) return 1; } -void ORCBlockInputFormat::prepareReader() +static void getFileReaderAndSchema( + ReadBuffer & in, + std::unique_ptr & file_reader, + std::shared_ptr & schema, + const FormatSettings & format_settings, + std::atomic & is_stopped) { - auto arrow_file = asArrowFile(*in, format_settings, is_stopped); + auto arrow_file = asArrowFile(in, format_settings, is_stopped); if (is_stopped) return; @@ -101,7 +106,15 @@ void ORCBlockInputFormat::prepareReader() auto read_schema_result = file_reader->ReadSchema(); if (!read_schema_result.ok()) throw Exception(read_schema_result.status().ToString(), ErrorCodes::BAD_ARGUMENTS); - std::shared_ptr schema = std::move(read_schema_result).ValueOrDie(); + schema = std::move(read_schema_result).ValueOrDie(); +} + +void ORCBlockInputFormat::prepareReader() +{ + std::shared_ptr schema; + getFileReaderAndSchema(*in, file_reader, schema, format_settings, is_stopped); + if (is_stopped) + return; 
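/// The schema fetched above keeps feeding the column selection logic below unchanged; the new
/// ORCSchemaReader::readSchema() reuses the same getFileReaderAndSchema() helper to turn that
/// schema into a ClickHouse header via ArrowColumnToCHColumn::arrowSchemaToCHHeader().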
arrow_column_to_ch_column = std::make_unique(getPort().getHeader(), "ORC", format_settings.orc.import_nested); @@ -128,7 +141,21 @@ void ORCBlockInputFormat::prepareReader() } } -void registerInputFormatORC(FormatFactory &factory) +ORCSchemaReader::ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : ISchemaReader(in_), format_settings(format_settings_) +{ +} + +NamesAndTypesList ORCSchemaReader::readSchema() +{ + std::unique_ptr file_reader; + std::shared_ptr schema; + std::atomic is_stopped = 0; + getFileReaderAndSchema(in, file_reader, schema, format_settings, is_stopped); + auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, "ORC"); + return header.getNamesAndTypesList(); +} + +void registerInputFormatORC(FormatFactory & factory) { factory.registerInputFormat( "ORC", @@ -142,6 +169,17 @@ void registerInputFormatORC(FormatFactory &factory) factory.markFormatAsColumnOriented("ORC"); } +void registerORCSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader( + "ORC", + [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, settings); + } + ); +} + } #else @@ -151,6 +189,10 @@ namespace DB void registerInputFormatORC(FormatFactory &) { } + + void registerORCSchemaReader(FormatFactory &) + { + } } #endif diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.h b/src/Processors/Formats/Impl/ORCBlockInputFormat.h index c7dc1c4a710..9b55747f552 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.h @@ -3,6 +3,7 @@ #if USE_ORC #include +#include #include #include @@ -54,5 +55,16 @@ private: std::atomic is_stopped{0}; }; +class ORCSchemaReader : public ISchemaReader +{ +public: + ORCSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + + NamesAndTypesList readSchema() override; + +private: + const FormatSettings format_settings; +}; + } #endif diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp index 4c8f6ab2c54..651b9545c81 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp @@ -87,6 +87,7 @@ ORC_UNIQUE_PTR ORCBlockOutputFormat::getORCType(const DataTypePtr & t { return orc::createPrimitiveType(orc::TypeKind::DOUBLE); } + case TypeIndex::Date32: [[fallthrough]]; case TypeIndex::Date: { return orc::createPrimitiveType(orc::TypeKind::DATE); @@ -292,6 +293,7 @@ void ORCBlockOutputFormat::writeColumn( writeNumbers(orc_column, column, null_bytemap, [](const UInt16 & value){ return value; }); break; } + case TypeIndex::Date32: [[fallthrough]]; case TypeIndex::Int32: { writeNumbers(orc_column, column, null_bytemap, [](const Int32 & value){ return value; }); diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 0b6cd006300..1d303014d31 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -94,19 +94,30 @@ static size_t countIndicesForType(std::shared_ptr type) return 1; } +static void getFileReaderAndSchema( + ReadBuffer & in, + std::unique_ptr & file_reader, + std::shared_ptr & schema, + const FormatSettings & format_settings, + std::atomic & is_stopped) +{ + auto arrow_file = asArrowFile(in, format_settings, is_stopped); + if (is_stopped) + return; + THROW_ARROW_NOT_OK(parquet::arrow::OpenFile(std::move(arrow_file), 
arrow::default_memory_pool(), &file_reader)); + THROW_ARROW_NOT_OK(file_reader->GetSchema(&schema)); +} + void ParquetBlockInputFormat::prepareReader() { - auto arrow_file = asArrowFile(*in, format_settings, is_stopped); + std::shared_ptr schema; + getFileReaderAndSchema(*in, file_reader, schema, format_settings, is_stopped); if (is_stopped) return; - THROW_ARROW_NOT_OK(parquet::arrow::OpenFile(std::move(arrow_file), arrow::default_memory_pool(), &file_reader)); row_group_total = file_reader->num_row_groups(); row_group_current = 0; - std::shared_ptr schema; - THROW_ARROW_NOT_OK(file_reader->GetSchema(&schema)); - arrow_column_to_ch_column = std::make_unique(getPort().getHeader(), "Parquet", format_settings.parquet.import_nested); std::unordered_set nested_table_names; @@ -130,7 +141,21 @@ void ParquetBlockInputFormat::prepareReader() } } -void registerInputFormatParquet(FormatFactory &factory) +ParquetSchemaReader::ParquetSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) : ISchemaReader(in_), format_settings(format_settings_) +{ +} + +NamesAndTypesList ParquetSchemaReader::readSchema() +{ + std::unique_ptr file_reader; + std::shared_ptr schema; + std::atomic is_stopped = 0; + getFileReaderAndSchema(in, file_reader, schema, format_settings, is_stopped); + auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(*schema, "Parquet"); + return header.getNamesAndTypesList(); +} + +void registerInputFormatParquet(FormatFactory & factory) { factory.registerInputFormat( "Parquet", @@ -144,6 +169,17 @@ void registerInputFormatParquet(FormatFactory &factory) factory.markFormatAsColumnOriented("Parquet"); } +void registerParquetSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader( + "Parquet", + [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, settings); + } + ); +} + } #else @@ -154,6 +190,8 @@ class FormatFactory; void registerInputFormatParquet(FormatFactory &) { } + +void registerParquetSchemaReader(FormatFactory &) {} } #endif diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h index 3e04c523442..dbc99c08a35 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h @@ -3,6 +3,7 @@ #if USE_PARQUET #include +#include #include namespace parquet::arrow { class FileReader; } @@ -44,6 +45,17 @@ private: std::atomic is_stopped{0}; }; +class ParquetSchemaReader : public ISchemaReader +{ +public: + ParquetSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + + NamesAndTypesList readSchema() override; + +private: + const FormatSettings format_settings; +}; + } #endif diff --git a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp index df7b7102739..66da27e8829 100644 --- a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp @@ -73,6 +73,34 @@ void registerInputFormatProtobuf(FormatFactory & factory) } } +ProtobufSchemaReader::ProtobufSchemaReader(const FormatSettings & format_settings) + : schema_info( + format_settings.schema.format_schema, + "Protobuf", + true, + format_settings.schema.is_server, + format_settings.schema.format_schema_path) +{ +} + +NamesAndTypesList ProtobufSchemaReader::readSchema() +{ + const auto * message_descriptor = ProtobufSchemas::instance().getMessageTypeForFormatSchema(schema_info); + return 
protobufSchemaToCHSchema(message_descriptor); +} + +void registerProtobufSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("Protobuf", [](const FormatSettings & settings) + { + return std::make_shared(settings); + }); + factory.registerExternalSchemaReader("ProtobufSingle", [](const FormatSettings & settings) + { + return std::make_shared(settings); + }); +} + } #else @@ -81,6 +109,8 @@ namespace DB { class FormatFactory; void registerInputFormatProtobuf(FormatFactory &) {} + +void registerProtobufSchemaReader(FormatFactory &) {} } #endif diff --git a/src/Processors/Formats/Impl/ProtobufRowInputFormat.h b/src/Processors/Formats/Impl/ProtobufRowInputFormat.h index 6f465e3f0b8..d7d16d36ddf 100644 --- a/src/Processors/Formats/Impl/ProtobufRowInputFormat.h +++ b/src/Processors/Formats/Impl/ProtobufRowInputFormat.h @@ -3,7 +3,9 @@ #include "config_formats.h" #if USE_PROTOBUF +# include # include +# include namespace DB { @@ -42,5 +44,16 @@ private: std::unique_ptr serializer; }; +class ProtobufSchemaReader : public IExternalSchemaReader +{ +public: + explicit ProtobufSchemaReader(const FormatSettings & format_settings); + + NamesAndTypesList readSchema() override; + +private: + FormatSchemaInfo schema_info; +}; + } #endif diff --git a/src/Processors/Formats/Impl/RawBLOBRowInputFormat.cpp b/src/Processors/Formats/Impl/RawBLOBRowInputFormat.cpp index 34424fffd34..91b1cc60fae 100644 --- a/src/Processors/Formats/Impl/RawBLOBRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/RawBLOBRowInputFormat.cpp @@ -51,5 +51,14 @@ void registerInputFormatRawBLOB(FormatFactory & factory) }); } +void registerRawBLOBSchemaReader(FormatFactory & factory) +{ + factory.registerExternalSchemaReader("RawBLOB", []( + const FormatSettings &) + { + return std::make_shared(); + }); +} + } diff --git a/src/Processors/Formats/Impl/RawBLOBRowInputFormat.h b/src/Processors/Formats/Impl/RawBLOBRowInputFormat.h index 343af9f4068..367ca04f9d8 100644 --- a/src/Processors/Formats/Impl/RawBLOBRowInputFormat.h +++ b/src/Processors/Formats/Impl/RawBLOBRowInputFormat.h @@ -1,6 +1,8 @@ #pragma once #include +#include +#include namespace DB @@ -22,5 +24,14 @@ private: bool readRow(MutableColumns & columns, RowReadExtension &) override; }; +class RawBLOBSchemaReader: public IExternalSchemaReader +{ +public: + NamesAndTypesList readSchema() override + { + return {{"raw_blob", std::make_shared()}}; + } +}; + } diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp index 279ae89aba5..90db6f6f0ec 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp @@ -14,18 +14,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -RegexpRowInputFormat::RegexpRowInputFormat(ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_) - : RegexpRowInputFormat(std::make_unique(in_), header_, params_, format_settings_) -{ -} - -RegexpRowInputFormat::RegexpRowInputFormat( - std::unique_ptr buf_, const Block & header_, Params params_, const FormatSettings & format_settings_) - : IRowInputFormat(header_, *buf_, std::move(params_)) - , buf(std::move(buf_)) - , format_settings(format_settings_) - , escaping_rule(format_settings_.regexp.escaping_rule) - , regexp(format_settings_.regexp.regexp) +RegexpFieldExtractor::RegexpFieldExtractor(const FormatSettings & format_settings) : regexp(format_settings.regexp.regexp), 
skip_unmatched(format_settings.regexp.skip_unmatched) { size_t fields_count = regexp.NumberOfCapturingGroups(); matched_fields.resize(fields_count); @@ -40,6 +29,50 @@ RegexpRowInputFormat::RegexpRowInputFormat( } } +bool RegexpFieldExtractor::parseRow(PeekableReadBuffer & buf) +{ + PeekableReadBufferCheckpoint checkpoint{buf}; + + size_t line_size = 0; + + do + { + char * pos = find_first_symbols<'\n', '\r'>(buf.position(), buf.buffer().end()); + line_size += pos - buf.position(); + buf.position() = pos; + } while (buf.position() == buf.buffer().end() && !buf.eof()); + + buf.makeContinuousMemoryFromCheckpointToPos(); + buf.rollbackToCheckpoint(); + + bool match = RE2::FullMatchN(re2::StringPiece(buf.position(), line_size), regexp, re2_arguments_ptrs.data(), re2_arguments_ptrs.size()); + + if (!match && !skip_unmatched) + throw Exception("Line \"" + std::string(buf.position(), line_size) + "\" doesn't match the regexp.", ErrorCodes::INCORRECT_DATA); + + buf.position() += line_size; + checkChar('\r', buf); + if (!buf.eof() && !checkChar('\n', buf)) + throw Exception("No \\n after \\r at the end of line.", ErrorCodes::INCORRECT_DATA); + + return match; +} + +RegexpRowInputFormat::RegexpRowInputFormat( + ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_) + : RegexpRowInputFormat(std::make_unique(in_), header_, params_, format_settings_) +{ +} + +RegexpRowInputFormat::RegexpRowInputFormat( + std::unique_ptr buf_, const Block & header_, Params params_, const FormatSettings & format_settings_) + : IRowInputFormat(header_, *buf_, std::move(params_)) + , buf(std::move(buf_)) + , format_settings(format_settings_) + , escaping_rule(format_settings_.regexp.escaping_rule) + , field_extractor(RegexpFieldExtractor(format_settings_)) +{ +} void RegexpRowInputFormat::resetParser() { @@ -50,7 +83,8 @@ void RegexpRowInputFormat::resetParser() bool RegexpRowInputFormat::readField(size_t index, MutableColumns & columns) { const auto & type = getPort().getHeader().getByPosition(index).type; - ReadBuffer field_buf(const_cast(matched_fields[index].data()), matched_fields[index].size(), 0); + auto matched_field = field_extractor.getField(index); + ReadBuffer field_buf(const_cast(matched_field.data()), matched_field.size(), 0); try { return deserializeFieldByEscapingRule(type, serializations[index], *columns[index], field_buf, escaping_rule, format_settings); @@ -64,7 +98,7 @@ bool RegexpRowInputFormat::readField(size_t index, MutableColumns & columns) void RegexpRowInputFormat::readFieldsFromMatch(MutableColumns & columns, RowReadExtension & ext) { - if (matched_fields.size() != columns.size()) + if (field_extractor.getMatchedFieldsSize() != columns.size()) throw Exception("The number of matched fields in line doesn't match the number of columns.", ErrorCodes::INCORRECT_DATA); ext.read_columns.assign(columns.size(), false); @@ -79,39 +113,8 @@ bool RegexpRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & if (buf->eof()) return false; - PeekableReadBufferCheckpoint checkpoint{*buf}; - - size_t line_size = 0; - - do - { - char * pos = find_first_symbols<'\n', '\r'>(buf->position(), buf->buffer().end()); - line_size += pos - buf->position(); - buf->position() = pos; - } while (buf->position() == buf->buffer().end() && !buf->eof()); - - buf->makeContinuousMemoryFromCheckpointToPos(); - buf->rollbackToCheckpoint(); - - bool match = RE2::FullMatchN(re2::StringPiece(buf->position(), line_size), regexp, re2_arguments_ptrs.data(), re2_arguments_ptrs.size()); 
- bool read_line = true; - - if (!match) - { - if (!format_settings.regexp.skip_unmatched) - throw Exception("Line \"" + std::string(buf->position(), line_size) + "\" doesn't match the regexp.", ErrorCodes::INCORRECT_DATA); - read_line = false; - } - - if (read_line) + if (field_extractor.parseRow(*buf)) readFieldsFromMatch(columns, ext); - - buf->position() += line_size; - - checkChar('\r', *buf); - if (!buf->eof() && !checkChar('\n', *buf)) - throw Exception("No \\n after \\r at the end of line.", ErrorCodes::INCORRECT_DATA); - return true; } @@ -121,6 +124,36 @@ void RegexpRowInputFormat::setReadBuffer(ReadBuffer & in_) IInputFormat::setReadBuffer(*buf); } +RegexpSchemaReader::RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, ContextPtr context_) + : IRowSchemaReader( + buf, + format_settings_.max_rows_to_read_for_schema_inference, + getDefaultDataTypeForEscapingRule(format_settings_.regexp.escaping_rule)) + , format_settings(format_settings_) + , field_extractor(format_settings) + , buf(in_) + , context(context_) +{ +} + +DataTypes RegexpSchemaReader::readRowAndGetDataTypes() +{ + if (buf.eof()) + return {}; + + field_extractor.parseRow(buf); + + DataTypes data_types; + data_types.reserve(field_extractor.getMatchedFieldsSize()); + for (size_t i = 0; i != field_extractor.getMatchedFieldsSize(); ++i) + { + String field(field_extractor.getField(i)); + data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, format_settings.regexp.escaping_rule, context)); + } + + return data_types; +} + void registerInputFormatRegexp(FormatFactory & factory) { factory.registerInputFormat("Regexp", []( @@ -172,4 +205,12 @@ void registerFileSegmentationEngineRegexp(FormatFactory & factory) factory.registerFileSegmentationEngine("Regexp", &fileSegmentationEngineRegexpImpl); } +void registerRegexpSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("Regexp", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + { + return std::make_shared(buf, settings, context); + }); +} + } diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.h b/src/Processors/Formats/Impl/RegexpRowInputFormat.h index dbce31a9b49..dffd2f82e02 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.h +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,29 @@ namespace DB class ReadBuffer; +/// Class for extracting row fields from data by regexp. +class RegexpFieldExtractor +{ +public: + RegexpFieldExtractor(const FormatSettings & format_settings); + + /// Return true if row was successfully parsed and row fields were extracted. + bool parseRow(PeekableReadBuffer & buf); + + re2::StringPiece getField(size_t index) { return matched_fields[index]; } + size_t getMatchedFieldsSize() const { return matched_fields.size(); } + size_t getNumberOfGroups() const { return regexp.NumberOfCapturingGroups(); } + +private: + const RE2 regexp; + // The vector of fields extracted from line using regexp. + std::vector matched_fields; + // These two vectors are needed to use RE2::FullMatchN (function for extracting fields). + std::vector re2_arguments; + std::vector re2_arguments_ptrs; + bool skip_unmatched; +}; + /// Regexp input format. /// This format applies regular expression from format_regexp setting for every line of file /// (the lines must be separated by newline character ('\n') or DOS-style newline ("\r\n")). 
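A minimal sketch of how RegexpFieldExtractor is driven, following the same pattern as RegexpRowInputFormat::readRow and RegexpSchemaReader::readRowAndGetDataTypes above; the helper name and the surrounding ReadBuffer/FormatSettings setup are assumed for illustration and are not part of the patch:

    static void scanWithRegexp(ReadBuffer & in, const FormatSettings & format_settings)
    {
        RegexpFieldExtractor extractor(format_settings);   /// compiles format_settings.regexp.regexp
        PeekableReadBuffer buf(in);
        while (!buf.eof())
        {
            /// parseRow() consumes one line; it returns false only for non-matching lines
            /// when format_settings.regexp.skip_unmatched is set, otherwise it throws.
            if (!extractor.parseRow(buf))
                continue;
            for (size_t i = 0; i != extractor.getMatchedFieldsSize(); ++i)
            {
                re2::StringPiece field = extractor.getField(i);
                /// deserialize `field` according to the configured escaping rule,
                /// or feed it to type inference as RegexpSchemaReader does
            }
        }
    }
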
@@ -25,7 +49,6 @@ class ReadBuffer; class RegexpRowInputFormat : public IRowInputFormat { - using EscapingRule = FormatSettings::EscapingRule; public: RegexpRowInputFormat(ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_); @@ -36,6 +59,8 @@ public: private: RegexpRowInputFormat(std::unique_ptr buf_, const Block & header_, Params params_, const FormatSettings & format_settings_); + using EscapingRule = FormatSettings::EscapingRule; + bool readRow(MutableColumns & columns, RowReadExtension & ext) override; bool readField(size_t index, MutableColumns & columns); @@ -44,13 +69,22 @@ private: std::unique_ptr buf; const FormatSettings format_settings; const EscapingRule escaping_rule; + RegexpFieldExtractor field_extractor; +}; - const RE2 regexp; - // The vector of fields extracted from line using regexp. - std::vector matched_fields; - // These two vectors are needed to use RE2::FullMatchN (function for extracting fields). - std::vector re2_arguments; - std::vector re2_arguments_ptrs; +class RegexpSchemaReader : public IRowSchemaReader +{ +public: + RegexpSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, ContextPtr context_); + +private: + DataTypes readRowAndGetDataTypes() override; + + using EscapingRule = FormatSettings::EscapingRule; + const FormatSettings format_settings; + RegexpFieldExtractor field_extractor; + PeekableReadBuffer buf; + ContextPtr context; }; } diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp index eef97e15dd5..8a56c2ed5c7 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp @@ -1,7 +1,10 @@ #include #include #include +#include #include +#include +#include namespace DB @@ -211,6 +214,59 @@ void TSKVRowInputFormat::resetParser() name_buf.clear(); } +TSKVSchemaReader::TSKVSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) + : IRowWithNamesSchemaReader( + in_, + format_settings_.max_rows_to_read_for_schema_inference, + getDefaultDataTypeForEscapingRule(FormatSettings::EscapingRule::Escaped)) + , format_settings(format_settings_) +{ +} + +std::unordered_map TSKVSchemaReader::readRowAndGetNamesAndDataTypes() +{ + if (first_row) + { + skipBOMIfExists(in); + first_row = false; + } + + if (in.eof()) + return {}; + + if (*in.position() == '\n') + { + ++in.position(); + return {}; + } + + std::unordered_map names_and_types; + StringRef name_ref; + String name_tmp; + String value; + do + { + bool has_value = readName(in, name_ref, name_tmp); + if (has_value) + { + readEscapedString(value, in); + names_and_types[String(name_ref)] = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Escaped); + } + else + { + /// The only thing that can go without value is `tskv` fragment that is ignored. 
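+ /// e.g. in a line like "tskv\tkey1=value1\tkey2=value2" the bare "tskv" marker carries no '=',
+ /// so it is recognized by the 4-byte comparison below and skipped; any other key that
+ /// lacks "=value" is treated as malformed input.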
+ if (!(name_ref.size == 4 && 0 == memcmp(name_ref.data, "tskv", 4))) + throw Exception("Found field without value while parsing TSKV format: " + name_ref.toString(), ErrorCodes::INCORRECT_DATA); + } + + } + while (checkChar('\t', in)); + + assertChar('\n', in); + + return names_and_types; +} + void registerInputFormatTSKV(FormatFactory & factory) { factory.registerInputFormat("TSKV", []( @@ -222,5 +278,12 @@ void registerInputFormatTSKV(FormatFactory & factory) return std::make_shared(buf, sample, std::move(params), settings); }); } +void registerTSKVSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("TSKV", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, settings); + }); +} } diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.h b/src/Processors/Formats/Impl/TSKVRowInputFormat.h index 7d732bae691..6aef50a0f84 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.h +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -52,4 +53,16 @@ private: /// for row like ..., non-nullable column name=\N, ... }; +class TSKVSchemaReader : public IRowWithNamesSchemaReader +{ +public: + TSKVSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + +private: + std::unordered_map readRowAndGetNamesAndDataTypes() override; + + const FormatSettings format_settings; + bool first_row = true; +}; + } diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index 1e6d238b202..bb844ec68ea 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -1,13 +1,15 @@ #include #include -#include -#include -#include -#include -#include -#include +#include +#include #include +#include +#include +#include +#include +#include +#include namespace DB { @@ -38,40 +40,50 @@ TabSeparatedRowInputFormat::TabSeparatedRowInputFormat( bool with_types_, bool is_raw_, const FormatSettings & format_settings_) - : RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_), is_raw(is_raw_) + : RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_, std::make_unique(in_, format_settings_, is_raw_)) { } -void TabSeparatedRowInputFormat::skipFieldDelimiter() +TabSeparatedFormatReader::TabSeparatedFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_, bool is_raw_) + : FormatWithNamesAndTypesReader(in_, format_settings_), is_raw(is_raw_) +{ +} + +void TabSeparatedFormatReader::skipFieldDelimiter() { assertChar('\t', *in); } -void TabSeparatedRowInputFormat::skipRowEndDelimiter() +void TabSeparatedFormatReader::skipRowEndDelimiter() { if (in->eof()) return; - if (unlikely(row_num <= 1)) + if (unlikely(first_row)) + { checkForCarriageReturn(*in); + first_row = false; + } assertChar('\n', *in); } -String TabSeparatedRowInputFormat::readFieldIntoString() +String TabSeparatedFormatReader::readFieldIntoString() { String field; - readEscapedString(field, *in); + if (is_raw) + readString(field, *in); + else + readEscapedString(field, *in); return field; } -void TabSeparatedRowInputFormat::skipField() +void TabSeparatedFormatReader::skipField() { - NullOutput null_sink; - readEscapedStringInto(null_sink, *in); + readFieldIntoString(); } -void TabSeparatedRowInputFormat::skipHeaderRow() +void 
TabSeparatedFormatReader::skipHeaderRow() { do { @@ -82,7 +94,7 @@ void TabSeparatedRowInputFormat::skipHeaderRow() skipRowEndDelimiter(); } -std::vector TabSeparatedRowInputFormat::readHeaderRow() +std::vector TabSeparatedFormatReader::readRow() { std::vector fields; do @@ -95,7 +107,7 @@ std::vector TabSeparatedRowInputFormat::readHeaderRow() return fields; } -bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & type, +bool TabSeparatedFormatReader::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & /*column_name*/) { const bool at_delimiter = !is_last_file_column && !in->eof() && *in->position() == '\t'; @@ -118,6 +130,7 @@ bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & return true; } + if (as_nullable) return SerializationNullable::deserializeTextEscapedImpl(column, *in, format_settings, serialization); @@ -125,7 +138,7 @@ bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & return true; } -bool TabSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) +bool TabSeparatedFormatReader::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) { try { @@ -156,7 +169,7 @@ bool TabSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuff return true; } -bool TabSeparatedRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) +bool TabSeparatedFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & out) { if (in->eof()) return true; @@ -190,7 +203,7 @@ bool TabSeparatedRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out return true; } -void TabSeparatedRowInputFormat::checkNullValueForNonNullable(DataTypePtr type) +void TabSeparatedFormatReader::checkNullValueForNonNullable(DataTypePtr type) { bool can_be_parsed_as_null = type->isNullable() || type->isLowCardinalityNullable() || format_settings.null_as_default; @@ -218,6 +231,28 @@ void TabSeparatedRowInputFormat::syncAfterError() skipToUnescapedNextLineOrEOF(*in); } +TabSeparatedSchemaReader::TabSeparatedSchemaReader( + ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings_) + : FormatWithNamesAndTypesSchemaReader( + in_, + format_settings_.max_rows_to_read_for_schema_inference, + with_names_, + with_types_, + &reader, + getDefaultDataTypeForEscapingRule(is_raw_ ? FormatSettings::EscapingRule::Raw : FormatSettings::EscapingRule::Escaped)) + , reader(in_, format_settings_, is_raw_) +{ +} + +DataTypes TabSeparatedSchemaReader::readRowAndGetDataTypes() +{ + if (in.eof()) + return {}; + + auto fields = reader.readRow(); + return determineDataTypesByEscapingRule(fields, reader.getFormatSettings(), reader.getEscapingRule()); +} + void registerInputFormatTabSeparated(FormatFactory & factory) { for (bool is_raw : {false, true}) @@ -239,6 +274,23 @@ void registerInputFormatTabSeparated(FormatFactory & factory) } } +void registerTSVSchemaReader(FormatFactory & factory) +{ + for (bool is_raw : {false, true}) + { + auto register_func = [&](const String & format_name, bool with_names, bool with_types) + { + factory.registerSchemaReader(format_name, [with_names, with_types, is_raw](ReadBuffer & buf, const FormatSettings & settings, ContextPtr) + { + return std::make_shared(buf, with_names, with_types, is_raw, settings); + }); + }; + + registerWithNamesAndTypes(is_raw ? "TabSeparatedRaw" : "TabSeparated", register_func); + registerWithNamesAndTypes(is_raw ? 
"TSVRaw" : "TSV", register_func); + } +} + static std::pair fileSegmentationEngineTabSeparatedImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, bool is_raw, size_t min_rows) { bool need_more_data = true; diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h index 6e2e283e792..1f2bfc255b8 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB @@ -24,6 +25,13 @@ public: private: bool allowSyncAfterError() const override { return true; } void syncAfterError() override; + bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; } +}; + +class TabSeparatedFormatReader : public FormatWithNamesAndTypesReader +{ +public: + TabSeparatedFormatReader(ReadBuffer & in_, const FormatSettings & format_settings, bool is_raw_); bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override; @@ -36,18 +44,34 @@ private: void skipFieldDelimiter() override; void skipRowEndDelimiter() override; - std::vector readHeaderRow(); - std::vector readNames() override { return readHeaderRow(); } - std::vector readTypes() override { return readHeaderRow(); } + std::vector readRow(); + std::vector readNames() override { return readRow(); } + std::vector readTypes() override { return readRow(); } String readFieldIntoString(); void checkNullValueForNonNullable(DataTypePtr type) override; bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override; bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override; - bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; } + FormatSettings::EscapingRule getEscapingRule() + { + return is_raw ? FormatSettings::EscapingRule::Raw : FormatSettings::EscapingRule::Escaped; + } +private: bool is_raw; + bool first_row = true; +}; + +class TabSeparatedSchemaReader : public FormatWithNamesAndTypesSchemaReader +{ +public: + TabSeparatedSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings); + +private: + DataTypes readRowAndGetDataTypes() override; + + TabSeparatedFormatReader reader; }; } diff --git a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp index 5d87f5a0b14..03a3ea99b28 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp @@ -22,7 +22,10 @@ void TabSeparatedRowOutputFormat::writeLine(const std::vector & values) { for (size_t i = 0; i < values.size(); ++i) { - writeEscapedString(values[i], out); + if (is_raw) + writeString(values[i], out); + else + writeEscapedString(values[i], out); if (i + 1 == values.size()) writeRowEndDelimiter(); else @@ -95,6 +98,8 @@ void registerOutputFormatTabSeparated(FormatFactory & factory) registerWithNamesAndTypes(is_raw ? "TSVRaw" : "TSV", register_func); registerWithNamesAndTypes(is_raw ? 
"TabSeparatedRaw" : "TabSeparated", register_func); + if (is_raw) + registerWithNamesAndTypes("LineAsString", register_func); } } diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index fccf6eb10df..06d6ba06bcc 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include namespace DB @@ -12,13 +11,19 @@ namespace DB namespace ErrorCodes { -extern const int ATTEMPT_TO_READ_AFTER_EOF; -extern const int CANNOT_READ_ALL_DATA; -extern const int CANNOT_PARSE_ESCAPE_SEQUENCE; -extern const int CANNOT_PARSE_QUOTED_STRING; -extern const int SYNTAX_ERROR; + extern const int ATTEMPT_TO_READ_AFTER_EOF; + extern const int CANNOT_READ_ALL_DATA; + extern const int CANNOT_PARSE_ESCAPE_SEQUENCE; + extern const int CANNOT_PARSE_QUOTED_STRING; + extern const int SYNTAX_ERROR; } +[[noreturn]] static void throwUnexpectedEof(size_t row_num) +{ + throw ParsingException("Unexpected EOF while parsing row " + std::to_string(row_num) + ". " + "Maybe last row has wrong format or input doesn't contain specified suffix before EOF.", + ErrorCodes::CANNOT_READ_ALL_DATA); +} TemplateRowInputFormat::TemplateRowInputFormat( const Block & header_, @@ -41,37 +46,13 @@ TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, std::uniqu : RowInputFormatWithDiagnosticInfo(header_, *buf_, params_), buf(std::move(buf_)), data_types(header_.getDataTypes()), settings(std::move(settings_)), ignore_spaces(ignore_spaces_), format(std::move(format_)), row_format(std::move(row_format_)), - default_csv_delimiter(settings.csv.delimiter), row_between_delimiter(std::move(row_between_delimiter_)) + default_csv_delimiter(settings.csv.delimiter), row_between_delimiter(row_between_delimiter_), + format_reader(std::make_unique(*buf, ignore_spaces_, format, row_format, row_between_delimiter, settings)) { - /// Validate format string for result set - bool has_data = false; - for (size_t i = 0; i < format.columnsCount(); ++i) - { - if (format.format_idx_to_column_idx[i]) - { - if (*format.format_idx_to_column_idx[i] != 0) - format.throwInvalidFormat("Invalid input part", i); - if (has_data) - format.throwInvalidFormat("${data} can occur only once", i); - if (format.escaping_rules[i] != EscapingRule::None) - format.throwInvalidFormat("${data} must have empty or None deserialization type", i); - has_data = true; - format_data_idx = i; - } - else - { - if (format.escaping_rules[i] == EscapingRule::XML) - format.throwInvalidFormat("XML deserialization is not supported", i); - } - } - /// Validate format string for rows std::vector column_in_format(header_.columns(), false); for (size_t i = 0; i < row_format.columnsCount(); ++i) { - if (row_format.escaping_rules[i] == EscapingRule::XML) - row_format.throwInvalidFormat("XML deserialization is not supported", i); - if (row_format.format_idx_to_column_idx[i]) { if (header_.columns() <= *row_format.format_idx_to_column_idx[i]) @@ -94,69 +75,7 @@ TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, std::uniqu void TemplateRowInputFormat::readPrefix() { - size_t last_successfully_parsed_idx = 0; - try - { - tryReadPrefixOrSuffix(last_successfully_parsed_idx, format_data_idx); - } - catch (Exception & e) - { - format.throwInvalidFormat(e.message() + " While parsing prefix", last_successfully_parsed_idx); - } -} - -/// Asserts delimiters and skips fields in prefix or suffix. 
-/// tryReadPrefixOrSuffix(...) is used in checkForSuffix() to avoid throwing an exception after read of each row -/// (most likely false will be returned on first call of checkString(...)) -template -ReturnType TemplateRowInputFormat::tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end) -{ - static constexpr bool throw_exception = std::is_same_v; - - skipSpaces(); - if constexpr (throw_exception) - assertString(format.delimiters[input_part_beg], *buf); - else - { - if (likely(!checkString(format.delimiters[input_part_beg], *buf))) - return ReturnType(false); - } - - while (input_part_beg < input_part_end) - { - skipSpaces(); - if constexpr (throw_exception) - skipField(format.escaping_rules[input_part_beg]); - else - { - try - { - skipField(format.escaping_rules[input_part_beg]); - } - catch (const Exception & e) - { - if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF && - e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE && - e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING) - throw; - /// If it's parsing error, then suffix is not found - return ReturnType(false); - } - } - ++input_part_beg; - - skipSpaces(); - if constexpr (throw_exception) - assertString(format.delimiters[input_part_beg], *buf); - else - { - if (likely(!checkString(format.delimiters[input_part_beg], *buf))) - return ReturnType(false); - } - } - - if constexpr (!throw_exception) - return ReturnType(true); + format_reader->readPrefix(); } bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & extra) @@ -165,9 +84,7 @@ bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension if (unlikely(end_of_stream)) return false; - skipSpaces(); - - if (unlikely(checkForSuffix())) + if (unlikely(format_reader->checkForSuffix())) { end_of_stream = true; return false; @@ -176,27 +93,24 @@ bool TemplateRowInputFormat::readRow(MutableColumns & columns, RowReadExtension updateDiagnosticInfo(); if (likely(row_num != 1)) - assertString(row_between_delimiter, *buf); + format_reader->skipRowBetweenDelimiter(); extra.read_columns.assign(columns.size(), false); for (size_t i = 0; i < row_format.columnsCount(); ++i) { - skipSpaces(); - assertString(row_format.delimiters[i], *buf); - skipSpaces(); + format_reader->skipDelimiter(i); + if (row_format.format_idx_to_column_idx[i]) { size_t col_idx = *row_format.format_idx_to_column_idx[i]; extra.read_columns[col_idx] = deserializeField(data_types[col_idx], serializations[col_idx], *columns[col_idx], i); } else - skipField(row_format.escaping_rules[i]); - + format_reader->skipField(row_format.escaping_rules[i]); } - skipSpaces(); - assertString(row_format.delimiters.back(), *buf); + format_reader->skipRowEndDelimiter(); for (const auto & idx : always_default_columns) data_types[idx]->insertDefaultInto(*columns[idx]); @@ -219,65 +133,21 @@ bool TemplateRowInputFormat::deserializeField(const DataTypePtr & type, catch (Exception & e) { if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) - throwUnexpectedEof(); + throwUnexpectedEof(row_num); throw; } } -void TemplateRowInputFormat::skipField(TemplateRowInputFormat::EscapingRule escaping_rule) -{ - try - { - skipFieldByEscapingRule(*buf, escaping_rule, settings); - } - catch (Exception & e) - { - if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) - throwUnexpectedEof(); - throw; - } -} - -/// Returns true if all rows have been read i.e. there are only suffix and spaces (if ignore_spaces == true) before EOF. 
-/// Otherwise returns false -bool TemplateRowInputFormat::checkForSuffix() -{ - PeekableReadBufferCheckpoint checkpoint{*buf}; - bool suffix_found = false; - size_t last_successfully_parsed_idx = format_data_idx + 1; - try - { - suffix_found = tryReadPrefixOrSuffix(last_successfully_parsed_idx, format.columnsCount()); - } - catch (const Exception & e) - { - if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF && - e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE && - e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING) - throw; - } - - if (unlikely(suffix_found)) - { - skipSpaces(); - if (buf->eof()) - return true; - } - - buf->rollbackToCheckpoint(); - return false; -} - bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) { out << "Suffix does not match: "; - size_t last_successfully_parsed_idx = format_data_idx + 1; + size_t last_successfully_parsed_idx = format_reader->getFormatDataIdx() + 1; const ReadBuffer::Position row_begin_pos = buf->position(); bool caught = false; try { PeekableReadBufferCheckpoint checkpoint{*buf, true}; - tryReadPrefixOrSuffix(last_successfully_parsed_idx, format.columnsCount()); + format_reader->tryReadPrefixOrSuffix(last_successfully_parsed_idx, format.columnsCount()); } catch (Exception & e) { @@ -309,7 +179,7 @@ bool TemplateRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & col if (!parseDelimiterWithDiagnosticInfo(out, *buf, row_format.delimiters[i], "delimiter before field " + std::to_string(i), ignore_spaces)) return false; - skipSpaces(); + format_reader->skipSpaces(); if (row_format.format_idx_to_column_idx[i]) { const auto & header = getPort().getHeader(); @@ -364,7 +234,7 @@ void TemplateRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColu if (index) deserializeField(type, serializations[*index], column, file_column); else - skipField(row_format.escaping_rules[file_column]); + format_reader->skipField(row_format.escaping_rules[file_column]); } bool TemplateRowInputFormat::isGarbageAfterField(size_t, ReadBuffer::Position) @@ -387,13 +257,6 @@ void TemplateRowInputFormat::syncAfterError() /// It will cause another parsing error. } -void TemplateRowInputFormat::throwUnexpectedEof() -{ - throw ParsingException("Unexpected EOF while parsing row " + std::to_string(row_num) + ". 
" - "Maybe last row has wrong format or input doesn't contain specified suffix before EOF.", - ErrorCodes::CANNOT_READ_ALL_DATA); -} - void TemplateRowInputFormat::resetParser() { RowInputFormatWithDiagnosticInfo::resetParser(); @@ -407,6 +270,268 @@ void TemplateRowInputFormat::setReadBuffer(ReadBuffer & in_) IInputFormat::setReadBuffer(*buf); } +TemplateFormatReader::TemplateFormatReader( + PeekableReadBuffer & buf_, + bool ignore_spaces_, + const ParsedTemplateFormatString & format_, + const ParsedTemplateFormatString & row_format_, + std::string row_between_delimiter_, + const FormatSettings & format_settings_) + : buf(&buf_) + , ignore_spaces(ignore_spaces_) + , format(format_) + , row_format(row_format_) + , row_between_delimiter(row_between_delimiter_) + , format_settings(format_settings_) +{ + /// Validate format string for result set + bool has_data = false; + for (size_t i = 0; i < format.columnsCount(); ++i) + { + if (format.format_idx_to_column_idx[i]) + { + if (*format.format_idx_to_column_idx[i] != 0) + format.throwInvalidFormat("Invalid input part", i); + if (has_data) + format.throwInvalidFormat("${data} can occur only once", i); + if (format.escaping_rules[i] != EscapingRule::None) + format.throwInvalidFormat("${data} must have empty or None deserialization type", i); + has_data = true; + format_data_idx = i; + } + else + { + if (format.escaping_rules[i] == EscapingRule::XML) + format.throwInvalidFormat("XML deserialization is not supported", i); + } + } + + /// Validate format string for rows + for (size_t i = 0; i < row_format.columnsCount(); ++i) + { + if (row_format.escaping_rules[i] == EscapingRule::XML) + row_format.throwInvalidFormat("XML deserialization is not supported", i); + } +} + +void TemplateFormatReader::readPrefix() +{ + size_t last_successfully_parsed_idx = 0; + try + { + tryReadPrefixOrSuffix(last_successfully_parsed_idx, format_data_idx); + } + catch (Exception & e) + { + format.throwInvalidFormat(e.message() + " While parsing prefix", last_successfully_parsed_idx); + } +} + +void TemplateFormatReader::skipField(EscapingRule escaping_rule) +{ + try + { + skipFieldByEscapingRule(*buf, escaping_rule, format_settings); + } + catch (Exception & e) + { + if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) + throwUnexpectedEof(row_num); + throw; + } +} + +/// Asserts delimiters and skips fields in prefix or suffix. +/// tryReadPrefixOrSuffix(...) 
is used in checkForSuffix() to avoid throwing an exception after read of each row +/// (most likely false will be returned on first call of checkString(...)) +template +ReturnType TemplateFormatReader::tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end) +{ + static constexpr bool throw_exception = std::is_same_v; + + skipSpaces(); + if constexpr (throw_exception) + assertString(format.delimiters[input_part_beg], *buf); + else + { + if (likely(!checkString(format.delimiters[input_part_beg], *buf))) + return ReturnType(false); + } + + while (input_part_beg < input_part_end) + { + skipSpaces(); + if constexpr (throw_exception) + skipField(format.escaping_rules[input_part_beg]); + else + { + try + { + skipField(format.escaping_rules[input_part_beg]); + } + catch (const Exception & e) + { + if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF && + e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE && + e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING) + throw; + /// If it's parsing error, then suffix is not found + return ReturnType(false); + } + } + ++input_part_beg; + + skipSpaces(); + if constexpr (throw_exception) + assertString(format.delimiters[input_part_beg], *buf); + else + { + if (likely(!checkString(format.delimiters[input_part_beg], *buf))) + return ReturnType(false); + } + } + + if constexpr (!throw_exception) + return ReturnType(true); +} + +/// Returns true if all rows have been read i.e. there are only suffix and spaces (if ignore_spaces == true) before EOF. +/// Otherwise returns false +bool TemplateFormatReader::checkForSuffix() +{ + PeekableReadBufferCheckpoint checkpoint{*buf}; + bool suffix_found = false; + size_t last_successfully_parsed_idx = format_data_idx + 1; + try + { + suffix_found = tryReadPrefixOrSuffix(last_successfully_parsed_idx, format.columnsCount()); + } + catch (const Exception & e) + { + if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF && + e.code() != ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE && + e.code() != ErrorCodes::CANNOT_PARSE_QUOTED_STRING) + throw; + } + + if (unlikely(suffix_found)) + { + skipSpaces(); + if (buf->eof()) + return true; + } + + buf->rollbackToCheckpoint(); + return false; +} + +void TemplateFormatReader::skipDelimiter(size_t index) +{ + skipSpaces(); + assertString(row_format.delimiters[index], *buf); + skipSpaces(); +} + +void TemplateFormatReader::skipRowEndDelimiter() +{ + ++row_num; + skipSpaces(); + assertString(row_format.delimiters.back(), *buf); + skipSpaces(); +} + +void TemplateFormatReader::skipRowBetweenDelimiter() +{ + skipSpaces(); + assertString(row_between_delimiter, *buf); + skipSpaces(); +} + +TemplateSchemaReader::TemplateSchemaReader( + ReadBuffer & in_, + bool ignore_spaces_, + const ParsedTemplateFormatString & format_, + const ParsedTemplateFormatString & row_format_, + std::string row_between_delimiter, + const FormatSettings & format_settings_, + ContextPtr context_) + : IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference) + , buf(in_) + , format(format_) + , row_format(row_format_) + , format_settings(format_settings_) + , context(context_) + , format_reader(buf, ignore_spaces_, format, row_format, row_between_delimiter, format_settings) +{ + setColumnNames(row_format.column_names); +} + +DataTypes TemplateSchemaReader::readRowAndGetDataTypes() +{ + if (first_row) + format_reader.readPrefix(); + + if (format_reader.checkForSuffix()) + return {}; + + if (first_row) + first_row = false; + else + format_reader.skipRowBetweenDelimiter(); + + DataTypes 
data_types; + data_types.reserve(row_format.columnsCount()); + String field; + for (size_t i = 0; i != row_format.columnsCount(); ++i) + { + format_reader.skipDelimiter(i); + if (row_format.escaping_rules[i] == FormatSettings::EscapingRule::CSV) + format_settings.csv.delimiter = row_format.delimiters[i + 1].empty() ? format_settings.csv.delimiter : row_format.delimiters[i + 1].front(); + + field = readFieldByEscapingRule(buf, row_format.escaping_rules[i], format_settings); + data_types.push_back(determineDataTypeByEscapingRule(field, format_settings, row_format.escaping_rules[i], context)); + } + + format_reader.skipRowEndDelimiter(); + return data_types; +} + +static ParsedTemplateFormatString fillResultSetFormat(const FormatSettings & settings) +{ + ParsedTemplateFormatString resultset_format; + if (settings.template_settings.resultset_format.empty()) + { + /// Default format string: "${data}" + resultset_format.delimiters.resize(2); + resultset_format.escaping_rules.emplace_back(ParsedTemplateFormatString::EscapingRule::None); + resultset_format.format_idx_to_column_idx.emplace_back(0); + resultset_format.column_names.emplace_back("data"); + } + else + { + /// Read format string from file + resultset_format = ParsedTemplateFormatString( + FormatSchemaInfo(settings.template_settings.resultset_format, "Template", false, + settings.schema.is_server, settings.schema.format_schema_path), + [&](const String & partName) -> std::optional + { + if (partName == "data") + return 0; + throw Exception("Unknown input part " + partName, + ErrorCodes::SYNTAX_ERROR); + }); + } + return resultset_format; +} + +static ParsedTemplateFormatString fillRowFormat(const FormatSettings & settings, ParsedTemplateFormatString::ColumnIdxGetter idx_getter, bool allow_indexes) +{ + return ParsedTemplateFormatString( + FormatSchemaInfo( + settings.template_settings.row_format, "Template", false, settings.schema.is_server, settings.schema.format_schema_path), + idx_getter, allow_indexes); +} + void registerInputFormatTemplate(FormatFactory & factory) { for (bool ignore_spaces : {false, true}) @@ -417,39 +542,34 @@ void registerInputFormatTemplate(FormatFactory & factory) IRowInputFormat::Params params, const FormatSettings & settings) { - ParsedTemplateFormatString resultset_format; - if (settings.template_settings.resultset_format.empty()) + auto idx_getter = [&](const String & colName) -> std::optional { - /// Default format string: "${data}" - resultset_format.delimiters.resize(2); - resultset_format.escaping_rules.emplace_back(ParsedTemplateFormatString::EscapingRule::None); - resultset_format.format_idx_to_column_idx.emplace_back(0); - resultset_format.column_names.emplace_back("data"); - } - else - { - /// Read format string from file - resultset_format = ParsedTemplateFormatString( - FormatSchemaInfo(settings.template_settings.resultset_format, "Template", false, - settings.schema.is_server, settings.schema.format_schema_path), - [&](const String & partName) -> std::optional - { - if (partName == "data") - return 0; - throw Exception("Unknown input part " + partName, - ErrorCodes::SYNTAX_ERROR); - }); - } + return sample.getPositionByName(colName); + }; - ParsedTemplateFormatString row_format = ParsedTemplateFormatString( - FormatSchemaInfo(settings.template_settings.row_format, "Template", false, - settings.schema.is_server, settings.schema.format_schema_path), - [&](const String & colName) -> std::optional - { - return sample.getPositionByName(colName); - }); + return std::make_shared( + sample, + buf, + 
params, + settings, + ignore_spaces, + fillResultSetFormat(settings), + fillRowFormat(settings, idx_getter, true), + settings.template_settings.row_between_delimiter); + }); + } +} - return std::make_shared(sample, buf, params, settings, ignore_spaces, resultset_format, row_format, settings.template_settings.row_between_delimiter); +void registerTemplateSchemaReader(FormatFactory & factory) +{ + for (bool ignore_spaces : {false, true}) + { + factory.registerSchemaReader(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", [ignore_spaces](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + { + size_t index = 0; + auto idx_getter = [&](const String &) -> std::optional { return index++; }; + auto row_format = fillRowFormat(settings, idx_getter, false); + return std::make_shared(buf, ignore_spaces, fillResultSetFormat(settings), row_format, settings.template_settings.row_between_delimiter, settings, context); }); } } diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.h b/src/Processors/Formats/Impl/TemplateRowInputFormat.h index 61cd97413bf..755ad6cb39b 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.h +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.h @@ -2,15 +2,19 @@ #include #include +#include #include #include #include #include +#include namespace DB { +class TemplateFormatReader; + class TemplateRowInputFormat : public RowInputFormatWithDiagnosticInfo { using EscapingRule = FormatSettings::EscapingRule; @@ -40,14 +44,6 @@ private: bool deserializeField(const DataTypePtr & type, const SerializationPtr & serialization, IColumn & column, size_t file_column); - void skipField(EscapingRule escaping_rule); - inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf); } - - template - ReturnType tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end); - bool checkForSuffix(); - [[noreturn]] void throwUnexpectedEof(); - bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override; @@ -63,12 +59,76 @@ private: const ParsedTemplateFormatString format; const ParsedTemplateFormatString row_format; - size_t format_data_idx; bool end_of_stream = false; std::vector always_default_columns; const char default_csv_delimiter; const std::string row_between_delimiter; + + std::unique_ptr format_reader; +}; + +class TemplateFormatReader +{ + using EscapingRule = FormatSettings::EscapingRule; + +public: + TemplateFormatReader( + PeekableReadBuffer & buf_, + bool ignore_spaces_, + const ParsedTemplateFormatString & format_, + const ParsedTemplateFormatString & row_format_, + std::string row_between_delimiter, + const FormatSettings & format_settings_); + + void readPrefix(); + void skipField(EscapingRule escaping_rule); + inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf); } + + template + ReturnType tryReadPrefixOrSuffix(size_t & input_part_beg, size_t input_part_end); + bool checkForSuffix(); + + void setReadBuffer(PeekableReadBuffer & buf_) { buf = &buf_; } + + void skipDelimiter(size_t index); + void skipRowEndDelimiter(); + void skipRowBetweenDelimiter(); + + size_t getFormatDataIdx() const { return format_data_idx; } + +private: + PeekableReadBuffer * buf; + bool ignore_spaces; + const ParsedTemplateFormatString & format; + const ParsedTemplateFormatString & row_format; + const std::string row_between_delimiter; + const FormatSettings & format_settings; + size_t 
format_data_idx; + size_t row_num; +}; + +class TemplateSchemaReader : public IRowSchemaReader +{ +public: + TemplateSchemaReader(ReadBuffer & in_, + bool ignore_spaces_, + const ParsedTemplateFormatString & format_, + const ParsedTemplateFormatString & row_format_, + std::string row_between_delimiter, + const FormatSettings & format_settings_, + ContextPtr context_); + + DataTypes readRowAndGetDataTypes() override; + +private: + PeekableReadBuffer buf; + const ParsedTemplateFormatString format; + const ParsedTemplateFormatString row_format; + FormatSettings format_settings; + ContextPtr context; + TemplateFormatReader format_reader; + bool first_row = true; }; bool parseDelimiterWithDiagnosticInfo(WriteBuffer & out, ReadBuffer & buf, const String & delimiter, const String & description, bool skip_spaces); diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp index adf6d2e8a25..b58be3f5526 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -15,6 +16,7 @@ #include #include +#include namespace DB { @@ -286,6 +288,50 @@ namespace } } +/// Can be used in fileSegmentationEngine for parallel parsing of Values +static bool skipToNextRow(PeekableReadBuffer * buf, size_t min_chunk_bytes, int balance) +{ + skipWhitespaceIfAny(*buf); + if (buf->eof() || *buf->position() == ';') + return false; + bool quoted = false; + + size_t chunk_begin_buf_count = buf->count(); + while (!buf->eof() && (balance || buf->count() - chunk_begin_buf_count < min_chunk_bytes)) + { + buf->position() = find_first_symbols<'\\', '\'', ')', '('>(buf->position(), buf->buffer().end()); + if (buf->position() == buf->buffer().end()) + continue; + if (*buf->position() == '\\') + { + ++buf->position(); + if (!buf->eof()) + ++buf->position(); + } + else if (*buf->position() == '\'') + { + quoted ^= true; + ++buf->position(); + } + else if (*buf->position() == ')') + { + ++buf->position(); + if (!quoted) + --balance; + } + else if (*buf->position() == '(') + { + ++buf->position(); + if (!quoted) + ++balance; + } + } + + if (!buf->eof() && *buf->position() == ',') + ++buf->position(); + return true; +} + bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx) { const Block & header = getPort().getHeader(); @@ -293,7 +339,7 @@ bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx auto settings = context->getSettingsRef(); /// We need continuous memory containing the expression to use Lexer - skipToNextRow(0, 1); + skipToNextRow(buf.get(), 0, 1); buf->makeContinuousMemoryFromCheckpointToPos(); buf->rollbackToCheckpoint(); @@ -437,50 +483,6 @@ bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx return true; } -/// Can be used in fileSegmentationEngine for parallel parsing of Values -bool ValuesBlockInputFormat::skipToNextRow(size_t min_chunk_bytes, int balance) -{ - skipWhitespaceIfAny(*buf); - if (buf->eof() || *buf->position() == ';') - return false; - bool quoted = false; - - size_t chunk_begin_buf_count = buf->count(); - while (!buf->eof() && (balance || buf->count() - chunk_begin_buf_count < min_chunk_bytes)) - { - buf->position() = find_first_symbols<'\\', '\'', ')', '('>(buf->position(), buf->buffer().end()); - if (buf->position() == buf->buffer().end()) - continue; - if (*buf->position() == '\\') - { - ++buf->position(); - 
if (!buf->eof()) - ++buf->position(); - } - else if (*buf->position() == '\'') - { - quoted ^= true; - ++buf->position(); - } - else if (*buf->position() == ')') - { - ++buf->position(); - if (!quoted) - --balance; - } - else if (*buf->position() == '(') - { - ++buf->position(); - if (!quoted) - ++balance; - } - } - - if (!buf->eof() && *buf->position() == ',') - ++buf->position(); - return true; -} - void ValuesBlockInputFormat::assertDelimiterAfterValue(size_t column_idx) { if (unlikely(!checkDelimiterAfterValue(column_idx))) @@ -559,6 +561,63 @@ void ValuesBlockInputFormat::setReadBuffer(ReadBuffer & in_) IInputFormat::setReadBuffer(*buf); } +ValuesSchemaReader::ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, ContextPtr context_) + : IRowSchemaReader(buf, format_settings_.max_rows_to_read_for_schema_inference), buf(in_), context(context_) +{ +} + +DataTypes ValuesSchemaReader::readRowAndGetDataTypes() +{ + if (first_row) + { + skipBOMIfExists(buf); + first_row = false; + } + + skipWhitespaceIfAny(buf); + if (buf.eof()) + return {}; + + assertChar('(', buf); + PeekableReadBufferCheckpoint checkpoint(buf); + skipToNextRow(&buf, 0, 1); + buf.makeContinuousMemoryFromCheckpointToPos(); + buf.rollbackToCheckpoint(); + + Tokens tokens(buf.position(), buf.buffer().end()); + IParser::Pos token_iterator(tokens, context->getSettingsRef().max_parser_depth); + + DataTypes data_types; + bool finish = false; + while (!finish) + { + Expected expected; + ASTPtr ast; + + bool parsed = parser.parse(token_iterator, ast, expected); + /// Consider delimiter after value (',' or ')') as part of expression + parsed &= token_iterator->type == TokenType::Comma || token_iterator->type == TokenType::ClosingRoundBracket; + + if (!parsed) + throw Exception(ErrorCodes::SYNTAX_ERROR, "Cannot parse expression here: {}, token: {}", + String(buf.position(), std::min(SHOW_CHARS_ON_SYNTAX_ERROR, buf.buffer().end() - buf.position())), String(token_iterator.get().begin, token_iterator.get().end)); + + std::pair result = evaluateConstantExpression(ast, context); + data_types.push_back(generalizeDataType(result.second)); + + if (token_iterator->type == TokenType::ClosingRoundBracket) + finish = true; + ++token_iterator; + buf.position() = const_cast(token_iterator->begin); + } + + skipWhitespaceIfAny(buf); + if (!buf.eof() && *buf.position() == ',') + ++buf.position(); + + return data_types; +} + void registerInputFormatValues(FormatFactory & factory) { factory.registerInputFormat("Values", []( @@ -571,4 +630,12 @@ void registerInputFormatValues(FormatFactory & factory) }); } +void registerValuesSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader("Values", [](ReadBuffer & buf, const FormatSettings & settings, ContextPtr context) + { + return std::make_shared(buf, settings, context); + }); +} + } diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h index 5bbd4bea5ba..e1521955472 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h @@ -7,6 +7,7 @@ #include #include #include +#include #include namespace DB @@ -68,8 +69,6 @@ private: void readPrefix(); void readSuffix(); - bool skipToNextRow(size_t min_chunk_bytes = 0, int balance = 0); - std::unique_ptr buf; const RowInputFormatParams params; @@ -95,4 +94,18 @@ private: BlockMissingValues block_missing_values; }; +class ValuesSchemaReader : public IRowSchemaReader +{ +public: + 
ValuesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings, ContextPtr context_); + +private: + DataTypes readRowAndGetDataTypes() override; + + PeekableReadBuffer buf; + ContextPtr context; + ParserExpression parser; + bool first_row = true; +}; + } diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp index 87fa5ec1c4a..7720b01dc74 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -1,5 +1,7 @@ #include +#include #include +#include #include #include @@ -9,6 +11,7 @@ namespace DB namespace ErrorCodes { extern const int INCORRECT_DATA; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } RowInputFormatWithNamesAndTypes::RowInputFormatWithNamesAndTypes( @@ -17,8 +20,13 @@ RowInputFormatWithNamesAndTypes::RowInputFormatWithNamesAndTypes( const Params & params_, bool with_names_, bool with_types_, - const FormatSettings & format_settings_) - : RowInputFormatWithDiagnosticInfo(header_, in_, params_), format_settings(format_settings_), with_names(with_names_), with_types(with_types_) + const FormatSettings & format_settings_, + std::unique_ptr format_reader_) + : RowInputFormatWithDiagnosticInfo(header_, in_, params_) + , format_settings(format_settings_) + , with_names(with_names_) + , with_types(with_types_) + , format_reader(std::move(format_reader_)) { const auto & sample = getPort().getHeader(); size_t num_columns = sample.columns(); @@ -88,7 +96,7 @@ void RowInputFormatWithNamesAndTypes::readPrefix() } /// Skip prefix before names and types. - skipPrefixBeforeHeader(); + format_reader->skipPrefixBeforeHeader(); /// This is a bit of abstraction leakage, but we need it in parallel parsing: /// we check if this InputFormat is working with the "real" beginning of the data. @@ -97,7 +105,7 @@ void RowInputFormatWithNamesAndTypes::readPrefix() if (format_settings.with_names_use_header) { std::vector read_columns(data_types.size(), false); - auto column_names = readNames(); + auto column_names = format_reader->readNames(); for (const auto & name : column_names) addInputColumn(name, read_columns); @@ -110,7 +118,7 @@ void RowInputFormatWithNamesAndTypes::readPrefix() else { setupAllColumnsByTableSchema(); - skipNames(); + format_reader->skipNames(); } } else if (!column_mapping->is_set) @@ -119,10 +127,10 @@ void RowInputFormatWithNamesAndTypes::readPrefix() if (with_types) { /// Skip delimiter between names and types. 
- skipRowBetweenDelimiter(); + format_reader->skipRowBetweenDelimiter(); if (format_settings.with_types_use_header) { - auto types = readTypes(); + auto types = format_reader->readTypes(); if (types.size() != column_mapping->column_indexes_for_input_fields.size()) throw Exception( ErrorCodes::INCORRECT_DATA, @@ -143,7 +151,7 @@ void RowInputFormatWithNamesAndTypes::readPrefix() } } else - skipTypes(); + format_reader->skipTypes(); } } @@ -161,7 +169,7 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE if (unlikely(end_of_stream)) return false; - if (unlikely(checkForSuffix())) + if (unlikely(format_reader->checkForSuffix())) { end_of_stream = true; return false; @@ -170,9 +178,9 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE updateDiagnosticInfo(); if (likely(row_num != 1 || (getCurrentUnitNumber() == 0 && (with_names || with_types)))) - skipRowBetweenDelimiter(); + format_reader->skipRowBetweenDelimiter(); - skipRowStartDelimiter(); + format_reader->skipRowStartDelimiter(); ext.read_columns.resize(data_types.size()); for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) @@ -180,20 +188,20 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE const auto & column_index = column_mapping->column_indexes_for_input_fields[file_column]; const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); if (column_index) - ext.read_columns[*column_index] = readField( + ext.read_columns[*column_index] = format_reader->readField( *columns[*column_index], data_types[*column_index], serializations[*column_index], is_last_file_column, column_mapping->names_of_columns[file_column]); else - skipField(file_column); + format_reader->skipField(file_column); if (!is_last_file_column) - skipFieldDelimiter(); + format_reader->skipFieldDelimiter(); } - skipRowEndDelimiter(); + format_reader->skipRowEndDelimiter(); insertDefaultsForNotSeenColumns(columns, ext); @@ -218,13 +226,13 @@ void RowInputFormatWithNamesAndTypes::tryDeserializeField(const DataTypePtr & ty const auto & index = column_mapping->column_indexes_for_input_fields[file_column]; if (index) { - checkNullValueForNonNullable(type); + format_reader->checkNullValueForNonNullable(type); const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); - readField(column, type, serializations[*index], is_last_file_column, column_mapping->names_of_columns[file_column]); + format_reader->readField(column, type, serializations[*index], is_last_file_column, column_mapping->names_of_columns[file_column]); } else { - skipField(file_column); + format_reader->skipField(file_column); } } @@ -236,13 +244,13 @@ bool RowInputFormatWithNamesAndTypes::parseRowAndPrintDiagnosticInfo(MutableColu return false; } - if (!tryParseSuffixWithDiagnosticInfo(out)) + if (!format_reader->tryParseSuffixWithDiagnosticInfo(out)) return false; - if (likely(row_num != 1) && !parseRowBetweenDelimiterWithDiagnosticInfo(out)) + if (likely(row_num != 1) && !format_reader->parseRowBetweenDelimiterWithDiagnosticInfo(out)) return false; - if (!parseRowStartWithDiagnosticInfo(out)) + if (!format_reader->parseRowStartWithDiagnosticInfo(out)) return false; for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) @@ -266,22 +274,68 @@ bool 
RowInputFormatWithNamesAndTypes::parseRowAndPrintDiagnosticInfo(MutableColu /// Delimiters if (file_column + 1 != column_mapping->column_indexes_for_input_fields.size()) { - if (!parseFieldDelimiterWithDiagnosticInfo(out)) + if (!format_reader->parseFieldDelimiterWithDiagnosticInfo(out)) return false; } } - return parseRowEndWithDiagnosticInfo(out); + return format_reader->parseRowEndWithDiagnosticInfo(out); } - -void registerFileSegmentationEngineForFormatWithNamesAndTypes( - FormatFactory & factory, const String & base_format_name, FormatFactory::FileSegmentationEngine segmentation_engine) +bool RowInputFormatWithNamesAndTypes::isGarbageAfterField(size_t index, ReadBuffer::Position pos) { - factory.registerFileSegmentationEngine(base_format_name, segmentation_engine); - factory.registerFileSegmentationEngine(base_format_name + "WithNames", segmentation_engine); - factory.registerFileSegmentationEngine(base_format_name + "WithNamesAndTypes", segmentation_engine); + return format_reader->isGarbageAfterField(index, pos); } +void RowInputFormatWithNamesAndTypes::setReadBuffer(ReadBuffer & in_) +{ + format_reader->setReadBuffer(in_); + IInputFormat::setReadBuffer(in_); +} + +FormatWithNamesAndTypesSchemaReader::FormatWithNamesAndTypesSchemaReader( + ReadBuffer & in_, + size_t max_rows_to_read_, + bool with_names_, + bool with_types_, + FormatWithNamesAndTypesReader * format_reader_, + DataTypePtr default_type_) + : IRowSchemaReader(in_, max_rows_to_read_, default_type_), with_names(with_names_), with_types(with_types_), format_reader(format_reader_) +{ +} + +NamesAndTypesList FormatWithNamesAndTypesSchemaReader::readSchema() +{ + if (with_names || with_types) + skipBOMIfExists(in); + + format_reader->skipPrefixBeforeHeader(); + + Names names; + if (with_names) + names = format_reader->readNames(); + + if (with_types) + { + format_reader->skipRowBetweenDelimiter(); + std::vector data_type_names = format_reader->readTypes(); + if (data_type_names.size() != names.size()) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The number of column names {} differs with the number of types {}", names.size(), data_type_names.size()); + + NamesAndTypesList result; + for (size_t i = 0; i != data_type_names.size(); ++i) + result.emplace_back(names[i], DataTypeFactory::instance().get(data_type_names[i])); + return result; + } + + if (!names.empty()) + setColumnNames(names); + + /// We should determine types by reading rows with data. Use the implementation from IRowSchemaReader. + return IRowSchemaReader::readSchema(); +} } + diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h index cd7cd34d7e6..25ffc8d6de2 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h @@ -1,12 +1,15 @@ #pragma once #include +#include #include #include namespace DB { +class FormatWithNamesAndTypesReader; + /// Base class for input formats with -WithNames and -WithNamesAndTypes suffixes. /// It accepts 2 parameters in constructor - with_names and with_types and implements /// input format depending on them: @@ -20,7 +23,7 @@ namespace DB /// then reads/skips types. So you can this invariant. 
class RowInputFormatWithNamesAndTypes : public RowInputFormatWithDiagnosticInfo { -public: +protected: /** with_names - in the first line the header with column names * with_types - in the second line the header with column names */ @@ -28,44 +31,14 @@ public: const Block & header_, ReadBuffer & in_, const Params & params_, - bool with_names_, bool with_types_, const FormatSettings & format_settings_); + bool with_names_, + bool with_types_, + const FormatSettings & format_settings_, + std::unique_ptr format_reader_); void resetParser() override; - -protected: - /// Read single field from input. Return false if there was no real value and we inserted default value. - virtual bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) = 0; - - /// Skip single field, it's used to skip unknown columns. - virtual void skipField(size_t file_column) = 0; - /// Skip the whole row with names. - virtual void skipNames() = 0; - /// Skip the whole row with types. - virtual void skipTypes() = 0; - - /// Skip delimiters, if any. - virtual void skipPrefixBeforeHeader() {} - virtual void skipRowStartDelimiter() {} - virtual void skipFieldDelimiter() {} - virtual void skipRowEndDelimiter() {} - virtual void skipRowBetweenDelimiter() {} - - /// Check suffix. - virtual bool checkForSuffix() { return in->eof(); } - - /// Methods for parsing with diagnostic info. - virtual void checkNullValueForNonNullable(DataTypePtr) {} - virtual bool parseRowStartWithDiagnosticInfo(WriteBuffer &) { return true; } - virtual bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer &) { return true; } - virtual bool parseRowEndWithDiagnosticInfo(WriteBuffer &) { return true;} - virtual bool parseRowBetweenDelimiterWithDiagnosticInfo(WriteBuffer &) { return true;} - virtual bool tryParseSuffixWithDiagnosticInfo(WriteBuffer &) { return true; } - bool isGarbageAfterField(size_t, ReadBuffer::Position) override {return false; } - - /// Read row with names and return the list of them. - virtual std::vector readNames() = 0; - /// Read row with types and return the list of them. - virtual std::vector readTypes() = 0; + bool isGarbageAfterField(size_t index, ReadBuffer::Position pos) override; + void setReadBuffer(ReadBuffer & in_) override; const FormatSettings format_settings; DataTypes data_types; @@ -84,10 +57,90 @@ private: bool with_names; bool with_types; + std::unique_ptr format_reader; std::unordered_map column_indexes_by_names; }; -void registerFileSegmentationEngineForFormatWithNamesAndTypes( - FormatFactory & factory, const String & base_format_name, FormatFactory::FileSegmentationEngine segmentation_engine); +/// Base class for parsing data in input formats with -WithNames and -WithNamesAndTypes suffixes. +/// Used for reading/skipping names/types/delimiters in specific format. +class FormatWithNamesAndTypesReader +{ +public: + explicit FormatWithNamesAndTypesReader(ReadBuffer & in_, const FormatSettings & format_settings_) : in(&in_), format_settings(format_settings_) {} + + /// Read single field from input. Return false if there was no real value and we inserted default value. + virtual bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) = 0; + + /// Methods for parsing with diagnostic info. 
+ virtual void checkNullValueForNonNullable(DataTypePtr) {} + virtual bool parseRowStartWithDiagnosticInfo(WriteBuffer &) { return true; } + virtual bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer &) { return true; } + virtual bool parseRowEndWithDiagnosticInfo(WriteBuffer &) { return true;} + virtual bool parseRowBetweenDelimiterWithDiagnosticInfo(WriteBuffer &) { return true;} + virtual bool tryParseSuffixWithDiagnosticInfo(WriteBuffer &) { return true; } + virtual bool isGarbageAfterField(size_t, ReadBuffer::Position) { return false; } + + /// Read row with names and return the list of them. + virtual std::vector readNames() = 0; + /// Read row with types and return the list of them. + virtual std::vector readTypes() = 0; + + /// Skip single field, it's used to skip unknown columns. + virtual void skipField(size_t file_column) = 0; + /// Skip the whole row with names. + virtual void skipNames() = 0; + /// Skip the whole row with types. + virtual void skipTypes() = 0; + + /// Skip delimiters, if any. + virtual void skipPrefixBeforeHeader() {} + virtual void skipRowStartDelimiter() {} + virtual void skipFieldDelimiter() {} + virtual void skipRowEndDelimiter() {} + virtual void skipRowBetweenDelimiter() {} + + /// Check suffix. + virtual bool checkForSuffix() { return in->eof(); } + + const FormatSettings & getFormatSettings() const { return format_settings; } + + virtual void setReadBuffer(ReadBuffer & in_) { in = &in_; } + + virtual ~FormatWithNamesAndTypesReader() = default; + +protected: + ReadBuffer * in; + const FormatSettings format_settings; +}; + +/// Base class for schema inference for formats with -WithNames and -WithNamesAndTypes suffixes. +/// For formats with -WithNamesAndTypes suffix the schema will be determined by first two rows. +/// For formats with -WithNames suffix the names of columns will be determined by the first row +/// and types of columns by the rows with data. +/// For formats without suffixes default column names will be used +/// and types will be determined by the rows with data. +class FormatWithNamesAndTypesSchemaReader : public IRowSchemaReader +{ +public: + FormatWithNamesAndTypesSchemaReader( + ReadBuffer & in, + size_t max_rows_to_read_, + bool with_names_, + bool with_types_, + FormatWithNamesAndTypesReader * format_reader_, + DataTypePtr default_type_ = nullptr); + + NamesAndTypesList readSchema() override; + +protected: + virtual DataTypes readRowAndGetDataTypes() override = 0; + + bool with_names; + bool with_types; + +private: + FormatWithNamesAndTypesReader * format_reader; +}; } + diff --git a/src/Processors/Merges/Algorithms/Graphite.cpp b/src/Processors/Merges/Algorithms/Graphite.cpp new file mode 100644 index 00000000000..38d3fa30b42 --- /dev/null +++ b/src/Processors/Merges/Algorithms/Graphite.cpp @@ -0,0 +1,493 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +using namespace std::literals; + +namespace DB::ErrorCodes +{ + extern const int NOT_IMPLEMENTED; + extern const int BAD_ARGUMENTS; + extern const int UNKNOWN_ELEMENT_IN_CONFIG; + extern const int NO_ELEMENTS_IN_CONFIG; + } + +namespace DB::Graphite +{ +static std::unordered_map ruleTypeMap = +{ + { RuleTypeAll, "all" }, + { RuleTypePlain, "plain" }, + { RuleTypeTagged, "tagged"}, + { RuleTypeTagList, "tag_list"} +}; + +const String & ruleTypeStr(RuleType rule_type) +{ + try + { + return ruleTypeMap.at(rule_type); + } + catch (...) 
+ { + throw Exception("invalid rule type: " + std::to_string(rule_type), DB::ErrorCodes::BAD_ARGUMENTS); + } +} + +RuleType ruleType(const String & s) +{ + if (s == "all") + return RuleTypeAll; + else if (s == "plain") + return RuleTypePlain; + else if (s == "tagged") + return RuleTypeTagged; + else if (s == "tag_list") + return RuleTypeTagList; + else + throw Exception("invalid rule type: " + s, DB::ErrorCodes::BAD_ARGUMENTS); +} + +static const Graphite::Pattern undef_pattern = +{ /// empty pattern for selectPatternForPath + .rule_type = RuleTypeAll, + .regexp = nullptr, + .regexp_str = "", + .function = nullptr, + .retentions = Graphite::Retentions(), + .type = undef_pattern.TypeUndef, +}; + +inline static const Patterns & selectPatternsForMetricType(const Graphite::Params & params, const StringRef path) +{ + if (params.patterns_typed) + { + std::string_view path_view = path.toView(); + if (path_view.find("?"sv) == path_view.npos) + return params.patterns_plain; + else + return params.patterns_tagged; + } + else + { + return params.patterns; + } +} + +Graphite::RollupRule selectPatternForPath( + const Graphite::Params & params, + const StringRef path) +{ + const Graphite::Pattern * first_match = &undef_pattern; + + const Patterns & patterns_check = selectPatternsForMetricType(params, path); + + for (const auto & pattern : patterns_check) + { + if (!pattern.regexp) + { + /// Default pattern + if (first_match->type == first_match->TypeUndef && pattern.type == pattern.TypeAll) + { + /// There is only default pattern for both retention and aggregation + return std::pair(&pattern, &pattern); + } + if (pattern.type != first_match->type) + { + if (first_match->type == first_match->TypeRetention) + { + return std::pair(first_match, &pattern); + } + if (first_match->type == first_match->TypeAggregation) + { + return std::pair(&pattern, first_match); + } + } + } + else + { + if (pattern.regexp->match(path.data, path.size)) + { + /// General pattern with matched path + if (pattern.type == pattern.TypeAll) + { + /// Only for not default patterns with both function and retention parameters + return std::pair(&pattern, &pattern); + } + if (first_match->type == first_match->TypeUndef) + { + first_match = &pattern; + continue; + } + if (pattern.type != first_match->type) + { + if (first_match->type == first_match->TypeRetention) + { + return std::pair(first_match, &pattern); + } + if (first_match->type == first_match->TypeAggregation) + { + return std::pair(&pattern, first_match); + } + } + } + } + } + + return {nullptr, nullptr}; +} + +/** Is used to order Graphite::Retentions by age and precision descending. + * Throws exception if not both age and precision are less or greater then another. 
+ */ +static bool compareRetentions(const Retention & a, const Retention & b) +{ + if (a.age > b.age && a.precision > b.precision) + { + return true; + } + else if (a.age < b.age && a.precision < b.precision) + { + return false; + } + String error_msg = "age and precision should only grow up: " + + std::to_string(a.age) + ":" + std::to_string(a.precision) + " vs " + + std::to_string(b.age) + ":" + std::to_string(b.precision); + throw Exception( + error_msg, + DB::ErrorCodes::BAD_ARGUMENTS); +} + +bool operator==(const Retention & a, const Retention & b) +{ + return a.age == b.age && a.precision == b.precision; +} + +std::ostream & operator<<(std::ostream & stream, const Retentions & a) +{ + stream << "{ "; + for (size_t i = 0; i < a.size(); i++) + { + if (i > 0) + stream << ","; + stream << " { age = " << a[i].age << ", precision = " << a[i].precision << " }"; + } + stream << " }"; + + return stream; +} + +bool operator==(const Pattern & a, const Pattern & b) +{ + // equal + // Retentions retentions; /// Must be ordered by 'age' descending. + if (a.type != b.type || a.regexp_str != b.regexp_str || a.rule_type != b.rule_type) + return false; + + if (a.function == nullptr) + { + if (b.function != nullptr) + return false; + } + else if (b.function == nullptr) + { + return false; + } + else if (a.function->getName() != b.function->getName()) + { + return false; + } + + return a.retentions == b.retentions; +} + +std::ostream & operator<<(std::ostream & stream, const Pattern & a) +{ + stream << "{ rule_type = " << ruleTypeStr(a.rule_type); + if (!a.regexp_str.empty()) + stream << ", regexp = '" << a.regexp_str << "'"; + if (a.function != nullptr) + stream << ", function = " << a.function->getName(); + if (!a.retentions.empty()) + { + stream << ",\n retentions = {\n"; + for (size_t i = 0; i < a.retentions.size(); i++) + { + stream << " { " << a.retentions[i].age << ", " << a.retentions[i].precision << " }"; + if (i < a.retentions.size() - 1) + stream << ","; + stream << "\n"; + } + stream << " }\n"; + } + else + stream << " "; + + stream << "}"; + return stream; +} + +std::string buildTaggedRegex(std::string regexp_str) +{ + /* + * tags list in format (for name or any value can use regexp, alphabet sorting not needed) + * spaces are not stiped and used as tag and value part + * name must be first (if used) + * + * tag1=value1; tag2=VALUE2_REGEX;tag3=value3 + * or + * name;tag1=value1;tag2=VALUE2_REGEX;tag3=value3 + * or for one tag + * tag1=value1 + * + * Resulting regex against metric like + * name?tag1=value1&tag2=value2 + * + * So, + * + * name + * produce + * name\? + * + * tag2=val2 + * produce + * [\?&]tag2=val2(&.*)?$ + * + * nam.* ; tag1=val1 ; tag2=val2 + * produce + * nam.*\?(.*&)?tag1=val1&(.*&)?tag2=val2(&.*)?$ + */ + + std::vector tags; + + splitInto<';'>(tags, regexp_str); + /* remove empthy elements */ + using namespace std::string_literals; + tags.erase(std::remove(tags.begin(), tags.end(), ""s), tags.end()); + if (tags[0].find('=') == tags[0].npos) + { + if (tags.size() == 1) /* only name */ + return "^" + tags[0] + "\\?"; + /* start with name value */ + regexp_str = "^" + tags[0] + "\\?(.*&)?"; + tags.erase(std::begin(tags)); + } + else + regexp_str = "[\\?&]"; + + std::sort(std::begin(tags), std::end(tags)); /* sorted tag keys */ + regexp_str += fmt::format( + "{}{}", + fmt::join(tags, "&(.*&)?"), + "(&.*)?$" /* close regex */ + ); + + return regexp_str; +} + +/** Read the settings for Graphite rollup from config. 
+ * Example + * + * + * Path + * + * click_cost + * any + * + * 0 + * 3600 + * + * + * 86400 + * 60 + * + * + * + * max + * + * 0 + * 60 + * + * + * 3600 + * 300 + * + * + * 86400 + * 3600 + * + * + * + */ +static const Pattern & +appendGraphitePattern( + const Poco::Util::AbstractConfiguration & config, + const String & config_element, Patterns & patterns, + bool default_rule, + ContextPtr context) +{ + Pattern pattern; + + Poco::Util::AbstractConfiguration::Keys keys; + config.keys(config_element, keys); + + for (const auto & key : keys) + { + if (key == "regexp") + { + pattern.regexp_str = config.getString(config_element + ".regexp"); + } + else if (key == "function") + { + String aggregate_function_name_with_params = config.getString(config_element + ".function"); + String aggregate_function_name; + Array params_row; + getAggregateFunctionNameAndParametersArray( + aggregate_function_name_with_params, aggregate_function_name, params_row, "GraphiteMergeTree storage initialization", context); + + /// TODO Not only Float64 + AggregateFunctionProperties properties; + pattern.function = AggregateFunctionFactory::instance().get( + aggregate_function_name, {std::make_shared()}, params_row, properties); + } + else if (key == "rule_type") + { + String rule_type = config.getString(config_element + ".rule_type"); + pattern.rule_type = ruleType(rule_type); + } + else if (startsWith(key, "retention")) + { + pattern.retentions.emplace_back(Graphite::Retention{ + .age = config.getUInt(config_element + "." + key + ".age"), + .precision = config.getUInt(config_element + "." + key + ".precision")}); + } + else + throw Exception("Unknown element in config: " + key, DB::ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG); + } + + if (!pattern.regexp_str.empty()) + { + if (pattern.rule_type == RuleTypeTagList) + { + // construct tagged regexp + pattern.regexp_str = buildTaggedRegex(pattern.regexp_str); + pattern.rule_type = RuleTypeTagged; + } + pattern.regexp = std::make_shared(pattern.regexp_str); + } + + if (!pattern.function && pattern.retentions.empty()) + throw Exception( + "At least one of an aggregate function or retention rules is mandatory for rollup patterns in GraphiteMergeTree", + DB::ErrorCodes::NO_ELEMENTS_IN_CONFIG); + + if (default_rule && pattern.rule_type != RuleTypeAll) + { + throw Exception( + "Default must have rule_type all for rollup patterns in GraphiteMergeTree", + DB::ErrorCodes::BAD_ARGUMENTS); + } + + if (!pattern.function) + { + pattern.type = pattern.TypeRetention; + } + else if (pattern.retentions.empty()) + { + pattern.type = pattern.TypeAggregation; + } + else + { + pattern.type = pattern.TypeAll; + } + + if (pattern.type & pattern.TypeAggregation) /// TypeAggregation or TypeAll + if (pattern.function->allocatesMemoryInArena()) + throw Exception( + "Aggregate function " + pattern.function->getName() + " isn't supported in GraphiteMergeTree", DB::ErrorCodes::NOT_IMPLEMENTED); + + /// retention should be in descending order of age. 
+ if (pattern.type & pattern.TypeRetention) /// TypeRetention or TypeAll + std::sort(pattern.retentions.begin(), pattern.retentions.end(), compareRetentions); + + patterns.emplace_back(pattern); + return patterns.back(); +} + +void setGraphitePatternsFromConfig(ContextPtr context, const String & config_element, Graphite::Params & params) +{ + const auto & config = context->getConfigRef(); + + if (!config.has(config_element)) + throw Exception("No '" + config_element + "' element in configuration file", ErrorCodes::NO_ELEMENTS_IN_CONFIG); + + params.config_name = config_element; + params.path_column_name = config.getString(config_element + ".path_column_name", "Path"); + params.time_column_name = config.getString(config_element + ".time_column_name", "Time"); + params.value_column_name = config.getString(config_element + ".value_column_name", "Value"); + params.version_column_name = config.getString(config_element + ".version_column_name", "Timestamp"); + + params.patterns_typed = false; + + Poco::Util::AbstractConfiguration::Keys keys; + config.keys(config_element, keys); + + for (const auto & key : keys) + { + if (startsWith(key, "pattern")) + { + if (appendGraphitePattern(config, config_element + "." + key, params.patterns, false, context).rule_type != RuleTypeAll) + params.patterns_typed = true; + } + else if (key == "default") + { + /// See below. + } + else if (key == "path_column_name" || key == "time_column_name" || key == "value_column_name" || key == "version_column_name") + { + /// See above. + } + else + throw Exception("Unknown element in config: " + key, ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG); + } + + if (config.has(config_element + ".default")) + appendGraphitePattern(config, config_element + "." + ".default", params.patterns, true, context); + + for (const auto & pattern : params.patterns) + { + if (pattern.rule_type == RuleTypeAll) + { + if (params.patterns_typed) + { + params.patterns_plain.push_back(pattern); + params.patterns_tagged.push_back(pattern); + } + } + else if (pattern.rule_type == RuleTypePlain) + { + params.patterns_plain.push_back(pattern); + } + else if (pattern.rule_type == RuleTypeTagged) + { + params.patterns_tagged.push_back(pattern); + } + else + { + throw Exception("Unhandled rule_type in config: " + ruleTypeStr(pattern.rule_type), ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG); + } + } +} + +} diff --git a/src/Processors/Merges/Algorithms/Graphite.h b/src/Processors/Merges/Algorithms/Graphite.h index ecb1aeb9804..dc39cb46386 100644 --- a/src/Processors/Merges/Algorithms/Graphite.h +++ b/src/Processors/Merges/Algorithms/Graphite.h @@ -1,13 +1,8 @@ #pragma once + +#include #include - -namespace DB -{ - -class IAggregateFunction; -using AggregateFunctionPtr = std::shared_ptr; - -} +#include /** Intended for implementation of "rollup" - aggregation (rounding) of older data * for a table with Graphite data (Graphite is the system for time series monitoring). 
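Editor's note: a minimal usage sketch (not part of the patch) for the free function selectPatternForPath() that this header now exposes. It assumes Graphite::Params has already been filled, e.g. by setGraphitePatternsFromConfig() declared further down in this header; the wrapper function name is hypothetical, and the metric path mirrors one of the gtest cases later in this diff.

#include <Processors/Merges/Algorithms/Graphite.h>

using namespace DB;

/// Picks the retention rule and the aggregation rule for one metric path.
/// When typed rules are configured (params.patterns_typed), tagged metrics
/// (those containing '?') are matched against params.patterns_tagged and
/// plain metrics against params.patterns_plain.
static void pickRollupRule(const Graphite::Params & params)
{
    Graphite::RollupRule rule = Graphite::selectPatternForPath(params, "val.sum?env=test&tag=Fake3");

    const Graphite::RetentionPattern * retention = rule.first;      /// how to round timestamps by age
    const Graphite::AggregationPattern * aggregation = rule.second; /// which aggregate function to apply

    if (!retention && !aggregation)
        return; /// no rule matched, the row is kept as is
}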
@@ -97,16 +92,32 @@ using AggregateFunctionPtr = std::shared_ptr; namespace DB::Graphite { +// sync with rule_types_str +enum RuleType +{ + RuleTypeAll = 0, // default, with regex, compatible with old scheme + RuleTypePlain = 1, // plain metrics, with regex, compatible with old scheme + RuleTypeTagged = 2, // tagged metrics, with regex, compatible with old scheme + RuleTypeTagList = 3 // tagged metrics, with regex (converted to RuleTypeTagged from string like 'retention=10min ; env=(staging|prod)') +}; + +const String & ruleTypeStr(RuleType rule_type); + struct Retention { UInt32 age; UInt32 precision; }; +bool operator==(const Retention & a, const Retention & b); + using Retentions = std::vector; +std::ostream &operator<<(std::ostream & stream, const Retentions & a); + struct Pattern { + RuleType rule_type = RuleTypeAll; std::shared_ptr regexp; std::string regexp_str; AggregateFunctionPtr function; @@ -114,6 +125,9 @@ struct Pattern enum { TypeUndef, TypeRetention, TypeAggregation, TypeAll } type = TypeAll; /// The type of defined pattern, filled automatically }; +bool operator==(const Pattern & a, const Pattern & b); +std::ostream &operator<<(std::ostream & stream, const Pattern & a); + using Patterns = std::vector; using RetentionPattern = Pattern; using AggregationPattern = Pattern; @@ -125,9 +139,16 @@ struct Params String time_column_name; String value_column_name; String version_column_name; + bool patterns_typed; Graphite::Patterns patterns; + Graphite::Patterns patterns_plain; + Graphite::Patterns patterns_tagged; }; using RollupRule = std::pair; +Graphite::RollupRule selectPatternForPath(const Graphite::Params & params, const StringRef path); + +void setGraphitePatternsFromConfig(ContextPtr context, const String & config_element, Graphite::Params & params); + } diff --git a/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp index d5a35fef7bd..6464f10ca58 100644 --- a/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -52,62 +53,6 @@ GraphiteRollupSortedAlgorithm::GraphiteRollupSortedAlgorithm( columns_definition = defineColumns(header, params); } -Graphite::RollupRule GraphiteRollupSortedAlgorithm::selectPatternForPath(StringRef path) const -{ - const Graphite::Pattern * first_match = &undef_pattern; - - for (const auto & pattern : params.patterns) - { - if (!pattern.regexp) - { - /// Default pattern - if (first_match->type == first_match->TypeUndef && pattern.type == pattern.TypeAll) - { - /// There is only default pattern for both retention and aggregation - return std::pair(&pattern, &pattern); - } - if (pattern.type != first_match->type) - { - if (first_match->type == first_match->TypeRetention) - { - return std::pair(first_match, &pattern); - } - if (first_match->type == first_match->TypeAggregation) - { - return std::pair(&pattern, first_match); - } - } - } - else if (pattern.regexp->match(path.data, path.size)) - { - /// General pattern with matched path - if (pattern.type == pattern.TypeAll) - { - /// Only for not default patterns with both function and retention parameters - return std::pair(&pattern, &pattern); - } - if (first_match->type == first_match->TypeUndef) - { - first_match = &pattern; - continue; - } - if (pattern.type != first_match->type) - { - if (first_match->type == first_match->TypeRetention) - { - return std::pair(first_match, 
&pattern); - } - if (first_match->type == first_match->TypeAggregation) - { - return std::pair(&pattern, first_match); - } - } - } - } - - return {nullptr, nullptr}; -} - UInt32 GraphiteRollupSortedAlgorithm::selectPrecision(const Graphite::Retentions & retentions, time_t time) const { static_assert(is_signed_v, "time_t must be signed type"); @@ -188,7 +133,7 @@ IMergingAlgorithm::Status GraphiteRollupSortedAlgorithm::merge() Graphite::RollupRule next_rule = merged_data.currentRule(); if (new_path) - next_rule = selectPatternForPath(next_path); + next_rule = selectPatternForPath(this->params, next_path); const Graphite::RetentionPattern * retention_pattern = std::get<0>(next_rule); time_t next_time_rounded; diff --git a/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.h b/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.h index 0155b73b238..4968cbfc470 100644 --- a/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.h +++ b/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.h @@ -102,16 +102,6 @@ private: time_t current_time = 0; time_t current_time_rounded = 0; - const Graphite::Pattern undef_pattern = - { /// temporary empty pattern for selectPatternForPath - .regexp = nullptr, - .regexp_str = "", - .function = nullptr, - .retentions = DB::Graphite::Retentions(), - .type = undef_pattern.TypeUndef, - }; - - Graphite::RollupRule selectPatternForPath(StringRef path) const; UInt32 selectPrecision(const Graphite::Retentions & retentions, time_t time) const; /// Insert the values into the resulting columns, which will not be changed in the future. diff --git a/src/Processors/Merges/Algorithms/tests/gtest_graphite.cpp b/src/Processors/Merges/Algorithms/tests/gtest_graphite.cpp new file mode 100644 index 00000000000..1d739bf566a --- /dev/null +++ b/src/Processors/Merges/Algorithms/tests/gtest_graphite.cpp @@ -0,0 +1,597 @@ +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include + +using namespace DB; + +static int regAggregateFunctions = 0; + +void tryRegisterAggregateFunctions() +{ + if (!regAggregateFunctions) + { + registerAggregateFunctions(); + regAggregateFunctions = 1; + } +} + +static ConfigProcessor::LoadedConfig loadConfiguration(const std::string & config_path) +{ + ConfigProcessor config_processor(config_path, true, true); + ConfigProcessor::LoadedConfig config = config_processor.loadConfig(false); + return config; +} + +static ConfigProcessor::LoadedConfig loadConfigurationFromString(std::string & s) +{ + char tmp_file[19]; + strcpy(tmp_file, "/tmp/rollup-XXXXXX"); + int fd = mkstemp(tmp_file); + if (fd == -1) + { + throw std::runtime_error(strerror(errno)); + } + try { + if (write(fd, s.c_str(), s.size()) < s.size()) + { + throw std::runtime_error("unable write to temp file"); + } + if (write(fd, "\n", 1) != 1) + { + throw std::runtime_error("unable write to temp file"); + } + close(fd); + auto config_path = std::string(tmp_file) + ".xml"; + if (std::rename(tmp_file, config_path.c_str())) + { + int err = errno; + remove(tmp_file); + throw std::runtime_error(strerror(err)); + } + ConfigProcessor::LoadedConfig config = loadConfiguration(config_path); + remove(tmp_file); + return config; + } + catch (...) 
+ { + remove(tmp_file); + throw; + } +} + +static Graphite::Params setGraphitePatterns(ContextMutablePtr context, ConfigProcessor::LoadedConfig & config) +{ + context->setConfig(config.configuration); + + Graphite::Params params; + setGraphitePatternsFromConfig(context, "graphite_rollup", params); + + return params; +} + +struct PatternForCheck +{ + Graphite::RuleType rule_type; + std::string regexp_str; + String function; + Graphite::Retentions retentions; +}; + + +bool checkRule(const Graphite::Pattern & pattern, const struct PatternForCheck & pattern_check, + const std::string & typ, const std::string & path, std::string & message) +{ + bool rule_type_eq = (pattern.rule_type == pattern_check.rule_type); + bool regexp_eq = (pattern.regexp_str == pattern_check.regexp_str); + bool function_eq = (pattern.function == nullptr && pattern_check.function.empty()) + || (pattern.function != nullptr && pattern.function->getName() == pattern_check.function); + bool retentions_eq = (pattern.retentions == pattern_check.retentions); + + if (rule_type_eq && regexp_eq && function_eq && retentions_eq) + return true; + + message = typ + " rollup rule mismatch for '" + path + "'," + + (rule_type_eq ? "" : "rule_type ") + + (regexp_eq ? "" : "regexp ") + + (function_eq ? "" : "function ") + + (retentions_eq ? "" : "retentions "); + return false; +} + +std::ostream & operator<<(std::ostream & stream, const PatternForCheck & a) +{ + stream << "{ rule_type = " << ruleTypeStr(a.rule_type); + if (!a.regexp_str.empty()) + stream << ", regexp = '" << a.regexp_str << "'"; + if (!a.function.empty()) + stream << ", function = " << a.function; + if (!a.retentions.empty()) + { + stream << ",\n retentions = {\n"; + for (size_t i = 0; i < a.retentions.size(); i++) + { + stream << " { " << a.retentions[i].age << ", " << a.retentions[i].precision << " }"; + if (i < a.retentions.size() - 1) + stream << ","; + stream << "\n"; + } + stream << " }\n"; + } + else + stream << " "; + + stream << "}"; + return stream; +} + +struct PatternsForPath +{ + std::string path; + PatternForCheck retention_want; + PatternForCheck aggregation_want; +}; + +TEST(GraphiteTest, testSelectPattern) +{ + tryRegisterAggregateFunctions(); + + using namespace std::literals; + + std::string + xml(R"END( + + + \.sum$ + sum + + + ^((.*)|.)sum\? + sum + + + \.max$ + max + + + ^((.*)|.)max\? + max + + + \.min$ + min + + + ^((.*)|.)min\? + min + + + \.(count|sum|sum_sq)$ + sum + + + ^((.*)|.)(count|sum|sum_sq)\? + sum + + + ^retention\. + + 0 + 60 + + + 86400 + 3600 + + + + avg + + 0 + 60 + + + 3600 + 300 + + + 86400 + 3600 + + + + +)END"); + + // Retentions must be ordered by 'age' descending. 
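    // A minimal sketch of how that ordering could be enforced programmatically, assuming
    // Retentions is a std::vector over the { age, precision } struct declared in the
    // Graphite header above (illustrative only, needs <algorithm>):
    //
    //     Graphite::Retentions r{{0, 60}, {86400, 3600}, {3600, 300}};
    //     std::sort(r.begin(), r.end(),
    //               [](const auto & a, const auto & b) { return a.age > b.age; });
    //     // r is now { {86400, 3600}, {3600, 300}, {0, 60} }, the order used in the expectations below.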
+ std::vector tests + { + { + "test.sum", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypeAll, R"END(\.sum$)END", "sum", { } } + }, + { + "val.sum?env=test&tag=Fake3", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypeAll, R"END(^((.*)|.)sum\?)END", "sum", { } } + }, + { + "test.max", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypeAll, R"END(\.max$)END", "max", { } }, + }, + { + "val.max?env=test&tag=Fake4", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypeAll, R"END(^((.*)|.)max\?)END", "max", { } }, + }, + { + "test.min", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypeAll, R"END(\.min$)END", "min", { } }, + }, + { + "val.min?env=test&tag=Fake5", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypeAll, R"END(^((.*)|.)min\?)END", "min", { } }, + }, + { + "retention.count", + { Graphite::RuleTypeAll, R"END(^retention\.)END", "", { { 86400, 3600 }, { 0, 60 } } }, // ^retention + { Graphite::RuleTypeAll, R"END(\.(count|sum|sum_sq)$)END", "sum", { } }, + }, + { + "val.retention.count?env=test&tag=Fake5", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypeAll, R"END(^((.*)|.)(count|sum|sum_sq)\?)END", "sum", { } }, + }, + { + "val.count?env=test&tag=Fake5", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypeAll, R"END(^((.*)|.)(count|sum|sum_sq)\?)END", "sum", { } }, + }, + { + "test.p95", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + }, + { + "val.p95?env=test&tag=FakeNo", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + }, + { + "default", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + }, + { + "val.default?env=test&tag=FakeNo", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + } + }; + + auto config = loadConfigurationFromString(xml); + ContextMutablePtr context = getContext().context; + Graphite::Params params = setGraphitePatterns(context, config); + + for (const auto & t : tests) + { + auto rule = DB::Graphite::selectPatternForPath(params, t.path); + std:: string message; + if (!checkRule(*rule.first, t.retention_want, "retention", t.path, message)) + ADD_FAILURE() << message << ", got\n" << *rule.first << "\n, want\n" << t.retention_want << "\n"; + if (!checkRule(*rule.second, t.aggregation_want, "aggregation", t.path, message)) + ADD_FAILURE() << message << ", got\n" << *rule.second << "\n, want\n" << t.aggregation_want << "\n"; + } +} + + +namespace DB::Graphite +{ + std::string buildTaggedRegex(std::string regexp_str); +} + +struct RegexCheck +{ + 
std::string regex; + std::string regex_want; + std::string match; + std::string nomatch; +}; + +TEST(GraphiteTest, testBuildTaggedRegex) +{ + std::vector tests + { + { + "cpu\\.loadavg;project=DB.*;env=st.*", + R"END(^cpu\.loadavg\?(.*&)?env=st.*&(.*&)?project=DB.*(&.*)?$)END", + R"END(cpu.loadavg?env=staging&project=DBAAS)END", + R"END(cpu.loadavg?env=staging&project=D)END" + }, + { + R"END(project=DB.*;env=staging;)END", + R"END([\?&]env=staging&(.*&)?project=DB.*(&.*)?$)END", + R"END(cpu.loadavg?env=staging&project=DBPG)END", + R"END(cpu.loadavg?env=stagingN&project=DBAAS)END" + }, + { + "env=staging;", + R"END([\?&]env=staging(&.*)?$)END", + R"END(cpu.loadavg?env=staging&project=DPG)END", + R"END(cpu.loadavg?env=stagingN)END" + }, + { + " env = staging ;", // spaces are allowed, + R"END([\?&] env = staging (&.*)?$)END", + R"END(cpu.loadavg? env = staging &project=DPG)END", + R"END(cpu.loadavg?env=stagingN)END" + }, + { + "name;", + R"END(^name\?)END", + R"END(name?env=staging&project=DPG)END", + R"END(nameN?env=stagingN)END", + }, + { + "name", + R"END(^name\?)END", + R"END(name?env=staging&project=DPG)END", + R"END(nameN?env=stagingN)END", + } + }; + for (const auto & t : tests) + { + auto s = DB::Graphite::buildTaggedRegex(t.regex); + EXPECT_EQ(t.regex_want, s) << "result for '" << t.regex_want << "' mismatch"; + auto regexp = OptimizedRegularExpression(s); + EXPECT_TRUE(regexp.match(t.match.data(), t.match.size())) << t.match << " match for '" << s << "' failed"; + EXPECT_FALSE(regexp.match(t.nomatch.data(), t.nomatch.size())) << t.nomatch << " ! match for '" << s << "' failed"; + } +} + +TEST(GraphiteTest, testSelectPatternTyped) +{ + tryRegisterAggregateFunctions(); + + using namespace std::literals; + + std::string + xml(R"END( + + + plain + \.sum$ + sum + + + tagged + ^((.*)|.)sum\? + sum + + + plain + \.max$ + max + + + tagged + ^((.*)|.)max\? + max + + + plain + \.min$ + min + + + tagged + ^((.*)|.)min\? + min + + + plain + \.(count|sum|sum_sq)$ + sum + + + tagged + ^((.*)|.)(count|sum|sum_sq)\? + sum + + + plain + ^retention\. + + 0 + 60 + + + 86400 + 3600 + + + + tagged + + + 0 + 60 + + + 86400 + 3600 + + + + tag_list + retention=10min;env=staging + + 0 + 600 + + + 86400 + 3600 + + + + tag_list + retention=10min;env=[A-Za-z-]+rod[A-Za-z-]+ + + 0 + 600 + + + 86400 + 3600 + + + + tag_list + cpu\.loadavg + + 0 + 600 + + + 86400 + 3600 + + + + avg + + 0 + 60 + + + 3600 + 300 + + + 86400 + 3600 + + + + +)END"); + + // Retentions must be ordered by 'age' descending. 
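    // The tag_list rules above are normalized into tagged regexps by buildTaggedRegex()
    // (covered by testBuildTaggedRegex earlier in this file); for instance the rule
    // 'retention=10min;env=staging' becomes
    //
    //     R"END([\?&]env=staging&(.*&)?retention=10min(&.*)?$)END"
    //
    // which is the retention regexp expected below for 'val.count?env=staging&retention=10min'.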
+ std::vector tests + { + { + "test.sum", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypePlain, R"END(\.sum$)END", "sum", { } } + }, + { + "val.sum?env=test&tag=Fake3", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypeTagged, R"END(^((.*)|.)sum\?)END", "sum", { } } + }, + { + "test.max", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypePlain, R"END(\.max$)END", "max", { } }, + }, + { + "val.max?env=test&tag=Fake4", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypeTagged, R"END(^((.*)|.)max\?)END", "max", { } }, + }, + { + "test.min", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypePlain, R"END(\.min$)END", "min", { } }, + }, + { + "val.min?env=test&tag=Fake5", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypeTagged, R"END(^((.*)|.)min\?)END", "min", { } }, + }, + { + "retention.count", + { Graphite::RuleTypePlain, R"END(^retention\.)END", "", { { 86400, 3600 }, { 0, 60 } } }, // ^retention + { Graphite::RuleTypePlain, R"END(\.(count|sum|sum_sq)$)END", "sum", { } }, + }, + { + "val.count?env=test&retention=hour&tag=Fake5", + { Graphite::RuleTypeTagged, R"END([\?&]retention=hour(&.*)?$)END", "", { { 86400, 3600 }, { 0, 60 } } }, // tagged retention=hour + { Graphite::RuleTypeTagged, R"END(^((.*)|.)(count|sum|sum_sq)\?)END", "sum", { } }, + }, + { + "val.count?env=test&retention=hour", + { Graphite::RuleTypeTagged, R"END([\?&]retention=hour(&.*)?$)END", "", { { 86400, 3600 }, { 0, 60 } } }, // tagged retention=hour + { Graphite::RuleTypeTagged, R"END(^((.*)|.)(count|sum|sum_sq)\?)END", "sum", { } }, + }, + { + "val.count?env=staging&retention=10min", + { Graphite::RuleTypeTagged, R"END([\?&]env=staging&(.*&)?retention=10min(&.*)?$)END", "", { { 86400, 3600 }, { 0, 600 } } }, // retention=10min ; env=staging + { Graphite::RuleTypeTagged, R"END(^((.*)|.)(count|sum|sum_sq)\?)END", "sum", { } }, + }, + { + "val.count?env=production&retention=10min", + { Graphite::RuleTypeTagged, R"END([\?&]env=[A-Za-z-]+rod[A-Za-z-]+&(.*&)?retention=10min(&.*)?$)END", "", { { 86400, 3600 }, { 0, 600 } } }, // retention=10min ; env=[A-Za-z-]+rod[A-Za-z-]+ + { Graphite::RuleTypeTagged, R"END(^((.*)|.)(count|sum|sum_sq)\?)END", "sum", { } }, + }, + { + "val.count?env=test&tag=Fake5", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypeTagged, R"END(^((.*)|.)(count|sum|sum_sq)\?)END", "sum", { } }, + }, + { + "cpu.loadavg?env=test&tag=FakeNo", + { Graphite::RuleTypeTagged, R"END(^cpu\.loadavg\?)END", "", { { 86400, 3600 }, { 0, 600 } } }, // name=cpu\.loadavg + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, + }, + { + "test.p95", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + }, + { + "val.p95?env=test&tag=FakeNo", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + }, + { + "default", + { Graphite::RuleTypeAll, 
"", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + }, + { + "val.default?env=test&tag=FakeNo", + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + { Graphite::RuleTypeAll, "", "avg", { { 86400, 3600 }, { 3600, 300 }, { 0, 60 } } }, //default + } + }; + + auto config = loadConfigurationFromString(xml); + ContextMutablePtr context = getContext().context; + Graphite::Params params = setGraphitePatterns(context, config); + + for (const auto & t : tests) + { + auto rule = DB::Graphite::selectPatternForPath(params, t.path); + std:: string message; + if (!checkRule(*rule.first, t.retention_want, "retention", t.path, message)) + ADD_FAILURE() << message << ", got\n" << *rule.first << "\n, want\n" << t.retention_want << "\n"; + if (!checkRule(*rule.second, t.aggregation_want, "aggregation", t.path, message)) + ADD_FAILURE() << message << ", got\n" << *rule.second << "\n, want\n" << t.aggregation_want << "\n"; + } +} diff --git a/src/Processors/QueryPlan/BuildQueryPipelineSettings.cpp b/src/Processors/QueryPlan/BuildQueryPipelineSettings.cpp index 2480673d65e..fb3ed7f80fc 100644 --- a/src/Processors/QueryPlan/BuildQueryPipelineSettings.cpp +++ b/src/Processors/QueryPlan/BuildQueryPipelineSettings.cpp @@ -6,16 +6,13 @@ namespace DB { -BuildQueryPipelineSettings BuildQueryPipelineSettings::fromSettings(const Settings & from) +BuildQueryPipelineSettings BuildQueryPipelineSettings::fromContext(ContextPtr from) { BuildQueryPipelineSettings settings; - settings.actions_settings = ExpressionActionsSettings::fromSettings(from, CompileExpressions::yes); + settings.actions_settings = ExpressionActionsSettings::fromSettings(from->getSettingsRef(), CompileExpressions::yes); + settings.process_list_element = from->getProcessListElement(); + settings.progress_callback = from->getProgressCallback(); return settings; } -BuildQueryPipelineSettings BuildQueryPipelineSettings::fromContext(ContextPtr from) -{ - return fromSettings(from->getSettingsRef()); -} - } diff --git a/src/Processors/QueryPlan/BuildQueryPipelineSettings.h b/src/Processors/QueryPlan/BuildQueryPipelineSettings.h index c3282d43778..fadbd061fbd 100644 --- a/src/Processors/QueryPlan/BuildQueryPipelineSettings.h +++ b/src/Processors/QueryPlan/BuildQueryPipelineSettings.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -8,14 +9,15 @@ namespace DB { struct Settings; +class QueryStatus; struct BuildQueryPipelineSettings { ExpressionActionsSettings actions_settings; + QueryStatus * process_list_element = nullptr; + ProgressCallback progress_callback = nullptr; const ExpressionActionsSettings & getActionsSettings() const { return actions_settings; } - - static BuildQueryPipelineSettings fromSettings(const Settings & from); static BuildQueryPipelineSettings fromContext(ContextPtr from); }; diff --git a/src/Processors/QueryPlan/QueryPlan.cpp b/src/Processors/QueryPlan/QueryPlan.cpp index f319e562bfb..a271ef78dfa 100644 --- a/src/Processors/QueryPlan/QueryPlan.cpp +++ b/src/Processors/QueryPlan/QueryPlan.cpp @@ -180,6 +180,9 @@ QueryPipelineBuilderPtr QueryPlan::buildQueryPipeline( for (auto & context : interpreter_context) last_pipeline->addInterpreterContext(std::move(context)); + last_pipeline->setProgressCallback(build_pipeline_settings.progress_callback); + last_pipeline->setProcessListElement(build_pipeline_settings.process_list_element); + return last_pipeline; } diff --git 
a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index 826ef084d87..0f56e4ab33f 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -289,8 +289,6 @@ void ReadFromRemote::initializePipeline(QueryPipelineBuilder & pipeline, const B { for (const auto & shard : shards) { - auto coordinator = std::make_shared(); - if (shard.lazy) addLazyPipe(pipes, shard, /*coordinator=*/nullptr, /*pool*/{}, /*replica_info*/std::nullopt); else diff --git a/src/Processors/Sources/ShellCommandSource.cpp b/src/Processors/Sources/ShellCommandSource.cpp new file mode 100644 index 00000000000..dc272ace01e --- /dev/null +++ b/src/Processors/Sources/ShellCommandSource.cpp @@ -0,0 +1,586 @@ +#include + +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int UNSUPPORTED_METHOD; + extern const int TIMEOUT_EXCEEDED; + extern const int CANNOT_FCNTL; + extern const int CANNOT_READ_FROM_FILE_DESCRIPTOR; + extern const int CANNOT_POLL; + extern const int CANNOT_WRITE_TO_FILE_DESCRIPTOR; +} + +static bool tryMakeFdNonBlocking(int fd) +{ + int flags = fcntl(fd, F_GETFL, 0); + if (-1 == flags) + return false; + if (-1 == fcntl(fd, F_SETFL, flags | O_NONBLOCK)) + return false; + + return true; +} + +static void makeFdNonBlocking(int fd) +{ + bool result = tryMakeFdNonBlocking(fd); + if (!result) + throwFromErrno("Cannot set non-blocking mode of pipe", ErrorCodes::CANNOT_FCNTL); +} + +static bool tryMakeFdBlocking(int fd) +{ + int flags = fcntl(fd, F_GETFL, 0); + if (-1 == flags) + return false; + + if (-1 == fcntl(fd, F_SETFL, flags & (~O_NONBLOCK))) + return false; + + return true; +} + +static void makeFdBlocking(int fd) +{ + bool result = tryMakeFdBlocking(fd); + if (!result) + throwFromErrno("Cannot set blocking mode of pipe", ErrorCodes::CANNOT_FCNTL); +} + +static bool pollFd(int fd, size_t timeout_milliseconds, int events) +{ + pollfd pfd; + pfd.fd = fd; + pfd.events = events; + pfd.revents = 0; + + Stopwatch watch; + + int res; + + while (true) + { + res = poll(&pfd, 1, timeout_milliseconds); + + if (res < 0) + { + if (errno == EINTR) + { + watch.stop(); + timeout_milliseconds -= watch.elapsedMilliseconds(); + watch.start(); + + continue; + } + else + { + throwFromErrno("Cannot poll", ErrorCodes::CANNOT_POLL); + } + } + else + { + break; + } + } + + return res > 0; +} + +class TimeoutReadBufferFromFileDescriptor : public BufferWithOwnMemory +{ +public: + explicit TimeoutReadBufferFromFileDescriptor(int fd_, size_t timeout_milliseconds_) + : fd(fd_) + , timeout_milliseconds(timeout_milliseconds_) + { + makeFdNonBlocking(fd); + } + + bool nextImpl() override + { + size_t bytes_read = 0; + + while (!bytes_read) + { + if (!pollFd(fd, timeout_milliseconds, POLLIN)) + throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Pipe read timeout exceeded {} milliseconds", timeout_milliseconds); + + ssize_t res = ::read(fd, internal_buffer.begin(), internal_buffer.size()); + + if (-1 == res && errno != EINTR) + throwFromErrno("Cannot read from pipe ", ErrorCodes::CANNOT_READ_FROM_FILE_DESCRIPTOR); + + if (res == 0) + break; + + if (res > 0) + bytes_read += res; + } + + if (bytes_read > 0) + { + working_buffer = internal_buffer; + working_buffer.resize(bytes_read); + } + else + { + return false; + } + + return true; + } + + void reset() const + { + makeFdBlocking(fd); + } + + ~TimeoutReadBufferFromFileDescriptor() override + { + 
tryMakeFdBlocking(fd); + } + +private: + int fd; + size_t timeout_milliseconds; +}; + +class TimeoutWriteBufferFromFileDescriptor : public BufferWithOwnMemory +{ +public: + explicit TimeoutWriteBufferFromFileDescriptor(int fd_, size_t timeout_milliseconds_) + : fd(fd_) + , timeout_milliseconds(timeout_milliseconds_) + { + makeFdNonBlocking(fd); + } + + void nextImpl() override + { + if (!offset()) + return; + + size_t bytes_written = 0; + + while (bytes_written != offset()) + { + if (!pollFd(fd, timeout_milliseconds, POLLOUT)) + throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Pipe write timeout exceeded {} milliseconds", timeout_milliseconds); + + ssize_t res = ::write(fd, working_buffer.begin() + bytes_written, offset() - bytes_written); + + if ((-1 == res || 0 == res) && errno != EINTR) + throwFromErrno("Cannot write into pipe ", ErrorCodes::CANNOT_WRITE_TO_FILE_DESCRIPTOR); + + if (res > 0) + bytes_written += res; + } + } + + void reset() const + { + makeFdBlocking(fd); + } + + ~TimeoutWriteBufferFromFileDescriptor() override + { + tryMakeFdBlocking(fd); + } + +private: + int fd; + size_t timeout_milliseconds; +}; + +class ShellCommandHolder +{ +public: + using ShellCommandBuilderFunc = std::function()>; + + explicit ShellCommandHolder(ShellCommandBuilderFunc && func_) + : func(std::move(func_)) + {} + + std::unique_ptr buildCommand() + { + if (returned_command) + return std::move(returned_command); + + return func(); + } + + void returnCommand(std::unique_ptr command) + { + returned_command = std::move(command); + } + +private: + std::unique_ptr returned_command; + ShellCommandBuilderFunc func; +}; + +namespace +{ + /** A stream, that get child process and sends data using tasks in background threads. + * For each send data task background thread is created. Send data task must send data to process input pipes. + * ShellCommandPoolSource receives data from process stdout. + * + * If process_pool is passed in constructor then after source is destroyed process is returned to pool. + */ + class ShellCommandSource final : public SourceWithProgress + { + public: + + using SendDataTask = std::function; + + ShellCommandSource( + ContextPtr context_, + const std::string & format_, + size_t command_read_timeout_milliseconds, + const Block & sample_block_, + std::unique_ptr && command_, + std::vector && send_data_tasks = {}, + const ShellCommandSourceConfiguration & configuration_ = {}, + std::unique_ptr && command_holder_ = nullptr, + std::shared_ptr process_pool_ = nullptr) + : SourceWithProgress(sample_block_) + , context(context_) + , format(format_) + , sample_block(sample_block_) + , command(std::move(command_)) + , configuration(configuration_) + , timeout_command_out(command->out.getFD(), command_read_timeout_milliseconds) + , command_holder(std::move(command_holder_)) + , process_pool(process_pool_) + { + for (auto && send_data_task : send_data_tasks) + { + send_data_threads.emplace_back([task = std::move(send_data_task), this]() + { + try + { + task(); + } + catch (...) + { + std::lock_guard lock(send_data_lock); + exception_during_send_data = std::current_exception(); + } + }); + } + + size_t max_block_size = configuration.max_block_size; + + if (configuration.read_fixed_number_of_rows) + { + /** Currently parallel parsing input format cannot read exactly max_block_size rows from input, + * so it will be blocked on ReadBufferFromFileDescriptor because this file descriptor represent pipe that does not have eof. 
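              *
              * (Sketch of the fixed-rows protocol this constructor and generate() below assume
              *  when read_number_of_rows_from_process_output is set: the child process first
              *  writes a header line
              *
              *      <number_of_rows>\n
              *
              *  which is consumed with readText() plus readChar(), followed by exactly that many
              *  rows in 'format'. On the write side, SendingChunkHeaderTransform further down
              *  emits the same kind of header when send_chunk_header is enabled.)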
+ */ + auto context_for_reading = Context::createCopy(context); + context_for_reading->setSetting("input_format_parallel_parsing", false); + context = context_for_reading; + + if (configuration.read_number_of_rows_from_process_output) + { + /// Initialize executor in generate + return; + } + + max_block_size = configuration.number_of_rows_to_read; + } + + pipeline = QueryPipeline(Pipe(context->getInputFormat(format, timeout_command_out, sample_block, max_block_size))); + executor = std::make_unique(pipeline); + } + + ~ShellCommandSource() override + { + for (auto & thread : send_data_threads) + if (thread.joinable()) + thread.join(); + + if (command_is_invalid) + command = nullptr; + + if (command_holder && process_pool) + { + bool valid_command = configuration.read_fixed_number_of_rows && current_read_rows >= configuration.number_of_rows_to_read; + + if (command && valid_command) + command_holder->returnCommand(std::move(command)); + + process_pool->returnObject(std::move(command_holder)); + } + } + + protected: + + Chunk generate() override + { + rethrowExceptionDuringSendDataIfNeeded(); + + Chunk chunk; + + try + { + if (configuration.read_fixed_number_of_rows) + { + if (!executor && configuration.read_number_of_rows_from_process_output) + { + readText(configuration.number_of_rows_to_read, timeout_command_out); + char dummy; + readChar(dummy, timeout_command_out); + + size_t max_block_size = configuration.number_of_rows_to_read; + pipeline = QueryPipeline(Pipe(context->getInputFormat(format, timeout_command_out, sample_block, max_block_size))); + executor = std::make_unique(pipeline); + } + + if (current_read_rows >= configuration.number_of_rows_to_read) + return {}; + } + + if (!executor->pull(chunk)) + return {}; + + current_read_rows += chunk.getNumRows(); + } + catch (...) 
+ { + command_is_invalid = true; + throw; + } + + return chunk; + } + + Status prepare() override + { + auto status = SourceWithProgress::prepare(); + + if (status == Status::Finished) + { + for (auto & thread : send_data_threads) + if (thread.joinable()) + thread.join(); + + rethrowExceptionDuringSendDataIfNeeded(); + } + + return status; + } + + String getName() const override { return "ShellCommandSource"; } + + private: + + void rethrowExceptionDuringSendDataIfNeeded() + { + std::lock_guard lock(send_data_lock); + if (exception_during_send_data) + { + command_is_invalid = true; + std::rethrow_exception(exception_during_send_data); + } + } + + ContextPtr context; + std::string format; + Block sample_block; + + std::unique_ptr command; + ShellCommandSourceConfiguration configuration; + + TimeoutReadBufferFromFileDescriptor timeout_command_out; + + size_t current_read_rows = 0; + + ShellCommandHolderPtr command_holder; + std::shared_ptr process_pool; + + QueryPipeline pipeline; + std::unique_ptr executor; + + std::vector send_data_threads; + + std::mutex send_data_lock; + std::exception_ptr exception_during_send_data; + + std::atomic command_is_invalid {false}; + }; + + class SendingChunkHeaderTransform final : public ISimpleTransform + { + public: + SendingChunkHeaderTransform(const Block & header, std::shared_ptr buffer_) + : ISimpleTransform(header, header, false) + , buffer(buffer_) + { + } + + String getName() const override { return "SendingChunkHeaderTransform"; } + + protected: + + void transform(Chunk & chunk) override + { + writeText(chunk.getNumRows(), *buffer); + writeChar('\n', *buffer); + } + + private: + std::shared_ptr buffer; + }; + +} + +ShellCommandSourceCoordinator::ShellCommandSourceCoordinator(const Configuration & configuration_) + : configuration(configuration_) +{ + if (configuration.is_executable_pool) + process_pool = std::make_shared(configuration.pool_size ? 
configuration.pool_size : std::numeric_limits::max()); +} + +Pipe ShellCommandSourceCoordinator::createPipe( + const std::string & command, + const std::vector & arguments, + std::vector && input_pipes, + Block sample_block, + ContextPtr context, + const ShellCommandSourceConfiguration & source_configuration) +{ + ShellCommand::Config command_config(command); + command_config.arguments = arguments; + for (size_t i = 1; i < input_pipes.size(); ++i) + command_config.write_fds.emplace_back(i + 2); + + std::unique_ptr process; + std::unique_ptr process_holder; + + auto destructor_strategy = ShellCommand::DestructorStrategy{true /*terminate_in_destructor*/, configuration.command_termination_timeout_seconds}; + command_config.terminate_in_destructor_strategy = destructor_strategy; + + bool is_executable_pool = (process_pool != nullptr); + if (is_executable_pool) + { + bool execute_direct = configuration.execute_direct; + + bool result = process_pool->tryBorrowObject( + process_holder, + [command_config, execute_direct]() + { + ShellCommandHolder::ShellCommandBuilderFunc func = [command_config, execute_direct]() mutable + { + if (execute_direct) + return ShellCommand::executeDirect(command_config); + else + return ShellCommand::execute(command_config); + }; + + return std::make_unique(std::move(func)); + }, + configuration.max_command_execution_time_seconds * 10000); + + if (!result) + throw Exception( + ErrorCodes::TIMEOUT_EXCEEDED, + "Could not get process from pool, max command execution timeout exceeded {} seconds", + configuration.max_command_execution_time_seconds); + + process = process_holder->buildCommand(); + } + else + { + if (configuration.execute_direct) + process = ShellCommand::executeDirect(command_config); + else + process = ShellCommand::execute(command_config); + } + + std::vector tasks; + tasks.reserve(input_pipes.size()); + + for (size_t i = 0; i < input_pipes.size(); ++i) + { + WriteBufferFromFile * write_buffer = nullptr; + + if (i == 0) + { + write_buffer = &process->in; + } + else + { + auto descriptor = i + 2; + auto it = process->write_fds.find(descriptor); + if (it == process->write_fds.end()) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Process does not contain descriptor to write {}", descriptor); + + write_buffer = &it->second; + } + + int write_buffer_fd = write_buffer->getFD(); + auto timeout_write_buffer = std::make_shared(write_buffer_fd, configuration.command_write_timeout_milliseconds); + + input_pipes[i].resize(1); + + if (configuration.send_chunk_header) + { + auto transform = std::make_shared(input_pipes[i].getHeader(), timeout_write_buffer); + input_pipes[i].addTransform(std::move(transform)); + } + + auto pipeline = std::make_shared(std::move(input_pipes[i])); + auto out = context->getOutputFormat(configuration.format, *timeout_write_buffer, materializeBlock(pipeline->getHeader())); + out->setAutoFlush(); + pipeline->complete(std::move(out)); + + ShellCommandSource::SendDataTask task = [pipeline, timeout_write_buffer, write_buffer, is_executable_pool]() + { + CompletedPipelineExecutor executor(*pipeline); + executor.execute(); + + if (!is_executable_pool) + { + timeout_write_buffer->next(); + timeout_write_buffer->reset(); + + write_buffer->close(); + } + }; + + tasks.emplace_back(std::move(task)); + } + + auto source = std::make_unique( + context, + configuration.format, + configuration.command_read_timeout_milliseconds, + std::move(sample_block), + std::move(process), + std::move(tasks), + source_configuration, + std::move(process_holder), + 
process_pool); + auto pipe = Pipe(std::move(source)); + + return pipe; +} + +} diff --git a/src/Processors/Sources/ShellCommandSource.h b/src/Processors/Sources/ShellCommandSource.h index 4974c33f290..649c713afcb 100644 --- a/src/Processors/Sources/ShellCommandSource.h +++ b/src/Processors/Sources/ShellCommandSource.h @@ -19,14 +19,10 @@ namespace DB { -/** A stream, that get child process and sends data using tasks in background threads. - * For each send data task background thread is created. Send data task must send data to process input pipes. - * ShellCommandPoolSource receives data from process stdout. - * - * If process_pool is passed in constructor then after source is destroyed process is returned to pool. - */ +class ShellCommandHolder; +using ShellCommandHolderPtr = std::unique_ptr; -using ProcessPool = BorrowedObjectPool>; +using ProcessPool = BorrowedObjectPool; struct ShellCommandSourceConfiguration { @@ -37,148 +33,92 @@ struct ShellCommandSourceConfiguration /// Valid only if read_fixed_number_of_rows = true size_t number_of_rows_to_read = 0; /// Max block size - size_t max_block_size = DBMS_DEFAULT_BUFFER_SIZE; + size_t max_block_size = DEFAULT_BLOCK_SIZE; }; -class ShellCommandSource final : public SourceWithProgress +class ShellCommandSourceCoordinator { public: - using SendDataTask = std::function; + struct Configuration + { - ShellCommandSource( + /// Script output format + std::string format; + + /// Command termination timeout in seconds + size_t command_termination_timeout_seconds = 10; + + /// Timeout for reading data from command stdout + size_t command_read_timeout_milliseconds = 10000; + + /// Timeout for writing data to command stdin + size_t command_write_timeout_milliseconds = 10000; + + /// Pool size valid only if executable_pool = true + size_t pool_size = 16; + + /// Max command execution time in milliseconds. Valid only if executable_pool = true + size_t max_command_execution_time_seconds = 10; + + /// Should pool of processes be created. + bool is_executable_pool = false; + + /// Send number_of_rows\n before sending chunk to process. + bool send_chunk_header = false; + + /// Execute script direct or with /bin/bash. + bool execute_direct = true; + + }; + + explicit ShellCommandSourceCoordinator(const Configuration & configuration_); + + const Configuration & getConfiguration() const + { + return configuration; + } + + Pipe createPipe( + const std::string & command, + const std::vector & arguments, + std::vector && input_pipes, + Block sample_block, ContextPtr context, - const std::string & format, - const Block & sample_block, - std::unique_ptr && command_, - std::vector && send_data_tasks = {}, - const ShellCommandSourceConfiguration & configuration_ = {}, - std::shared_ptr process_pool_ = nullptr) - : SourceWithProgress(sample_block) - , command(std::move(command_)) - , configuration(configuration_) - , process_pool(process_pool_) + const ShellCommandSourceConfiguration & source_configuration = {}); + + Pipe createPipe( + const std::string & command, + std::vector && input_pipes, + Block sample_block, + ContextPtr context, + const ShellCommandSourceConfiguration & source_configuration = {}) { - for (auto && send_data_task : send_data_tasks) - { - send_data_threads.emplace_back([task = std::move(send_data_task), this]() - { - try - { - task(); - } - catch (...) 
- { - std::lock_guard lock(send_data_lock); - exception_during_send_data = std::current_exception(); - } - }); - } - - size_t max_block_size = configuration.max_block_size; - - if (configuration.read_fixed_number_of_rows) - { - /** Currently parallel parsing input format cannot read exactly max_block_size rows from input, - * so it will be blocked on ReadBufferFromFileDescriptor because this file descriptor represent pipe that does not have eof. - */ - auto context_for_reading = Context::createCopy(context); - context_for_reading->setSetting("input_format_parallel_parsing", false); - context = context_for_reading; - - if (configuration.read_number_of_rows_from_process_output) - { - readText(configuration.number_of_rows_to_read, command->out); - char dummy; - readChar(dummy, command->out); - } - - max_block_size = configuration.number_of_rows_to_read; - } - - pipeline = QueryPipeline(Pipe(context->getInputFormat(format, command->out, sample_block, max_block_size))); - executor = std::make_unique(pipeline); + return createPipe(command, {}, std::move(input_pipes), std::move(sample_block), std::move(context), source_configuration); } - ~ShellCommandSource() override + Pipe createPipe( + const std::string & command, + const std::vector & arguments, + Block sample_block, + ContextPtr context) { - for (auto & thread : send_data_threads) - if (thread.joinable()) - thread.join(); - - if (command && process_pool) - process_pool->returnObject(std::move(command)); + return createPipe(command, arguments, {}, std::move(sample_block), std::move(context), {}); } -protected: - - Chunk generate() override + Pipe createPipe( + const std::string & command, + Block sample_block, + ContextPtr context) { - rethrowExceptionDuringSendDataIfNeeded(); - - if (configuration.read_fixed_number_of_rows && configuration.number_of_rows_to_read == current_read_rows) - return {}; - - Chunk chunk; - - try - { - if (!executor->pull(chunk)) - return {}; - - current_read_rows += chunk.getNumRows(); - } - catch (...) 
- { - command = nullptr; - throw; - } - - return chunk; + return createPipe(command, {}, {}, std::move(sample_block), std::move(context), {}); } - Status prepare() override - { - auto status = SourceWithProgress::prepare(); - - if (status == Status::Finished) - { - for (auto & thread : send_data_threads) - if (thread.joinable()) - thread.join(); - - rethrowExceptionDuringSendDataIfNeeded(); - } - - return status; - } - - String getName() const override { return "ShellCommandSource"; } - private: - void rethrowExceptionDuringSendDataIfNeeded() - { - std::lock_guard lock(send_data_lock); - if (exception_during_send_data) - { - command = nullptr; - std::rethrow_exception(exception_during_send_data); - } - } + Configuration configuration; - std::unique_ptr command; - ShellCommandSourceConfiguration configuration; - - size_t current_read_rows = 0; - - std::shared_ptr process_pool; - - QueryPipeline pipeline; - std::unique_ptr executor; - - std::vector send_data_threads; - std::mutex send_data_lock; - std::exception_ptr exception_during_send_data; + std::shared_ptr process_pool = nullptr; }; + } diff --git a/src/Processors/Sources/SourceWithProgress.cpp b/src/Processors/Sources/SourceWithProgress.cpp index 9b7a5c6a762..60c39c919f6 100644 --- a/src/Processors/Sources/SourceWithProgress.cpp +++ b/src/Processors/Sources/SourceWithProgress.cpp @@ -26,6 +26,8 @@ SourceWithProgress::SourceWithProgress(Block header, bool enable_auto_progress) void SourceWithProgress::setProcessListElement(QueryStatus * elem) { process_list_elem = elem; + if (!elem) + return; /// Update total_rows_approx as soon as possible. /// diff --git a/src/Processors/Transforms/CountingTransform.cpp b/src/Processors/Transforms/CountingTransform.cpp index 79b6360f22e..eb191b36586 100644 --- a/src/Processors/Transforms/CountingTransform.cpp +++ b/src/Processors/Transforms/CountingTransform.cpp @@ -18,20 +18,21 @@ namespace DB void CountingTransform::onConsume(Chunk chunk) { - Progress local_progress(chunk.getNumRows(), chunk.bytes(), 0); + Progress local_progress{WriteProgress(chunk.getNumRows(), chunk.bytes())}; progress.incrementPiecewiseAtomically(local_progress); //std::cerr << "============ counting adding progress for " << static_cast(thread_status) << ' ' << chunk.getNumRows() << " rows\n"; if (thread_status) { - thread_status->performance_counters.increment(ProfileEvents::InsertedRows, local_progress.read_rows); - thread_status->performance_counters.increment(ProfileEvents::InsertedBytes, local_progress.read_bytes); + thread_status->performance_counters.increment(ProfileEvents::InsertedRows, local_progress.written_rows); + thread_status->performance_counters.increment(ProfileEvents::InsertedBytes, local_progress.written_bytes); + thread_status->progress_out.incrementPiecewiseAtomically(local_progress); } else { - ProfileEvents::increment(ProfileEvents::InsertedRows, local_progress.read_rows); - ProfileEvents::increment(ProfileEvents::InsertedBytes, local_progress.read_bytes); + ProfileEvents::increment(ProfileEvents::InsertedRows, local_progress.written_rows); + ProfileEvents::increment(ProfileEvents::InsertedBytes, local_progress.written_bytes); } if (process_elem) diff --git a/src/Processors/Transforms/TTLTransform.cpp b/src/Processors/Transforms/TTLTransform.cpp index 7d0da3dca91..e79dcb34c41 100644 --- a/src/Processors/Transforms/TTLTransform.cpp +++ b/src/Processors/Transforms/TTLTransform.cpp @@ -139,8 +139,10 @@ void TTLTransform::finalize() if (delete_algorithm) { - size_t rows_removed = all_data_dropped ? 
data_part->rows_count : delete_algorithm->getNumberOfRemovedRows(); - LOG_DEBUG(log, "Removed {} rows with expired TTL from part {}", rows_removed, data_part->name); + if (all_data_dropped) + LOG_DEBUG(log, "Removed all rows from part {} due to expired TTL", data_part->name); + else + LOG_DEBUG(log, "Removed {} rows with expired TTL from part {}", delete_algorithm->getNumberOfRemovedRows(), data_part->name); } } diff --git a/src/Processors/Transforms/buildPushingToViewsChain.cpp b/src/Processors/Transforms/buildPushingToViewsChain.cpp index 82c2a337a45..17075e2b318 100644 --- a/src/Processors/Transforms/buildPushingToViewsChain.cpp +++ b/src/Processors/Transforms/buildPushingToViewsChain.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -14,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +25,12 @@ #include #include +namespace ProfileEvents +{ + extern const Event SelectedBytes; + extern const Event SelectedRows; +} + namespace DB { @@ -451,13 +459,6 @@ static QueryPipeline process(Block block, ViewRuntimeData & view, const ViewsDat pipeline.getHeader(), std::make_shared(std::move(converting)))); - pipeline.setProgressCallback([context](const Progress & progress) - { - CurrentThread::updateProgressIn(progress); - if (auto callback = context->getProgressCallback()) - callback(progress); - }); - return QueryPipelineBuilder::getPipeline(std::move(pipeline)); } @@ -595,7 +596,11 @@ void PushingToLiveViewSink::consume(Chunk chunk) { Progress local_progress(chunk.getNumRows(), chunk.bytes(), 0); StorageLiveView::writeIntoLiveView(live_view, getHeader().cloneWithColumns(chunk.detachColumns()), context); - CurrentThread::updateProgressIn(local_progress); + auto * process = context->getProcessListElement(); + if (process) + process->updateProgressIn(local_progress); + ProfileEvents::increment(ProfileEvents::SelectedRows, local_progress.read_rows); + ProfileEvents::increment(ProfileEvents::SelectedBytes, local_progress.read_bytes); } @@ -614,7 +619,11 @@ void PushingToWindowViewSink::consume(Chunk chunk) Progress local_progress(chunk.getNumRows(), chunk.bytes(), 0); StorageWindowView::writeIntoWindowView( window_view, getHeader().cloneWithColumns(chunk.detachColumns()), context); - CurrentThread::updateProgressIn(local_progress); + auto * process = context->getProcessListElement(); + if (process) + process->updateProgressIn(local_progress); + ProfileEvents::increment(ProfileEvents::SelectedRows, local_progress.read_rows); + ProfileEvents::increment(ProfileEvents::SelectedBytes, local_progress.read_bytes); } diff --git a/src/Processors/tests/gtest_exception_on_incorrect_pipeline.cpp b/src/Processors/tests/gtest_exception_on_incorrect_pipeline.cpp index df3901e2eb1..ee661b39fac 100644 --- a/src/Processors/tests/gtest_exception_on_incorrect_pipeline.cpp +++ b/src/Processors/tests/gtest_exception_on_incorrect_pipeline.cpp @@ -27,7 +27,8 @@ TEST(Processors, PortsConnected) processors.emplace_back(std::move(source)); processors.emplace_back(std::move(sink)); - PipelineExecutor executor(processors); + QueryStatus * element = nullptr; + PipelineExecutor executor(processors, element); executor.execute(1); } @@ -51,7 +52,8 @@ TEST(Processors, PortsNotConnected) try { - PipelineExecutor executor(processors); + QueryStatus * element = nullptr; + PipelineExecutor executor(processors, element); executor.execute(1); ASSERT_TRUE(false) << "Should have thrown."; } diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp 
b/src/QueryPipeline/QueryPipelineBuilder.cpp index 40c64046560..dba7c7cb8f7 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -560,6 +560,7 @@ QueryPipeline QueryPipelineBuilder::getPipeline(QueryPipelineBuilder builder) { QueryPipeline res(std::move(builder.pipe)); res.setNumThreads(builder.getNumThreads()); + res.setProcessListElement(builder.process_list_element); return res; } diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index 673edfb6719..3fdbbf7ba2b 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -103,7 +103,6 @@ namespace ErrorCodes extern const int REQUIRED_PASSWORD; extern const int AUTHENTICATION_FAILED; - extern const int BAD_REQUEST_PARAMETER; extern const int INVALID_SESSION_TIMEOUT; extern const int HTTP_LENGTH_REQUIRED; } @@ -498,12 +497,9 @@ void HTTPHandler::processQuery( { std::string opentelemetry_traceparent = request.get("traceparent"); std::string error; - if (!client_info.client_trace_context.parseTraceparentHeader( - opentelemetry_traceparent, error)) + if (!client_info.client_trace_context.parseTraceparentHeader(opentelemetry_traceparent, error)) { - throw Exception(ErrorCodes::BAD_REQUEST_PARAMETER, - "Failed to parse OpenTelemetry traceparent header '{}': {}", - opentelemetry_traceparent, error); + LOG_DEBUG(log, "Failed to parse OpenTelemetry traceparent header '{}': {}", opentelemetry_traceparent, error); } client_info.client_trace_context.tracestate = request.get("tracestate", ""); } diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 6b4f77dd7d0..6fa2b25d181 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -325,7 +325,7 @@ void TCPHandler::runImpl() if (state.is_cancelled) return std::nullopt; - sendMergeTreeReadTaskRequstAssumeLocked(std::move(request)); + sendMergeTreeReadTaskRequestAssumeLocked(std::move(request)); return receivePartitionMergeTreeReadTaskResponseAssumeLocked(); }); @@ -805,7 +805,7 @@ void TCPHandler::sendReadTaskRequestAssumeLocked() } -void TCPHandler::sendMergeTreeReadTaskRequstAssumeLocked(PartitionReadRequest request) +void TCPHandler::sendMergeTreeReadTaskRequestAssumeLocked(PartitionReadRequest request) { writeVarUInt(Protocol::Server::MergeTreeReadTaskRequest, *out); request.serialize(*out); diff --git a/src/Server/TCPHandler.h b/src/Server/TCPHandler.h index 4c4aeb0d913..6afda654e6a 100644 --- a/src/Server/TCPHandler.h +++ b/src/Server/TCPHandler.h @@ -239,7 +239,7 @@ private: void sendEndOfStream(); void sendPartUUIDs(); void sendReadTaskRequestAssumeLocked(); - void sendMergeTreeReadTaskRequstAssumeLocked(PartitionReadRequest request); + void sendMergeTreeReadTaskRequestAssumeLocked(PartitionReadRequest request); void sendProfileInfo(const ProfileInfo & info); void sendTotals(const Block & totals); void sendExtremes(const Block & extremes); diff --git a/src/Storages/ExecutableSettings.h b/src/Storages/ExecutableSettings.h index 9c0cfc05fa5..c6c1f0b9eb2 100644 --- a/src/Storages/ExecutableSettings.h +++ b/src/Storages/ExecutableSettings.h @@ -9,16 +9,23 @@ namespace DB class ASTStorage; #define LIST_OF_EXECUTABLE_SETTINGS(M) \ - M(UInt64, send_chunk_header, false, "Send number_of_rows\n before sending chunk to process", 0) \ - M(UInt64, pool_size, 16, "Processes pool size. If size == 0, then there is no size restrictions", 0) \ + M(Bool, send_chunk_header, false, "Send number_of_rows\n before sending chunk to process.", 0) \ + M(UInt64, pool_size, 16, "Processes pool size. 
If size == 0, then there is no size restrictions.", 0) \ M(UInt64, max_command_execution_time, 10, "Max command execution time in seconds.", 0) \ M(UInt64, command_termination_timeout, 10, "Command termination timeout in seconds.", 0) \ + M(UInt64, command_read_timeout, 10000, "Timeout for reading data from command stdout in milliseconds.", 0) \ + M(UInt64, command_write_timeout, 10000, "Timeout for writing data to command stdin in milliseconds.", 0) DECLARE_SETTINGS_TRAITS(ExecutableSettingsTraits, LIST_OF_EXECUTABLE_SETTINGS) /// Settings for ExecutablePool engine. struct ExecutableSettings : public BaseSettings { + std::string script_name; + std::vector script_arguments; + + bool is_executable_pool = false; + void loadFromQuery(ASTStorage & storage_def); }; diff --git a/src/Storages/ExternalDataSourceConfiguration.cpp b/src/Storages/ExternalDataSourceConfiguration.cpp index 42b3b148551..2d4b05c51b5 100644 --- a/src/Storages/ExternalDataSourceConfiguration.cpp +++ b/src/Storages/ExternalDataSourceConfiguration.cpp @@ -15,6 +15,11 @@ #if USE_RDKAFKA #include #endif +#if USE_MYSQL +#include +#endif + +#include namespace DB { @@ -24,6 +29,31 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } +IMPLEMENT_SETTINGS_TRAITS(EmptySettingsTraits, EMPTY_SETTINGS) + +static const std::unordered_set dictionary_allowed_keys = { + "host", "port", "user", "password", "db", + "database", "table", "schema", "replica", + "update_field", "update_tag", "invalidate_query", "query", + "where", "name", "secure", "uri", "collection"}; + + +template +SettingsChanges getSettingsChangesFromConfig( + const BaseSettings & settings, const Poco::Util::AbstractConfiguration & config, const String & config_prefix) +{ + SettingsChanges config_settings; + for (const auto & setting : settings.all()) + { + const auto & setting_name = setting.getName(); + auto setting_value = config.getString(config_prefix + '.' 
+ setting_name, ""); + if (!setting_value.empty()) + config_settings.emplace_back(setting_name, setting_value); + } + return config_settings; +} + + String ExternalDataSourceConfiguration::toString() const { WriteBufferFromOwnString configuration_info; @@ -59,7 +89,9 @@ void ExternalDataSourceConfiguration::set(const ExternalDataSourceConfiguration } -std::optional getExternalDataSourceConfiguration(const ASTs & args, ContextPtr context, bool is_database_engine, bool throw_on_no_collection) +template +std::optional getExternalDataSourceConfiguration( + const ASTs & args, ContextPtr context, bool is_database_engine, bool throw_on_no_collection, const BaseSettings & storage_settings) { if (args.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "External data source must have arguments"); @@ -82,6 +114,8 @@ std::optional getExternalDataSourceConfiguration(const throw Exception(ErrorCodes::BAD_ARGUMENTS, "There is no collection named `{}` in config", collection->name()); } + SettingsChanges config_settings = getSettingsChangesFromConfig(storage_settings, config, collection_prefix); + configuration.host = config.getString(collection_prefix + ".host", ""); configuration.port = config.getInt(collection_prefix + ".port", 0); configuration.username = config.getString(collection_prefix + ".user", ""); @@ -123,6 +157,7 @@ std::optional getExternalDataSourceConfiguration(const if (arg_value_literal) { auto arg_value = arg_value_literal->value; + if (arg_name == "host") configuration.host = arg_value.safeGet(); else if (arg_name == "port") @@ -139,6 +174,8 @@ std::optional getExternalDataSourceConfiguration(const configuration.schema = arg_value.safeGet(); else if (arg_name == "addresses_expr") configuration.addresses_expr = arg_value.safeGet(); + else if (storage_settings.has(arg_name)) + config_settings.emplace_back(arg_name, arg_value); else non_common_args.emplace_back(std::make_pair(arg_name, arg_value_ast)); } @@ -153,16 +190,29 @@ std::optional getExternalDataSourceConfiguration(const } } - ExternalDataSourceConfig source_config{ .configuration = configuration, .specific_args = non_common_args }; - return source_config; + return ExternalDataSourceInfo{ .configuration = configuration, .specific_args = non_common_args, .settings_changes = config_settings }; } return std::nullopt; } - -std::optional getExternalDataSourceConfiguration( - const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, ContextPtr context) +static void validateConfigKeys( + const Poco::Util::AbstractConfiguration & dict_config, const String & config_prefix, HasConfigKeyFunc has_config_key_func) { + Poco::Util::AbstractConfiguration::Keys config_keys; + dict_config.keys(config_prefix, config_keys); + for (const auto & config_key : config_keys) + { + if (!has_config_key_func(config_key)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected key `{}` in dictionary source configuration", config_key); + } +} + +template +std::optional getExternalDataSourceConfiguration( + const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, + ContextPtr context, HasConfigKeyFunc has_config_key, const BaseSettings & settings) +{ + validateConfigKeys(dict_config, dict_config_prefix, has_config_key); ExternalDataSourceConfiguration configuration; auto collection_name = dict_config.getString(dict_config_prefix + ".name", ""); @@ -170,6 +220,11 @@ std::optional getExternalDataSourceConfiguratio { const auto & config = context->getConfigRef(); const auto & collection_prefix = 
fmt::format("named_collections.{}", collection_name); + validateConfigKeys(dict_config, collection_prefix, has_config_key); + auto config_settings = getSettingsChangesFromConfig(settings, config, collection_prefix); + auto dict_settings = getSettingsChangesFromConfig(settings, dict_config, dict_config_prefix); + /// dictionary config settings override collection settings. + config_settings.insert(config_settings.end(), dict_settings.begin(), dict_settings.end()); if (!config.has(collection_prefix)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "There is no collection named `{}` in config", collection_name); @@ -178,30 +233,32 @@ std::optional getExternalDataSourceConfiguratio configuration.port = dict_config.getInt(dict_config_prefix + ".port", config.getUInt(collection_prefix + ".port", 0)); configuration.username = dict_config.getString(dict_config_prefix + ".user", config.getString(collection_prefix + ".user", "")); configuration.password = dict_config.getString(dict_config_prefix + ".password", config.getString(collection_prefix + ".password", "")); - configuration.database = dict_config.getString(dict_config_prefix + ".db", config.getString(collection_prefix + ".database", "")); + configuration.database = dict_config.getString(dict_config_prefix + ".db", config.getString(dict_config_prefix + ".database", + config.getString(collection_prefix + ".db", config.getString(collection_prefix + ".database", "")))); configuration.table = dict_config.getString(dict_config_prefix + ".table", config.getString(collection_prefix + ".table", "")); configuration.schema = dict_config.getString(dict_config_prefix + ".schema", config.getString(collection_prefix + ".schema", "")); if (configuration.host.empty() || configuration.port == 0 || configuration.username.empty() || configuration.table.empty()) { throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Named collection of connection parameters is missing some of the parameters and dictionary parameters are added"); + "Named collection of connection parameters is missing some of the parameters and dictionary parameters are not added"); } - return configuration; + return ExternalDataSourceInfo{ .configuration = configuration, .specific_args = {}, .settings_changes = config_settings }; } return std::nullopt; } ExternalDataSourcesByPriority getExternalDataSourceConfigurationByPriority( - const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, ContextPtr context) + const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, ContextPtr context, HasConfigKeyFunc has_config_key) { + validateConfigKeys(dict_config, dict_config_prefix, has_config_key); ExternalDataSourceConfiguration common_configuration; - auto named_collection = getExternalDataSourceConfiguration(dict_config, dict_config_prefix, context); + auto named_collection = getExternalDataSourceConfiguration(dict_config, dict_config_prefix, context, has_config_key); if (named_collection) { - common_configuration = *named_collection; + common_configuration = named_collection->configuration; } else { @@ -209,7 +266,7 @@ ExternalDataSourcesByPriority getExternalDataSourceConfigurationByPriority( common_configuration.port = dict_config.getUInt(dict_config_prefix + ".port", 0); common_configuration.username = dict_config.getString(dict_config_prefix + ".user", ""); common_configuration.password = dict_config.getString(dict_config_prefix + ".password", ""); - common_configuration.database = dict_config.getString(dict_config_prefix + ".db", ""); + 
common_configuration.database = dict_config.getString(dict_config_prefix + ".db", dict_config.getString(dict_config_prefix + ".database", "")); common_configuration.table = dict_config.getString(fmt::format("{}.table", dict_config_prefix), ""); common_configuration.schema = dict_config.getString(fmt::format("{}.schema", dict_config_prefix), ""); } @@ -233,8 +290,9 @@ ExternalDataSourcesByPriority getExternalDataSourceConfigurationByPriority( { ExternalDataSourceConfiguration replica_configuration(common_configuration); String replica_name = dict_config_prefix + "." + config_key; - size_t priority = dict_config.getInt(replica_name + ".priority", 0); + validateConfigKeys(dict_config, replica_name, has_config_key); + size_t priority = dict_config.getInt(replica_name + ".priority", 0); replica_configuration.host = dict_config.getString(replica_name + ".host", common_configuration.host); replica_configuration.port = dict_config.getUInt(replica_name + ".port", common_configuration.port); replica_configuration.username = dict_config.getString(replica_name + ".user", common_configuration.username); @@ -366,6 +424,7 @@ std::optional getURLBasedDataSourceConfiguration(const return std::nullopt; } + template bool getExternalDataSourceConfiguration(const ASTs & args, BaseSettings & settings, ContextPtr context) { @@ -380,14 +439,7 @@ bool getExternalDataSourceConfiguration(const ASTs & args, BaseSettings & set if (!config.has(config_prefix)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "There is no collection named `{}` in config", collection->name()); - SettingsChanges config_settings; - for (const auto & setting : settings.all()) - { - const auto & setting_name = setting.getName(); - auto setting_value = config.getString(config_prefix + '.' + setting_name, ""); - if (!setting_value.empty()) - config_settings.emplace_back(setting_name, setting_value); - } + auto config_settings = getSettingsChangesFromConfig(settings, config, config_prefix); /// Check key-value arguments. 
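    /// For illustration (hypothetical collection and setting names): an engine declared as
    ///     ENGINE = Kafka(kafka1, kafka_format = 'JSONEachRow')
    /// passes the collection name as args[0]; the key = value arguments that follow are what
    /// the loop below inspects, overriding or extending what the named collection provides.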
for (size_t i = 1; i < args.size(); ++i) @@ -425,4 +477,32 @@ bool getExternalDataSourceConfiguration(const ASTs & args, BaseSettings & settings, ContextPtr context); #endif + +template +std::optional getExternalDataSourceConfiguration( + const ASTs & args, ContextPtr context, bool is_database_engine, bool throw_on_no_collection, const BaseSettings & storage_settings); + +template +std::optional getExternalDataSourceConfiguration( + const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, + ContextPtr context, HasConfigKeyFunc has_config_key, const BaseSettings & settings); + +template +SettingsChanges getSettingsChangesFromConfig( + const BaseSettings & settings, const Poco::Util::AbstractConfiguration & config, const String & config_prefix); + +#if USE_MYSQL +template +std::optional getExternalDataSourceConfiguration( + const ASTs & args, ContextPtr context, bool is_database_engine, bool throw_on_no_collection, const BaseSettings & storage_settings); + +template +std::optional getExternalDataSourceConfiguration( + const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, + ContextPtr context, HasConfigKeyFunc has_config_key, const BaseSettings & settings); + +template +SettingsChanges getSettingsChangesFromConfig( + const BaseSettings & settings, const Poco::Util::AbstractConfiguration & config, const String & config_prefix); +#endif } diff --git a/src/Storages/ExternalDataSourceConfiguration.h b/src/Storages/ExternalDataSourceConfiguration.h index 502f8b800e3..926ad64b515 100644 --- a/src/Storages/ExternalDataSourceConfiguration.h +++ b/src/Storages/ExternalDataSourceConfiguration.h @@ -7,6 +7,11 @@ namespace DB { +#define EMPTY_SETTINGS(M) +DECLARE_SETTINGS_TRAITS(EmptySettingsTraits, EMPTY_SETTINGS) + +struct EmptySettings : public BaseSettings {}; + struct ExternalDataSourceConfiguration { String host; @@ -46,10 +51,11 @@ struct StorageMongoDBConfiguration : ExternalDataSourceConfiguration using StorageSpecificArgs = std::vector>; -struct ExternalDataSourceConfig +struct ExternalDataSourceInfo { ExternalDataSourceConfiguration configuration; StorageSpecificArgs specific_args; + SettingsChanges settings_changes; }; /* If there is a storage engine's configuration specified in the named_collections, @@ -62,10 +68,16 @@ struct ExternalDataSourceConfig * Any key-value engine argument except common (`host`, `port`, `username`, `password`, `database`) * is returned in EngineArgs struct. */ -std::optional getExternalDataSourceConfiguration(const ASTs & args, ContextPtr context, bool is_database_engine = false, bool throw_on_no_collection = true); +template +std::optional getExternalDataSourceConfiguration( + const ASTs & args, ContextPtr context, bool is_database_engine = false, bool throw_on_no_collection = true, const BaseSettings & storage_settings = {}); -std::optional getExternalDataSourceConfiguration( - const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, ContextPtr context); +using HasConfigKeyFunc = std::function; + +template +std::optional getExternalDataSourceConfiguration( + const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, + ContextPtr context, HasConfigKeyFunc has_config_key, const BaseSettings & settings = {}); /// Highest priority is 0, the bigger the number in map, the less the priority. 
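A hedged usage sketch for the templated helper declared above. It is written as comments because it is not compilable on its own: the storage class, engine_args and context are assumed, and only the helper's parameters and the settings_changes field come from the header.

// MySQLSettings mysql_settings;            /// any BaseSettings<...> works; EmptySettings is the default
// if (auto source = getExternalDataSourceConfiguration(engine_args, context,
//         /* is_database_engine = */ false, /* throw_on_no_collection = */ true, mysql_settings))
// {
//     const auto & configuration = source->configuration;        /// host, port, user, password, ...
//     mysql_settings.applyChanges(source->settings_changes);     /// assuming BaseSettings::applyChanges
//     /// configuration plus the adjusted settings are now ready to build the connection/pool.
// }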
@@ -80,7 +92,7 @@ struct ExternalDataSourcesByPriority }; ExternalDataSourcesByPriority -getExternalDataSourceConfigurationByPriority(const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, ContextPtr context); +getExternalDataSourceConfigurationByPriority(const Poco::Util::AbstractConfiguration & dict_config, const String & dict_config_prefix, ContextPtr context, HasConfigKeyFunc has_config_key); struct URLBasedDataSourceConfiguration @@ -88,7 +100,7 @@ struct URLBasedDataSourceConfiguration String url; String format; String compression_method = "auto"; - String structure; + String structure = "auto"; std::vector> headers; String http_method; diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 2105228abf6..f22f6f66ced 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -14,7 +14,6 @@ #include #include -#include #include #include @@ -29,6 +28,8 @@ #include #include + +#include #include #include @@ -51,10 +52,70 @@ namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int ACCESS_DENIED; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } +namespace +{ + /* Recursive directory listing with matched paths as a result. + * Have the same method in StorageFile. + */ + Strings LSWithRegexpMatching(const String & path_for_ls, const HDFSFSPtr & fs, const String & for_match) + { + const size_t first_glob = for_match.find_first_of("*?{"); -static Strings listFilesWithRegexpMatching(const String & path_for_ls, const HDFSFSPtr & fs, const String & for_match); + const size_t end_of_path_without_globs = for_match.substr(0, first_glob).rfind('/'); + const String suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/' + const String prefix_without_globs = path_for_ls + for_match.substr(1, end_of_path_without_globs); /// ends with '/' + const size_t next_slash = suffix_with_globs.find('/', 1); + re2::RE2 matcher(makeRegexpPatternFromGlobs(suffix_with_globs.substr(0, next_slash))); + + HDFSFileInfo ls; + ls.file_info = hdfsListDirectory(fs.get(), prefix_without_globs.data(), &ls.length); + Strings result; + for (int i = 0; i < ls.length; ++i) + { + const String full_path = String(ls.file_info[i].mName); + const size_t last_slash = full_path.rfind('/'); + const String file_name = full_path.substr(last_slash); + const bool looking_for_directory = next_slash != std::string::npos; + const bool is_directory = ls.file_info[i].mKind == 'D'; + /// Condition with type of current file_info means what kind of path is it in current iteration of ls + if (!is_directory && !looking_for_directory) + { + if (re2::RE2::FullMatch(file_name, matcher)) + { + result.push_back(String(ls.file_info[i].mName)); + } + } + else if (is_directory && looking_for_directory) + { + if (re2::RE2::FullMatch(file_name, matcher)) + { + Strings result_part = LSWithRegexpMatching(fs::path(full_path) / "", fs, suffix_with_globs.substr(next_slash)); + /// Recursion depth is limited by pattern. '*' works only for depth = 1, for depth = 2 pattern path is '*/*'. So we do not need additional check. 
+ std::move(result_part.begin(), result_part.end(), std::back_inserter(result)); + } + } + } + + return result; + } + + std::pair getPathFromUriAndUriWithoutPath(const String & uri) + { + const size_t begin_of_path = uri.find('/', uri.find("//") + 2); + return {uri.substr(begin_of_path), uri.substr(0, begin_of_path)}; + } + + std::vector getPathsList(const String & path_from_uri, const String & uri_without_path, ContextPtr context) + { + HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context->getGlobalContext()->getConfigRef()); + HDFSFSPtr fs = createHDFSFS(builder.get()); + + return LSWithRegexpMatching("/", fs, path_from_uri); + } +} StorageHDFS::StorageHDFS( const String & uri_, @@ -79,25 +140,52 @@ StorageHDFS::StorageHDFS( checkHDFSURL(uri); StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + + if (columns_.empty()) + { + auto columns = getTableStructureFromData(format_name, uri, compression_method, context_); + storage_metadata.setColumns(columns); + } + else + storage_metadata.setColumns(columns_); + storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); } +ColumnsDescription StorageHDFS::getTableStructureFromData( + const String & format, + const String & uri, + const String & compression_method, + ContextPtr ctx) +{ + auto read_buffer_creator = [&]() + { + const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(uri); + auto paths = getPathsList(path_from_uri, uri, ctx); + if (paths.empty()) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, because there are no files in HDFS with provided path. You must " + "specify table structure manually", + format); + + auto compression = chooseCompressionMethod(paths[0], compression_method); + return wrapReadBufferWithCompressionMethod( + std::make_unique(uri_without_path, paths[0], ctx->getGlobalContext()->getConfigRef()), compression); + }; + + return readSchemaFromFormat(format, std::nullopt, read_buffer_creator, ctx); +} + class HDFSSource::DisclosedGlobIterator::Impl { public: Impl(ContextPtr context_, const String & uri) { - const size_t begin_of_path = uri.find('/', uri.find("//") + 2); - const String path_from_uri = uri.substr(begin_of_path); - const String uri_without_path = uri.substr(0, begin_of_path); /// ends without '/' - - HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context_->getGlobalContext()->getConfigRef()); - HDFSFSPtr fs = createHDFSFS(builder.get()); - - uris = listFilesWithRegexpMatching("/", fs, path_from_uri); + const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(uri); + uris = getPathsList(path_from_uri, uri_without_path, context_); for (auto & elem : uris) elem = uri_without_path + elem; uris_iter = uris.begin(); @@ -339,51 +427,6 @@ private: }; -/* Recursive directory listing with matched paths as a result. - * Have the same method in StorageFile. 
- */ -Strings listFilesWithRegexpMatching(const String & path_for_ls, const HDFSFSPtr & fs, const String & for_match) -{ - const size_t first_glob = for_match.find_first_of("*?{"); - - const size_t end_of_path_without_globs = for_match.substr(0, first_glob).rfind('/'); - const String suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/' - const String prefix_without_globs = path_for_ls + for_match.substr(1, end_of_path_without_globs); /// ends with '/' - - const size_t next_slash = suffix_with_globs.find('/', 1); - re2::RE2 matcher(makeRegexpPatternFromGlobs(suffix_with_globs.substr(0, next_slash))); - - HDFSFileInfo ls; - ls.file_info = hdfsListDirectory(fs.get(), prefix_without_globs.data(), &ls.length); - Strings result; - for (int i = 0; i < ls.length; ++i) - { - const String full_path = String(ls.file_info[i].mName); - const size_t last_slash = full_path.rfind('/'); - const String file_name = full_path.substr(last_slash); - const bool looking_for_directory = next_slash != std::string::npos; - const bool is_directory = ls.file_info[i].mKind == 'D'; - /// Condition with type of current file_info means what kind of path is it in current iteration of ls - if (!is_directory && !looking_for_directory) - { - if (re2::RE2::FullMatch(file_name, matcher)) - { - result.push_back(String(ls.file_info[i].mName)); - } - } - else if (is_directory && looking_for_directory) - { - if (re2::RE2::FullMatch(file_name, matcher)) - { - Strings result_part = listFilesWithRegexpMatching(fs::path(full_path) / "", fs, suffix_with_globs.substr(next_slash)); - /// Recursion depth is limited by pattern. '*' works only for depth = 1, for depth = 2 pattern path is '*/*'. So we do not need additional check. - std::move(result_part.begin(), result_part.end(), std::back_inserter(result)); - } - } - } - return result; -} - bool StorageHDFS::isColumnOriented() const { return format_name != "Distributed" && FormatFactory::instance().checkIfFormatIsColumnOriented(format_name); @@ -400,6 +443,7 @@ Pipe StorageHDFS::read( { bool need_path_column = false; bool need_file_column = false; + for (const auto & column : column_names) { if (column == "_path") @@ -528,6 +572,7 @@ void registerStorageHDFS(StorageFactory & factory) }, { .supports_sort_order = true, // for partition by + .supports_schema_inference = true, .source_access_type = AccessType::HDFS, }); } diff --git a/src/Storages/HDFS/StorageHDFS.h b/src/Storages/HDFS/StorageHDFS.h index 3e2f7a43127..9e845d8fd74 100644 --- a/src/Storages/HDFS/StorageHDFS.h +++ b/src/Storages/HDFS/StorageHDFS.h @@ -31,7 +31,7 @@ public: size_t max_block_size, unsigned num_streams) override; - SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; + SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr /*context*/) override; void truncate( const ASTPtr & query, @@ -49,6 +49,12 @@ public: /// format to read only them. Note: this hack cannot be done with ordinary formats like TSV. 
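The LSWithRegexpMatching helper above builds a regular expression from the glob portion of the path (makeRegexpPatternFromGlobs) and keeps only the listed entries that fully match it. A simplified, self-contained illustration of that idea, using std::regex instead of re2 and covering only '*' and '?':

#include <iostream>
#include <regex>
#include <string>

// Simplified glob-to-regex conversion for this sketch: '*' and '?' stay within one path component.
static std::string globToRegex(const std::string & glob)
{
    std::string pattern;
    for (char c : glob)
    {
        if (c == '*')
            pattern += "[^/]*";
        else if (c == '?')
            pattern += "[^/]";
        else if (std::string("\\^$.|+(){}[]").find(c) != std::string::npos)
            pattern += std::string("\\") + c;   // escape regex metacharacters
        else
            pattern += c;
    }
    return pattern;
}

int main()
{
    const std::regex matcher(globToRegex("/data/part_*.csv"));
    const std::string paths[] = {"/data/part_1.csv", "/data/part_2.csv", "/data/other.txt", "/data/a/part_3.csv"};
    for (const auto & path : paths)
        std::cout << path << " -> " << std::regex_match(path, matcher) << "\n";   // 1, 1, 0, 0
}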
bool isColumnOriented() const; + static ColumnsDescription getTableStructureFromData( + const String & format, + const String & uri, + const String & compression_method, + ContextPtr ctx); + protected: friend class HDFSSource; StorageHDFS( diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index 021335fea1f..a923258b111 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -139,7 +139,6 @@ void IStorage::alter(const AlterCommands & params, ContextPtr context, AlterLock setInMemoryMetadata(new_metadata); } - void IStorage::checkAlterIsPossible(const AlterCommands & commands, ContextPtr /* context */) const { for (const auto & command : commands) diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index bcbc771815b..6342c3f6b47 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -135,6 +135,9 @@ public: /// Returns true if the storage supports queries with the PREWHERE section. virtual bool supportsPrewhere() const { return false; } + /// Returns true if the storage supports optimization of moving conditions to PREWHERE section. + virtual bool canMoveConditionsToPrewhere() const { return supportsPrewhere(); } + /// Returns true if the storage replicates SELECT, INSERT and ALTER commands among replicas. virtual bool supportsReplication() const { return false; } diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index 2855e21356d..2a964aecd4e 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -361,10 +361,10 @@ void Service::sendPartFromDiskRemoteMeta(const MergeTreeData::DataPartPtr & part MergeTreeData::DataPartPtr Service::findPart(const String & name) { - /// It is important to include PreCommitted and Outdated parts here because remote replicas cannot reliably + /// It is important to include PreActive and Outdated parts here because remote replicas cannot reliably /// determine the local state of the part, so queries for the parts in these states are completely normal. 
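The new IStorage::canMoveConditionsToPrewhere() hook above defaults to supportsPrewhere(), so a storage that accepts PREWHERE syntactically but should not have WHERE conditions moved there automatically only needs to override this one method. A hypothetical override, shown as comments since the storage name is made up and the snippet depends on IStorage:

// class StorageExternalTable : public IStorage            /// hypothetical storage
// {
// public:
//     bool supportsPrewhere() const override { return true; }             /// PREWHERE is accepted...
//     bool canMoveConditionsToPrewhere() const override { return false; } /// ...but the optimizer must not move WHERE into it
// };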
auto part = data.getPartIfExists( - name, {MergeTreeDataPartState::PreCommitted, MergeTreeDataPartState::Committed, MergeTreeDataPartState::Outdated}); + name, {MergeTreeDataPartState::PreActive, MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}); if (part) return part; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 83328594363..da412e4941e 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -28,6 +28,8 @@ namespace CurrentMetrics extern const Metric PartsTemporary; extern const Metric PartsPreCommitted; extern const Metric PartsCommitted; + extern const Metric PartsPreActive; + extern const Metric PartsActive; extern const Metric PartsOutdated; extern const Metric PartsDeleting; extern const Metric PartsDeleteOnDestroy; @@ -189,10 +191,12 @@ static void incrementStateMetric(IMergeTreeDataPart::State state) case IMergeTreeDataPart::State::Temporary: CurrentMetrics::add(CurrentMetrics::PartsTemporary); return; - case IMergeTreeDataPart::State::PreCommitted: + case IMergeTreeDataPart::State::PreActive: + CurrentMetrics::add(CurrentMetrics::PartsPreActive); CurrentMetrics::add(CurrentMetrics::PartsPreCommitted); return; - case IMergeTreeDataPart::State::Committed: + case IMergeTreeDataPart::State::Active: + CurrentMetrics::add(CurrentMetrics::PartsActive); CurrentMetrics::add(CurrentMetrics::PartsCommitted); return; case IMergeTreeDataPart::State::Outdated: @@ -214,10 +218,12 @@ static void decrementStateMetric(IMergeTreeDataPart::State state) case IMergeTreeDataPart::State::Temporary: CurrentMetrics::sub(CurrentMetrics::PartsTemporary); return; - case IMergeTreeDataPart::State::PreCommitted: + case IMergeTreeDataPart::State::PreActive: + CurrentMetrics::sub(CurrentMetrics::PartsPreActive); CurrentMetrics::sub(CurrentMetrics::PartsPreCommitted); return; - case IMergeTreeDataPart::State::Committed: + case IMergeTreeDataPart::State::Active: + CurrentMetrics::sub(CurrentMetrics::PartsActive); CurrentMetrics::sub(CurrentMetrics::PartsCommitted); return; case IMergeTreeDataPart::State::Outdated: @@ -286,7 +292,7 @@ IMergeTreeDataPart::IMergeTreeDataPart( , parent_part(parent_part_) { if (parent_part) - state = State::Committed; + state = State::Active; incrementStateMetric(state); incrementTypeMetric(part_type); @@ -311,7 +317,7 @@ IMergeTreeDataPart::IMergeTreeDataPart( , parent_part(parent_part_) { if (parent_part) - state = State::Committed; + state = State::Active; incrementStateMetric(state); incrementTypeMetric(part_type); @@ -1153,6 +1159,14 @@ void IMergeTreeDataPart::renameTo(const String & new_relative_path, bool remove_ storage.lockSharedData(*this); } +void IMergeTreeDataPart::cleanupOldName(const String & old_part_name) const +{ + if (name == old_part_name) + return; + + storage.unlockSharedData(*this, old_part_name); +} + std::optional IMergeTreeDataPart::keepSharedDataInDecoupledStorage() const { /// NOTE: It's needed for zero-copy replication @@ -1615,6 +1629,12 @@ String IMergeTreeDataPart::getUniqueId() const } +UInt32 IMergeTreeDataPart::getNumberOfRefereneces() const +{ + return volume->getDisk()->getRefCount(fs::path(getFullRelativePath()) / "checksums.txt"); +} + + String IMergeTreeDataPart::getZeroLevelPartBlockID() const { if (info.level != 0) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index ab08ca1c33a..09449dc7521 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h 
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -218,19 +218,19 @@ public: * Part state should be modified under data_parts mutex. * * Possible state transitions: - * Temporary -> Precommitted: we are trying to commit a fetched, inserted or merged part to active set - * Precommitted -> Outdated: we could not add a part to active set and are doing a rollback (for example it is duplicated part) - * Precommitted -> Committed: we successfully committed a part to active dataset - * Precommitted -> Outdated: a part was replaced by a covering part or DROP PARTITION + * Temporary -> PreActive: we are trying to add a fetched, inserted or merged part to active set + * PreActive -> Outdated: we could not add a part to active set and are doing a rollback (for example it is duplicated part) + * PreActive -> Active: we successfully added a part to active dataset + * PreActive -> Outdated: a part was replaced by a covering part or DROP PARTITION * Outdated -> Deleting: a cleaner selected this part for deletion * Deleting -> Outdated: if an ZooKeeper error occurred during the deletion, we will retry deletion - * Committed -> DeleteOnDestroy: if part was moved to another disk + * Active -> DeleteOnDestroy: if part was moved to another disk */ enum class State { Temporary, /// the part is generating now, it is not in data_parts list - PreCommitted, /// the part is in data_parts, but not used for SELECTs - Committed, /// active data part, used by current and upcoming SELECTs + PreActive, /// the part is in data_parts, but not used for SELECTs + Active, /// active data part, used by current and upcoming SELECTs Outdated, /// not active data part, but could be used by only current SELECTs, could be deleted after SELECTs finishes Deleting, /// not active data part with identity refcounter, it is deleting right now by a cleaner DeleteOnDestroy, /// part was moved to another disk and should be deleted in own destructor @@ -338,6 +338,9 @@ public: /// Changes only relative_dir_name, you need to update other metadata (name, is_temp) explicitly virtual void renameTo(const String & new_relative_path, bool remove_new_dir_if_exists) const; + /// Cleanup shared locks made with old name after part renaming + virtual void cleanupOldName(const String & old_part_name) const; + /// Makes clone of a part in detached/ directory via hard links virtual void makeCloneInDetached(const String & prefix, const StorageMetadataPtr & metadata_snapshot) const; @@ -404,10 +407,14 @@ public: /// part creation (using alter query with materialize_ttl setting). bool checkAllTTLCalculated(const StorageMetadataPtr & metadata_snapshot) const; - /// Return some uniq string for file - /// Required for distinguish different copies of the same part on S3 + /// Return some uniq string for file. + /// Required for distinguish different copies of the same part on remote FS. String getUniqueId() const; + /// Return hardlink count for part. + /// Required for keep data on remote FS when part has shadow copies. 
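The allowed transitions documented in the comment block above form a small state machine. A self-contained sketch that encodes them; the enum names and the transition table are taken from that comment, the rest is illustration only:

#include <iostream>
#include <set>
#include <utility>

enum class PartState { Temporary, PreActive, Active, Outdated, Deleting, DeleteOnDestroy };

// Transition table copied from the comment block above.
static const std::set<std::pair<PartState, PartState>> allowed_transitions = {
    {PartState::Temporary, PartState::PreActive},       // fetched/inserted/merged part is being added to the active set
    {PartState::PreActive, PartState::Active},          // successfully added to the active data set
    {PartState::PreActive, PartState::Outdated},        // rollback, or replaced by a covering part / DROP PARTITION
    {PartState::Outdated,  PartState::Deleting},        // selected by the cleaner
    {PartState::Deleting,  PartState::Outdated},        // retry after a ZooKeeper error
    {PartState::Active,    PartState::DeleteOnDestroy}, // part was moved to another disk
};

static bool canTransition(PartState from, PartState to)
{
    return allowed_transitions.count({from, to}) > 0;
}

int main()
{
    std::cout << canTransition(PartState::PreActive, PartState::Active) << "\n";   // 1
    std::cout << canTransition(PartState::Active, PartState::Temporary) << "\n";   // 0
}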
+ UInt32 getNumberOfRefereneces() const; + protected: /// Total size of all columns, calculated once in calcuateColumnSizesOnDisk diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index fbc818a7de9..5b69a4e68b6 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -22,6 +22,7 @@ namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER; extern const int LOGICAL_ERROR; + extern const int QUERY_WAS_CANCELLED; } @@ -131,8 +132,9 @@ bool MergeTreeBaseSelectProcessor::getTaskFromBuffer() if (Status::Accepted == res) return true; + /// To avoid any possibility of ignoring cancellation, exception will be thrown. if (Status::Cancelled == res) - break; + throw Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Query had been cancelled"); } return false; } @@ -165,8 +167,18 @@ Chunk MergeTreeBaseSelectProcessor::generate() { while (!isCancelled()) { - if ((!task || task->isFinished()) && !getNewTask()) - return {}; + try + { + if ((!task || task->isFinished()) && !getNewTask()) + return {}; + } + catch (const Exception & e) + { + /// See MergeTreeBaseSelectProcessor::getTaskFromBuffer() + if (e.code() == ErrorCodes::QUERY_WAS_CANCELLED) + return {}; + throw; + } auto res = readFromPart(); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 1b7be8ca98d..b38a0112116 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -62,6 +63,7 @@ #include #include +#include #include #include @@ -224,7 +226,6 @@ MergeTreeData::MergeTreeData( { try { - checkPartitionKeyAndInitMinMax(metadata_.partition_key); setProperties(metadata_, metadata_, attach); if (minmax_idx_date_column_pos == -1) @@ -358,10 +359,11 @@ static void checkKeyExpression(const ExpressionActions & expr, const Block & sam { const ColumnPtr & column = element.column; if (column && (isColumnConst(*column) || column->isDummy())) - throw Exception{key_name + " key cannot contain constants", ErrorCodes::ILLEGAL_COLUMN}; + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "{} key cannot contain constants", key_name); - if (!allow_nullable_key && element.type->isNullable()) - throw Exception{key_name + " key cannot contain nullable columns", ErrorCodes::ILLEGAL_COLUMN}; + if (!allow_nullable_key && hasNullable(element.type)) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, "{} key contains nullable columns, but `setting allow_nullable_key` is disabled", key_name); } } @@ -1038,8 +1040,8 @@ void MergeTreeData::loadDataPartsFromDisk( has_adaptive_parts.store(true, std::memory_order_relaxed); part->modification_time = part_disk_ptr->getLastModified(fs::path(relative_data_path) / part_name).epochTime(); - /// Assume that all parts are Committed, covered parts will be detected and marked as Outdated later - part->setState(DataPartState::Committed); + /// Assume that all parts are Active, covered parts will be detected and marked as Outdated later + part->setState(DataPartState::Active); std::lock_guard loading_lock(mutex); auto [it, inserted] = data_parts_indexes.insert(part); @@ -1131,12 +1133,12 @@ void MergeTreeData::loadDataPartsFromWAL( { for (auto & part : parts_from_wal) { - if (getActiveContainingPart(part->info, DataPartState::Committed, part_lock)) + if (getActiveContainingPart(part->info, DataPartState::Active, 
part_lock)) continue; part->modification_time = time(nullptr); - /// Assume that all parts are Committed, covered parts will be detected and marked as Outdated later - part->setState(DataPartState::Committed); + /// Assume that all parts are Active, covered parts will be detected and marked as Outdated later + part->setState(DataPartState::Active); auto [it, inserted] = data_parts_indexes.insert(part); if (!inserted) @@ -1292,9 +1294,9 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) removePartContributionToDataVolume(*it); }; - (*prev_jt)->assertState({DataPartState::Committed}); + (*prev_jt)->assertState({DataPartState::Active}); - while (curr_jt != data_parts_by_state_and_info.end() && (*curr_jt)->getState() == DataPartState::Committed) + while (curr_jt != data_parts_by_state_and_info.end() && (*curr_jt)->getState() == DataPartState::Active) { /// Don't consider data parts belonging to different partitions. if ((*curr_jt)->info.partition_id != (*prev_jt)->info.partition_id) @@ -2368,8 +2370,8 @@ MergeTreeData::DataPartsVector MergeTreeData::getActivePartsToReplace( DataPartsLock & /* data_parts_lock */) const { /// Parts contained in the part are consecutive in data_parts, intersecting the insertion place for the part itself. - auto it_middle = data_parts_by_state_and_info.lower_bound(DataPartStateAndInfo{DataPartState::Committed, new_part_info}); - auto committed_parts_range = getDataPartsStateRange(DataPartState::Committed); + auto it_middle = data_parts_by_state_and_info.lower_bound(DataPartStateAndInfo{DataPartState::Active, new_part_info}); + auto committed_parts_range = getDataPartsStateRange(DataPartState::Active); /// Go to the left. DataPartIteratorByStateAndInfo begin = it_middle; @@ -2457,6 +2459,8 @@ bool MergeTreeData::renameTempPartAndReplace( MergeTreePartInfo part_info = part->info; String part_name; + String old_part_name = part->name; + if (DataPartPtr existing_part_in_partition = getAnyPartInPartition(part->info.partition_id, lock)) { if (part->partition.value != existing_part_in_partition->partition.value) @@ -2520,10 +2524,11 @@ bool MergeTreeData::renameTempPartAndReplace( /// So, we maintain invariant: if a non-temporary part in filesystem then it is in data_parts /// /// If out_transaction is null, we commit the part to the active set immediately, else add it to the transaction. 
+ part->name = part_name; part->info = part_info; part->is_temp = false; - part->setState(DataPartState::PreCommitted); + part->setState(DataPartState::PreActive); part->renameTo(part_name, true); auto part_it = data_parts_indexes.insert(part).first; @@ -2550,7 +2555,7 @@ bool MergeTreeData::renameTempPartAndReplace( decreaseDataVolume(reduce_bytes, reduce_rows, reduce_parts); - modifyPartState(part_it, DataPartState::Committed); + modifyPartState(part_it, DataPartState::Active); addPartContributionToColumnAndSecondaryIndexSizes(part); addPartContributionToDataVolume(part); } @@ -2568,6 +2573,9 @@ bool MergeTreeData::renameTempPartAndReplace( out_covered_parts->emplace_back(std::move(covered_part)); } + /// Cleanup shared locks made with old name + part->cleanupOldName(old_part_name); + return true; } @@ -2592,13 +2600,13 @@ void MergeTreeData::removePartsFromWorkingSet(const MergeTreeData::DataPartsVect for (const DataPartPtr & part : remove) { - if (part->getState() == IMergeTreeDataPart::State::Committed) + if (part->getState() == IMergeTreeDataPart::State::Active) { removePartContributionToColumnAndSecondaryIndexSizes(part); removePartContributionToDataVolume(part); } - if (part->getState() == IMergeTreeDataPart::State::Committed || clear_without_timeout) + if (part->getState() == IMergeTreeDataPart::State::Active || clear_without_timeout) part->remove_time.store(remove_time, std::memory_order_relaxed); if (part->getState() != IMergeTreeDataPart::State::Outdated) @@ -2634,7 +2642,7 @@ void MergeTreeData::removePartsFromWorkingSet(const DataPartsVector & remove, bo if (!data_parts_by_info.count(part->info)) throw Exception("Part " + part->getNameWithState() + " not found in data_parts", ErrorCodes::LOGICAL_ERROR); - part->assertState({DataPartState::PreCommitted, DataPartState::Committed, DataPartState::Outdated}); + part->assertState({DataPartState::PreActive, DataPartState::Active, DataPartState::Outdated}); } removePartsFromWorkingSet(remove, clear_without_timeout, lock); @@ -2732,7 +2740,7 @@ restore_covered) /// What if part_to_detach is a reference to *it_part? Make a new owner just in case. 
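renameTempPartAndReplace above remembers the part name before renaming and calls cleanupOldName() afterwards, so that a shared lock taken under the old name can be released (cleanupOldName is a no-op when the name did not change). A small standalone sketch of that capture-then-release pattern, with a std::set standing in for the shared locks:

#include <iostream>
#include <set>
#include <string>

// Locks are keyed by part name, so a rename must release the lock taken under the old name.
std::set<std::string> shared_locks;

void cleanupOldName(const std::string & new_name, const std::string & old_name)
{
    if (new_name == old_name)
        return;                      // nothing was renamed, keep the lock
    shared_locks.erase(old_name);    // stands in for unlockSharedData(part, old_part_name)
}

int main()
{
    std::string part_name = "tmp_insert_all_1_1_0";
    shared_locks.insert(part_name);              // assumed: some lock exists under the old name

    std::string old_part_name = part_name;       // remembered before the rename, as in the hunk above
    part_name = "all_1_1_0";                     // the final name assigned during commit

    cleanupOldName(part_name, old_part_name);
    std::cout << "locks left: " << shared_locks.size() << "\n";   // 0
}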
DataPartPtr part = *it_part; - if (part->getState() == DataPartState::Committed) + if (part->getState() == DataPartState::Active) { removePartContributionToDataVolume(part); removePartContributionToColumnAndSecondaryIndexSizes(part); @@ -2759,7 +2767,7 @@ restore_covered) auto is_appropriate_state = [] (DataPartState state) { - return state == DataPartState::Committed || state == DataPartState::Outdated; + return state == DataPartState::Active || state == DataPartState::Outdated; }; auto update_error = [&] (DataPartIteratorByInfo it) @@ -2781,11 +2789,11 @@ restore_covered) if ((*it)->info.min_block != part->info.min_block) update_error(it); - if ((*it)->getState() != DataPartState::Committed) + if ((*it)->getState() != DataPartState::Active) { addPartContributionToColumnAndSecondaryIndexSizes(*it); addPartContributionToDataVolume(*it); - modifyPartState(it, DataPartState::Committed); // iterator is not invalidated here + modifyPartState(it, DataPartState::Active); // iterator is not invalidated here } pos = (*it)->info.max_block + 1; @@ -2812,11 +2820,11 @@ restore_covered) if ((*it)->info.min_block > pos) update_error(it); - if ((*it)->getState() != DataPartState::Committed) + if ((*it)->getState() != DataPartState::Active) { addPartContributionToColumnAndSecondaryIndexSizes(*it); addPartContributionToDataVolume(*it); - modifyPartState(it, DataPartState::Committed); + modifyPartState(it, DataPartState::Active); } pos = (*it)->info.max_block + 1; @@ -2930,7 +2938,7 @@ size_t MergeTreeData::getMaxPartsCountForPartitionWithState(DataPartState state) size_t MergeTreeData::getMaxPartsCountForPartition() const { - return getMaxPartsCountForPartitionWithState(DataPartState::Committed); + return getMaxPartsCountForPartitionWithState(DataPartState::Active); } @@ -2945,7 +2953,7 @@ std::optional MergeTreeData::getMinPartDataVersion() const auto lock = lockParts(); std::optional result; - for (const auto & part : getDataPartsStateRange(DataPartState::Committed)) + for (const auto & part : getDataPartsStateRange(DataPartState::Active)) { if (!result || *result > part->info.getDataVersion()) result = part->info.getDataVersion(); @@ -3051,7 +3059,7 @@ MergeTreeData::DataPartPtr MergeTreeData::getActiveContainingPart( void MergeTreeData::swapActivePart(MergeTreeData::DataPartPtr part_copy) { auto lock = lockParts(); - for (auto original_active_part : getDataPartsStateRange(DataPartState::Committed)) // NOLINT (copy is intended) + for (auto original_active_part : getDataPartsStateRange(DataPartState::Active)) // NOLINT (copy is intended) { if (part_copy->name == original_active_part->name) { @@ -3076,7 +3084,7 @@ void MergeTreeData::swapActivePart(MergeTreeData::DataPartPtr part_copy) data_parts_indexes.erase(active_part_it); auto part_it = data_parts_indexes.insert(part_copy).first; - modifyPartState(part_it, DataPartState::Committed); + modifyPartState(part_it, DataPartState::Active); removePartContributionToDataVolume(original_active_part); addPartContributionToDataVolume(part_copy); @@ -3101,7 +3109,7 @@ void MergeTreeData::swapActivePart(MergeTreeData::DataPartPtr part_copy) MergeTreeData::DataPartPtr MergeTreeData::getActiveContainingPart(const MergeTreePartInfo & part_info) const { auto lock = lockParts(); - return getActiveContainingPart(part_info, DataPartState::Committed, lock); + return getActiveContainingPart(part_info, DataPartState::Active, lock); } MergeTreeData::DataPartPtr MergeTreeData::getActiveContainingPart(const String & part_name) const @@ -3171,7 +3179,7 @@ void 
MergeTreeData::calculateColumnAndSecondaryIndexSizesImpl() column_sizes.clear(); /// Take into account only committed parts - auto committed_parts_range = getDataPartsStateRange(DataPartState::Committed); + auto committed_parts_range = getDataPartsStateRange(DataPartState::Active); for (const auto & part : committed_parts_range) addPartContributionToColumnAndSecondaryIndexSizes(part); } @@ -3266,7 +3274,7 @@ void MergeTreeData::checkAlterPartitionIsPossible( void MergeTreeData::checkPartitionCanBeDropped(const ASTPtr & partition) { const String partition_id = getPartitionIDFromQuery(partition, getContext()); - auto parts_to_remove = getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, partition_id); + auto parts_to_remove = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); UInt64 partition_size = 0; @@ -3279,7 +3287,7 @@ void MergeTreeData::checkPartitionCanBeDropped(const ASTPtr & partition) void MergeTreeData::checkPartCanBeDropped(const String & part_name) { - auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Committed}); + auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active}); if (!part) throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "No part {} in committed state", part_name); @@ -3305,7 +3313,7 @@ void MergeTreeData::movePartitionToDisk(const ASTPtr & partition, const String & throw Exception("Part " + partition_id + " is not exists or not active", ErrorCodes::NO_SUCH_DATA_PART); } else - parts = getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, partition_id); + parts = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); auto disk = getStoragePolicy()->getDiskByName(name); if (!disk) @@ -3350,7 +3358,7 @@ void MergeTreeData::movePartitionToVolume(const ASTPtr & partition, const String throw Exception("Part " + partition_id + " is not exists or not active", ErrorCodes::NO_SUCH_DATA_PART); } else - parts = getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, partition_id); + parts = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); auto volume = getStoragePolicy()->getVolumeByName(name); if (!volume) @@ -3532,7 +3540,7 @@ BackupEntries MergeTreeData::backup(const ASTs & partitions, ContextPtr local_co if (partitions.empty()) data_parts = getDataPartsVector(); else - data_parts = getDataPartsVectorInPartitions(MergeTreeDataPartState::Committed, getPartitionIDsFromQuery(partitions, local_context)); + data_parts = getDataPartsVectorInPartitions(MergeTreeDataPartState::Active, getPartitionIDsFromQuery(partitions, local_context)); return backupDataParts(data_parts); } @@ -3906,8 +3914,8 @@ void MergeTreeData::dropDetached(const ASTPtr & partition, bool part, ContextPtr for (auto & [old_name, new_name, disk] : renamed_parts.old_and_new_names) { - disk->removeRecursive(fs::path(relative_data_path) / "detached" / new_name / ""); - LOG_DEBUG(log, "Dropped detached part {}", old_name); + bool keep_shared = removeDetachedPart(disk, fs::path(relative_data_path) / "detached" / new_name / "", old_name, false); + LOG_DEBUG(log, "Dropped detached part {}, keep shared data: {}", old_name, keep_shared); old_name.clear(); } } @@ -4170,20 +4178,20 @@ MergeTreeData::DataParts MergeTreeData::getDataParts(const DataPartStates & affo MergeTreeData::DataParts MergeTreeData::getDataParts() const { - return getDataParts({DataPartState::Committed}); + return getDataParts({DataPartState::Active}); } MergeTreeData::DataPartsVector MergeTreeData::getDataPartsVector() 
const { - return getDataPartsVector({DataPartState::Committed}); + return getDataPartsVector({DataPartState::Active}); } MergeTreeData::DataPartPtr MergeTreeData::getAnyPartInPartition( const String & partition_id, DataPartsLock & /*data_parts_lock*/) const { - auto it = data_parts_by_state_and_info.lower_bound(DataPartStateAndPartitionID{DataPartState::Committed, partition_id}); + auto it = data_parts_by_state_and_info.lower_bound(DataPartStateAndPartitionID{DataPartState::Active, partition_id}); - if (it != data_parts_by_state_and_info.end() && (*it)->getState() == DataPartState::Committed && (*it)->info.partition_id == partition_id) + if (it != data_parts_by_state_and_info.end() && (*it)->getState() == DataPartState::Active && (*it)->info.partition_id == partition_id) return *it; return nullptr; @@ -4276,7 +4284,7 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(MergeTreeData: add_rows += part->rows_count; ++add_parts; - data.modifyPartState(part, DataPartState::Committed); + data.modifyPartState(part, DataPartState::Active); data.addPartContributionToColumnAndSecondaryIndexSizes(part); } } @@ -5195,7 +5203,9 @@ PartitionCommandsResultInfo MergeTreeData::freezePartitionsByMatcher( LOG_DEBUG(log, "Freezing part {} snapshot will be placed at {}", part->name, backup_path); - part->volume->getDisk()->createDirectories(backup_path); + auto disk = part->volume->getDisk(); + + disk->createDirectories(backup_path); String src_part_path = part->getFullRelativePath(); String backup_part_path = fs::path(backup_path) / relative_data_path / part->relative_path; @@ -5206,16 +5216,20 @@ PartitionCommandsResultInfo MergeTreeData::freezePartitionsByMatcher( src_part_path = fs::path(relative_data_path) / flushed_part_path / ""; } - localBackup(part->volume->getDisk(), src_part_path, backup_part_path); + localBackup(disk, src_part_path, backup_part_path); - part->volume->getDisk()->removeFileIfExists(fs::path(backup_part_path) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME); + // Store metadata for replicated table. + // Do nothing for non-replocated. 
+ createAndStoreFreezeMetadata(disk, part, backup_part_path); + + disk->removeFileIfExists(fs::path(backup_part_path) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME); part->is_frozen.store(true, std::memory_order_relaxed); result.push_back(PartitionCommandResultInfo{ .partition_id = part->info.partition_id, .part_name = part->name, - .backup_path = fs::path(part->volume->getDisk()->getPath()) / backup_path, - .part_backup_path = fs::path(part->volume->getDisk()->getPath()) / backup_part_path, + .backup_path = fs::path(disk->getPath()) / backup_path, + .part_backup_path = fs::path(disk->getPath()) / backup_part_path, .backup_name = backup_name, }); ++parts_processed; @@ -5225,6 +5239,11 @@ PartitionCommandsResultInfo MergeTreeData::freezePartitionsByMatcher( return result; } +void MergeTreeData::createAndStoreFreezeMetadata(DiskPtr, DataPartPtr, String) const +{ + +} + PartitionCommandsResultInfo MergeTreeData::unfreezePartition( const ASTPtr & partition, const String & backup_name, @@ -5242,6 +5261,13 @@ PartitionCommandsResultInfo MergeTreeData::unfreezeAll( return unfreezePartitionsByMatcher([] (const String &) { return true; }, backup_name, local_context); } +bool MergeTreeData::removeDetachedPart(DiskPtr disk, const String & path, const String &, bool) +{ + disk->removeRecursive(path); + + return false; +} + PartitionCommandsResultInfo MergeTreeData::unfreezePartitionsByMatcher(MatcherFn matcher, const String & backup_name, ContextPtr) { auto backup_path = fs::path("shadow") / escapeForFileName(backup_name) / relative_data_path; @@ -5270,7 +5296,7 @@ PartitionCommandsResultInfo MergeTreeData::unfreezePartitionsByMatcher(MatcherFn const auto & path = it->path(); - disk->removeRecursive(path); + bool keep_shared = removeDetachedPart(disk, path, partition_directory, true); result.push_back(PartitionCommandResultInfo{ .partition_id = partition_id, @@ -5280,7 +5306,7 @@ PartitionCommandsResultInfo MergeTreeData::unfreezePartitionsByMatcher(MatcherFn .backup_name = backup_name, }); - LOG_DEBUG(log, "Unfreezed part by path {}", disk->getPath() + path); + LOG_DEBUG(log, "Unfreezed part by path {}, keep shared data: {}", disk->getPath() + path, keep_shared); } } @@ -5723,7 +5749,7 @@ ReservationPtr MergeTreeData::balancedReservation( for (const auto & part : covered_parts) submerging_big_parts_from_partition.insert(part->name); - for (const auto & part : getDataPartsStateRange(MergeTreeData::DataPartState::Committed)) + for (const auto & part : getDataPartsStateRange(MergeTreeData::DataPartState::Active)) { if (part->isStoredOnDisk() && part->getBytesOnDisk() >= min_bytes_to_rebalance_partition_over_jbod && part_info.partition_id == part->info.partition_id) diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 380c2f4f4c5..f1d0abffc7a 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -233,7 +233,7 @@ public: const VolumePtr & volume, const String & relative_path, const IMergeTreeDataPart * parent_part = nullptr) const; /// Auxiliary object to add a set of parts into the working set in two steps: - /// * First, as PreCommitted parts (the parts are ready, but not yet in the active set). + /// * First, as PreActive parts (the parts are ready, but not yet in the active set). /// * Next, if commit() is called, the parts are added to the active set and the parts that are /// covered by them are marked Outdated. 
/// If neither commit() nor rollback() was called, the destructor rollbacks the operation. @@ -452,7 +452,7 @@ public: MutableDataPartsVector tryLoadPartsToAttach(const ASTPtr & partition, bool attach_part, ContextPtr context, PartsTemporaryRename & renamed_parts); - /// Returns Committed parts + /// Returns Active parts DataParts getDataParts() const; DataPartsVector getDataPartsVector() const; @@ -494,7 +494,7 @@ public: /// Renames temporary part to a permanent part and adds it to the parts set. /// It is assumed that the part does not intersect with existing parts. /// If increment != nullptr, part index is determining using increment. Otherwise part index remains unchanged. - /// If out_transaction != nullptr, adds the part in the PreCommitted state (the part will be added to the + /// If out_transaction != nullptr, adds the part in the PreActive state (the part will be added to the /// active set later with out_transaction->commit()). /// Else, commits the part immediately. /// Returns true if part was added. Returns false if part is covered by bigger part. @@ -518,7 +518,7 @@ public: void removePartsFromWorkingSetImmediatelyAndSetTemporaryState(const DataPartsVector & remove); /// Removes parts from the working set parts. - /// Parts in add must already be in data_parts with PreCommitted, Committed, or Outdated states. + /// Parts in add must already be in data_parts with PreActive, Active, or Outdated states. /// If clear_without_timeout is true, the parts will be deleted at once, or during the next call to /// clearOldParts (ignoring old_parts_lifetime). void removePartsFromWorkingSet(const DataPartsVector & remove, bool clear_without_timeout, DataPartsLock * acquired_lock = nullptr); @@ -873,10 +873,21 @@ public: /// Overridden in StorageReplicatedMergeTree virtual bool unlockSharedData(const IMergeTreeDataPart &) const { return true; } + /// Remove lock with old name for shared data part after rename + virtual bool unlockSharedData(const IMergeTreeDataPart &, const String &) const { return true; } + /// Fetch part only if some replica has it on shared storage like S3 /// Overridden in StorageReplicatedMergeTree virtual bool tryToFetchIfShared(const IMergeTreeDataPart &, const DiskPtr &, const String &) { return false; } + /// Check shared data usage on other replicas for detached/freezed part + /// Remove local files and remote files if needed + virtual bool removeDetachedPart(DiskPtr disk, const String & path, const String & part_name, bool is_freezed); + + /// Store metadata for replicated tables + /// Do nothing for non-replicated tables + virtual void createAndStoreFreezeMetadata(DiskPtr disk, DataPartPtr part, String backup_part_path) const; + /// Parts that currently submerging (merging to bigger parts) or emerging /// (to be appeared after merging finished). These two variables have to be used /// with `currently_submerging_emerging_mutex`. @@ -1049,7 +1060,7 @@ protected: /// If there is no part in the partition with ID `partition_id`, returns empty ptr. Should be called under the lock. DataPartPtr getAnyPartInPartition(const String & partition_id, DataPartsLock & data_parts_lock) const; - /// Return parts in the Committed set that are covered by the new_part_info or the part that covers it. + /// Return parts in the Active set that are covered by the new_part_info or the part that covers it. /// Will check that the new part doesn't already exist and that it doesn't intersect existing part. 
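removeDetachedPart() and createAndStoreFreezeMetadata() above are virtual hooks: MergeTreeData supplies plain local removal and a no-op, and the replicated storage is expected to override them to respect data shared between replicas. A minimal sketch of that hook shape under assumed names; the zero-copy check inside the override is hypothetical, only the signature shape and the "keep shared data" result reported by the callers come from the patch:

#include <iostream>
#include <string>

struct DiskStub { void removeRecursive(const std::string & path) { std::cout << "rm -r " << path << "\n"; } };

// Shape of the base-class hook: remove the detached part, report whether shared data was kept.
struct MergeTreeLike
{
    virtual bool removeDetachedPart(DiskStub & disk, const std::string & path, const std::string & /*part_name*/, bool /*is_freezed*/)
    {
        disk.removeRecursive(path);
        return false;                              // nothing is shared, nothing kept
    }
    virtual ~MergeTreeLike() = default;
};

struct ReplicatedLike : MergeTreeLike
{
    bool removeDetachedPart(DiskStub & disk, const std::string & path, const std::string & /*part_name*/, bool /*is_freezed*/) override
    {
        // Hypothetical: consult zero-copy locks (e.g. in ZooKeeper) before dropping remote blobs.
        bool still_used_by_other_replica = true;   // assumed result for illustration
        disk.removeRecursive(path);                // local files go away either way
        return still_used_by_other_replica;        // caller logs "keep shared data: 1"
    }
};

int main()
{
    DiskStub disk;
    ReplicatedLike storage;
    bool keep_shared = storage.removeDetachedPart(disk, "store/detached/all_1_1_0", "all_1_1_0", false);
    std::cout << "keep shared data: " << keep_shared << "\n";
}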
DataPartsVector getActivePartsToReplace( const MergeTreePartInfo & new_part_info, diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index b991166b3b6..6861599a1ac 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -125,8 +125,10 @@ struct Settings; M(UInt64, concurrent_part_removal_threshold, 100, "Activate concurrent part removal (see 'max_part_removal_threads') only if the number of inactive data parts is at least this.", 0) \ M(String, storage_policy, "default", "Name of storage disk policy", 0) \ M(Bool, allow_nullable_key, false, "Allow Nullable types as primary keys.", 0) \ - M(Bool, allow_remote_fs_zero_copy_replication, true, "Allow Zero-copy replication over remote fs", 0) \ - M(Bool, remove_empty_parts, true, "Remove empty parts after they were pruned by TTL, mutation, or collapsing merge algorithm", 0) \ + M(Bool, allow_remote_fs_zero_copy_replication, true, "Allow Zero-copy replication over remote fs.", 0) \ + M(String, remote_fs_zero_copy_zookeeper_path, "/clickhouse/zero_copy", "ZooKeeper path for Zero-copy table-independet info.", 0) \ + M(Bool, remote_fs_zero_copy_path_compatible_mode, false, "Run zero-copy in compatible mode during conversion process.", 0) \ + M(Bool, remove_empty_parts, true, "Remove empty parts after they were pruned by TTL, mutation, or collapsing merge algorithm.", 0) \ M(Bool, assign_part_uuids, false, "Generate UUIDs for parts. Before enabling check that all replicas support new format.", 0) \ M(Int64, max_partitions_to_read, -1, "Limit the max number of partitions that can be accessed in one query. <= 0 means unlimited. This setting is the default that can be overridden by the query-level setting with the same name.", 0) \ M(UInt64, max_concurrent_queries, 0, "Max number of concurrently executed queries related to the MergeTree table (0 - disabled). Queries will still be limited by other max_concurrent_queries settings.", 0) \ diff --git a/src/Storages/MergeTree/ReplicatedMergeMutateTaskBase.cpp b/src/Storages/MergeTree/ReplicatedMergeMutateTaskBase.cpp index 5fe7de70a20..db5ca15ce8a 100644 --- a/src/Storages/MergeTree/ReplicatedMergeMutateTaskBase.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeMutateTaskBase.cpp @@ -217,9 +217,9 @@ bool ReplicatedMergeMutateTaskBase::executeImpl() ReplicatedMergeMutateTaskBase::CheckExistingPartResult ReplicatedMergeMutateTaskBase::checkExistingPart() { /// If we already have this part or a part covering it, we do not need to do anything. - /// The part may be still in the PreCommitted -> Committed transition so we first search - /// among PreCommitted parts to definitely find the desired part if it exists. - MergeTreeData::DataPartPtr existing_part = storage.getPartIfExists(entry.new_part_name, {MergeTreeDataPartState::PreCommitted}); + /// The part may be still in the PreActive -> Active transition so we first search + /// among PreActive parts to definitely find the desired part if it exists. 
+ MergeTreeData::DataPartPtr existing_part = storage.getPartIfExists(entry.new_part_name, {MergeTreeDataPartState::PreActive}); if (!existing_part) existing_part = storage.getActiveContainingPart(entry.new_part_name); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp index 8d3cb146990..8fcaee66007 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp @@ -272,10 +272,10 @@ std::pair ReplicatedMergeTreePartCheckThread::findLo /// but checker thread will remove part from zookeeper and queue fetch. bool exists_in_zookeeper = zookeeper->exists(part_path); - /// If the part is still in the PreCommitted -> Committed transition, it is not lost + /// If the part is still in the PreActive -> Active transition, it is not lost /// and there is no need to go searching for it on other replicas. To definitely find the needed part - /// if it exists (or a part containing it) we first search among the PreCommitted parts. - auto part = storage.getPartIfExists(part_name, {MergeTreeDataPartState::PreCommitted}); + /// if it exists (or a part containing it) we first search among the PreActive parts. + auto part = storage.getPartIfExists(part_name, {MergeTreeDataPartState::PreActive}); if (!part) part = storage.getActiveContainingPart(part_name); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index b3da3d47684..1432728d00a 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -1177,7 +1177,7 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry( return false; } - auto part = data.getPartIfExists(name, {MergeTreeDataPartState::PreCommitted, MergeTreeDataPartState::Committed, MergeTreeDataPartState::Outdated}); + auto part = data.getPartIfExists(name, {MergeTreeDataPartState::PreActive, MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}); if (part) { if (auto part_in_memory = asInMemoryPart(part)) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp index 0cc6955ff72..7a5b82979bd 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp @@ -253,7 +253,7 @@ void ReplicatedMergeTreeRestartingThread::removeFailedQuorumParts() for (const auto & part_name : failed_parts) { auto part = storage.getPartIfExists( - part_name, {MergeTreeDataPartState::PreCommitted, MergeTreeDataPartState::Committed, MergeTreeDataPartState::Outdated}); + part_name, {MergeTreeDataPartState::PreActive, MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}); if (part) { diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 1ce748640dc..d2bf6ba308b 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -228,6 +228,8 @@ void ReplicatedMergeTreeSink::commitPart( bool is_already_existing_part = false; + String old_part_name = part->name; + while (true) { /// Obtain incremental block number and lock it. The lock holds our intention to add the block to the filesystem. 
@@ -370,7 +372,7 @@ void ReplicatedMergeTreeSink::commitPart( block_id, existing_part_name); /// If it does not exist, we will write a new part with existing name. - /// Note that it may also appear on filesystem right now in PreCommitted state due to concurrent inserts of the same data. + /// Note that it may also appear on filesystem right now in PreActive state due to concurrent inserts of the same data. /// It will be checked when we will try to rename directory. part->name = existing_part_name; @@ -508,6 +510,9 @@ void ReplicatedMergeTreeSink::commitPart( waitForQuorum(zookeeper, part->name, quorum_info.status_path, quorum_info.is_active_node_value); } + + /// Cleanup shared locks made with old name + part->cleanupOldName(old_part_name); } void ReplicatedMergeTreeSink::onStart() diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index cb52c8b86c0..1cf701492a9 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -22,19 +23,16 @@ #include #include -#include namespace DB { namespace ErrorCodes { - extern const int NOT_IMPLEMENTED; extern const int BAD_ARGUMENTS; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int UNKNOWN_ELEMENT_IN_CONFIG; - extern const int NO_ELEMENTS_IN_CONFIG; extern const int UNKNOWN_STORAGE; extern const int NO_REPLICA_NAME_GIVEN; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } @@ -62,171 +60,6 @@ static Names extractColumnNames(const ASTPtr & node) } } -/** Is used to order Graphite::Retentions by age and precision descending. - * Throws exception if not both age and precision are less or greater then another. - */ -static bool compareRetentions(const Graphite::Retention & a, const Graphite::Retention & b) -{ - if (a.age > b.age && a.precision > b.precision) - { - return true; - } - else if (a.age < b.age && a.precision < b.precision) - { - return false; - } - String error_msg = "age and precision should only grow up: " - + std::to_string(a.age) + ":" + std::to_string(a.precision) + " vs " - + std::to_string(b.age) + ":" + std::to_string(b.precision); - throw Exception( - error_msg, - ErrorCodes::BAD_ARGUMENTS); -} - -/** Read the settings for Graphite rollup from config. 
- * Example - * - * - * Path - * - * click_cost - * any - * - * 0 - * 3600 - * - * - * 86400 - * 60 - * - * - * - * max - * - * 0 - * 60 - * - * - * 3600 - * 300 - * - * - * 86400 - * 3600 - * - * - * - */ -static void appendGraphitePattern( - const Poco::Util::AbstractConfiguration & config, - const String & config_element, - Graphite::Patterns & out_patterns, - ContextPtr context) -{ - Graphite::Pattern pattern; - - Poco::Util::AbstractConfiguration::Keys keys; - config.keys(config_element, keys); - - for (const auto & key : keys) - { - if (key == "regexp") - { - pattern.regexp_str = config.getString(config_element + ".regexp"); - pattern.regexp = std::make_shared(pattern.regexp_str); - } - else if (key == "function") - { - String aggregate_function_name_with_params = config.getString(config_element + ".function"); - String aggregate_function_name; - Array params_row; - getAggregateFunctionNameAndParametersArray( - aggregate_function_name_with_params, aggregate_function_name, params_row, "GraphiteMergeTree storage initialization", context); - - /// TODO Not only Float64 - AggregateFunctionProperties properties; - pattern.function = AggregateFunctionFactory::instance().get( - aggregate_function_name, {std::make_shared()}, params_row, properties); - } - else if (startsWith(key, "retention")) - { - pattern.retentions.emplace_back(Graphite::Retention{ - .age = config.getUInt(config_element + "." + key + ".age"), - .precision = config.getUInt(config_element + "." + key + ".precision")}); - } - else - throw Exception("Unknown element in config: " + key, ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG); - } - - if (!pattern.function && pattern.retentions.empty()) - throw Exception( - "At least one of an aggregate function or retention rules is mandatory for rollup patterns in GraphiteMergeTree", - ErrorCodes::NO_ELEMENTS_IN_CONFIG); - - if (!pattern.function) - { - pattern.type = pattern.TypeRetention; - } - else if (pattern.retentions.empty()) - { - pattern.type = pattern.TypeAggregation; - } - else - { - pattern.type = pattern.TypeAll; - } - - if (pattern.type & pattern.TypeAggregation) /// TypeAggregation or TypeAll - if (pattern.function->allocatesMemoryInArena()) - throw Exception( - "Aggregate function " + pattern.function->getName() + " isn't supported in GraphiteMergeTree", ErrorCodes::NOT_IMPLEMENTED); - - /// retention should be in descending order of age. 
- if (pattern.type & pattern.TypeRetention) /// TypeRetention or TypeAll - std::sort(pattern.retentions.begin(), pattern.retentions.end(), compareRetentions); - - out_patterns.emplace_back(pattern); -} - -static void setGraphitePatternsFromConfig(ContextPtr context, const String & config_element, Graphite::Params & params) -{ - const auto & config = context->getConfigRef(); - - if (!config.has(config_element)) - throw Exception("No '" + config_element + "' element in configuration file", ErrorCodes::NO_ELEMENTS_IN_CONFIG); - - params.config_name = config_element; - params.path_column_name = config.getString(config_element + ".path_column_name", "Path"); - params.time_column_name = config.getString(config_element + ".time_column_name", "Time"); - params.value_column_name = config.getString(config_element + ".value_column_name", "Value"); - params.version_column_name = config.getString(config_element + ".version_column_name", "Timestamp"); - - Poco::Util::AbstractConfiguration::Keys keys; - config.keys(config_element, keys); - - for (const auto & key : keys) - { - if (startsWith(key, "pattern")) - { - appendGraphitePattern(config, config_element + "." + key, params.patterns, context); - } - else if (key == "default") - { - /// See below. - } - else if (key == "path_column_name" || key == "time_column_name" || key == "value_column_name" || key == "version_column_name") - { - /// See above. - } - else - throw Exception("Unknown element in config: " + key, ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG); - } - - if (config.has(config_element + ".default")) - appendGraphitePattern(config, config_element + "." + ".default", params.patterns, context); -} - - static String getMergeTreeVerboseHelp(bool) { using namespace std::string_literals; @@ -258,6 +91,34 @@ If you use the Replicated version of engines, see https://clickhouse.com/docs/en return help; } +static ColumnsDescription getColumnsDescriptionFromZookeeper(const String & raw_zookeeper_path, ContextMutablePtr context) +{ + String zookeeper_name = zkutil::extractZooKeeperName(raw_zookeeper_path); + String zookeeper_path = zkutil::extractZooKeeperPath(raw_zookeeper_path, true); + + if (!context->hasZooKeeper() && !context->hasAuxiliaryZooKeeper(zookeeper_name)) + throw Exception{ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot get replica structure without zookeeper, you must specify the structure manually"}; + + zkutil::ZooKeeperPtr zookeeper; + try + { + if (zookeeper_name == StorageReplicatedMergeTree::getDefaultZooKeeperName()) + zookeeper = context->getZooKeeper(); + else + zookeeper = context->getAuxiliaryZooKeeper(zookeeper_name); + } + catch (...) + { + throw Exception{ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot get replica structure from zookeeper, because cannot get zookeeper: {}. You must specify structure manually", getCurrentExceptionMessage(false)}; + } + + if (!zookeeper->exists(zookeeper_path + "/replicas")) + throw Exception{ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot get replica structure, because there no other replicas in zookeeper. 
You must specify the structure manually"}; + + Coordination::Stat columns_stat; + return ColumnsDescription::parse(zookeeper->get(fs::path(zookeeper_path) / "columns", &columns_stat)); +} + static StoragePtr create(const StorageFactory::Arguments & args) { @@ -638,7 +499,14 @@ static StoragePtr create(const StorageFactory::Arguments & args) String date_column_name; StorageInMemoryMetadata metadata; - metadata.setColumns(args.columns); + + ColumnsDescription columns; + if (args.columns.empty() && replicated) + columns = getColumnsDescriptionFromZookeeper(zookeeper_path, args.getContext()); + else + columns = args.columns; + + metadata.setColumns(columns); metadata.setComment(args.comment); std::unique_ptr storage_settings; @@ -705,12 +573,12 @@ static StoragePtr create(const StorageFactory::Arguments & args) if (args.query.columns_list && args.query.columns_list->indices) for (auto & index : args.query.columns_list->indices->children) - metadata.secondary_indices.push_back(IndexDescription::getIndexFromAST(index, args.columns, args.getContext())); + metadata.secondary_indices.push_back(IndexDescription::getIndexFromAST(index, columns, args.getContext())); if (args.query.columns_list && args.query.columns_list->projections) for (auto & projection_ast : args.query.columns_list->projections->children) { - auto projection = ProjectionDescription::getProjectionFromAST(projection_ast, args.columns, args.getContext()); + auto projection = ProjectionDescription::getProjectionFromAST(projection_ast, columns, args.getContext()); metadata.projections.add(std::move(projection)); } @@ -720,10 +588,10 @@ static StoragePtr create(const StorageFactory::Arguments & args) constraints.push_back(constraint); metadata.constraints = ConstraintsDescription(constraints); - auto column_ttl_asts = args.columns.getColumnTTLs(); + auto column_ttl_asts = columns.getColumnTTLs(); for (const auto & [name, ast] : column_ttl_asts) { - auto new_ttl_entry = TTLDescription::getTTLFromAST(ast, args.columns, args.getContext(), metadata.primary_key); + auto new_ttl_entry = TTLDescription::getTTLFromAST(ast, columns, args.getContext(), metadata.primary_key); metadata.column_ttls_by_name[name] = new_ttl_entry; } @@ -850,6 +718,7 @@ void registerStorageMergeTree(StorageFactory & factory) features.supports_replication = true; features.supports_deduplication = true; + features.supports_schema_inference = true; factory.registerStorage("ReplicatedMergeTree", create, features); factory.registerStorage("ReplicatedCollapsingMergeTree", create, features); diff --git a/src/Storages/MySQL/MySQLHelpers.cpp b/src/Storages/MySQL/MySQLHelpers.cpp index e7745e6c0bb..edeb4ffca8a 100644 --- a/src/Storages/MySQL/MySQLHelpers.cpp +++ b/src/Storages/MySQL/MySQLHelpers.cpp @@ -8,9 +8,17 @@ namespace DB { +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + mysqlxx::PoolWithFailover createMySQLPoolWithFailover(const StorageMySQLConfiguration & configuration, const MySQLSettings & mysql_settings) { + if (!mysql_settings.connection_pool_size) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Connection pool cannot have zero size"); + return mysqlxx::PoolWithFailover( configuration.database, configuration.addresses, configuration.username, configuration.password, MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_START_CONNECTIONS, diff --git a/src/Storages/MySQL/MySQLSettings.h b/src/Storages/MySQL/MySQLSettings.h index aa2c2703d6b..be1e09c12e6 100644 --- a/src/Storages/MySQL/MySQLSettings.h +++ b/src/Storages/MySQL/MySQLSettings.h @@ -25,11 +25,14 @@ class 
ASTStorage; DECLARE_SETTINGS_TRAITS(MySQLSettingsTraits, LIST_OF_MYSQL_SETTINGS) +using MySQLBaseSettings = BaseSettings; + /** Settings for the MySQL family of engines. */ -struct MySQLSettings : public BaseSettings +struct MySQLSettings : public MySQLBaseSettings { void loadFromQuery(ASTStorage & storage_def); }; + } diff --git a/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp b/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp index 4c66eda2fed..f02653d9167 100644 --- a/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp +++ b/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp @@ -18,6 +18,7 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int POSTGRESQL_REPLICATION_INTERNAL_ERROR; } MaterializedPostgreSQLConsumer::MaterializedPostgreSQLConsumer( @@ -29,7 +30,7 @@ MaterializedPostgreSQLConsumer::MaterializedPostgreSQLConsumer( const size_t max_block_size_, bool schema_as_a_part_of_table_name_, bool allow_automatic_update_, - Storages storages_, + StorageInfos storages_info_, const String & name_for_logger) : log(&Poco::Logger::get("PostgreSQLReplicaConsumer(" + name_for_logger + ")")) , context(context_) @@ -41,7 +42,6 @@ MaterializedPostgreSQLConsumer::MaterializedPostgreSQLConsumer( , max_block_size(max_block_size_) , schema_as_a_part_of_table_name(schema_as_a_part_of_table_name_) , allow_automatic_update(allow_automatic_update_) - , storages(storages_) { final_lsn = start_lsn; auto tx = std::make_shared(connection->getRef()); @@ -49,19 +49,28 @@ MaterializedPostgreSQLConsumer::MaterializedPostgreSQLConsumer( LOG_TRACE(log, "Starting replication. LSN: {} (last: {})", getLSNValue(current_lsn), getLSNValue(final_lsn)); tx->commit(); - for (const auto & [table_name, storage] : storages) - { - buffers.emplace(table_name, Buffer(storage)); - } + for (const auto & [table_name, storage_info] : storages_info_) + storages.emplace(table_name, storage_info); } -void MaterializedPostgreSQLConsumer::Buffer::createEmptyBuffer(StoragePtr storage) +MaterializedPostgreSQLConsumer::StorageData::StorageData(const StorageInfo & storage_info) + : storage(storage_info.storage), buffer(storage_info.storage->getInMemoryMetadataPtr(), storage_info.attributes) +{ + auto table_id = storage_info.storage->getStorageID(); + LOG_TRACE(&Poco::Logger::get("StorageMaterializedPostgreSQL"), + "New buffer for table {}, number of attributes: {}, number if columns: {}, structure: {}", + table_id.getNameForLogs(), buffer.attributes.size(), buffer.getColumnsNum(), buffer.description.sample_block.dumpStructure()); +} + + +MaterializedPostgreSQLConsumer::StorageData::Buffer::Buffer( + StorageMetadataPtr storage_metadata, const PostgreSQLTableStructure::Attributes & attributes_) + : attributes(attributes_) { - const auto storage_metadata = storage->getInMemoryMetadataPtr(); const Block sample_block = storage_metadata->getSampleBlock(); - /// Need to clear type, because in description.init() the types are appended (emplace_back) + /// Need to clear type, because in description.init() the types are appended description.types.clear(); description.init(sample_block); @@ -69,13 +78,13 @@ void MaterializedPostgreSQLConsumer::Buffer::createEmptyBuffer(StoragePtr storag const auto & storage_columns = storage_metadata->getColumns().getAllPhysical(); auto insert_columns = std::make_shared(); - auto table_id = storage->getStorageID(); - LOG_TRACE(&Poco::Logger::get("MaterializedPostgreSQLBuffer"), "New buffer for table {}.{} ({}), structure: {}", - 
table_id.database_name, table_id.table_name, toString(table_id.uuid), sample_block.dumpStructure()); + auto columns_num = description.sample_block.columns(); + assert(columns_num == storage_columns.size()); + if (attributes.size() + 2 != columns_num) /// +2 because sign and version columns + throw Exception(ErrorCodes::LOGICAL_ERROR, "Columns number mismatch. Attributes: {}, buffer: {}", + attributes.size(), columns_num); - assert(description.sample_block.columns() == storage_columns.size()); size_t idx = 0; - for (const auto & column : storage_columns) { if (description.types[idx].first == ExternalResultDescription::ValueType::vtArray) @@ -85,37 +94,45 @@ void MaterializedPostgreSQLConsumer::Buffer::createEmptyBuffer(StoragePtr storag insert_columns->children.emplace_back(std::make_shared(column.name)); } - columnsAST = std::move(insert_columns); + columns_ast = std::move(insert_columns); } -void MaterializedPostgreSQLConsumer::insertValue(Buffer & buffer, const std::string & value, size_t column_idx) +void MaterializedPostgreSQLConsumer::insertValue(StorageData::Buffer & buffer, const std::string & value, size_t column_idx) { const auto & sample = buffer.description.sample_block.getByPosition(column_idx); bool is_nullable = buffer.description.types[column_idx].second; - if (is_nullable) + try { - ColumnNullable & column_nullable = assert_cast(*buffer.columns[column_idx]); - const auto & data_type = assert_cast(*sample.type); + if (is_nullable) + { + ColumnNullable & column_nullable = assert_cast(*buffer.columns[column_idx]); + const auto & data_type = assert_cast(*sample.type); - insertPostgreSQLValue( - column_nullable.getNestedColumn(), value, - buffer.description.types[column_idx].first, data_type.getNestedType(), buffer.array_info, column_idx); + insertPostgreSQLValue( + column_nullable.getNestedColumn(), value, + buffer.description.types[column_idx].first, data_type.getNestedType(), buffer.array_info, column_idx); - column_nullable.getNullMapData().emplace_back(0); + column_nullable.getNullMapData().emplace_back(0); + } + else + { + insertPostgreSQLValue( + *buffer.columns[column_idx], value, + buffer.description.types[column_idx].first, sample.type, + buffer.array_info, column_idx); + } } - else + catch (const pqxx::conversion_error & e) { - insertPostgreSQLValue( - *buffer.columns[column_idx], value, - buffer.description.types[column_idx].first, sample.type, - buffer.array_info, column_idx); + LOG_ERROR(log, "Conversion failed while inserting PostgreSQL value {}, will insert default value. Error: {}", value, e.what()); + insertDefaultValue(buffer, column_idx); } } -void MaterializedPostgreSQLConsumer::insertDefaultValue(Buffer & buffer, size_t column_idx) +void MaterializedPostgreSQLConsumer::insertDefaultValue(StorageData::Buffer & buffer, size_t column_idx) { const auto & sample = buffer.description.sample_block.getByPosition(column_idx); insertDefaultPostgreSQLValue(*buffer.columns[column_idx], *sample.column); @@ -186,10 +203,16 @@ Int8 MaterializedPostgreSQLConsumer::readInt8(const char * message, size_t & pos void MaterializedPostgreSQLConsumer::readTupleData( - Buffer & buffer, const char * message, size_t & pos, [[maybe_unused]] size_t size, PostgreSQLQuery type, bool old_value) + StorageData::Buffer & buffer, const char * message, size_t & pos, [[maybe_unused]] size_t size, PostgreSQLQuery type, bool old_value) { Int16 num_columns = readInt16(message, pos, size); + /// Sanity check. In fact, it was already checked. 
+ if (static_cast(num_columns) + 2 != buffer.getColumnsNum()) /// +2 -- sign and version columns + throw Exception(ErrorCodes::POSTGRESQL_REPLICATION_INTERNAL_ERROR, + "Number of columns does not match. Got: {}, expected {}, current buffer structure: {}", + num_columns, buffer.getColumnsNum(), buffer.description.sample_block.dumpStructure()); + auto proccess_column_value = [&](Int8 identifier, Int16 column_idx) { switch (identifier) @@ -202,8 +225,15 @@ void MaterializedPostgreSQLConsumer::readTupleData( case 't': /// Text formatted value { Int32 col_len = readInt32(message, pos, size); - String value; + /// Sanity check for protocol misuse. + /// PostgreSQL uses a fixed page size (commonly 8 kB), and does not allow tuples to span multiple pages. + static constexpr Int32 sanity_check_max_col_len = 1024 * 8 * 2; /// *2 -- just in case. + if (unlikely(col_len > sanity_check_max_col_len)) + throw Exception(ErrorCodes::POSTGRESQL_REPLICATION_INTERNAL_ERROR, + "Column length is suspiciously long: {}", col_len); + + String value; for (Int32 i = 0; i < col_len; ++i) value += readInt8(message, pos, size); @@ -276,19 +306,20 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl { Int32 relation_id = readInt32(replication_message, pos, size); const auto & table_name = relation_id_to_name[relation_id]; - /// FIXME:If table name is empty here, it means we failed to load it, but it was included in publication. Need to remove? if (table_name.empty()) - LOG_WARNING(log, "No table mapping for relation id: {}. Probably table failed to be loaded", relation_id); + { + LOG_ERROR(log, "No table mapping for relation id: {}. It's a bug", relation_id); + return; + } if (!isSyncAllowed(relation_id, table_name)) return; Int8 new_tuple = readInt8(replication_message, pos, size); - auto buffer = buffers.find(table_name); - assert(buffer != buffers.end()); + auto & buffer = storages.find(table_name)->second.buffer; if (new_tuple) - readTupleData(buffer->second, replication_message, pos, size, PostgreSQLQuery::INSERT); + readTupleData(buffer, replication_message, pos, size, PostgreSQLQuery::INSERT); break; } @@ -296,15 +327,16 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl { Int32 relation_id = readInt32(replication_message, pos, size); const auto & table_name = relation_id_to_name[relation_id]; - /// FIXME:If table name is empty here, it means we failed to load it, but it was included in publication. Need to remove? if (table_name.empty()) - LOG_WARNING(log, "No table mapping for relation id: {}. Probably table failed to be loaded", relation_id); + { + LOG_ERROR(log, "No table mapping for relation id: {}. It's a bug", relation_id); + return; + } if (!isSyncAllowed(relation_id, table_name)) return; - auto buffer = buffers.find(table_name); - assert(buffer != buffers.end()); + auto & buffer = storages.find(table_name)->second.buffer; auto proccess_identifier = [&](Int8 identifier) -> bool { @@ -319,13 +351,13 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl /// it is much more efficient to use replica identity index, but support all possible cases. case 'O': { - readTupleData(buffer->second, replication_message, pos, size, PostgreSQLQuery::UPDATE, true); + readTupleData(buffer, replication_message, pos, size, PostgreSQLQuery::UPDATE, true); break; } case 'N': { /// New row. 
- readTupleData(buffer->second, replication_message, pos, size, PostgreSQLQuery::UPDATE); + readTupleData(buffer, replication_message, pos, size, PostgreSQLQuery::UPDATE); read_next = false; break; } @@ -347,9 +379,11 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl { Int32 relation_id = readInt32(replication_message, pos, size); const auto & table_name = relation_id_to_name[relation_id]; - /// FIXME:If table name is empty here, it means we failed to load it, but it was included in publication. Need to remove? if (table_name.empty()) - LOG_WARNING(log, "No table mapping for relation id: {}. Probably table failed to be loaded", relation_id); + { + LOG_ERROR(log, "No table mapping for relation id: {}. It's a bug", relation_id); + return; + } if (!isSyncAllowed(relation_id, table_name)) return; @@ -357,10 +391,8 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl /// 0 or 1 if replica identity is set to full. For now only default replica identity is supported (with primary keys). readInt8(replication_message, pos, size); - auto buffer = buffers.find(table_name); - assert(buffer != buffers.end()); - readTupleData(buffer->second, replication_message, pos, size, PostgreSQLQuery::DELETE); - + auto & buffer = storages.find(table_name)->second.buffer; + readTupleData(buffer, replication_message, pos, size, PostgreSQLQuery::DELETE); break; } case 'C': // Commit @@ -379,7 +411,6 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl Int32 relation_id = readInt32(replication_message, pos, size); String relation_namespace, relation_name; - readString(replication_message, pos, size, relation_namespace); readString(replication_message, pos, size, relation_name); @@ -389,22 +420,26 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl else table_name = relation_name; + if (!relation_id_to_name.contains(relation_id)) + relation_id_to_name[relation_id] = table_name; + if (!isSyncAllowed(relation_id, relation_name)) return; - if (storages.find(table_name) == storages.end()) + auto storage_iter = storages.find(table_name); + if (storage_iter == storages.end()) { - markTableAsSkipped(relation_id, table_name); - /// TODO: This can happen if we created a publication with this table but then got an exception that this + /// FIXME: This can happen if we created a publication with this table but then got an exception that this /// table has primary key or something else. LOG_ERROR(log, - "Storage for table {} does not exist, but is included in replication stream. (Storages number: {})", + "Storage for table {} does not exist, but is included in replication stream. (Storages number: {})" + "Please manually remove this table from replication (DETACH TABLE query) to avoid redundant replication", table_name, storages.size()); + markTableAsSkipped(relation_id, table_name); return; } - assert(buffers.contains(table_name)); - + auto & buffer = storage_iter->second.buffer; /// 'd' - default (primary key if any) /// 'n' - nothing @@ -412,7 +447,6 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl /// 'i' - user defined index with indisreplident set /// Only 'd' and 'i' - are supported. 
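The readTupleData() changes earlier in this hunk add two defensive checks: the column count announced in the message must equal the buffer's column count minus the sign and version columns, and a text value's length is bounded (PostgreSQL keeps a tuple within one page, commonly 8 kB, so the diff allows at most twice that). Below is a standalone sketch of a tuple-value reader with those bounds; the consumer in the diff actually works on a textual replication message, so the raw byte layout and helper names here are illustrative assumptions only, and bounds checks on the fixed-size integer reads are omitted for brevity.

#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <string>
#include <vector>

static int16_t readInt16(const uint8_t * data, size_t & pos) { int16_t v; std::memcpy(&v, data + pos, 2); pos += 2; return v; }
static int32_t readInt32(const uint8_t * data, size_t & pos) { int32_t v; std::memcpy(&v, data + pos, 4); pos += 4; return v; }

/// Hypothetical reader: Int16 column count, then per column an identifier byte
/// ('n' null, 'u' unchanged TOAST, 't' text) and, for 't', an Int32 length plus raw bytes.
std::vector<std::string> readTupleText(const uint8_t * data, size_t size, size_t expected_columns)
{
    static constexpr int32_t max_col_len = 1024 * 8 * 2;  /// page size (8 kB) * 2, as in the diff

    size_t pos = 0;
    int16_t num_columns = readInt16(data, pos);
    if (static_cast<size_t>(num_columns) != expected_columns)
        throw std::runtime_error("column count mismatch");

    std::vector<std::string> values;
    for (int16_t i = 0; i < num_columns; ++i)
    {
        char id = static_cast<char>(data[pos++]);
        if (id == 'n' || id == 'u') { values.emplace_back(); continue; }  /// null / unchanged value
        if (id != 't')
            throw std::runtime_error("unexpected value identifier");

        int32_t len = readInt32(data, pos);
        if (len < 0 || len > max_col_len || pos + static_cast<size_t>(len) > size)
            throw std::runtime_error("column length is suspiciously long or truncated");
        values.emplace_back(reinterpret_cast<const char *>(data + pos), len);
        pos += static_cast<size_t>(len);
    }
    return values;
}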
char replica_identity = readInt8(replication_message, pos, size); - if (replica_identity != 'd' && replica_identity != 'i') { LOG_WARNING(log, @@ -423,25 +457,29 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl Int16 num_columns = readInt16(replication_message, pos, size); - Int32 data_type_id; - Int32 type_modifier; /// For example, n in varchar(n) - - bool new_relation_definition = false; - if (schema_data.find(relation_id) == schema_data.end()) - { - relation_id_to_name[relation_id] = table_name; - schema_data.emplace(relation_id, SchemaData(num_columns)); - new_relation_definition = true; - } - - auto & current_schema_data = schema_data.find(relation_id)->second; - - if (current_schema_data.number_of_columns != num_columns) + if (static_cast(num_columns) + 2 != buffer.getColumnsNum()) /// +2 -- sign and version columns { markTableAsSkipped(relation_id, table_name); return; } + if (static_cast(num_columns) != buffer.attributes.size()) + { +#ifndef NDEBUG + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Mismatch in attributes size. Got {}, expected {}. It's a bug. Current buffer structure: {}", + num_columns, buffer.attributes.size(), buffer.description.sample_block.dumpStructure()); +#else + LOG_ERROR(log, "Mismatch in attributes size. Got {}, expected {}. It's a bug. Current buffer structure: {}", + num_columns, buffer.attributes.size(), buffer.description.sample_block.dumpStructure()); + markTableAsSkipped(relation_id, table_name); + return; +#endif + } + + Int32 data_type_id; + Int32 type_modifier; /// For example, n in varchar(n) + for (uint16_t i = 0; i < num_columns; ++i) { String column_name; @@ -451,23 +489,14 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl data_type_id = readInt32(replication_message, pos, size); type_modifier = readInt32(replication_message, pos, size); - if (new_relation_definition) + if (buffer.attributes[i].atttypid != data_type_id || buffer.attributes[i].atttypmod != type_modifier) { - current_schema_data.column_identifiers.emplace_back(std::make_pair(data_type_id, type_modifier)); - } - else - { - if (current_schema_data.column_identifiers[i].first != data_type_id - || current_schema_data.column_identifiers[i].second != type_modifier) - { - markTableAsSkipped(relation_id, table_name); - return; - } + markTableAsSkipped(relation_id, table_name); + return; } } tables_to_sync.insert(table_name); - break; } case 'O': // Origin @@ -489,19 +518,19 @@ void MaterializedPostgreSQLConsumer::syncTables() { for (const auto & table_name : tables_to_sync) { - auto & buffer = buffers.find(table_name)->second; - Block result_rows = buffer.description.sample_block.cloneWithColumns(std::move(buffer.columns)); + auto & storage_data = storages.find(table_name)->second; + Block result_rows = storage_data.buffer.description.sample_block.cloneWithColumns(std::move(storage_data.buffer.columns)); if (result_rows.rows()) { - auto storage = storages[table_name]; + auto storage = storage_data.storage; auto insert_context = Context::createCopy(context); insert_context->setInternalQuery(true); auto insert = std::make_shared(); insert->table_id = storage->getStorageID(); - insert->columns = buffer.columnsAST; + insert->columns = storage_data.buffer.columns_ast; InterpreterInsertQuery interpreter(insert, insert_context, true); auto io = interpreter.execute(); @@ -514,7 +543,7 @@ void MaterializedPostgreSQLConsumer::syncTables() CompletedPipelineExecutor executor(io.pipeline); executor.execute(); - buffer.columns = 
buffer.description.sample_block.cloneEmptyColumns(); + storage_data.buffer.columns = storage_data.buffer.description.sample_block.cloneEmptyColumns(); } } @@ -599,34 +628,21 @@ bool MaterializedPostgreSQLConsumer::isSyncAllowed(Int32 relation_id, const Stri void MaterializedPostgreSQLConsumer::markTableAsSkipped(Int32 relation_id, const String & relation_name) { - /// Empty lsn string means - continue waiting for valid lsn. - skip_list.insert({relation_id, ""}); + skip_list.insert({relation_id, ""}); /// Empty lsn string means - continue waiting for valid lsn. + storages.erase(relation_name); - if (storages.count(relation_name)) - { - /// Erase cached schema identifiers. It will be updated again once table is allowed back into replication stream - /// and it receives first data after update. - schema_data.erase(relation_id); - - /// Clear table buffer. - auto & buffer = buffers.find(relation_name)->second; - buffer.columns = buffer.description.sample_block.cloneEmptyColumns(); - - if (allow_automatic_update) - LOG_TRACE(log, "Table {} (relation_id: {}) is skipped temporarily. It will be reloaded in the background", relation_name, relation_id); - else - LOG_WARNING(log, "Table {} (relation_id: {}) is skipped, because table schema has changed", relation_name, relation_id); - } + if (allow_automatic_update) + LOG_TRACE(log, "Table {} (relation_id: {}) is skipped temporarily. It will be reloaded in the background", relation_name, relation_id); + else + LOG_WARNING(log, "Table {} (relation_id: {}) is skipped, because table schema has changed", relation_name, relation_id); } -void MaterializedPostgreSQLConsumer::addNested(const String & postgres_table_name, StoragePtr nested_storage, const String & table_start_lsn) +void MaterializedPostgreSQLConsumer::addNested( + const String & postgres_table_name, StorageInfo nested_storage_info, const String & table_start_lsn) { - /// Cache new pointer to replacingMergeTree table. - storages.emplace(postgres_table_name, nested_storage); - - /// Add new in-memory buffer. - buffers.emplace(postgres_table_name, Buffer(nested_storage)); + assert(!storages.contains(postgres_table_name)); + storages.emplace(postgres_table_name, nested_storage_info); /// Replication consumer will read wall and check for currently processed table whether it is allowed to start applying /// changes to this table. @@ -634,14 +650,10 @@ void MaterializedPostgreSQLConsumer::addNested(const String & postgres_table_nam } -void MaterializedPostgreSQLConsumer::updateNested(const String & table_name, StoragePtr nested_storage, Int32 table_id, const String & table_start_lsn) +void MaterializedPostgreSQLConsumer::updateNested(const String & table_name, StorageInfo nested_storage_info, Int32 table_id, const String & table_start_lsn) { - /// Cache new pointer to replacingMergeTree table. - storages[table_name] = nested_storage; - - /// Create a new empty buffer (with updated metadata), where data is first loaded before syncing into actual table. - auto & buffer = buffers.find(table_name)->second; - buffer.createEmptyBuffer(nested_storage); + assert(!storages.contains(table_name)); + storages.emplace(table_name, nested_storage_info); /// Set start position to valid lsn. Before it was an empty string. Further read for table allowed, if it has a valid lsn. 
skip_list[table_id] = table_start_lsn; @@ -651,7 +663,6 @@ void MaterializedPostgreSQLConsumer::updateNested(const String & table_name, Sto void MaterializedPostgreSQLConsumer::removeNested(const String & postgres_table_name) { storages.erase(postgres_table_name); - buffers.erase(postgres_table_name); deleted_tables.insert(postgres_table_name); } @@ -706,7 +717,17 @@ bool MaterializedPostgreSQLConsumer::readFromReplicationSlot() current_lsn = (*row)[0]; lsn_value = getLSNValue(current_lsn); - processReplicationMessage((*row)[1].c_str(), (*row)[1].size()); + try + { + // LOG_DEBUG(log, "Current message: {}", (*row)[1]); + processReplicationMessage((*row)[1].c_str(), (*row)[1].size()); + } + catch (const Exception & e) + { + if (e.code() == ErrorCodes::POSTGRESQL_REPLICATION_INTERNAL_ERROR) + continue; + throw; + } } } catch (const Exception &) @@ -737,11 +758,6 @@ bool MaterializedPostgreSQLConsumer::readFromReplicationSlot() LOG_ERROR(log, "Conversion error: {}", e.what()); return false; } - catch (const pqxx::in_doubt_error & e) - { - LOG_ERROR(log, "PostgreSQL library has some doubts: {}", e.what()); - return false; - } catch (const pqxx::internal_error & e) { LOG_ERROR(log, "PostgreSQL library internal error: {}", e.what()); @@ -749,16 +765,8 @@ bool MaterializedPostgreSQLConsumer::readFromReplicationSlot() } catch (...) { - /// Since reading is done from a background task, it is important to catch any possible error - /// in order to understand why something does not work. - try - { - std::rethrow_exception(std::current_exception()); - } - catch (const std::exception& e) - { - LOG_ERROR(log, "Unexpected error: {}", e.what()); - } + tryLogCurrentException(__PRETTY_FUNCTION__); + return false; } if (!tables_to_sync.empty()) @@ -770,6 +778,11 @@ bool MaterializedPostgreSQLConsumer::readFromReplicationSlot() bool MaterializedPostgreSQLConsumer::consume(std::vector> & skipped_tables) { + /// Read up to max_block_size changed (approximately - in same cases might be more). + /// false: no data was read, reschedule. + /// true: some data was read, schedule as soon as possible. + auto read_next = readFromReplicationSlot(); + /// Check if there are tables, which are skipped from being updated by changes from replication stream, /// because schema changes were detected. Update them, if it is allowed. if (allow_automatic_update && !skip_list.empty()) @@ -786,10 +799,6 @@ bool MaterializedPostgreSQLConsumer::consume(std::vector #include #include +#include namespace DB { struct SettingChange; +struct StorageInfo +{ + StoragePtr storage; + PostgreSQLTableStructure::Attributes attributes; + + StorageInfo(StoragePtr storage_, const PostgreSQLTableStructure::Attributes & attributes_) + : storage(storage_), attributes(attributes_) {} +}; +using StorageInfos = std::unordered_map; + class MaterializedPostgreSQLConsumer { -public: - using Storages = std::unordered_map; +private: + struct StorageData + { + struct Buffer + { + ExternalResultDescription description; + MutableColumns columns; + /// Needed to pass to insert query columns list in syncTables(). + std::shared_ptr columns_ast; + /// Needed for insertPostgreSQLValue() method to parse array + std::unordered_map array_info; + /// To validate ddl. 
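readFromReplicationSlot() above now wraps processReplicationMessage() in a try/catch so that a POSTGRESQL_REPLICATION_INTERNAL_ERROR raised for one message only skips that message (continue), while every other exception is rethrown and fails the batch. A minimal standalone sketch of that per-message error isolation; the exception type and the message loop are hypothetical stand-ins:

#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

/// Hypothetical recoverable error corresponding to POSTGRESQL_REPLICATION_INTERNAL_ERROR.
struct ReplicationInternalError : std::runtime_error { using std::runtime_error::runtime_error; };

void processMessage(const std::string & msg)
{
    if (msg == "bad")
        throw ReplicationInternalError("malformed replication message");
    std::printf("applied: %s\n", msg.c_str());
}

void consumeBatch(const std::vector<std::string> & messages)
{
    for (const auto & msg : messages)
    {
        try
        {
            processMessage(msg);
        }
        catch (const ReplicationInternalError & e)
        {
            /// Per-message problem: log and continue with the next message, mirroring the `continue` in the diff.
            std::fprintf(stderr, "skipping message: %s\n", e.what());
            continue;
        }
        /// Any other exception propagates and fails the whole batch.
    }
}

int main()
{
    consumeBatch({"ok-1", "bad", "ok-2"});
}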
+ PostgreSQLTableStructure::Attributes attributes; + + Buffer(StorageMetadataPtr storage_metadata, const PostgreSQLTableStructure::Attributes & attributes_); + + size_t getColumnsNum() const + { + const auto & sample_block = description.sample_block; + return sample_block.columns(); + } + }; + + StoragePtr storage; + Buffer buffer; + + explicit StorageData(const StorageInfo & storage_info); + StorageData(const StorageData & other) = delete; + }; + + using Storages = std::unordered_map; + +public: MaterializedPostgreSQLConsumer( ContextPtr context_, std::shared_ptr connection_, const String & replication_slot_name_, const String & publication_name_, const String & start_lsn, - const size_t max_block_size_, + size_t max_block_size_, bool schema_as_a_part_of_table_name_, bool allow_automatic_update_, - Storages storages_, + StorageInfos storages_, const String & name_for_logger); bool consume(std::vector> & skipped_tables); /// Called from reloadFromSnapshot by replication handler. This method is needed to move a table back into synchronization /// process if it was skipped due to schema changes. - void updateNested(const String & table_name, StoragePtr nested_storage, Int32 table_id, const String & table_start_lsn); + void updateNested(const String & table_name, StorageInfo nested_storage_info, Int32 table_id, const String & table_start_lsn); - void addNested(const String & postgres_table_name, StoragePtr nested_storage, const String & table_start_lsn); + void addNested(const String & postgres_table_name, StorageInfo nested_storage_info, const String & table_start_lsn); void removeNested(const String & postgres_table_name); @@ -55,25 +97,8 @@ private: bool isSyncAllowed(Int32 relation_id, const String & relation_name); - struct Buffer - { - ExternalResultDescription description; - MutableColumns columns; - - /// Needed to pass to insert query columns list in syncTables(). - std::shared_ptr columnsAST; - - /// Needed for insertPostgreSQLValue() method to parse array - std::unordered_map array_info; - - Buffer(StoragePtr storage) { createEmptyBuffer(storage); } - void createEmptyBuffer(StoragePtr storage); - }; - - using Buffers = std::unordered_map; - - static void insertDefaultValue(Buffer & buffer, size_t column_idx); - static void insertValue(Buffer & buffer, const std::string & value, size_t column_idx); + static void insertDefaultValue(StorageData::Buffer & buffer, size_t column_idx); + void insertValue(StorageData::Buffer & buffer, const std::string & value, size_t column_idx); enum class PostgreSQLQuery { @@ -82,7 +107,7 @@ private: DELETE }; - void readTupleData(Buffer & buffer, const char * message, size_t & pos, size_t size, PostgreSQLQuery type, bool old_value = false); + void readTupleData(StorageData::Buffer & buffer, const char * message, size_t & pos, size_t size, PostgreSQLQuery type, bool old_value = false); template static T unhexN(const char * message, size_t pos, size_t n); @@ -95,7 +120,7 @@ private: void markTableAsSkipped(Int32 relation_id, const String & relation_name); /// lsn - log sequnce nuumber, like wal offset (64 bit). - Int64 getLSNValue(const std::string & lsn) + static Int64 getLSNValue(const std::string & lsn) { UInt32 upper_half, lower_half; std::sscanf(lsn.data(), "%X/%X", &upper_half, &lower_half); @@ -125,28 +150,11 @@ private: /// Holds `postgres_table_name` set. std::unordered_set tables_to_sync; - /// `postgres_table_name` -> ReplacingMergeTree table. + /// `postgres_table_name` -> StorageData. 
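getLSNValue() above converts a textual PostgreSQL LSN, two hexadecimal halves separated by '/', into a single 64-bit offset so positions can be compared numerically. A standalone sketch of the same conversion using std::stoull instead of sscanf (the sample LSN value is arbitrary):

#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <string>

/// Parse a PostgreSQL LSN of the form "<upper>/<lower>" (hex halves) into a 64-bit value.
uint64_t lsnToUInt64(const std::string & lsn)
{
    auto slash = lsn.find('/');
    if (slash == std::string::npos)
        throw std::invalid_argument("not an LSN: " + lsn);
    uint64_t upper = std::stoull(lsn.substr(0, slash), nullptr, 16);
    uint64_t lower = std::stoull(lsn.substr(slash + 1), nullptr, 16);
    return (upper << 32) + lower;
}

int main()
{
    std::printf("%llu\n", static_cast<unsigned long long>(lsnToUInt64("16/B374D848")));
}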
Storages storages; - /// `postgres_table_name` -> In-memory buffer. - Buffers buffers; std::unordered_map relation_id_to_name; - struct SchemaData - { - Int16 number_of_columns; - /// data_type_id and type_modifier - std::vector> column_identifiers; - - SchemaData(Int16 number_of_columns_) : number_of_columns(number_of_columns_) {} - }; - - /// Cache for table schema data to be able to detect schema changes, because ddl is not - /// replicated with postgresql logical replication protocol, but some table schema info - /// is received if it is the first time we received dml message for given relation in current session or - /// if relation definition has changed since the last relation definition message. - std::unordered_map schema_data; - /// `postgres_relation_id` -> `start_lsn` /// skip_list contains relation ids for tables on which ddl was performed, which can break synchronization. /// This breaking changes are detected in replication stream in according replication message and table is added to skip list. diff --git a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp index 4848ae6c9ea..e7d72de2056 100644 --- a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp +++ b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp @@ -104,11 +104,16 @@ void PostgreSQLReplicationHandler::addStorage(const std::string & table_name, St } -void PostgreSQLReplicationHandler::startup() +void PostgreSQLReplicationHandler::startup(bool delayed) { - /// We load tables in a separate thread, because this database is not created yet. - /// (will get "database is currently dropped or renamed") - startup_task->activateAndSchedule(); + if (delayed) + { + startup_task->activateAndSchedule(); + } + else + { + startSynchronization(/* throw_on_error */ true); + } } @@ -175,6 +180,7 @@ void PostgreSQLReplicationHandler::shutdown() startup_task->deactivate(); consumer_task->deactivate(); cleanup_task->deactivate(); + consumer.reset(); /// Clear shared pointers to inner storages. } @@ -185,7 +191,7 @@ void PostgreSQLReplicationHandler::startSynchronization(bool throw_on_error) createPublicationIfNeeded(tx); /// List of nested tables (table_name -> nested_storage), which is passed to replication consumer. - std::unordered_map nested_storages; + std::unordered_map nested_storages; /// snapshot_name is initialized only if a new replication slot is created. 
/// start_lsn is initialized in two places: @@ -220,7 +226,7 @@ void PostgreSQLReplicationHandler::startSynchronization(bool throw_on_error) { try { - nested_storages[table_name] = loadFromSnapshot(*tmp_connection, snapshot_name, table_name, storage->as()); + nested_storages.emplace(table_name, loadFromSnapshot(*tmp_connection, snapshot_name, table_name, storage->as())); } catch (Exception & e) { @@ -262,7 +268,12 @@ void PostgreSQLReplicationHandler::startSynchronization(bool throw_on_error) auto * materialized_storage = storage->as (); try { - nested_storages[table_name] = materialized_storage->getNested(); + auto [postgres_table_schema, postgres_table_name] = getSchemaAndTableName(table_name); + auto table_structure = fetchPostgreSQLTableStructure(tx, postgres_table_name, postgres_table_schema, true, true, true); + if (!table_structure.physical_columns) + throw Exception(ErrorCodes::LOGICAL_ERROR, "No columns"); + auto storage_info = StorageInfo(materialized_storage->getNested(), table_structure.physical_columns->attributes); + nested_storages.emplace(table_name, std::move(storage_info)); } catch (Exception & e) { @@ -315,7 +326,7 @@ ASTPtr PostgreSQLReplicationHandler::getCreateNestedTableQuery(StorageMaterializ } -StoragePtr PostgreSQLReplicationHandler::loadFromSnapshot(postgres::Connection & connection, String & snapshot_name, const String & table_name, +StorageInfo PostgreSQLReplicationHandler::loadFromSnapshot(postgres::Connection & connection, String & snapshot_name, const String & table_name, StorageMaterializedPostgreSQL * materialized_storage) { auto tx = std::make_shared(connection.getRef()); @@ -329,8 +340,13 @@ StoragePtr PostgreSQLReplicationHandler::loadFromSnapshot(postgres::Connection & query_str = fmt::format("SELECT * FROM {}", quoted_name); LOG_DEBUG(log, "Loading PostgreSQL table {}.{}", postgres_database, quoted_name); + auto table_structure = fetchTableStructure(*tx, table_name); + if (!table_structure->physical_columns) + throw Exception(ErrorCodes::LOGICAL_ERROR, "No table attributes"); + auto table_attributes = table_structure->physical_columns->attributes; + auto table_override = tryGetTableOverride(current_database_name, table_name); - materialized_storage->createNestedIfNeeded(fetchTableStructure(*tx, table_name), table_override ? table_override->as() : nullptr); + materialized_storage->createNestedIfNeeded(std::move(table_structure), table_override ? 
table_override->as() : nullptr); auto nested_storage = materialized_storage->getNested(); auto insert = std::make_shared(); @@ -355,7 +371,7 @@ StoragePtr PostgreSQLReplicationHandler::loadFromSnapshot(postgres::Connection & auto nested_table_id = nested_storage->getStorageID(); LOG_DEBUG(log, "Loaded table {}.{} (uuid: {})", nested_table_id.database_name, nested_table_id.table_name, toString(nested_table_id.uuid)); - return nested_storage; + return StorageInfo(nested_storage, std::move(table_attributes)); } @@ -787,9 +803,6 @@ std::set PostgreSQLReplicationHandler::fetchTablesFromPublication(pqxx:: PostgreSQLTableStructurePtr PostgreSQLReplicationHandler::fetchTableStructure( pqxx::ReplicationTransaction & tx, const std::string & table_name) const { - if (!is_materialized_postgresql_database) - return nullptr; - PostgreSQLTableStructure structure; try { @@ -815,7 +828,7 @@ void PostgreSQLReplicationHandler::addTableToReplication(StorageMaterializedPost LOG_TRACE(log, "Adding table `{}` to replication", postgres_table_name); postgres::Connection replication_connection(connection_info, /* replication */true); String snapshot_name, start_lsn; - StoragePtr nested_storage; + StorageInfo nested_storage_info{ nullptr, {} }; { auto tx = std::make_shared(replication_connection.getRef()); @@ -831,8 +844,8 @@ void PostgreSQLReplicationHandler::addTableToReplication(StorageMaterializedPost throw Exception(ErrorCodes::LOGICAL_ERROR, "Internal table was not created"); postgres::Connection tmp_connection(connection_info); - nested_storage = loadFromSnapshot(tmp_connection, snapshot_name, postgres_table_name, materialized_storage); - materialized_storage->set(nested_storage); + nested_storage_info = loadFromSnapshot(tmp_connection, snapshot_name, postgres_table_name, materialized_storage); + materialized_storage->set(nested_storage_info.storage); } { @@ -841,7 +854,7 @@ void PostgreSQLReplicationHandler::addTableToReplication(StorageMaterializedPost } /// Pass storage to consumer and lsn position, from which to start receiving replication messages for this table. - consumer->addNested(postgres_table_name, nested_storage, start_lsn); + consumer->addNested(postgres_table_name, nested_storage_info, start_lsn); LOG_TRACE(log, "Table `{}` successfully added to replication", postgres_table_name); } catch (...) @@ -914,8 +927,8 @@ void PostgreSQLReplicationHandler::reloadFromSnapshot(const std::vectorcreateTemporary(); /// This snapshot is valid up to the end of the transaction, which exported it. - StoragePtr temp_nested_storage = loadFromSnapshot(tmp_connection, snapshot_name, table_name, - temp_materialized_storage->as ()); + auto [temp_nested_storage, table_attributes] = loadFromSnapshot( + tmp_connection, snapshot_name, table_name, temp_materialized_storage->as ()); auto table_id = materialized_storage->getNestedStorageID(); auto temp_table_id = temp_nested_storage->getStorageID(); @@ -949,7 +962,7 @@ void PostgreSQLReplicationHandler::reloadFromSnapshot(const std::vectorgetStorageID().getNameForLogs(), nested_sample_block.dumpStructure()); /// Pass pointer to new nested table into replication consumer, remove current table from skip list and set start lsn position. 
- consumer->updateNested(table_name, nested_storage, relation_id, start_lsn); + consumer->updateNested(table_name, StorageInfo(nested_storage, std::move(table_attributes)), relation_id, start_lsn); auto table_to_drop = DatabaseCatalog::instance().getTable(StorageID(temp_table_id.database_name, temp_table_id.table_name, table_id.uuid), nested_context); auto drop_table_id = table_to_drop->getStorageID(); diff --git a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h index c0a2a6f2559..263095ec9c2 100644 --- a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h +++ b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h @@ -29,7 +29,7 @@ public: bool is_materialized_postgresql_database_); /// Activate task to be run from a separate thread: wait until connection is available and call startReplication(). - void startup(); + void startup(bool delayed); /// Stop replication without cleanup. void shutdown(); @@ -87,7 +87,7 @@ private: void consumerFunc(); - StoragePtr loadFromSnapshot(postgres::Connection & connection, std::string & snapshot_name, const String & table_name, StorageMaterializedPostgreSQL * materialized_storage); + StorageInfo loadFromSnapshot(postgres::Connection & connection, std::string & snapshot_name, const String & table_name, StorageMaterializedPostgreSQL * materialized_storage); void reloadFromSnapshot(const std::vector> & relation_data); diff --git a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp index aefd1aedbf7..fe81b322bdb 100644 --- a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp +++ b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp @@ -87,14 +87,8 @@ StorageMaterializedPostgreSQL::StorageMaterializedPostgreSQL( *replication_settings, /* is_materialized_postgresql_database */false); - if (!is_attach) - { - replication_handler->addStorage(remote_table_name, this); - /// Start synchronization preliminary setup immediately and throw in case of failure. - /// It should be guaranteed that if MaterializedPostgreSQL table was created successfully, then - /// its nested table was also created. - replication_handler->startSynchronization(/* throw_on_error */ true); - } + replication_handler->addStorage(remote_table_name, this); + replication_handler->startup(/* delayed */is_attach); } @@ -234,19 +228,6 @@ void StorageMaterializedPostgreSQL::set(StoragePtr nested_storage) } -void StorageMaterializedPostgreSQL::startup() -{ - /// replication_handler != nullptr only in case of single table engine MaterializedPostgreSQL. - if (replication_handler && is_attach) - { - replication_handler->addStorage(remote_table_name, this); - /// In case of attach table use background startup in a separate thread. First wait until connection is reachable, - /// then check for nested table -- it should already be created. 
- replication_handler->startup(); - } -} - - void StorageMaterializedPostgreSQL::shutdown() { if (replication_handler) @@ -365,7 +346,7 @@ ASTPtr StorageMaterializedPostgreSQL::getColumnDeclaration(const DataTypePtr & d ast_expression->name = "DateTime64"; ast_expression->arguments = std::make_shared(); ast_expression->arguments->children.emplace_back(std::make_shared(UInt32(6))); - return ast_expression; + return std::move(ast_expression); } return std::make_shared(data_type->getName()); @@ -423,7 +404,7 @@ ASTPtr StorageMaterializedPostgreSQL::getCreateNestedTableQuery( table_id.database_name, table_id.table_name); } - if (!table_structure->columns && (!table_override || !table_override->columns)) + if (!table_structure->physical_columns && (!table_override || !table_override->columns)) { throw Exception(ErrorCodes::LOGICAL_ERROR, "No columns returned for table {}.{}", table_id.database_name, table_id.table_name); @@ -465,7 +446,7 @@ ASTPtr StorageMaterializedPostgreSQL::getCreateNestedTableQuery( } else { - ordinary_columns_and_types = *table_structure->columns; + ordinary_columns_and_types = table_structure->physical_columns->columns; columns_declare_list->set(columns_declare_list->columns, getColumnsExpressionList(ordinary_columns_and_types)); } @@ -475,7 +456,7 @@ ASTPtr StorageMaterializedPostgreSQL::getCreateNestedTableQuery( } else { - ordinary_columns_and_types = *table_structure->columns; + ordinary_columns_and_types = table_structure->physical_columns->columns; columns_declare_list->set(columns_declare_list->columns, getColumnsExpressionList(ordinary_columns_and_types)); } @@ -485,9 +466,9 @@ ASTPtr StorageMaterializedPostgreSQL::getCreateNestedTableQuery( NamesAndTypesList merging_columns; if (table_structure->primary_key_columns) - merging_columns = *table_structure->primary_key_columns; + merging_columns = table_structure->primary_key_columns->columns; else - merging_columns = *table_structure->replica_identity_columns; + merging_columns = table_structure->replica_identity_columns->columns; order_by_expression->name = "tuple"; order_by_expression->arguments = std::make_shared(); @@ -524,7 +505,7 @@ ASTPtr StorageMaterializedPostgreSQL::getCreateNestedTableQuery( storage_metadata.setConstraints(constraints); setInMemoryMetadata(storage_metadata); - return create_table_query; + return std::move(create_table_query); } diff --git a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h index 9e11f314738..ff9b95cad7c 100644 --- a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h +++ b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h @@ -74,8 +74,6 @@ public: String getName() const override { return "MaterializedPostgreSQL"; } - void startup() override; - void shutdown() override; /// Used only for single MaterializedPostgreSQL storage. 
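The handler and storage changes above fold StorageMaterializedPostgreSQL::startup() into the constructor: the storage now always calls replication_handler->startup(/* delayed */ is_attach), so a fresh CREATE runs the synchronization setup immediately and throws on failure, while ATTACH defers it to the background startup task. A minimal standalone sketch of that two-mode startup (class, task, and flag names are hypothetical):

#include <functional>
#include <iostream>
#include <thread>
#include <utility>

/// Hypothetical stand-in for the replication handler: startup(delayed) either runs the setup
/// synchronously (letting exceptions reach the CREATE query) or defers it to a background thread,
/// since an ATTACH must not block or fail server startup.
class ReplicationHandlerSketch
{
public:
    explicit ReplicationHandlerSketch(std::function<void(bool throw_on_error)> setup_) : setup(std::move(setup_)) {}

    void startup(bool delayed)
    {
        if (delayed)
            background = std::thread([this] { setup(/* throw_on_error */ false); });
        else
            setup(/* throw_on_error */ true);   /// CREATE path: fail loudly right away
    }

    ~ReplicationHandlerSketch()
    {
        if (background.joinable())
            background.join();
    }

private:
    std::function<void(bool)> setup;
    std::thread background;
};

int main()
{
    ReplicationHandlerSketch handler([](bool throw_on_error)
    {
        std::cout << "starting synchronization, throw_on_error=" << throw_on_error << '\n';
    });
    handler.startup(/* delayed */ true);   /// e.g. is_attach == true
}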
diff --git a/src/Storages/SelectQueryInfo.h b/src/Storages/SelectQueryInfo.h index e53f5adec52..5df50ab9a7c 100644 --- a/src/Storages/SelectQueryInfo.h +++ b/src/Storages/SelectQueryInfo.h @@ -169,6 +169,7 @@ struct SelectQueryInfo bool ignore_projections = false; bool is_projection_query = false; bool merge_tree_empty_result = false; + bool settings_limit_offset_done = false; Block minmax_count_projection_block; MergeTreeDataSelectAnalysisResultPtr merge_tree_select_result_ptr; }; diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index 87a8ea2315d..0cc401aa93c 100644 --- a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -126,7 +126,13 @@ StorageBuffer::StorageBuffer( , bg_pool(getContext()->getBufferFlushSchedulePool()) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + if (columns_.empty()) + { + auto dest_table = DatabaseCatalog::instance().getTable(destination_id, context_); + storage_metadata.setColumns(dest_table->getInMemoryMetadataPtr()->getColumns()); + } + else + storage_metadata.setColumns(columns_); storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); @@ -1167,6 +1173,7 @@ void registerStorageBuffer(StorageFactory & factory) }, { .supports_parallel_insert = true, + .supports_schema_inference = true, }); } diff --git a/src/Storages/StorageDictionary.cpp b/src/Storages/StorageDictionary.cpp index f6b330fe3df..da8c5f115b2 100644 --- a/src/Storages/StorageDictionary.cpp +++ b/src/Storages/StorageDictionary.cpp @@ -11,6 +11,7 @@ #include #include #include +#include namespace DB @@ -21,6 +22,7 @@ namespace ErrorCodes extern const int THERE_IS_NO_COLUMN; extern const int CANNOT_DETACH_DICTIONARY_AS_TABLE; extern const int DICTIONARY_ALREADY_EXISTS; + extern const int NOT_IMPLEMENTED; } namespace @@ -111,10 +113,11 @@ StorageDictionary::StorageDictionary( const StorageID & table_id_, const String & dictionary_name_, const DictionaryStructure & dictionary_structure_, + const String & comment, Location location_, ContextPtr context_) : StorageDictionary( - table_id_, dictionary_name_, ColumnsDescription{getNamesAndTypes(dictionary_structure_)}, String{}, location_, context_) + table_id_, dictionary_name_, ColumnsDescription{getNamesAndTypes(dictionary_structure_)}, comment, location_, context_) { } @@ -126,6 +129,7 @@ StorageDictionary::StorageDictionary( table_id, table_id.getFullNameNotQuoted(), context_->getExternalDictionariesLoader().getDictionaryStructure(*dictionary_configuration), + dictionary_configuration->getString("dictionary.comment", ""), Location::SameDatabaseAndNameAsDictionary, context_) { @@ -230,7 +234,7 @@ void StorageDictionary::renameInMemory(const StorageID & new_table_id) if (move_to_atomic) configuration->setString("dictionary.uuid", toString(new_table_id.uuid)); else if (move_to_ordinary) - configuration->remove("dictionary.uuid"); + configuration->remove("dictionary.uuid"); } /// Dictionary is moving between databases of different engines or is renaming inside Ordinary database @@ -260,6 +264,40 @@ void StorageDictionary::renameInMemory(const StorageID & new_table_id) } } +void StorageDictionary::checkAlterIsPossible(const AlterCommands & commands, ContextPtr /* context */) const +{ + for (const auto & command : commands) + { + if (location == Location::DictionaryDatabase || command.type != AlterCommand::COMMENT_TABLE) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Alter of type '{}' is not supported by 
storage {}", + command.type, getName()); + } +} + +void StorageDictionary::alter(const AlterCommands & params, ContextPtr alter_context, AlterLockHolder & lock_holder) +{ + IStorage::alter(params, alter_context, lock_holder); + + if (location == Location::Custom) + return; + + auto new_comment = getInMemoryMetadataPtr()->comment; + + auto storage_id = getStorageID(); + const auto & external_dictionaries_loader = getContext()->getExternalDictionariesLoader(); + auto result = external_dictionaries_loader.getLoadResult(storage_id.getInternalDictionaryName()); + + if (result.object) + { + auto dictionary = std::static_pointer_cast(result.object); + auto * dictionary_non_const = const_cast(dictionary.get()); + dictionary_non_const->setDictionaryComment(new_comment); + } + + std::lock_guard lock(dictionary_config_mutex); + configuration->setString("dictionary.comment", std::move(new_comment)); +} + void registerStorageDictionary(StorageFactory & factory) { factory.registerStorage("Dictionary", [](const StorageFactory::Arguments & args) diff --git a/src/Storages/StorageDictionary.h b/src/Storages/StorageDictionary.h index 7d0af8c0ee3..855d02b0947 100644 --- a/src/Storages/StorageDictionary.h +++ b/src/Storages/StorageDictionary.h @@ -42,6 +42,10 @@ public: void renameInMemory(const StorageID & new_table_id) override; + void checkAlterIsPossible(const AlterCommands & commands, ContextPtr /* context */) const override; + + void alter(const AlterCommands & params, ContextPtr alter_context, AlterLockHolder &) override; + Poco::Timestamp getUpdateTime() const; LoadablesConfigurationPtr getConfiguration() const; @@ -89,6 +93,7 @@ private: const StorageID & table_id_, const String & dictionary_name_, const DictionaryStructure & dictionary_structure, + const String & comment, Location location_, ContextPtr context_); diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index ddf363e3957..19869b77106 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -24,7 +25,6 @@ #include #include -#include #include #include #include @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include @@ -63,7 +62,6 @@ #include #include -#include #include #include @@ -71,8 +69,6 @@ #include #include -#include - #include #include #include @@ -329,7 +325,16 @@ StorageDistributed::StorageDistributed( , rng(randomSeed()) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + if (columns_.empty()) + { + StorageID id = StorageID::createEmpty(); + id.table_name = remote_table; + id.database_name = remote_database; + storage_metadata.setColumns(getStructureOfRemoteTable(*getCluster(), id, getContext(), remote_table_function_ptr)); + } + else + storage_metadata.setColumns(columns_); + storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); @@ -1398,6 +1403,7 @@ void registerStorageDistributed(StorageFactory & factory) { .supports_settings = true, .supports_parallel_insert = true, + .supports_schema_inference = true, .source_access_type = AccessType::REMOTE, }); } diff --git a/src/Storages/StorageDistributed.h b/src/Storages/StorageDistributed.h index 06fa8905639..e47e0fddd6c 100644 --- a/src/Storages/StorageDistributed.h +++ b/src/Storages/StorageDistributed.h @@ -53,6 +53,10 @@ public: bool supportsSubcolumns() const override { return true; } StoragePolicyPtr getStoragePolicy() 
const override; + /// Do not apply moving to PREWHERE optimization for distributed tables, + /// because we can't be sure that underlying table supports PREWHERE. + bool canMoveConditionsToPrewhere() const override { return false; } + bool isRemote() const override { return true; } QueryProcessingStage::Enum diff --git a/src/Storages/StorageExecutable.cpp b/src/Storages/StorageExecutable.cpp index 51ecfc1e884..21143438725 100644 --- a/src/Storages/StorageExecutable.cpp +++ b/src/Storages/StorageExecutable.cpp @@ -2,6 +2,8 @@ #include +#include + #include #include @@ -16,13 +18,12 @@ #include #include #include +#include #include #include #include #include -#include - namespace DB { @@ -30,80 +31,78 @@ namespace DB namespace ErrorCodes { extern const int UNSUPPORTED_METHOD; - extern const int LOGICAL_ERROR; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int TIMEOUT_EXCEEDED; } -StorageExecutable::StorageExecutable( - const StorageID & table_id_, - const String & script_name_, - const std::vector & arguments_, - const String & format_, - const std::vector & input_queries_, - const ColumnsDescription & columns, - const ConstraintsDescription & constraints) - : IStorage(table_id_) - , script_name(script_name_) - , arguments(arguments_) - , format(format_) - , input_queries(input_queries_) - , log(&Poco::Logger::get("StorageExecutable")) +namespace { - StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns); - storage_metadata.setConstraints(constraints); - setInMemoryMetadata(storage_metadata); + void transformToSingleBlockSources(Pipes & inputs) + { + size_t inputs_size = inputs.size(); + for (size_t i = 0; i < inputs_size; ++i) + { + auto && input = inputs[i]; + QueryPipeline input_pipeline(std::move(input)); + PullingPipelineExecutor input_pipeline_executor(input_pipeline); + + auto header = input_pipeline_executor.getHeader(); + auto result_block = header.cloneEmpty(); + + size_t result_block_columns = result_block.columns(); + + Block result; + while (input_pipeline_executor.pull(result)) + { + for (size_t result_block_index = 0; result_block_index < result_block_columns; ++result_block_index) + { + auto & block_column = result.safeGetByPosition(result_block_index); + auto & result_block_column = result_block.safeGetByPosition(result_block_index); + + result_block_column.column->assumeMutable()->insertRangeFrom(*block_column.column, 0, block_column.column->size()); + } + } + + auto source = std::make_shared(std::move(result_block)); + inputs[i] = Pipe(std::move(source)); + } + } } StorageExecutable::StorageExecutable( const StorageID & table_id_, - const String & script_name_, - const std::vector & arguments_, - const String & format_, - const std::vector & input_queries_, + const String & format, const ExecutableSettings & settings_, + const std::vector & input_queries_, const ColumnsDescription & columns, const ConstraintsDescription & constraints) : IStorage(table_id_) - , script_name(script_name_) - , arguments(arguments_) - , format(format_) - , input_queries(input_queries_) , settings(settings_) - /// If pool size == 0 then there is no size restrictions. Poco max size of semaphore is integer type. - , process_pool(std::make_shared(settings.pool_size == 0 ? std::numeric_limits::max() : settings.pool_size)) - , log(&Poco::Logger::get("StorageExecutablePool")) + , input_queries(input_queries_) + , log(settings.is_executable_pool ? 
&Poco::Logger::get("StorageExecutablePool") : &Poco::Logger::get("StorageExecutable")) { StorageInMemoryMetadata storage_metadata; storage_metadata.setColumns(columns); storage_metadata.setConstraints(constraints); setInMemoryMetadata(storage_metadata); + + ShellCommandSourceCoordinator::Configuration configuration + { + .format = format, + .command_termination_timeout_seconds = settings.command_termination_timeout, + .command_read_timeout_milliseconds = settings.command_read_timeout, + .command_write_timeout_milliseconds = settings.command_write_timeout, + + .pool_size = settings.pool_size, + .max_command_execution_time_seconds = settings.max_command_execution_time, + + .is_executable_pool = settings.is_executable_pool, + .send_chunk_header = settings.send_chunk_header, + .execute_direct = true + }; + + coordinator = std::make_unique(std::move(configuration)); } -class SendingChunkHeaderTransform final : public ISimpleTransform -{ -public: - SendingChunkHeaderTransform(const Block & header, WriteBuffer & buffer_) - : ISimpleTransform(header, header, false) - , buffer(buffer_) - { - } - - String getName() const override { return "SendingChunkHeaderTransform"; } - -protected: - - void transform(Chunk & chunk) override - { - writeText(chunk.getNumRows(), buffer); - writeChar('\n', buffer); - } - -private: - WriteBuffer & buffer; -}; - Pipe StorageExecutable::read( const Names & /*column_names*/, const StorageMetadataPtr & metadata_snapshot, @@ -113,10 +112,12 @@ Pipe StorageExecutable::read( size_t max_block_size, unsigned /*threads*/) { + auto & script_name = settings.script_name; + auto user_scripts_path = context->getUserScriptsPath(); auto script_path = user_scripts_path + '/' + script_name; - if (!pathStartsWith(script_path, user_scripts_path)) + if (!fileOrSymlinkPathStartsWith(script_path, user_scripts_path)) throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Executable file {} must be inside user scripts folder {}", script_name, @@ -128,101 +129,31 @@ Pipe StorageExecutable::read( script_name, user_scripts_path); - std::vector inputs; + Pipes inputs; inputs.reserve(input_queries.size()); for (auto & input_query : input_queries) { InterpreterSelectWithUnionQuery interpreter(input_query, context, {}); - inputs.emplace_back(interpreter.buildQueryPipeline()); + inputs.emplace_back(QueryPipelineBuilder::getPipe(interpreter.buildQueryPipeline())); } - ShellCommand::Config config(script_path); - config.arguments = arguments; - for (size_t i = 1; i < inputs.size(); ++i) - config.write_fds.emplace_back(i + 2); - - std::unique_ptr process; - - bool is_executable_pool = (process_pool != nullptr); - if (is_executable_pool) - { - bool result = process_pool->tryBorrowObject(process, [&config, this]() - { - config.terminate_in_destructor_strategy = ShellCommand::DestructorStrategy{ true /*terminate_in_destructor*/, settings.command_termination_timeout }; - auto shell_command = ShellCommand::executeDirect(config); - return shell_command; - }, settings.max_command_execution_time * 10000); - - if (!result) - throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, - "Could not get process from pool, max command execution timeout exceeded {} seconds", - settings.max_command_execution_time); - } - else - { - process = ShellCommand::executeDirect(config); - } - - std::vector tasks; - tasks.reserve(inputs.size()); - - for (size_t i = 0; i < inputs.size(); ++i) - { - WriteBufferFromFile * write_buffer = nullptr; - - if (i == 0) - { - write_buffer = &process->in; - } - else - { - auto descriptor = i + 2; - auto it = 
process->write_fds.find(descriptor); - if (it == process->write_fds.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Process does not contain descriptor to write {}", descriptor); - - write_buffer = &it->second; - } - - inputs[i].resize(1); - if (settings.send_chunk_header) - { - auto transform = std::make_shared(inputs[i].getHeader(), *write_buffer); - inputs[i].addTransform(std::move(transform)); - } - - auto pipeline = std::make_shared(QueryPipelineBuilder::getPipeline(std::move(inputs[i]))); - - auto out = context->getOutputFormat(format, *write_buffer, materializeBlock(pipeline->getHeader())); - out->setAutoFlush(); - pipeline->complete(std::move(out)); - - ShellCommandSource::SendDataTask task = [pipeline, write_buffer, is_executable_pool]() - { - CompletedPipelineExecutor executor(*pipeline); - executor.execute(); - - if (!is_executable_pool) - write_buffer->close(); - }; - - tasks.emplace_back(std::move(task)); - } + /// For executable pool we read data from input streams and convert it to single blocks streams. + if (settings.is_executable_pool) + transformToSingleBlockSources(inputs); auto sample_block = metadata_snapshot->getSampleBlock(); ShellCommandSourceConfiguration configuration; configuration.max_block_size = max_block_size; - if (is_executable_pool) + if (settings.is_executable_pool) { configuration.read_fixed_number_of_rows = true; configuration.read_number_of_rows_from_process_output = true; } - Pipe pipe(std::make_unique(context, format, std::move(sample_block), std::move(process), std::move(tasks), configuration, process_pool)); - return pipe; + return coordinator->createPipe(script_path, settings.script_arguments, std::move(inputs), std::move(sample_block), context, configuration); } void registerStorageExecutable(StorageFactory & factory) @@ -262,6 +193,11 @@ void registerStorageExecutable(StorageFactory & factory) const auto & columns = args.columns; const auto & constraints = args.constraints; + ExecutableSettings settings; + settings.script_name = script_name; + settings.script_arguments = script_name_with_arguments; + settings.is_executable_pool = is_executable_pool; + if (is_executable_pool) { size_t max_command_execution_time = 10; @@ -270,28 +206,28 @@ void registerStorageExecutable(StorageFactory & factory) if (max_execution_time_seconds != 0 && max_command_execution_time > max_execution_time_seconds) max_command_execution_time = max_execution_time_seconds; - ExecutableSettings pool_settings; - pool_settings.max_command_execution_time = max_command_execution_time; - if (args.storage_def->settings) - pool_settings.loadFromQuery(*args.storage_def); + settings.max_command_execution_time = max_command_execution_time; + } - return StorageExecutable::create(args.table_id, script_name, script_name_with_arguments, format, input_queries, pool_settings, columns, constraints); - } - else - { - return StorageExecutable::create(args.table_id, script_name, script_name_with_arguments, format, input_queries, columns, constraints); - } + if (args.storage_def->settings) + settings.loadFromQuery(*args.storage_def); + + auto global_context = args.getContext()->getGlobalContext(); + return StorageExecutable::create(args.table_id, format, settings, input_queries, columns, constraints); }; + StorageFactory::StorageFeatures storage_features; + storage_features.supports_settings = true; + factory.registerStorage("Executable", [&](const StorageFactory::Arguments & args) { return register_storage(args, false /*is_executable_pool*/); - }); + }, storage_features); 
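Editor's note: the switch from pathStartsWith to fileOrSymlinkPathStartsWith in read() closes an obvious escape hatch: a symlink placed inside the user scripts directory but pointing elsewhere no longer passes the check. A rough standalone illustration of the idea using only std::filesystem; the function names and error handling below are mine, not ClickHouse's helpers.

// Illustrative prefix check that resolves symlinks before comparing paths.
// weakly_canonical resolves symlinks in the existing part of the path and
// lexically normalizes the rest; for brevity the sketch assumes allowed_root
// is passed without a trailing slash.
#include <algorithm>
#include <filesystem>
#include <stdexcept>
#include <string>

namespace fs = std::filesystem;

bool resolvedPathStartsWith(const fs::path & candidate, const fs::path & allowed_root)
{
    const fs::path resolved = fs::weakly_canonical(candidate);
    const fs::path root = fs::weakly_canonical(allowed_root);

    auto it = std::mismatch(root.begin(), root.end(), resolved.begin(), resolved.end()).first;
    return it == root.end();    // every component of root is a prefix of resolved
}

void checkScriptPath(const std::string & script_name, const std::string & user_scripts_path)
{
    if (!resolvedPathStartsWith(fs::path(user_scripts_path) / script_name, user_scripts_path))
        throw std::runtime_error("Executable file " + script_name + " must be inside " + user_scripts_path);
}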
factory.registerStorage("ExecutablePool", [&](const StorageFactory::Arguments & args) { return register_storage(args, true /*is_executable_pool*/); - }); + }, storage_features); } }; diff --git a/src/Storages/StorageExecutable.h b/src/Storages/StorageExecutable.h index 74df17f1463..b6248abae97 100644 --- a/src/Storages/StorageExecutable.h +++ b/src/Storages/StorageExecutable.h @@ -23,7 +23,7 @@ public: String getName() const override { - if (process_pool) + if (settings.is_executable_pool) return "ExecutablePool"; else return "Executable"; @@ -42,31 +42,17 @@ protected: StorageExecutable( const StorageID & table_id, - const String & script_name_, - const std::vector & arguments_, - const String & format_, - const std::vector & input_queries_, - const ColumnsDescription & columns, - const ConstraintsDescription & constraints); - - StorageExecutable( - const StorageID & table_id, - const String & script_name_, - const std::vector & arguments_, - const String & format_, - const std::vector & input_queries_, - const ExecutableSettings & settings_, + const String & format, + const ExecutableSettings & settings, + const std::vector & input_queries, const ColumnsDescription & columns, const ConstraintsDescription & constraints); private: - String script_name; - std::vector arguments; - String format; - std::vector input_queries; ExecutableSettings settings; - std::shared_ptr process_pool; + std::vector input_queries; Poco::Logger * log; + std::unique_ptr coordinator; }; } diff --git a/src/Storages/StorageExternalDistributed.cpp b/src/Storages/StorageExternalDistributed.cpp index 927c070826b..40a2ad0b85e 100644 --- a/src/Storages/StorageExternalDistributed.cpp +++ b/src/Storages/StorageExternalDistributed.cpp @@ -272,7 +272,7 @@ void registerStorageExternalDistributed(StorageFactory & factory) ExternalDataSourceConfiguration configuration; if (auto named_collection = getExternalDataSourceConfiguration(inner_engine_args, args.getLocalContext())) { - auto [common_configuration, storage_specific_args] = named_collection.value(); + auto [common_configuration, storage_specific_args, _] = named_collection.value(); configuration.set(common_configuration); for (const auto & [name, value] : storage_specific_args) diff --git a/src/Storages/StorageFactory.h b/src/Storages/StorageFactory.h index 20db1a44897..6ffa6327176 100644 --- a/src/Storages/StorageFactory.h +++ b/src/Storages/StorageFactory.h @@ -66,6 +66,7 @@ public: bool supports_deduplication = false; /// See also IStorage::supportsParallelInsert() bool supports_parallel_insert = false; + bool supports_schema_inference = false; AccessType source_access_type = AccessType::NONE; }; @@ -98,6 +99,7 @@ public: .supports_replication = false, .supports_deduplication = false, .supports_parallel_insert = false, + .supports_schema_inference = false, .source_access_type = AccessType::NONE, }); @@ -126,6 +128,12 @@ public: AccessType getSourceAccessType(const String & table_engine) const; + bool checkIfStorageSupportsSchemaInterface(const String & storage_name) + { + if (storages.contains(storage_name)) + return storages[storage_name].features.supports_schema_inference; + return false; + } private: Storages storages; }; diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 13a70af2ada..a479f982c70 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -15,8 +15,9 @@ #include #include -#include #include +#include +#include #include #include @@ -38,6 +39,7 @@ #include #include #include +#include #include #include 
#include @@ -63,6 +65,7 @@ namespace ErrorCodes extern const int INCOMPATIBLE_COLUMNS; extern const int CANNOT_STAT; extern const int LOGICAL_ERROR; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } namespace @@ -135,6 +138,56 @@ void checkCreationIsAllowed(ContextPtr context_global, const std::string & db_di throw Exception("File must not be a directory", ErrorCodes::INCORRECT_FILE_NAME); } +std::unique_ptr createReadBuffer( + const String & current_path, + bool use_table_fd, + const String & storage_name, + int table_fd, + const String & compression_method, + ContextPtr context) +{ + std::unique_ptr nested_buffer; + CompressionMethod method; + + struct stat file_stat{}; + + if (use_table_fd) + { + /// Check if file descriptor allows random reads (and reading it twice). + if (0 != fstat(table_fd, &file_stat)) + throwFromErrno("Cannot stat table file descriptor, inside " + storage_name, ErrorCodes::CANNOT_STAT); + + if (S_ISREG(file_stat.st_mode)) + nested_buffer = std::make_unique(table_fd); + else + nested_buffer = std::make_unique(table_fd); + + method = chooseCompressionMethod("", compression_method); + } + else + { + /// Check if file descriptor allows random reads (and reading it twice). + if (0 != stat(current_path.c_str(), &file_stat)) + throwFromErrno("Cannot stat file " + current_path, ErrorCodes::CANNOT_STAT); + + if (S_ISREG(file_stat.st_mode)) + nested_buffer = std::make_unique(current_path, context->getSettingsRef().max_read_buffer_size); + else + nested_buffer = std::make_unique(current_path, context->getSettingsRef().max_read_buffer_size); + + method = chooseCompressionMethod(current_path, compression_method); + } + + /// For clickhouse-local add progress callback to display progress bar. + if (context->getApplicationType() == Context::ApplicationType::LOCAL) + { + auto & in = static_cast(*nested_buffer); + in.setProgressCallback(context); + } + + return wrapReadBufferWithCompressionMethod(std::move(nested_buffer), method); +} + } Strings StorageFile::getPathsList(const String & table_path, const String & user_files_path, ContextPtr context, size_t & total_bytes_to_read) @@ -164,6 +217,42 @@ Strings StorageFile::getPathsList(const String & table_path, const String & user return paths; } + +ColumnsDescription StorageFile::getTableStructureFromData( + const String & format, + const std::vector & paths, + const String & compression_method, + const std::optional & format_settings, + ContextPtr context) +{ + if (format == "Distributed") + { + if (paths.empty()) + throw Exception( + "Cannot get table structure from file, because no files match specified name", ErrorCodes::INCORRECT_FILE_NAME); + + auto source = StorageDistributedDirectoryMonitor::createSourceFromFile(paths[0]); + return ColumnsDescription(source->getOutputs().front().getHeader().getNamesAndTypesList()); + } + + auto read_buffer_creator = [&]() + { + String path; + auto it = std::find_if(paths.begin(), paths.end(), [](const String & p){ return std::filesystem::exists(p); }); + if (it == paths.end()) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, because there are no files with provided path. 
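Editor's note: createReadBuffer decides between a seekable file buffer and a plain descriptor buffer via stat()/fstat() and S_ISREG, because schema inference may need to read the beginning of the data more than once. The decision step on its own, with the mapping onto the concrete ReadBuffer classes left out:

// Returns true when the source is a regular file, i.e. it can be opened with a
// seekable read buffer and safely read more than once (needed for schema inference).
#include <stdexcept>
#include <string>
#include <sys/stat.h>

bool isSeekableSource(const std::string & path, bool use_fd, int fd)
{
    struct stat file_stat{};

    if (use_fd)
    {
        if (0 != fstat(fd, &file_stat))
            throw std::runtime_error("Cannot stat table file descriptor");
    }
    else
    {
        if (0 != stat(path.c_str(), &file_stat))
            throw std::runtime_error("Cannot stat file " + path);
    }

    return S_ISREG(file_stat.st_mode);   // regular file => seekable; pipe/socket => single-pass stream
}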
You must specify " + "table structure manually", + format); + + path = *it; + return createReadBuffer(path, false, "File", -1, compression_method, context); + }; + + return readSchemaFromFormat(format, format_settings, read_buffer_creator, context); +} + bool StorageFile::isColumnOriented() const { return format_name != "Distributed" && FormatFactory::instance().checkIfFormatIsColumnOriented(format_name); @@ -182,10 +271,13 @@ StorageFile::StorageFile(int table_fd_, CommonArguments args) throw Exception("Using file descriptor as source of storage isn't allowed for server daemons", ErrorCodes::DATABASE_ACCESS_DENIED); if (args.format_name == "Distributed") throw Exception("Distributed format is allowed only with explicit file path", ErrorCodes::INCORRECT_FILE_NAME); + if (args.columns.empty()) + throw Exception("Automatic schema inference is not allowed when using file descriptor as source of storage", ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE); is_db_table = false; use_table_fd = true; table_fd = table_fd_; + setStorageMetadata(args); } StorageFile::StorageFile(const std::string & table_path_, const std::string & user_files_path, CommonArguments args) @@ -194,22 +286,7 @@ StorageFile::StorageFile(const std::string & table_path_, const std::string & us is_db_table = false; paths = getPathsList(table_path_, user_files_path, args.getContext(), total_bytes_to_read); path_for_partitioned_write = table_path_; - - if (args.format_name == "Distributed") - { - if (paths.empty()) - throw Exception("Cannot get table structure from file, because no files match specified name", ErrorCodes::INCORRECT_FILE_NAME); - - auto & first_path = paths[0]; - Block header = StorageDistributedDirectoryMonitor::createSourceFromFile(first_path)->getOutputs().front().getHeader(); - - StorageInMemoryMetadata storage_metadata; - auto columns = ColumnsDescription(header.getNamesAndTypesList()); - if (!args.columns.empty() && columns != args.columns) - throw Exception("Table structure and file structure are different", ErrorCodes::INCOMPATIBLE_COLUMNS); - storage_metadata.setColumns(columns); - setInMemoryMetadata(storage_metadata); - } + setStorageMetadata(args); } StorageFile::StorageFile(const std::string & relative_table_dir_path, CommonArguments args) @@ -225,6 +302,8 @@ StorageFile::StorageFile(const std::string & relative_table_dir_path, CommonArgu paths = {getTablePath(table_dir_path, format_name)}; if (fs::exists(paths[0])) total_bytes_to_read = fs::file_size(paths[0]); + + setStorageMetadata(args); } StorageFile::StorageFile(CommonArguments args) @@ -233,9 +312,21 @@ StorageFile::StorageFile(CommonArguments args) , format_settings(args.format_settings) , compression_method(args.compression_method) , base_path(args.getContext()->getPath()) +{ +} + +void StorageFile::setStorageMetadata(CommonArguments args) { StorageInMemoryMetadata storage_metadata; - if (args.format_name != "Distributed") + + if (args.format_name == "Distributed" || args.columns.empty()) + { + auto columns = getTableStructureFromData(format_name, paths, compression_method, format_settings, args.getContext()); + if (!args.columns.empty() && args.columns != columns) + throw Exception("Table structure and file structure are different", ErrorCodes::INCOMPATIBLE_COLUMNS); + storage_metadata.setColumns(columns); + } + else storage_metadata.setColumns(args.columns); storage_metadata.setConstraints(args.constraints); @@ -350,46 +441,7 @@ public: } } - std::unique_ptr nested_buffer; - CompressionMethod method; - - struct stat file_stat{}; - - if 
(storage->use_table_fd) - { - /// Check if file descriptor allows random reads (and reading it twice). - if (0 != fstat(storage->table_fd, &file_stat)) - throwFromErrno("Cannot stat table file descriptor, inside " + storage->getName(), ErrorCodes::CANNOT_STAT); - - if (S_ISREG(file_stat.st_mode)) - nested_buffer = std::make_unique(storage->table_fd); - else - nested_buffer = std::make_unique(storage->table_fd); - - method = chooseCompressionMethod("", storage->compression_method); - } - else - { - /// Check if file descriptor allows random reads (and reading it twice). - if (0 != stat(current_path.c_str(), &file_stat)) - throwFromErrno("Cannot stat file " + current_path, ErrorCodes::CANNOT_STAT); - - if (S_ISREG(file_stat.st_mode)) - nested_buffer = std::make_unique(current_path, context->getSettingsRef().max_read_buffer_size); - else - nested_buffer = std::make_unique(current_path, context->getSettingsRef().max_read_buffer_size); - - method = chooseCompressionMethod(current_path, storage->compression_method); - } - - /// For clickhouse-local add progress callback to display progress bar. - if (context->getApplicationType() == Context::ApplicationType::LOCAL) - { - auto & in = static_cast(*nested_buffer); - in.setProgressCallback(context); - } - - read_buf = wrapReadBufferWithCompressionMethod(std::move(nested_buffer), method); + read_buf = createReadBuffer(current_path, storage->use_table_fd, storage->getName(), storage->table_fd, storage->compression_method, context); auto get_block_for_format = [&]() -> Block { @@ -853,7 +905,8 @@ void registerStorageFile(StorageFactory & factory) { StorageFactory::StorageFeatures storage_features{ .supports_settings = true, - .source_access_type = AccessType::FILE + .supports_schema_inference = true, + .source_access_type = AccessType::FILE, }; factory.registerStorage( diff --git a/src/Storages/StorageFile.h b/src/Storages/StorageFile.h index f48d1c285da..6b015976589 100644 --- a/src/Storages/StorageFile.h +++ b/src/Storages/StorageFile.h @@ -1,6 +1,7 @@ #pragma once #include + #include #include @@ -70,6 +71,13 @@ public: bool supportsPartitionBy() const override { return true; } + static ColumnsDescription getTableStructureFromData( + const String & format, + const std::vector & paths, + const String & compression_method, + const std::optional & format_settings, + ContextPtr context); + protected: friend class StorageFileSource; friend class StorageFileSink; @@ -86,6 +94,8 @@ protected: private: explicit StorageFile(CommonArguments args); + void setStorageMetadata(CommonArguments args); + std::string format_name; // We use format settings from global context + CREATE query for File table // function -- in this case, format_settings is set. 
diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index 56844192ee9..49111e02b11 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -412,6 +412,11 @@ StoragePtr StorageMaterializedView::tryGetTargetTable() const return DatabaseCatalog::instance().tryGetTable(target_table_id, getContext()); } +NamesAndTypesList StorageMaterializedView::getVirtuals() const +{ + return getTargetTable()->getVirtuals(); +} + Strings StorageMaterializedView::getDataPaths() const { if (auto table = tryGetTargetTable()) diff --git a/src/Storages/StorageMaterializedView.h b/src/Storages/StorageMaterializedView.h index c110d0b211c..395560c1ca7 100644 --- a/src/Storages/StorageMaterializedView.h +++ b/src/Storages/StorageMaterializedView.h @@ -71,6 +71,9 @@ public: StoragePtr getTargetTable() const; StoragePtr tryGetTargetTable() const; + /// Get the virtual column of the target table; + NamesAndTypesList getVirtuals() const override; + ActionLock getActionLock(StorageActionBlockType type) override; Pipe read( diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index f82f9d21217..0dc6f2931d3 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -43,6 +43,7 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int SAMPLING_NOT_SUPPORTED; extern const int ALTER_OF_COLUMN_IS_FORBIDDEN; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } StorageMerge::StorageMerge( @@ -61,7 +62,7 @@ StorageMerge::StorageMerge( , database_is_regexp(database_is_regexp_) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + storage_metadata.setColumns(columns_.empty() ? getColumnsDescriptionFromSourceTables() : columns_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); } @@ -82,11 +83,19 @@ StorageMerge::StorageMerge( , database_is_regexp(database_is_regexp_) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + storage_metadata.setColumns(columns_.empty() ? getColumnsDescriptionFromSourceTables() : columns_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); } +ColumnsDescription StorageMerge::getColumnsDescriptionFromSourceTables() const +{ + auto table = getFirstTable([](auto && t) { return t; }); + if (!table) + throw Exception{ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "There are no tables satisfied provided regexp, you must specify table structure manually"}; + return table->getInMemoryMetadataPtr()->getColumns(); +} + template StoragePtr StorageMerge::getFirstTable(F && predicate) const { @@ -107,6 +116,15 @@ StoragePtr StorageMerge::getFirstTable(F && predicate) const return {}; } +template +void StorageMerge::forEachTable(F && func) const +{ + getFirstTable([&func](const auto & table) + { + func(table); + return false; + }); +} bool StorageMerge::isRemote() const { @@ -114,6 +132,16 @@ bool StorageMerge::isRemote() const return first_remote_table != nullptr; } +bool StorageMerge::canMoveConditionsToPrewhere() const +{ + /// NOTE: This check is used during query analysis as condition for applying + /// "move to PREWHERE" optimization. However, it contains a logical race: + /// If new table that matches regexp for current storage and doesn't support PREWHERE + /// will appear after this check and before calling "read" method, the optimized query may fail. + /// Since it's quite rare case, we just ignore this possibility. 
+ + return getFirstTable([](const auto & table) { return !table->canMoveConditionsToPrewhere(); }) == nullptr; +} bool StorageMerge::mayBenefitFromIndexForIn(const ASTPtr & left_in_operand, ContextPtr query_context, const StorageMetadataPtr & /*metadata_snapshot*/) const { @@ -762,11 +790,15 @@ void StorageMerge::convertingSourceStream( IStorage::ColumnSizeByName StorageMerge::getColumnSizes() const { + ColumnSizeByName column_sizes; - auto first_materialized_mysql = getFirstTable([](const StoragePtr & table) { return table && table->getName() == "MaterializedMySQL"; }); - if (!first_materialized_mysql) - return {}; - return first_materialized_mysql->getColumnSizes(); + forEachTable([&](const auto & table) + { + for (const auto & [name, size] : table->getColumnSizes()) + column_sizes[name].add(size); + }); + + return column_sizes; } @@ -816,6 +848,9 @@ void registerStorageMerge(StorageFactory & factory) return StorageMerge::create( args.table_id, args.columns, args.comment, source_database_name_or_regexp, is_regexp, table_name_regexp, args.getContext()); + }, + { + .supports_schema_inference = true }); } diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index 56adeab9279..e0d81531325 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -22,10 +22,12 @@ public: /// The check is delayed to the read method. It checks the support of the tables used. bool supportsSampling() const override { return true; } - bool supportsPrewhere() const override { return true; } bool supportsFinal() const override { return true; } bool supportsIndexForIn() const override { return true; } bool supportsSubcolumns() const override { return true; } + bool supportsPrewhere() const override { return true; } + + bool canMoveConditionsToPrewhere() const override; QueryProcessingStage::Enum getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const override; @@ -75,6 +77,9 @@ private: template StoragePtr getFirstTable(F && predicate) const; + template + void forEachTable(F && func) const; + DatabaseTablesIteratorPtr getDatabaseIterator(const String & database_name, ContextPtr context) const; DatabaseTablesIterators getDatabaseIterators(ContextPtr context) const; @@ -132,6 +137,8 @@ protected: static SelectQueryInfo getModifiedQueryInfo( const SelectQueryInfo & query_info, ContextPtr modified_context, const StorageID & current_storage_id, bool is_merge_engine); + + ColumnsDescription getColumnsDescriptionFromSourceTables() const; }; } diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 03ac27d0e46..11815d9ceef 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -229,7 +229,7 @@ std::optional StorageMergeTree::totalRows(const Settings &) const std::optional StorageMergeTree::totalRowsByPartitionPredicate(const SelectQueryInfo & query_info, ContextPtr local_context) const { - auto parts = getDataPartsVector({DataPartState::Committed}); + auto parts = getDataPartsVector({DataPartState::Active}); return totalRowsByPartitionPredicateImpl(query_info, local_context, parts); } @@ -1294,7 +1294,7 @@ MergeTreeDataPartPtr StorageMergeTree::outdatePart(const String & part_name, boo { /// Forcefully stop merges and make part outdated auto merge_blocker = stopMergesAndWait(); - auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Committed}); + auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active}); if (!part) throw 
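Editor's note: StorageMerge::canMoveConditionsToPrewhere above allows the optimization only when no underlying table objects: getFirstTable returns the first table matching the predicate, so a nullptr result means "no violator found". The same condition over a plain container, for clarity (Table is a stand-in for IStorage):

// "Every table supports the PREWHERE move" expressed as "no violator was found".
#include <algorithm>
#include <memory>
#include <vector>

struct Table
{
    bool can_move_conditions_to_prewhere = true;
    bool canMoveConditionsToPrewhere() const { return can_move_conditions_to_prewhere; }
};

using TablePtr = std::shared_ptr<Table>;

bool allTablesSupportPrewhereMove(const std::vector<TablePtr> & tables)
{
    return std::none_of(tables.begin(), tables.end(),
                        [](const TablePtr & table) { return !table->canMoveConditionsToPrewhere(); });
}

As the comment in the hunk says, a matching table that appears between this check and read() can still break the assumption; that race is accepted as rare.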
Exception("Part " + part_name + " not found, won't try to drop it.", ErrorCodes::NO_SUCH_DATA_PART); removePartsFromWorkingSet({part}, true); @@ -1306,7 +1306,7 @@ MergeTreeDataPartPtr StorageMergeTree::outdatePart(const String & part_name, boo /// Wait merges selector std::unique_lock lock(currently_processing_in_background_mutex); - auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Committed}); + auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active}); /// It's okay, part was already removed if (!part) return nullptr; @@ -1344,7 +1344,7 @@ void StorageMergeTree::dropPartition(const ASTPtr & partition, bool detach, Cont /// This protects against "revival" of data for a removed partition after completion of merge. auto merge_blocker = stopMergesAndWait(); String partition_id = getPartitionIDFromQuery(partition, local_context); - parts_to_remove = getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, partition_id); + parts_to_remove = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); /// TODO should we throw an exception if parts_to_remove is empty? removePartsFromWorkingSet(parts_to_remove, true); @@ -1426,7 +1426,7 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con MergeTreeData & src_data = checkStructureAndGetMergeTreeData(source_table, source_metadata_snapshot, my_metadata_snapshot); String partition_id = getPartitionIDFromQuery(partition, local_context); - DataPartsVector src_parts = src_data.getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, partition_id); + DataPartsVector src_parts = src_data.getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); MutableDataPartsVector dst_parts; static const String TMP_PREFIX = "tmp_replace_from_"; @@ -1511,7 +1511,7 @@ void StorageMergeTree::movePartitionToTable(const StoragePtr & dest_table, const MergeTreeData & src_data = dest_table_storage->checkStructureAndGetMergeTreeData(*this, metadata_snapshot, dest_metadata_snapshot); String partition_id = getPartitionIDFromQuery(partition, local_context); - DataPartsVector src_parts = src_data.getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, partition_id); + DataPartsVector src_parts = src_data.getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); MutableDataPartsVector dst_parts; static const String TMP_PREFIX = "tmp_move_from_"; @@ -1591,7 +1591,7 @@ CheckResults StorageMergeTree::checkData(const ASTPtr & query, ContextPtr local_ if (const auto & check_query = query->as(); check_query.partition) { String partition_id = getPartitionIDFromQuery(check_query.partition, local_context); - data_parts = getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, partition_id); + data_parts = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); } else data_parts = getDataPartsVector(); diff --git a/src/Storages/StorageMongoDB.cpp b/src/Storages/StorageMongoDB.cpp index 2c1b44d8685..9b25b44c0e7 100644 --- a/src/Storages/StorageMongoDB.cpp +++ b/src/Storages/StorageMongoDB.cpp @@ -117,7 +117,7 @@ StorageMongoDBConfiguration StorageMongoDB::getConfiguration(ASTs engine_args, C StorageMongoDBConfiguration configuration; if (auto named_collection = getExternalDataSourceConfiguration(engine_args, context)) { - auto [common_configuration, storage_specific_args] = named_collection.value(); + auto [common_configuration, storage_specific_args, _] = named_collection.value(); configuration.set(common_configuration); 
for (const auto & [arg_name, arg_value] : storage_specific_args) diff --git a/src/Storages/StorageMySQL.cpp b/src/Storages/StorageMySQL.cpp index 66adf3ae272..83cf2b07b21 100644 --- a/src/Storages/StorageMySQL.cpp +++ b/src/Storages/StorageMySQL.cpp @@ -238,15 +238,17 @@ SinkToStoragePtr StorageMySQL::write(const ASTPtr & /*query*/, const StorageMeta } -StorageMySQLConfiguration StorageMySQL::getConfiguration(ASTs engine_args, ContextPtr context_) +StorageMySQLConfiguration StorageMySQL::getConfiguration(ASTs engine_args, ContextPtr context_, MySQLBaseSettings & storage_settings) { StorageMySQLConfiguration configuration; - if (auto named_collection = getExternalDataSourceConfiguration(engine_args, context_)) + if (auto named_collection = getExternalDataSourceConfiguration( + engine_args, context_, /* is_database_engine */false, /* throw_on_no_collection */true, storage_settings)) { - auto [common_configuration, storage_specific_args] = named_collection.value(); + auto [common_configuration, storage_specific_args, settings_changes] = named_collection.value(); configuration.set(common_configuration); configuration.addresses = {std::make_pair(configuration.host, configuration.port)}; + storage_settings.applyChanges(settings_changes); for (const auto & [arg_name, arg_value] : storage_specific_args) { @@ -298,9 +300,9 @@ void registerStorageMySQL(StorageFactory & factory) { factory.registerStorage("MySQL", [](const StorageFactory::Arguments & args) { - auto configuration = StorageMySQL::getConfiguration(args.engine_args, args.getLocalContext()); - MySQLSettings mysql_settings; /// TODO: move some arguments from the arguments to the SETTINGS. + auto configuration = StorageMySQL::getConfiguration(args.engine_args, args.getLocalContext(), mysql_settings); + if (args.storage_def->settings) mysql_settings.loadFromQuery(*args.storage_def); diff --git a/src/Storages/StorageMySQL.h b/src/Storages/StorageMySQL.h index cc3673e50ca..fe2ee8439bc 100644 --- a/src/Storages/StorageMySQL.h +++ b/src/Storages/StorageMySQL.h @@ -53,7 +53,7 @@ public: SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; - static StorageMySQLConfiguration getConfiguration(ASTs engine_args, ContextPtr context_); + static StorageMySQLConfiguration getConfiguration(ASTs engine_args, ContextPtr context_, MySQLBaseSettings & storage_settings); private: friend class StorageMySQLSink; diff --git a/src/Storages/StoragePostgreSQL.cpp b/src/Storages/StoragePostgreSQL.cpp index 8327bb92a38..5042f911149 100644 --- a/src/Storages/StoragePostgreSQL.cpp +++ b/src/Storages/StoragePostgreSQL.cpp @@ -390,7 +390,7 @@ StoragePostgreSQLConfiguration StoragePostgreSQL::getConfiguration(ASTs engine_a StoragePostgreSQLConfiguration configuration; if (auto named_collection = getExternalDataSourceConfiguration(engine_args, context)) { - auto [common_configuration, storage_specific_args] = named_collection.value(); + auto [common_configuration, storage_specific_args, _] = named_collection.value(); configuration.set(common_configuration); configuration.addresses = {std::make_pair(configuration.host, configuration.port)}; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index a1f82e14868..91a9c8567ba 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -3,7 +3,6 @@ #include "Common/hex.h" #include #include -#include #include #include #include @@ -20,7 +19,6 @@ #include #include 
#include -#include #include #include #include @@ -35,7 +33,6 @@ #include -#include #include #include @@ -45,7 +42,6 @@ #include #include #include -#include #include #include @@ -68,7 +64,6 @@ #include -#include #include #include @@ -194,56 +189,6 @@ zkutil::ZooKeeperPtr StorageReplicatedMergeTree::getZooKeeper() const return res; } -static std::string normalizeZooKeeperPath(std::string zookeeper_path, bool check_starts_with_slash, Poco::Logger * log = nullptr) -{ - if (!zookeeper_path.empty() && zookeeper_path.back() == '/') - zookeeper_path.resize(zookeeper_path.size() - 1); - /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. - if (!zookeeper_path.empty() && zookeeper_path.front() != '/') - { - /// Do not allow this for new tables, print warning for tables created in old versions - if (check_starts_with_slash) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "ZooKeeper path must starts with '/', got '{}'", zookeeper_path); - if (log) - LOG_WARNING(log, "ZooKeeper path ('{}') does not start with '/'. It will not be supported in future releases"); - zookeeper_path = "/" + zookeeper_path; - } - - return zookeeper_path; -} - -static String extractZooKeeperName(const String & path) -{ - static constexpr auto default_zookeeper_name = "default"; - if (path.empty()) - throw Exception("ZooKeeper path should not be empty", ErrorCodes::BAD_ARGUMENTS); - if (path[0] == '/') - return default_zookeeper_name; - auto pos = path.find(":/"); - if (pos != String::npos && pos < path.find('/')) - { - auto zookeeper_name = path.substr(0, pos); - if (zookeeper_name.empty()) - throw Exception("Zookeeper path should start with '/' or ':/'", ErrorCodes::BAD_ARGUMENTS); - return zookeeper_name; - } - return default_zookeeper_name; -} - -static String extractZooKeeperPath(const String & path, bool check_starts_with_slash, Poco::Logger * log = nullptr) -{ - if (path.empty()) - throw Exception("ZooKeeper path should not be empty", ErrorCodes::BAD_ARGUMENTS); - if (path[0] == '/') - return normalizeZooKeeperPath(path, check_starts_with_slash, log); - auto pos = path.find(":/"); - if (pos != String::npos && pos < path.find('/')) - { - return normalizeZooKeeperPath(path.substr(pos + 1, String::npos), check_starts_with_slash, log); - } - return normalizeZooKeeperPath(path, check_starts_with_slash, log); -} - static MergeTreePartInfo makeDummyDropRangeForMovePartitionOrAttachPartitionFrom(const String & partition_id) { /// NOTE We don't have special log entry type for MOVE PARTITION/ATTACH PARTITION FROM, @@ -287,8 +232,8 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( true, /// require_part_metadata attach, [this] (const std::string & name) { enqueuePartForCheck(name); }) - , zookeeper_name(extractZooKeeperName(zookeeper_path_)) - , zookeeper_path(extractZooKeeperPath(zookeeper_path_, /* check_starts_with_slash */ !attach, log)) + , zookeeper_name(zkutil::extractZooKeeperName(zookeeper_path_)) + , zookeeper_path(zkutil::extractZooKeeperPath(zookeeper_path_, /* check_starts_with_slash */ !attach, log)) , replica_name(replica_name_) , replica_path(fs::path(zookeeper_path) / "replicas" / replica_name_) , reader(*this) @@ -497,6 +442,8 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( createNewZooKeeperNodes(); syncPinnedPartUUIDs(); + + createTableSharedID(); } @@ -1175,8 +1122,8 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks) /// Parts in ZK. 
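Editor's note: the static helpers deleted here (normalizeZooKeeperPath, extractZooKeeperName, extractZooKeeperPath) were promoted into zkutil so other call sites can share them. Their contract, restated as a compact standalone sketch: a path of the form "aux_zookeeper:/some/path" selects an auxiliary ZooKeeper cluster, while anything starting with '/' uses the default one. Trailing-slash normalization and the exact error messages are omitted.

// Split "zk_name:/path" into the ZooKeeper cluster name and the path inside it.
#include <stdexcept>
#include <string>
#include <utility>

std::pair<std::string, std::string> splitZooKeeperPath(const std::string & path)
{
    static const std::string default_zookeeper_name = "default";

    if (path.empty())
        throw std::invalid_argument("ZooKeeper path should not be empty");

    if (path.front() == '/')
        return {default_zookeeper_name, path};

    auto pos = path.find(":/");
    if (pos != std::string::npos && pos < path.find('/'))
    {
        std::string name = path.substr(0, pos);
        if (name.empty())
            throw std::invalid_argument("ZooKeeper path should start with '/' or '<name>:/'");
        return {name, path.substr(pos + 1)};   // keep the leading '/'
    }

    return {default_zookeeper_name, path};
}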
NameSet expected_parts(expected_parts_vec.begin(), expected_parts_vec.end()); - /// There are no PreCommitted parts at startup. - auto parts = getDataParts({MergeTreeDataPartState::Committed, MergeTreeDataPartState::Outdated}); + /// There are no PreActive parts at startup. + auto parts = getDataParts({MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}); /** Local parts that are not in ZK. * In very rare cases they may cover missing parts @@ -1515,9 +1462,9 @@ bool StorageReplicatedMergeTree::executeLogEntry(LogEntry & entry) if (is_get_or_attach || entry.type == LogEntry::MERGE_PARTS || entry.type == LogEntry::MUTATE_PART) { /// If we already have this part or a part covering it, we do not need to do anything. - /// The part may be still in the PreCommitted -> Committed transition so we first search - /// among PreCommitted parts to definitely find the desired part if it exists. - DataPartPtr existing_part = getPartIfExists(entry.new_part_name, {MergeTreeDataPartState::PreCommitted}); + /// The part may be still in the PreActive -> Active transition so we first search + /// among PreActive parts to definitely find the desired part if it exists. + DataPartPtr existing_part = getPartIfExists(entry.new_part_name, {MergeTreeDataPartState::PreActive}); if (!existing_part) existing_part = getActiveContainingPart(entry.new_part_name); @@ -1958,7 +1905,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) for (const PartDescriptionPtr & part_desc : all_parts) { - if (!getActiveContainingPart(part_desc->new_part_info, MergeTreeDataPartState::Committed, data_parts_lock)) + if (!getActiveContainingPart(part_desc->new_part_info, MergeTreeDataPartState::Active, data_parts_lock)) parts_to_add.emplace_back(part_desc); } @@ -2016,7 +1963,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations); DataPartStates valid_states{ - MergeTreeDataPartState::PreCommitted, MergeTreeDataPartState::Committed, MergeTreeDataPartState::Outdated}; + MergeTreeDataPartState::PreActive, MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}; size_t num_clonable_parts = 0; for (PartDescriptionPtr & part_desc : parts_to_add) @@ -3342,7 +3289,7 @@ void StorageReplicatedMergeTree::removePartAndEnqueueFetch(const String & part_n /// It's quite dangerous, so clone covered parts to detached. 
auto broken_part_info = MergeTreePartInfo::fromPartName(part_name, format_version); - auto partition_range = getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, broken_part_info.partition_id); + auto partition_range = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, broken_part_info.partition_id); for (const auto & part : partition_range) { if (!broken_part_info.contains(part->info)) @@ -4274,7 +4221,7 @@ Pipe StorageReplicatedMergeTree::read( template -void StorageReplicatedMergeTree::foreachCommittedParts(Func && func, bool select_sequential_consistency) const +void StorageReplicatedMergeTree::foreachActiveParts(Func && func, bool select_sequential_consistency) const { std::optional max_added_blocks = {}; @@ -4285,7 +4232,7 @@ void StorageReplicatedMergeTree::foreachCommittedParts(Func && func, bool select max_added_blocks = getMaxAddedBlocks(); auto lock = lockParts(); - for (const auto & part : getDataPartsStateRange(DataPartState::Committed)) + for (const auto & part : getDataPartsStateRange(DataPartState::Active)) { if (part->isEmpty()) continue; @@ -4304,21 +4251,21 @@ void StorageReplicatedMergeTree::foreachCommittedParts(Func && func, bool select std::optional StorageReplicatedMergeTree::totalRows(const Settings & settings) const { UInt64 res = 0; - foreachCommittedParts([&res](auto & part) { res += part->rows_count; }, settings.select_sequential_consistency); + foreachActiveParts([&res](auto & part) { res += part->rows_count; }, settings.select_sequential_consistency); return res; } std::optional StorageReplicatedMergeTree::totalRowsByPartitionPredicate(const SelectQueryInfo & query_info, ContextPtr local_context) const { DataPartsVector parts; - foreachCommittedParts([&](auto & part) { parts.push_back(part); }, local_context->getSettingsRef().select_sequential_consistency); + foreachActiveParts([&](auto & part) { parts.push_back(part); }, local_context->getSettingsRef().select_sequential_consistency); return totalRowsByPartitionPredicateImpl(query_info, local_context, parts); } std::optional StorageReplicatedMergeTree::totalBytes(const Settings & settings) const { UInt64 res = 0; - foreachCommittedParts([&res](auto & part) { res += part->getBytesOnDisk(); }, settings.select_sequential_consistency); + foreachActiveParts([&res](auto & part) { res += part->getBytesOnDisk(); }, settings.select_sequential_consistency); return res; } @@ -4863,12 +4810,12 @@ void StorageReplicatedMergeTree::restoreMetadataInZooKeeper() const DataPartsVector all_parts = getAllDataPartsVector(); Strings active_parts_names; - /// Why all parts (not only Committed) are moved to detached/: + /// Why all parts (not only Active) are moved to detached/: /// After ZK metadata restoration ZK resets sequential counters (including block number counters), so one may /// potentially encounter a situation that a part we want to attach already exists. 
for (const auto & part : all_parts) { - if (part->getState() == DataPartState::Committed) + if (part->getState() == DataPartState::Active) active_parts_names.push_back(part->name); forgetPartAndMoveToDetached(part); @@ -5561,8 +5508,8 @@ void StorageReplicatedMergeTree::fetchPartition( info.table_id = getStorageID(); info.table_id.uuid = UUIDHelpers::Nil; auto expand_from = query_context->getMacros()->expand(from_, info); - String auxiliary_zookeeper_name = extractZooKeeperName(expand_from); - String from = extractZooKeeperPath(expand_from, /* check_starts_with_slash */ true); + String auxiliary_zookeeper_name = zkutil::extractZooKeeperName(expand_from); + String from = zkutil::extractZooKeeperPath(expand_from, /* check_starts_with_slash */ true); if (from.empty()) throw Exception("ZooKeeper path should not be empty", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); @@ -6216,7 +6163,7 @@ void StorageReplicatedMergeTree::replacePartitionFrom( String partition_id = getPartitionIDFromQuery(partition, query_context); /// NOTE: Some covered parts may be missing in src_all_parts if corresponding log entries are not executed yet. - DataPartsVector src_all_parts = src_data.getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, partition_id); + DataPartsVector src_all_parts = src_data.getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); LOG_DEBUG(log, "Cloning {} parts", src_all_parts.size()); @@ -6638,7 +6585,7 @@ void StorageReplicatedMergeTree::movePartitionToShard( if (!move_part) throw Exception("MOVE PARTITION TO SHARD is not supported, use MOVE PART instead", ErrorCodes::NOT_IMPLEMENTED); - if (normalizeZooKeeperPath(zookeeper_path, /* check_starts_with_slash */ true) == normalizeZooKeeperPath(to, /* check_starts_with_slash */ true)) + if (zkutil::normalizeZooKeeperPath(zookeeper_path, /* check_starts_with_slash */ true) == zkutil::normalizeZooKeeperPath(to, /* check_starts_with_slash */ true)) throw Exception("Source and destination are the same", ErrorCodes::BAD_ARGUMENTS); auto zookeeper = getZooKeeper(); @@ -6646,7 +6593,7 @@ void StorageReplicatedMergeTree::movePartitionToShard( String part_name = partition->as().value.safeGet(); auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version); - auto part = getPartIfExists(part_info, {MergeTreeDataPartState::Committed}); + auto part = getPartIfExists(part_info, {MergeTreeDataPartState::Active}); if (!part) throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "Part {} not found locally", part_name); @@ -6864,7 +6811,7 @@ bool StorageReplicatedMergeTree::dropPartImpl( { ReplicatedMergeTreeMergePredicate merge_pred = queue.getMergePredicate(zookeeper); - auto part = getPartIfExists(part_info, {MergeTreeDataPartState::Committed}); + auto part = getPartIfExists(part_info, {MergeTreeDataPartState::Active}); if (!part) { @@ -7039,7 +6986,7 @@ CheckResults StorageReplicatedMergeTree::checkData(const ASTPtr & query, Context if (const auto & check_query = query->as(); check_query.partition) { String partition_id = getPartitionIDFromQuery(check_query.partition, local_context); - data_parts = getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, partition_id); + data_parts = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); } else data_parts = getDataPartsVector(); @@ -7080,12 +7027,53 @@ void StorageReplicatedMergeTree::startBackgroundMovesIfNeeded() background_moves_assignee.start(); } + std::unique_ptr StorageReplicatedMergeTree::getDefaultSettings() const { return 
std::make_unique(getContext()->getReplicatedMergeTreeSettings()); } +String StorageReplicatedMergeTree::getTableSharedID() const +{ + return toString(table_shared_id); +} + + +void StorageReplicatedMergeTree::createTableSharedID() +{ + if (table_shared_id != UUIDHelpers::Nil) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Table shared id already initialized"); + + zkutil::ZooKeeperPtr zookeeper = getZooKeeper(); + String zookeeper_table_id_path = fs::path(zookeeper_path) / "table_shared_id"; + String id; + if (!zookeeper->tryGet(zookeeper_table_id_path, id)) + { + UUID table_id_candidate; + auto storage_id = getStorageID(); + if (storage_id.uuid != UUIDHelpers::Nil) + table_id_candidate = storage_id.uuid; + else + table_id_candidate = UUIDHelpers::generateV4(); + + id = toString(table_id_candidate); + + auto code = zookeeper->tryCreate(zookeeper_table_id_path, id, zkutil::CreateMode::Persistent); + if (code == Coordination::Error::ZNODEEXISTS) + { /// Other replica create node early + id = zookeeper->get(zookeeper_table_id_path); + } + else if (code != Coordination::Error::ZOK) + { + throw zkutil::KeeperException(code, zookeeper_table_id_path); + } + } + + table_shared_id = parseFromString(id); +} + + void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part) const { if (!part.volume) @@ -7093,7 +7081,6 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part) DiskPtr disk = part.volume->getDisk(); if (!disk || !disk->supportZeroCopyReplication()) return; - String zero_copy = fmt::format("zero_copy_{}", toString(disk->getType())); zkutil::ZooKeeperPtr zookeeper = tryGetZooKeeper(); if (!zookeeper) @@ -7102,73 +7089,100 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part) String id = part.getUniqueId(); boost::replace_all(id, "/", "_"); - String zookeeper_node = fs::path(zookeeper_path) / zero_copy / "shared" / part.name / id / replica_name; - - LOG_TRACE(log, "Set zookeeper lock {}", zookeeper_node); - - /// In rare case other replica can remove path between createAncestors and createIfNotExists - /// So we make up to 5 attempts - for (int attempts = 5; attempts > 0; --attempts) + Strings zc_zookeeper_paths = getZeroCopyPartPath(*getSettings(), disk->getType(), getTableSharedID(), + part.name, zookeeper_path); + for (const auto & zc_zookeeper_path : zc_zookeeper_paths) { - try - { - zookeeper->createAncestors(zookeeper_node); - zookeeper->createIfNotExists(zookeeper_node, "lock"); - break; - } - catch (const zkutil::KeeperException & e) - { - if (e.code == Coordination::Error::ZNONODE) - continue; - throw; - } + String zookeeper_node = fs::path(zc_zookeeper_path) / id / replica_name; + + LOG_TRACE(log, "Set zookeeper lock {}", zookeeper_node); + createZeroCopyLockNode(zookeeper, zookeeper_node); } } bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const +{ + return unlockSharedData(part, part.name); +} + + +bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part, const String & name) const { if (!part.volume) return true; DiskPtr disk = part.volume->getDisk(); if (!disk || !disk->supportZeroCopyReplication()) return true; - String zero_copy = fmt::format("zero_copy_{}", toString(disk->getType())); zkutil::ZooKeeperPtr zookeeper = tryGetZooKeeper(); if (!zookeeper) return true; - String id = part.getUniqueId(); + auto ref_count = part.getNumberOfRefereneces(); + if (ref_count > 0) /// Keep part shard info for frozen backups + return false; + + return 
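Editor's note: createTableSharedID has to converge on a single id across replicas: each replica proposes a value (its table UUID, or a random one), tries to create the ZooKeeper node, and on ZNODEEXISTS adopts whatever the winning replica wrote. The same create-or-adopt pattern against a hypothetical in-memory client; Client and its methods are stand-ins for this sketch only, not the zkutil API.

// Create-or-adopt: propose a value, but accept a concurrently created one.
#include <map>
#include <optional>
#include <stdexcept>
#include <string>

enum class Error { Ok, NodeExists, Other };

/// Hypothetical coordination-service client, backed by a map for the sketch.
struct Client
{
    std::map<std::string, std::string> nodes;

    Error tryCreate(const std::string & path, const std::string & value)
    {
        return nodes.emplace(path, value).second ? Error::Ok : Error::NodeExists;
    }
    std::string get(const std::string & path) { return nodes.at(path); }
    std::optional<std::string> tryGet(const std::string & path)
    {
        auto it = nodes.find(path);
        if (it == nodes.end())
            return std::nullopt;
        return it->second;
    }
};

std::string getOrCreateSharedId(Client & client, const std::string & path, const std::string & proposed)
{
    if (auto existing = client.tryGet(path))    // fast path: somebody already created it
        return *existing;

    Error code = client.tryCreate(path, proposed);
    if (code == Error::Ok)
        return proposed;                        // we won the race, our proposal becomes the shared id
    if (code == Error::NodeExists)
        return client.get(path);                // another replica created the node first: adopt its value
    throw std::runtime_error("Cannot create shared id node " + path);
}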
unlockSharedDataByID(part.getUniqueId(), getTableSharedID(), name, replica_name, disk, zookeeper, *getSettings(), log, + zookeeper_path); +} + + +bool StorageReplicatedMergeTree::unlockSharedDataByID(String id, const String & table_uuid, const String & part_name, + const String & replica_name_, DiskPtr disk, zkutil::ZooKeeperPtr zookeeper_ptr, const MergeTreeSettings & settings, + Poco::Logger * logger, const String & zookeeper_path_old) +{ boost::replace_all(id, "/", "_"); - String zookeeper_part_node = fs::path(zookeeper_path) / zero_copy / "shared" / part.name; - String zookeeper_part_uniq_node = fs::path(zookeeper_part_node) / id; - String zookeeper_node = fs::path(zookeeper_part_uniq_node) / replica_name; + Strings zc_zookeeper_paths = getZeroCopyPartPath(settings, disk->getType(), table_uuid, part_name, zookeeper_path_old); - LOG_TRACE(log, "Remove zookeeper lock {}", zookeeper_node); + bool res = true; - zookeeper->tryRemove(zookeeper_node); - - Strings children; - zookeeper->tryGetChildren(zookeeper_part_uniq_node, children); - - if (!children.empty()) + for (const auto & zc_zookeeper_path : zc_zookeeper_paths) { - LOG_TRACE(log, "Found zookeper locks for {}", zookeeper_part_uniq_node); - return false; + String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / id; + String zookeeper_node = fs::path(zookeeper_part_uniq_node) / replica_name_; + + LOG_TRACE(logger, "Remove zookeeper lock {}", zookeeper_node); + + zookeeper_ptr->tryRemove(zookeeper_node); + + Strings children; + zookeeper_ptr->tryGetChildren(zookeeper_part_uniq_node, children); + + if (!children.empty()) + { + LOG_TRACE(logger, "Found zookeper locks for {}", zookeeper_part_uniq_node); + res = false; + continue; + } + + auto e = zookeeper_ptr->tryRemove(zookeeper_part_uniq_node); + + LOG_TRACE(logger, "Remove parent zookeeper lock {} : {}", zookeeper_part_uniq_node, e != Coordination::Error::ZNOTEMPTY); + + /// Even when we have lock with same part name, but with different uniq, we can remove files on S3 + children.clear(); + String zookeeper_part_node = fs::path(zookeeper_part_uniq_node).parent_path(); + zookeeper_ptr->tryGetChildren(zookeeper_part_node, children); + if (children.empty()) + { + /// Cleanup after last uniq removing + e = zookeeper_ptr->tryRemove(zookeeper_part_node); + + LOG_TRACE(logger, "Remove parent zookeeper lock {} : {}", zookeeper_part_node, e != Coordination::Error::ZNOTEMPTY); + } + else + { + LOG_TRACE(logger, "Can't remove parent zookeeper lock {} : {}", zookeeper_part_node, children.size()); + for (auto & c : children) + { + LOG_TRACE(logger, "Child node {}", c); + } + } } - zookeeper->tryRemove(zookeeper_part_uniq_node); - - /// Even when we have lock with same part name, but with different uniq, we can remove files on S3 - children.clear(); - zookeeper->tryGetChildren(zookeeper_part_node, children); - if (children.empty()) - /// Cleanup after last uniq removing - zookeeper->tryRemove(zookeeper_part_node); - - return true; + return res; } @@ -7201,20 +7215,24 @@ String StorageReplicatedMergeTree::getSharedDataReplica( if (!zookeeper) return best_replica; - String zero_copy = fmt::format("zero_copy_{}", toString(disk_type)); - String zookeeper_part_node = fs::path(zookeeper_path) / zero_copy / "shared" / part.name; + Strings zc_zookeeper_paths = getZeroCopyPartPath(*getSettings(), disk_type, getTableSharedID(), part.name, + zookeeper_path); - Strings ids; - zookeeper->tryGetChildren(zookeeper_part_node, ids); + std::set replicas; - Strings replicas; - for (const auto & id : ids) + for 
(const auto & zc_zookeeper_path : zc_zookeeper_paths) { - String zookeeper_part_uniq_node = fs::path(zookeeper_part_node) / id; - Strings id_replicas; - zookeeper->tryGetChildren(zookeeper_part_uniq_node, id_replicas); - LOG_TRACE(log, "Found zookeper replicas for {}: {}", zookeeper_part_uniq_node, id_replicas.size()); - replicas.insert(replicas.end(), id_replicas.begin(), id_replicas.end()); + Strings ids; + zookeeper->tryGetChildren(zc_zookeeper_path, ids); + + for (const auto & id : ids) + { + String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / id; + Strings id_replicas; + zookeeper->tryGetChildren(zookeeper_part_uniq_node, id_replicas); + LOG_TRACE(log, "Found zookeper replicas for {}: {}", zookeeper_part_uniq_node, id_replicas.size()); + replicas.insert(id_replicas.begin(), id_replicas.end()); + } } LOG_TRACE(log, "Found zookeper replicas for part {}: {}", part.name, replicas.size()); @@ -7267,24 +7285,45 @@ String StorageReplicatedMergeTree::getSharedDataReplica( return best_replica; } -String StorageReplicatedMergeTree::findReplicaHavingPart( - const String & part_name, const String & zookeeper_path_, zkutil::ZooKeeper::Ptr zookeeper_) + +Strings StorageReplicatedMergeTree::getZeroCopyPartPath(const MergeTreeSettings & settings, DiskType disk_type, const String & table_uuid, + const String & part_name, const String & zookeeper_path_old) { - Strings replicas = zookeeper_->getChildren(fs::path(zookeeper_path_) / "replicas"); + Strings res; + + String zero_copy = fmt::format("zero_copy_{}", toString(disk_type)); + + String new_path = fs::path(settings.remote_fs_zero_copy_zookeeper_path.toString()) / zero_copy / table_uuid / part_name; + res.push_back(new_path); + if (settings.remote_fs_zero_copy_path_compatible_mode && !zookeeper_path_old.empty()) + { /// Compatibility mode for cluster with old and new versions + String old_path = fs::path(zookeeper_path_old) / zero_copy / "shared" / part_name; + res.push_back(old_path); + } + + return res; +} + + +String StorageReplicatedMergeTree::findReplicaHavingPart( + const String & part_name, const String & zookeeper_path_, zkutil::ZooKeeper::Ptr zookeeper_ptr) +{ + Strings replicas = zookeeper_ptr->getChildren(fs::path(zookeeper_path_) / "replicas"); /// Select replicas in uniformly random order. 
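Editor's note: getZeroCopyPartPath is the core of the layout change: zero-copy locks now live under a cluster-wide root keyed by disk type, shared table id and part name, and the old per-table "<zookeeper_path>/zero_copy_<disk>/shared/<part>" location is produced in addition only when the compatibility setting is on. A sketch that just assembles the candidate paths (the root passed in would come from the remote_fs_zero_copy_zookeeper_path setting):

// Build the new-style zero-copy lock path, plus the old-style one in compatibility mode.
#include <filesystem>
#include <string>
#include <vector>

namespace fs = std::filesystem;

std::vector<std::string> zeroCopyPartPaths(
    const std::string & zero_copy_root,          // e.g. value of remote_fs_zero_copy_zookeeper_path
    const std::string & disk_type,               // "s3", "hdfs", ...
    const std::string & table_uuid,
    const std::string & part_name,
    const std::string & old_table_zookeeper_path,
    bool compatible_mode)
{
    std::vector<std::string> res;
    const std::string zero_copy = "zero_copy_" + disk_type;

    res.push_back((fs::path(zero_copy_root) / zero_copy / table_uuid / part_name).string());
    if (compatible_mode && !old_table_zookeeper_path.empty())
        res.push_back((fs::path(old_table_zookeeper_path) / zero_copy / "shared" / part_name).string());   // legacy layout

    return res;
}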
std::shuffle(replicas.begin(), replicas.end(), thread_local_rng); for (const String & replica : replicas) { - if (zookeeper_->exists(fs::path(zookeeper_path_) / "replicas" / replica / "parts" / part_name) - && zookeeper_->exists(fs::path(zookeeper_path_) / "replicas" / replica / "is_active")) + if (zookeeper_ptr->exists(fs::path(zookeeper_path_) / "replicas" / replica / "parts" / part_name) + && zookeeper_ptr->exists(fs::path(zookeeper_path_) / "replicas" / replica / "is_active")) return fs::path(zookeeper_path_) / "replicas" / replica; } return {}; } + bool StorageReplicatedMergeTree::checkIfDetachedPartExists(const String & part_name) { fs::directory_iterator dir_end; @@ -7295,6 +7334,7 @@ bool StorageReplicatedMergeTree::checkIfDetachedPartExists(const String & part_n return false; } + bool StorageReplicatedMergeTree::checkIfDetachedPartitionExists(const String & partition_name) { fs::directory_iterator dir_end; @@ -7485,4 +7525,180 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP return true; } + +void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node) +{ + /// In rare case other replica can remove path between createAncestors and createIfNotExists + /// So we make up to 5 attempts + + for (int attempts = 5; attempts > 0; --attempts) + { + try + { + zookeeper->createAncestors(zookeeper_node); + zookeeper->createIfNotExists(zookeeper_node, "lock"); + break; + } + catch (const zkutil::KeeperException & e) + { + if (e.code == Coordination::Error::ZNONODE) + continue; + throw; + } + } +} + + +namespace +{ + +/// Special metadata used during freeze table. Required for zero-copy +/// replication. +struct FreezeMetaData +{ +public: + void fill(const StorageReplicatedMergeTree & storage) + { + is_replicated = storage.supportsReplication(); + is_remote = storage.isRemote(); + replica_name = storage.getReplicaName(); + zookeeper_name = storage.getZooKeeperName(); + table_shared_id = storage.getTableSharedID(); + } + + void save(DiskPtr disk, const String & path) const + { + auto file_path = getFileName(path); + auto buffer = disk->writeMetaFile(file_path, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite); + writeIntText(version, *buffer); + buffer->write("\n", 1); + writeBoolText(is_replicated, *buffer); + buffer->write("\n", 1); + writeBoolText(is_remote, *buffer); + buffer->write("\n", 1); + writeString(replica_name, *buffer); + buffer->write("\n", 1); + writeString(zookeeper_name, *buffer); + buffer->write("\n", 1); + writeString(table_shared_id, *buffer); + buffer->write("\n", 1); + } + + bool load(DiskPtr disk, const String & path) + { + auto file_path = getFileName(path); + if (!disk->exists(file_path)) + return false; + auto buffer = disk->readMetaFile(file_path, ReadSettings(), {}); + readIntText(version, *buffer); + if (version != 1) + { + LOG_ERROR(&Poco::Logger::get("FreezeMetaData"), "Unknown freezed metadata version: {}", version); + return false; + } + DB::assertChar('\n', *buffer); + readBoolText(is_replicated, *buffer); + DB::assertChar('\n', *buffer); + readBoolText(is_remote, *buffer); + DB::assertChar('\n', *buffer); + readString(replica_name, *buffer); + DB::assertChar('\n', *buffer); + readString(zookeeper_name, *buffer); + DB::assertChar('\n', *buffer); + readString(table_shared_id, *buffer); + DB::assertChar('\n', *buffer); + return true; + } + + static void clean(DiskPtr disk, const String & path) + { + disk->removeMetaFileIfExists(getFileName(path)); + } + +private: + 
static String getFileName(const String & path) + { + return fs::path(path) / "frozen_metadata.txt"; + } + +public: + int version = 1; + bool is_replicated; + bool is_remote; + String replica_name; + String zookeeper_name; + String table_shared_id; +}; + +} + +bool StorageReplicatedMergeTree::removeDetachedPart(DiskPtr disk, const String & path, const String & part_name, bool is_freezed) +{ + if (disk->supportZeroCopyReplication()) + { + if (is_freezed) + { + FreezeMetaData meta; + if (meta.load(disk, path)) + { + FreezeMetaData::clean(disk, path); + return removeSharedDetachedPart(disk, path, part_name, meta.table_shared_id, meta.zookeeper_name, meta.replica_name, ""); + } + } + else + { + String table_id = getTableSharedID(); + + return removeSharedDetachedPart(disk, path, part_name, table_id, zookeeper_name, replica_name, zookeeper_path); + } + } + + disk->removeRecursive(path); + + return false; +} + + +bool StorageReplicatedMergeTree::removeSharedDetachedPart(DiskPtr disk, const String & path, const String & part_name, const String & table_uuid, + const String &, const String & detached_replica_name, const String & detached_zookeeper_path) +{ + bool keep_shared = false; + + zkutil::ZooKeeperPtr zookeeper = getZooKeeper(); + + if (zookeeper) + { + fs::path checksums = fs::path(path) / "checksums.txt"; + if (disk->exists(checksums)) + { + auto ref_count = disk->getRefCount(checksums); + if (ref_count == 0) + { + String id = disk->getUniqueId(checksums); + keep_shared = !StorageReplicatedMergeTree::unlockSharedDataByID(id, table_uuid, part_name, + detached_replica_name, disk, zookeeper, getContext()->getReplicatedMergeTreeSettings(), log, + detached_zookeeper_path); + } + else + keep_shared = true; + } + } + + disk->removeSharedRecursive(path, keep_shared); + + return keep_shared; +} + + +void StorageReplicatedMergeTree::createAndStoreFreezeMetadata(DiskPtr disk, DataPartPtr, String backup_part_path) const +{ + if (disk->supportZeroCopyReplication()) + { + FreezeMetaData meta; + meta.fill(*this); + meta.save(disk, backup_part_path); + } +} + + } diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 6861d89f070..e390a0bcea4 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -236,6 +237,16 @@ public: /// Return false if data is still used by another node bool unlockSharedData(const IMergeTreeDataPart & part) const override; + /// Remove lock with old name for shared data part after rename + bool unlockSharedData(const IMergeTreeDataPart & part, const String & name) const override; + + /// Unlock shared data part in zookeeper by part id + /// Return true if data unlocked + /// Return false if data is still used by another node + static bool unlockSharedDataByID(String id, const String & table_uuid, const String & part_name, const String & replica_name_, + DiskPtr disk, zkutil::ZooKeeperPtr zookeeper_, const MergeTreeSettings & settings, Poco::Logger * logger, + const String & zookeeper_path_old); + /// Fetch part only if some replica has it on shared storage like S3 bool tryToFetchIfShared(const IMergeTreeDataPart & part, const DiskPtr & disk, const String & path) override; @@ -245,7 +256,7 @@ public: inline String getReplicaName() const { return replica_name; } /// Restores table metadata if ZooKeeper lost it. - /// Used only on restarted readonly replicas (not checked). 
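Editor's note: FreezeMetaData persists a small versioned, line-oriented text file (frozen_metadata.txt) next to a frozen part so that a backup detached from its table can later be unlocked against the right replica and shared id. A standalone round-trip of the same format discipline using standard streams instead of ClickHouse buffers; bools are written as 0/1 here, and fields are assumed to contain no whitespace.

// Versioned line-based metadata, mirroring the frozen_metadata.txt layout above.
#include <fstream>
#include <string>

struct FrozenMeta
{
    int version = 1;
    bool is_replicated = false;
    bool is_remote = false;
    std::string replica_name;
    std::string zookeeper_name;
    std::string table_shared_id;

    void save(const std::string & file) const
    {
        std::ofstream out(file);
        out << version << '\n'
            << int(is_replicated) << '\n'
            << int(is_remote) << '\n'
            << replica_name << '\n'
            << zookeeper_name << '\n'
            << table_shared_id << '\n';
    }

    bool load(const std::string & file)
    {
        std::ifstream in(file);
        if (!in)
            return false;
        int stored_version = 0;
        in >> stored_version;
        if (stored_version != 1)
            return false;                         // unknown version: refuse to guess
        in >> is_replicated >> is_remote;
        in >> replica_name >> zookeeper_name >> table_shared_id;
        version = stored_version;
        return bool(in);
    }
};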
All active (Committed) parts are moved to detached/ + /// Used only on restarted readonly replicas (not checked). All active (Active) parts are moved to detached/ /// folder and attached. Parts in all other states are just moved to detached/ folder. void restoreMetadataInZooKeeper(); @@ -263,6 +274,14 @@ public: bool createEmptyPartInsteadOfLost(zkutil::ZooKeeperPtr zookeeper, const String & lost_part_name); + // Return default or custom zookeeper name for table + String getZooKeeperName() const { return zookeeper_name; } + + // Return table id, common for different replicas + String getTableSharedID() const; + + static const String getDefaultZooKeeperName() { return default_zookeeper_name; } + private: std::atomic_bool are_restoring_replica {false}; @@ -391,8 +410,11 @@ private: ThrottlerPtr replicated_fetches_throttler; ThrottlerPtr replicated_sends_throttler; + /// Global ID, synced via ZooKeeper between replicas + UUID table_shared_id; + template - void foreachCommittedParts(Func && func, bool select_sequential_consistency) const; + void foreachActiveParts(Func && func, bool select_sequential_consistency) const; /** Creates the minimum set of nodes in ZooKeeper and create first replica. * Returns true if was created, false if exists. @@ -436,7 +458,7 @@ private: String getChecksumsForZooKeeper(const MergeTreeDataPartChecksums & checksums) const; - /// Accepts a PreCommitted part, atomically checks its checksums with ones on other replicas and commit the part + /// Accepts a PreActive part, atomically checks its checksums with ones on other replicas and commit the part DataPartsVector checkPartChecksumsAndCommit(Transaction & transaction, const DataPartPtr & part); bool partIsAssignedToBackgroundOperation(const DataPartPtr & part) const override; @@ -720,6 +742,22 @@ private: PartitionBlockNumbersHolder allocateBlockNumbersInAffectedPartitions( const MutationCommands & commands, ContextPtr query_context, const zkutil::ZooKeeperPtr & zookeeper) const; + static Strings getZeroCopyPartPath(const MergeTreeSettings & settings, DiskType disk_type, const String & table_uuid, + const String & part_name, const String & zookeeper_path_old); + + static void createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node); + + bool removeDetachedPart(DiskPtr disk, const String & path, const String & part_name, bool is_freezed) override; + + bool removeSharedDetachedPart(DiskPtr disk, const String & path, const String & part_name, const String & table_uuid, + const String & zookeeper_name, const String & replica_name, const String & zookeeper_path); + + /// Create freeze metadata for table and save in zookeeper. Required only if zero-copy replication enabled. + void createAndStoreFreezeMetadata(DiskPtr disk, DataPartPtr part, String backup_part_path) const override; + + // Create table id if needed + void createTableSharedID(); + protected: /** If not 'attach', either creates a new table in ZK, or adds a replica to an existing table. 
*/ diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 3a03ac3906c..3d988472b54 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -3,7 +3,6 @@ #if USE_AWS_S3 -#include #include #include @@ -25,9 +24,9 @@ #include #include -#include #include +#include #include #include @@ -70,6 +69,7 @@ namespace ErrorCodes extern const int S3_ERROR; extern const int UNEXPECTED_EXPRESSION; extern const int CANNOT_OPEN_FILE; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } class IOutputFormat; @@ -480,13 +480,39 @@ StorageS3::StorageS3( { context_->getGlobalContext()->getRemoteHostFilter().checkURL(uri_.uri); StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + + updateClientAndAuthSettings(context_, client_auth); + if (columns_.empty()) + { + auto columns = getTableStructureFromDataImpl(format_name, client_auth, max_single_read_retries_, compression_method, distributed_processing_, format_settings, context_); + storage_metadata.setColumns(columns); + } + else + storage_metadata.setColumns(columns_); + storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); - updateClientAndAuthSettings(context_, client_auth); } +std::shared_ptr StorageS3::createFileIterator(const ClientAuthentication & client_auth, bool distributed_processing, ContextPtr local_context) +{ + std::shared_ptr iterator_wrapper{nullptr}; + if (distributed_processing) + { + return std::make_shared( + [callback = local_context->getReadTaskCallback()]() -> String { + return callback(); + }); + } + + /// Iterate through disclosed globs and make a source for each file + auto glob_iterator = std::make_shared(*client_auth.client, client_auth.uri); + return std::make_shared([glob_iterator]() + { + return glob_iterator->next(); + }); +} Pipe StorageS3::read( const Names & column_names, @@ -510,23 +536,7 @@ Pipe StorageS3::read( need_file_column = true; } - std::shared_ptr iterator_wrapper{nullptr}; - if (distributed_processing) - { - iterator_wrapper = std::make_shared( - [callback = local_context->getReadTaskCallback()]() -> String { - return callback(); - }); - } - else - { - /// Iterate through disclosed globs and make a source for each file - auto glob_iterator = std::make_shared(*client_auth.client, client_auth.uri); - iterator_wrapper = std::make_shared([glob_iterator]() - { - return glob_iterator->next(); - }); - } + std::shared_ptr iterator_wrapper = createFileIterator(client_auth, distributed_processing, local_context); for (size_t i = 0; i < num_streams; ++i) { @@ -707,6 +717,51 @@ StorageS3Configuration StorageS3::getConfiguration(ASTs & engine_args, ContextPt return configuration; } +ColumnsDescription StorageS3::getTableStructureFromData( + const String & format, + const S3::URI & uri, + const String & access_key_id, + const String & secret_access_key, + UInt64 max_connections, + UInt64 max_single_read_retries, + const String & compression_method, + bool distributed_processing, + const std::optional & format_settings, + ContextPtr ctx) +{ + ClientAuthentication client_auth{uri, access_key_id, secret_access_key, max_connections, {}, {}}; + updateClientAndAuthSettings(ctx, client_auth); + return getTableStructureFromDataImpl(format, client_auth, max_single_read_retries, compression_method, distributed_processing, format_settings, ctx); +} + +ColumnsDescription StorageS3::getTableStructureFromDataImpl( + const String & format, + const ClientAuthentication & client_auth, + UInt64 
max_single_read_retries, + const String & compression_method, + bool distributed_processing, + const std::optional & format_settings, + ContextPtr ctx) +{ + auto read_buffer_creator = [&]() + { + auto file_iterator = createFileIterator(client_auth, distributed_processing, ctx); + String current_key = (*file_iterator)(); + if (current_key.empty()) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot extract table structure from {} format file, because there are no files with provided path in S3. You must specify " + "table structure manually", + format); + + return wrapReadBufferWithCompressionMethod( + std::make_unique(client_auth.client, client_auth.uri.bucket, current_key, max_single_read_retries, ctx->getReadSettings()), + chooseCompressionMethod(current_key, compression_method)); + }; + + return readSchemaFromFormat(format, format_settings, read_buffer_creator, ctx); +} + void registerStorageS3Impl(const String & name, StorageFactory & factory) { @@ -775,6 +830,7 @@ void registerStorageS3Impl(const String & name, StorageFactory & factory) { .supports_settings = true, .supports_sort_order = true, // for partition by + .supports_schema_inference = true, .source_access_type = AccessType::S3, }); } diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 248238379dc..0690040915d 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -147,8 +147,19 @@ public: static StorageS3Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context); -private: + static ColumnsDescription getTableStructureFromData( + const String & format, + const S3::URI & uri, + const String & access_key_id, + const String & secret_access_key, + UInt64 max_connections, + UInt64 max_single_read_retries, + const String & compression_method, + bool distributed_processing, + const std::optional & format_settings, + ContextPtr ctx); +private: friend class StorageS3Cluster; friend class TableFunctionS3Cluster; @@ -175,6 +186,17 @@ private: ASTPtr partition_by; static void updateClientAndAuthSettings(ContextPtr, ClientAuthentication &); + + static std::shared_ptr createFileIterator(const ClientAuthentication & client_auth, bool distributed_processing, ContextPtr local_context); + + static ColumnsDescription getTableStructureFromDataImpl( + const String & format, + const ClientAuthentication & client_auth, + UInt64 max_single_read_retries, + const String & compression_method, + bool distributed_processing, + const std::optional & format_settings, + ContextPtr ctx); }; } diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 0eec77ac8e7..471b460d349 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -13,8 +13,9 @@ #include #include -#include +#include #include +#include #include #include @@ -40,7 +41,7 @@ namespace ErrorCodes IStorageURLBase::IStorageURLBase( const String & uri_, - ContextPtr /*context_*/, + ContextPtr context_, const StorageID & table_id_, const String & format_name_, const std::optional & format_settings_, @@ -61,12 +62,48 @@ IStorageURLBase::IStorageURLBase( , partition_by(partition_by_) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + if (columns_.empty()) + { + auto columns = getTableStructureFromData(format_name, uri, compression_method, headers, format_settings, context_); + storage_metadata.setColumns(columns); + } + else + storage_metadata.setColumns(columns_); storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); 
setInMemoryMetadata(storage_metadata); } +ColumnsDescription IStorageURLBase::getTableStructureFromData( + const String & format, + const String & uri, + const String & compression_method, + const ReadWriteBufferFromHTTP::HTTPHeaderEntries & headers, + const std::optional & format_settings, + ContextPtr context) +{ + auto read_buffer_creator = [&]() + { + auto parsed_uri = Poco::URI(uri); + return wrapReadBufferWithCompressionMethod( + std::make_unique( + parsed_uri, + Poco::Net::HTTPRequest::HTTP_GET, + nullptr, + ConnectionTimeouts::getHTTPTimeouts(context), + Poco::Net::HTTPBasicCredentials{}, + context->getSettingsRef().max_http_get_redirects, + DBMS_DEFAULT_BUFFER_SIZE, + context->getReadSettings(), + headers, + ReadWriteBufferFromHTTP::Range{}, + context->getRemoteHostFilter()), + chooseCompressionMethod(parsed_uri.getPath(), compression_method)); + }; + + return readSchemaFromFormat(format, format_settings, read_buffer_creator, context); +} + namespace { ReadWriteBufferFromHTTP::HTTPHeaderEntries getHeaders( @@ -642,6 +679,7 @@ void registerStorageURL(StorageFactory & factory) }, { .supports_settings = true, + .supports_schema_inference = true, .source_access_type = AccessType::URL, }); } diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index cf72352a183..790f01135d3 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -41,6 +41,14 @@ public: bool supportsPartitionBy() const override { return true; } + static ColumnsDescription getTableStructureFromData( + const String & format, + const String & uri, + const String & compression_method, + const ReadWriteBufferFromHTTP::HTTPHeaderEntries & headers, + const std::optional & format_settings, + ContextPtr context); + protected: IStorageURLBase( const String & uri_, diff --git a/src/Storages/StorageView.cpp b/src/Storages/StorageView.cpp index dcf664db6fe..bcf7d7856cf 100644 --- a/src/Storages/StorageView.cpp +++ b/src/Storages/StorageView.cpp @@ -140,7 +140,8 @@ void StorageView::read( current_inner_query = query_info.view_query->clone(); } - InterpreterSelectWithUnionQuery interpreter(current_inner_query, context, {}, column_names); + auto options = SelectQueryOptions(QueryProcessingStage::Complete, 0, false, query_info.settings_limit_offset_done); + InterpreterSelectWithUnionQuery interpreter(current_inner_query, context, options, column_names); interpreter.buildQueryPlan(query_plan); /// It's expected that the columns read from storage are not constant. 
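The schema-inference additions above (StorageS3::getTableStructureFromDataImpl, IStorageURLBase::getTableStructureFromData) share one pattern: when no explicit structure is given, the storage builds a read buffer lazily through a small factory lambda and hands it to readSchemaFromFormat, which derives the column list from the data itself. Below is a minimal self-contained sketch of that idea, assuming a CSVWithNames-style input whose header line carries the column names; the type aliases and the reader are illustrative stand-ins, not the actual ClickHouse API.

#include <functional>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical stand-ins for the real ClickHouse types, used only for this sketch.
using ColumnsDescription = std::vector<std::string>;
using ReadBufferPtr = std::unique_ptr<std::istream>;

/// Toy schema reader: for a CSVWithNames-like format the column list is just the header line.
ColumnsDescription readSchemaFromCSVWithNames(const std::function<ReadBufferPtr()> & read_buffer_creator)
{
    auto buffer = read_buffer_creator();   /// The source is opened only here, on demand.
    std::string header;
    std::getline(*buffer, header);
    ColumnsDescription columns;
    std::istringstream header_stream(header);
    for (std::string name; std::getline(header_stream, name, ',');)
        columns.push_back(name);
    return columns;
}

int main()
{
    /// In the real code the lambda opens an HTTP/S3 read buffer and wraps it with decompression;
    /// here an in-memory stream stands in for the remote object.
    auto read_buffer_creator = []() -> ReadBufferPtr
    {
        return std::make_unique<std::istringstream>("id,name,value\n1,foo,3.14\n");
    };

    for (const auto & column : readSchemaFromCSVWithNames(read_buffer_creator))
        std::cout << column << '\n';   /// prints: id, name, value
}

The factory is deliberately deferred: when the user passes an explicit structure, the new code paths (structure != "auto", or a non-empty columns_ list in the storage constructors) never invoke it, so the remote object is not touched just to discover a schema that is already known.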
diff --git a/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in b/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in index 8a19d7649aa..9435bdcc65b 100644 --- a/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in +++ b/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in @@ -50,10 +50,31 @@ const char * auto_config_build[] "USE_KRB5", "@USE_KRB5@", "USE_FILELOG", "@USE_FILELOG@", "USE_BZIP2", "@USE_BZIP2@", + "USE_AMQPCPP", "@USE_AMQPCPP@", + "USE_ROCKSDB", "@USE_ROCKSDB@", + "USE_NURAFT", "@USE_NURAFT@", + "USE_NLP", "@USE_NLP@", + "USE_SQLITE", "@USE_SQLITE@", + "USE_INTERNAL_LLVM_LIBRARY", "@USE_INTERNAL_LLVM_LIBRARY@", + "USE_OPENCL", "@USE_OPENCL@", + "USE_LIBPQXX", "@USE_LIBPQXX@", + "USE_AZURE_BLOB_STORAGE", "@USE_AZURE_BLOB_STORAGE@", + "USE_INTERNAL_SSL_LIBRARY", "@USE_INTERNAL_SSL_LIBRARY@", + "USE_AWS_S3", "@USE_AWS_S3@", + "USE_CASSANDRA", "@USE_CASSANDRA@", + "USE_YAML_CPP", "@USE_YAML_CPP@", + "USE_INTERNAL_HDFS3_LIBRARY", "@USE_INTERNAL_HDFS3_LIBRARY@", + "CLICKHOUSE_SPLIT_BINARY", "@CLICKHOUSE_SPLIT_BINARY@", + "USE_SENTRY", "@USE_SENTRY@", + "USE_DATASKETCHES", "@USE_DATASKETCHES@", + "USE_AVRO", "@USE_AVRO@", + "USE_ARROW", "@USE_ARROW@", + "USE_ORC", "@USE_ORC@", + "USE_MSGPACK", "@USE_MSGPACK@", "GIT_HASH", "@GIT_HASH@", - "GIT_BRANCH", "@GIT_BRANCH@", + "GIT_BRANCH", R"IRjaNsZIL9Yh7FQ4(@GIT_BRANCH@)IRjaNsZIL9Yh7FQ4", "GIT_DATE", "@GIT_DATE@", - "GIT_COMMIT_SUBJECT", "@GIT_COMMIT_SUBJECT@", + "GIT_COMMIT_SUBJECT", R"Gi17KJMlbGCjErEN(@GIT_COMMIT_SUBJECT@)Gi17KJMlbGCjErEN", nullptr, nullptr }; diff --git a/src/Storages/System/StorageSystemDictionaries.cpp b/src/Storages/System/StorageSystemDictionaries.cpp index d8f92d38081..c0d7d8cc4ed 100644 --- a/src/Storages/System/StorageSystemDictionaries.cpp +++ b/src/Storages/System/StorageSystemDictionaries.cpp @@ -142,7 +142,9 @@ void StorageSystemDictionaries::fillData(MutableColumns & res_columns, ContextPt res_columns[i++]->insertDefault(); if (dict_ptr) + { res_columns[i++]->insert(dict_ptr->getDictionaryComment()); + } else { if (load_result.config && load_result.config->config->has("dictionary.comment")) diff --git a/src/Storages/System/StorageSystemGraphite.cpp b/src/Storages/System/StorageSystemGraphite.cpp index dd592600d18..8711162385f 100644 --- a/src/Storages/System/StorageSystemGraphite.cpp +++ b/src/Storages/System/StorageSystemGraphite.cpp @@ -10,6 +10,7 @@ NamesAndTypesList StorageSystemGraphite::getNamesAndTypes() { return { {"config_name", std::make_shared()}, + {"rule_type", std::make_shared()}, {"regexp", std::make_shared()}, {"function", std::make_shared()}, {"age", std::make_shared()}, @@ -85,6 +86,7 @@ void StorageSystemGraphite::fillData(MutableColumns & res_columns, ContextPtr co bool is_default = pattern.regexp == nullptr; String regexp; String function; + const String & rule_type = ruleTypeStr(pattern.rule_type); if (is_default) { @@ -107,6 +109,7 @@ void StorageSystemGraphite::fillData(MutableColumns & res_columns, ContextPtr co { size_t i = 0; res_columns[i++]->insert(config.first); + res_columns[i++]->insert(rule_type); res_columns[i++]->insert(regexp); res_columns[i++]->insert(function); res_columns[i++]->insert(retention.age); @@ -121,6 +124,7 @@ void StorageSystemGraphite::fillData(MutableColumns & res_columns, ContextPtr co { size_t i = 0; res_columns[i++]->insert(config.first); + res_columns[i++]->insert(rule_type); res_columns[i++]->insert(regexp); res_columns[i++]->insert(function); res_columns[i++]->insertDefault(); diff --git 
a/src/Storages/System/StorageSystemParts.cpp b/src/Storages/System/StorageSystemParts.cpp index 6826082ef1d..2efb337b302 100644 --- a/src/Storages/System/StorageSystemParts.cpp +++ b/src/Storages/System/StorageSystemParts.cpp @@ -117,7 +117,7 @@ void StorageSystemParts::processNextStorage( if (columns_mask[src_index++]) columns[res_index++]->insert(part->getTypeName()); if (columns_mask[src_index++]) - columns[res_index++]->insert(part_state == State::Committed); + columns[res_index++]->insert(part_state == State::Active); if (columns_mask[src_index++]) columns[res_index++]->insert(part->getMarksCount()); if (columns_mask[src_index++]) diff --git a/src/Storages/System/StorageSystemPartsBase.cpp b/src/Storages/System/StorageSystemPartsBase.cpp index c730d5a95c9..6c8159ca720 100644 --- a/src/Storages/System/StorageSystemPartsBase.cpp +++ b/src/Storages/System/StorageSystemPartsBase.cpp @@ -57,12 +57,12 @@ StoragesInfo::getParts(MergeTreeData::DataPartStateVector & state, bool has_stat { /// If has_state_column is requested, return all states. if (!has_state_column) - return data->getDataPartsVector({State::Committed, State::Outdated}, &state, require_projection_parts); + return data->getDataPartsVector({State::Active, State::Outdated}, &state, require_projection_parts); return data->getAllDataPartsVector(&state, require_projection_parts); } - return data->getDataPartsVector({State::Committed}, &state, require_projection_parts); + return data->getDataPartsVector({State::Active}, &state, require_projection_parts); } StoragesInfoStream::StoragesInfoStream(const SelectQueryInfo & query_info, ContextPtr context) diff --git a/src/Storages/System/StorageSystemPartsColumns.cpp b/src/Storages/System/StorageSystemPartsColumns.cpp index f1b3a13c332..f5e9b82c136 100644 --- a/src/Storages/System/StorageSystemPartsColumns.cpp +++ b/src/Storages/System/StorageSystemPartsColumns.cpp @@ -132,7 +132,7 @@ void StorageSystemPartsColumns::processNextStorage( if (columns_mask[src_index++]) columns[res_index++]->insert(part->getTypeName()); if (columns_mask[src_index++]) - columns[res_index++]->insert(part_state == State::Committed); + columns[res_index++]->insert(part_state == State::Active); if (columns_mask[src_index++]) columns[res_index++]->insert(part->getMarksCount()); diff --git a/src/Storages/System/StorageSystemProjectionParts.cpp b/src/Storages/System/StorageSystemProjectionParts.cpp index 378437bd4ec..d15acc97cb1 100644 --- a/src/Storages/System/StorageSystemProjectionParts.cpp +++ b/src/Storages/System/StorageSystemProjectionParts.cpp @@ -125,7 +125,7 @@ void StorageSystemProjectionParts::processNextStorage( if (columns_mask[src_index++]) columns[res_index++]->insert(parent_part->getTypeName()); if (columns_mask[src_index++]) - columns[res_index++]->insert(part_state == State::Committed); + columns[res_index++]->insert(part_state == State::Active); if (columns_mask[src_index++]) columns[res_index++]->insert(part->getMarksCount()); if (columns_mask[src_index++]) diff --git a/src/Storages/System/StorageSystemProjectionPartsColumns.cpp b/src/Storages/System/StorageSystemProjectionPartsColumns.cpp index f6490177014..29c877733d8 100644 --- a/src/Storages/System/StorageSystemProjectionPartsColumns.cpp +++ b/src/Storages/System/StorageSystemProjectionPartsColumns.cpp @@ -146,7 +146,7 @@ void StorageSystemProjectionPartsColumns::processNextStorage( if (columns_mask[src_index++]) columns[res_index++]->insert(parent_part->getTypeName()); if (columns_mask[src_index++]) - columns[res_index++]->insert(part_state 
== State::Committed); + columns[res_index++]->insert(part_state == State::Active); if (columns_mask[src_index++]) columns[res_index++]->insert(part->getMarksCount()); if (columns_mask[src_index++]) diff --git a/src/Storages/System/StorageSystemTables.cpp b/src/Storages/System/StorageSystemTables.cpp index ac52f0afb32..24e3fe4f7a9 100644 --- a/src/Storages/System/StorageSystemTables.cpp +++ b/src/Storages/System/StorageSystemTables.cpp @@ -88,6 +88,26 @@ static ColumnPtr getFilteredDatabases(const SelectQueryInfo & query_info, Contex return block.getByPosition(0).column; } +static ColumnPtr getFilteredTables(const ASTPtr & query, const ColumnPtr & filtered_databases_column, ContextPtr context) +{ + MutableColumnPtr column = ColumnString::create(); + + for (size_t database_idx = 0; database_idx < filtered_databases_column->size(); ++database_idx) + { + const auto & database_name = filtered_databases_column->getDataAt(database_idx).toString(); + DatabasePtr database = DatabaseCatalog::instance().tryGetDatabase(database_name); + if (!database) + continue; + + for (auto table_it = database->getTablesIterator(context); table_it->isValid(); table_it->next()) + column->insert(table_it->name()); + } + + Block block {ColumnWithTypeAndName(std::move(column), std::make_shared(), "name")}; + VirtualColumnUtils::filterBlockWithQuery(query, block, context); + return block.getByPosition(0).column; +} + /// Avoid heavy operation on tables if we only queried columns that we can get without table object. /// Otherwise it will require table initialization for Lazy database. static bool needLockStructure(const DatabasePtr & database, const Block & header) @@ -112,12 +132,19 @@ public: Block header, UInt64 max_block_size_, ColumnPtr databases_, + ColumnPtr tables_, ContextPtr context_) : SourceWithProgress(std::move(header)) , columns_mask(std::move(columns_mask_)) , max_block_size(max_block_size_) , databases(std::move(databases_)) - , context(Context::createCopy(context_)) {} + , context(Context::createCopy(context_)) + { + size_t size = tables_->size(); + tables.reserve(size); + for (size_t idx = 0; idx < size; ++idx) + tables.insert(tables_->getDataAt(idx).toString()); + } String getName() const override { return "Tables"; } @@ -239,6 +266,9 @@ protected: for (; rows_count < max_block_size && tables_it->isValid(); tables_it->next()) { auto table_name = tables_it->name(); + if (!tables.contains(table_name)) + continue; + if (check_access_for_tables && !access->isGranted(AccessType::SHOW_TABLES, database_name, table_name)) continue; @@ -514,6 +544,7 @@ private: std::vector columns_mask; UInt64 max_block_size; ColumnPtr databases; + NameSet tables; size_t database_idx = 0; DatabaseTablesIteratorPtr tables_it; ContextPtr context; @@ -552,9 +583,10 @@ Pipe StorageSystemTables::read( } ColumnPtr filtered_databases_column = getFilteredDatabases(query_info, context); + ColumnPtr filtered_tables_column = getFilteredTables(query_info.query, filtered_databases_column, context); return Pipe(std::make_shared( - std::move(columns_mask), std::move(res_block), max_block_size, std::move(filtered_databases_column), context)); + std::move(columns_mask), std::move(res_block), max_block_size, std::move(filtered_databases_column), std::move(filtered_tables_column), context)); } } diff --git a/src/Storages/WindowView/StorageWindowView.cpp b/src/Storages/WindowView/StorageWindowView.cpp index 25ecc0e16ef..a81a5a9649a 100644 --- a/src/Storages/WindowView/StorageWindowView.cpp +++ b/src/Storages/WindowView/StorageWindowView.cpp @@ 
-11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -91,10 +92,6 @@ namespace data.is_hop = t->name == "hop"; auto temp_node = t->clone(); temp_node->setAlias(""); - if (startsWith(t->arguments->children[0]->getColumnName(), "toDateTime")) - throw Exception( - "The first argument of time window function should not be a constant value.", - ErrorCodes::QUERY_IS_NOT_SUPPORTED_IN_WINDOW_VIEW); if (!data.window_function) { data.serialized_window_function = serializeAST(*temp_node); @@ -565,7 +562,13 @@ std::shared_ptr StorageWindowView::getInnerTableCreateQuery( inner_create_query->setDatabase(database_name); inner_create_query->setTable(table_name); - auto inner_select_query = std::static_pointer_cast(inner_query); + Aliases aliases; + QueryAliasesVisitor(aliases).visit(inner_query); + auto inner_query_normalized = inner_query->clone(); + QueryNormalizer::Data normalizer_data(aliases, {}, false, getContext()->getSettingsRef(), false); + QueryNormalizer(normalizer_data).visit(inner_query_normalized); + + auto inner_select_query = std::static_pointer_cast(inner_query_normalized); auto t_sample_block = InterpreterSelectQuery( @@ -582,6 +585,8 @@ std::shared_ptr StorageWindowView::getInnerTableCreateQuery( columns_list->children.push_back(column_window); } + bool has_window_id = false; + for (const auto & column : t_sample_block.getColumnsWithTypeAndName()) { ParserIdentifierWithOptionalParameters parser; @@ -591,8 +596,18 @@ std::shared_ptr StorageWindowView::getInnerTableCreateQuery( column_dec->name = column.name; column_dec->type = ast; columns_list->children.push_back(column_dec); + if (!is_time_column_func_now && !has_window_id) + { + if (startsWith(column.name, "windowID")) + has_window_id = true; + } } + if (!is_time_column_func_now && !has_window_id) + throw Exception( + "The first argument of time window function should not be a constant value.", + ErrorCodes::QUERY_IS_NOT_SUPPORTED_IN_WINDOW_VIEW); + ToIdentifierMatcher::Data query_data; query_data.window_id_name = window_id_name; query_data.window_id_alias = window_id_alias; @@ -634,10 +649,15 @@ std::shared_ptr StorageWindowView::getInnerTableCreateQuery( /// tumble/hop -> windowID func_window_visitor.visit(node); to_identifier_visitor.visit(node); + QueryNormalizer(normalizer_data).visit(node); + node->setAlias(""); new_storage->set(field, node); } }; + for (auto & [alias_name, ast] : aliases) + ast = std::make_shared(ast->getColumnName()); + visit(storage->partition_by, new_storage->partition_by); visit(storage->primary_key, new_storage->primary_key); visit(storage->order_by, new_storage->order_by); @@ -877,12 +897,12 @@ void StorageWindowView::threadFuncFireEvent() std::unique_lock lock(fire_signal_mutex); while (!shutdown_called) { - LOG_TRACE(log, "Fire events: {}", fire_signal.size()); - bool signaled = std::cv_status::no_timeout == fire_signal_condition.wait_for(lock, std::chrono::seconds(5)); if (!signaled) continue; + LOG_TRACE(log, "Fire events: {}", fire_signal.size()); + while (!fire_signal.empty()) { fire(fire_signal.front()); diff --git a/src/TableFunctions/ITableFunction.cpp b/src/TableFunctions/ITableFunction.cpp index fa7f6e52220..42b24abdbbe 100644 --- a/src/TableFunctions/ITableFunction.cpp +++ b/src/TableFunctions/ITableFunction.cpp @@ -15,25 +15,23 @@ namespace DB { StoragePtr ITableFunction::execute(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, - ColumnsDescription cached_columns) const + ColumnsDescription cached_columns, bool use_global_context) 
const { ProfileEvents::increment(ProfileEvents::TableFunctionExecute); context->checkAccess(AccessType::CREATE_TEMPORARY_TABLE | StorageFactory::instance().getSourceAccessType(getStorageTypeName())); + auto context_to_use = use_global_context ? context->getGlobalContext() : context; + if (cached_columns.empty()) return executeImpl(ast_function, context, table_name, std::move(cached_columns)); - /// We have table structure, so it's CREATE AS table_function(). - /// We should use global context here because there will be no query context on server startup - /// and because storage lifetime is bigger than query context lifetime. - auto global_context = context->getGlobalContext(); if (hasStaticStructure() && cached_columns == getActualTableStructure(context)) - return executeImpl(ast_function, global_context, table_name, std::move(cached_columns)); + return executeImpl(ast_function, context_to_use, table_name, std::move(cached_columns)); auto this_table_function = shared_from_this(); auto get_storage = [=]() -> StoragePtr { - return this_table_function->executeImpl(ast_function, global_context, table_name, cached_columns); + return this_table_function->executeImpl(ast_function, context_to_use, table_name, cached_columns); }; /// It will request actual table structure and create underlying storage lazily diff --git a/src/TableFunctions/ITableFunction.h b/src/TableFunctions/ITableFunction.h index 56147ffd598..93cf5057e88 100644 --- a/src/TableFunctions/ITableFunction.h +++ b/src/TableFunctions/ITableFunction.h @@ -54,7 +54,7 @@ public: /// Create storage according to the query. StoragePtr - execute(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns_ = {}) const; + execute(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns_ = {}, bool use_global_context = false) const; virtual ~ITableFunction() = default; diff --git a/src/TableFunctions/ITableFunctionFileLike.cpp b/src/TableFunctions/ITableFunctionFileLike.cpp index 699ad698bd8..4395c318983 100644 --- a/src/TableFunctions/ITableFunctionFileLike.cpp +++ b/src/TableFunctions/ITableFunctionFileLike.cpp @@ -1,4 +1,3 @@ -#include #include #include @@ -6,16 +5,16 @@ #include #include -#include #include #include -#include #include #include +#include + namespace DB { @@ -23,10 +22,27 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int INCORRECT_FILE_NAME; extern const int BAD_ARGUMENTS; } +namespace +{ + void checkIfFormatSupportsAutoStructure(const String & name, const String & format) + { + if (name == "file" && format == "Distributed") + return; + + if (FormatFactory::instance().checkIfFormatHasAnySchemaReader(format)) + return; + + throw Exception( + "Table function '" + name + + "' allows automatic structure determination only for formats that support schema inference and for Distributed format in table function " + "'file'", + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + } +} + void ITableFunctionFileLike::parseArguments(const ASTPtr & ast_function, ContextPtr context) { /// Parse args @@ -46,21 +62,23 @@ void ITableFunctionFileLike::parseArguments(const ASTPtr & ast_function, Context filename = args[0]->as().value.safeGet(); format = args[1]->as().value.safeGet(); - if (args.size() == 2 && getName() == "file") + if (args.size() == 2) { - if (format == "Distributed") - return; - throw Exception("Table function '" + getName() + "' allows 2 
arguments only for Distributed format.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + checkIfFormatSupportsAutoStructure(getName(), format); + return; } if (args.size() != 3 && args.size() != 4) - throw Exception("Table function '" + getName() + "' requires 3 or 4 arguments: filename, format, structure and compression method (default auto).", + throw Exception("Table function '" + getName() + "' requires 2, 3 or 4 arguments: filename, format, structure (default auto) and compression method (default auto)", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); structure = args[2]->as().value.safeGet(); + if (structure == "auto") + checkIfFormatSupportsAutoStructure(getName(), format); + if (structure.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Table structure is empty for table function '{}'", + "Table structure is empty for table function '{}'. If you want to use automatic schema inference, use 'auto'", ast_function->formatForErrorMessage()); if (args.size() == 4) @@ -69,25 +87,12 @@ void ITableFunctionFileLike::parseArguments(const ASTPtr & ast_function, Context StoragePtr ITableFunctionFileLike::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const { - auto columns = getActualTableStructure(context); + ColumnsDescription columns; + if (structure != "auto") + columns = parseColumnsListFromString(structure, context); StoragePtr storage = getStorage(filename, format, columns, context, table_name, compression_method); storage->startup(); return storage; } -ColumnsDescription ITableFunctionFileLike::getActualTableStructure(ContextPtr context) const -{ - if (structure.empty()) - { - assert(getName() == "file" && format == "Distributed"); - size_t total_bytes_to_read = 0; - Strings paths = StorageFile::getPathsList(filename, context->getUserFilesPath(), context, total_bytes_to_read); - if (paths.empty()) - throw Exception("Cannot get table structure from file, because no files match specified name", ErrorCodes::INCORRECT_FILE_NAME); - auto source = StorageDistributedDirectoryMonitor::createSourceFromFile(paths[0]); - return ColumnsDescription{source->getOutputs().front().getHeader().getNamesAndTypesList()}; - } - return parseColumnsListFromString(structure, context); -} - } diff --git a/src/TableFunctions/ITableFunctionFileLike.h b/src/TableFunctions/ITableFunctionFileLike.h index 2069f02b0dd..2ceafdee229 100644 --- a/src/TableFunctions/ITableFunctionFileLike.h +++ b/src/TableFunctions/ITableFunctionFileLike.h @@ -8,7 +8,7 @@ class ColumnsDescription; class Context; /* - * function(source, format, structure) - creates a temporary storage from formatted source + * function(source, format, structure[, compression_method]) - creates a temporary storage from formatted source */ class ITableFunctionFileLike : public ITableFunction { @@ -18,7 +18,7 @@ protected: String filename; String format; - String structure; + String structure = "auto"; String compression_method = "auto"; private: @@ -28,8 +28,7 @@ private: const String & source, const String & format, const ColumnsDescription & columns, ContextPtr global_context, const std::string & table_name, const String & compression_method) const = 0; - ColumnsDescription getActualTableStructure(ContextPtr context) const override; - - bool hasStaticStructure() const override { return true; } + bool hasStaticStructure() const override { return structure != "auto"; } }; + } diff --git a/src/TableFunctions/TableFunctionExecutable.cpp 
b/src/TableFunctions/TableFunctionExecutable.cpp index 9edb75b0a69..41ba2db5c33 100644 --- a/src/TableFunctions/TableFunctionExecutable.cpp +++ b/src/TableFunctions/TableFunctionExecutable.cpp @@ -75,7 +75,12 @@ ColumnsDescription TableFunctionExecutable::getActualTableStructure(ContextPtr c StoragePtr TableFunctionExecutable::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const { auto storage_id = StorageID(getDatabaseName(), table_name); - auto storage = StorageExecutable::create(storage_id, script_name, arguments, format, input_queries, getActualTableStructure(context), ConstraintsDescription{}); + auto global_context = context->getGlobalContext(); + ExecutableSettings settings; + settings.script_name = script_name; + settings.script_arguments = std::move(arguments); + + auto storage = StorageExecutable::create(storage_id, format, settings, input_queries, getActualTableStructure(context), ConstraintsDescription{}); storage->startup(); return storage; } diff --git a/src/TableFunctions/TableFunctionFile.cpp b/src/TableFunctions/TableFunctionFile.cpp index d8bdb3b45c4..71aba5494e8 100644 --- a/src/TableFunctions/TableFunctionFile.cpp +++ b/src/TableFunctions/TableFunctionFile.cpp @@ -1,4 +1,5 @@ #include +#include #include "registerTableFunctions.h" #include @@ -9,11 +10,13 @@ namespace DB { + StoragePtr TableFunctionFile::getStorage(const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, const std::string & table_name, const std::string & compression_method_) const { + LOG_DEBUG(&Poco::Logger::get("TableFunctionFile"), "getStorage"); // For `file` table function, we are going to use format settings from the // query context. StorageFile::CommonArguments args{ @@ -30,8 +33,21 @@ StoragePtr TableFunctionFile::getStorage(const String & source, return StorageFile::create(source, global_context->getUserFilesPath(), args); } +ColumnsDescription TableFunctionFile::getActualTableStructure(ContextPtr context) const +{ + if (structure == "auto") + { + size_t total_bytes_to_read = 0; + Strings paths = StorageFile::getPathsList(filename, context->getUserFilesPath(), context, total_bytes_to_read); + return StorageFile::getTableStructureFromData(format, paths, compression_method, std::nullopt, context); + } + + return parseColumnsListFromString(structure, context); +} + void registerTableFunctionFile(TableFunctionFactory & factory) { factory.registerFunction(); } + } diff --git a/src/TableFunctions/TableFunctionFile.h b/src/TableFunctions/TableFunctionFile.h index 460656a7218..f26e4a9c06d 100644 --- a/src/TableFunctions/TableFunctionFile.h +++ b/src/TableFunctions/TableFunctionFile.h @@ -6,7 +6,7 @@ namespace DB { -/* file(path, format, structure) - creates a temporary storage from file +/* file(path, format[, structure, compression]) - creates a temporary storage from file * * The file must be in the clickhouse data directory. * The relative path begins with the clickhouse data directory. 
@@ -20,9 +20,13 @@ public: return name; } + ColumnsDescription getActualTableStructure(ContextPtr context) const override; + private: StoragePtr getStorage( const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, const std::string & table_name, const std::string & compression_method_) const override; const char * getStorageTypeName() const override { return "File"; } -};} +}; + +} diff --git a/src/TableFunctions/TableFunctionHDFS.cpp b/src/TableFunctions/TableFunctionHDFS.cpp index 245674b0e06..b626f563977 100644 --- a/src/TableFunctions/TableFunctionHDFS.cpp +++ b/src/TableFunctions/TableFunctionHDFS.cpp @@ -6,9 +6,11 @@ #include #include #include +#include namespace DB { + StoragePtr TableFunctionHDFS::getStorage( const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, const std::string & table_name, const String & compression_method_) const @@ -24,12 +26,18 @@ StoragePtr TableFunctionHDFS::getStorage( compression_method_); } +ColumnsDescription TableFunctionHDFS::getActualTableStructure(ContextPtr context) const +{ + if (structure == "auto") + return StorageHDFS::getTableStructureFromData(format, filename, compression_method, context); + + return parseColumnsListFromString(structure, context); +} -#if USE_HDFS void registerTableFunctionHDFS(TableFunctionFactory & factory) { factory.registerFunction(); } -#endif + } #endif diff --git a/src/TableFunctions/TableFunctionHDFS.h b/src/TableFunctions/TableFunctionHDFS.h index 70bdc67efc8..74139818209 100644 --- a/src/TableFunctions/TableFunctionHDFS.h +++ b/src/TableFunctions/TableFunctionHDFS.h @@ -12,7 +12,7 @@ namespace DB class Context; -/* hdfs(URI, format, structure) - creates a temporary storage from hdfs files +/* hdfs(URI, format[, structure, compression]) - creates a temporary storage from hdfs files * */ class TableFunctionHDFS : public ITableFunctionFileLike @@ -24,6 +24,8 @@ public: return name; } + ColumnsDescription getActualTableStructure(ContextPtr context) const override; + private: StoragePtr getStorage( const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, diff --git a/src/TableFunctions/TableFunctionMySQL.cpp b/src/TableFunctions/TableFunctionMySQL.cpp index e959fa754c9..cfed24caef6 100644 --- a/src/TableFunctions/TableFunctionMySQL.cpp +++ b/src/TableFunctions/TableFunctionMySQL.cpp @@ -37,8 +37,8 @@ void TableFunctionMySQL::parseArguments(const ASTPtr & ast_function, ContextPtr if (!args_func.arguments) throw Exception("Table function 'mysql' must have arguments.", ErrorCodes::LOGICAL_ERROR); - configuration = StorageMySQL::getConfiguration(args_func.arguments->children, context); MySQLSettings mysql_settings; + configuration = StorageMySQL::getConfiguration(args_func.arguments->children, context, mysql_settings); const auto & settings = context->getSettingsRef(); mysql_settings.connect_timeout = settings.external_storage_connect_timeout_sec; mysql_settings.read_write_timeout = settings.external_storage_rw_timeout_sec; diff --git a/src/TableFunctions/TableFunctionPostgreSQL.cpp b/src/TableFunctions/TableFunctionPostgreSQL.cpp index d948f40588f..7e7424be38f 100644 --- a/src/TableFunctions/TableFunctionPostgreSQL.cpp +++ b/src/TableFunctions/TableFunctionPostgreSQL.cpp @@ -45,13 +45,13 @@ ColumnsDescription TableFunctionPostgreSQL::getActualTableStructure(ContextPtr c { const bool use_nulls = context->getSettingsRef().external_table_functions_use_nulls; auto 
connection_holder = connection_pool->get(); - auto columns = fetchPostgreSQLTableStructure( - connection_holder->get(), configuration->table, configuration->schema, use_nulls).columns; + auto columns_info = fetchPostgreSQLTableStructure( + connection_holder->get(), configuration->table, configuration->schema, use_nulls).physical_columns; - if (!columns) + if (!columns_info) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table structure not returned"); - return ColumnsDescription{*columns}; + return ColumnsDescription{columns_info->columns}; } diff --git a/src/TableFunctions/TableFunctionRemote.cpp b/src/TableFunctions/TableFunctionRemote.cpp index f7af6bee7d9..85857011616 100644 --- a/src/TableFunctions/TableFunctionRemote.cpp +++ b/src/TableFunctions/TableFunctionRemote.cpp @@ -60,7 +60,7 @@ void TableFunctionRemote::parseArguments(const ASTPtr & ast_function, ContextPtr * Specific args (remote): sharding_key, or database (in case it is not ASTLiteral). * None of the common arguments is empty at this point, it is checked in getExternalDataSourceConfiguration. */ - auto [common_configuration, storage_specific_args] = named_collection.value(); + auto [common_configuration, storage_specific_args, _] = named_collection.value(); configuration.set(common_configuration); for (const auto & [arg_name, arg_value] : storage_specific_args) diff --git a/src/TableFunctions/TableFunctionRemote.h b/src/TableFunctions/TableFunctionRemote.h index 845c36182dc..976397ddc45 100644 --- a/src/TableFunctions/TableFunctionRemote.h +++ b/src/TableFunctions/TableFunctionRemote.h @@ -27,6 +27,7 @@ public: bool needStructureConversion() const override { return false; } private: + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const override; const char * getStorageTypeName() const override { return "Distributed"; } diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp index e26c282c622..c4be01c6b5c 100644 --- a/src/TableFunctions/TableFunctionS3.cpp +++ b/src/TableFunctions/TableFunctionS3.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include "registerTableFunctions.h" @@ -28,6 +29,7 @@ void TableFunctionS3::parseArguments(const ASTPtr & ast_function, ContextPtr con const auto message = fmt::format( "The signature of table function {} could be the following:\n" \ + " - url, format\n" \ " - url, format, structure\n" \ " - url, format, structure, compression_method\n" \ " - url, access_key_id, secret_access_key, format, structure\n" \ @@ -69,17 +71,32 @@ void TableFunctionS3::parseArguments(const ASTPtr & ast_function, ContextPtr con /// Size -> argument indexes static auto size_to_args = std::map> { + {2, {{"format", 1}}}, {3, {{"format", 1}, {"structure", 2}}}, - {4, {{"format", 1}, {"structure", 2}, {"compression_method", 3}}}, {5, {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"structure", 4}}}, {6, {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"structure", 4}, {"compression_method", 5}}} }; + std::map args_to_idx; + /// For 4 arguments we support 2 possible variants: + /// s3(source, format, structure, compression_method) and s3(source, access_key_id, secret_access_key, format) + /// We can distinguish them by looking at the 4-th argument: check if it's a format name or not. 
+ if (args.size() == 4) + { + auto last_arg = args[3]->as().value.safeGet(); + if (FormatFactory::instance().getAllFormats().contains(last_arg)) + args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}}; + else + args_to_idx = {{"format", 1}, {"structure", 2}, {"compression_method", 3}}; + } + else + { + args_to_idx = size_to_args[args.size()]; + } + /// This argument is always the first configuration.url = args[0]->as().value.safeGet(); - auto & args_to_idx = size_to_args[args.size()]; - if (args_to_idx.contains("format")) configuration.format = args[args_to_idx["format"]]->as().value.safeGet(); @@ -101,6 +118,21 @@ void TableFunctionS3::parseArguments(const ASTPtr & ast_function, ContextPtr con ColumnsDescription TableFunctionS3::getActualTableStructure(ContextPtr context) const { + if (s3_configuration->structure == "auto") + { + return StorageS3::getTableStructureFromData( + s3_configuration->format, + S3::URI(Poco::URI(s3_configuration->url)), + s3_configuration->access_key_id, + s3_configuration->secret_access_key, + context->getSettingsRef().s3_max_connections, + context->getSettingsRef().s3_max_single_read_retries, + s3_configuration->compression_method, + false, + std::nullopt, + context); + } + return parseColumnsListFromString(s3_configuration->structure, context); } @@ -113,6 +145,10 @@ StoragePtr TableFunctionS3::executeImpl(const ASTPtr & /*ast_function*/, Context UInt64 max_single_part_upload_size = context->getSettingsRef().s3_max_single_part_upload_size; UInt64 max_connections = context->getSettingsRef().s3_max_connections; + ColumnsDescription columns; + if (s3_configuration->structure != "auto") + columns = parseColumnsListFromString(s3_configuration->structure, context); + StoragePtr storage = StorageS3::create( s3_uri, s3_configuration->access_key_id, diff --git a/src/TableFunctions/TableFunctionS3.h b/src/TableFunctions/TableFunctionS3.h index 8d4c1391236..374e653072e 100644 --- a/src/TableFunctions/TableFunctionS3.h +++ b/src/TableFunctions/TableFunctionS3.h @@ -13,7 +13,7 @@ namespace DB class Context; -/* s3(source, [access_key_id, secret_access_key,] format, structure) - creates a temporary storage for a file in S3 +/* s3(source, [access_key_id, secret_access_key,] format, structure[, compression]) - creates a temporary storage for a file in S3 */ class TableFunctionS3 : public ITableFunction { @@ -23,7 +23,7 @@ public: { return name; } - bool hasStaticStructure() const override { return true; } + bool hasStaticStructure() const override { return s3_configuration->structure != "auto"; } protected: StoragePtr executeImpl( diff --git a/src/TableFunctions/TableFunctionURL.cpp b/src/TableFunctions/TableFunctionURL.cpp index c3ea30f800f..7c4d7b4a444 100644 --- a/src/TableFunctions/TableFunctionURL.cpp +++ b/src/TableFunctions/TableFunctionURL.cpp @@ -2,11 +2,11 @@ #include "registerTableFunctions.h" #include -#include #include #include #include #include +#include #include @@ -59,20 +59,10 @@ void TableFunctionURL::parseArguments(const ASTPtr & ast_function, ContextPtr co } } - StoragePtr TableFunctionURL::getStorage( const String & source, const String & format_, const ColumnsDescription & columns, ContextPtr global_context, const std::string & table_name, const String & compression_method_) const { - ReadWriteBufferFromHTTP::HTTPHeaderEntries headers; - for (const auto & [header, value] : configuration.headers) - { - auto value_literal = value.safeGet(); - if (header == "Range") - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Range headers are not 
allowed"); - headers.emplace_back(std::make_pair(header, value_literal)); - } - return StorageURL::create( source, StorageID(getDatabaseName(), table_name), @@ -83,10 +73,31 @@ StoragePtr TableFunctionURL::getStorage( String{}, global_context, compression_method_, - headers, + getHeaders(), configuration.http_method); } +ReadWriteBufferFromHTTP::HTTPHeaderEntries TableFunctionURL::getHeaders() const +{ + ReadWriteBufferFromHTTP::HTTPHeaderEntries headers; + for (const auto & [header, value] : configuration.headers) + { + auto value_literal = value.safeGet(); + if (header == "Range") + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Range headers are not allowed"); + headers.emplace_back(std::make_pair(header, value_literal)); + } + return headers; +} + +ColumnsDescription TableFunctionURL::getActualTableStructure(ContextPtr context) const +{ + if (structure == "auto") + return StorageURL::getTableStructureFromData(format, filename, compression_method, getHeaders(), std::nullopt, context); + + return parseColumnsListFromString(structure, context); +} + void registerTableFunctionURL(TableFunctionFactory & factory) { factory.registerFunction(); diff --git a/src/TableFunctions/TableFunctionURL.h b/src/TableFunctions/TableFunctionURL.h index 9425112acb2..798a37dc478 100644 --- a/src/TableFunctions/TableFunctionURL.h +++ b/src/TableFunctions/TableFunctionURL.h @@ -2,6 +2,7 @@ #include #include +#include namespace DB @@ -9,7 +10,7 @@ namespace DB class Context; -/* url(source, format, structure) - creates a temporary storage from url +/* url(source, format[, structure, compression]) - creates a temporary storage from url */ class TableFunctionURL : public ITableFunctionFileLike { @@ -20,6 +21,8 @@ public: return name; } + ColumnsDescription getActualTableStructure(ContextPtr context) const override; + protected: void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; @@ -29,6 +32,8 @@ private: const std::string & table_name, const String & compression_method_) const override; const char * getStorageTypeName() const override { return "URL"; } + ReadWriteBufferFromHTTP::HTTPHeaderEntries getHeaders() const; + URLBasedDataSourceConfiguration configuration; }; diff --git a/tests/ci/build_check.py b/tests/ci/build_check.py index f37ea49e387..0362732403f 100644 --- a/tests/ci/build_check.py +++ b/tests/ci/build_check.py @@ -6,80 +6,98 @@ import json import os import sys import time -from github import Github +from typing import List, Optional, Tuple from env_helper import REPO_COPY, TEMP_PATH, CACHES_PATH, IMAGES_PATH from s3_helper import S3Helper from pr_info import PRInfo -from get_robot_token import get_best_robot_token -from version_helper import get_version_from_repo, update_version_local +from version_helper import ( + ClickHouseVersion, + get_version_from_repo, + update_version_local, +) from ccache_utils import get_ccache_if_not_exists, upload_ccache -from ci_config import CI_CONFIG +from ci_config import CI_CONFIG, BuildConfig from docker_pull_helper import get_image_with_version from tee_popen import TeePopen -def get_build_config(build_check_name, build_name): - if build_check_name == 'ClickHouse build check (actions)': - build_config_name = 'build_config' +def get_build_config(build_check_name: str, build_name: str) -> BuildConfig: + if build_check_name == "ClickHouse build check (actions)": + build_config_name = "build_config" else: raise Exception(f"Unknown build check name {build_check_name}") return CI_CONFIG[build_config_name][build_name] -def 
_can_export_binaries(build_config): - if build_config['package_type'] != 'deb': +def _can_export_binaries(build_config: BuildConfig) -> bool: + if build_config["package_type"] != "deb": return False - if build_config['bundled'] != "bundled": + if build_config["bundled"] != "bundled": return False - if build_config['splitted'] == 'splitted': + if build_config["splitted"] == "splitted": return False - if build_config['sanitizer'] != '': + if build_config["sanitizer"] != "": return True - if build_config['build_type'] != '': + if build_config["build_type"] != "": return True return False -def get_packager_cmd(build_config, packager_path, output_path, build_version, image_version, ccache_path, pr_info): - package_type = build_config['package_type'] - comp = build_config['compiler'] - cmd = f"cd {packager_path} && ./packager --output-dir={output_path} --package-type={package_type} --compiler={comp}" +def get_packager_cmd( + build_config: BuildConfig, + packager_path: str, + output_path: str, + build_version: str, + image_version: str, + ccache_path: str, + pr_info: PRInfo, +) -> str: + package_type = build_config["package_type"] + comp = build_config["compiler"] + cmd = ( + f"cd {packager_path} && ./packager --output-dir={output_path} " + f"--package-type={package_type} --compiler={comp}" + ) - if build_config['build_type']: - cmd += ' --build-type={}'.format(build_config['build_type']) - if build_config['sanitizer']: - cmd += ' --sanitizer={}'.format(build_config['sanitizer']) - if build_config['splitted'] == 'splitted': - cmd += ' --split-binary' - if build_config['tidy'] == 'enable': - cmd += ' --clang-tidy' + if build_config["build_type"]: + cmd += " --build-type={}".format(build_config["build_type"]) + if build_config["sanitizer"]: + cmd += " --sanitizer={}".format(build_config["sanitizer"]) + if build_config["splitted"] == "splitted": + cmd += " --split-binary" + if build_config["tidy"] == "enable": + cmd += " --clang-tidy" - cmd += ' --cache=ccache' - cmd += ' --ccache_dir={}'.format(ccache_path) + cmd += " --cache=ccache" + cmd += " --ccache_dir={}".format(ccache_path) - if 'alien_pkgs' in build_config and build_config['alien_pkgs']: - if pr_info.number == 0 or 'release' in pr_info.labels: - cmd += ' --alien-pkgs rpm tgz' + if "alien_pkgs" in build_config and build_config["alien_pkgs"]: + if pr_info.number == 0 or "release" in pr_info.labels: + cmd += " --alien-pkgs rpm tgz" - cmd += ' --docker-image-version={}'.format(image_version) - cmd += ' --version={}'.format(build_version) + cmd += " --docker-image-version={}".format(image_version) + cmd += " --version={}".format(build_version) if _can_export_binaries(build_config): - cmd += ' --with-binaries=tests' + cmd += " --with-binaries=tests" return cmd -def get_image_name(build_config): - if build_config['package_type'] != 'deb': - return 'clickhouse/binary-builder' + +def get_image_name(build_config: BuildConfig) -> str: + if build_config["package_type"] != "deb": + return "clickhouse/binary-builder" else: - return 'clickhouse/deb-builder' + return "clickhouse/deb-builder" -def build_clickhouse(packager_cmd, logs_path, build_output_path): - build_log_path = os.path.join(logs_path, 'build_log.log') +def build_clickhouse( + packager_cmd: str, logs_path: str, build_output_path: str +) -> Tuple[str, bool]: + build_log_path = os.path.join(logs_path, "build_log.log") + success = False with TeePopen(packager_cmd, build_log_path) as process: retcode = process.wait() if os.path.exists(build_output_path): @@ -88,16 +106,21 @@ def 
build_clickhouse(packager_cmd, logs_path, build_output_path): build_results = [] if retcode == 0: - if len(build_results) != 0: + if len(build_results) > 0: + success = True logging.info("Built successfully") else: - logging.info("Success exit code, but no build artifacts => build failed") + logging.info( + "Success exit code, but no build artifacts => build failed" + ) else: logging.info("Build failed") - return build_log_path, retcode == 0 and len(build_results) > 0 + return build_log_path, success -def get_build_results_if_exists(s3_helper, s3_prefix): +def get_build_results_if_exists( + s3_helper: S3Helper, s3_prefix: str +) -> Optional[List[str]]: try: content = s3_helper.list_prefix(s3_prefix) return content @@ -105,8 +128,19 @@ def get_build_results_if_exists(s3_helper, s3_prefix): logging.info("Got exception %s listing %s", ex, s3_prefix) return None -def create_json_artifact(temp_path, build_name, log_url, build_urls, build_config, elapsed, success): - subprocess.check_call(f"echo 'BUILD_NAME=build_urls_{build_name}' >> $GITHUB_ENV", shell=True) + +def create_json_artifact( + temp_path: str, + build_name: str, + log_url: str, + build_urls: List[str], + build_config: BuildConfig, + elapsed: int, + success: bool, +): + subprocess.check_call( + f"echo 'BUILD_NAME=build_urls_{build_name}' >> $GITHUB_ENV", shell=True + ) result = { "log_url": log_url, @@ -116,48 +150,79 @@ def create_json_artifact(temp_path, build_name, log_url, build_urls, build_confi "status": success, } - json_name = "build_urls_" + build_name + '.json' + json_name = "build_urls_" + build_name + ".json" - print ("Dump json report", result, "to", json_name, "with env", "build_urls_{build_name}") + print( + "Dump json report", + result, + "to", + json_name, + "with env", + "build_urls_{build_name}", + ) - with open(os.path.join(temp_path, json_name), 'w') as build_links: + with open(os.path.join(temp_path, json_name), "w") as build_links: json.dump(result, build_links) -if __name__ == "__main__": +def get_release_or_pr( + pr_info: PRInfo, build_config: BuildConfig, version: ClickHouseVersion +) -> str: + if "release" in pr_info.labels or "release-lts" in pr_info.labels: + # for release pull requests we use branch names prefixes, not pr numbers + return pr_info.head_ref + elif pr_info.number == 0 and build_config["package_type"] != "performance": + # for pushes to master - major version, but not for performance builds + # they havily relies on a fixed path for build package and nobody going + # to deploy them somewhere, so it's ok. 
+ return ".".join(version.as_tuple()[:2]) + # PR number for anything else + return str(pr_info.number) + + +def upload_master_static_binaries( + pr_info: PRInfo, + build_config: BuildConfig, + s3_helper: S3Helper, + build_output_path: str, +): + """Upload binary artifacts to a static S3 links""" + if pr_info.number != 0: + return + elif build_config["package_type"] != "binary": + return + elif build_config["splitted"] == "splitted": + return + elif pr_info.base_ref != "master": + return + + s3_path = "/".join( + (pr_info.base_ref, os.path.basename(build_output_path), "clickhouse") + ) + binary = os.path.join(build_output_path, "clickhouse") + url = s3_helper.upload_build_file_to_s3(binary, s3_path) + print(f"::notice ::Binary static URL: {url}") + + +def main(): logging.basicConfig(level=logging.INFO) - repo_path = REPO_COPY - temp_path = TEMP_PATH - caches_path = CACHES_PATH build_check_name = sys.argv[1] build_name = sys.argv[2] build_config = get_build_config(build_check_name, build_name) - if not os.path.exists(temp_path): - os.makedirs(temp_path) + if not os.path.exists(TEMP_PATH): + os.makedirs(TEMP_PATH) pr_info = PRInfo() - logging.info("Repo copy path %s", repo_path) + logging.info("Repo copy path %s", REPO_COPY) - gh = Github(get_best_robot_token()) - s3_helper = S3Helper('https://s3.amazonaws.com') + s3_helper = S3Helper("https://s3.amazonaws.com") - version = get_version_from_repo(repo_path) - release_or_pr = None - if 'release' in pr_info.labels or 'release-lts' in pr_info.labels: - # for release pull requests we use branch names prefixes, not pr numbers - release_or_pr = pr_info.head_ref - elif pr_info.number == 0 and build_config['package_type'] != "performance": - # for pushes to master - major version, but not for performance builds - # they havily relies on a fixed path for build package and nobody going - # to deploy them somewhere, so it's ok. 
- release_or_pr = ".".join(version.as_tuple()[:2]) - else: - # PR number for anything else - release_or_pr = str(pr_info.number) + version = get_version_from_repo(REPO_COPY) + release_or_pr = get_release_or_pr(pr_info, build_config, version) s3_path_prefix = "/".join((release_or_pr, pr_info.sha, build_name)) @@ -167,14 +232,27 @@ if __name__ == "__main__": if build_results is not None and len(build_results) > 0: logging.info("Some build results found %s", build_results) build_urls = [] - log_url = '' + log_url = "" for url in build_results: - if 'build_log.log' in url: - log_url = 'https://s3.amazonaws.com/clickhouse-builds/' + url.replace('+', '%2B').replace(' ', '%20') + if "build_log.log" in url: + log_url = "https://s3.amazonaws.com/clickhouse-builds/" + url.replace( + "+", "%2B" + ).replace(" ", "%20") else: - build_urls.append('https://s3.amazonaws.com/clickhouse-builds/' + url.replace('+', '%2B').replace(' ', '%20')) - create_json_artifact(temp_path, build_name, log_url, build_urls, build_config, 0, len(build_urls) > 0) - sys.exit(0) + build_urls.append( + "https://s3.amazonaws.com/clickhouse-builds/" + + url.replace("+", "%2B").replace(" ", "%20") + ) + create_json_artifact( + TEMP_PATH, + build_name, + log_url, + build_urls, + build_config, + 0, + len(build_urls) > 0, + ) + return image_name = get_image_name(build_config) docker_image = get_image_with_version(IMAGES_PATH, image_name) @@ -182,65 +260,93 @@ if __name__ == "__main__": logging.info("Got version from repo %s", version.get_version_string()) - version_type = 'testing' - if 'release' in pr_info.labels or 'release-lts' in pr_info.labels: - version_type = 'stable' + version_type = "testing" + if "release" in pr_info.labels or "release-lts" in pr_info.labels: + version_type = "stable" - update_version_local(repo_path, pr_info.sha, version, version_type) + update_version_local(REPO_COPY, pr_info.sha, version, version_type) logging.info("Updated local files with version") logging.info("Build short name %s", build_name) - build_output_path = os.path.join(temp_path, build_name) + build_output_path = os.path.join(TEMP_PATH, build_name) if not os.path.exists(build_output_path): os.makedirs(build_output_path) - ccache_path = os.path.join(caches_path, build_name + '_ccache') + ccache_path = os.path.join(CACHES_PATH, build_name + "_ccache") logging.info("Will try to fetch cache for our build") - get_ccache_if_not_exists(ccache_path, s3_helper, pr_info.number, temp_path) + get_ccache_if_not_exists(ccache_path, s3_helper, pr_info.number, TEMP_PATH) if not os.path.exists(ccache_path): logging.info("cache was not fetched, will create empty dir") os.makedirs(ccache_path) - if build_config['package_type'] == "performance" and pr_info.number != 0: + if build_config["package_type"] == "performance" and pr_info.number != 0: # because perf tests store some information about git commits - subprocess.check_call(f"cd {repo_path} && git fetch origin master:master", shell=True) + subprocess.check_call( + f"cd {REPO_COPY} && git fetch origin master:master", shell=True + ) - packager_cmd = get_packager_cmd(build_config, os.path.join(repo_path, "docker/packager"), build_output_path, version.get_version_string(), image_version, ccache_path, pr_info) + packager_cmd = get_packager_cmd( + build_config, + os.path.join(REPO_COPY, "docker/packager"), + build_output_path, + version.get_version_string(), + image_version, + ccache_path, + pr_info, + ) logging.info("Going to run packager with %s", packager_cmd) - build_clickhouse_log = os.path.join(temp_path, 
"build_log") + build_clickhouse_log = os.path.join(TEMP_PATH, "build_log") if not os.path.exists(build_clickhouse_log): os.makedirs(build_clickhouse_log) start = time.time() - log_path, success = build_clickhouse(packager_cmd, build_clickhouse_log, build_output_path) + log_path, success = build_clickhouse( + packager_cmd, build_clickhouse_log, build_output_path + ) elapsed = int(time.time() - start) - subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {build_output_path}", shell=True) + subprocess.check_call( + f"sudo chown -R ubuntu:ubuntu {build_output_path}", shell=True + ) subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {ccache_path}", shell=True) logging.info("Build finished with %s, log path %s", success, log_path) - logging.info("Will upload cache") - upload_ccache(ccache_path, s3_helper, pr_info.number, temp_path) + upload_ccache(ccache_path, s3_helper, pr_info.number, TEMP_PATH) if os.path.exists(log_path): - log_url = s3_helper.upload_build_file_to_s3(log_path, s3_path_prefix + "/" + os.path.basename(log_path)) + log_url = s3_helper.upload_build_file_to_s3( + log_path, s3_path_prefix + "/" + os.path.basename(log_path) + ) logging.info("Log url %s", log_url) else: logging.info("Build log doesn't exist") - build_urls = s3_helper.upload_build_folder_to_s3(build_output_path, s3_path_prefix, keep_dirs_in_s3_path=False, upload_symlinks=False) + build_urls = s3_helper.upload_build_folder_to_s3( + build_output_path, + s3_path_prefix, + keep_dirs_in_s3_path=False, + upload_symlinks=False, + ) logging.info("Got build URLs %s", build_urls) - print("::notice ::Build URLs: {}".format('\n'.join(build_urls))) + print("::notice ::Build URLs: {}".format("\n".join(build_urls))) print("::notice ::Log URL: {}".format(log_url)) - create_json_artifact(temp_path, build_name, log_url, build_urls, build_config, elapsed, success) + create_json_artifact( + TEMP_PATH, build_name, log_url, build_urls, build_config, elapsed, success + ) + + upload_master_static_binaries(pr_info, build_config, s3_helper, build_output_path) # Fail build job if not successeded if not success: sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index d5f8757ffdf..ba31e8e803c 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -1,5 +1,10 @@ #!/usr/bin/env python3 +from typing import Dict, TypeVar + +ConfValue = TypeVar("ConfValue", str, bool) +BuildConfig = Dict[str, ConfValue] + CI_CONFIG = { "build_config": { "package_release": { @@ -334,4 +339,4 @@ CI_CONFIG = { "required_build": "performance", }, }, -} +} # type: dict diff --git a/tests/ci/docs_release.py b/tests/ci/docs_release.py index 90588848f12..825bca0b68b 100644 --- a/tests/ci/docs_release.py +++ b/tests/ci/docs_release.py @@ -2,6 +2,7 @@ import logging import subprocess import os +import sys from github import Github @@ -13,6 +14,7 @@ from ssh import SSHKey from upload_result_helper import upload_results from docker_pull_helper import get_image_with_version from commit_status_helper import get_commit +from rerun_helper import RerunHelper NAME = "Docs Release (actions)" @@ -22,9 +24,12 @@ if __name__ == "__main__": temp_path = TEMP_PATH repo_path = REPO_COPY - pr_info = PRInfo(need_changed_files=True) - gh = Github(get_best_robot_token()) + pr_info = PRInfo(need_changed_files=True) + rerun_helper = RerunHelper(gh, pr_info, NAME) + if rerun_helper.is_already_finished_by_status(): + logging.info("Check is already finished according to github status, exiting") + sys.exit(0) if not 
os.path.exists(temp_path): os.makedirs(temp_path) diff --git a/tests/ci/integration_test_check.py b/tests/ci/integration_test_check.py index 20e33f2f2dc..e87528dd528 100644 --- a/tests/ci/integration_test_check.py +++ b/tests/ci/integration_test_check.py @@ -33,6 +33,7 @@ IMAGES = [ "clickhouse/integration-test", "clickhouse/kerberos-kdc", "clickhouse/integration-helper", + "clickhouse/dotnet-client", ] def get_json_params_dict(check_name, pr_info, docker_images, run_by_hash_total, run_by_hash_num): diff --git a/tests/ci/keeper_jepsen_check.py b/tests/ci/keeper_jepsen_check.py index 2c2b8b4783f..b7acc92b0f3 100644 --- a/tests/ci/keeper_jepsen_check.py +++ b/tests/ci/keeper_jepsen_check.py @@ -122,7 +122,7 @@ if __name__ == "__main__": logging.info("Start at PR number %s, commit sha %s labels %s", pr_info.number, pr_info.sha, pr_info.labels) - if pr_info.number != 0 and 'jepsen-test' not in pr_info.labels(): + if pr_info.number != 0 and 'jepsen-test' not in pr_info.labels: logging.info("Not jepsen test label in labels list, skipping") sys.exit(0) diff --git a/tests/ci/pr_info.py b/tests/ci/pr_info.py index 812834824b7..331b8a7becf 100644 --- a/tests/ci/pr_info.py +++ b/tests/ci/pr_info.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import json import os -import urllib import requests from unidiff import PatchSet @@ -42,6 +41,10 @@ class PRInfo: github_event = {'commits': 1, 'after': 'HEAD', 'ref': None} self.event = github_event self.changed_files = set([]) + self.body = "" + ref = github_event.get("ref", "refs/head/master") + if ref.startswith('refs/heads/'): + ref = ref[11:] # workflow completed event, used for PRs only if 'action' in github_event and github_event['action'] == 'completed': @@ -68,6 +71,7 @@ class PRInfo: self.base_name = github_event['pull_request']['base']['repo']['full_name'] self.head_ref = github_event['pull_request']['head']['ref'] self.head_name = github_event['pull_request']['head']['repo']['full_name'] + self.body = github_event['pull_request']['body'] if labels_from_api: response = requests.get(f"https://api.github.com/repos/{GITHUB_REPOSITORY}/issues/{self.number}/labels") @@ -94,10 +98,10 @@ class PRInfo: if pull_request is None or pull_request['state'] == 'closed': # it's merged PR to master self.number = 0 self.labels = {} - self.pr_html_url = f"{repo_prefix}/commits/master" - self.base_ref = "master" + self.pr_html_url = f"{repo_prefix}/commits/{ref}" + self.base_ref = ref self.base_name = self.repo_full_name - self.head_ref = "master" + self.head_ref = ref self.head_name = self.repo_full_name self.diff_url = \ f"https://api.github.com/repos/{GITHUB_REPOSITORY}/compare/{github_event['before']}...{self.sha}" @@ -127,10 +131,10 @@ class PRInfo: self.task_url = f"{repo_prefix}/actions/runs/{GITHUB_RUN_ID or '0'}" self.commit_html_url = f"{repo_prefix}/commits/{self.sha}" self.repo_full_name = GITHUB_REPOSITORY - self.pr_html_url = f"{repo_prefix}/commits/master" - self.base_ref = "master" + self.pr_html_url = f"{repo_prefix}/commits/{ref}" + self.base_ref = ref self.base_name = self.repo_full_name - self.head_ref = "master" + self.head_ref = ref self.head_name = self.repo_full_name if need_changed_files: @@ -140,16 +144,15 @@ class PRInfo: if not self.diff_url: raise Exception("Diff URL cannot be find for event") + response = requests.get(self.diff_url) + response.raise_for_status() if 'commits' in self.event and self.number == 0: - response = requests.get(self.diff_url) - response.raise_for_status() diff = response.json() if 'files' in diff: self.changed_files = 
[f['filename'] for f in diff['files']] else: - diff = urllib.request.urlopen(self.diff_url) - diff_object = PatchSet(diff, diff.headers.get_charsets()[0]) + diff_object = PatchSet(response.text) self.changed_files = {f.path for f in diff_object} def get_dict(self): diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 692cda18f20..3fe74a5ca8b 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import sys import logging +import re from github import Github from env_helper import GITHUB_RUN_ID, GITHUB_REPOSITORY, GITHUB_SERVER_URL @@ -8,10 +9,10 @@ from pr_info import PRInfo from get_robot_token import get_best_robot_token from commit_status_helper import get_commit -NAME = 'Run Check (actions)' +NAME = "Run Check (actions)" TRUSTED_ORG_IDS = { - 7409213, # yandex + 7409213, # yandex 28471076, # altinity 54801242, # clickhouse } @@ -22,53 +23,58 @@ DO_NOT_TEST_LABEL = "do not test" # Individual trusted contirbutors who are not in any trusted organization. # Can be changed in runtime: we will append users that we learned to be in # a trusted org, to save GitHub API calls. -TRUSTED_CONTRIBUTORS = {e.lower() for e in [ - "achimbab", - "adevyatova ", # DOCSUP - "Algunenano", # Raúl Marín, Tinybird - "AnaUvarova", # DOCSUP - "anauvarova", # technical writer, Yandex - "annvsh", # technical writer, Yandex - "atereh", # DOCSUP - "azat", - "bharatnc", # Newbie, but already with many contributions. - "bobrik", # Seasoned contributor, CloundFlare - "BohuTANG", - "codyrobert", # Flickerbox engineer - "cwurm", # Employee - "damozhaeva", # DOCSUP - "den-crane", - "flickerbox-tom", # Flickerbox - "gyuton", # technical writer, Yandex - "hagen1778", # Roman Khavronenko, seasoned contributor - "hczhcz", - "hexiaoting", # Seasoned contributor - "ildus", # adjust, ex-pgpro - "javisantana", # a Spanish ClickHouse enthusiast, ex-Carto - "ka1bi4", # DOCSUP - "kirillikoff", # DOCSUP - "kitaisreal", # Seasoned contributor - "kreuzerkrieg", - "lehasm", # DOCSUP - "michon470", # DOCSUP - "MyroTk", # Tester in Altinity - "myrrc", # Michael Kot, Altinity - "nikvas0", - "nvartolomei", - "olgarev", # DOCSUP - "otrazhenia", # Yandex docs contractor - "pdv-ru", # DOCSUP - "podshumok", # cmake expert from QRator Labs - "s-mx", # Maxim Sabyanin, former employee, present contributor - "sevirov", # technical writer, Yandex - "spongedu", # Seasoned contributor - "ucasFL", # Amos Bird's friend - "vdimir", # Employee - "vzakaznikov", - "YiuRULE", - "zlobober", # Developer of YT - "ilejn", # Arenadata, responsible for Kerberized Kafka -]} +TRUSTED_CONTRIBUTORS = { + e.lower() + for e in [ + "achimbab", + "adevyatova ", # DOCSUP + "Algunenano", # Raúl Marín, Tinybird + "AnaUvarova", # DOCSUP + "anauvarova", # technical writer, Yandex + "annvsh", # technical writer, Yandex + "atereh", # DOCSUP + "azat", + "bharatnc", # Newbie, but already with many contributions. 
+ "bobrik", # Seasoned contributor, CloundFlare + "BohuTANG", + "codyrobert", # Flickerbox engineer + "cwurm", # Employee + "damozhaeva", # DOCSUP + "den-crane", + "flickerbox-tom", # Flickerbox + "gyuton", # technical writer, Yandex + "hagen1778", # Roman Khavronenko, seasoned contributor + "hczhcz", + "hexiaoting", # Seasoned contributor + "ildus", # adjust, ex-pgpro + "javisantana", # a Spanish ClickHouse enthusiast, ex-Carto + "ka1bi4", # DOCSUP + "kirillikoff", # DOCSUP + "kitaisreal", # Seasoned contributor + "kreuzerkrieg", + "lehasm", # DOCSUP + "michon470", # DOCSUP + "MyroTk", # Tester in Altinity + "myrrc", # Michael Kot, Altinity + "nikvas0", + "nvartolomei", + "olgarev", # DOCSUP + "otrazhenia", # Yandex docs contractor + "pdv-ru", # DOCSUP + "podshumok", # cmake expert from QRator Labs + "s-mx", # Maxim Sabyanin, former employee, present contributor + "sevirov", # technical writer, Yandex + "spongedu", # Seasoned contributor + "ucasFL", # Amos Bird's friend + "vdimir", # Employee + "vzakaznikov", + "YiuRULE", + "zlobober", # Developer of YT + "ilejn", # Arenadata, responsible for Kerberized Kafka + "thomoco", # ClickHouse + "BoloniniD", # Seasoned contributor, HSE + ] +} def pr_is_by_trusted_user(pr_user_login, pr_user_orgs): @@ -80,33 +86,123 @@ def pr_is_by_trusted_user(pr_user_login, pr_user_orgs): for org_id in pr_user_orgs: if org_id in TRUSTED_ORG_IDS: - logging.info("Org '%s' is trusted; will mark user %s as trusted", org_id, pr_user_login) + logging.info( + "Org '%s' is trusted; will mark user %s as trusted", + org_id, + pr_user_login, + ) return True logging.info("Org '%s' is not trusted", org_id) return False + # Returns whether we should look into individual checks for this PR. If not, it # can be skipped entirely. def should_run_checks_for_pr(pr_info): # Consider the labels and whether the user is trusted. print("Got labels", pr_info.labels) - force_labels = set(['force tests']).intersection(pr_info.labels) + force_labels = set(["force tests"]).intersection(pr_info.labels) if force_labels: - return True, "Labeled '{}'".format(', '.join(force_labels)) + return True, "Labeled '{}'".format(", ".join(force_labels)) - if 'do not test' in pr_info.labels: + if "do not test" in pr_info.labels: return False, "Labeled 'do not test'" - if 'can be tested' not in pr_info.labels and not pr_is_by_trusted_user(pr_info.user_login, pr_info.user_orgs): + if "can be tested" not in pr_info.labels and not pr_is_by_trusted_user( + pr_info.user_login, pr_info.user_orgs + ): return False, "Needs 'can be tested' label" - if 'release' in pr_info.labels or 'pr-backport' in pr_info.labels or 'pr-cherrypick' in pr_info.labels: + if ( + "release" in pr_info.labels + or "pr-backport" in pr_info.labels + or "pr-cherrypick" in pr_info.labels + ): return False, "Don't try new checks for release/backports/cherry-picks" return True, "No special conditions apply" +def check_pr_description(pr_info): + description = pr_info.body + + lines = [ + line + for line in map( + lambda x: x.strip(), description.split("\n") if description else [] + ) + ] + lines = [re.sub(r"\s+", " ", l) for l in lines] + + category = "" + entry = "" + + i = 0 + while i < len(lines): + if re.match(r"(?i)^[>*_ ]*change\s*log\s*category", lines[i]): + i += 1 + if i >= len(lines): + break + # Can have one empty line between header and the category + # itself. Filter it out. + if not lines[i]: + i += 1 + if i >= len(lines): + break + category = re.sub(r"^[-*\s]*", "", lines[i]) + i += 1 + + # Should not have more than one category. 
Require empty line + # after the first found category. + if i >= len(lines): + break + if lines[i]: + second_category = re.sub(r"^[-*\s]*", "", lines[i]) + result_status = ( + "More than one changelog category specified: '" + + category + + "', '" + + second_category + + "'" + ) + return result_status[:140] + + elif re.match( + r"(?i)^[>*_ ]*(short\s*description|change\s*log\s*entry)", lines[i] + ): + i += 1 + # Can have one empty line between header and the entry itself. + # Filter it out. + if i < len(lines) and not lines[i]: + i += 1 + # All following lines until empty one are the changelog entry. + entry_lines = [] + while i < len(lines) and lines[i]: + entry_lines.append(lines[i]) + i += 1 + entry = " ".join(entry_lines) + # Don't accept changelog entries like '...'. + entry = re.sub(r"[#>*_.\- ]", "", entry) + else: + i += 1 + + if not category: + return "Changelog category is empty" + + # Filter out the PR categories that are not for changelog. + if re.match( + r"(?i)doc|((non|in|not|un)[-\s]*significant)|(not[ ]*for[ ]*changelog)", + category, + ): + return "" + + if not entry: + return "Changelog entry required for category '{}'".format(category) + + return "" + + if __name__ == "__main__": logging.basicConfig(level=logging.INFO) @@ -114,15 +210,40 @@ if __name__ == "__main__": can_run, description = should_run_checks_for_pr(pr_info) gh = Github(get_best_robot_token()) commit = get_commit(gh, pr_info.sha) + + description_report = check_pr_description(pr_info)[:139] + if description_report: + print("::notice ::Cannot run, description does not match the template") + url = ( + f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/" + "blob/master/.github/PULL_REQUEST_TEMPLATE.md?plain=1" + ) + commit.create_status( + context=NAME, + description=description_report, + state="failure", + target_url=url, + ) + sys.exit(1) + url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/actions/runs/{GITHUB_RUN_ID}" if not can_run: print("::notice ::Cannot run") - commit.create_status(context=NAME, description=description, state="failure", target_url=url) + commit.create_status( + context=NAME, description=description, state="failure", target_url=url + ) sys.exit(1) else: - if 'pr-documentation' in pr_info.labels or 'pr-doc-fix' in pr_info.labels: - commit.create_status(context=NAME, description="Skipping checks for documentation", state="success", target_url=url) + if "pr-documentation" in pr_info.labels or "pr-doc-fix" in pr_info.labels: + commit.create_status( + context=NAME, + description="Skipping checks for documentation", + state="success", + target_url=url, + ) print("::notice ::Can run, but it's documentation PR, skipping") else: print("::notice ::Can run") - commit.create_status(context=NAME, description=description, state="pending", target_url=url) + commit.create_status( + context=NAME, description=description, state="pending", target_url=url + ) diff --git a/tests/ci/tee_popen.py b/tests/ci/tee_popen.py index cbb915e6de7..20302dacb97 100644 --- a/tests/ci/tee_popen.py +++ b/tests/ci/tee_popen.py @@ -15,11 +15,19 @@ class TeePopen: self.command = command self.log_file = log_file self.env = env + self.process = None def __enter__(self): - # pylint: disable=W0201 - self.process = Popen(self.command, shell=True, universal_newlines=True, env=self.env, stderr=STDOUT, stdout=PIPE, bufsize=1) - self.log_file = open(self.log_file, 'w', encoding='utf-8') + self.process = Popen( + self.command, + shell=True, + universal_newlines=True, + env=self.env, + stderr=STDOUT, + stdout=PIPE, + bufsize=1, + ) + 
self.log_file = open(self.log_file, "w", encoding="utf-8") return self def __exit__(self, t, value, traceback): diff --git a/tests/ci/workflow_approve_rerun_lambda/app.py b/tests/ci/workflow_approve_rerun_lambda/app.py index f2502f605af..396431a2e5f 100644 --- a/tests/ci/workflow_approve_rerun_lambda/app.py +++ b/tests/ci/workflow_approve_rerun_lambda/app.py @@ -41,6 +41,7 @@ TRUSTED_ORG_IDS = { NEED_RERUN_WORKFLOWS = { 13241696, # PR + 14738810, # DocsRelease 15834118, # Docs 15522500, # MasterCI 15516108, # ReleaseCI @@ -92,6 +93,7 @@ TRUSTED_CONTRIBUTORS = {e.lower() for e in [ "vzakaznikov", "YiuRULE", "zlobober", # Developer of YT + "BoloniniD", # Seasoned contributor, HSE ]} diff --git a/tests/config/executable_pool_dictionary.xml b/tests/config/executable_pool_dictionary.xml index 13f34f0048e..212552a6776 100644 --- a/tests/config/executable_pool_dictionary.xml +++ b/tests/config/executable_pool_dictionary.xml @@ -61,10 +61,11 @@ - + TabSeparated while read read_data; do printf "$read_data\tvalue a\tvalue b\n"; done - + 5 + diff --git a/tests/config/test_function.xml b/tests/config/test_function.xml index 2e31c9677ec..928cbd75c78 100644 --- a/tests/config/test_function.xml +++ b/tests/config/test_function.xml @@ -11,6 +11,6 @@ TabSeparated cd /; clickhouse-local --input-format TabSeparated --output-format TabSeparated --structure 'x UInt64, y UInt64' --query "SELECT x + y FROM table" - 0 + 0 diff --git a/tests/integration/ci-runner.py b/tests/integration/ci-runner.py index 830b8e149f6..6058a332c29 100755 --- a/tests/integration/ci-runner.py +++ b/tests/integration/ci-runner.py @@ -228,6 +228,7 @@ class ClickhouseIntegrationTestsRunner: "clickhouse/mysql-java-client", "clickhouse/mysql-js-client", "clickhouse/mysql-php-client", "clickhouse/postgresql-java-client", "clickhouse/integration-test", "clickhouse/kerberos-kdc", + "clickhouse/dotnet-client", "clickhouse/integration-helper", ] @@ -252,7 +253,7 @@ class ClickhouseIntegrationTestsRunner: logging.info("Executing installation cmd %s", cmd) retcode = subprocess.Popen(cmd, shell=True, stderr=log, stdout=log).wait() if retcode == 0: - logging.info("Instsallation of %s successfull", full_path) + logging.info("Installation of %s successfull", full_path) else: raise Exception("Installation of %s failed", full_path) break diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 0c513f68c32..bb36d3452d7 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -2256,7 +2256,7 @@ class ClickHouseInstance: logging.debug('{} log line(s) matching "{}" appeared in a {:.3f} seconds'.format(repetitions, regexp, wait_duration)) return wait_duration - def file_exists(self, path): + def path_exists(self, path): return self.exec_in_container( ["bash", "-c", "echo $(if [ -e '{}' ]; then echo 'yes'; else echo 'no'; fi)".format(path)]) == 'yes\n' @@ -2694,6 +2694,20 @@ class ClickHouseInstance: if p.exists(self.path): shutil.rmtree(self.path) + def wait_for_path_exists(self, path, seconds): + while seconds > 0: + seconds -= 1 + if self.path_exists(path): + return + time.sleep(1) + + def get_backuped_s3_objects(self, disk, backup_name): + path = f'/var/lib/clickhouse/disks/{disk}/shadow/{backup_name}/store' + self.wait_for_path_exists(path, 10) + command = ['find', path, '-type', 'f', + '-exec', 'grep', '-o', 'r[01]\\{64\\}-file-[[:lower:]]\\{32\\}', '{}', ';'] + return self.exec_in_container(command).split('\n') + class ClickHouseKiller(object): def __init__(self, 
clickhouse_node): diff --git a/tests/integration/helpers/postgres_utility.py b/tests/integration/helpers/postgres_utility.py new file mode 100644 index 00000000000..16461ea3310 --- /dev/null +++ b/tests/integration/helpers/postgres_utility.py @@ -0,0 +1,273 @@ +import psycopg2 +import time +from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT + +postgres_table_template = """ + CREATE TABLE IF NOT EXISTS "{}" ( + key Integer NOT NULL, value Integer, PRIMARY KEY(key)) + """ +postgres_table_template_2 = """ + CREATE TABLE IF NOT EXISTS "{}" ( + key Integer NOT NULL, value1 Integer, value2 Integer, value3 Integer, PRIMARY KEY(key)) + """ +postgres_table_template_3 = """ + CREATE TABLE IF NOT EXISTS "{}" ( + key1 Integer NOT NULL, value1 Integer, key2 Integer NOT NULL, value2 Integer NOT NULL) + """ +postgres_table_template_4 = """ + CREATE TABLE IF NOT EXISTS "{}"."{}" ( + key Integer NOT NULL, value Integer, PRIMARY KEY(key)) + """ +postgres_table_template_5 = """ + CREATE TABLE IF NOT EXISTS "{}" ( + key Integer NOT NULL, value UUID, PRIMARY KEY(key)) + """ + +def get_postgres_conn(ip, port, database=False, auto_commit=True, database_name='postgres_database', replication=False): + if database == True: + conn_string = f"host={ip} port={port} dbname='{database_name}' user='postgres' password='mysecretpassword'" + else: + conn_string = f"host={ip} port={port} user='postgres' password='mysecretpassword'" + + if replication: + conn_string += " replication='database'" + + conn = psycopg2.connect(conn_string) + if auto_commit: + conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) + conn.autocommit = True + return conn + +def create_replication_slot(conn, slot_name='user_slot'): + cursor = conn.cursor() + cursor.execute(f'CREATE_REPLICATION_SLOT {slot_name} LOGICAL pgoutput EXPORT_SNAPSHOT') + result = cursor.fetchall() + print(result[0][0]) # slot name + print(result[0][1]) # start lsn + print(result[0][2]) # snapshot + return result[0][2] + +def drop_replication_slot(conn, slot_name='user_slot'): + cursor = conn.cursor() + cursor.execute(f"select pg_drop_replication_slot('{slot_name}')") + + +def create_postgres_schema(cursor, schema_name): + drop_postgres_schema(cursor, schema_name) + cursor.execute(f'CREATE SCHEMA {schema_name}') + +def drop_postgres_schema(cursor, schema_name): + cursor.execute(f'DROP SCHEMA IF EXISTS {schema_name} CASCADE') + + +def create_postgres_table(cursor, table_name, replica_identity_full=False, template=postgres_table_template): + drop_postgres_table(cursor, table_name) + cursor.execute(template.format(table_name)) + if replica_identity_full: + cursor.execute(f'ALTER TABLE {table_name} REPLICA IDENTITY FULL;') + +def drop_postgres_table(cursor, table_name): + cursor.execute(f"""DROP TABLE IF EXISTS "{table_name}" """) + + +def create_postgres_table_with_schema(cursor, schema_name, table_name): + drop_postgres_table_with_schema(cursor, schema_name, table_name) + cursor.execute(postgres_table_template_4.format(schema_name, table_name)) + +def drop_postgres_table_with_schema(cursor, schema_name, table_name): + cursor.execute(f"""DROP TABLE IF EXISTS "{schema_name}"."{table_name}" """) + + +class PostgresManager: + def __init__(self): + self.created_postgres_db_list = set() + self.created_materialized_postgres_db_list = set() + self.created_ch_postgres_db_list = set() + + def init(self, instance, ip, port): + self.instance = instance + self.ip = ip + self.port = port + self.conn = get_postgres_conn(ip=self.ip, port=self.port) + self.prepare() + + def restart(self): 
+ try: + self.clear() + self.prepare() + except Exception as ex: + self.prepare() + raise ex + + def prepare(self): + conn = get_postgres_conn(ip=self.ip, port=self.port) + cursor = conn.cursor() + self.create_postgres_db(cursor, 'postgres_database') + self.create_clickhouse_postgres_db(ip=self.ip, port=self.port) + + def clear(self): + if self.conn.closed == 0: + self.conn.close() + for db in self.created_materialized_postgres_db_list.copy(): + self.drop_materialized_db(db); + for db in self.created_ch_postgres_db_list.copy(): + self.drop_clickhouse_postgres_db(db) + if len(self.created_postgres_db_list) > 0: + conn = get_postgres_conn(ip=self.ip, port=self.port) + cursor = conn.cursor() + for db in self.created_postgres_db_list.copy(): + self.drop_postgres_db(cursor, db) + + def get_db_cursor(self): + self.conn = get_postgres_conn(ip=self.ip, port=self.port, database=True) + return self.conn.cursor() + + def create_postgres_db(self, cursor, name='postgres_database'): + self.drop_postgres_db(cursor, name) + self.created_postgres_db_list.add(name) + cursor.execute(f"CREATE DATABASE {name}") + + def drop_postgres_db(self, cursor, name='postgres_database'): + cursor.execute(f"DROP DATABASE IF EXISTS {name}") + if name in self.created_postgres_db_list: + self.created_postgres_db_list.remove(name) + + def create_clickhouse_postgres_db(self, ip, port, name='postgres_database', database_name='postgres_database', schema_name=''): + self.drop_clickhouse_postgres_db(name) + self.created_ch_postgres_db_list.add(name) + + if len(schema_name) == 0: + self.instance.query(f''' + CREATE DATABASE {name} + ENGINE = PostgreSQL('{ip}:{port}', '{database_name}', 'postgres', 'mysecretpassword')''') + else: + self.instance.query(f''' + CREATE DATABASE {name} + ENGINE = PostgreSQL('{ip}:{port}', '{database_name}', 'postgres', 'mysecretpassword', '{schema_name}')''') + + def drop_clickhouse_postgres_db(self, name='postgres_database'): + self.instance.query(f'DROP DATABASE IF EXISTS {name}') + if name in self.created_ch_postgres_db_list: + self.created_ch_postgres_db_list.remove(name) + + + def create_materialized_db(self, ip, port, + materialized_database='test_database', postgres_database='postgres_database', + settings=[], table_overrides=''): + self.created_materialized_postgres_db_list.add(materialized_database) + self.instance.query(f"DROP DATABASE IF EXISTS {materialized_database}") + + create_query = f"CREATE DATABASE {materialized_database} ENGINE = MaterializedPostgreSQL('{ip}:{port}', '{postgres_database}', 'postgres', 'mysecretpassword')" + if len(settings) > 0: + create_query += " SETTINGS " + for i in range(len(settings)): + if i != 0: + create_query += ', ' + create_query += settings[i] + create_query += table_overrides + self.instance.query(create_query) + assert materialized_database in self.instance.query('SHOW DATABASES') + + def drop_materialized_db(self, materialized_database='test_database'): + self.instance.query(f'DROP DATABASE IF EXISTS {materialized_database} NO DELAY') + if materialized_database in self.created_materialized_postgres_db_list: + self.created_materialized_postgres_db_list.remove(materialized_database) + assert materialized_database not in self.instance.query('SHOW DATABASES') + + def create_and_fill_postgres_table(self, table_name): + conn = get_postgres_conn(ip=self.ip, port=self.port, database=True) + cursor = conn.cursor() + self.create_and_fill_postgres_table_from_cursor(cursor, table_name) + + def create_and_fill_postgres_table_from_cursor(self, cursor, table_name): + 
create_postgres_table(cursor, table_name); + self.instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, number from numbers(50)") + + def create_and_fill_postgres_tables(self, tables_num, numbers=50): + conn = get_postgres_conn(ip=self.ip, port=self.port, database=True) + cursor = conn.cursor() + self.create_and_fill_postgres_tables_from_cursor(cursor, tables_num, numbers=numbers) + + def create_and_fill_postgres_tables_from_cursor(self, cursor, tables_num, numbers=50): + for i in range(tables_num): + table_name = f'postgresql_replica_{i}' + create_postgres_table(cursor, table_name); + if numbers > 0: + self.instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, number from numbers({numbers})") + + +queries = [ + 'INSERT INTO postgresql_replica_{} select i, i from generate_series(0, 10000) as t(i);', + 'DELETE FROM postgresql_replica_{} WHERE (value*value) % 3 = 0;', + 'UPDATE postgresql_replica_{} SET value = value - 125 WHERE key % 2 = 0;', + "UPDATE postgresql_replica_{} SET key=key+20000 WHERE key%2=0", + 'INSERT INTO postgresql_replica_{} select i, i from generate_series(40000, 50000) as t(i);', + 'DELETE FROM postgresql_replica_{} WHERE key % 10 = 0;', + 'UPDATE postgresql_replica_{} SET value = value + 101 WHERE key % 2 = 1;', + "UPDATE postgresql_replica_{} SET key=key+80000 WHERE key%2=1", + 'DELETE FROM postgresql_replica_{} WHERE value % 2 = 0;', + 'UPDATE postgresql_replica_{} SET value = value + 2000 WHERE key % 5 = 0;', + 'INSERT INTO postgresql_replica_{} select i, i from generate_series(200000, 250000) as t(i);', + 'DELETE FROM postgresql_replica_{} WHERE value % 3 = 0;', + 'UPDATE postgresql_replica_{} SET value = value * 2 WHERE key % 3 = 0;', + "UPDATE postgresql_replica_{} SET key=key+500000 WHERE key%2=1", + 'INSERT INTO postgresql_replica_{} select i, i from generate_series(1000000, 1050000) as t(i);', + 'DELETE FROM postgresql_replica_{} WHERE value % 9 = 2;', + "UPDATE postgresql_replica_{} SET key=key+10000000", + 'UPDATE postgresql_replica_{} SET value = value + 2 WHERE key % 3 = 1;', + 'DELETE FROM postgresql_replica_{} WHERE value%5 = 0;' + ] + + +def assert_nested_table_is_created(instance, table_name, materialized_database='test_database', schema_name=''): + if len(schema_name) == 0: + table = table_name + else: + table = schema_name + "." 
+ table_name + + print(f'Checking table {table} exists in {materialized_database}') + database_tables = instance.query(f'SHOW TABLES FROM {materialized_database}') + + while table not in database_tables: + time.sleep(0.2) + database_tables = instance.query(f'SHOW TABLES FROM {materialized_database}') + + assert(table in database_tables) + + +def assert_number_of_columns(instance, expected, table_name, database_name='test_database'): + result = instance.query(f"select count() from system.columns where table = '{table_name}' and database = '{database_name}' and not startsWith(name, '_')") + while (int(result) != expected): + time.sleep(1) + result = instance.query(f"select count() from system.columns where table = '{table_name}' and database = '{database_name}' and not startsWith(name, '_')") + print('Number of columns ok') + + +def check_tables_are_synchronized(instance, table_name, order_by='key', postgres_database='postgres_database', materialized_database='test_database', schema_name=''): + assert_nested_table_is_created(instance, table_name, materialized_database, schema_name) + + table_path = '' + if len(schema_name) == 0: + table_path = f'{materialized_database}.{table_name}' + else: + table_path = f'{materialized_database}.`{schema_name}.{table_name}`' + + print(f"Checking table is synchronized: {table_path}") + result_query = f'select * from {table_path} order by {order_by};' + + expected = instance.query(f'select * from {postgres_database}.{table_name} order by {order_by};') + result = instance.query(result_query) + + for _ in range(30): + if result == expected: + break + else: + time.sleep(0.5) + result = instance.query(result_query) + + assert(result == expected) + + +def check_several_tables_are_synchronized(instance, tables_num, order_by='key', postgres_database='postgres_database', materialized_database='test_database', schema_name=''): + for i in range(tables_num): + check_tables_are_synchronized(instance, f'postgresql_replica_{i}'); diff --git a/tests/integration/helpers/test_tools.py b/tests/integration/helpers/test_tools.py index 3577553be34..ec3841f79d7 100644 --- a/tests/integration/helpers/test_tools.py +++ b/tests/integration/helpers/test_tools.py @@ -100,3 +100,19 @@ def exec_query_with_retry(instance, query, retry_count=40, sleep_time=0.5, silen time.sleep(sleep_time) else: raise exception + +def csv_compare(result, expected): + csv_result = TSV(result) + csv_expected = TSV(expected) + mismatch = [] + max_len = len(csv_result) if len(csv_result) > len(csv_expected) else len(csv_expected) + for i in range(max_len): + if i >= len(csv_result): + mismatch.append("-[%d]=%s" % (i, csv_expected.lines[i])) + elif i >= len(csv_expected): + mismatch.append("+[%d]=%s" % (i, csv_result.lines[i])) + elif csv_expected.lines[i] != csv_result.lines[i]: + mismatch.append("-[%d]=%s" % (i, csv_expected.lines[i])) + mismatch.append("+[%d]=%s" % (i, csv_result.lines[i])) + + return "\n".join(mismatch) diff --git a/tests/integration/runner b/tests/integration/runner index 4d01b9737d1..3687ca4068c 100755 --- a/tests/integration/runner +++ b/tests/integration/runner @@ -226,6 +226,8 @@ if __name__ == "__main__": [image, tag] = img_tag.split(":") if image == "clickhouse/mysql-golang-client": env_tags += "-e {}={} ".format("DOCKER_MYSQL_GOLANG_CLIENT_TAG", tag) + elif image == "clickhouse/dotnet-client": + env_tags += "-e {}={} ".format("DOCKER_DOTNET_CLIENT_TAG", tag) elif image == "clickhouse/mysql-java-client": env_tags += "-e {}={} ".format("DOCKER_MYSQL_JAVA_CLIENT_TAG", tag) elif image 
== "clickhouse/mysql-js-client": @@ -237,7 +239,7 @@ if __name__ == "__main__": elif image == "clickhouse/integration-test": env_tags += "-e {}={} ".format("DOCKER_BASE_TAG", tag) elif image == "clickhouse/kerberos-kdc": - env_tags += "-e {}={}".format("DOCKER_KERBEROS_KDC_TAG", tag) + env_tags += "-e {}={} ".format("DOCKER_KERBEROS_KDC_TAG", tag) else: logging.info("Unknown image %s" % (image)) diff --git a/tests/integration/test_concurrent_queries_restriction_by_query_kind/__init__.py b/tests/integration/test_concurrent_queries_restriction_by_query_kind/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_concurrent_queries_restriction_by_query_kind/configs/concurrent_insert_restriction.xml b/tests/integration/test_concurrent_queries_restriction_by_query_kind/configs/concurrent_insert_restriction.xml new file mode 100644 index 00000000000..7753c579902 --- /dev/null +++ b/tests/integration/test_concurrent_queries_restriction_by_query_kind/configs/concurrent_insert_restriction.xml @@ -0,0 +1,3 @@ + + 2 + diff --git a/tests/integration/test_concurrent_queries_restriction_by_query_kind/configs/concurrent_select_restriction.xml b/tests/integration/test_concurrent_queries_restriction_by_query_kind/configs/concurrent_select_restriction.xml new file mode 100644 index 00000000000..c8f081e6804 --- /dev/null +++ b/tests/integration/test_concurrent_queries_restriction_by_query_kind/configs/concurrent_select_restriction.xml @@ -0,0 +1,3 @@ + + 2 + diff --git a/tests/integration/test_concurrent_queries_restriction_by_query_kind/test.py b/tests/integration/test_concurrent_queries_restriction_by_query_kind/test.py new file mode 100644 index 00000000000..2d16d9157f6 --- /dev/null +++ b/tests/integration/test_concurrent_queries_restriction_by_query_kind/test.py @@ -0,0 +1,77 @@ +import time +from multiprocessing.dummy import Pool + +import pytest +from helpers.cluster import ClickHouseCluster + + +cluster = ClickHouseCluster(__file__) +node_insert = cluster.add_instance('node_insert', main_configs=['configs/concurrent_insert_restriction.xml']) +node_select = cluster.add_instance('node_select', main_configs=['configs/concurrent_select_restriction.xml']) + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + node_select.query("create table test_concurrent_insert (x UInt64) ENGINE = MergeTree() order by tuple()") + node_insert.query("create table test_concurrent_insert (x UInt64) ENGINE = MergeTree() order by tuple()") + yield cluster + finally: + cluster.shutdown() + + +def execute_with_background(node, sql, background_sql, background_times, wait_times=3): + r = None + for _ in range(wait_times): + r = node.query('show processlist', stdin='') + if not r.strip(): + break + time.sleep(1) + else: + assert False, "there are unknown background queries: {}".format(r) + for _ in range(background_times): + node.get_query_request(background_sql, stdin='') + time.sleep(0.5) # wait background to start. 
+ return node.query(sql, stdin='') + + +def common_pattern(node, query_kind, restricted_sql, normal_sql, limit, wait_times): + # restriction is working + with pytest.raises(Exception, match=r".*Too many simultaneous {} queries.*".format(query_kind)): + execute_with_background(node, restricted_sql, restricted_sql, limit, wait_times) + + # different query kind is independent + execute_with_background(node, normal_sql, restricted_sql, limit, wait_times) + + # normal + execute_with_background(node, restricted_sql, '', 0, wait_times) + + +def test_select(started_cluster): + common_pattern( + node_select, 'select', + 'select sleep(3)', + 'insert into test_concurrent_insert values (0)', + 2, + 10 + ) + + # subquery is not counted + execute_with_background( + node_select, + 'select sleep(3)', + 'insert into test_concurrent_insert select sleep(3)', + 2, + 10 + ) + + +def test_insert(started_cluster): + common_pattern( + node_insert, 'insert', + 'insert into test_concurrent_insert select sleep(3)', + 'select 1', + 2, + 10 + ) diff --git a/tests/integration/test_config_xml_full/configs/config.xml b/tests/integration/test_config_xml_full/configs/config.xml index c277ff7341f..76eceedbcea 100644 --- a/tests/integration/test_config_xml_full/configs/config.xml +++ b/tests/integration/test_config_xml_full/configs/config.xml @@ -639,6 +639,24 @@ + + + + localhost + 9440 + + + + + + + + localhost + 9440 + + + + diff --git a/tests/integration/test_config_yaml_full/configs/config.yaml b/tests/integration/test_config_yaml_full/configs/config.yaml index 5958d463d21..21cf439f7ec 100644 --- a/tests/integration/test_config_yaml_full/configs/config.yaml +++ b/tests/integration/test_config_yaml_full/configs/config.yaml @@ -100,6 +100,12 @@ remote_servers: host: localhost port: 9440 secure: 1 + test_shard_localhost_secure_empty_tag: + shard: + replica: + host: localhost + port: 9440 + secure: test_unavailable_shard: shard: - replica: diff --git a/tests/integration/test_dictionaries_mysql/configs/named_collections.xml b/tests/integration/test_dictionaries_mysql/configs/named_collections.xml index e6e8d0c239f..6e4098c4e4a 100644 --- a/tests/integration/test_dictionaries_mysql/configs/named_collections.xml +++ b/tests/integration/test_dictionaries_mysql/configs/named_collections.xml @@ -21,5 +21,14 @@ test
     <table>test_table</table>
+    <mysql4>
+        <user>root</user>
+        <password>clickhouse</password>
+        <host>mysql57</host>
+        <port>3306</port>
+        <database>test</database>
+        <table>test_table</table>
+        <connection_pool_size>0</connection_pool_size>
+    </mysql4>
diff --git a/tests/integration/test_dictionaries_mysql/test.py b/tests/integration/test_dictionaries_mysql/test.py index c1819923523..664fde2baa8 100644 --- a/tests/integration/test_dictionaries_mysql/test.py +++ b/tests/integration/test_dictionaries_mysql/test.py @@ -205,6 +205,39 @@ def test_predefined_connection_configuration(started_cluster): result = instance.query("SELECT dictGetUInt32(dict, 'value', toUInt64(100))") assert(int(result) == 200) + instance.query(''' + DROP DICTIONARY IF EXISTS dict; + CREATE DICTIONARY dict (id UInt32, value UInt32) + PRIMARY KEY id + SOURCE(MYSQL(NAME mysql1 connection_pool_size 0)) + LIFETIME(MIN 1 MAX 2) + LAYOUT(HASHED()); + ''') + result = instance.query_and_get_error("SELECT dictGetUInt32(dict, 'value', toUInt64(100))") + assert 'Connection pool cannot have zero size' in result + + instance.query(''' + DROP DICTIONARY IF EXISTS dict; + CREATE DICTIONARY dict (id UInt32, value UInt32) + PRIMARY KEY id + SOURCE(MYSQL(NAME mysql4)) + LIFETIME(MIN 1 MAX 2) + LAYOUT(HASHED()); + ''') + result = instance.query_and_get_error("SELECT dictGetUInt32(dict, 'value', toUInt64(100))") + assert 'Connection pool cannot have zero size' in result + + instance.query(''' + DROP DICTIONARY IF EXISTS dict; + CREATE DICTIONARY dict (id UInt32, value UInt32) + PRIMARY KEY id + SOURCE(MYSQL(NAME mysql4 connection_pool_size 1)) + LIFETIME(MIN 1 MAX 2) + LAYOUT(HASHED()); + ''') + result = instance.query("SELECT dictGetUInt32(dict, 'value', toUInt64(100))") + assert(int(result) == 200) + def create_mysql_db(mysql_connection, name): with mysql_connection.cursor() as cursor: diff --git a/tests/integration/test_dictionaries_postgresql/test.py b/tests/integration/test_dictionaries_postgresql/test.py index 8869e9112d1..ce295e11586 100644 --- a/tests/integration/test_dictionaries_postgresql/test.py +++ b/tests/integration/test_dictionaries_postgresql/test.py @@ -369,6 +369,29 @@ def test_predefined_connection_configuration(started_cluster): assert(int(result.strip()) == 99) +def test_bad_configuration(started_cluster): + conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True) + cursor = conn.cursor() + + node1.query(''' + DROP DICTIONARY IF EXISTS postgres_dict; + CREATE DICTIONARY postgres_dict (id UInt32, value UInt32) + PRIMARY KEY id + SOURCE(POSTGRESQL( + port 5432 + host 'postgres1' + user 'postgres' + password 'mysecretpassword' + dbbb 'clickhouse' + table 'test_schema.test_table')) + LIFETIME(MIN 1 MAX 2) + LAYOUT(HASHED()); + ''') + + node1.query_and_get_error("SELECT dictGetUInt32(postgres_dict, 'value', toUInt64(1))") + assert node1.contains_in_log('Unexpected key `dbbb`') + + if __name__ == '__main__': cluster.start() input("Cluster created, press any key to destroy...") diff --git a/tests/integration/test_dictionaries_update_and_reload/test.py b/tests/integration/test_dictionaries_update_and_reload/test.py index 8e375b7b327..9bee5db8ce1 100644 --- a/tests/integration/test_dictionaries_update_and_reload/test.py +++ b/tests/integration/test_dictionaries_update_and_reload/test.py @@ -203,7 +203,7 @@ def test_reload_after_fail_by_timer(started_cluster): instance.copy_file_to_container(os.path.join(SCRIPT_DIR, "configs/dictionaries/file.txt"), "/etc/clickhouse-server/dictionaries/no_file_2.txt") # Check that file appears in container and wait if needed. 
- while not instance.file_exists("/etc/clickhouse-server/dictionaries/no_file_2.txt"): + while not instance.path_exists("/etc/clickhouse-server/dictionaries/no_file_2.txt"): time.sleep(1) assert("9\t10\n" == instance.exec_in_container(["cat", "/etc/clickhouse-server/dictionaries/no_file_2.txt"])) instance.query("SYSTEM RELOAD DICTIONARY no_file_2") diff --git a/tests/integration/test_dotnet_client/__init__.py b/tests/integration/test_dotnet_client/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_dotnet_client/configs/config.xml b/tests/integration/test_dotnet_client/configs/config.xml new file mode 100644 index 00000000000..9bcadc43f10 --- /dev/null +++ b/tests/integration/test_dotnet_client/configs/config.xml @@ -0,0 +1,16 @@ + + + + trace + /var/log/clickhouse-server/clickhouse-server.log + /var/log/clickhouse-server/clickhouse-server.err.log + 1000M + 10 + + + 8123 + 127.0.0.1 + + ./clickhouse/ + users.xml + diff --git a/tests/integration/test_dotnet_client/configs/users.xml b/tests/integration/test_dotnet_client/configs/users.xml new file mode 100644 index 00000000000..1874371871a --- /dev/null +++ b/tests/integration/test_dotnet_client/configs/users.xml @@ -0,0 +1,32 @@ + + + + + + + + + + 123 + + ::/0 + + default + default + + + + + + ::/0 + + default + default + + + + + + + + diff --git a/tests/integration/test_dotnet_client/dotnet.reference b/tests/integration/test_dotnet_client/dotnet.reference new file mode 100644 index 00000000000..a3d6e1d5ba8 Binary files /dev/null and b/tests/integration/test_dotnet_client/dotnet.reference differ diff --git a/tests/integration/test_dotnet_client/test.py b/tests/integration/test_dotnet_client/test.py new file mode 100644 index 00000000000..4cc16ac826e --- /dev/null +++ b/tests/integration/test_dotnet_client/test.py @@ -0,0 +1,47 @@ +# coding: utf-8 + +import datetime +import math +import os +import time + +import logging +import docker +import pytest +from docker.models.containers import Container +from helpers.cluster import ClickHouseCluster, get_docker_compose_path, run_and_check + +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +DOCKER_COMPOSE_PATH = get_docker_compose_path() + +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance('node', + user_configs=["configs/users.xml"], env_variables={'UBSAN_OPTIONS': 'print_stacktrace=1'}) + +@pytest.fixture(scope="module") +def started_cluster(): + cluster.start() + try: + yield cluster + finally: + cluster.shutdown() + + +@pytest.fixture(scope='module') +def dotnet_container(): + docker_compose = os.path.join(DOCKER_COMPOSE_PATH, 'docker_compose_dotnet_client.yml') + run_and_check( + ['docker-compose', '-p', cluster.project_name, '-f', docker_compose, 'up', '--no-recreate', '-d', '--no-build']) + yield docker.from_env().containers.get(cluster.project_name + '_dotnet1_1') + + +def test_dotnet_client(started_cluster, dotnet_container): + with open(os.path.join(SCRIPT_DIR, 'dotnet.reference'), 'rb') as fp: + reference = fp.read() + + code, (stdout, stderr) = dotnet_container.exec_run( + 'dotnet run --host {host} --port {port} --user default --password 123 --database default' + .format(host=started_cluster.get_instance_ip('node'), port=8123), demux=True) + + assert code == 0 + assert stdout == reference diff --git a/tests/integration/test_executable_dictionary/__init__.py b/tests/integration/test_executable_dictionary/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git 
a/tests/integration/test_executable_dictionary/config/dictionaries_config.xml b/tests/integration/test_executable_dictionary/config/dictionaries_config.xml new file mode 100644 index 00000000000..3cbf717bb67 --- /dev/null +++ b/tests/integration/test_executable_dictionary/config/dictionaries_config.xml @@ -0,0 +1,2 @@ + + diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_input_argument_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_input_argument_python_dictionary.xml new file mode 100644 index 00000000000..ddbb8e95abb --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_input_argument_python_dictionary.xml @@ -0,0 +1,99 @@ + + + executable_input_argument_python + + + TabSeparated + input_argument.py 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_input_argument_pool_python + + + TabSeparated + input_argument.py 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_argument_python + + + TabSeparated + input_implicit_argument.py 1 + 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_argument_pool_python + + + TabSeparated + input_implicit_argument.py 1 + 1 + 1 + + + + + + + + input + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_input_bash_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_input_bash_dictionary.xml new file mode 100644 index 00000000000..488a12de115 --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_input_bash_dictionary.xml @@ -0,0 +1,99 @@ + + + executable_input_bash + + + TabSeparated + input.sh + 1 + + + + + + + + input + + + result + String + + + + + + + executable_input_pool_bash + + + TabSeparated + input.sh + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_bash + + + TabSeparated + input_implicit.sh + 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_pool_bash + + + TabSeparated + input_implicit.sh + 1 + 1 + + + + + + + + input + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_input_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_input_python_dictionary.xml new file mode 100644 index 00000000000..5b551e51951 --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_input_python_dictionary.xml @@ -0,0 +1,99 @@ + + + executable_input_python + + + TabSeparated + input.py + 1 + + + + + + + + input + + + result + String + + + + + + + executable_input_pool_python + + + TabSeparated + input.py + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_python + + + TabSeparated + input_implicit.py + 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_pool_python + + + TabSeparated + input_implicit.py + 1 + 1 + + + + + + + + input + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_input_send_chunk_header_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_input_send_chunk_header_python_dictionary.xml new file mode 100644 index 00000000000..816cb0db2c5 --- 
/dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_input_send_chunk_header_python_dictionary.xml @@ -0,0 +1,103 @@ + + + executable_input_send_chunk_header_python + + + TabSeparated + input_chunk_header.py + 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_input_send_chunk_header_pool_python + + + TabSeparated + input_chunk_header.py + 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_send_chunk_header_python + + + TabSeparated + input_implicit_chunk_header.py + 1 + 1 + 1 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_send_chunk_header_pool_python + + + TabSeparated + input_implicit_chunk_header.py + 1 + 1 + 1 + + + + + + + + input + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_input_signalled_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_input_signalled_python_dictionary.xml new file mode 100644 index 00000000000..71f8873b20e --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_input_signalled_python_dictionary.xml @@ -0,0 +1,103 @@ + + + executable_input_signalled_python + + + TabSeparated + input_signalled.py + 1 + 1000 + + + + + + + + input + + + result + String + Default result + + + + + + executable_input_signalled_pool_python + + + TabSeparated + input_signalled.py + 1 + 1000 + + + + + + + + input + + + result + String + Default result + + + + + + executable_implicit_input_signalled_python + + + TabSeparated + input_implicit_signalled.py + 1 + 1 + 1000 + + + + + + + + input + + + result + String + Default result + + + + + + executable_implicit_input_signalled_pool_python + + + TabSeparated + input_implicit_signalled.py + 1 + 1 + 1000 + + + + + + + + input + + + result + String + Default result + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_input_slow_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_input_slow_python_dictionary.xml new file mode 100644 index 00000000000..dee161a9b78 --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_input_slow_python_dictionary.xml @@ -0,0 +1,103 @@ + + + executable_input_slow_python + + + TabSeparated + input_slow.py + 1 + 1000 + + + + + + + + input + + + result + String + + + + + + + executable_input_slow_pool_python + + + TabSeparated + input_slow.py + 1 + 1000 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_slow_python + + + TabSeparated + input_implicit_slow.py + 1 + 1 + 1000 + + + + + + + + input + + + result + String + + + + + + + executable_implicit_input_slow_pool_python + + + TabSeparated + input_implicit_slow.py + 1 + 1 + 1000 + + + + + + + + input + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_input_sum_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_input_sum_python_dictionary.xml new file mode 100644 index 00000000000..3f63e7b8671 --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_input_sum_python_dictionary.xml @@ -0,0 +1,128 @@ + + + executable_input_sum_python + + + TabSeparated + input_sum.py + 1 + + + + + + + + + first_argument + UInt64 + + + second_argument + 
UInt64 + + + + result + UInt64 + + + + + + + executable_input_sum_pool_python + + + TabSeparated + input_sum.py + 1 + + + + + + + + + first_argument + UInt64 + + + second_argument + UInt64 + + + + result + UInt64 + + + + + + + + executable_implicit_input_sum_python + + + TabSeparated + input_implicit_sum.py + 1 + 1 + + + + + + + + + first_argument + UInt64 + + + second_argument + UInt64 + + + + result + UInt64 + + + + + + + executable_implicit_input_sum_pool_python + + + TabSeparated + input_implicit_sum.py + 1 + 1 + + + + + + + + + first_argument + UInt64 + + + second_argument + UInt64 + + + + result + UInt64 + + + + + diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_non_direct_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_non_direct_dictionary.xml new file mode 100644 index 00000000000..3f77dae1ac6 --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_non_direct_dictionary.xml @@ -0,0 +1,95 @@ + + + executable_input_non_direct_bash + + + TabSeparated + while read read_data; do printf "$read_data\tKey $read_data\n"; done + + + + + + + + input + + + result + String + + + + + + + executable_input_non_direct_pool_bash + + + TabSeparated + while read read_data; do printf "$read_data\tKey $read_data\n"; done + + + + + + + + input + + + result + String + + + + + + + executable_input_implicit_non_direct_bash + + + TabSeparated + while read read_data; do printf "Key $read_data\n"; done + 1 + + + + + + + + input + + + result + String + + + + + + + executable_input_implicit_non_direct_pool_bash + + + TabSeparated + while read read_data; do printf "Key $read_data\n"; done + 1 + + + + + + + + input + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_source_argument_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_source_argument_python_dictionary.xml new file mode 100644 index 00000000000..3173eb5500d --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_source_argument_python_dictionary.xml @@ -0,0 +1,54 @@ + + + executable_source_simple_key_argument_python + + + TabSeparated + source_argument.py 1 + 1 + + + + + + 0 + + + input + + + result + String + + + + + + + executable_source_complex_key_argument_python + + + TabSeparated + source_argument.py 1 + 1 + + + + + + 0 + + + + input + UInt64 + + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_source_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_source_python_dictionary.xml new file mode 100644 index 00000000000..a2036fc67bb --- /dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_source_python_dictionary.xml @@ -0,0 +1,54 @@ + + + executable_source_simple_key_python + + + TabSeparated + source.py + 1 + + + + + + 0 + + + input + + + result + String + + + + + + + executable_source_complex_key_python + + + TabSeparated + source.py + 1 + + + + + + 0 + + + + input + UInt64 + + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/dictionaries/executable_source_updated_python_dictionary.xml b/tests/integration/test_executable_dictionary/dictionaries/executable_source_updated_python_dictionary.xml new file mode 100644 index 00000000000..10d1b1ca0c6 --- 
/dev/null +++ b/tests/integration/test_executable_dictionary/dictionaries/executable_source_updated_python_dictionary.xml @@ -0,0 +1,56 @@ + + + executable_source_simple_key_update_python + + + TabSeparated + source_update.py + 1 + 1 + + + + + + 5 + + + input + + + result + String + + + + + + + executable_source_complex_key_update_python + + + TabSeparated + source_update.py + 1 + 1 + + + + + + 5 + + + + input + UInt64 + + + + result + String + + + + + \ No newline at end of file diff --git a/tests/integration/test_executable_dictionary/test.py b/tests/integration/test_executable_dictionary/test.py new file mode 100644 index 00000000000..5e50a092a29 --- /dev/null +++ b/tests/integration/test_executable_dictionary/test.py @@ -0,0 +1,175 @@ +import os +import sys +import time + +import pytest + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance('node', stay_alive=True, main_configs=[]) + + +def skip_test_msan(instance): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer cannot work with vfork") + +def copy_file_to_container(local_path, dist_path, container_id): + os.system("docker cp {local} {cont_id}:{dist}".format(local=local_path, cont_id=container_id, dist=dist_path)) + +config = ''' + /etc/clickhouse-server/dictionaries/*_dictionary.xml +''' + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + node.replace_config("/etc/clickhouse-server/config.d/dictionaries_config.xml", config) + + copy_file_to_container(os.path.join(SCRIPT_DIR, 'dictionaries/.'), '/etc/clickhouse-server/dictionaries', node.docker_id) + copy_file_to_container(os.path.join(SCRIPT_DIR, 'user_scripts/.'), '/var/lib/clickhouse/user_scripts', node.docker_id) + + node.restart_clickhouse() + + yield cluster + + finally: + cluster.shutdown() + +def test_executable_input_bash(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_bash', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_input_pool_bash', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_implicit_input_bash(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_implicit_input_bash', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_implicit_input_pool_bash', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_input_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_python', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_input_pool_python', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_implicit_input_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_implicit_input_python', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_implicit_input_pool_python', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_input_send_chunk_header_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_send_chunk_header_python', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_input_send_chunk_header_pool_python', 'result', toUInt64(1))") == 'Key 1\n' + +def 
test_executable_implicit_input_send_chunk_header_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_implicit_input_send_chunk_header_python', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_implicit_input_send_chunk_header_pool_python', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_input_sum_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_sum_python', 'result', tuple(toUInt64(1), toUInt64(1)))") == '2\n' + assert node.query("SELECT dictGet('executable_input_sum_pool_python', 'result', tuple(toUInt64(1), toUInt64(1)))") == '2\n' + +def test_executable_implicit_input_sum_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_implicit_input_sum_python', 'result', tuple(toUInt64(1), toUInt64(1)))") == '2\n' + assert node.query("SELECT dictGet('executable_implicit_input_sum_pool_python', 'result', tuple(toUInt64(1), toUInt64(1)))") == '2\n' + +def test_executable_input_argument_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_argument_python', 'result', toUInt64(1))") == 'Key 1 1\n' + assert node.query("SELECT dictGet('executable_input_argument_pool_python', 'result', toUInt64(1))") == 'Key 1 1\n' + +def test_executable_implicit_input_argument_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_implicit_input_argument_python', 'result', toUInt64(1))") == 'Key 1 1\n' + assert node.query("SELECT dictGet('executable_implicit_input_argument_pool_python', 'result', toUInt64(1))") == 'Key 1 1\n' + +def test_executable_input_signalled_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_signalled_python', 'result', toUInt64(1))") == 'Default result\n' + assert node.query("SELECT dictGet('executable_input_signalled_pool_python', 'result', toUInt64(1))") == 'Default result\n' + +def test_executable_implicit_input_signalled_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_implicit_input_signalled_python', 'result', toUInt64(1))") == 'Default result\n' + assert node.query("SELECT dictGet('executable_implicit_input_signalled_pool_python', 'result', toUInt64(1))") == 'Default result\n' + +def test_executable_input_slow_python(started_cluster): + skip_test_msan(node) + assert node.query_and_get_error("SELECT dictGet('executable_input_slow_python', 'result', toUInt64(1))") + assert node.query_and_get_error("SELECT dictGet('executable_input_slow_pool_python', 'result', toUInt64(1))") + +def test_executable_implicit_input_slow_python(started_cluster): + skip_test_msan(node) + assert node.query_and_get_error("SELECT dictGet('executable_implicit_input_slow_python', 'result', toUInt64(1))") + assert node.query_and_get_error("SELECT dictGet('executable_implicit_input_slow_pool_python', 'result', toUInt64(1))") + +def test_executable_input_slow_python(started_cluster): + skip_test_msan(node) + assert node.query_and_get_error("SELECT dictGet('executable_input_slow_python', 'result', toUInt64(1))") + assert node.query_and_get_error("SELECT dictGet('executable_input_slow_pool_python', 'result', toUInt64(1))") + +def test_executable_implicit_input_slow_python(started_cluster): + skip_test_msan(node) + assert node.query_and_get_error("SELECT dictGet('executable_implicit_input_slow_python', 'result', toUInt64(1))") + assert 
node.query_and_get_error("SELECT dictGet('executable_implicit_input_slow_pool_python', 'result', toUInt64(1))") + +def test_executable_non_direct_input_bash(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_non_direct_bash', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_input_non_direct_pool_bash', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_implicit_non_direct_input_bash(started_cluster): + skip_test_msan(node) + assert node.query("SELECT dictGet('executable_input_implicit_non_direct_bash', 'result', toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT dictGet('executable_input_implicit_non_direct_pool_bash', 'result', toUInt64(1))") == 'Key 1\n' + +def test_executable_source_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT * FROM dictionary(executable_source_simple_key_python) ORDER BY input") == '1\tValue 1\n2\tValue 2\n3\tValue 3\n' + assert node.query("SELECT dictGet('executable_source_simple_key_python', 'result', toUInt64(1))") == 'Value 1\n' + assert node.query("SELECT dictGet('executable_source_simple_key_python', 'result', toUInt64(2))") == 'Value 2\n' + assert node.query("SELECT dictGet('executable_source_simple_key_python', 'result', toUInt64(3))") == 'Value 3\n' + + assert node.query("SELECT * FROM dictionary('executable_source_complex_key_python') ORDER BY input") == '1\tValue 1\n2\tValue 2\n3\tValue 3\n' + assert node.query("SELECT dictGet('executable_source_complex_key_python', 'result', tuple(toUInt64(1)))") == 'Value 1\n' + assert node.query("SELECT dictGet('executable_source_complex_key_python', 'result', tuple(toUInt64(2)))") == 'Value 2\n' + assert node.query("SELECT dictGet('executable_source_complex_key_python', 'result', tuple(toUInt64(3)))") == 'Value 3\n' + +def test_executable_source_argument_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT * FROM dictionary(executable_source_simple_key_argument_python) ORDER BY input") == '1\tValue 1 1\n2\tValue 1 2\n3\tValue 1 3\n' + assert node.query("SELECT dictGet('executable_source_simple_key_argument_python', 'result', toUInt64(1))") == 'Value 1 1\n' + assert node.query("SELECT dictGet('executable_source_simple_key_argument_python', 'result', toUInt64(2))") == 'Value 1 2\n' + assert node.query("SELECT dictGet('executable_source_simple_key_argument_python', 'result', toUInt64(3))") == 'Value 1 3\n' + + assert node.query("SELECT * FROM dictionary(executable_source_complex_key_argument_python) ORDER BY input") == '1\tValue 1 1\n2\tValue 1 2\n3\tValue 1 3\n' + assert node.query("SELECT dictGet('executable_source_complex_key_argument_python', 'result', toUInt64(1))") == 'Value 1 1\n' + assert node.query("SELECT dictGet('executable_source_complex_key_argument_python', 'result', toUInt64(2))") == 'Value 1 2\n' + assert node.query("SELECT dictGet('executable_source_complex_key_argument_python', 'result', toUInt64(3))") == 'Value 1 3\n' + +def test_executable_source_updated_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT * FROM dictionary(executable_source_simple_key_update_python) ORDER BY input") == '1\tValue 0 1\n' + assert node.query("SELECT dictGet('executable_source_simple_key_update_python', 'result', toUInt64(1))") == 'Value 0 1\n' + + time.sleep(10) + + assert node.query("SELECT * FROM dictionary(executable_source_simple_key_update_python) ORDER BY input") == '1\tValue 1 1\n' + assert node.query("SELECT 
dictGet('executable_source_simple_key_update_python', 'result', toUInt64(1))") == 'Value 1 1\n' + + assert node.query("SELECT * FROM dictionary(executable_source_complex_key_update_python) ORDER BY input") == '1\tValue 0 1\n' + assert node.query("SELECT dictGet('executable_source_complex_key_update_python', 'result', toUInt64(1))") == 'Value 0 1\n' + + time.sleep(10) + + assert node.query("SELECT * FROM dictionary(executable_source_complex_key_update_python) ORDER BY input") == '1\tValue 1 1\n' + assert node.query("SELECT dictGet('executable_source_complex_key_update_python', 'result', toUInt64(1))") == 'Value 1 1\n' + diff --git a/tests/integration/test_executable_dictionary/user_scripts/input.py b/tests/integration/test_executable_dictionary/user_scripts/input.py new file mode 100755 index 00000000000..e711dd8e306 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input.py @@ -0,0 +1,11 @@ +#!/usr/bin/python3 + +import sys +import os +import signal + +if __name__ == '__main__': + for line in sys.stdin: + updated_line = line.replace('\n', '') + print(updated_line + '\t' + "Key " + updated_line, end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input.sh b/tests/integration/test_executable_dictionary/user_scripts/input.sh new file mode 100755 index 00000000000..7712c392951 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +while read read_data; + do printf "$read_data\tKey $read_data\n"; +done diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_argument.py b/tests/integration/test_executable_dictionary/user_scripts/input_argument.py new file mode 100755 index 00000000000..163f9c4183f --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_argument.py @@ -0,0 +1,11 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + arg = int(sys.argv[1]) + + for line in sys.stdin: + updated_line = line.replace('\n', '') + print(updated_line + '\t' + "Key " + str(arg) + " " + updated_line, end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_chunk_header.py b/tests/integration/test_executable_dictionary/user_scripts/input_chunk_header.py new file mode 100755 index 00000000000..4eb00f64eb3 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_chunk_header.py @@ -0,0 +1,15 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + + while chunk_length != 0: + line = sys.stdin.readline() + updated_line = line.replace('\n', '') + chunk_length -= 1 + print(updated_line + '\t' + "Key " + updated_line, end='\n') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_implicit.py b/tests/integration/test_executable_dictionary/user_scripts/input_implicit.py new file mode 100755 index 00000000000..835ab1f441a --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_implicit.py @@ -0,0 +1,8 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for line in sys.stdin: + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_implicit.sh b/tests/integration/test_executable_dictionary/user_scripts/input_implicit.sh new file mode 100755 index 00000000000..aea51b82b1f --- /dev/null +++ 
b/tests/integration/test_executable_dictionary/user_scripts/input_implicit.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +while read read_data; + do printf "Key $read_data\n"; +done diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_implicit_argument.py b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_argument.py new file mode 100755 index 00000000000..c1b2e5966d7 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_argument.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + arg = int(sys.argv[1]) + + for line in sys.stdin: + print("Key " + str(arg) + " " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_implicit_chunk_header.py b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_chunk_header.py new file mode 100755 index 00000000000..5dc03e1c507 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_chunk_header.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_implicit_signalled.py b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_signalled.py new file mode 100755 index 00000000000..27c8bc4840e --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_signalled.py @@ -0,0 +1,13 @@ +#!/usr/bin/python3 + +import sys +import os +import signal +import time + +if __name__ == '__main__': + for line in sys.stdin: + os.signal(os.getpid(), signal.SIGTERM) + + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_implicit_slow.py b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_slow.py new file mode 100755 index 00000000000..648a9eac918 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_slow.py @@ -0,0 +1,12 @@ +#!/usr/bin/python3 + +import sys +import os +import signal +import time + +if __name__ == '__main__': + for line in sys.stdin: + time.sleep(5) + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_implicit_sum.py b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_sum.py new file mode 100755 index 00000000000..432d7a13a2f --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_implicit_sum.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys +import re + +if __name__ == '__main__': + for line in sys.stdin: + line_split = re.split(r'\t+', line) + print(int(line_split[0]) + int(line_split[1]), end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_signalled.py b/tests/integration/test_executable_dictionary/user_scripts/input_signalled.py new file mode 100755 index 00000000000..a3a99f1e71e --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_signalled.py @@ -0,0 +1,13 @@ +#!/usr/bin/python3 + +import sys +import os +import signal +import time + +if __name__ == '__main__': + for line in sys.stdin: + os.signal(os.getpid(), signal.SIGTERM) + 
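+        # Note: os.signal() does not exist in the Python standard library, so this line raises
+        # AttributeError and the script dies before producing any output; the intended call was
+        # presumably os.kill(os.getpid(), signal.SIGTERM). Either way the process aborts, so the
+        # signalled dictionary tests see the configured default value ('Default result').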
updated_line = line.replace('\n', '') + print(updated_line + '\t' + "Key " + updated_line, end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_slow.py b/tests/integration/test_executable_dictionary/user_scripts/input_slow.py new file mode 100755 index 00000000000..a3b8c484b29 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_slow.py @@ -0,0 +1,13 @@ +#!/usr/bin/python3 + +import sys +import os +import signal +import time + +if __name__ == '__main__': + for line in sys.stdin: + time.sleep(5) + updated_line = line.replace('\n', '') + print(updated_line + '\t' + "Key " + updated_line, end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/input_sum.py b/tests/integration/test_executable_dictionary/user_scripts/input_sum.py new file mode 100755 index 00000000000..e9ec5028701 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/input_sum.py @@ -0,0 +1,12 @@ +#!/usr/bin/python3 + +import sys +import re + +if __name__ == '__main__': + for line in sys.stdin: + updated_line = line.replace('\n', '') + line_split = re.split(r'\t+', line) + sum = int(line_split[0]) + int(line_split[1]) + print(updated_line + '\t' + str(sum), end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/source.py b/tests/integration/test_executable_dictionary/user_scripts/source.py new file mode 100755 index 00000000000..e105773c467 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/source.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + print('1' + '\t' + 'Value 1', end='\n') + print('2' + '\t' + 'Value 2', end='\n') + print('3' + '\t' + 'Value 3', end='\n') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/source_argument.py b/tests/integration/test_executable_dictionary/user_scripts/source_argument.py new file mode 100755 index 00000000000..881e73adc97 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/source_argument.py @@ -0,0 +1,12 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + arg = int(sys.argv[1]) + + print('1' + '\t' + 'Value ' + str(arg) + ' 1', end='\n') + print('2' + '\t' + 'Value ' + str(arg) + ' 2', end='\n') + print('3' + '\t' + 'Value ' + str(arg) + ' 3', end='\n') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_dictionary/user_scripts/source_update.py b/tests/integration/test_executable_dictionary/user_scripts/source_update.py new file mode 100755 index 00000000000..99388f9ada3 --- /dev/null +++ b/tests/integration/test_executable_dictionary/user_scripts/source_update.py @@ -0,0 +1,12 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + update_field_value = 0 + + if len(sys.argv) >= 2: + update_field_value = int(sys.argv[1]) + + print('1' + '\t' + 'Value ' + str(update_field_value) + ' 1', end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/test.py b/tests/integration/test_executable_table_function/test.py index f5537e26b94..7820396d20f 100644 --- a/tests/integration/test_executable_table_function/test.py +++ b/tests/integration/test_executable_table_function/test.py @@ -1,6 +1,5 @@ import os import sys -import time import pytest @@ -30,69 +29,353 @@ def started_cluster(): copy_file_to_container(os.path.join(SCRIPT_DIR, 'user_scripts/.'), 
'/var/lib/clickhouse/user_scripts', node.docker_id) node.restart_clickhouse() + node.query("CREATE TABLE test_data_table (id UInt64) ENGINE=TinyLog;") + node.query("INSERT INTO test_data_table VALUES (0), (1), (2);") + yield cluster finally: cluster.shutdown() -def test_executable_function_no_input(started_cluster): +def test_executable_function_no_input_bash(started_cluster): skip_test_msan(node) - assert node.query("SELECT * FROM executable('test_no_input.sh', 'TabSeparated', 'value UInt64')") == '1\n' + assert node.query("SELECT * FROM executable('no_input.sh', 'TabSeparated', 'value String')") == 'Key 0\nKey 1\nKey 2\n' -def test_executable_function_input(started_cluster): +def test_executable_function_no_input_python(started_cluster): skip_test_msan(node) - assert node.query("SELECT * FROM executable('test_input.sh', 'TabSeparated', 'value String', (SELECT 1))") == 'Key 1\n' + assert node.query("SELECT * FROM executable('no_input.py', 'TabSeparated', 'value String')") == 'Key 0\nKey 1\nKey 2\n' -def test_executable_function_input_multiple_pipes(started_cluster): +def test_executable_function_input_bash(started_cluster): skip_test_msan(node) - actual = node.query("SELECT * FROM executable('test_input_multiple_pipes.sh', 'TabSeparated', 'value String', (SELECT 1), (SELECT 2), (SELECT 3))") + + query = "SELECT * FROM executable('input.sh', 'TabSeparated', 'value String', {source})" + assert node.query(query.format(source='(SELECT 1)')) == 'Key 1\n' + assert node.query(query.format(source='(SELECT id FROM test_data_table)')) == 'Key 0\nKey 1\nKey 2\n' + +def test_executable_function_input_python(started_cluster): + skip_test_msan(node) + + query = "SELECT * FROM executable('input.py', 'TabSeparated', 'value String', {source})" + assert node.query(query.format(source='(SELECT 1)')) == 'Key 1\n' + assert node.query(query.format(source='(SELECT id FROM test_data_table)')) == 'Key 0\nKey 1\nKey 2\n' + +def test_executable_function_input_sum_python(started_cluster): + skip_test_msan(node) + + query = "SELECT * FROM executable('input_sum.py', 'TabSeparated', 'value UInt64', {source})" + assert node.query(query.format(source='(SELECT 1, 1)')) == '2\n' + assert node.query(query.format(source='(SELECT id, id FROM test_data_table)')) == '0\n2\n4\n' + +def test_executable_function_input_argument_python(started_cluster): + skip_test_msan(node) + + query = "SELECT * FROM executable('input_argument.py 1', 'TabSeparated', 'value String', {source})" + assert node.query(query.format(source='(SELECT 1)')) == 'Key 1 1\n' + assert node.query(query.format(source='(SELECT id FROM test_data_table)')) == 'Key 1 0\nKey 1 1\nKey 1 2\n' + +def test_executable_function_input_signalled_python(started_cluster): + skip_test_msan(node) + + query = "SELECT * FROM executable('input_signalled.py', 'TabSeparated', 'value String', {source})" + assert node.query(query.format(source='(SELECT 1)')) == '' + assert node.query(query.format(source='(SELECT id FROM test_data_table)')) == '' + +def test_executable_function_input_slow_python(started_cluster): + skip_test_msan(node) + + query = "SELECT * FROM executable('input_slow.py', 'TabSeparated', 'value String', {source})" + assert node.query_and_get_error(query.format(source='(SELECT 1)')) + assert node.query_and_get_error(query.format(source='(SELECT id FROM test_data_table)')) + +def test_executable_function_input_multiple_pipes_python(started_cluster): + skip_test_msan(node) + query = "SELECT * FROM executable('input_multiple_pipes.py', 'TabSeparated', 'value String', 
{source})" + actual = node.query(query.format(source='(SELECT 1), (SELECT 2), (SELECT 3)')) expected = 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 1\n' assert actual == expected -def test_executable_function_argument(started_cluster): - skip_test_msan(node) - assert node.query("SELECT * FROM executable('test_argument.sh 1', 'TabSeparated', 'value String')") == 'Key 1\n' - -def test_executable_storage_no_input(started_cluster): - skip_test_msan(node) - node.query("DROP TABLE IF EXISTS test_table") - node.query("CREATE TABLE test_table (value UInt64) ENGINE=Executable('test_no_input.sh', 'TabSeparated')") - assert node.query("SELECT * FROM test_table") == '1\n' - node.query("DROP TABLE test_table") - -def test_executable_storage_input(started_cluster): - skip_test_msan(node) - node.query("DROP TABLE IF EXISTS test_table") - node.query("CREATE TABLE test_table (value String) ENGINE=Executable('test_input.sh', 'TabSeparated', (SELECT 1))") - assert node.query("SELECT * FROM test_table") == 'Key 1\n' - node.query("DROP TABLE test_table") - -def test_executable_storage_input_multiple_pipes(started_cluster): - skip_test_msan(node) - node.query("DROP TABLE IF EXISTS test_table") - node.query("CREATE TABLE test_table (value String) ENGINE=Executable('test_input_multiple_pipes.sh', 'TabSeparated', (SELECT 1), (SELECT 2), (SELECT 3))") - actual = node.query("SELECT * FROM test_table") - expected = 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 1\n' + actual = node.query(query.format(source='(SELECT id FROM test_data_table), (SELECT 2), (SELECT 3)')) + expected = 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 0\nKey from 0 fd 1\nKey from 0 fd 2\n' assert actual == expected - node.query("DROP TABLE test_table") -def test_executable_storage_argument(started_cluster): +def test_executable_storage_no_input_bash(started_cluster): skip_test_msan(node) node.query("DROP TABLE IF EXISTS test_table") - node.query("CREATE TABLE test_table (value String) ENGINE=Executable('test_argument.sh 1', 'TabSeparated')") + node.query("CREATE TABLE test_table (value String) ENGINE=Executable('no_input.sh', 'TabSeparated')") + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + node.query("DROP TABLE test_table") + +def test_executable_storage_no_input_python(started_cluster): + skip_test_msan(node) + node.query("DROP TABLE IF EXISTS test_table") + node.query("CREATE TABLE test_table (value String) ENGINE=Executable('no_input.py', 'TabSeparated')") + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + node.query("DROP TABLE test_table") + +def test_executable_storage_input_bash(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input.sh', 'TabSeparated', {source})" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) assert node.query("SELECT * FROM test_table") == 'Key 1\n' node.query("DROP TABLE test_table") -def test_executable_pool_storage(started_cluster): + node.query(query.format(source='(SELECT id FROM test_data_table)')) + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + node.query("DROP TABLE test_table") + +def test_executable_storage_input_python(started_cluster): skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input.py', 'TabSeparated', {source})" + node.query("DROP TABLE IF EXISTS test_table") - node.query("CREATE TABLE test_table (value String) 
ENGINE=ExecutablePool('test_input_process_pool.sh', 'TabSeparated', (SELECT 1))") + node.query(query.format(source='(SELECT 1)')) assert node.query("SELECT * FROM test_table") == 'Key 1\n' node.query("DROP TABLE test_table") -def test_executable_pool_storage_multiple_pipes(started_cluster): + node.query(query.format(source='(SELECT id FROM test_data_table)')) + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + node.query("DROP TABLE test_table") + +def test_executable_storage_input_send_chunk_header_python(started_cluster): skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input_chunk_header.py', 'TabSeparated', {source}) SETTINGS send_chunk_header=1" + node.query("DROP TABLE IF EXISTS test_table") - node.query("CREATE TABLE test_table (value String) ENGINE=ExecutablePool('test_input_process_pool_multiple_pipes.sh', 'TabSeparated', (SELECT 1), (SELECT 2), (SELECT 3))") + node.query(query.format(source='(SELECT 1)')) + assert node.query("SELECT * FROM test_table") == 'Key 1\n' + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + node.query("DROP TABLE test_table") + +def test_executable_storage_input_sum_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value UInt64) ENGINE=Executable('input_sum.py', 'TabSeparated', {source})" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1, 1)')) + assert node.query("SELECT * FROM test_table") == '2\n' + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id, id FROM test_data_table)')) + assert node.query("SELECT * FROM test_table") == '0\n2\n4\n' + node.query("DROP TABLE test_table") + +def test_executable_storage_input_argument_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input_argument.py 1', 'TabSeparated', {source})" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + assert node.query("SELECT * FROM test_table") == 'Key 1 1\n' + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + assert node.query("SELECT * FROM test_table") == 'Key 1 0\nKey 1 1\nKey 1 2\n' + node.query("DROP TABLE test_table") + +def test_executable_storage_input_signalled_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input_signalled.py', 'TabSeparated', {source})" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + assert node.query("SELECT * FROM test_table") == '' + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + assert node.query("SELECT * FROM test_table") == '' + node.query("DROP TABLE test_table") + +def test_executable_storage_input_slow_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input_slow.py', 'TabSeparated', {source}) SETTINGS command_read_timeout=2500" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + assert node.query_and_get_error("SELECT * FROM test_table") + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + assert 
node.query_and_get_error("SELECT * FROM test_table") + node.query("DROP TABLE test_table") + +def test_executable_function_input_multiple_pipes_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=Executable('input_multiple_pipes.py', 'TabSeparated', {source})" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1), (SELECT 2), (SELECT 3)')) assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 1\n' node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table), (SELECT 2), (SELECT 3)')) + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 0\nKey from 0 fd 1\nKey from 0 fd 2\n' + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_input_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=ExecutablePool('input_pool.py', 'TabSeparated', {source}) SETTINGS send_chunk_header=1, pool_size=1" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + + assert node.query("SELECT * FROM test_table") == 'Key 1\n' + assert node.query("SELECT * FROM test_table") == 'Key 1\n' + assert node.query("SELECT * FROM test_table") == 'Key 1\n' + + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + assert node.query("SELECT * FROM test_table") == 'Key 0\nKey 1\nKey 2\n' + + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_input_sum_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value UInt64) ENGINE=ExecutablePool('input_sum_pool.py', 'TabSeparated', {source}) SETTINGS send_chunk_header=1, pool_size=1" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1, 1)')) + + assert node.query("SELECT * FROM test_table") == '2\n' + assert node.query("SELECT * FROM test_table") == '2\n' + assert node.query("SELECT * FROM test_table") == '2\n' + + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id, id FROM test_data_table)')) + + assert node.query("SELECT * FROM test_table") == '0\n2\n4\n' + assert node.query("SELECT * FROM test_table") == '0\n2\n4\n' + assert node.query("SELECT * FROM test_table") == '0\n2\n4\n' + + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_input_argument_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=ExecutablePool('input_argument_pool.py 1', 'TabSeparated', {source}) SETTINGS send_chunk_header=1, pool_size=1" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + + assert node.query("SELECT * FROM test_table") == 'Key 1 1\n' + assert node.query("SELECT * FROM test_table") == 'Key 1 1\n' + assert node.query("SELECT * FROM test_table") == 'Key 1 1\n' + + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + + assert node.query("SELECT * FROM test_table") == 'Key 1 0\nKey 1 1\nKey 1 2\n' + assert node.query("SELECT * FROM test_table") == 'Key 1 0\nKey 1 1\nKey 1 2\n' + assert node.query("SELECT * FROM test_table") == 'Key 1 0\nKey 1 1\nKey 1 
2\n' + + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_input_signalled_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=ExecutablePool('input_signalled_pool.py', 'TabSeparated', {source}) SETTINGS send_chunk_header=1, pool_size=1" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_input_slow_python(started_cluster): + skip_test_msan(node) + + query = """CREATE TABLE test_table (value String) + ENGINE=ExecutablePool('input_slow_pool.py', 'TabSeparated', {source}) + SETTINGS send_chunk_header=1, pool_size=1, command_read_timeout=2500""" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table)')) + + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + assert node.query_and_get_error("SELECT * FROM test_table") + + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_input_multiple_pipes_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=ExecutablePool('input_multiple_pipes_pool.py', 'TabSeparated', {source}) SETTINGS send_chunk_header=1, pool_size=1" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1), (SELECT 2), (SELECT 3)')) + + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 1\n' + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 1\n' + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 1\n' + + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT id FROM test_data_table), (SELECT 2), (SELECT 3)')) + + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 0\nKey from 0 fd 1\nKey from 0 fd 2\n' + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 0\nKey from 0 fd 1\nKey from 0 fd 2\n' + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 0\nKey from 0 fd 1\nKey from 0 fd 2\n' + + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_input_count_python(started_cluster): + skip_test_msan(node) + + query = "CREATE TABLE test_table (value String) ENGINE=ExecutablePool('input_count_pool.py', 'TabSeparated', {source}) SETTINGS send_chunk_header=1, pool_size=1" + + node.query("DROP TABLE IF EXISTS test_table") + node.query(query.format(source='(SELECT 1)')) + + assert 
node.query("SELECT * FROM test_table") == '1\n' + assert node.query("SELECT * FROM test_table") == '1\n' + assert node.query("SELECT * FROM test_table") == '1\n' + + node.query("DROP TABLE test_table") + + node.query(query.format(source='(SELECT number FROM system.numbers LIMIT 250000)')) + + assert node.query("SELECT * FROM test_table") == '250000\n' + assert node.query("SELECT * FROM test_table") == '250000\n' + assert node.query("SELECT * FROM test_table") == '250000\n' + + node.query("DROP TABLE test_table") diff --git a/tests/integration/test_executable_table_function/user_scripts/input.py b/tests/integration/test_executable_table_function/user_scripts/input.py new file mode 100755 index 00000000000..835ab1f441a --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input.py @@ -0,0 +1,8 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for line in sys.stdin: + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/test_input.sh b/tests/integration/test_executable_table_function/user_scripts/input.sh similarity index 100% rename from tests/integration/test_executable_table_function/user_scripts/test_input.sh rename to tests/integration/test_executable_table_function/user_scripts/input.sh diff --git a/tests/integration/test_executable_table_function/user_scripts/input_argument.py b/tests/integration/test_executable_table_function/user_scripts/input_argument.py new file mode 100755 index 00000000000..c1b2e5966d7 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_argument.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + arg = int(sys.argv[1]) + + for line in sys.stdin: + print("Key " + str(arg) + " " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_argument_pool.py b/tests/integration/test_executable_table_function/user_scripts/input_argument_pool.py new file mode 100755 index 00000000000..378a6ef4391 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_argument_pool.py @@ -0,0 +1,17 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + arg = int(sys.argv[1]) + + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + print(str(chunk_length), end='\n') + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + str(arg) + " " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_chunk_header.py b/tests/integration/test_executable_table_function/user_scripts/input_chunk_header.py new file mode 100755 index 00000000000..5dc03e1c507 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_chunk_header.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_count_pool.py b/tests/integration/test_executable_table_function/user_scripts/input_count_pool.py new file mode 100755 index 00000000000..8b744168a82 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_count_pool.py @@ -0,0 +1,15 @@ +#!/usr/bin/python3 + 
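+# ExecutablePool scripts here are framed with chunk headers (the tests create the table with
+# SETTINGS send_chunk_header=1): ClickHouse writes the number of rows before each block, and the
+# script must emit its own row count before its output rows. This script replies with a single
+# row carrying the received row count and discards the input rows, which is what the
+# input_count tests below check.
+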
+import sys + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + print(1, end='\n') + print(str(chunk_length), end='\n') + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_multiple_pipes.py b/tests/integration/test_executable_table_function/user_scripts/input_multiple_pipes.py new file mode 100755 index 00000000000..64590cbc16a --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_multiple_pipes.py @@ -0,0 +1,19 @@ +#!/usr/bin/python3 + +import sys +import os + +if __name__ == '__main__': + fd3 = os.fdopen(3) + fd4 = os.fdopen(4) + + for line in fd4: + print("Key from 4 fd " + line, end='') + + for line in fd3: + print("Key from 3 fd " + line, end='') + + for line in sys.stdin: + print("Key from 0 fd " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_multiple_pipes_pool.py b/tests/integration/test_executable_table_function/user_scripts/input_multiple_pipes_pool.py new file mode 100755 index 00000000000..a3a515899f9 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_multiple_pipes_pool.py @@ -0,0 +1,45 @@ +#!/usr/bin/python3 + +import sys +import os + +if __name__ == '__main__': + fd3 = os.fdopen(3) + fd4 = os.fdopen(4) + + lines = [] + + for chunk_header_fd4 in fd4: + fd4_chunk_length = int(chunk_header_fd4) + + while fd4_chunk_length != 0: + line = fd4.readline() + fd4_chunk_length -= 1 + lines.append("Key from 4 fd " + line) + + for chunk_header_fd3 in fd3: + fd3_chunk_length = int(chunk_header_fd3) + + while fd3_chunk_length != 0: + line = fd3.readline() + fd3_chunk_length -= 1 + lines.append("Key from 3 fd " + line) + + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + lines.append("Key from 0 fd " + line) + + break + break + + print(str(len(lines)), end='\n') + + for line in lines: + print(line, end='') + lines.clear() + + sys.stdout.flush() \ No newline at end of file diff --git a/tests/integration/test_executable_table_function/user_scripts/input_pool.py b/tests/integration/test_executable_table_function/user_scripts/input_pool.py new file mode 100755 index 00000000000..ec4e9af23cd --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_pool.py @@ -0,0 +1,15 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + print(str(chunk_length), end='\n') + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_signalled.py b/tests/integration/test_executable_table_function/user_scripts/input_signalled.py new file mode 100755 index 00000000000..93ce20fa8e7 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_signalled.py @@ -0,0 +1,12 @@ +#!/usr/bin/python3 + +import sys +import os +import signal + +if __name__ == '__main__': + for line in sys.stdin: + os.signal(os.getpid(), signal.SIGTERM) + + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_signalled_pool.py 
b/tests/integration/test_executable_table_function/user_scripts/input_signalled_pool.py new file mode 100755 index 00000000000..1ea0eddbd8d --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_signalled_pool.py @@ -0,0 +1,19 @@ +#!/usr/bin/python3 + +import sys +import os +import signal + +if __name__ == '__main__': + for chunk_header in sys.stdin: + os.signal(os.getpid(), signal.SIGTERM) + + chunk_length = int(chunk_header) + print(str(chunk_length), end='\n') + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_slow.py b/tests/integration/test_executable_table_function/user_scripts/input_slow.py new file mode 100755 index 00000000000..4c2abe89e33 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_slow.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys +import time + +if __name__ == '__main__': + for line in sys.stdin: + time.sleep(25) + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_slow_pool.py b/tests/integration/test_executable_table_function/user_scripts/input_slow_pool.py new file mode 100755 index 00000000000..c8df7e18c4c --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_slow_pool.py @@ -0,0 +1,18 @@ +#!/usr/bin/python3 + +import sys +import time + +if __name__ == '__main__': + for chunk_header in sys.stdin: + time.sleep(25) + + chunk_length = int(chunk_header) + print(str(chunk_length), end='\n') + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_sum.py b/tests/integration/test_executable_table_function/user_scripts/input_sum.py new file mode 100755 index 00000000000..432d7a13a2f --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_sum.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys +import re + +if __name__ == '__main__': + for line in sys.stdin: + line_split = re.split(r'\t+', line) + print(int(line_split[0]) + int(line_split[1]), end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/input_sum_pool.py b/tests/integration/test_executable_table_function/user_scripts/input_sum_pool.py new file mode 100755 index 00000000000..cd0de25fe87 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/input_sum_pool.py @@ -0,0 +1,17 @@ +#!/usr/bin/python3 + +import sys +import re + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + print(str(chunk_length), end='\n') + + while chunk_length != 0: + line = sys.stdin.readline() + line_split = re.split(r'\t+', line) + print(int(line_split[0]) + int(line_split[1]), end='\n') + chunk_length -= 1 + + sys.stdout.flush() diff --git a/tests/integration/test_executable_table_function/user_scripts/no_input.py b/tests/integration/test_executable_table_function/user_scripts/no_input.py new file mode 100755 index 00000000000..65b78f3d755 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/no_input.py @@ -0,0 +1,9 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + print("Key 0") + print("Key 1") + print("Key 2") + sys.stdout.flush() diff 
--git a/tests/integration/test_executable_table_function/user_scripts/no_input.sh b/tests/integration/test_executable_table_function/user_scripts/no_input.sh new file mode 100755 index 00000000000..13d172a5be4 --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/no_input.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +printf "Key 0\n"; +printf "Key 1\n"; +printf "Key 2\n"; diff --git a/tests/integration/test_executable_table_function/user_scripts/test_argument.sh b/tests/integration/test_executable_table_function/user_scripts/test_argument.sh deleted file mode 100755 index 89634031d2b..00000000000 --- a/tests/integration/test_executable_table_function/user_scripts/test_argument.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -echo "Key $1" diff --git a/tests/integration/test_executable_table_function/user_scripts/test_input_multiple_pipes.sh b/tests/integration/test_executable_table_function/user_scripts/test_input_multiple_pipes.sh deleted file mode 100755 index 1e53e3211dc..00000000000 --- a/tests/integration/test_executable_table_function/user_scripts/test_input_multiple_pipes.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -while read -t 250 -u 4 read_data; do printf "Key from 4 fd $read_data\n"; done -while read -t 250 -u 3 read_data; do printf "Key from 3 fd $read_data\n"; done -while read -t 250 read_data; do printf "Key from 0 fd $read_data\n"; done diff --git a/tests/integration/test_executable_table_function/user_scripts/test_no_input.sh b/tests/integration/test_executable_table_function/user_scripts/test_no_input.sh deleted file mode 100755 index 9e8b3be63d6..00000000000 --- a/tests/integration/test_executable_table_function/user_scripts/test_no_input.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -echo "1" diff --git a/tests/integration/test_executable_user_defined_function/__init__.py b/tests/integration/test_executable_user_defined_function/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_executable_user_defined_function/config/executable_user_defined_functions_config.xml b/tests/integration/test_executable_user_defined_function/config/executable_user_defined_functions_config.xml new file mode 100644 index 00000000000..3cbf717bb67 --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/config/executable_user_defined_functions_config.xml @@ -0,0 +1,2 @@ + + diff --git a/tests/integration/test_executable_user_defined_function/functions/test_function_config.xml b/tests/integration/test_executable_user_defined_function/functions/test_function_config.xml new file mode 100644 index 00000000000..d8f81a588a2 --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/functions/test_function_config.xml @@ -0,0 +1,196 @@ + + + executable + test_function_bash + String + + UInt64 + + TabSeparated + input.sh + + + + executable_pool + test_function_pool_bash + String + + UInt64 + + TabSeparated + input.sh + + + + executable + test_function_python + String + + UInt64 + + TabSeparated + input.py + + + + executable_pool + test_function_pool_python + String + + UInt64 + + TabSeparated + input.py + + + + executable + test_function_send_chunk_header_python + String + + UInt64 + + TabSeparated + 1 + input_chunk_header.py + + + + executable_pool + test_function_send_chunk_header_pool_python + String + + UInt64 + + TabSeparated + 1 + input_chunk_header.py + + + + executable + test_function_sum_python + String + + UInt64 + + + UInt64 + + TabSeparated + input_sum.py + + + + executable_pool + 
test_function_sum_pool_python + String + + UInt64 + + + UInt64 + + TabSeparated + input_sum.py + + + + executable + test_function_argument_python + String + + UInt64 + + TabSeparated + input_argument.py 1 + + + + executable_pool + test_function_argument_pool_python + String + + UInt64 + + TabSeparated + input_argument.py 1 + + + + executable + test_function_slow_python + String + + UInt64 + + TabSeparated + input_slow.py + 1 + 1000 + + + + executable_pool + test_function_slow_pool_python + String + + UInt64 + + TabSeparated + input_slow.py + 1 + 1000 + + + + executable + test_function_signalled_python + String + + UInt64 + + TabSeparated + input_signalled.py + 1 + 1000 + + + + executable_pool + test_function_signalled_pool_python + String + + UInt64 + + TabSeparated + input_signalled.py + 1 + 1000 + + + + executable + test_function_non_direct_bash + String + + UInt64 + + TabSeparated + while read read_data; do printf "Key $read_data\n"; done + 0 + + + + executable_pool + test_function_non_direct_pool_bash + String + + UInt64 + + TabSeparated + while read read_data; do printf "Key $read_data\n"; done + 0 + + + diff --git a/tests/integration/test_executable_user_defined_function/test.py b/tests/integration/test_executable_user_defined_function/test.py new file mode 100644 index 00000000000..94afdf8d8a9 --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/test.py @@ -0,0 +1,106 @@ +import os +import sys +import time + +import pytest + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance('node', stay_alive=True, main_configs=[]) + + +def skip_test_msan(instance): + if instance.is_built_with_memory_sanitizer(): + pytest.skip("Memory Sanitizer cannot work with vfork") + +def copy_file_to_container(local_path, dist_path, container_id): + os.system("docker cp {local} {cont_id}:{dist}".format(local=local_path, cont_id=container_id, dist=dist_path)) + +config = ''' + /etc/clickhouse-server/functions/test_function_config.xml +''' + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + node.replace_config("/etc/clickhouse-server/config.d/executable_user_defined_functions_config.xml", config) + + copy_file_to_container(os.path.join(SCRIPT_DIR, 'functions/.'), '/etc/clickhouse-server/functions', node.docker_id) + copy_file_to_container(os.path.join(SCRIPT_DIR, 'user_scripts/.'), '/var/lib/clickhouse/user_scripts', node.docker_id) + + node.restart_clickhouse() + + yield cluster + + finally: + cluster.shutdown() + +def test_executable_function_bash(started_cluster): + skip_test_msan(node) + assert node.query("SELECT test_function_bash(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_bash(1)") == 'Key 1\n' + + assert node.query("SELECT test_function_pool_bash(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_pool_bash(1)") == 'Key 1\n' + +def test_executable_function_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT test_function_python(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_python(1)") == 'Key 1\n' + + assert node.query("SELECT test_function_pool_python(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_pool_python(1)") == 'Key 1\n' + +def test_executable_function_send_chunk_header_python(started_cluster): + skip_test_msan(node) + 
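+    # test_function_send_chunk_header_python is configured with send_chunk_header=1, so ClickHouse
+    # prepends the row count to every block it pipes to input_chunk_header.py; the script reads that
+    # header first and then exactly that many rows (see user_scripts/input_chunk_header.py below).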
assert node.query("SELECT test_function_send_chunk_header_python(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_send_chunk_header_python(1)") == 'Key 1\n' + + assert node.query("SELECT test_function_send_chunk_header_pool_python(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_send_chunk_header_pool_python(1)") == 'Key 1\n' + +def test_executable_function_sum_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT test_function_sum_python(toUInt64(1), toUInt64(1))") == '2\n' + assert node.query("SELECT test_function_sum_python(1, 1)") == '2\n' + + assert node.query("SELECT test_function_sum_pool_python(toUInt64(1), toUInt64(1))") == '2\n' + assert node.query("SELECT test_function_sum_pool_python(1, 1)") == '2\n' + +def test_executable_function_argument_python(started_cluster): + skip_test_msan(node) + assert node.query("SELECT test_function_argument_python(toUInt64(1))") == 'Key 1 1\n' + assert node.query("SELECT test_function_argument_python(1)") == 'Key 1 1\n' + + assert node.query("SELECT test_function_argument_pool_python(toUInt64(1))") == 'Key 1 1\n' + assert node.query("SELECT test_function_argument_pool_python(1)") == 'Key 1 1\n' + +def test_executable_function_signalled_python(started_cluster): + skip_test_msan(node) + assert node.query_and_get_error("SELECT test_function_signalled_python(toUInt64(1))") + assert node.query_and_get_error("SELECT test_function_signalled_python(1)") + + assert node.query_and_get_error("SELECT test_function_signalled_pool_python(toUInt64(1))") + assert node.query_and_get_error("SELECT test_function_signalled_pool_python(1)") + +def test_executable_function_slow_python(started_cluster): + skip_test_msan(node) + assert node.query_and_get_error("SELECT test_function_slow_python(toUInt64(1))") + assert node.query_and_get_error("SELECT test_function_slow_python(1)") + + assert node.query_and_get_error("SELECT test_function_slow_pool_python(toUInt64(1))") + assert node.query_and_get_error("SELECT test_function_slow_pool_python(1)") + +def test_executable_function_non_direct_bash(started_cluster): + skip_test_msan(node) + assert node.query("SELECT test_function_non_direct_bash(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_non_direct_bash(1)") == 'Key 1\n' + + assert node.query("SELECT test_function_non_direct_pool_bash(toUInt64(1))") == 'Key 1\n' + assert node.query("SELECT test_function_non_direct_pool_bash(1)") == 'Key 1\n' diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input.py b/tests/integration/test_executable_user_defined_function/user_scripts/input.py new file mode 100755 index 00000000000..835ab1f441a --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input.py @@ -0,0 +1,8 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for line in sys.stdin: + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input.sh b/tests/integration/test_executable_user_defined_function/user_scripts/input.sh new file mode 100755 index 00000000000..aea51b82b1f --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +while read read_data; + do printf "Key $read_data\n"; +done diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input_argument.py 
b/tests/integration/test_executable_user_defined_function/user_scripts/input_argument.py new file mode 100755 index 00000000000..c1b2e5966d7 --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input_argument.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + arg = int(sys.argv[1]) + + for line in sys.stdin: + print("Key " + str(arg) + " " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input_chunk_header.py b/tests/integration/test_executable_user_defined_function/user_scripts/input_chunk_header.py new file mode 100755 index 00000000000..5dc03e1c507 --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input_chunk_header.py @@ -0,0 +1,14 @@ +#!/usr/bin/python3 + +import sys + +if __name__ == '__main__': + for chunk_header in sys.stdin: + chunk_length = int(chunk_header) + + while chunk_length != 0: + line = sys.stdin.readline() + chunk_length -= 1 + print("Key " + line, end='') + + sys.stdout.flush() diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input_signalled.py b/tests/integration/test_executable_user_defined_function/user_scripts/input_signalled.py new file mode 100755 index 00000000000..27c8bc4840e --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input_signalled.py @@ -0,0 +1,13 @@ +#!/usr/bin/python3 + +import sys +import os +import signal +import time + +if __name__ == '__main__': + for line in sys.stdin: + os.kill(os.getpid(), signal.SIGTERM) + + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input_slow.py b/tests/integration/test_executable_user_defined_function/user_scripts/input_slow.py new file mode 100755 index 00000000000..648a9eac918 --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input_slow.py @@ -0,0 +1,12 @@ +#!/usr/bin/python3 + +import sys +import os +import signal +import time + +if __name__ == '__main__': + for line in sys.stdin: + time.sleep(5) + print("Key " + line, end='') + sys.stdout.flush() diff --git a/tests/integration/test_executable_user_defined_function/user_scripts/input_sum.py b/tests/integration/test_executable_user_defined_function/user_scripts/input_sum.py new file mode 100755 index 00000000000..432d7a13a2f --- /dev/null +++ b/tests/integration/test_executable_user_defined_function/user_scripts/input_sum.py @@ -0,0 +1,10 @@ +#!/usr/bin/python3 + +import sys +import re + +if __name__ == '__main__': + for line in sys.stdin: + line_split = re.split(r'\t+', line) + print(int(line_split[0]) + int(line_split[1]), end='\n') + sys.stdout.flush() diff --git a/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config.xml b/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config.xml index f2a7d6e67b1..d0bd6e5ab88 100644 --- a/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config.xml +++ b/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config.xml @@ -7,8 +7,7 @@ UInt64 TabSeparated - while read read_data; do printf "Key_1 $read_data\n"; done - 0 + test_input_1.sh
diff --git a/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config2.xml b/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config2.xml index fe02146a6b8..80ae21a086d 100644 --- a/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config2.xml +++ b/tests/integration/test_executable_user_defined_functions_config_reload/functions/test_function_config2.xml @@ -7,8 +7,7 @@ UInt64 TabSeparated - while read read_data; do printf "Key_2 $read_data\n"; done - 0 + test_input_2.sh
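Note: the two hunks above swap inline shell commands for standalone scripts under user_scripts/, matching the layout used by test_executable_user_defined_function. A minimal sketch of how the reloaded functions could be exercised from the config-reload test, assuming the configs register them as test_function_1 and test_function_2 (the function names are not visible in these hunks and are an assumption):

    # Hypothetical usage sketch; the names and expected prefixes are inferred from the
    # config file names and the Key_1/Key_2 printf prefixes in the scripts added below.
    assert node.query("SELECT test_function_1(toUInt64(7))") == 'Key_1 7\n'
    assert node.query("SELECT test_function_2(toUInt64(7))") == 'Key_2 7\n'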
diff --git a/tests/integration/test_executable_user_defined_functions_config_reload/test.py b/tests/integration/test_executable_user_defined_functions_config_reload/test.py index 3117b3e72b1..629c426a28c 100644 --- a/tests/integration/test_executable_user_defined_functions_config_reload/test.py +++ b/tests/integration/test_executable_user_defined_functions_config_reload/test.py @@ -28,6 +28,8 @@ def started_cluster(): cluster.start() copy_file_to_container(os.path.join(SCRIPT_DIR, 'functions/.'), '/etc/clickhouse-server/functions', node.docker_id) + copy_file_to_container(os.path.join(SCRIPT_DIR, 'user_scripts/.'), '/var/lib/clickhouse/user_scripts', node.docker_id) + node.restart_clickhouse() yield cluster diff --git a/tests/integration/test_executable_user_defined_functions_config_reload/user_scripts/test_input_1.sh b/tests/integration/test_executable_user_defined_functions_config_reload/user_scripts/test_input_1.sh new file mode 100755 index 00000000000..a6cffe83bba --- /dev/null +++ b/tests/integration/test_executable_user_defined_functions_config_reload/user_scripts/test_input_1.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +while read read_data; + do printf "Key_1 $read_data\n"; +done diff --git a/tests/integration/test_executable_user_defined_functions_config_reload/user_scripts/test_input_2.sh b/tests/integration/test_executable_user_defined_functions_config_reload/user_scripts/test_input_2.sh new file mode 100755 index 00000000000..a673cfd18fb --- /dev/null +++ b/tests/integration/test_executable_user_defined_functions_config_reload/user_scripts/test_input_2.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +while read read_data; + do printf "Key_2 $read_data\n"; +done diff --git a/tests/integration/test_graphite_merge_tree/test.py b/tests/integration/test_graphite_merge_tree/test.py index 7628211551d..9e48f12f007 100644 --- a/tests/integration/test_graphite_merge_tree/test.py +++ b/tests/integration/test_graphite_merge_tree/test.py @@ -6,6 +6,7 @@ import pytest from helpers.client import QueryRuntimeException from helpers.cluster import ClickHouseCluster from helpers.test_tools import TSV +from helpers.test_tools import csv_compare cluster = ClickHouseCluster(__file__) instance = cluster.add_instance('instance', @@ -234,18 +235,19 @@ SELECT * FROM test.graphite; def test_system_graphite_retentions(graphite_table): expected = ''' -graphite_rollup \\\\.count$ sum 0 0 1 0 ['test'] ['graphite'] -graphite_rollup \\\\.max$ max 0 0 2 0 ['test'] ['graphite'] -graphite_rollup ^five_min\\\\. 31536000 14400 3 0 ['test'] ['graphite'] -graphite_rollup ^five_min\\\\. 5184000 3600 3 0 ['test'] ['graphite'] -graphite_rollup ^five_min\\\\. 0 300 3 0 ['test'] ['graphite'] -graphite_rollup ^one_min avg 31536000 600 4 0 ['test'] ['graphite'] -graphite_rollup ^one_min avg 7776000 300 4 0 ['test'] ['graphite'] -graphite_rollup ^one_min avg 0 60 4 0 ['test'] ['graphite'] +graphite_rollup all \\\\.count$ sum 0 0 1 0 ['test'] ['graphite'] +graphite_rollup all \\\\.max$ max 0 0 2 0 ['test'] ['graphite'] +graphite_rollup all ^five_min\\\\. 31536000 14400 3 0 ['test'] ['graphite'] +graphite_rollup all ^five_min\\\\. 5184000 3600 3 0 ['test'] ['graphite'] +graphite_rollup all ^five_min\\\\. 
0 300 3 0 ['test'] ['graphite'] +graphite_rollup all ^one_min avg 31536000 600 4 0 ['test'] ['graphite'] +graphite_rollup all ^one_min avg 7776000 300 4 0 ['test'] ['graphite'] +graphite_rollup all ^one_min avg 0 60 4 0 ['test'] ['graphite'] ''' result = q('SELECT * from system.graphite_retentions') - assert TSV(result) == TSV(expected) + mismatch = csv_compare(result, expected) + assert len(mismatch) == 0, f"got\n{result}\nwant\n{expected}\ndiff\n{mismatch}\n" q(''' DROP TABLE IF EXISTS test.graphite2; diff --git a/tests/integration/test_graphite_merge_tree_typed/__init__.py b/tests/integration/test_graphite_merge_tree_typed/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_graphite_merge_tree_typed/configs/graphite_rollup.xml b/tests/integration/test_graphite_merge_tree_typed/configs/graphite_rollup.xml new file mode 100644 index 00000000000..c716540a61c --- /dev/null +++ b/tests/integration/test_graphite_merge_tree_typed/configs/graphite_rollup.xml @@ -0,0 +1,120 @@ + + + + metric + timestamp + value + updated + + plain + \.count$ + sum + + + plain + \.max$ + max + + + plain + ^five_min\. + + 0 + 300 + + + 5184000 + 3600 + + + 31536000 + 14400 + + + + plain + ^one_min + avg + + 0 + 60 + + + 7776000 + 300 + + + 31536000 + 600 + + + + tagged + + avg + + 0 + 60 + + + 7776000 + 300 + + + 31536000 + 600 + + + + tag_list + retention=five_min + avg + + 0 + 300 + + + 5184000 + 3600 + + + 31536000 + 14400 + + + + tagged + ^for_taggged + avg + + 0 + 60 + + + 7776000 + 300 + + + 31536000 + 600 + + + + all + ^ten_min\. + sum + + 0 + 600 + + + 5184000 + 7200 + + + 31536000 + 28800 + + + + diff --git a/tests/integration/test_graphite_merge_tree_typed/configs/users.xml b/tests/integration/test_graphite_merge_tree_typed/configs/users.xml new file mode 100644 index 00000000000..66d0cd7e445 --- /dev/null +++ b/tests/integration/test_graphite_merge_tree_typed/configs/users.xml @@ -0,0 +1,8 @@ + + + + + 0 + + + diff --git a/tests/integration/test_graphite_merge_tree_typed/test.py b/tests/integration/test_graphite_merge_tree_typed/test.py new file mode 100644 index 00000000000..e26fd0d2e77 --- /dev/null +++ b/tests/integration/test_graphite_merge_tree_typed/test.py @@ -0,0 +1,580 @@ +import datetime +import os.path as p +import time + +import sys +import pytest +from helpers.client import QueryRuntimeException +from helpers.cluster import ClickHouseCluster +from helpers.test_tools import TSV +from helpers.test_tools import csv_compare + +cluster = ClickHouseCluster(__file__) +instance = cluster.add_instance('instance', + main_configs=['configs/graphite_rollup.xml'], + user_configs=["configs/users.xml"]) +q = instance.query + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + q('CREATE DATABASE test') + + yield cluster + + finally: + cluster.shutdown() + + +@pytest.fixture +def graphite_table(started_cluster): + q(''' +DROP TABLE IF EXISTS test.graphite; +CREATE TABLE test.graphite + (metric String, value Float64, timestamp UInt32, date Date, updated UInt32) + ENGINE = GraphiteMergeTree('graphite_rollup') + PARTITION BY toYYYYMM(date) + ORDER BY (metric, timestamp) + SETTINGS index_granularity=8192; +''') + + yield + + q('DROP TABLE test.graphite') + + +def test_rollup_versions_plain(graphite_table): + timestamp = int(time.time()) + rounded_timestamp = timestamp - timestamp % 60 + date = datetime.date.today().isoformat() + + # Insert rows with timestamps relative to the current time so that the + # first retention clause is 
active. + # Two parts are created. + q(''' +INSERT INTO test.graphite (metric, value, timestamp, date, updated) + VALUES ('one_min.x1', 100, {timestamp}, '{date}', 1); +INSERT INTO test.graphite (metric, value, timestamp, date, updated) + VALUES ('one_min.x1', 200, {timestamp}, '{date}', 2); +'''.format(timestamp=timestamp, date=date)) + + expected1 = '''\ +one_min.x1 100 {timestamp} {date} 1 +one_min.x1 200 {timestamp} {date} 2 +'''.format(timestamp=timestamp, date=date) + + assert TSV( + q('SELECT * FROM test.graphite ORDER BY updated') + ) == TSV(expected1) + + q('OPTIMIZE TABLE test.graphite') + + # After rollup only the row with max version is retained. + expected2 = '''\ +one_min.x1 200 {timestamp} {date} 2 +'''.format(timestamp=rounded_timestamp, date=date) + + assert TSV(q('SELECT * FROM test.graphite')) == TSV(expected2) + + +def test_rollup_versions_tagged(graphite_table): + timestamp = int(time.time()) + rounded_timestamp = timestamp - timestamp % 60 + date = datetime.date.today().isoformat() + + # Insert rows with timestamps relative to the current time so that the + # first retention clause is active. + # Two parts are created. + q(''' +INSERT INTO test.graphite (metric, value, timestamp, date, updated) + VALUES ('x1?retention=one_min', 100, {timestamp}, '{date}', 1); +INSERT INTO test.graphite (metric, value, timestamp, date, updated) + VALUES ('x1?retention=one_min', 200, {timestamp}, '{date}', 2); +'''.format(timestamp=timestamp, date=date)) + + expected1 = '''\ +x1?retention=one_min 100 {timestamp} {date} 1 +x1?retention=one_min 200 {timestamp} {date} 2 +'''.format(timestamp=timestamp, date=date) + + result = q('SELECT * FROM test.graphite ORDER BY metric, updated') + mismatch = csv_compare(result, expected1) + assert len(mismatch) == 0, f"got\n{result}\nwant\n{expected1}\ndiff\n{mismatch}\n" + + q('OPTIMIZE TABLE test.graphite') + + # After rollup only the row with max version is retained. + expected2 = '''\ +x1?retention=one_min 200 {timestamp} {date} 2 +'''.format(timestamp=rounded_timestamp, date=date) + + result = q('SELECT * FROM test.graphite ORDER BY metric, updated') + mismatch = csv_compare(result, expected2) + assert len(mismatch) == 0, f"got\n{result}\nwant\n{expected2}\ndiff\n{mismatch}\n" + + +def test_rollup_versions_all(graphite_table): + timestamp = int(time.time()) + rounded_timestamp = timestamp - timestamp % 600 + date = datetime.date.today().isoformat() + + # Insert rows with timestamps relative to the current time so that the + # first retention clause is active. + # Two parts are created. 
+ q(''' +INSERT INTO test.graphite (metric, value, timestamp, date, updated) + VALUES ('ten_min.x1', 100, {timestamp}, '{date}', 1); +INSERT INTO test.graphite (metric, value, timestamp, date, updated) + VALUES ('ten_min.x1', 200, {timestamp}, '{date}', 2); +INSERT INTO test.graphite (metric, value, timestamp, date, updated) + VALUES ('ten_min.x1?env=staging', 100, {timestamp}, '{date}', 1); +INSERT INTO test.graphite (metric, value, timestamp, date, updated) + VALUES ('ten_min.x1?env=staging', 200, {timestamp}, '{date}', 2); +'''.format(timestamp=timestamp, date=date)) + + expected1 = '''\ +ten_min.x1 100 {timestamp} {date} 1 +ten_min.x1 200 {timestamp} {date} 2 +ten_min.x1?env=staging 100 {timestamp} {date} 1 +ten_min.x1?env=staging 200 {timestamp} {date} 2 +'''.format(timestamp=timestamp, date=date) + + result = q('SELECT * FROM test.graphite ORDER BY metric, updated') + mismatch = csv_compare(result, expected1) + assert len(mismatch) == 0, f"got\n{result}\nwant\n{expected1}\ndiff\n{mismatch}\n" + + q('OPTIMIZE TABLE test.graphite') + + # After rollup only the row with max version is retained. + expected2 = '''\ +ten_min.x1 200 {timestamp} {date} 2 +ten_min.x1?env=staging 200 {timestamp} {date} 2 +'''.format(timestamp=rounded_timestamp, date=date) + + result = q('SELECT * FROM test.graphite ORDER BY metric, updated') + mismatch = csv_compare(result, expected2) + assert len(mismatch) == 0, f"got\n{result}\nwant\n{expected2}\ndiff\n{mismatch}\n" + + +def test_rollup_aggregation_plain(graphite_table): + # This query essentially emulates what rollup does. + result1 = q(''' +SELECT avg(v), max(upd) +FROM (SELECT timestamp, + argMax(value, (updated, number)) AS v, + max(updated) AS upd + FROM (SELECT 'one_min.x5' AS metric, + toFloat64(number) AS value, + toUInt32(1111111111 + intDiv(number, 3)) AS timestamp, + toDate('2017-02-02') AS date, + toUInt32(intDiv(number, 2)) AS updated, + number + FROM system.numbers LIMIT 1000000) + WHERE intDiv(timestamp, 600) * 600 = 1111444200 + GROUP BY timestamp) +''') + + expected1 = '''\ +999634.9918367347 499999 +''' + assert TSV(result1) == TSV(expected1) + + # Timestamp 1111111111 is in sufficiently distant past + # so that the last retention clause is active. + result2 = q(''' +INSERT INTO test.graphite + SELECT 'one_min.x' AS metric, + toFloat64(number) AS value, + toUInt32(1111111111 + intDiv(number, 3)) AS timestamp, + toDate('2017-02-02') AS date, toUInt32(intDiv(number, 2)) AS updated + FROM (SELECT * FROM system.numbers LIMIT 1000000) + WHERE intDiv(timestamp, 600) * 600 = 1111444200; + +OPTIMIZE TABLE test.graphite PARTITION 201702 FINAL; + +SELECT * FROM test.graphite; +''') + + expected2 = '''\ +one_min.x 999634.9918367347 1111444200 2017-02-02 499999 +''' + + assert TSV(result2) == TSV(expected2) + + +def test_rollup_aggregation_tagged(graphite_table): + # This query essentially emulates what rollup does. 
+ result1 = q(''' +SELECT avg(v), max(upd) +FROM (SELECT timestamp, + argMax(value, (updated, number)) AS v, + max(updated) AS upd + FROM (SELECT 'x?retention=one_min' AS metric, + toFloat64(number) AS value, + toUInt32(1111111111 + intDiv(number, 3)) AS timestamp, + toDate('2017-02-02') AS date, + toUInt32(intDiv(number, 2)) AS updated, + number + FROM system.numbers LIMIT 1000000) + WHERE intDiv(timestamp, 600) * 600 = 1111444200 + GROUP BY timestamp) +''') + + expected1 = '''\ +999634.9918367347 499999 +''' + assert TSV(result1) == TSV(expected1) + + # Timestamp 1111111111 is in sufficiently distant past + # so that the last retention clause is active. + result2 = q(''' +INSERT INTO test.graphite + SELECT 'x?retention=one_min' AS metric, + toFloat64(number) AS value, + toUInt32(1111111111 + intDiv(number, 3)) AS timestamp, + toDate('2017-02-02') AS date, toUInt32(intDiv(number, 2)) AS updated + FROM (SELECT * FROM system.numbers LIMIT 1000000) + WHERE intDiv(timestamp, 600) * 600 = 1111444200; + +OPTIMIZE TABLE test.graphite PARTITION 201702 FINAL; + +SELECT * FROM test.graphite; +''') + + expected2 = '''\ +x?retention=one_min 999634.9918367347 1111444200 2017-02-02 499999 +''' + + assert TSV(result2) == TSV(expected2) + + +def test_rollup_aggregation_2_plain(graphite_table): + result = q(''' +INSERT INTO test.graphite + SELECT 'one_min.x' AS metric, + toFloat64(number) AS value, + toUInt32(1111111111 - intDiv(number, 3)) AS timestamp, + toDate('2017-02-02') AS date, + toUInt32(100 - number) AS updated + FROM (SELECT * FROM system.numbers LIMIT 50); + +OPTIMIZE TABLE test.graphite PARTITION 201702 FINAL; + +SELECT * FROM test.graphite; +''') + + expected = '''\ +one_min.x 24 1111110600 2017-02-02 100 +''' + + assert TSV(result) == TSV(expected) + + +def test_rollup_aggregation_2_tagged(graphite_table): + result = q(''' +INSERT INTO test.graphite + SELECT 'x?retention=one_min' AS metric, + toFloat64(number) AS value, + toUInt32(1111111111 - intDiv(number, 3)) AS timestamp, + toDate('2017-02-02') AS date, + toUInt32(100 - number) AS updated + FROM (SELECT * FROM system.numbers LIMIT 50); + +OPTIMIZE TABLE test.graphite PARTITION 201702 FINAL; + +SELECT * FROM test.graphite; +''') + + expected = '''\ +x?retention=one_min 24 1111110600 2017-02-02 100 +''' + + assert TSV(result) == TSV(expected) + + +def test_multiple_paths_and_versions_plain(graphite_table): + result = q(''' +INSERT INTO test.graphite + SELECT 'one_min.x' AS metric, + toFloat64(number) AS value, + toUInt32(1111111111 + intDiv(number, 3) * 600) AS timestamp, + toDate('2017-02-02') AS date, + toUInt32(100 - number) AS updated + FROM (SELECT * FROM system.numbers LIMIT 50); + +OPTIMIZE TABLE test.graphite PARTITION 201702 FINAL; + +SELECT * FROM test.graphite; + + +INSERT INTO test.graphite + SELECT 'one_min.y' AS metric, + toFloat64(number) AS value, + toUInt32(1111111111 + number * 600) AS timestamp, + toDate('2017-02-02') AS date, + toUInt32(100 - number) AS updated + FROM (SELECT * FROM system.numbers LIMIT 50); + +OPTIMIZE TABLE test.graphite PARTITION 201702 FINAL; + +SELECT * FROM test.graphite; +''') + + with open(p.join(p.dirname(__file__), + 'test_multiple_paths_and_versions.reference.plain') + ) as reference: + assert TSV(result) == TSV(reference) + + +def test_multiple_paths_and_versions_tagged(graphite_table): + result = q(''' +INSERT INTO test.graphite + SELECT 'x?retention=one_min' AS metric, + toFloat64(number) AS value, + toUInt32(1111111111 + intDiv(number, 3) * 600) AS timestamp, + toDate('2017-02-02') AS 
date, + toUInt32(100 - number) AS updated + FROM (SELECT * FROM system.numbers LIMIT 50); + +OPTIMIZE TABLE test.graphite PARTITION 201702 FINAL; + +SELECT * FROM test.graphite; + + +INSERT INTO test.graphite + SELECT 'y?retention=one_min' AS metric, + toFloat64(number) AS value, + toUInt32(1111111111 + number * 600) AS timestamp, + toDate('2017-02-02') AS date, + toUInt32(100 - number) AS updated + FROM (SELECT * FROM system.numbers LIMIT 50); + +OPTIMIZE TABLE test.graphite PARTITION 201702 FINAL; + +SELECT * FROM test.graphite; +''') + + with open(p.join(p.dirname(__file__), + 'test_multiple_paths_and_versions.reference.tagged') + ) as reference: + assert TSV(result) == TSV(reference) + + +def test_multiple_output_blocks(graphite_table): + MERGED_BLOCK_SIZE = 8192 + + to_insert = '' + expected = '' + for i in range(2 * MERGED_BLOCK_SIZE + 1): + rolled_up_time = 1000000200 + 600 * i + + for j in range(3): + cur_time = rolled_up_time + 100 * j + to_insert += 'one_min.x1 {} {} 2001-09-09 1\n'.format( + 10 * j, cur_time + ) + to_insert += 'one_min.x1 {} {} 2001-09-09 2\n'.format( + 10 * (j + 1), cur_time + ) + + expected += 'one_min.x1 20 {} 2001-09-09 2\n'.format(rolled_up_time) + + q('INSERT INTO test.graphite FORMAT TSV', to_insert) + + result = q(''' +OPTIMIZE TABLE test.graphite PARTITION 200109 FINAL; + +SELECT * FROM test.graphite; +''') + + assert TSV(result) == TSV(expected) + + +def test_paths_not_matching_any_pattern(graphite_table): + to_insert = '''\ +one_min.x1 100 1000000000 2001-09-09 1 +zzzzzzzz 100 1000000001 2001-09-09 1 +zzzzzzzz 200 1000000001 2001-09-09 2 +''' + + q('INSERT INTO test.graphite FORMAT TSV', to_insert) + + expected = '''\ +one_min.x1 100 999999600 2001-09-09 1 +zzzzzzzz 200 1000000001 2001-09-09 2 +''' + + result = q(''' +OPTIMIZE TABLE test.graphite PARTITION 200109 FINAL; + +SELECT * FROM test.graphite; +''') + + assert TSV(result) == TSV(expected) + + +def test_rules_isolation(graphite_table): + to_insert = '''\ +one_min.x1 100 1000000000 2001-09-09 1 +for_taggged 100 1000000001 2001-09-09 1 +for_taggged 200 1000000001 2001-09-09 2 +one_min?env=staging 100 1000000001 2001-09-09 1 +one_min?env=staging 200 1000000001 2001-09-09 2 +''' + + q('INSERT INTO test.graphite FORMAT TSV', to_insert) + + expected = '''\ +for_taggged 200 1000000001 2001-09-09 2 +one_min.x1 100 999999600 2001-09-09 1 +one_min?env=staging 200 1000000001 2001-09-09 2 +''' + + result = q(''' +OPTIMIZE TABLE test.graphite PARTITION 200109 FINAL; + +SELECT * FROM test.graphite; +''') + + result = q('SELECT * FROM test.graphite ORDER BY metric, updated') + mismatch = csv_compare(result, expected) + assert len(mismatch) == 0, f"got\n{result}\nwant\n{expected}\ndiff\n{mismatch}\n" + + +def test_system_graphite_retentions(graphite_table): + expected = ''' +graphite_rollup plain \\\\.count$ sum 0 0 1 0 ['test'] ['graphite'] +graphite_rollup plain \\\\.max$ max 0 0 2 0 ['test'] ['graphite'] +graphite_rollup plain ^five_min\\\\. 31536000 14400 3 0 ['test'] ['graphite'] +graphite_rollup plain ^five_min\\\\. 5184000 3600 3 0 ['test'] ['graphite'] +graphite_rollup plain ^five_min\\\\. 
0 300 3 0 ['test'] ['graphite'] +graphite_rollup plain ^one_min avg 31536000 600 4 0 ['test'] ['graphite'] +graphite_rollup plain ^one_min avg 7776000 300 4 0 ['test'] ['graphite'] +graphite_rollup plain ^one_min avg 0 60 4 0 ['test'] ['graphite'] +graphite_rollup tagged [\\\\?&]retention=one_min(&.*)?$ avg 31536000 600 5 0 ['test'] ['graphite'] +graphite_rollup tagged [\\\\?&]retention=one_min(&.*)?$ avg 7776000 300 5 0 ['test'] ['graphite'] +graphite_rollup tagged [\\\\?&]retention=one_min(&.*)?$ avg 0 60 5 0 ['test'] ['graphite'] +graphite_rollup tagged [\\\\?&]retention=five_min(&.*)?$ avg 31536000 14400 6 0 ['test'] ['graphite'] +graphite_rollup tagged [\\\\?&]retention=five_min(&.*)?$ avg 5184000 3600 6 0 ['test'] ['graphite'] +graphite_rollup tagged [\\\\?&]retention=five_min(&.*)?$ avg 0 300 6 0 ['test'] ['graphite'] +graphite_rollup tagged ^for_taggged avg 31536000 600 7 0 ['test'] ['graphite'] +graphite_rollup tagged ^for_taggged avg 7776000 300 7 0 ['test'] ['graphite'] +graphite_rollup tagged ^for_taggged avg 0 60 7 0 ['test'] ['graphite'] +graphite_rollup all ^ten_min\\\\. sum 31536000 28800 8 0 ['test'] ['graphite'] +graphite_rollup all ^ten_min\\\\. sum 5184000 7200 8 0 ['test'] ['graphite'] +graphite_rollup all ^ten_min\\\\. sum 0 600 8 0 ['test'] ['graphite'] + ''' + result = q('SELECT * from system.graphite_retentions') + + mismatch = csv_compare(result, expected) + assert len(mismatch) == 0, f"got\n{result}\nwant\n{expected}\ndiff\n{mismatch}\n" + + q(''' +DROP TABLE IF EXISTS test.graphite2; +CREATE TABLE test.graphite2 + (metric String, value Float64, timestamp UInt32, date Date, updated UInt32) + ENGINE = GraphiteMergeTree('graphite_rollup') + PARTITION BY toYYYYMM(date) + ORDER BY (metric, timestamp) + SETTINGS index_granularity=8192; + ''') + expected = ''' +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] + ''' + result = q(''' + SELECT + config_name, + Tables.database, + Tables.table + FROM system.graphite_retentions + ''') + assert csv_compare(result, expected), f"got\n{result}\nwant\n{expected}" + + +def test_path_dangling_pointer(graphite_table): + q(''' +DROP TABLE IF EXISTS test.graphite2; +CREATE TABLE test.graphite2 + (metric String, value Float64, timestamp UInt32, date Date, updated UInt32) + ENGINE = GraphiteMergeTree('graphite_rollup') + PARTITION BY toYYYYMM(date) + ORDER BY (metric, timestamp) + SETTINGS index_granularity=1; + ''') + + path = 'abcd' * 4000000 # 16MB + q('INSERT INTO test.graphite2 FORMAT TSV', + "{}\t0.0\t0\t2018-01-01\t100\n".format(path)) + q('INSERT INTO test.graphite2 FORMAT TSV', + "{}\t0.0\t0\t2018-01-01\t101\n".format(path)) + for version in range(10): + q('INSERT INTO test.graphite2 FORMAT TSV', + "{}\t0.0\t0\t2018-01-01\t{}\n".format(path, version)) + + while True: + q('OPTIMIZE TABLE test.graphite2 PARTITION 201801 FINAL') + parts = int(q("SELECT count() FROM system.parts " + "WHERE active AND database='test' " + "AND table='graphite2'")) + if parts == 1: + break + print(('Parts', parts)) + + assert TSV( + q("SELECT value, timestamp, date, updated FROM test.graphite2") + ) == TSV("0\t0\t2018-01-01\t101\n") + + q('DROP 
TABLE test.graphite2') + + +def test_combined_rules(graphite_table): + # 1487970000 ~ Sat 25 Feb 00:00:00 MSK 2017 + to_insert = 'INSERT INTO test.graphite VALUES ' + expected_unmerged = '' + for i in range(384): + to_insert += "('five_min.count', {v}, {t}, toDate({t}), 1), ".format( + v=1, t=1487970000 + (i * 300) + ) + to_insert += "('five_min.max', {v}, {t}, toDate({t}), 1), ".format( + v=i, t=1487970000 + (i * 300) + ) + expected_unmerged += ("five_min.count\t{v1}\t{t}\n" + "five_min.max\t{v2}\t{t}\n").format( + v1=1, v2=i, + t=1487970000 + (i * 300) + ) + + q(to_insert) + assert TSV(q('SELECT metric, value, timestamp FROM test.graphite' + ' ORDER BY (timestamp, metric)')) == TSV(expected_unmerged) + + q('OPTIMIZE TABLE test.graphite PARTITION 201702 FINAL') + expected_merged = ''' + five_min.count 48 1487970000 2017-02-25 1 + five_min.count 48 1487984400 2017-02-25 1 + five_min.count 48 1487998800 2017-02-25 1 + five_min.count 48 1488013200 2017-02-25 1 + five_min.count 48 1488027600 2017-02-25 1 + five_min.count 48 1488042000 2017-02-25 1 + five_min.count 48 1488056400 2017-02-26 1 + five_min.count 48 1488070800 2017-02-26 1 + five_min.max 47 1487970000 2017-02-25 1 + five_min.max 95 1487984400 2017-02-25 1 + five_min.max 143 1487998800 2017-02-25 1 + five_min.max 191 1488013200 2017-02-25 1 + five_min.max 239 1488027600 2017-02-25 1 + five_min.max 287 1488042000 2017-02-25 1 + five_min.max 335 1488056400 2017-02-26 1 + five_min.max 383 1488070800 2017-02-26 1 + ''' + assert TSV(q('SELECT * FROM test.graphite' + ' ORDER BY (metric, timestamp)')) == TSV(expected_merged) diff --git a/tests/integration/test_graphite_merge_tree_typed/test_multiple_paths_and_versions.reference.plain b/tests/integration/test_graphite_merge_tree_typed/test_multiple_paths_and_versions.reference.plain new file mode 100644 index 00000000000..0f10d11ed05 --- /dev/null +++ b/tests/integration/test_graphite_merge_tree_typed/test_multiple_paths_and_versions.reference.plain @@ -0,0 +1,84 @@ +one_min.x 0 1111110600 2017-02-02 100 +one_min.x 3 1111111200 2017-02-02 97 +one_min.x 6 1111111800 2017-02-02 94 +one_min.x 9 1111112400 2017-02-02 91 +one_min.x 12 1111113000 2017-02-02 88 +one_min.x 15 1111113600 2017-02-02 85 +one_min.x 18 1111114200 2017-02-02 82 +one_min.x 21 1111114800 2017-02-02 79 +one_min.x 24 1111115400 2017-02-02 76 +one_min.x 27 1111116000 2017-02-02 73 +one_min.x 30 1111116600 2017-02-02 70 +one_min.x 33 1111117200 2017-02-02 67 +one_min.x 36 1111117800 2017-02-02 64 +one_min.x 39 1111118400 2017-02-02 61 +one_min.x 42 1111119000 2017-02-02 58 +one_min.x 45 1111119600 2017-02-02 55 +one_min.x 48 1111120200 2017-02-02 52 +one_min.x 0 1111110600 2017-02-02 100 +one_min.x 3 1111111200 2017-02-02 97 +one_min.x 6 1111111800 2017-02-02 94 +one_min.x 9 1111112400 2017-02-02 91 +one_min.x 12 1111113000 2017-02-02 88 +one_min.x 15 1111113600 2017-02-02 85 +one_min.x 18 1111114200 2017-02-02 82 +one_min.x 21 1111114800 2017-02-02 79 +one_min.x 24 1111115400 2017-02-02 76 +one_min.x 27 1111116000 2017-02-02 73 +one_min.x 30 1111116600 2017-02-02 70 +one_min.x 33 1111117200 2017-02-02 67 +one_min.x 36 1111117800 2017-02-02 64 +one_min.x 39 1111118400 2017-02-02 61 +one_min.x 42 1111119000 2017-02-02 58 +one_min.x 45 1111119600 2017-02-02 55 +one_min.x 48 1111120200 2017-02-02 52 +one_min.y 0 1111110600 2017-02-02 100 +one_min.y 1 1111111200 2017-02-02 99 +one_min.y 2 1111111800 2017-02-02 98 +one_min.y 3 1111112400 2017-02-02 97 +one_min.y 4 1111113000 2017-02-02 96 +one_min.y 5 1111113600 2017-02-02 95 
+one_min.y 6 1111114200 2017-02-02 94 +one_min.y 7 1111114800 2017-02-02 93 +one_min.y 8 1111115400 2017-02-02 92 +one_min.y 9 1111116000 2017-02-02 91 +one_min.y 10 1111116600 2017-02-02 90 +one_min.y 11 1111117200 2017-02-02 89 +one_min.y 12 1111117800 2017-02-02 88 +one_min.y 13 1111118400 2017-02-02 87 +one_min.y 14 1111119000 2017-02-02 86 +one_min.y 15 1111119600 2017-02-02 85 +one_min.y 16 1111120200 2017-02-02 84 +one_min.y 17 1111120800 2017-02-02 83 +one_min.y 18 1111121400 2017-02-02 82 +one_min.y 19 1111122000 2017-02-02 81 +one_min.y 20 1111122600 2017-02-02 80 +one_min.y 21 1111123200 2017-02-02 79 +one_min.y 22 1111123800 2017-02-02 78 +one_min.y 23 1111124400 2017-02-02 77 +one_min.y 24 1111125000 2017-02-02 76 +one_min.y 25 1111125600 2017-02-02 75 +one_min.y 26 1111126200 2017-02-02 74 +one_min.y 27 1111126800 2017-02-02 73 +one_min.y 28 1111127400 2017-02-02 72 +one_min.y 29 1111128000 2017-02-02 71 +one_min.y 30 1111128600 2017-02-02 70 +one_min.y 31 1111129200 2017-02-02 69 +one_min.y 32 1111129800 2017-02-02 68 +one_min.y 33 1111130400 2017-02-02 67 +one_min.y 34 1111131000 2017-02-02 66 +one_min.y 35 1111131600 2017-02-02 65 +one_min.y 36 1111132200 2017-02-02 64 +one_min.y 37 1111132800 2017-02-02 63 +one_min.y 38 1111133400 2017-02-02 62 +one_min.y 39 1111134000 2017-02-02 61 +one_min.y 40 1111134600 2017-02-02 60 +one_min.y 41 1111135200 2017-02-02 59 +one_min.y 42 1111135800 2017-02-02 58 +one_min.y 43 1111136400 2017-02-02 57 +one_min.y 44 1111137000 2017-02-02 56 +one_min.y 45 1111137600 2017-02-02 55 +one_min.y 46 1111138200 2017-02-02 54 +one_min.y 47 1111138800 2017-02-02 53 +one_min.y 48 1111139400 2017-02-02 52 +one_min.y 49 1111140000 2017-02-02 51 diff --git a/tests/integration/test_graphite_merge_tree_typed/test_multiple_paths_and_versions.reference.tagged b/tests/integration/test_graphite_merge_tree_typed/test_multiple_paths_and_versions.reference.tagged new file mode 100644 index 00000000000..e2c63ab3b22 --- /dev/null +++ b/tests/integration/test_graphite_merge_tree_typed/test_multiple_paths_and_versions.reference.tagged @@ -0,0 +1,84 @@ +x?retention=one_min 0 1111110600 2017-02-02 100 +x?retention=one_min 3 1111111200 2017-02-02 97 +x?retention=one_min 6 1111111800 2017-02-02 94 +x?retention=one_min 9 1111112400 2017-02-02 91 +x?retention=one_min 12 1111113000 2017-02-02 88 +x?retention=one_min 15 1111113600 2017-02-02 85 +x?retention=one_min 18 1111114200 2017-02-02 82 +x?retention=one_min 21 1111114800 2017-02-02 79 +x?retention=one_min 24 1111115400 2017-02-02 76 +x?retention=one_min 27 1111116000 2017-02-02 73 +x?retention=one_min 30 1111116600 2017-02-02 70 +x?retention=one_min 33 1111117200 2017-02-02 67 +x?retention=one_min 36 1111117800 2017-02-02 64 +x?retention=one_min 39 1111118400 2017-02-02 61 +x?retention=one_min 42 1111119000 2017-02-02 58 +x?retention=one_min 45 1111119600 2017-02-02 55 +x?retention=one_min 48 1111120200 2017-02-02 52 +x?retention=one_min 0 1111110600 2017-02-02 100 +x?retention=one_min 3 1111111200 2017-02-02 97 +x?retention=one_min 6 1111111800 2017-02-02 94 +x?retention=one_min 9 1111112400 2017-02-02 91 +x?retention=one_min 12 1111113000 2017-02-02 88 +x?retention=one_min 15 1111113600 2017-02-02 85 +x?retention=one_min 18 1111114200 2017-02-02 82 +x?retention=one_min 21 1111114800 2017-02-02 79 +x?retention=one_min 24 1111115400 2017-02-02 76 +x?retention=one_min 27 1111116000 2017-02-02 73 +x?retention=one_min 30 1111116600 2017-02-02 70 +x?retention=one_min 33 1111117200 2017-02-02 67 +x?retention=one_min 36 
1111117800 2017-02-02 64 +x?retention=one_min 39 1111118400 2017-02-02 61 +x?retention=one_min 42 1111119000 2017-02-02 58 +x?retention=one_min 45 1111119600 2017-02-02 55 +x?retention=one_min 48 1111120200 2017-02-02 52 +y?retention=one_min 0 1111110600 2017-02-02 100 +y?retention=one_min 1 1111111200 2017-02-02 99 +y?retention=one_min 2 1111111800 2017-02-02 98 +y?retention=one_min 3 1111112400 2017-02-02 97 +y?retention=one_min 4 1111113000 2017-02-02 96 +y?retention=one_min 5 1111113600 2017-02-02 95 +y?retention=one_min 6 1111114200 2017-02-02 94 +y?retention=one_min 7 1111114800 2017-02-02 93 +y?retention=one_min 8 1111115400 2017-02-02 92 +y?retention=one_min 9 1111116000 2017-02-02 91 +y?retention=one_min 10 1111116600 2017-02-02 90 +y?retention=one_min 11 1111117200 2017-02-02 89 +y?retention=one_min 12 1111117800 2017-02-02 88 +y?retention=one_min 13 1111118400 2017-02-02 87 +y?retention=one_min 14 1111119000 2017-02-02 86 +y?retention=one_min 15 1111119600 2017-02-02 85 +y?retention=one_min 16 1111120200 2017-02-02 84 +y?retention=one_min 17 1111120800 2017-02-02 83 +y?retention=one_min 18 1111121400 2017-02-02 82 +y?retention=one_min 19 1111122000 2017-02-02 81 +y?retention=one_min 20 1111122600 2017-02-02 80 +y?retention=one_min 21 1111123200 2017-02-02 79 +y?retention=one_min 22 1111123800 2017-02-02 78 +y?retention=one_min 23 1111124400 2017-02-02 77 +y?retention=one_min 24 1111125000 2017-02-02 76 +y?retention=one_min 25 1111125600 2017-02-02 75 +y?retention=one_min 26 1111126200 2017-02-02 74 +y?retention=one_min 27 1111126800 2017-02-02 73 +y?retention=one_min 28 1111127400 2017-02-02 72 +y?retention=one_min 29 1111128000 2017-02-02 71 +y?retention=one_min 30 1111128600 2017-02-02 70 +y?retention=one_min 31 1111129200 2017-02-02 69 +y?retention=one_min 32 1111129800 2017-02-02 68 +y?retention=one_min 33 1111130400 2017-02-02 67 +y?retention=one_min 34 1111131000 2017-02-02 66 +y?retention=one_min 35 1111131600 2017-02-02 65 +y?retention=one_min 36 1111132200 2017-02-02 64 +y?retention=one_min 37 1111132800 2017-02-02 63 +y?retention=one_min 38 1111133400 2017-02-02 62 +y?retention=one_min 39 1111134000 2017-02-02 61 +y?retention=one_min 40 1111134600 2017-02-02 60 +y?retention=one_min 41 1111135200 2017-02-02 59 +y?retention=one_min 42 1111135800 2017-02-02 58 +y?retention=one_min 43 1111136400 2017-02-02 57 +y?retention=one_min 44 1111137000 2017-02-02 56 +y?retention=one_min 45 1111137600 2017-02-02 55 +y?retention=one_min 46 1111138200 2017-02-02 54 +y?retention=one_min 47 1111138800 2017-02-02 53 +y?retention=one_min 48 1111139400 2017-02-02 52 +y?retention=one_min 49 1111140000 2017-02-02 51 diff --git a/tests/integration/test_http_handlers_config/test.py b/tests/integration/test_http_handlers_config/test.py index 818a1e54640..01872a1d0c3 100644 --- a/tests/integration/test_http_handlers_config/test.py +++ b/tests/integration/test_http_handlers_config/test.py @@ -58,9 +58,9 @@ def test_predefined_query_handler(): 'test_predefined_handler_get?max_threads=1&setting_name=max_threads', method='GET', headers={'XXX': 'xxx'}).content - assert b'max_threads\t1\nmax_alter_threads\t1\n' == cluster.instance.http_request( - 'query_param_with_url/max_threads?max_threads=1&max_alter_threads=1', - headers={'XXX': 'max_alter_threads'}).content + assert b'max_final_threads\t1\nmax_threads\t1\n' == cluster.instance.http_request( + 'query_param_with_url/max_threads?max_threads=1&max_final_threads=1', + headers={'XXX': 'max_final_threads'}).content def test_fixed_static_handler(): diff 
--git a/tests/integration/test_keeper_auth/test.py b/tests/integration/test_keeper_auth/test.py index 276fe3d8518..6be78f95483 100644 --- a/tests/integration/test_keeper_auth/test.py +++ b/tests/integration/test_keeper_auth/test.py @@ -36,6 +36,38 @@ def get_genuine_zk(): get_fake_zk ] ) + +def test_remove_acl(started_cluster, get_zk): + auth_connection = get_zk() + + auth_connection.add_auth('digest', 'user1:password1') + + # Consistent with zookeeper, accept generated digest + auth_connection.create("/test_remove_acl1", b"dataX", acl=[make_acl("digest", "user1:XDkd2dsEuhc9ImU3q8pa8UOdtpI=", read=True, write=False, create=False, delete=False, admin=False)]) + auth_connection.create("/test_remove_acl2", b"dataX", acl=[make_acl("digest", "user1:XDkd2dsEuhc9ImU3q8pa8UOdtpI=", read=True, write=True, create=False, delete=False, admin=False)]) + auth_connection.create("/test_remove_acl3", b"dataX", acl=[make_acl("digest", "user1:XDkd2dsEuhc9ImU3q8pa8UOdtpI=", all=True)]) + + auth_connection.delete("/test_remove_acl2") + + auth_connection.create("/test_remove_acl4", b"dataX", acl=[make_acl("digest", "user1:XDkd2dsEuhc9ImU3q8pa8UOdtpI=", read=True, write=True, create=True, delete=False, admin=False)]) + + acls, stat = auth_connection.get_acls("/test_remove_acl3") + + assert stat.aversion == 0 + assert len(acls) == 1 + for acl in acls: + assert acl.acl_list == ['ALL'] + assert acl.perms == 31 + + +@pytest.mark.parametrize( + ('get_zk'), + [ + get_genuine_zk, + get_fake_zk + ] +) + def test_digest_auth_basic(started_cluster, get_zk): auth_connection = get_zk() @@ -43,12 +75,11 @@ def test_digest_auth_basic(started_cluster, get_zk): auth_connection.create("/test_no_acl", b"") auth_connection.create("/test_all_acl", b"data", acl=[make_acl("auth", "", all=True)]) - # for some reason original zookeeper accepts this ACL, but doesn't allow to do anything with this node - # even with correct credentials. 
- auth_connection.create("/test_all_digest_acl", b"dataX", acl=[make_acl("digest", "user1:password1", all=True)]) + # Consistent with zookeeper, accept generated digest + auth_connection.create("/test_all_digest_acl", b"dataX", acl=[make_acl("digest", "user1:XDkd2dsEuhc9ImU3q8pa8UOdtpI=", all=True)]) assert auth_connection.get("/test_all_acl")[0] == b"data" - #assert auth_connection.get("/test_all_digest_acl")[0] == b"dataX" + assert auth_connection.get("/test_all_digest_acl")[0] == b"dataX" no_auth_connection = get_zk() no_auth_connection.set("/test_no_acl", b"hello") diff --git a/tests/integration/test_log_lz4_streaming/test.py b/tests/integration/test_log_lz4_streaming/test.py index 7f2f22f28c9..75b46a378c5 100644 --- a/tests/integration/test_log_lz4_streaming/test.py +++ b/tests/integration/test_log_lz4_streaming/test.py @@ -18,7 +18,7 @@ def started_cluster(): def check_log_file(): - assert node.file_exists("/var/log/clickhouse-server/clickhouse-server.log.lz4") + assert node.path_exists("/var/log/clickhouse-server/clickhouse-server.log.lz4") lz4_output = node.exec_in_container(["bash", "-c", "lz4 -t /var/log/clickhouse-server/clickhouse-server.log.lz4 2>&1"], user='root') assert lz4_output.count('Error') == 0, lz4_output diff --git a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py index 7265105c8df..1528103e1cb 100644 --- a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py +++ b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py @@ -1079,9 +1079,41 @@ def table_overrides(clickhouse_node, mysql_node, service_name): check_query(clickhouse_node, "SELECT count() FROM table_overrides.t1", "1000\n") mysql_node.query("INSERT INTO table_overrides.t1 VALUES(1001, '2021-10-01 00:00:00', 42.0)") check_query(clickhouse_node, "SELECT count() FROM table_overrides.t1", "1001\n") + + explain_with_table_func = f"EXPLAIN TABLE OVERRIDE mysql('{service_name}:3306', 'table_overrides', 't1', 'root', 'clickhouse')" + + for what in ['ORDER BY', 'PRIMARY KEY', 'SAMPLE BY', 'PARTITION BY', 'TTL']: + with pytest.raises(QueryRuntimeException) as exc: + clickhouse_node.query(f"{explain_with_table_func} {what} temperature") + assert f'{what} override refers to nullable column `temperature`' in \ + str(exc.value) + assert f"{what} uses columns: `temperature` Nullable(Float32)" in \ + clickhouse_node.query(f"{explain_with_table_func} {what} assumeNotNull(temperature)") + + for testcase in [ + ('COLUMNS (temperature Nullable(Float32) MATERIALIZED 1.0)', + 'column `temperature`: modifying default specifier is not allowed'), + ('COLUMNS (sensor_id UInt64 ALIAS 42)', + 'column `sensor_id`: modifying default specifier is not allowed') + ]: + with pytest.raises(QueryRuntimeException) as exc: + clickhouse_node.query(f"{explain_with_table_func} {testcase[0]}") + assert testcase[1] in str(exc.value) + + for testcase in [ + ('COLUMNS (temperature Nullable(Float64))', + 'Modified columns: `temperature` Nullable(Float32) -> Nullable(Float64)'), + ('COLUMNS (temp_f Nullable(Float32) ALIAS if(temperature IS NULL, NULL, (temperature * 9.0 / 5.0) + 32),\ + temp_k Nullable(Float32) ALIAS if(temperature IS NULL, NULL, temperature + 273.15))', + 'Added columns: `temp_f` Nullable(Float32), `temp_k` Nullable(Float32)') + ]: + assert testcase[1] in clickhouse_node.query( + f"{explain_with_table_func} {testcase[0]}") + clickhouse_node.query("DROP DATABASE IF EXISTS table_overrides") 
mysql_node.query("DROP DATABASE IF EXISTS table_overrides") + def materialized_database_support_all_kinds_of_mysql_datatype(clickhouse_node, mysql_node, service_name): mysql_node.query("DROP DATABASE IF EXISTS test_database_datatype") clickhouse_node.query("DROP DATABASE IF EXISTS test_database_datatype") diff --git a/tests/integration/test_odbc_interaction/test.py b/tests/integration/test_odbc_interaction/test.py index 001a46e1237..8d3a8773bc4 100644 --- a/tests/integration/test_odbc_interaction/test.py +++ b/tests/integration/test_odbc_interaction/test.py @@ -338,6 +338,8 @@ def test_postgres_odbc_hashed_dictionary_with_schema(started_cluster): cursor.execute("truncate table clickhouse.test_table") cursor.execute("insert into clickhouse.test_table values(1, 1, 'hello'),(2, 2, 'world')") node1.query("SYSTEM RELOAD DICTIONARY postgres_odbc_hashed") + node1.exec_in_container(["ss", "-K", "dport", "postgresql"], privileged=True, user='root') + node1.query("SYSTEM RELOAD DICTIONARY postgres_odbc_hashed") assert_eq_with_retry(node1, "select dictGetString('postgres_odbc_hashed', 'column2', toUInt64(1))", "hello") assert_eq_with_retry(node1, "select dictGetString('postgres_odbc_hashed', 'column2', toUInt64(2))", "world") diff --git a/tests/integration/test_postgresql_replica_database_engine_1/test.py b/tests/integration/test_postgresql_replica_database_engine_1/test.py index cba9e93c056..8b5d7f5f7b2 100644 --- a/tests/integration/test_postgresql_replica_database_engine_1/test.py +++ b/tests/integration/test_postgresql_replica_database_engine_1/test.py @@ -1,245 +1,67 @@ import pytest import time -import psycopg2 import os.path as p import random from helpers.cluster import ClickHouseCluster from helpers.test_tools import assert_eq_with_retry -from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT from helpers.test_tools import TSV from random import randrange import threading +from helpers.postgres_utility import get_postgres_conn +from helpers.postgres_utility import PostgresManager + +from helpers.postgres_utility import create_replication_slot, drop_replication_slot +from helpers.postgres_utility import create_postgres_schema, drop_postgres_schema +from helpers.postgres_utility import create_postgres_table, drop_postgres_table +from helpers.postgres_utility import check_tables_are_synchronized +from helpers.postgres_utility import check_several_tables_are_synchronized +from helpers.postgres_utility import assert_nested_table_is_created +from helpers.postgres_utility import assert_number_of_columns +from helpers.postgres_utility import postgres_table_template, postgres_table_template_2, postgres_table_template_3, postgres_table_template_4 +from helpers.postgres_utility import queries + + cluster = ClickHouseCluster(__file__) instance = cluster.add_instance('instance', main_configs = ['configs/log_conf.xml'], user_configs = ['configs/users.xml'], with_postgres=True, stay_alive=True) -postgres_table_template = """ - CREATE TABLE IF NOT EXISTS "{}" ( - key Integer NOT NULL, value Integer, PRIMARY KEY(key)) - """ -postgres_table_template_2 = """ - CREATE TABLE IF NOT EXISTS "{}" ( - key Integer NOT NULL, value1 Integer, value2 Integer, value3 Integer, PRIMARY KEY(key)) - """ -postgres_table_template_3 = """ - CREATE TABLE IF NOT EXISTS "{}" ( - key1 Integer NOT NULL, value1 Integer, key2 Integer NOT NULL, value2 Integer NOT NULL) - """ -postgres_table_template_4 = """ - CREATE TABLE IF NOT EXISTS "{}"."{}" ( - key Integer NOT NULL, value Integer, PRIMARY KEY(key)) - """ - -def 
get_postgres_conn(ip, port, database=False, auto_commit=True, database_name='postgres_database', replication=False): - if database == True: - conn_string = "host={} port={} dbname='{}' user='postgres' password='mysecretpassword'".format(ip, port, database_name) - else: - conn_string = "host={} port={} user='postgres' password='mysecretpassword'".format(ip, port) - - if replication: - conn_string += " replication='database'" - - conn = psycopg2.connect(conn_string) - if auto_commit: - conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) - conn.autocommit = True - return conn - -def create_replication_slot(conn, slot_name='user_slot'): - cursor = conn.cursor() - cursor.execute('CREATE_REPLICATION_SLOT {} LOGICAL pgoutput EXPORT_SNAPSHOT'.format(slot_name)) - result = cursor.fetchall() - print(result[0][0]) # slot name - print(result[0][1]) # start lsn - print(result[0][2]) # snapshot - return result[0][2] - -def drop_replication_slot(conn, slot_name='user_slot'): - cursor = conn.cursor() - cursor.execute("select pg_drop_replication_slot('{}')".format(slot_name)) - -def create_postgres_db(cursor, name='postgres_database'): - cursor.execute("CREATE DATABASE {}".format(name)) - -def drop_postgres_db(cursor, name='postgres_database'): - cursor.execute("DROP DATABASE IF EXISTS {}".format(name)) - -def drop_postgres_schema(cursor, schema_name): - cursor.execute('DROP SCHEMA IF EXISTS {} CASCADE'.format(schema_name)) - -def create_postgres_schema(cursor, schema_name): - drop_postgres_schema(cursor, schema_name) - cursor.execute('CREATE SCHEMA {}'.format(schema_name)) - -def create_clickhouse_postgres_db(ip, port, name='postgres_database', database_name='postgres_database', schema_name=''): - drop_clickhouse_postgres_db(name) - if len(schema_name) == 0: - instance.query(''' - CREATE DATABASE {} - ENGINE = PostgreSQL('{}:{}', '{}', 'postgres', 'mysecretpassword')'''.format(name, ip, port, database_name)) - else: - instance.query(''' - CREATE DATABASE {} - ENGINE = PostgreSQL('{}:{}', '{}', 'postgres', 'mysecretpassword', '{}')'''.format(name, ip, port, database_name, schema_name)) - -def drop_clickhouse_postgres_db(name='postgres_database'): - instance.query('DROP DATABASE IF EXISTS {}'.format(name)) - -def create_materialized_db(ip, port, - materialized_database='test_database', - postgres_database='postgres_database', - settings=[]): - instance.query(f"DROP DATABASE IF EXISTS {materialized_database}") - create_query = f"CREATE DATABASE {materialized_database} ENGINE = MaterializedPostgreSQL('{ip}:{port}', '{postgres_database}', 'postgres', 'mysecretpassword')" - if len(settings) > 0: - create_query += " SETTINGS " - for i in range(len(settings)): - if i != 0: - create_query += ', ' - create_query += settings[i] - instance.query(create_query) - assert materialized_database in instance.query('SHOW DATABASES') - -def drop_materialized_db(materialized_database='test_database'): - instance.query('DROP DATABASE IF EXISTS {}'.format(materialized_database)) - assert materialized_database not in instance.query('SHOW DATABASES') - -def drop_postgres_table(cursor, table_name): - cursor.execute("""DROP TABLE IF EXISTS "{}" """.format(table_name)) - -def drop_postgres_table_with_schema(cursor, schema_name, table_name): - cursor.execute("""DROP TABLE IF EXISTS "{}"."{}" """.format(schema_name, table_name)) - -def create_postgres_table(cursor, table_name, replica_identity_full=False, template=postgres_table_template): - drop_postgres_table(cursor, table_name) - cursor.execute(template.format(table_name)) - if 
replica_identity_full: - cursor.execute('ALTER TABLE {} REPLICA IDENTITY FULL;'.format(table_name)) - -def create_postgres_table_with_schema(cursor, schema_name, table_name): - drop_postgres_table_with_schema(cursor, schema_name, table_name) - cursor.execute(postgres_table_template_4.format(schema_name, table_name)) - -queries = [ - 'INSERT INTO postgresql_replica_{} select i, i from generate_series(0, 10000) as t(i);', - 'DELETE FROM postgresql_replica_{} WHERE (value*value) % 3 = 0;', - 'UPDATE postgresql_replica_{} SET value = value - 125 WHERE key % 2 = 0;', - "UPDATE postgresql_replica_{} SET key=key+20000 WHERE key%2=0", - 'INSERT INTO postgresql_replica_{} select i, i from generate_series(40000, 50000) as t(i);', - 'DELETE FROM postgresql_replica_{} WHERE key % 10 = 0;', - 'UPDATE postgresql_replica_{} SET value = value + 101 WHERE key % 2 = 1;', - "UPDATE postgresql_replica_{} SET key=key+80000 WHERE key%2=1", - 'DELETE FROM postgresql_replica_{} WHERE value % 2 = 0;', - 'UPDATE postgresql_replica_{} SET value = value + 2000 WHERE key % 5 = 0;', - 'INSERT INTO postgresql_replica_{} select i, i from generate_series(200000, 250000) as t(i);', - 'DELETE FROM postgresql_replica_{} WHERE value % 3 = 0;', - 'UPDATE postgresql_replica_{} SET value = value * 2 WHERE key % 3 = 0;', - "UPDATE postgresql_replica_{} SET key=key+500000 WHERE key%2=1", - 'INSERT INTO postgresql_replica_{} select i, i from generate_series(1000000, 1050000) as t(i);', - 'DELETE FROM postgresql_replica_{} WHERE value % 9 = 2;', - "UPDATE postgresql_replica_{} SET key=key+10000000", - 'UPDATE postgresql_replica_{} SET value = value + 2 WHERE key % 3 = 1;', - 'DELETE FROM postgresql_replica_{} WHERE value%5 = 0;' - ] - - -def assert_nested_table_is_created(table_name, materialized_database='test_database', schema_name=''): - if len(schema_name) == 0: - table = table_name - else: - table = schema_name + "." 
+ table_name - print(f'Checking table {table} exists in {materialized_database}') - database_tables = instance.query('SHOW TABLES FROM {}'.format(materialized_database)) - while table not in database_tables: - time.sleep(0.2) - database_tables = instance.query('SHOW TABLES FROM {}'.format(materialized_database)) - assert(table in database_tables) - - -def assert_number_of_columns(expected, table_name, database_name='test_database'): - result = instance.query(f"select count() from system.columns where table = '{table_name}' and database = '{database_name}' and not startsWith(name, '_')") - while (int(result) != expected): - time.sleep(1) - result = instance.query(f"select count() from system.columns where table = '{table_name}' and database = '{database_name}' and not startsWith(name, '_')") - print('Number of columns ok') - - -@pytest.mark.timeout(320) -def check_tables_are_synchronized(table_name, order_by='key', postgres_database='postgres_database', materialized_database='test_database', schema_name=''): - assert_nested_table_is_created(table_name, materialized_database, schema_name) - - print("Checking table is synchronized:", table_name) - expected = instance.query('select * from {}.{} order by {};'.format(postgres_database, table_name, order_by)) - if len(schema_name) == 0: - result = instance.query('select * from {}.{} order by {};'.format(materialized_database, table_name, order_by)) - else: - result = instance.query('select * from {}.`{}.{}` order by {};'.format(materialized_database, schema_name, table_name, order_by)) - - while result != expected: - time.sleep(0.5) - if len(schema_name) == 0: - result = instance.query('select * from {}.{} order by {};'.format(materialized_database, table_name, order_by)) - else: - result = instance.query('select * from {}.`{}.{}` order by {};'.format(materialized_database, schema_name, table_name, order_by)) - - assert(result == expected) +pg_manager = PostgresManager() @pytest.fixture(scope="module") def started_cluster(): try: cluster.start() - conn = get_postgres_conn(ip=cluster.postgres_ip, port=cluster.postgres_port) - cursor = conn.cursor() - create_postgres_db(cursor, 'postgres_database') - create_clickhouse_postgres_db(ip=cluster.postgres_ip, port=cluster.postgres_port) - - instance.query("DROP DATABASE IF EXISTS test_database") + pg_manager.init(instance, cluster.postgres_ip, cluster.postgres_port) yield cluster finally: cluster.shutdown() +@pytest.fixture(autouse=True) +def setup_teardown(): + print("PostgreSQL is available - running test") + yield # run test + pg_manager.restart() + + def test_load_and_sync_all_database_tables(started_cluster): - drop_materialized_db() - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, - database=True) - cursor = conn.cursor() NUM_TABLES = 5 - - for i in range(NUM_TABLES): - table_name = 'postgresql_replica_{}'.format(i) - create_postgres_table(cursor, table_name); - instance.query("INSERT INTO postgres_database.{} SELECT number, number from numbers(50)".format(table_name)) - - create_materialized_db(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port) - assert 'test_database' in instance.query('SHOW DATABASES') - - for i in range(NUM_TABLES): - table_name = 'postgresql_replica_{}'.format(i) - check_tables_are_synchronized(table_name); - cursor.execute('drop table {};'.format(table_name)) - - result = instance.query('''SELECT count() FROM system.tables WHERE database = 'test_database';''') + 
pg_manager.create_and_fill_postgres_tables(NUM_TABLES) + pg_manager.create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) + check_several_tables_are_synchronized(instance, NUM_TABLES) + result = instance.query("SELECT count() FROM system.tables WHERE database = 'test_database';") assert(int(result) == NUM_TABLES) - drop_materialized_db() - for i in range(NUM_TABLES): - cursor.execute('drop table if exists postgresql_replica_{};'.format(i)) - def test_replicating_dml(started_cluster): - drop_materialized_db() - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, + conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True) cursor = conn.cursor() NUM_TABLES = 5 @@ -248,41 +70,26 @@ def test_replicating_dml(started_cluster): create_postgres_table(cursor, 'postgresql_replica_{}'.format(i)); instance.query("INSERT INTO postgres_database.postgresql_replica_{} SELECT number, {} from numbers(50)".format(i, i)) - create_materialized_db(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port) + pg_manager.create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) for i in range(NUM_TABLES): instance.query("INSERT INTO postgres_database.postgresql_replica_{} SELECT 50 + number, {} from numbers(1000)".format(i, i)) - - for i in range(NUM_TABLES): - table_name = 'postgresql_replica_{}'.format(i) - check_tables_are_synchronized(table_name); + check_several_tables_are_synchronized(instance, NUM_TABLES) for i in range(NUM_TABLES): cursor.execute('UPDATE postgresql_replica_{} SET value = {} * {} WHERE key < 50;'.format(i, i, i)) cursor.execute('UPDATE postgresql_replica_{} SET value = {} * {} * {} WHERE key >= 50;'.format(i, i, i, i)) - - for i in range(NUM_TABLES): - check_tables_are_synchronized('postgresql_replica_{}'.format(i)); + check_several_tables_are_synchronized(instance, NUM_TABLES) for i in range(NUM_TABLES): cursor.execute('DELETE FROM postgresql_replica_{} WHERE (value*value + {}) % 2 = 0;'.format(i, i)) cursor.execute('UPDATE postgresql_replica_{} SET value = value - (value % 7) WHERE key > 128 AND key < 512;'.format(i)) cursor.execute('DELETE FROM postgresql_replica_{} WHERE key % 7 = 1;'.format(i, i)) - - for i in range(NUM_TABLES): - check_tables_are_synchronized('postgresql_replica_{}'.format(i)); - - for i in range(NUM_TABLES): - cursor.execute('drop table if exists postgresql_replica_{};'.format(i)) - - drop_materialized_db() + check_several_tables_are_synchronized(instance, NUM_TABLES) def test_different_data_types(started_cluster): - drop_materialized_db() - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, + conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True) cursor = conn.cursor() cursor.execute('drop table if exists test_data_types;') @@ -309,15 +116,14 @@ def test_different_data_types(started_cluster): k Char(2)[] -- Nullable(String) )''') - create_materialized_db(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port) + pg_manager.create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) for i in range(10): instance.query(''' INSERT INTO postgres_database.test_data_types VALUES ({}, -32768, -2147483648, -9223372036854775808, 1.12345, 1.1234567890, 2147483647, 9223372036854775807, '2000-05-12 12:12:12.012345', '2000-05-12', 0.2, 0.2)'''.format(i)) - 
check_tables_are_synchronized('test_data_types', 'id'); + check_tables_are_synchronized(instance, 'test_data_types', 'id'); result = instance.query('SELECT * FROM test_database.test_data_types ORDER BY id LIMIT 1;') assert(result == '0\t-32768\t-2147483648\t-9223372036854775808\t1.12345\t1.123456789\t2147483647\t9223372036854775807\t2000-05-12 12:12:12.012345\t2000-05-12\t0.2\t0.2\n') @@ -326,7 +132,7 @@ def test_different_data_types(started_cluster): cursor.execute('UPDATE test_data_types SET {} = {};'.format(col, i)) cursor.execute('''UPDATE test_data_types SET i = '2020-12-12';'''.format(col, i)) - check_tables_are_synchronized('test_data_types', 'id'); + check_tables_are_synchronized(instance, 'test_data_types', 'id'); instance.query("INSERT INTO postgres_database.test_array_data_type " "VALUES (" @@ -357,44 +163,35 @@ def test_different_data_types(started_cluster): "[]\n" ) - check_tables_are_synchronized('test_array_data_type'); + check_tables_are_synchronized(instance, 'test_array_data_type'); result = instance.query('SELECT * FROM test_database.test_array_data_type ORDER BY key;') assert(result == expected) - drop_materialized_db() + pg_manager.drop_materialized_db() cursor.execute('drop table if exists test_data_types;') cursor.execute('drop table if exists test_array_data_type;') def test_load_and_sync_subset_of_database_tables(started_cluster): - drop_materialized_db() - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, - database=True) - cursor = conn.cursor() NUM_TABLES = 10 + pg_manager.create_and_fill_postgres_tables(NUM_TABLES) publication_tables = '' for i in range(NUM_TABLES): - table_name = 'postgresql_replica_{}'.format(i) - create_postgres_table(cursor, 'postgresql_replica_{}'.format(i)); - instance.query("INSERT INTO postgres_database.postgresql_replica_{} SELECT number, number from numbers(50)".format(i)) - if i < int(NUM_TABLES/2): if publication_tables != '': publication_tables += ', ' - publication_tables += table_name + publication_tables += f'postgresql_replica_{i}' - create_materialized_db(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, - settings=["materialized_postgresql_tables_list = '{}'".format(publication_tables)]) - assert 'test_database' in instance.query('SHOW DATABASES') + pg_manager.create_materialized_db( + ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, + settings=["materialized_postgresql_tables_list = '{}'".format(publication_tables)]) time.sleep(1) for i in range(int(NUM_TABLES/2)): - table_name = 'postgresql_replica_{}'.format(i) - assert_nested_table_is_created(table_name) + table_name = f'postgresql_replica_{i}' + assert_nested_table_is_created(instance, table_name) result = instance.query('''SELECT count() FROM system.tables WHERE database = 'test_database';''') assert(int(result) == int(NUM_TABLES/2)) @@ -409,69 +206,40 @@ def test_load_and_sync_subset_of_database_tables(started_cluster): instance.query("INSERT INTO postgres_database.{} SELECT 50 + number, {} from numbers(100)".format(table_name, i)) for i in range(NUM_TABLES): - table_name = 'postgresql_replica_{}'.format(i) + table_name = f'postgresql_replica_{i}' if i < int(NUM_TABLES/2): - check_tables_are_synchronized(table_name); - - drop_materialized_db() - for i in range(NUM_TABLES): - cursor.execute('drop table if exists postgresql_replica_{};'.format(i)) + check_tables_are_synchronized(instance, table_name); def test_changing_replica_identity_value(started_cluster): - drop_materialized_db() - 
conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, + conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True) cursor = conn.cursor() create_postgres_table(cursor, 'postgresql_replica'); instance.query("INSERT INTO postgres_database.postgresql_replica SELECT 50 + number, number from numbers(50)") - create_materialized_db(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port) + pg_manager.create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) instance.query("INSERT INTO postgres_database.postgresql_replica SELECT 100 + number, number from numbers(50)") - check_tables_are_synchronized('postgresql_replica'); + check_tables_are_synchronized(instance, 'postgresql_replica'); cursor.execute("UPDATE postgresql_replica SET key=key-25 WHERE key<100 ") - check_tables_are_synchronized('postgresql_replica'); - - drop_materialized_db() - cursor.execute('drop table if exists postgresql_replica;') + check_tables_are_synchronized(instance, 'postgresql_replica'); def test_clickhouse_restart(started_cluster): - drop_materialized_db() - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, - database=True) - cursor = conn.cursor() NUM_TABLES = 5 - - for i in range(NUM_TABLES): - create_postgres_table(cursor, 'postgresql_replica_{}'.format(i)); - instance.query("INSERT INTO postgres_database.postgresql_replica_{} SELECT number, {} from numbers(50)".format(i, i)) - - instance.query("CREATE DATABASE test_database ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres', 'mysecretpassword')") - - for i in range(NUM_TABLES): - table_name = 'postgresql_replica_{}'.format(i) - check_tables_are_synchronized(table_name); + pg_manager.create_and_fill_postgres_tables(NUM_TABLES) + pg_manager.create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) + check_several_tables_are_synchronized(instance, NUM_TABLES) for i in range(NUM_TABLES): instance.query("INSERT INTO postgres_database.postgresql_replica_{} SELECT 50 + number, {} from numbers(50000)".format(i, i)) instance.restart_clickhouse() - - for i in range(NUM_TABLES): - check_tables_are_synchronized('postgresql_replica_{}'.format(i)); - - drop_materialized_db() - for i in range(NUM_TABLES): - cursor.execute('drop table if exists postgresql_replica_{};'.format(i)) + check_several_tables_are_synchronized(instance, NUM_TABLES) def test_replica_identity_index(started_cluster): - drop_materialized_db() conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True) cursor = conn.cursor() @@ -480,27 +248,22 @@ def test_replica_identity_index(started_cluster): cursor.execute("ALTER TABLE postgresql_replica REPLICA IDENTITY USING INDEX idx") instance.query("INSERT INTO postgres_database.postgresql_replica SELECT number, number, number, number from numbers(50, 10)") - create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) + pg_manager.create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) instance.query("INSERT INTO postgres_database.postgresql_replica SELECT number, number, number, number from numbers(100, 10)") - check_tables_are_synchronized('postgresql_replica', order_by='key1'); + check_tables_are_synchronized(instance, 'postgresql_replica', order_by='key1'); cursor.execute("UPDATE postgresql_replica SET key1=key1-25 
WHERE key1<100 ") cursor.execute("UPDATE postgresql_replica SET key2=key2-25 WHERE key2>100 ") cursor.execute("UPDATE postgresql_replica SET value1=value1+100 WHERE key1<100 ") cursor.execute("UPDATE postgresql_replica SET value2=value2+200 WHERE key2>100 ") - check_tables_are_synchronized('postgresql_replica', order_by='key1'); + check_tables_are_synchronized(instance, 'postgresql_replica', order_by='key1'); cursor.execute('DELETE FROM postgresql_replica WHERE key2<75;') - check_tables_are_synchronized('postgresql_replica', order_by='key1'); - - drop_materialized_db() - cursor.execute('drop table if exists postgresql_replica;') + check_tables_are_synchronized(instance, 'postgresql_replica', order_by='key1'); def test_table_schema_changes(started_cluster): - drop_materialized_db() - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, + conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True) cursor = conn.cursor() NUM_TABLES = 5 @@ -509,15 +272,14 @@ def test_table_schema_changes(started_cluster): create_postgres_table(cursor, 'postgresql_replica_{}'.format(i), template=postgres_table_template_2); instance.query("INSERT INTO postgres_database.postgresql_replica_{} SELECT number, {}, {}, {} from numbers(25)".format(i, i, i, i)) - create_materialized_db(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, - settings=["materialized_postgresql_allow_automatic_update = 1"]) + pg_manager.create_materialized_db( + ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, + settings=["materialized_postgresql_allow_automatic_update = 1"]) for i in range(NUM_TABLES): instance.query("INSERT INTO postgres_database.postgresql_replica_{} SELECT 25 + number, {}, {}, {} from numbers(25)".format(i, i, i, i)) - for i in range(NUM_TABLES): - check_tables_are_synchronized('postgresql_replica_{}'.format(i)); + check_several_tables_are_synchronized(instance, NUM_TABLES) expected = instance.query("SELECT key, value1, value3 FROM test_database.postgresql_replica_3 ORDER BY key"); @@ -530,13 +292,12 @@ def test_table_schema_changes(started_cluster): cursor.execute(f"UPDATE {altered_table} SET value3 = 12 WHERE key%2=0") time.sleep(2) - assert_nested_table_is_created(altered_table) - assert_number_of_columns(3, altered_table) - check_tables_are_synchronized(altered_table) + assert_nested_table_is_created(instance, altered_table) + assert_number_of_columns(instance, 3, altered_table) + check_tables_are_synchronized(instance, altered_table) print('check1 OK') - for i in range(NUM_TABLES): - check_tables_are_synchronized('postgresql_replica_{}'.format(i)); + check_several_tables_are_synchronized(instance, NUM_TABLES) for i in range(NUM_TABLES): if i != altered_idx: @@ -544,32 +305,12 @@ def test_table_schema_changes(started_cluster): else: instance.query("INSERT INTO postgres_database.postgresql_replica_{} SELECT 51 + number, {}, {} from numbers(49)".format(i, i, i)) - check_tables_are_synchronized(altered_table); + check_tables_are_synchronized(instance, altered_table); print('check2 OK') - for i in range(NUM_TABLES): - check_tables_are_synchronized('postgresql_replica_{}'.format(i)); - - for i in range(NUM_TABLES): - cursor.execute('drop table postgresql_replica_{};'.format(i)) - - instance.query("DROP DATABASE test_database") - for i in range(NUM_TABLES): - cursor.execute('drop table if exists postgresql_replica_{};'.format(i)) + check_several_tables_are_synchronized(instance, NUM_TABLES) def 
test_many_concurrent_queries(started_cluster): - drop_materialized_db() - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, - database=True) - cursor = conn.cursor() - NUM_TABLES = 5 - - for i in range(NUM_TABLES): - create_postgres_table(cursor, 'postgresql_replica_{}'.format(i)); - instance.query('INSERT INTO postgres_database.postgresql_replica_{} SELECT number, number from numbers(10000)'.format(i)) - n = [10000] - query_pool = ['DELETE FROM postgresql_replica_{} WHERE (value*value) % 3 = 0;', 'UPDATE postgresql_replica_{} SET value = value - 125 WHERE key % 2 = 0;', 'DELETE FROM postgresql_replica_{} WHERE key % 10 = 0;', @@ -582,6 +323,13 @@ def test_many_concurrent_queries(started_cluster): 'UPDATE postgresql_replica_{} SET value = value + 2 WHERE key % 3 = 1;', 'DELETE FROM postgresql_replica_{} WHERE value%5 = 0;'] + NUM_TABLES = 5 + + conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, + database=True) + cursor = conn.cursor() + pg_manager.create_and_fill_postgres_tables_from_cursor(cursor, NUM_TABLES, numbers=10000) + def attack(thread_id): print('thread {}'.format(thread_id)) k = 10000 @@ -606,13 +354,14 @@ def test_many_concurrent_queries(started_cluster): cursor.execute("UPDATE postgresql_replica_{} SET key=key%100000+100000*{} WHERE key%{}=0".format(thread_id, i+1, i+1)) print("update primary key {} ok".format(thread_id)) + n = [10000] + threads = [] threads_num = 16 for i in range(threads_num): threads.append(threading.Thread(target=attack, args=(i,))) - create_materialized_db(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port) + pg_manager.create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) for thread in threads: time.sleep(random.uniform(0, 1)) @@ -628,108 +377,91 @@ def test_many_concurrent_queries(started_cluster): thread.join() for i in range(NUM_TABLES): - check_tables_are_synchronized('postgresql_replica_{}'.format(i)); + check_tables_are_synchronized(instance, 'postgresql_replica_{}'.format(i)); count1 = instance.query('SELECT count() FROM postgres_database.postgresql_replica_{}'.format(i)) count2 = instance.query('SELECT count() FROM (SELECT * FROM test_database.postgresql_replica_{})'.format(i)) assert(int(count1) == int(count2)) print(count1, count2) - drop_materialized_db() - for i in range(NUM_TABLES): - cursor.execute('drop table if exists postgresql_replica_{};'.format(i)) - def test_single_transaction(started_cluster): - drop_materialized_db() - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, + conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True, auto_commit=False) cursor = conn.cursor() - create_postgres_table(cursor, 'postgresql_replica_0'); + table_name = 'postgresql_replica_0' + create_postgres_table(cursor, table_name); conn.commit() - create_materialized_db(ip=started_cluster.postgres_ip, + pg_manager.create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) - assert_nested_table_is_created('postgresql_replica_0') + assert_nested_table_is_created(instance, table_name) for query in queries: print('query {}'.format(query)) cursor.execute(query.format(0)) time.sleep(5) - result = instance.query("select count() from test_database.postgresql_replica_0") + result = instance.query(f"select count() from test_database.{table_name}") # no commit yet assert(int(result) == 0) conn.commit() - 
check_tables_are_synchronized('postgresql_replica_0'); - - drop_materialized_db() - cursor.execute('drop table if exists postgresql_replica_0;') + check_tables_are_synchronized(instance, table_name); def test_virtual_columns(started_cluster): - drop_materialized_db() - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, + conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True) cursor = conn.cursor() - create_postgres_table(cursor, 'postgresql_replica_0'); + table_name = 'postgresql_replica_0' + create_postgres_table(cursor, table_name); - create_materialized_db(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, - settings=["materialized_postgresql_allow_automatic_update = 1"]) - assert_nested_table_is_created('postgresql_replica_0') - instance.query("INSERT INTO postgres_database.postgresql_replica_0 SELECT number, number from numbers(10)") - check_tables_are_synchronized('postgresql_replica_0'); + pg_manager.create_materialized_db( + ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, + settings=["materialized_postgresql_allow_automatic_update = 1"]) + + assert_nested_table_is_created(instance, table_name) + instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, number from numbers(10)") + check_tables_are_synchronized(instance, table_name); # just check that it works, no check with `expected` because _version is taken as LSN, which will be different each time. - result = instance.query('SELECT key, value, _sign, _version FROM test_database.postgresql_replica_0;') + result = instance.query(f'SELECT key, value, _sign, _version FROM test_database.{table_name};') print(result) - cursor.execute("ALTER TABLE postgresql_replica_0 ADD COLUMN value2 integer") - instance.query("INSERT INTO postgres_database.postgresql_replica_0 SELECT number, number, number from numbers(10, 10)") - assert_number_of_columns(3, 'postgresql_replica_0') - check_tables_are_synchronized('postgresql_replica_0'); + cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN value2 integer") + instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, number, number from numbers(10, 10)") + assert_number_of_columns(instance, 3, table_name) + check_tables_are_synchronized(instance, table_name); result = instance.query('SELECT key, value, value2, _sign, _version FROM test_database.postgresql_replica_0;') print(result) - instance.query("INSERT INTO postgres_database.postgresql_replica_0 SELECT number, number, number from numbers(20, 10)") - check_tables_are_synchronized('postgresql_replica_0'); + instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, number, number from numbers(20, 10)") + check_tables_are_synchronized(instance, table_name); - result = instance.query('SELECT key, value, value2, _sign, _version FROM test_database.postgresql_replica_0;') + result = instance.query(f'SELECT key, value, value2, _sign, _version FROM test_database.{table_name};') print(result) - drop_materialized_db() - cursor.execute('drop table if exists postgresql_replica_0;') - def test_multiple_databases(started_cluster): - drop_materialized_db('test_database_1') - drop_materialized_db('test_database_2') NUM_TABLES = 5 - - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, + conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=False) cursor = conn.cursor() - 
create_postgres_db(cursor, 'postgres_database_1') - create_postgres_db(cursor, 'postgres_database_2') + pg_manager.create_postgres_db(cursor, 'postgres_database_1') + pg_manager.create_postgres_db(cursor, 'postgres_database_2') - conn1 = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, + conn1 = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True, database_name='postgres_database_1') - conn2 = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, + conn2 = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True, database_name='postgres_database_2') cursor1 = conn1.cursor() cursor2 = conn2.cursor() - create_clickhouse_postgres_db(cluster.postgres_ip, cluster.postgres_port, 'postgres_database_1', 'postgres_database_1') - create_clickhouse_postgres_db(cluster.postgres_ip, cluster.postgres_port, 'postgres_database_2', 'postgres_database_2') + pg_manager.create_clickhouse_postgres_db(cluster.postgres_ip, cluster.postgres_port, 'postgres_database_1', 'postgres_database_1') + pg_manager.create_clickhouse_postgres_db(cluster.postgres_ip, cluster.postgres_port, 'postgres_database_2', 'postgres_database_2') cursors = [cursor1, cursor2] for cursor_id in range(len(cursors)): @@ -740,9 +472,9 @@ def test_multiple_databases(started_cluster): print('database 1 tables: ', instance.query('''SELECT name FROM system.tables WHERE database = 'postgres_database_1';''')) print('database 2 tables: ', instance.query('''SELECT name FROM system.tables WHERE database = 'postgres_database_2';''')) - create_materialized_db(started_cluster.postgres_ip, started_cluster.postgres_port, + pg_manager.create_materialized_db(started_cluster.postgres_ip, started_cluster.postgres_port, 'test_database_1', 'postgres_database_1') - create_materialized_db(started_cluster.postgres_ip, started_cluster.postgres_port, + pg_manager.create_materialized_db(started_cluster.postgres_ip, started_cluster.postgres_port, 'test_database_2', 'postgres_database_2') cursors = [cursor1, cursor2] @@ -754,289 +486,186 @@ def test_multiple_databases(started_cluster): for cursor_id in range(len(cursors)): for i in range(NUM_TABLES): table_name = 'postgresql_replica_{}'.format(i) - check_tables_are_synchronized( + check_tables_are_synchronized(instance, table_name, 'key', 'postgres_database_{}'.format(cursor_id + 1), 'test_database_{}'.format(cursor_id + 1)); - for i in range(NUM_TABLES): - cursor1.execute('drop table if exists postgresql_replica_{};'.format(i)) - for i in range(NUM_TABLES): - cursor2.execute('drop table if exists postgresql_replica_{};'.format(i)) - - drop_clickhouse_postgres_db('postgres_database_1') - drop_clickhouse_postgres_db('postgres_database_2') - - drop_materialized_db('test_database_1') - drop_materialized_db('test_database_2') - def test_concurrent_transactions(started_cluster): - drop_materialized_db() - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, - database=True) - cursor = conn.cursor() - NUM_TABLES = 6 - - for i in range(NUM_TABLES): - create_postgres_table(cursor, 'postgresql_replica_{}'.format(i)); - def transaction(thread_id): conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True, auto_commit=False) - cursor_ = conn.cursor() + cursor = conn.cursor() for query in queries: - cursor_.execute(query.format(thread_id)) + cursor.execute(query.format(thread_id)) 
print('thread {}, query {}'.format(thread_id, query)) conn.commit() + NUM_TABLES = 6 + pg_manager.create_and_fill_postgres_tables(NUM_TABLES, numbers=0) + threads = [] threads_num = 6 for i in range(threads_num): threads.append(threading.Thread(target=transaction, args=(i,))) - create_materialized_db(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port) + pg_manager.create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) for thread in threads: time.sleep(random.uniform(0, 0.5)) thread.start() + for thread in threads: thread.join() for i in range(NUM_TABLES): - check_tables_are_synchronized('postgresql_replica_{}'.format(i)); - count1 = instance.query('SELECT count() FROM postgres_database.postgresql_replica_{}'.format(i)) - count2 = instance.query('SELECT count() FROM (SELECT * FROM test_database.postgresql_replica_{})'.format(i)) + check_tables_are_synchronized(instance, f'postgresql_replica_{i}'); + count1 = instance.query(f'SELECT count() FROM postgres_database.postgresql_replica_{i}') + count2 = instance.query(f'SELECT count() FROM (SELECT * FROM test_database.postgresql_replica_{i})') print(int(count1), int(count2), sep=' ') assert(int(count1) == int(count2)) - drop_materialized_db() - for i in range(NUM_TABLES): - cursor.execute('drop table if exists postgresql_replica_{};'.format(i)) - def test_abrupt_connection_loss_while_heavy_replication(started_cluster): - drop_materialized_db() - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, - database=True) - cursor = conn.cursor() - NUM_TABLES = 6 - - for i in range(NUM_TABLES): - create_postgres_table(cursor, 'postgresql_replica_{}'.format(i)); - def transaction(thread_id): if thread_id % 2: - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, + conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True, auto_commit=True) else: - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, + conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True, auto_commit=False) - cursor_ = conn.cursor() + cursor = conn.cursor() for query in queries: - cursor_.execute(query.format(thread_id)) + cursor.execute(query.format(thread_id)) print('thread {}, query {}'.format(thread_id, query)) if thread_id % 2 == 0: conn.commit() - threads = [] + NUM_TABLES = 6 + pg_manager.create_and_fill_postgres_tables(NUM_TABLES, numbers=0) + threads_num = 6 + threads = [] for i in range(threads_num): threads.append(threading.Thread(target=transaction, args=(i,))) - create_materialized_db(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port) + pg_manager.create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) for thread in threads: time.sleep(random.uniform(0, 0.5)) thread.start() - # Join here because it takes time for data to reach wal for thread in threads: - thread.join() - time.sleep(1) + thread.join() # Join here because it takes time for data to reach wal + + time.sleep(2) started_cluster.pause_container('postgres1') - for i in range(NUM_TABLES): - result = instance.query("SELECT count() FROM test_database.postgresql_replica_{}".format(i)) - print(result) # Just debug + # for i in range(NUM_TABLES): + # result = instance.query(f"SELECT count() FROM test_database.postgresql_replica_{i}") + # print(result) # Just debug 
started_cluster.unpause_container('postgres1') - - for i in range(NUM_TABLES): - check_tables_are_synchronized('postgresql_replica_{}'.format(i)); - - for i in range(NUM_TABLES): - result = instance.query("SELECT count() FROM test_database.postgresql_replica_{}".format(i)) - print(result) # Just debug - - drop_materialized_db() - for i in range(NUM_TABLES): - cursor.execute('drop table if exists postgresql_replica_{};'.format(i)) + check_several_tables_are_synchronized(instance, NUM_TABLES) def test_drop_database_while_replication_startup_not_finished(started_cluster): - drop_materialized_db() - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, - database=True) - cursor = conn.cursor() NUM_TABLES = 5 - - for i in range(NUM_TABLES): - table_name = 'postgresql_replica_{}'.format(i) - create_postgres_table(cursor, table_name); - instance.query("INSERT INTO postgres_database.{} SELECT number, number from numbers(100000)".format(table_name)) - + pg_manager.create_and_fill_postgres_tables(NUM_TABLES, 100000) for i in range(6): - create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) + pg_manager.create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) time.sleep(0.5 * i) - drop_materialized_db() - - for i in range(NUM_TABLES): - cursor.execute('drop table if exists postgresql_replica_{};'.format(i)) + pg_manager.drop_materialized_db() def test_restart_server_while_replication_startup_not_finished(started_cluster): - drop_materialized_db() - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, - database=True) - cursor = conn.cursor() NUM_TABLES = 5 - - for i in range(NUM_TABLES): - table_name = 'postgresql_replica_{}'.format(i) - create_postgres_table(cursor, table_name); - instance.query("INSERT INTO postgres_database.{} SELECT number, number from numbers(100000)".format(table_name)) - - create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) - time.sleep(0.5) + pg_manager.create_and_fill_postgres_tables(NUM_TABLES, 100000) + pg_manager.create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) + time.sleep(1) instance.restart_clickhouse() - for i in range(NUM_TABLES): - check_tables_are_synchronized('postgresql_replica_{}'.format(i)); - - drop_materialized_db() - for i in range(NUM_TABLES): - cursor.execute('drop table postgresql_replica_{};'.format(i)) + check_several_tables_are_synchronized(instance, NUM_TABLES) def test_abrupt_server_restart_while_heavy_replication(started_cluster): - drop_materialized_db() - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, - database=True) - cursor = conn.cursor() - NUM_TABLES = 6 - - for i in range(NUM_TABLES): - create_postgres_table(cursor, 'postgresql_replica_{}'.format(i)); - def transaction(thread_id): if thread_id % 2: - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, + conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True, auto_commit=True) else: - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, + conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True, auto_commit=False) - cursor_ = conn.cursor() + cursor = conn.cursor() for query in queries: - cursor_.execute(query.format(thread_id)) + 
cursor.execute(query.format(thread_id)) print('thread {}, query {}'.format(thread_id, query)) if thread_id % 2 == 0: conn.commit() + NUM_TABLES = 6 + pg_manager.create_and_fill_postgres_tables(tables_num=NUM_TABLES, numbers=0) + threads = [] threads_num = 6 for i in range(threads_num): threads.append(threading.Thread(target=transaction, args=(i,))) - create_materialized_db(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port) + pg_manager.create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) for thread in threads: time.sleep(random.uniform(0, 0.5)) thread.start() - # Join here because it takes time for data to reach wal for thread in threads: - thread.join() + thread.join() # Join here because it takes time for data to reach wal + instance.restart_clickhouse() - - for i in range(NUM_TABLES): - result = instance.query("SELECT count() FROM test_database.postgresql_replica_{}".format(i)) - print(result) # Just debug - - for i in range(NUM_TABLES): - check_tables_are_synchronized('postgresql_replica_{}'.format(i)); - - for i in range(NUM_TABLES): - result = instance.query("SELECT count() FROM test_database.postgresql_replica_{}".format(i)) - print(result) # Just debug - - drop_materialized_db() - for i in range(NUM_TABLES): - cursor.execute('drop table if exists postgresql_replica_{};'.format(i)) + check_several_tables_are_synchronized(instance, NUM_TABLES) def test_quoting_1(started_cluster): - conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True) - cursor = conn.cursor() table_name = 'user' - create_postgres_table(cursor, table_name); - instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, number from numbers(50)") - create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) - check_tables_are_synchronized(table_name); - drop_materialized_db() - drop_postgres_table(cursor, table_name) + pg_manager.create_and_fill_postgres_table(table_name) + pg_manager.create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) + check_tables_are_synchronized(instance, table_name); def test_quoting_2(started_cluster): - conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True) - cursor = conn.cursor() table_name = 'user' - create_postgres_table(cursor, table_name); - instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, number from numbers(50)") - create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, - settings=[f"materialized_postgresql_tables_list = '{table_name}'"]) - check_tables_are_synchronized(table_name); - drop_materialized_db() - drop_postgres_table(cursor, table_name) + pg_manager.create_and_fill_postgres_table(table_name) + pg_manager.create_materialized_db( + ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, + settings=[f"materialized_postgresql_tables_list = '{table_name}'"]) + check_tables_are_synchronized(instance, table_name); def test_user_managed_slots(started_cluster): - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, - database=True) - cursor = conn.cursor() - table_name = 'test_table' - create_postgres_table(cursor, table_name); - instance.query("INSERT INTO postgres_database.{} SELECT number, number from numbers(10000)".format(table_name)) - slot_name = 'user_slot' - replication_connection = 
get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, - database=True, replication=True, auto_commit=True) + table_name = 'test_table' + pg_manager.create_and_fill_postgres_table(table_name) + + replication_connection = get_postgres_conn( + ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, + database=True, replication=True, auto_commit=True) snapshot = create_replication_slot(replication_connection, slot_name=slot_name) - create_materialized_db(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, - settings=["materialized_postgresql_replication_slot = '{}'".format(slot_name), - "materialized_postgresql_snapshot = '{}'".format(snapshot)]) - check_tables_are_synchronized(table_name); + + pg_manager.create_materialized_db( + ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, + settings=[f"materialized_postgresql_replication_slot = '{slot_name}'", + f"materialized_postgresql_snapshot = '{snapshot}'"]) + check_tables_are_synchronized(instance, table_name); + instance.query("INSERT INTO postgres_database.{} SELECT number, number from numbers(10000, 10000)".format(table_name)) - check_tables_are_synchronized(table_name); + check_tables_are_synchronized(instance, table_name); + instance.restart_clickhouse() + instance.query("INSERT INTO postgres_database.{} SELECT number, number from numbers(20000, 10000)".format(table_name)) - check_tables_are_synchronized(table_name); - drop_postgres_table(cursor, table_name) - drop_materialized_db() + check_tables_are_synchronized(instance, table_name); + + pg_manager.drop_materialized_db() drop_replication_slot(replication_connection, slot_name) - cursor.execute('DROP TABLE IF EXISTS test_table') + replication_connection.close() if __name__ == '__main__': diff --git a/tests/integration/test_postgresql_replica_database_engine_2/test.py b/tests/integration/test_postgresql_replica_database_engine_2/test.py index 7aee454c4a9..3226c040e8e 100644 --- a/tests/integration/test_postgresql_replica_database_engine_2/test.py +++ b/tests/integration/test_postgresql_replica_database_engine_2/test.py @@ -12,235 +12,62 @@ from helpers.test_tools import TSV from random import randrange import threading +from helpers.postgres_utility import get_postgres_conn +from helpers.postgres_utility import PostgresManager + +from helpers.postgres_utility import create_replication_slot, drop_replication_slot +from helpers.postgres_utility import create_postgres_schema, drop_postgres_schema +from helpers.postgres_utility import create_postgres_table, drop_postgres_table +from helpers.postgres_utility import create_postgres_table_with_schema, drop_postgres_table_with_schema +from helpers.postgres_utility import check_tables_are_synchronized +from helpers.postgres_utility import check_several_tables_are_synchronized +from helpers.postgres_utility import assert_nested_table_is_created +from helpers.postgres_utility import assert_number_of_columns +from helpers.postgres_utility import postgres_table_template, postgres_table_template_2, postgres_table_template_3, postgres_table_template_4, postgres_table_template_5 +from helpers.postgres_utility import queries + + cluster = ClickHouseCluster(__file__) instance = cluster.add_instance('instance', main_configs = ['configs/log_conf.xml'], user_configs = ['configs/users.xml'], with_postgres=True, stay_alive=True) -postgres_table_template = """ - CREATE TABLE IF NOT EXISTS "{}" ( - key Integer NOT NULL, value Integer, PRIMARY KEY(key)) - """ -postgres_table_template_2 
= """ - CREATE TABLE IF NOT EXISTS "{}" ( - key Integer NOT NULL, value1 Integer, value2 Integer, value3 Integer, PRIMARY KEY(key)) - """ -postgres_table_template_3 = """ - CREATE TABLE IF NOT EXISTS "{}" ( - key1 Integer NOT NULL, value1 Integer, key2 Integer NOT NULL, value2 Integer NOT NULL) - """ -postgres_table_template_4 = """ - CREATE TABLE IF NOT EXISTS "{}"."{}" ( - key Integer NOT NULL, value Integer, PRIMARY KEY(key)) - """ -postgres_table_template_5 = """ - CREATE TABLE IF NOT EXISTS "{}" ( - key Integer NOT NULL, value UUID, PRIMARY KEY(key)) - """ - -def get_postgres_conn(ip, port, database=False, auto_commit=True, database_name='postgres_database', replication=False): - if database == True: - conn_string = "host={} port={} dbname='{}' user='postgres' password='mysecretpassword'".format(ip, port, database_name) - else: - conn_string = "host={} port={} user='postgres' password='mysecretpassword'".format(ip, port) - - if replication: - conn_string += " replication='database'" - - conn = psycopg2.connect(conn_string) - if auto_commit: - conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) - conn.autocommit = True - return conn - -def create_replication_slot(conn, slot_name='user_slot'): - cursor = conn.cursor() - cursor.execute('CREATE_REPLICATION_SLOT {} LOGICAL pgoutput EXPORT_SNAPSHOT'.format(slot_name)) - result = cursor.fetchall() - print(result[0][0]) # slot name - print(result[0][1]) # start lsn - print(result[0][2]) # snapshot - return result[0][2] - -def drop_replication_slot(conn, slot_name='user_slot'): - cursor = conn.cursor() - cursor.execute("select pg_drop_replication_slot('{}')".format(slot_name)) - -def create_postgres_db(cursor, name='postgres_database'): - cursor.execute("CREATE DATABASE {}".format(name)) - -def drop_postgres_db(cursor, name='postgres_database'): - cursor.execute("DROP DATABASE IF EXISTS {}".format(name)) - -def drop_postgres_schema(cursor, schema_name): - cursor.execute('DROP SCHEMA IF EXISTS {} CASCADE'.format(schema_name)) - -def create_postgres_schema(cursor, schema_name): - drop_postgres_schema(cursor, schema_name) - cursor.execute('CREATE SCHEMA {}'.format(schema_name)) - -def create_clickhouse_postgres_db(ip, port, name='postgres_database', database_name='postgres_database', schema_name=''): - drop_clickhouse_postgres_db(name) - if len(schema_name) == 0: - instance.query(''' - CREATE DATABASE {} - ENGINE = PostgreSQL('{}:{}', '{}', 'postgres', 'mysecretpassword')'''.format(name, ip, port, database_name)) - else: - instance.query(''' - CREATE DATABASE {} - ENGINE = PostgreSQL('{}:{}', '{}', 'postgres', 'mysecretpassword', '{}')'''.format(name, ip, port, database_name, schema_name)) - -def drop_clickhouse_postgres_db(name='postgres_database'): - instance.query('DROP DATABASE IF EXISTS {}'.format(name)) - -def create_materialized_db(ip, port, - materialized_database='test_database', - postgres_database='postgres_database', - settings=[], table_overrides=''): - instance.query(f"DROP DATABASE IF EXISTS {materialized_database}") - create_query = f"CREATE DATABASE {materialized_database} ENGINE = MaterializedPostgreSQL('{ip}:{port}', '{postgres_database}', 'postgres', 'mysecretpassword')" - if len(settings) > 0: - create_query += " SETTINGS " - for i in range(len(settings)): - if i != 0: - create_query += ', ' - create_query += settings[i] - create_query += table_overrides - instance.query(create_query) - assert materialized_database in instance.query('SHOW DATABASES') - -def drop_materialized_db(materialized_database='test_database'): - 
instance.query('DROP DATABASE IF EXISTS {}'.format(materialized_database)) - assert materialized_database not in instance.query('SHOW DATABASES') - -def drop_postgres_table(cursor, table_name): - cursor.execute("""DROP TABLE IF EXISTS "{}" """.format(table_name)) - -def drop_postgres_table_with_schema(cursor, schema_name, table_name): - cursor.execute("""DROP TABLE IF EXISTS "{}"."{}" """.format(schema_name, table_name)) - -def create_postgres_table(cursor, table_name, replica_identity_full=False, template=postgres_table_template): - drop_postgres_table(cursor, table_name) - cursor.execute(template.format(table_name)) - if replica_identity_full: - cursor.execute('ALTER TABLE {} REPLICA IDENTITY FULL;'.format(table_name)) - -def create_postgres_table_with_schema(cursor, schema_name, table_name): - drop_postgres_table_with_schema(cursor, schema_name, table_name) - cursor.execute(postgres_table_template_4.format(schema_name, table_name)) - -queries = [ - 'INSERT INTO postgresql_replica_{} select i, i from generate_series(0, 10000) as t(i);', - 'DELETE FROM postgresql_replica_{} WHERE (value*value) % 3 = 0;', - 'UPDATE postgresql_replica_{} SET value = value - 125 WHERE key % 2 = 0;', - "UPDATE postgresql_replica_{} SET key=key+20000 WHERE key%2=0", - 'INSERT INTO postgresql_replica_{} select i, i from generate_series(40000, 50000) as t(i);', - 'DELETE FROM postgresql_replica_{} WHERE key % 10 = 0;', - 'UPDATE postgresql_replica_{} SET value = value + 101 WHERE key % 2 = 1;', - "UPDATE postgresql_replica_{} SET key=key+80000 WHERE key%2=1", - 'DELETE FROM postgresql_replica_{} WHERE value % 2 = 0;', - 'UPDATE postgresql_replica_{} SET value = value + 2000 WHERE key % 5 = 0;', - 'INSERT INTO postgresql_replica_{} select i, i from generate_series(200000, 250000) as t(i);', - 'DELETE FROM postgresql_replica_{} WHERE value % 3 = 0;', - 'UPDATE postgresql_replica_{} SET value = value * 2 WHERE key % 3 = 0;', - "UPDATE postgresql_replica_{} SET key=key+500000 WHERE key%2=1", - 'INSERT INTO postgresql_replica_{} select i, i from generate_series(1000000, 1050000) as t(i);', - 'DELETE FROM postgresql_replica_{} WHERE value % 9 = 2;', - "UPDATE postgresql_replica_{} SET key=key+10000000", - 'UPDATE postgresql_replica_{} SET value = value + 2 WHERE key % 3 = 1;', - 'DELETE FROM postgresql_replica_{} WHERE value%5 = 0;' - ] - - -def assert_nested_table_is_created(table_name, materialized_database='test_database', schema_name=''): - if len(schema_name) == 0: - table = table_name - else: - table = schema_name + "." 
+ table_name - print(f'Checking table {table} exists in {materialized_database}') - database_tables = instance.query('SHOW TABLES FROM {}'.format(materialized_database)) - while table not in database_tables: - time.sleep(0.2) - database_tables = instance.query('SHOW TABLES FROM {}'.format(materialized_database)) - assert(table in database_tables) - - -def assert_number_of_columns(expected, table_name, database_name='test_database'): - result = instance.query(f"select count() from system.columns where table = '{table_name}' and database = '{database_name}' and not startsWith(name, '_')") - while (int(result) != expected): - time.sleep(1) - result = instance.query(f"select count() from system.columns where table = '{table_name}' and database = '{database_name}' and not startsWith(name, '_')") - print('Number of columns ok') - - -@pytest.mark.timeout(320) -def check_tables_are_synchronized(table_name, order_by='key', postgres_database='postgres_database', materialized_database='test_database', schema_name=''): - assert_nested_table_is_created(table_name, materialized_database, schema_name) - - print(f"Checking table is synchronized. Table name: {table_name}, table schema: {schema_name}") - expected = instance.query('select * from {}.{} order by {};'.format(postgres_database, table_name, order_by)) - if len(schema_name) == 0: - result = instance.query('select * from {}.{} order by {};'.format(materialized_database, table_name, order_by)) - else: - result = instance.query('select * from {}.`{}.{}` order by {};'.format(materialized_database, schema_name, table_name, order_by)) - - try_num = 0 - while result != expected: - time.sleep(0.5) - if len(schema_name) == 0: - result = instance.query('select * from {}.{} order by {};'.format(materialized_database, table_name, order_by)) - else: - result = instance.query('select * from {}.`{}.{}` order by {};'.format(materialized_database, schema_name, table_name, order_by)) - try_num += 1 - if try_num > 30: - break - - assert(result == expected) +pg_manager = PostgresManager() @pytest.fixture(scope="module") def started_cluster(): try: cluster.start() - conn = get_postgres_conn(ip=cluster.postgres_ip, port=cluster.postgres_port) - cursor = conn.cursor() - create_postgres_db(cursor, 'postgres_database') - create_clickhouse_postgres_db(ip=cluster.postgres_ip, port=cluster.postgres_port) - - instance.query("DROP DATABASE IF EXISTS test_database") + pg_manager.init(instance, cluster.postgres_ip, cluster.postgres_port) yield cluster finally: cluster.shutdown() +@pytest.fixture(autouse=True) +def setup_teardown(): + print("PostgreSQL is available - running test") + yield # run test + pg_manager.restart() + + def test_add_new_table_to_replication(started_cluster): - drop_materialized_db() - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, - database=True) - cursor = conn.cursor() + cursor = pg_manager.get_db_cursor() cursor.execute('DROP TABLE IF EXISTS test_table') NUM_TABLES = 5 - for i in range(NUM_TABLES): - create_postgres_table(cursor, 'postgresql_replica_{}'.format(i)); - instance.query("INSERT INTO postgres_database.postgresql_replica_{} SELECT number, {} from numbers(10000)".format(i, i)) - - create_materialized_db(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port) - - for i in range(NUM_TABLES): - table_name = 'postgresql_replica_{}'.format(i) - check_tables_are_synchronized(table_name); + pg_manager.create_and_fill_postgres_tables_from_cursor(cursor, NUM_TABLES, 10000) + 
pg_manager.create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) + check_several_tables_are_synchronized(instance, NUM_TABLES) result = instance.query("SHOW TABLES FROM test_database") assert(result == "postgresql_replica_0\npostgresql_replica_1\npostgresql_replica_2\npostgresql_replica_3\npostgresql_replica_4\n") table_name = 'postgresql_replica_5' - create_postgres_table(cursor, table_name) - instance.query("INSERT INTO postgres_database.{} SELECT number, number from numbers(10000)".format(table_name)) + pg_manager.create_and_fill_postgres_table_from_cursor(cursor, table_name) result = instance.query('SHOW CREATE DATABASE test_database') assert(result[:63] == "CREATE DATABASE test_database\\nENGINE = MaterializedPostgreSQL(") # Check without ip @@ -252,16 +79,16 @@ def test_add_new_table_to_replication(started_cluster): result = instance.query_and_get_error("ALTER DATABASE test_database MODIFY SETTING materialized_postgresql_tables='tabl1'") assert('Database engine MaterializedPostgreSQL does not support setting' in result) - instance.query("ATTACH TABLE test_database.{}".format(table_name)); + instance.query(f"ATTACH TABLE test_database.{table_name}"); result = instance.query("SHOW TABLES FROM test_database") assert(result == "postgresql_replica_0\npostgresql_replica_1\npostgresql_replica_2\npostgresql_replica_3\npostgresql_replica_4\npostgresql_replica_5\n") - check_tables_are_synchronized(table_name); - instance.query("INSERT INTO postgres_database.{} SELECT number, number from numbers(10000, 10000)".format(table_name)) - check_tables_are_synchronized(table_name); + check_tables_are_synchronized(instance, table_name); + instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, number from numbers(10000, 10000)") + check_tables_are_synchronized(instance, table_name); - result = instance.query_and_get_error("ATTACH TABLE test_database.{}".format(table_name)); + result = instance.query_and_get_error(f"ATTACH TABLE test_database.{table_name}"); assert('Table test_database.postgresql_replica_5 already exists' in result) result = instance.query_and_get_error("ATTACH TABLE test_database.unknown_table"); @@ -274,14 +101,14 @@ def test_add_new_table_to_replication(started_cluster): table_name = 'postgresql_replica_6' create_postgres_table(cursor, table_name) instance.query("INSERT INTO postgres_database.{} SELECT number, number from numbers(10000)".format(table_name)) - instance.query("ATTACH TABLE test_database.{}".format(table_name)); + instance.query(f"ATTACH TABLE test_database.{table_name}"); instance.restart_clickhouse() table_name = 'postgresql_replica_7' create_postgres_table(cursor, table_name) instance.query("INSERT INTO postgres_database.{} SELECT number, number from numbers(10000)".format(table_name)) - instance.query("ATTACH TABLE test_database.{}".format(table_name)); + instance.query(f"ATTACH TABLE test_database.{table_name}"); result = instance.query('SHOW CREATE DATABASE test_database') assert(result[:63] == "CREATE DATABASE test_database\\nENGINE = MaterializedPostgreSQL(") @@ -289,33 +116,14 @@ def test_add_new_table_to_replication(started_cluster): result = instance.query("SHOW TABLES FROM test_database") assert(result == "postgresql_replica_0\npostgresql_replica_1\npostgresql_replica_2\npostgresql_replica_3\npostgresql_replica_4\npostgresql_replica_5\npostgresql_replica_6\npostgresql_replica_7\n") + check_several_tables_are_synchronized(instance, NUM_TABLES + 3) - for i in range(NUM_TABLES + 3): - table_name = 
'postgresql_replica_{}'.format(i) - check_tables_are_synchronized(table_name); - - for i in range(NUM_TABLES + 3): - cursor.execute('drop table if exists postgresql_replica_{};'.format(i)) def test_remove_table_from_replication(started_cluster): - drop_materialized_db() - conn = get_postgres_conn(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port, - database=True) - cursor = conn.cursor() - cursor.execute('DROP TABLE IF EXISTS test_table') NUM_TABLES = 5 - - for i in range(NUM_TABLES): - create_postgres_table(cursor, 'postgresql_replica_{}'.format(i)); - instance.query("INSERT INTO postgres_database.postgresql_replica_{} SELECT number, {} from numbers(10000)".format(i, i)) - - create_materialized_db(ip=started_cluster.postgres_ip, - port=started_cluster.postgres_port) - - for i in range(NUM_TABLES): - table_name = 'postgresql_replica_{}'.format(i) - check_tables_are_synchronized(table_name); + pg_manager.create_and_fill_postgres_tables(NUM_TABLES, 10000) + pg_manager.create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port) + check_several_tables_are_synchronized(instance, NUM_TABLES) result = instance.query("SHOW TABLES FROM test_database") assert(result == "postgresql_replica_0\npostgresql_replica_1\npostgresql_replica_2\npostgresql_replica_3\npostgresql_replica_4\n") @@ -325,8 +133,8 @@ def test_remove_table_from_replication(started_cluster): assert(result[-59:] == "\\'postgres_database\\', \\'postgres\\', \\'mysecretpassword\\')\n") table_name = 'postgresql_replica_4' - instance.query('DETACH TABLE test_database.{}'.format(table_name)); - result = instance.query_and_get_error('SELECT * FROM test_database.{}'.format(table_name)) + instance.query(f'DETACH TABLE test_database.{table_name}'); + result = instance.query_and_get_error(f'SELECT * FROM test_database.{table_name}') assert("doesn't exist" in result) result = instance.query("SHOW TABLES FROM test_database") @@ -336,52 +144,42 @@ def test_remove_table_from_replication(started_cluster): assert(result[:63] == "CREATE DATABASE test_database\\nENGINE = MaterializedPostgreSQL(") assert(result[-138:] == ")\\nSETTINGS materialized_postgresql_tables_list = \\'postgresql_replica_0,postgresql_replica_1,postgresql_replica_2,postgresql_replica_3\\'\n") - instance.query('ATTACH TABLE test_database.{}'.format(table_name)); - check_tables_are_synchronized(table_name); - - for i in range(NUM_TABLES): - table_name = 'postgresql_replica_{}'.format(i) - check_tables_are_synchronized(table_name); + instance.query(f'ATTACH TABLE test_database.{table_name}'); + check_tables_are_synchronized(instance, table_name); + check_several_tables_are_synchronized(instance, NUM_TABLES) result = instance.query('SHOW CREATE DATABASE test_database') assert(result[:63] == "CREATE DATABASE test_database\\nENGINE = MaterializedPostgreSQL(") assert(result[-159:] == ")\\nSETTINGS materialized_postgresql_tables_list = \\'postgresql_replica_0,postgresql_replica_1,postgresql_replica_2,postgresql_replica_3,postgresql_replica_4\\'\n") table_name = 'postgresql_replica_1' - instance.query('DETACH TABLE test_database.{}'.format(table_name)); + instance.query(f'DETACH TABLE test_database.{table_name}'); result = instance.query('SHOW CREATE DATABASE test_database') assert(result[:63] == "CREATE DATABASE test_database\\nENGINE = MaterializedPostgreSQL(") assert(result[-138:] == ")\\nSETTINGS materialized_postgresql_tables_list = \\'postgresql_replica_0,postgresql_replica_2,postgresql_replica_3,postgresql_replica_4\\'\n") - for i in 
range(NUM_TABLES): - cursor.execute('drop table if exists postgresql_replica_{};'.format(i)) + cursor = pg_manager.get_db_cursor() + cursor.execute(f'drop table if exists postgresql_replica_0;') # Removing from replication table which does not exist in PostgreSQL must be ok. instance.query('DETACH TABLE test_database.postgresql_replica_0'); assert instance.contains_in_log("from publication, because table does not exist in PostgreSQL") - drop_materialized_db() def test_predefined_connection_configuration(started_cluster): - drop_materialized_db() - conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True) - cursor = conn.cursor() + cursor = pg_manager.get_db_cursor() cursor.execute(f'DROP TABLE IF EXISTS test_table') cursor.execute(f'CREATE TABLE test_table (key integer PRIMARY KEY, value integer)') cursor.execute(f'INSERT INTO test_table SELECT 1, 2') - instance.query("CREATE DATABASE test_database ENGINE = MaterializedPostgreSQL(postgres1) SETTINGS materialized_postgresql_tables_list='test_table'") - check_tables_are_synchronized("test_table"); - drop_materialized_db() - cursor.execute('DROP TABLE IF EXISTS test_table') + check_tables_are_synchronized(instance, "test_table"); + pg_manager.drop_materialized_db() insert_counter = 0 def test_database_with_single_non_default_schema(started_cluster): - conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True) - cursor = conn.cursor() - + cursor = pg_manager.get_db_cursor() NUM_TABLES=5 schema_name = 'test_schema' materialized_db = 'test_database' @@ -405,18 +203,17 @@ def test_database_with_single_non_default_schema(started_cluster): def check_all_tables_are_synchronized(): for i in range(NUM_TABLES): print('checking table', i) - check_tables_are_synchronized("postgresql_replica_{}".format(i), postgres_database=clickhouse_postgres_db); + check_tables_are_synchronized(instance, f"postgresql_replica_{i}", postgres_database=clickhouse_postgres_db); print('synchronization Ok') create_postgres_schema(cursor, schema_name) - create_clickhouse_postgres_db(ip=cluster.postgres_ip, port=cluster.postgres_port, name=clickhouse_postgres_db, schema_name=schema_name) + pg_manager.create_clickhouse_postgres_db(ip=cluster.postgres_ip, port=cluster.postgres_port, name=clickhouse_postgres_db, schema_name=schema_name) for i in range(NUM_TABLES): - table_name = 'postgresql_replica_{}'.format(i) - create_postgres_table_with_schema(cursor, schema_name, table_name); + create_postgres_table_with_schema(cursor, schema_name, f'postgresql_replica_{i}'); insert_into_tables() - create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, + pg_manager.create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, settings=[f"materialized_postgresql_schema = '{schema_name}'", "materialized_postgresql_allow_automatic_update = 1"]) insert_into_tables() @@ -434,22 +231,19 @@ def test_database_with_single_non_default_schema(started_cluster): cursor.execute("ALTER TABLE test_schema.postgresql_replica_{} ADD COLUMN value2 integer".format(altered_table)) instance.query(f"INSERT INTO {clickhouse_postgres_db}.postgresql_replica_{altered_table} SELECT number, number, number from numbers(5000, 1000)") - assert_number_of_columns(3, f'postgresql_replica_{altered_table}') - check_tables_are_synchronized(f"postgresql_replica_{altered_table}", postgres_database=clickhouse_postgres_db); + assert_number_of_columns(instance, 3, 
f'postgresql_replica_{altered_table}') + check_tables_are_synchronized(instance, f"postgresql_replica_{altered_table}", postgres_database=clickhouse_postgres_db); print('DETACH-ATTACH') detached_table_name = "postgresql_replica_1" instance.query(f"DETACH TABLE {materialized_db}.{detached_table_name}") assert not instance.contains_in_log("from publication, because table does not exist in PostgreSQL") instance.query(f"ATTACH TABLE {materialized_db}.{detached_table_name}") - check_tables_are_synchronized(detached_table_name, postgres_database=clickhouse_postgres_db); - - drop_materialized_db() + check_tables_are_synchronized(instance, detached_table_name, postgres_database=clickhouse_postgres_db); def test_database_with_multiple_non_default_schemas_1(started_cluster): - conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True) - cursor = conn.cursor() + cursor = pg_manager.get_db_cursor() NUM_TABLES = 5 schema_name = 'test_schema' @@ -475,11 +269,11 @@ def test_database_with_multiple_non_default_schemas_1(started_cluster): def check_all_tables_are_synchronized(): for i in range(NUM_TABLES): print('checking table', i) - check_tables_are_synchronized("postgresql_replica_{}".format(i), schema_name=schema_name, postgres_database=clickhouse_postgres_db); + check_tables_are_synchronized(instance, "postgresql_replica_{}".format(i), schema_name=schema_name, postgres_database=clickhouse_postgres_db); print('synchronization Ok') create_postgres_schema(cursor, schema_name) - create_clickhouse_postgres_db(ip=cluster.postgres_ip, port=cluster.postgres_port, name=clickhouse_postgres_db, schema_name=schema_name) + pg_manager.create_clickhouse_postgres_db(ip=cluster.postgres_ip, port=cluster.postgres_port, name=clickhouse_postgres_db, schema_name=schema_name) for i in range(NUM_TABLES): table_name = 'postgresql_replica_{}'.format(i) @@ -489,7 +283,7 @@ def test_database_with_multiple_non_default_schemas_1(started_cluster): publication_tables += schema_name + '.' 
+ table_name insert_into_tables() - create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, + pg_manager.create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, settings=[f"materialized_postgresql_tables_list = '{publication_tables}'", "materialized_postgresql_tables_list_with_schema=1", "materialized_postgresql_allow_automatic_update = 1"]) check_all_tables_are_synchronized() @@ -507,8 +301,8 @@ def test_database_with_multiple_non_default_schemas_1(started_cluster): cursor.execute("ALTER TABLE test_schema.postgresql_replica_{} ADD COLUMN value2 integer".format(altered_table)) instance.query(f"INSERT INTO {clickhouse_postgres_db}.postgresql_replica_{altered_table} SELECT number, number, number from numbers(5000, 1000)") - assert_number_of_columns(3, f'{schema_name}.postgresql_replica_{altered_table}') - check_tables_are_synchronized(f"postgresql_replica_{altered_table}", schema_name=schema_name, postgres_database=clickhouse_postgres_db); + assert_number_of_columns(instance, 3, f'{schema_name}.postgresql_replica_{altered_table}') + check_tables_are_synchronized(instance, f"postgresql_replica_{altered_table}", schema_name=schema_name, postgres_database=clickhouse_postgres_db); print('DETACH-ATTACH') detached_table_name = "postgresql_replica_1" @@ -516,15 +310,11 @@ def test_database_with_multiple_non_default_schemas_1(started_cluster): assert not instance.contains_in_log("from publication, because table does not exist in PostgreSQL") instance.query(f"ATTACH TABLE {materialized_db}.`{schema_name}.{detached_table_name}`") assert_show_tables("test_schema.postgresql_replica_0\ntest_schema.postgresql_replica_1\ntest_schema.postgresql_replica_2\ntest_schema.postgresql_replica_3\ntest_schema.postgresql_replica_4\n") - check_tables_are_synchronized(detached_table_name, schema_name=schema_name, postgres_database=clickhouse_postgres_db); - - drop_materialized_db() + check_tables_are_synchronized(instance, detached_table_name, schema_name=schema_name, postgres_database=clickhouse_postgres_db); def test_database_with_multiple_non_default_schemas_2(started_cluster): - conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True) - cursor = conn.cursor() - + cursor = pg_manager.get_db_cursor() NUM_TABLES = 2 schemas_num = 2 schema_list = 'schema0, schema1' @@ -539,7 +329,7 @@ def test_database_with_multiple_non_default_schemas_2(started_cluster): for ti in range(NUM_TABLES): table_name = f'postgresql_replica_{ti}' print(f'checking table {schema_name}.{table_name}') - check_tables_are_synchronized(f'{table_name}', schema_name=schema_name, postgres_database=clickhouse_postgres_db); + check_tables_are_synchronized(instance, f'{table_name}', schema_name=schema_name, postgres_database=clickhouse_postgres_db); print('synchronized Ok') def insert_into_tables(): @@ -560,14 +350,16 @@ def test_database_with_multiple_non_default_schemas_2(started_cluster): schema_name = f'schema{i}' clickhouse_postgres_db = f'clickhouse_postgres_db{i}' create_postgres_schema(cursor, schema_name) - create_clickhouse_postgres_db(ip=cluster.postgres_ip, port=cluster.postgres_port, name=clickhouse_postgres_db, schema_name=schema_name) + pg_manager.create_clickhouse_postgres_db(ip=cluster.postgres_ip, port=cluster.postgres_port, name=clickhouse_postgres_db, schema_name=schema_name) for ti in range(NUM_TABLES): table_name = f'postgresql_replica_{ti}' create_postgres_table_with_schema(cursor, schema_name, table_name); 
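# Illustrative aside (not part of the patch): the check_tables_are_synchronized()
# helpers used throughout these MaterializedPostgreSQL tests reduce to polling the
# materialized table until its contents match the source table read through the
# auxiliary PostgreSQL database engine. A minimal sketch of that idea, assuming an
# `instance` object with a query() method like the one used in these tests:
import time

def wait_until_synchronized(instance, table, source_db='postgres_database',
                            materialized_db='test_database', timeout=60):
    expected = instance.query(f'SELECT * FROM {source_db}.{table} ORDER BY key')
    deadline = time.time() + timeout
    while time.time() < deadline:
        # Re-read the materialized side until it converges on the source data.
        if instance.query(f'SELECT * FROM {materialized_db}.{table} ORDER BY key') == expected:
            return
        time.sleep(0.5)
    raise AssertionError(f'{table} did not synchronize within {timeout}s')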
insert_into_tables() - create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, - settings=[f"materialized_postgresql_schema_list = '{schema_list}'", "materialized_postgresql_allow_automatic_update = 1"]) + pg_manager.create_materialized_db( + ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, + settings=[f"materialized_postgresql_schema_list = '{schema_list}'", + "materialized_postgresql_allow_automatic_update = 1"]) check_all_tables_are_synchronized() insert_into_tables() @@ -586,8 +378,8 @@ def test_database_with_multiple_non_default_schemas_2(started_cluster): cursor.execute(f"ALTER TABLE schema{altered_schema}.postgresql_replica_{altered_table} ADD COLUMN value2 integer") instance.query(f"INSERT INTO clickhouse_postgres_db{altered_schema}.postgresql_replica_{altered_table} SELECT number, number, number from numbers(1000 * {insert_counter}, 1000)") - assert_number_of_columns(3, f'schema{altered_schema}.postgresql_replica_{altered_table}') - check_tables_are_synchronized(f"postgresql_replica_{altered_table}", schema_name=f"schema{altered_schema}", postgres_database=clickhouse_postgres_db); + assert_number_of_columns(instance, 3, f'schema{altered_schema}.postgresql_replica_{altered_table}') + check_tables_are_synchronized(instance, f"postgresql_replica_{altered_table}", schema_name=f"schema{altered_schema}", postgres_database=clickhouse_postgres_db); print('DETACH-ATTACH') detached_table_name = "postgresql_replica_1" @@ -597,23 +389,22 @@ def test_database_with_multiple_non_default_schemas_2(started_cluster): assert not instance.contains_in_log("from publication, because table does not exist in PostgreSQL") instance.query(f"ATTACH TABLE {materialized_db}.`{detached_table_schema}.{detached_table_name}`") assert_show_tables("schema0.postgresql_replica_0\nschema0.postgresql_replica_1\nschema1.postgresql_replica_0\nschema1.postgresql_replica_1\n") - check_tables_are_synchronized(f"postgresql_replica_{altered_table}", schema_name=detached_table_schema, postgres_database=clickhouse_postgres_db); - - drop_materialized_db() + check_tables_are_synchronized(instance, f"postgresql_replica_{altered_table}", schema_name=detached_table_schema, postgres_database=clickhouse_postgres_db); def test_table_override(started_cluster): - conn = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, database=True) - cursor = conn.cursor() + cursor = pg_manager.get_db_cursor() table_name = 'table_override' materialized_database = 'test_database' create_postgres_table(cursor, table_name, template=postgres_table_template_5); instance.query(f"create table {table_name}(key Int32, value UUID) engine = PostgreSQL (postgres1, table={table_name})") instance.query(f"insert into {table_name} select number, generateUUIDv4() from numbers(10)") table_overrides = f" TABLE OVERRIDE {table_name} (COLUMNS (key Int32, value UUID))" - create_materialized_db(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, - settings=[f"materialized_postgresql_tables_list = '{table_name}'"], table_overrides=table_overrides) - assert_nested_table_is_created(table_name, materialized_database) + pg_manager.create_materialized_db( + ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, + settings=[f"materialized_postgresql_tables_list = '{table_name}'"], + table_overrides=table_overrides) + assert_nested_table_is_created(instance, table_name, materialized_database) result = instance.query(f"show create table 
{materialized_database}.{table_name}") print(result) expected = "CREATE TABLE test_database.table_override\\n(\\n `key` Int32,\\n `value` UUID,\\n `_sign` Int8() MATERIALIZED 1,\\n `_version` UInt64() MATERIALIZED 1\\n)\\nENGINE = ReplacingMergeTree(_version)\\nORDER BY tuple(key)" @@ -621,9 +412,50 @@ def test_table_override(started_cluster): time.sleep(5) query = f"select * from {materialized_database}.{table_name} order by key" expected = instance.query(f"select * from {table_name} order by key") + instance.query(f"drop table {table_name} no delay") assert_eq_with_retry(instance, query, expected) - drop_materialized_db() - drop_postgres_table(cursor, table_name) + + +def test_table_schema_changes_2(started_cluster): + cursor = pg_manager.get_db_cursor() + table_name = "test_table" + + create_postgres_table(cursor, table_name, template=postgres_table_template_2); + instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, number, number, number from numbers(25)") + + pg_manager.create_materialized_db( + ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, + settings=["materialized_postgresql_allow_automatic_update = 1, materialized_postgresql_tables_list='test_table'"]) + + instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, number, number, number from numbers(25, 25)") + check_tables_are_synchronized(instance, table_name); + + cursor.execute(f"ALTER TABLE {table_name} DROP COLUMN value1") + cursor.execute(f"ALTER TABLE {table_name} DROP COLUMN value2") + cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN value1 Text") + cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN value2 Text") + cursor.execute(f"ALTER TABLE {table_name} DROP COLUMN value3") + cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN value3 Text") + cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN value4 Text") + cursor.execute(f"UPDATE {table_name} SET value3 = 'kek' WHERE key%2=0") + check_tables_are_synchronized(instance, table_name); + instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, toString(number), toString(number), toString(number), toString(number) from numbers(50, 25)") + cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN value5 Integer") + cursor.execute(f"ALTER TABLE {table_name} DROP COLUMN value2") + instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, toString(number), toString(number), toString(number), number from numbers(75, 25)") + check_tables_are_synchronized(instance, table_name); + instance.restart_clickhouse() + check_tables_are_synchronized(instance, table_name); + cursor.execute(f"ALTER TABLE {table_name} DROP COLUMN value5") + cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN value5 Text") + instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, toString(number), toString(number), toString(number), toString(number) from numbers(100, 25)") + check_tables_are_synchronized(instance, table_name); + cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN value6 Text") + cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN value7 Integer") + cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN value8 Integer") + cursor.execute(f"ALTER TABLE {table_name} DROP COLUMN value5") + instance.query(f"INSERT INTO postgres_database.{table_name} SELECT number, toString(number), toString(number), toString(number), toString(number), number, number from numbers(125, 25)") + check_tables_are_synchronized(instance, table_name); if __name__ == '__main__': diff --git 
a/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml b/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml index 5d10ac0d959..181144b0473 100644 --- a/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml +++ b/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml @@ -7,18 +7,21 @@ http://minio1:9001/root/data/ minio minio123 + true s3 http://minio1:9001/root/data/ minio minio123 + true s3 http://minio1:9001/root/data2/ minio minio123 + true diff --git a/tests/integration/test_s3_zero_copy_replication/test.py b/tests/integration/test_s3_zero_copy_replication/test.py index 1c3713c02a2..fb30a83877b 100644 --- a/tests/integration/test_s3_zero_copy_replication/test.py +++ b/tests/integration/test_s3_zero_copy_replication/test.py @@ -32,11 +32,30 @@ def get_large_objects_count(cluster, size=100, folder='data'): minio = cluster.minio_client counter = 0 for obj in minio.list_objects(cluster.minio_bucket, '{}/'.format(folder)): - if obj.size >= size: + if obj.size is not None and obj.size >= size: counter = counter + 1 return counter +def check_objects_exisis(cluster, object_list, folder='data'): + minio = cluster.minio_client + for obj in object_list: + if obj: + minio.stat_object(cluster.minio_bucket, '{}/{}'.format(folder, obj)) + + +def check_objects_not_exisis(cluster, object_list, folder='data'): + minio = cluster.minio_client + for obj in object_list: + if obj: + try: + minio.stat_object(cluster.minio_bucket, '{}/{}'.format(folder, obj)) + except Exception as error: + assert "NoSuchKey" in str(error) + else: + assert False, "Object {} should not be exists".format(obj) + + def wait_for_large_objects_count(cluster, expected, size=100, timeout=30): while timeout > 0: if get_large_objects_count(cluster, size=size) == expected: @@ -266,6 +285,138 @@ def test_s3_zero_copy_with_ttl_delete(cluster, large_data, iterations): node2.query("DROP TABLE IF EXISTS ttl_delete_test NO DELAY") +def wait_mutations(node, table, seconds): + time.sleep(1) + while seconds > 0: + seconds -= 1 + mutations = node.query(f"SELECT count() FROM system.mutations WHERE table='{table}' AND is_done=0") + if mutations == '0\n': + return + time.sleep(1) + mutations = node.query(f"SELECT count() FROM system.mutations WHERE table='{table}' AND is_done=0") + assert mutations == '0\n' + + +def test_s3_zero_copy_unfreeze(cluster): + node1 = cluster.instances["node1"] + node2 = cluster.instances["node2"] + + node1.query("DROP TABLE IF EXISTS unfreeze_test NO DELAY") + node2.query("DROP TABLE IF EXISTS unfreeze_test NO DELAY") + + node1.query( + """ + CREATE TABLE unfreeze_test ON CLUSTER test_cluster (d UInt64) + ENGINE=ReplicatedMergeTree('/clickhouse/tables/unfreeze_test', '{}') + ORDER BY d + SETTINGS storage_policy='s3' + """ + .format('{replica}') + ) + + node1.query("INSERT INTO unfreeze_test VALUES (0)") + + node1.query("ALTER TABLE unfreeze_test FREEZE WITH NAME 'freeze_backup1'") + node2.query("ALTER TABLE unfreeze_test FREEZE WITH NAME 'freeze_backup2'") + wait_mutations(node1, "unfreeze_test", 10) + wait_mutations(node2, "unfreeze_test", 10) + + objects01 = node1.get_backuped_s3_objects("s31", "freeze_backup1") + objects02 = node2.get_backuped_s3_objects("s31", "freeze_backup2") + + assert objects01 == objects02 + + check_objects_exisis(cluster, objects01) + + node1.query("TRUNCATE TABLE unfreeze_test") + + objects11 = node1.get_backuped_s3_objects("s31", "freeze_backup1") + objects12 = node2.get_backuped_s3_objects("s31", "freeze_backup2") + + 
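# Illustrative aside (not part of the patch): the check_objects_exisis /
# check_objects_not_exisis helpers added above rely on MinIO's stat_object
# raising for missing keys. A standalone sketch of the same check, assuming the
# minio-py 7.x SDK; bucket and key names are placeholders:
from minio import Minio
from minio.error import S3Error

def object_exists(client: Minio, bucket: str, key: str) -> bool:
    try:
        client.stat_object(bucket, key)   # raises S3Error with code NoSuchKey if absent
        return True
    except S3Error as err:
        if err.code == 'NoSuchKey':
            return False
        raise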
assert objects01 == objects11 + assert objects01 == objects12 + + check_objects_exisis(cluster, objects11) + + node1.query("ALTER TABLE unfreeze_test UNFREEZE WITH NAME 'freeze_backup1'") + wait_mutations(node1, "unfreeze_test", 10) + + check_objects_exisis(cluster, objects12) + + node2.query("ALTER TABLE unfreeze_test UNFREEZE WITH NAME 'freeze_backup2'") + wait_mutations(node2, "unfreeze_test", 10) + + check_objects_not_exisis(cluster, objects12) + + node1.query("DROP TABLE IF EXISTS unfreeze_test NO DELAY") + node2.query("DROP TABLE IF EXISTS unfreeze_test NO DELAY") + + +def test_s3_zero_copy_drop_detached(cluster): + node1 = cluster.instances["node1"] + node2 = cluster.instances["node2"] + + node1.query("DROP TABLE IF EXISTS drop_detached_test NO DELAY") + node2.query("DROP TABLE IF EXISTS drop_detached_test NO DELAY") + + node1.query( + """ + CREATE TABLE drop_detached_test ON CLUSTER test_cluster (d UInt64) + ENGINE=ReplicatedMergeTree('/clickhouse/tables/drop_detached_test', '{}') + ORDER BY d PARTITION BY d + SETTINGS storage_policy='s3' + """ + .format('{replica}') + ) + + node1.query("INSERT INTO drop_detached_test VALUES (0)") + node1.query("ALTER TABLE drop_detached_test FREEZE WITH NAME 'detach_backup1'") + node1.query("INSERT INTO drop_detached_test VALUES (1)") + node1.query("ALTER TABLE drop_detached_test FREEZE WITH NAME 'detach_backup2'") + + objects1 = node1.get_backuped_s3_objects("s31", "detach_backup1") + objects2 = node1.get_backuped_s3_objects("s31", "detach_backup2") + + objects_diff = list(set(objects2) - set(objects1)) + + node1.query("ALTER TABLE drop_detached_test UNFREEZE WITH NAME 'detach_backup2'") + node1.query("ALTER TABLE drop_detached_test UNFREEZE WITH NAME 'detach_backup1'") + + node1.query("ALTER TABLE drop_detached_test DETACH PARTITION '0'") + node1.query("ALTER TABLE drop_detached_test DETACH PARTITION '1'") + wait_mutations(node1, "drop_detached_test", 10) + wait_mutations(node2, "drop_detached_test", 10) + + check_objects_exisis(cluster, objects1) + check_objects_exisis(cluster, objects2) + + node2.query("ALTER TABLE drop_detached_test DROP DETACHED PARTITION '1'", settings={"allow_drop_detached": 1}) + wait_mutations(node1, "drop_detached_test", 10) + wait_mutations(node2, "drop_detached_test", 10) + + check_objects_exisis(cluster, objects1) + check_objects_exisis(cluster, objects2) + + node1.query("ALTER TABLE drop_detached_test DROP DETACHED PARTITION '1'", settings={"allow_drop_detached": 1}) + wait_mutations(node1, "drop_detached_test", 10) + wait_mutations(node2, "drop_detached_test", 10) + + check_objects_exisis(cluster, objects1) + check_objects_not_exisis(cluster, objects_diff) + + node1.query("ALTER TABLE drop_detached_test DROP DETACHED PARTITION '0'", settings={"allow_drop_detached": 1}) + wait_mutations(node1, "drop_detached_test", 10) + wait_mutations(node2, "drop_detached_test", 10) + + check_objects_exisis(cluster, objects1) + + node2.query("ALTER TABLE drop_detached_test DROP DETACHED PARTITION '0'", settings={"allow_drop_detached": 1}) + wait_mutations(node1, "drop_detached_test", 10) + wait_mutations(node2, "drop_detached_test", 10) + + check_objects_not_exisis(cluster, objects1) + + def test_s3_zero_copy_concurrent_merge(cluster): node1 = cluster.instances["node1"] node2 = cluster.instances["node2"] diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 33ce94a7a29..f317fb5429a 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ 
b/tests/integration/test_storage_hdfs/test.py @@ -310,6 +310,7 @@ def test_seekable_formats(started_cluster): result = node1.query(f"SELECT count() FROM {table_function}") assert(int(result) == 5000000) + def test_read_table_with_default(started_cluster): hdfs_api = started_cluster.hdfs_api @@ -322,6 +323,22 @@ def test_read_table_with_default(started_cluster): "select * from hdfs('hdfs://hdfs1:9000/simple_table_function', 'TSVWithNames', 'n UInt32, m UInt32 DEFAULT n * 2') FORMAT TSVWithNames") == output +def test_schema_inference(started_cluster): + node1.query(f"insert into table function hdfs('hdfs://hdfs1:9000/native', 'Native', 'a Int32, b String') SELECT number, randomString(100) FROM numbers(5000000)") + + result = node1.query(f"desc hdfs('hdfs://hdfs1:9000/native', 'Native')") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = node1.query(f"select count(*) from hdfs('hdfs://hdfs1:9000/native', 'Native')") + assert(int(result) == 5000000) + + node1.query(f"create table schema_inference engine=HDFS('hdfs://hdfs1:9000/native', 'Native')") + result = node1.query(f"desc schema_inference") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = node1.query(f"select count(*) from schema_inference") + assert(int(result) == 5000000) + def test_hdfsCluster(started_cluster): hdfs_api = started_cluster.hdfs_api diff --git a/tests/integration/test_storage_kafka/test.py b/tests/integration/test_storage_kafka/test.py index 1ee7f3cf125..a92dafa0b8a 100644 --- a/tests/integration/test_storage_kafka/test.py +++ b/tests/integration/test_storage_kafka/test.py @@ -445,15 +445,21 @@ def test_kafka_formats(kafka_cluster): # /src/Processors/Formats/IRowInputFormat.cpp:0: DB::IRowInputFormat::generate() @ 0x1de72710 in /usr/bin/clickhouse ], }, - # 'Template' : { - # 'data_sample' : [ - # '(id = 0, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)', - # # '(id = 1, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 2, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 3, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 4, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 5, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 6, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 7, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 8, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 9, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 10, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 11, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 12, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 13, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 14, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 15, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)', - # # '(id = 0, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)', - # # '' # tolerates - # ], - # 'extra_settings': ", format_template_row='template_row.format'" - # }, + 'CustomSeparated' : { + 'data_sample' : [ + '0\t0\tAM\t0.5\t1\n', + '1\t0\tAM\t0.5\t1\n2\t0\tAM\t0.5\t1\n3\t0\tAM\t0.5\t1\n4\t0\tAM\t0.5\t1\n5\t0\tAM\t0.5\t1\n6\t0\tAM\t0.5\t1\n7\t0\tAM\t0.5\t1\n8\t0\tAM\t0.5\t1\n9\t0\tAM\t0.5\t1\n10\t0\tAM\t0.5\t1\n11\t0\tAM\t0.5\t1\n12\t0\tAM\t0.5\t1\n13\t0\tAM\t0.5\t1\n14\t0\tAM\t0.5\t1\n15\t0\tAM\t0.5\t1\n', + '0\t0\tAM\t0.5\t1\n', + ], + }, + 'Template' : { + 'data_sample' : [ + '(id = 0, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)', + '(id = 1, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 2, blockNo = 0, val1 = "AM", val2 = 0.5, 
val3 = 1)\n(id = 3, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 4, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 5, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 6, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 7, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 8, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 9, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 10, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 11, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 12, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 13, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 14, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)\n(id = 15, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)', + '(id = 0, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)', + ], + 'extra_settings': ", format_template_row='template_row.format'" + }, 'Regexp': { 'data_sample': [ '(id = 0, blockNo = 0, val1 = "AM", val2 = 0.5, val3 = 1)', @@ -1498,6 +1504,13 @@ def test_kafka_flush_on_big_message(kafka_cluster): def test_kafka_virtual_columns(kafka_cluster): + admin_client = KafkaAdminClient(bootstrap_servers="localhost:{}".format(kafka_cluster.kafka_port)) + topic_config = { + # default retention, since predefined timestamp_ms is used. + 'retention.ms': '-1', + } + kafka_create_topic(admin_client, "virt1", config=topic_config) + instance.query(''' CREATE TABLE test.kafka (key UInt64, value UInt64) ENGINE = Kafka @@ -1530,6 +1543,13 @@ def test_kafka_virtual_columns(kafka_cluster): def test_kafka_virtual_columns_with_materialized_view(kafka_cluster): + admin_client = KafkaAdminClient(bootstrap_servers="localhost:{}".format(kafka_cluster.kafka_port)) + topic_config = { + # default retention, since predefined timestamp_ms is used. + 'retention.ms': '-1', + } + kafka_create_topic(admin_client, "virt2", config=topic_config) + instance.query(''' DROP TABLE IF EXISTS test.view; DROP TABLE IF EXISTS test.consumer; @@ -1738,8 +1758,12 @@ def test_kafka_commit_on_block_write(kafka_cluster): def test_kafka_virtual_columns2(kafka_cluster): admin_client = KafkaAdminClient(bootstrap_servers="localhost:{}".format(kafka_cluster.kafka_port)) - kafka_create_topic(admin_client, "virt2_0", num_partitions=2) - kafka_create_topic(admin_client, "virt2_1", num_partitions=2) + topic_config = { + # default retention, since predefined timestamp_ms is used. + 'retention.ms': '-1', + } + kafka_create_topic(admin_client, "virt2_0", num_partitions=2, config=topic_config) + kafka_create_topic(admin_client, "virt2_1", num_partitions=2, config=topic_config) instance.query(''' CREATE TABLE test.kafka (value UInt64) @@ -1867,6 +1891,13 @@ def test_kafka_produce_key_timestamp(kafka_cluster): def test_kafka_insert_avro(kafka_cluster): + admin_client = KafkaAdminClient(bootstrap_servers="localhost:{}".format(kafka_cluster.kafka_port)) + topic_config = { + # default retention, since predefined timestamp_ms is used. 
+ 'retention.ms': '-1', + } + kafka_create_topic(admin_client, "avro1", config=topic_config) + instance.query(''' DROP TABLE IF EXISTS test.kafka; CREATE TABLE test.kafka (key UInt64, value UInt64, _timestamp DateTime('UTC')) diff --git a/tests/integration/test_storage_mysql/configs/named_collections.xml b/tests/integration/test_storage_mysql/configs/named_collections.xml index 4a97be7bd98..b4a79880d2a 100644 --- a/tests/integration/test_storage_mysql/configs/named_collections.xml +++ b/tests/integration/test_storage_mysql/configs/named_collections.xml @@ -21,5 +21,14 @@ clickhouse test_table
+        <mysql4>
+            <user>root</user>
+            <password>clickhouse</password>
+            <host>mysql57</host>
+            <port>3306</port>
+            <database>clickhouse</database>
+            <table>test_table</table>
+            <connection_pool_size>0</connection_pool_size>
+        </mysql4>
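# Illustrative aside (not part of the patch): the mysql4 collection above pins
# connection_pool_size to 0 so the test below can assert the "Connection pool
# cannot have zero size" error, and that a per-query argument overrides the
# collection default. A simplified sketch of that resolution order (not
# ClickHouse's actual code):
def resolve_mysql_settings(collection: dict, overrides: dict) -> dict:
    settings = {**collection, **overrides}          # per-query arguments win
    if settings.get('connection_pool_size', 16) == 0:
        raise ValueError('Connection pool cannot have zero size')
    return settings

mysql4 = {'host': 'mysql57', 'port': 3306, 'user': 'root', 'password': 'clickhouse',
          'database': 'clickhouse', 'table': 'test_table', 'connection_pool_size': 0}

resolve_mysql_settings(mysql4, {'connection_pool_size': 1})  # accepted
# resolve_mysql_settings(mysql4, {})                         # raises ValueError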
diff --git a/tests/integration/test_storage_mysql/test.py b/tests/integration/test_storage_mysql/test.py index c0ba0d8735e..713a8793f48 100644 --- a/tests/integration/test_storage_mysql/test.py +++ b/tests/integration/test_storage_mysql/test.py @@ -418,6 +418,10 @@ def test_predefined_connection_configuration(started_cluster): ''') assert (node1.query(f"SELECT count() FROM test_table").rstrip() == '100') + assert 'Connection pool cannot have zero size' in node1.query_and_get_error("SELECT count() FROM mysql(mysql1, table='test_table', connection_pool_size=0)") + assert 'Connection pool cannot have zero size' in node1.query_and_get_error("SELECT count() FROM mysql(mysql4)") + assert int(node1.query("SELECT count() FROM mysql(mysql4, connection_pool_size=1)")) == 100 + # Regression for (k, v) IN ((k, v)) def test_mysql_in(started_cluster): diff --git a/tests/integration/test_storage_rabbitmq/test.py b/tests/integration/test_storage_rabbitmq/test.py index 2c2a9e41509..a3d99159cb2 100644 --- a/tests/integration/test_storage_rabbitmq/test.py +++ b/tests/integration/test_storage_rabbitmq/test.py @@ -35,6 +35,17 @@ def rabbitmq_check_result(result, check=False, ref_file='test_rabbitmq_json.refe else: return TSV(result) == TSV(reference) +def wait_rabbitmq_to_start(rabbitmq_docker_id, timeout=180): + start = time.time() + while time.time() - start < timeout: + try: + if instance.cluster.check_rabbitmq_is_available(rabbitmq_docker_id): + logging.debug("RabbitMQ is available") + return + time.sleep(0.5) + except Exception as ex: + logging.debug("Can't connect to RabbitMQ " + str(ex)) + time.sleep(0.5) def kill_rabbitmq(rabbitmq_id): p = subprocess.Popen(('docker', 'stop', rabbitmq_id), stdout=subprocess.PIPE) @@ -45,7 +56,7 @@ def kill_rabbitmq(rabbitmq_id): def revive_rabbitmq(rabbitmq_id): p = subprocess.Popen(('docker', 'start', rabbitmq_id), stdout=subprocess.PIPE) p.communicate() - return p.returncode == 0 + wait_rabbitmq_to_start(rabbitmq_id) # Fixtures diff --git a/tests/integration/test_storage_s3/configs/named_collections.xml b/tests/integration/test_storage_s3/configs/named_collections.xml index dfcbeeb2d4a..efadedc1bde 100644 --- a/tests/integration/test_storage_s3/configs/named_collections.xml +++ b/tests/integration/test_storage_s3/configs/named_collections.xml @@ -15,5 +15,10 @@ minio minio123 + + http://minio1:9001/root/test_native + minio + minio123 + diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index f3c4b1dd0cf..885a37f875c 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -126,7 +126,7 @@ def run_query(instance, query, stdin=None, settings=None): pytest.param("'wrongid','wrongkey',", False, 'xz', id="xz"), pytest.param("'wrongid','wrongkey',", False, 'zstd', id="zstd") ]) -def test_put(started_cluster, maybe_auth, positive, compression): +def _test_put(started_cluster, maybe_auth, positive, compression): # type: (ClickHouseCluster) -> None bucket = started_cluster.minio_bucket if not maybe_auth else started_cluster.minio_restricted_bucket @@ -148,7 +148,7 @@ def test_put(started_cluster, maybe_auth, positive, compression): assert values_csv == get_s3_file_content(started_cluster, bucket, filename) -def test_partition_by(started_cluster): +def _test_partition_by(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" @@ -173,7 +173,7 @@ 
def test_partition_by(started_cluster): assert "78,43,45\n" == get_s3_file_content(started_cluster, bucket, "test2_45.csv") -def test_partition_by_string_column(started_cluster): +def _test_partition_by_string_column(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance table_format = "col_num UInt32, col_str String" @@ -191,7 +191,7 @@ def test_partition_by_string_column(started_cluster): assert '78,"你好"\n' == get_s3_file_content(started_cluster, bucket, "test_你好.csv") -def test_partition_by_const_column(started_cluster): +def _test_partition_by_const_column(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" @@ -212,7 +212,7 @@ def test_partition_by_const_column(started_cluster): "space", "plus" ]) -def test_get_file_with_special(started_cluster, special): +def _test_get_file_with_special(started_cluster, special): symbol = {"space": " ", "plus": "+"}[special] urlsafe_symbol = {"space": "%20", "plus": "%2B"}[special] auth = "'minio','minio123'," @@ -239,7 +239,7 @@ def test_get_file_with_special(started_cluster, special): "plus", "plus2" ]) -def test_get_path_with_special(started_cluster, special): +def _test_get_path_with_special(started_cluster, special): symbol = {"space": "%20", "plus": "%2B", "plus2": "%2B"}[special] safe_symbol = {"space": "%20", "plus": "+", "plus2": "%2B"}[special] auth = "'minio','minio123'," @@ -253,7 +253,7 @@ def test_get_path_with_special(started_cluster, special): @pytest.mark.parametrize("auth", [ pytest.param("'minio','minio123',", id="minio") ]) -def test_empty_put(started_cluster, auth): +def _test_empty_put(started_cluster, auth): # type: (ClickHouseCluster, str) -> None bucket = started_cluster.minio_bucket @@ -291,7 +291,7 @@ def test_empty_put(started_cluster, auth): pytest.param("'minio','minio123',", True, id="auth_positive"), pytest.param("'wrongid','wrongkey',", False, id="negative"), ]) -def test_put_csv(started_cluster, maybe_auth, positive): +def _test_put_csv(started_cluster, maybe_auth, positive): # type: (ClickHouseCluster, bool, str) -> None bucket = started_cluster.minio_bucket if not maybe_auth else started_cluster.minio_restricted_bucket @@ -313,7 +313,7 @@ def test_put_csv(started_cluster, maybe_auth, positive): # Test put and get with S3 server redirect. -def test_put_get_with_redirect(started_cluster): +def _test_put_get_with_redirect(started_cluster): # type: (ClickHouseCluster) -> None bucket = started_cluster.minio_bucket @@ -340,7 +340,7 @@ def test_put_get_with_redirect(started_cluster): # Test put with restricted S3 server redirect. -def test_put_with_zero_redirect(started_cluster): +def _test_put_with_zero_redirect(started_cluster): # type: (ClickHouseCluster) -> None bucket = started_cluster.minio_bucket @@ -367,7 +367,7 @@ def test_put_with_zero_redirect(started_cluster): assert exception_raised -def test_put_get_with_globs(started_cluster): +def _test_put_get_with_globs(started_cluster): # type: (ClickHouseCluster) -> None unique_prefix = random.randint(1,10000) bucket = started_cluster.minio_bucket @@ -399,7 +399,7 @@ def test_put_get_with_globs(started_cluster): pytest.param("'wrongid','wrongkey'", False, id="negative"), # ("'minio','minio123',",True), Redirect with credentials not working with nginx. 
]) -def test_multipart_put(started_cluster, maybe_auth, positive): +def _test_multipart_put(started_cluster, maybe_auth, positive): # type: (ClickHouseCluster) -> None bucket = started_cluster.minio_bucket if not maybe_auth else started_cluster.minio_restricted_bucket @@ -439,7 +439,7 @@ def test_multipart_put(started_cluster, maybe_auth, positive): assert csv_data == get_s3_file_content(started_cluster, bucket, filename) -def test_remote_host_filter(started_cluster): +def _test_remote_host_filter(started_cluster): instance = started_cluster.instances["restricted_dummy"] format = "column1 UInt32, column2 UInt32, column3 UInt32" @@ -457,7 +457,7 @@ def test_remote_host_filter(started_cluster): pytest.param("''", id="1_argument"), pytest.param("'','','','','',''", id="6_arguments"), ]) -def test_wrong_s3_syntax(started_cluster, s3_storage_args): +def _test_wrong_s3_syntax(started_cluster, s3_storage_args): instance = started_cluster.instances["dummy"] # type: ClickHouseInstance expected_err_msg = "Code: 42" # NUMBER_OF_ARGUMENTS_DOESNT_MATCH @@ -466,7 +466,7 @@ def test_wrong_s3_syntax(started_cluster, s3_storage_args): # https://en.wikipedia.org/wiki/One_Thousand_and_One_Nights -def test_s3_glob_scheherazade(started_cluster): +def _test_s3_glob_scheherazade(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" @@ -535,7 +535,7 @@ def replace_config(old, new): config.close() -def test_custom_auth_headers(started_cluster): +def _test_custom_auth_headers(started_cluster): table_format = "column1 UInt32, column2 UInt32, column3 UInt32" filename = "test.csv" get_query = "select * from s3('http://resolver:8080/{bucket}/{file}', 'CSV', '{table_format}')".format( @@ -566,7 +566,7 @@ def test_custom_auth_headers(started_cluster): instance.query("DROP TABLE test") -def test_custom_auth_headers_exclusion(started_cluster): +def _test_custom_auth_headers_exclusion(started_cluster): table_format = "column1 UInt32, column2 UInt32, column3 UInt32" filename = "test.csv" get_query = f"SELECT * FROM s3('http://resolver:8080/{started_cluster.minio_restricted_bucket}/restricteddirectory/{filename}', 'CSV', '{table_format}')" @@ -580,7 +580,7 @@ def test_custom_auth_headers_exclusion(started_cluster): assert 'Forbidden Error' in ei.value.stderr -def test_infinite_redirect(started_cluster): +def _test_infinite_redirect(started_cluster): bucket = "redirected" table_format = "column1 UInt32, column2 UInt32, column3 UInt32" filename = "test.csv" @@ -598,7 +598,7 @@ def test_infinite_redirect(started_cluster): pytest.param("bin", "gzip", id="bin"), pytest.param("gz", "auto", id="gz"), ]) -def test_storage_s3_get_gzip(started_cluster, extension, method): +def _test_storage_s3_get_gzip(started_cluster, extension, method): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] filename = f"test_get_gzip.{extension}" @@ -638,7 +638,7 @@ def test_storage_s3_get_gzip(started_cluster, extension, method): run_query(instance, f"DROP TABLE {name}") -def test_storage_s3_get_unstable(started_cluster): +def _test_storage_s3_get_unstable(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] table_format = "column1 Int64, column2 Int64, column3 Int64, column4 Int64" @@ -647,7 +647,7 @@ def test_storage_s3_get_unstable(started_cluster): assert result.splitlines() == ["500001,500000,0"] -def 
test_storage_s3_put_uncompressed(started_cluster): +def _test_storage_s3_put_uncompressed(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] filename = "test_put_uncompressed.bin" @@ -684,7 +684,7 @@ def test_storage_s3_put_uncompressed(started_cluster): pytest.param("bin", "gzip", id="bin"), pytest.param("gz", "auto", id="gz") ]) -def test_storage_s3_put_gzip(started_cluster, extension, method): +def _test_storage_s3_put_gzip(started_cluster, extension, method): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] filename = f"test_put_gzip.{extension}" @@ -721,7 +721,7 @@ def test_storage_s3_put_gzip(started_cluster, extension, method): assert sum([ int(i.split(',')[1]) for i in uncompressed_content.splitlines() ]) == 708 -def test_truncate_table(started_cluster): +def _test_truncate_table(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance name = "truncate" @@ -745,7 +745,7 @@ def test_truncate_table(started_cluster): assert instance.query("SELECT * FROM {}".format(name)) == "" -def test_predefined_connection_configuration(started_cluster): +def _test_predefined_connection_configuration(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance name = "test_table" @@ -762,7 +762,7 @@ def test_predefined_connection_configuration(started_cluster): result = "" -def test_url_reconnect_in_the_middle(started_cluster): +def _test_url_reconnect_in_the_middle(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] table_format = "id String, data String" @@ -783,7 +783,7 @@ def test_url_reconnect_in_the_middle(started_cluster): f"""select sum(cityHash64(x)) from (select toUInt64(id) + sleep(0.1) as x from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'TSV', '{table_format}') settings http_max_tries = 10, http_retry_max_backoff_ms=2000, http_send_timeout=1, http_receive_timeout=1)""") - assert(int(result), 3914219105369203805) + assert(int(result) == 3914219105369203805) thread = threading.Thread(target=select) thread.start() @@ -796,10 +796,10 @@ def test_url_reconnect_in_the_middle(started_cluster): thread.join() - assert(int(result), 3914219105369203805) + assert(int(result) == 3914219105369203805) -def test_seekable_formats(started_cluster): +def _test_seekable_formats(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance @@ -821,7 +821,7 @@ def test_seekable_formats(started_cluster): assert(int(result[:3]) < 200) -def test_seekable_formats_url(started_cluster): +def _test_seekable_formats_url(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] @@ -865,3 +865,53 @@ def test_insert_with_path_with_globs(started_cluster): table_function_3 = f"s3('http://minio1:9001/root/test_parquet*', 'minio', 'minio123', 'Parquet', 'a Int32, b String')" instance.query_and_get_error(f"insert into table function {table_function_3} SELECT number, randomString(100) FROM numbers(500)") + + +def test_s3_schema_inference(started_cluster): + bucket = started_cluster.minio_bucket + instance = started_cluster.instances["dummy"] + + instance.query(f"insert into table function s3(s3_native, structure='a Int32, b String', format='Native') select number, randomString(100) from 
numbers(5000000)") + result = instance.query(f"desc s3(s3_native, format='Native')") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = instance.query(f"select count(*) from s3(s3_native, format='Native')") + assert(int(result) == 5000000) + + instance.query(f"create table schema_inference engine=S3(s3_native, format='Native')") + result = instance.query(f"desc schema_inference") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = instance.query(f"select count(*) from schema_inference") + assert(int(result) == 5000000) + + + table_function = f"url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_native', 'Native')" + result = instance.query(f"desc {table_function}") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = instance.query(f"select count(*) from {table_function}") + assert(int(result) == 5000000) + + instance.query(f"create table schema_inference_2 engine=URL('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_native', 'Native')") + result = instance.query(f"desc schema_inference_2") + assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n" + + result = instance.query(f"select count(*) from schema_inference_2") + assert(int(result) == 5000000) + + +def test_empty_file(started_cluster): + bucket = started_cluster.minio_bucket + instance = started_cluster.instances["dummy"] + + name = "empty" + url = f'http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{name}' + + minio = started_cluster.minio_client + minio.put_object(bucket, name, io.BytesIO(b""), 0) + + table_function = f"s3('{url}', 'CSV', 'id Int32')" + result = instance.query(f"SELECT count() FROM {table_function}") + assert(int(result) == 0) + diff --git a/tests/integration/test_system_logs_recreate/test.py b/tests/integration/test_system_logs_recreate/test.py index 3ab0269b42e..c0afa8cd555 100644 --- a/tests/integration/test_system_logs_recreate/test.py +++ b/tests/integration/test_system_logs_recreate/test.py @@ -68,3 +68,26 @@ def test_system_logs_recreate(): # IOW that the table created only when the structure is indeed different. 
for table in system_logs: assert len(node.query(f"SHOW TABLES FROM system LIKE '{table}%'").strip().split('\n')) == 3 + + +def test_drop_system_log(): + node.exec_in_container(['bash', '-c', f"""echo " + + + 1000000 + + + " > /etc/clickhouse-server/config.d/yyy-override-query_log.xml + """]) + node.restart_clickhouse() + node.query("select 1") + node.query("system flush logs") + node.query("select 2") + node.query("system flush logs") + assert node.query("select count() > 0 from system.query_log") == "1\n" + node.query("drop table system.query_log sync") + node.query("select 3") + node.query("system flush logs") + assert node.query("select count() > 0 from system.query_log") == "1\n" + node.exec_in_container(['rm', f'/etc/clickhouse-server/config.d/yyy-override-query_log.xml']) + node.restart_clickhouse() diff --git a/tests/integration/test_table_functions_access_rights/test.py b/tests/integration/test_table_functions_access_rights/test.py index 16f18407960..90106303315 100644 --- a/tests/integration/test_table_functions_access_rights/test.py +++ b/tests/integration/test_table_functions_access_rights/test.py @@ -39,7 +39,7 @@ def test_merge(): instance.query("GRANT CREATE TEMPORARY TABLE ON *.* TO A") assert "no tables in database matches" in instance.query_and_get_error(select_query, user = 'A') - + instance.query("GRANT SELECT ON default.table1 TO A") assert instance.query(select_query, user = 'A') == "1\n" diff --git a/tests/performance/set_index.xml b/tests/performance/set_index.xml index 1fb7cf967f3..631cad9986e 100644 --- a/tests/performance/set_index.xml +++ b/tests/performance/set_index.xml @@ -3,17 +3,17 @@ INSERT INTO test_in SELECT number FROM numbers(500000000) - SELECT count() FROM test_in WHERE a IN (SELECT rand(1) FROM numbers(200000)) SETTINGS max_rows_to_read = 1, read_overflow_mode = 'break' + SELECT count() FROM test_in WHERE a IN (SELECT rand(1) FROM numbers(200000)) SETTINGS max_rows_to_read = 200001, read_overflow_mode = 'break' - SELECT count() FROM test_in WHERE toInt64(a) IN (SELECT toInt64(rand(1)) FROM numbers(200000)) settings max_rows_to_read=1, read_overflow_mode='break' + SELECT count() FROM test_in WHERE toInt64(a) IN (SELECT toInt64(rand(1)) FROM numbers(200000)) settings max_rows_to_read=200001, read_overflow_mode='break' - SELECT count() FROM test_in WHERE -toInt64(a) IN (SELECT toInt64(rand(1)) FROM numbers(200000)) settings max_rows_to_read=1, read_overflow_mode='break' + SELECT count() FROM test_in WHERE -toInt64(a) IN (SELECT toInt64(rand(1)) FROM numbers(200000)) settings max_rows_to_read=200001, read_overflow_mode='break' - SELECT count() FROM test_in WHERE -toInt64(a) NOT IN (SELECT toInt64(rand(1)) FROM numbers(200000)) settings max_rows_to_read=1, read_overflow_mode='break' + SELECT count() FROM test_in WHERE -toInt64(a) NOT IN (SELECT toInt64(rand(1)) FROM numbers(200000)) settings max_rows_to_read=200001, read_overflow_mode='break' SELECT (rand(), rand()) IN ((17258, 93148), (4508, 52749), (68660, 70017), (77797, 23528), (1136, 37393), (53237, 15379), (68370, 73211), (15782, 54962), (59432, 45415), (68396, 920), (96154, 21016), (12700, 26887), (88016, 43191), (68153, 51575), (91315, 40005), (18070, 73178), (86, 631), (77717, 20324), (3227, 76188), (74960, 43147), (77538, 19628), (82292, 6525), (24293, 12566), (85244, 96287), (93982, 1329), (38064, 54723), (83999, 45810), (71921, 53673), (88638, 9669), (1959, 39535), (82235, 95796), (27907, 90975), (42383, 91015), (9948, 91514), (81712, 47309), (400, 25808), (31791, 46948), (39740, 36098), 
(25943, 84598), (99598, 52939), (77134, 15845), (40313, 72174), (85017, 94036), (36595, 14303), (83961, 68078), (55792, 72759), (73574, 43606), (9853, 63560), (28580, 56721), (74804, 41025), (32095, 55657), (52881, 63416), (91368, 90310), (23922, 38883), (30592, 10758), (66448, 61183), (31880, 96697), (11362, 20633), (75331, 2015), (71129, 8785), (1115, 70955), (7886, 83698), (18961, 84556), (16677, 43028), (37347, 70220), (31699, 71244), (10578, 96159), (67600, 39041), (78791, 86687), (21545, 54174), (68774, 37637), (46132, 81768), (98413, 20605), (2960, 23665), (31507, 35719), (96209, 18368), (60558, 38035), (21952, 3264), (11834, 86458), (21651, 17650), (86276, 36087), (18818, 24849), (61951, 3390), (59637, 62545), (30346, 72253), (36281, 2992), (78340, 49872), (94326, 93723), (3416, 94405), (12272, 8741), (22600, 22095), (57636, 37106), (38702, 14889), (70238, 11276), (17325, 60648), (16492, 41271), (52100, 1304), (93416, 7795), (57209, 71008), (48010, 36078), (20384, 74420), (77440, 34439), (69224, 45099), (30374, 33884), (49038, 90140), (1154, 84725), (64926, 86985), (91746, 73472), (59757, 75755), (45860, 71557), (45833, 36526), (74618, 73598), (91360, 65168), (58029, 30793), (56332, 14973), (99943, 96877), (97454, 6450), (64502, 77301), (73182, 31853), (76809, 83964), (82916, 86188), (78736, 65427), (36495, 7422), (76196, 2804), (96117, 61093), (9177, 26099), (52942, 63007), (48578, 47876), (50638, 89903), (7113, 97316), (35301, 12750), (47807, 7254), (38217, 55418), (56970, 41687), (20527, 62886), (358, 14021), (64018, 18582), (91740, 21683), (81967, 53589), (45437, 38450), (45476, 67752), (76851, 72072), (7304, 60091), (40097, 12897), (39906, 29247), (84262, 58734), (30857, 43791), (56087, 78929), (20498, 45954), (48726, 500), (62723, 43763), (28368, 30756), (74048, 52403), (15045, 95926), (75542, 55384), (52543, 22525), (56001, 6935), (11431, 46745), (77731, 7310), (36718, 59909), (32235, 91254), (92417, 25917), (21782, 79277), (46378, 87536), (35324, 26075), (6310, 76915), (1551, 69473), (50642, 68865), (55190, 72934), (49780, 21873), (99466, 29686), (90761, 13179), (72959, 57033), (20020, 90200), (46186, 79105), (73871, 52382), (59559, 38801), (59916, 16082), (33610, 94966), (46001, 45225), (86679, 26469), (77245, 91929), (32887, 36623), (11179, 46898), (87881, 68087), (45438, 47991), (24950, 94525), (91664, 51656), (43914, 47805), (15736, 96156), (56346, 20283), (85053, 48931), (17790, 26179), (96195, 55728), (43765, 54807), (44988, 89269), (55911, 99411), (52446, 47397), (28346, 65442), (96669, 68226), (66194, 26848), (37276, 55864), (14116, 41583), (18058, 16317), (93136, 85318), (35616, 86252), (29222, 29969), (33386, 85372), (71094, 44238), (27733, 31838), (64626, 16692), (52904, 97899), (97619, 12663), (50165, 4688), (67557, 44053), (69184, 66269), (73164, 89705), (39822, 15169), (65499, 72808), (30068, 63697), (30154, 64235), (97016, 58716), (94366, 36592), (1592, 16261), (87985, 52102), (12554, 23652), (15909, 25292), (2527, 91531), (92139, 36031), (28986, 30032), (3038, 56314), (32239, 26707), (15973, 34901), (70246, 39680), (82529, 38132), (45827, 74783), (53665, 64111), (55218, 84170), (20466, 16130), (55734, 71203), (31438, 96906), (66338, 85858), (35988, 68511), (78391, 15191), (80747, 59213), (5357, 11546), (16822, 16607), (36607, 41106), (74949, 30739), (45726, 64887), (1524, 54847), (37371, 89195), (28726, 27788), (22600, 44777), (53999, 63625), (84304, 98338), (49260, 76480), (74564, 53907), (89867, 97096), (60157, 61299), (17165, 10146), (56334, 36268), 
(62114, 49222), (22715, 23620), (42830, 11539), (41091, 69151), (75471, 68364), (18681, 43249), (42738, 63219), (35474, 98454), (76815, 46024), (66310, 36521), (86095, 77013), (63693, 77319), (80731, 63031), (95478, 92387), (23787, 63724), (46299, 68994), (4800, 2460), (9663, 80639), (77231, 85814), (81615, 11311), (35638, 27340), (13598, 14322), (30657, 17238), (90957, 96846), (69962, 52140), (41681, 65962), (96836, 58177), (36190, 11623), (4231, 40500), (43049, 41949), (71177, 98492), (30193, 39750), (19744, 33204), (63358, 30210), (45638, 58918), (43641, 38741), (35598, 40932), (33238, 36236), (50835, 20968), (25099, 34071), (84986, 88456), (35333, 1529), (79771, 23985), (647, 61658), (9424, 11743), (77766, 31528), (77811, 86973), (76403, 74377), (55568, 79251), (68858, 20762), (68520, 66773), (93598, 89823), (8080, 82539), (87760, 52247), (25191, 16905), (17837, 8339), (85177, 59050), (51680, 77374), (3287, 43018), (43479, 62141), (34909, 46322), (11869, 5885), (96193, 58417), (101, 47460), (34937, 88582), (83216, 88388), (28571, 15292), (66683, 62613), (34478, 8924), (2680, 89973), (62438, 44460), (11724, 4791), (5383, 72888), (88206, 67586), (8124, 21690), (28779, 75789), (66791, 4757), (6176, 47760), (6403, 78084), (78122, 35446), (99494, 73608), (39691, 89098), (59182, 19484), (25389, 98963), (96487, 3692), (76222, 67381), (21199, 50358), (95998, 58137), (28777, 43913), (14176, 60117), (52257, 81703), (14604, 13438), (71301, 14401), (19758, 66914), (15506, 29873), (87205, 29449), (93295, 15930), (63651, 11287), (19785, 15966), (30795, 75112), (69462, 37655), (18793, 85764), (36240, 31236), (98153, 73724), (72491, 4223), (66930, 35048), (25686, 13269), (13940, 13259), (69163, 11235), (1183, 86961), (54323, 67315), (85044, 60872), (48875, 3683), (43052, 92861), (87574, 32969), (92552, 80564), (94832, 47682), (72011, 80994), (60182, 917), (97788, 34169), (66432, 47940), (87468, 80954), (35385, 68758), (50555, 63710), (55311, 44337), (87065, 26514), (84581, 98736), (23212, 56499), (75120, 72447), (56087, 38285), (58171, 45629), (28401, 44319), (70432, 27883), (18891, 14646), (26206, 49924), (79957, 44914), (56064, 27529), (99090, 29197), (49435, 340), (53525, 65601), (76998, 88349), (50416, 70860), (42506, 75290), (34024, 13295), (86663, 46523), (88814, 231), (57809, 21), (84914, 84771), (43042, 66892), (17288, 33908), (4934, 63195), (50590, 1516), (97843, 80208), (20091, 86717), (71566, 15929), (19531, 23634), (41646, 45549), (89226, 82902), (96683, 63386), (31072, 53788), (51135, 41099), (78912, 65609), (36094, 23603), (88403, 51455), (73795, 47066), (26448, 82852), (22829, 2894), (30041, 92548), (27733, 20608), (70180, 19892), (51650, 63440), (76328, 13666), (40514, 6677), (2786, 51059), (40809, 16499), (10857, 82541), (78221, 61067), (17982, 51969), (85369, 66965), (47153, 47149), (43965, 75796), (82725, 60767), (42407, 97249), (51475, 81224), (60957, 89414), (33065, 21663), (36601, 5290), (95842, 67301), (64630, 60398), (55212, 35638), (41750, 44235), (75260, 82400), (91291, 25843), (6477, 8311), (14919, 52306), (66220, 33180), (45736, 2313), (37450, 64444), (98614, 61344), (75007, 50946), (56701, 28117), (66632, 5174), (92323, 76613), (6796, 73695), (33696, 76280), (86876, 5614), (50863, 67993), (36068, 17049), (91912, 34271), (70706, 1904), (97798, 41117), (68154, 72483), (83862, 25578), (61643, 17204), (69974, 64232), (77926, 19637), (64901, 88988), (71424, 91703), (91655, 17147), (46872, 56530), (44189, 98087), (95939, 54420), (72651, 68785), (67624, 84875), (92587, 87663), 
(65275, 81256), (53798, 2506), (14702, 3638), (71291, 50452), (14909, 13903), (66965, 26606), (14127, 60345), (35306, 1738), (77234, 10468), (53521, 41218), (80681, 82583), (44227, 26521), (32263, 21482), (82270, 56963), (50580, 80567), (11593, 22346), (20074, 26867), (73126, 28667), (62996, 24317), (20295, 57163), (1506, 57668), (69567, 45236), (43366, 26001), (88052, 40181), (1599, 89349), (36789, 1579), (39895, 46673), (30381, 3206), (31723, 5625), (19252, 31317), (16932, 77149), (48794, 34409), (55986, 30328), (47551, 75088), (57363, 78365), (95221, 63385), (26449, 5733), (96588, 53077), (52980, 41140), (8187, 85947), (36723, 26520), (23579, 38909), (33350, 19275), (63930, 19357), (43536, 59941), (31117, 77322), (44638, 94812), (44730, 99097), (95108, 48170), (57813, 49503), (79959, 89436), (86980, 62031), (8275, 44009), (36666, 94645), (22064, 38882), (40471, 16939), (31156, 11337), (13101, 96977), (17906, 26835), (89861, 51405), (73369, 67946), (99141, 58572), (27131, 98703), (15900, 43412), (51768, 93125), (78579, 46689), (23029, 13895), (60870, 55830), (22553, 8236), (76449, 96207), (83766, 51024), (27630, 50614), (53484, 90104), (77626, 21944), (46755, 41583), (53616, 34240), (94159, 44415), (13914, 90059), (44387, 89012), (27499, 64579), (83415, 30809), (77558, 82619), (88880, 9814), (8466, 4424), (43598, 91921), (24695, 3349), (46295, 65208), (51256, 82461), (49126, 93012), (16186, 96585), (43284, 22655), (93130, 90393), (77495, 34372), (85509, 65856), (86662, 61906), (50988, 44393), (29828, 17737), (91651, 35308), (29796, 49716), (14019, 87751), (29688, 71207), (82845, 19100), (11989, 50132), (21158, 99905), (54732, 42547), (32314, 12851), (46405, 43794), (87849, 45643), (53524, 21212), (61925, 75491), (12498, 21937), (30185, 69475), (48421, 52487), (15112, 90935), (33187, 17801), (61704, 25514), (17889, 23917), (18758, 57197), (7693, 47232), (47905, 24618), (11494, 78950), (95662, 54561), (8075, 33909), (90427, 46065), (73962, 19821), (50691, 79400), (58218, 4881), (94106, 2509), (60633, 55169), (49600, 83054), (23339, 13270), (70262, 58946), (48417, 97266), (27629, 46905), (74465, 75514), (41687, 2564), (12814, 19492), (78899, 30168), (17745, 35206), (37972, 35296), (22288, 80001), diff --git a/tests/queries/0_stateless/00398_url_functions.reference b/tests/queries/0_stateless/00398_url_functions.reference index 9cd18350d78..e5f89fbea78 100644 --- a/tests/queries/0_stateless/00398_url_functions.reference +++ b/tests/queries/0_stateless/00398_url_functions.reference @@ -45,6 +45,7 @@ com /?query=hello world+foo+bar /?query=hello world+foo+bar /?query=hello world+foo+bar +/?query=hello world foo+bar /a/b/c /a/b/c @@ -57,6 +58,7 @@ query=hello world+foo+bar query=hello world+foo+bar query=hello world+foo+bar query=hello world+foo+bar +query=hello world foo+bar ====FRAGMENT==== @@ -71,6 +73,7 @@ query=hello world+foo+bar#a=b query=hello world+foo+bar#a=b query=hello world+foo+bar#a=b #a=b +query=hello world foo+bar#a=b ====CUT TO FIRST SIGNIFICANT SUBDOMAIN==== example.com example.com diff --git a/tests/queries/0_stateless/00398_url_functions.sql b/tests/queries/0_stateless/00398_url_functions.sql index af03a6d487a..ea71ed226d7 100644 --- a/tests/queries/0_stateless/00398_url_functions.sql +++ b/tests/queries/0_stateless/00398_url_functions.sql @@ -49,6 +49,7 @@ SELECT decodeURLComponent(pathFull('//127.0.0.1/?query=hello%20world+foo%2Bbar') SELECT decodeURLComponent(pathFull('http://127.0.0.1/?query=hello%20world+foo%2Bbar')) AS Path; SELECT 
decodeURLComponent(materialize(pathFull('http://127.0.0.1/?query=hello%20world+foo%2Bbar'))) AS Path; SELECT decodeURLComponent(materialize(pathFull('//127.0.0.1/?query=hello%20world+foo%2Bbar'))) AS Path; +SELECT decodeURLFormComponent(materialize(pathFull('//127.0.0.1/?query=hello%20world+foo%2Bbar'))) AS Path; SELECT path('http://127.0.0.1') AS Path; SELECT path('http://127.0.0.1/a/b/c') AS Path; SELECT path('http://127.0.0.1:443/a/b/c') AS Path; @@ -62,6 +63,7 @@ SELECT decodeURLComponent(queryString('http://127.0.0.1/?query=hello%20world+foo SELECT decodeURLComponent(queryString('http://127.0.0.1:443/?query=hello%20world+foo%2Bbar')); SELECT decodeURLComponent(queryString('http://paul@127.0.0.1:443/?query=hello%20world+foo%2Bbar')); SELECT decodeURLComponent(queryString('//paul@127.0.0.1:443/?query=hello%20world+foo%2Bbar')); +SELECT decodeURLFormComponent(queryString('//paul@127.0.0.1:443/?query=hello%20world+foo%2Bbar')); SELECT '====FRAGMENT===='; SELECT decodeURLComponent(fragment('http://127.0.0.1/?query=hello%20world+foo%2Bbar')); @@ -78,6 +80,7 @@ SELECT decodeURLComponent(queryStringAndFragment('http://127.0.0.1/?query=hello% SELECT decodeURLComponent(queryStringAndFragment('http://paul@127.0.0.1/?query=hello%20world+foo%2Bbar#a=b')); SELECT decodeURLComponent(queryStringAndFragment('//paul@127.0.0.1/?query=hello%20world+foo%2Bbar#a=b')); SELECT decodeURLComponent(queryStringAndFragment('//paul@127.0.0.1/#a=b')); +SELECT decodeURLFormComponent(queryStringAndFragment('//paul@127.0.0.1/?query=hello%20world+foo%2Bbar#a=b')); SELECT '====CUT TO FIRST SIGNIFICANT SUBDOMAIN===='; SELECT cutToFirstSignificantSubdomain('http://www.example.com'); diff --git a/tests/queries/0_stateless/00646_url_engine.python b/tests/queries/0_stateless/00646_url_engine.python index 85ae3e776ed..4f47e819328 100644 --- a/tests/queries/0_stateless/00646_url_engine.python +++ b/tests/queries/0_stateless/00646_url_engine.python @@ -156,6 +156,7 @@ def test_select(table_name="", schema="str String,numuint UInt32,numint Int32,do if table_name: get_ch_answer("drop table if exists {}".format(table_name)) + def test_insert(table_name="", schema="str String,numuint UInt32,numint Int32,double Float64", requests_insert=[], requests_select=[], answers=[]): with open(CSV_DATA, 'w') as f: # flush test file f.write('') diff --git a/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.sh b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.sh index 2731e4bcce3..8d9e2689e26 100755 --- a/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.sh +++ b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.sh @@ -25,15 +25,15 @@ SELECT * FROM enum_mapping_protobuf_00825; EOF BINARY_FILE_PATH=$(mktemp "$CURDIR/00825_protobuf_format_enum_mapping.XXXXXX.binary") -$CLICKHOUSE_CLIENT --query "SELECT * FROM enum_mapping_protobuf_00825 FORMAT Protobuf SETTINGS format_schema = '$SCHEMADIR/00825_protobuf_format_enum_mapping:Message'" > "$BINARY_FILE_PATH" +$CLICKHOUSE_CLIENT --query "SELECT * FROM enum_mapping_protobuf_00825 FORMAT Protobuf SETTINGS format_schema = '$SCHEMADIR/00825_protobuf_format_enum_mapping:EnumMessage'" > "$BINARY_FILE_PATH" # Check the output in the protobuf format echo -$CURDIR/helpers/protobuf_length_delimited_encoder.py --decode_and_check --format_schema "$SCHEMADIR/00825_protobuf_format_enum_mapping:Message" --input "$BINARY_FILE_PATH" +$CURDIR/helpers/protobuf_length_delimited_encoder.py --decode_and_check --format_schema 
"$SCHEMADIR/00825_protobuf_format_enum_mapping:EnumMessage" --input "$BINARY_FILE_PATH" # Check the input in the protobuf format (now the table contains the same data twice). echo -$CLICKHOUSE_CLIENT --query "INSERT INTO enum_mapping_protobuf_00825 FORMAT Protobuf SETTINGS format_schema='$SCHEMADIR/00825_protobuf_format_enum_mapping:Message'" < "$BINARY_FILE_PATH" +$CLICKHOUSE_CLIENT --query "INSERT INTO enum_mapping_protobuf_00825 FORMAT Protobuf SETTINGS format_schema='$SCHEMADIR/00825_protobuf_format_enum_mapping:EnumMessage'" < "$BINARY_FILE_PATH" $CLICKHOUSE_CLIENT --query "SELECT * FROM enum_mapping_protobuf_00825" rm "$BINARY_FILE_PATH" diff --git a/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.yaodzJ.binary b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.yaodzJ.binary new file mode 100644 index 00000000000..4b7b97a300f Binary files /dev/null and b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.yaodzJ.binary differ diff --git a/tests/queries/0_stateless/00907_set_index_with_nullable_and_low_cardinality_bug.sql b/tests/queries/0_stateless/00907_set_index_with_nullable_and_low_cardinality_bug.sql index 3a55a69c726..336d9984e69 100644 --- a/tests/queries/0_stateless/00907_set_index_with_nullable_and_low_cardinality_bug.sql +++ b/tests/queries/0_stateless/00907_set_index_with_nullable_and_low_cardinality_bug.sql @@ -8,7 +8,7 @@ CREATE TABLE null_lc_set_index ( INDEX test_user_idx (user) TYPE set(0) GRANULARITY 8192 ) ENGINE=MergeTree PARTITION BY toYYYYMMDD(timestamp) - ORDER BY (timestamp, action, cityHash64(user)); + ORDER BY (timestamp, action, cityHash64(user)) SETTINGS allow_nullable_key = 1; INSERT INTO null_lc_set_index VALUES (1550883010, 'subscribe', 'alice'); INSERT INTO null_lc_set_index VALUES (1550883020, 'follow', 'bob'); diff --git a/tests/queries/0_stateless/00945_bloom_filter_index.sql b/tests/queries/0_stateless/00945_bloom_filter_index.sql index f45c4c04290..497b0cdb641 100644 --- a/tests/queries/0_stateless/00945_bloom_filter_index.sql +++ b/tests/queries/0_stateless/00945_bloom_filter_index.sql @@ -14,10 +14,10 @@ SELECT COUNT() FROM single_column_bloom_filter WHERE i32 IN (1, 2) SETTINGS max_ SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i32) IN ((1, 2), (2, 3)) SETTINGS max_rows_to_read = 6; SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i64) IN ((1, 1), (2, 2)) SETTINGS max_rows_to_read = 6; SELECT COUNT() FROM single_column_bloom_filter WHERE (i64, (i64, i32)) IN ((1, (1, 1)), (2, (2, 2))) SETTINGS max_rows_to_read = 6; -SELECT COUNT() FROM single_column_bloom_filter WHERE i32 IN (SELECT arrayJoin([toInt32(1), toInt32(2)])) SETTINGS max_rows_to_read = 6; -SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i32) IN (SELECT arrayJoin([(toInt32(1), toInt32(2)), (toInt32(2), toInt32(3))])) SETTINGS max_rows_to_read = 6; -SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i64) IN (SELECT arrayJoin([(toInt32(1), toUInt64(1)), (toInt32(2), toUInt64(2))])) SETTINGS max_rows_to_read = 6; -SELECT COUNT() FROM single_column_bloom_filter WHERE (i64, (i64, i32)) IN (SELECT arrayJoin([(toUInt64(1), (toUInt64(1), toInt32(1))), (toUInt64(2), (toUInt64(2), toInt32(2)))])) SETTINGS max_rows_to_read = 6; +SELECT COUNT() FROM single_column_bloom_filter WHERE i32 IN (SELECT arrayJoin([toInt32(1), toInt32(2)])) SETTINGS max_rows_to_read = 7; +SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i32) IN (SELECT arrayJoin([(toInt32(1), toInt32(2)), (toInt32(2), toInt32(3))])) SETTINGS 
max_rows_to_read = 7; +SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i64) IN (SELECT arrayJoin([(toInt32(1), toUInt64(1)), (toInt32(2), toUInt64(2))])) SETTINGS max_rows_to_read = 7; +SELECT COUNT() FROM single_column_bloom_filter WHERE (i64, (i64, i32)) IN (SELECT arrayJoin([(toUInt64(1), (toUInt64(1), toInt32(1))), (toUInt64(2), (toUInt64(2), toInt32(2)))])) SETTINGS max_rows_to_read = 7; WITH (1, 2) AS liter_prepared_set SELECT COUNT() FROM single_column_bloom_filter WHERE i32 IN liter_prepared_set SETTINGS max_rows_to_read = 6; WITH ((1, 2), (2, 3)) AS liter_prepared_set SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i32) IN liter_prepared_set SETTINGS max_rows_to_read = 6; WITH ((1, 1), (2, 2)) AS liter_prepared_set SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i64) IN liter_prepared_set SETTINGS max_rows_to_read = 6; @@ -183,7 +183,7 @@ CREATE TABLE bloom_filter_array_lc_null_types_test ( fixed_string Array(LowCardinality(Nullable(FixedString(5)))), INDEX idx (i8, i16, i32, i64, u8, u16, u32, u64, f32, f64, date, date_time, str, fixed_string) TYPE bloom_filter GRANULARITY 1) -ENGINE = MergeTree() ORDER BY order_key SETTINGS index_granularity = 6; +ENGINE = MergeTree() ORDER BY order_key SETTINGS index_granularity = 6, allow_nullable_key = 1; INSERT INTO bloom_filter_array_lc_null_types_test SELECT groupArray(number) AS order_key, diff --git a/tests/queries/0_stateless/00974_query_profiler.reference b/tests/queries/0_stateless/00974_query_profiler.reference index e37cf5f7642..708c4988416 100644 --- a/tests/queries/0_stateless/00974_query_profiler.reference +++ b/tests/queries/0_stateless/00974_query_profiler.reference @@ -1,4 +1,4 @@ 0 0 1 -1000000000 0 +10000000000 0 1 diff --git a/tests/queries/0_stateless/00974_query_profiler.sql b/tests/queries/0_stateless/00974_query_profiler.sql index 45ba6504a79..24e4241b813 100644 --- a/tests/queries/0_stateless/00974_query_profiler.sql +++ b/tests/queries/0_stateless/00974_query_profiler.sql @@ -15,7 +15,7 @@ SELECT count() > 0 FROM system.trace_log t WHERE query_id = (SELECT query_id FRO SET query_profiler_real_time_period_ns = 0; SET query_profiler_cpu_time_period_ns = 1000000; SET log_queries = 1; -SELECT count(), ignore('test cpu time query profiler') FROM numbers(1000000000); +SELECT count(), ignore('test cpu time query profiler') FROM numbers_mt(10000000000); SET log_queries = 0; SYSTEM FLUSH LOGS; diff --git a/tests/queries/0_stateless/01045_bloom_filter_null_array.sql b/tests/queries/0_stateless/01045_bloom_filter_null_array.sql index 3dfc04ae8ff..4a5741b4e72 100644 --- a/tests/queries/0_stateless/01045_bloom_filter_null_array.sql +++ b/tests/queries/0_stateless/01045_bloom_filter_null_array.sql @@ -1,6 +1,6 @@ DROP TABLE IF EXISTS bloom_filter_null_array; -CREATE TABLE bloom_filter_null_array (v Array(LowCardinality(Nullable(String))), INDEX idx v TYPE bloom_filter(0.1) GRANULARITY 1) ENGINE = MergeTree() ORDER BY v; +CREATE TABLE bloom_filter_null_array (v Array(LowCardinality(Nullable(String))), INDEX idx v TYPE bloom_filter(0.1) GRANULARITY 1) ENGINE = MergeTree() ORDER BY v SETTINGS allow_nullable_key = 1; INSERT INTO bloom_filter_null_array VALUES ([]); INSERT INTO bloom_filter_null_array VALUES (['1', '2']) ([]) ([]); diff --git a/tests/queries/0_stateless/01047_window_view_parser_inner_table.reference b/tests/queries/0_stateless/01047_window_view_parser_inner_table.reference index 77f48f2832c..96f7cbb1d69 100644 --- a/tests/queries/0_stateless/01047_window_view_parser_inner_table.reference 
+++ b/tests/queries/0_stateless/01047_window_view_parser_inner_table.reference @@ -1,8 +1,12 @@ ---TUMBLE--- +||---DEFAULT ENGINE WITH DATA COLUMN ALIAS--- +CREATE TABLE test_01047.`.inner.wv`\n(\n `b` Int32,\n `windowID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'))`\nORDER BY (`windowID(timestamp, toIntervalSecond(\'1\'))`, b)\nSETTINGS index_granularity = 8192 ||---WINDOW COLUMN NAME--- CREATE TABLE test_01047.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nORDER BY `windowID(timestamp, toIntervalSecond(\'1\'))`\nSETTINGS index_granularity = 8192 ||---WINDOW COLUMN ALIAS--- CREATE TABLE test_01047.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nORDER BY `windowID(timestamp, toIntervalSecond(\'1\'))`\nSETTINGS index_granularity = 8192 +||---DATA COLUMN ALIAS--- +CREATE TABLE test_01047.`.inner.wv`\n(\n `b` Int32,\n `windowID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nORDER BY b\nSETTINGS index_granularity = 8192 ||---IDENTIFIER--- CREATE TABLE test_01047.`.inner.wv`\n(\n `b` Int32,\n `windowID(timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'))`\nORDER BY (`windowID(timestamp, toIntervalSecond(\'1\'))`, b)\nSETTINGS index_granularity = 8192 ||---FUNCTION--- @@ -10,10 +14,14 @@ CREATE TABLE test_01047.`.inner.wv`\n(\n `plus(a, b)` Int64,\n `windowID(t ||---PARTITION--- CREATE TABLE test_01047.`.inner.wv`\n(\n `windowID(____timestamp, toIntervalSecond(\'1\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPARTITION BY `windowID(____timestamp, toIntervalSecond(\'1\'))`\nORDER BY `windowID(____timestamp, toIntervalSecond(\'1\'))`\nSETTINGS index_granularity = 8192 ---HOP--- +||---DEFAULT ENGINE WITH DATA COLUMN ALIAS--- +CREATE TABLE test_01047.`.inner.wv`\n(\n `b` Int32,\n `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nORDER BY (`windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`, b)\nSETTINGS index_granularity = 8192 ||---WINDOW COLUMN NAME--- CREATE TABLE test_01047.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nORDER BY `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nSETTINGS index_granularity = 8192 ||---WINDOW COLUMN ALIAS--- CREATE TABLE test_01047.`.inner.wv`\n(\n `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nORDER BY `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nSETTINGS index_granularity = 8192 +||---DATA COLUMN ALIAS--- +CREATE TABLE test_01047.`.inner.wv`\n(\n `b` Int32,\n `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, 
Int32)\n)\nENGINE = AggregatingMergeTree\nORDER BY b\nSETTINGS index_granularity = 8192 ||---IDENTIFIER--- CREATE TABLE test_01047.`.inner.wv`\n(\n `b` Int32,\n `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))` UInt32,\n `count(a)` AggregateFunction(count, Int32)\n)\nENGINE = AggregatingMergeTree\nPRIMARY KEY `windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`\nORDER BY (`windowID(timestamp, toIntervalSecond(\'1\'), toIntervalSecond(\'3\'))`, b)\nSETTINGS index_granularity = 8192 ||---FUNCTION--- diff --git a/tests/queries/0_stateless/01047_window_view_parser_inner_table.sql b/tests/queries/0_stateless/01047_window_view_parser_inner_table.sql index 777c5ae2a5a..595d93e0771 100644 --- a/tests/queries/0_stateless/01047_window_view_parser_inner_table.sql +++ b/tests/queries/0_stateless/01047_window_view_parser_inner_table.sql @@ -9,6 +9,12 @@ DROP TABLE IF EXISTS test_01047.mt; CREATE TABLE test_01047.mt(a Int32, b Int32, timestamp DateTime) ENGINE=MergeTree ORDER BY tuple(); SELECT '---TUMBLE---'; +SELECT '||---DEFAULT ENGINE WITH DATA COLUMN ALIAS---'; +DROP TABLE IF EXISTS test_01047.wv; +DROP TABLE IF EXISTS test_01047.`.inner.wv`; +CREATE WINDOW VIEW test_01047.wv AS SELECT count(a) AS count, b as id FROM test_01047.mt GROUP BY id, tumble(timestamp, INTERVAL '1' SECOND); +SHOW CREATE TABLE test_01047.`.inner.wv`; + SELECT '||---WINDOW COLUMN NAME---'; DROP TABLE IF EXISTS test_01047.wv; DROP TABLE IF EXISTS test_01047.`.inner.wv`; @@ -21,6 +27,12 @@ DROP TABLE IF EXISTS test_01047.`.inner.wv`; CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY wid AS SELECT count(a) AS count, tumble(timestamp, INTERVAL '1' SECOND) AS wid FROM test_01047.mt GROUP BY wid; SHOW CREATE TABLE test_01047.`.inner.wv`; +SELECT '||---DATA COLUMN ALIAS---'; +DROP TABLE IF EXISTS test_01047.wv; +DROP TABLE IF EXISTS test_01047.`.inner.wv`; +CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY id AS SELECT count(a) AS count, b as id FROM test_01047.mt GROUP BY id, tumble(timestamp, INTERVAL '1' SECOND); +SHOW CREATE TABLE test_01047.`.inner.wv`; + SELECT '||---IDENTIFIER---'; DROP TABLE IF EXISTS test_01047.wv; DROP TABLE IF EXISTS test_01047.`.inner.wv`; @@ -41,6 +53,12 @@ SHOW CREATE TABLE test_01047.`.inner.wv`; SELECT '---HOP---'; +SELECT '||---DEFAULT ENGINE WITH DATA COLUMN ALIAS---'; +DROP TABLE IF EXISTS test_01047.wv; +DROP TABLE IF EXISTS test_01047.`.inner.wv`; +CREATE WINDOW VIEW test_01047.wv AS SELECT count(a) AS count, b as id FROM test_01047.mt GROUP BY id, hop(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND); +SHOW CREATE TABLE test_01047.`.inner.wv`; + SELECT '||---WINDOW COLUMN NAME---'; DROP TABLE IF EXISTS test_01047.wv; DROP TABLE IF EXISTS test_01047.`.inner.wv`; @@ -53,6 +71,12 @@ DROP TABLE IF EXISTS test_01047.`.inner.wv`; CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY wid AS SELECT count(a) AS count, hop(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND) AS wid FROM test_01047.mt GROUP BY wid; SHOW CREATE TABLE test_01047.`.inner.wv`; +SELECT '||---DATA COLUMN ALIAS---'; +DROP TABLE IF EXISTS test_01047.wv; +DROP TABLE IF EXISTS test_01047.`.inner.wv`; +CREATE WINDOW VIEW test_01047.wv ENGINE AggregatingMergeTree ORDER BY id AS SELECT count(a) AS count, b as id FROM test_01047.mt GROUP BY id, hop(timestamp, INTERVAL '1' SECOND, INTERVAL '3' SECOND); +SHOW CREATE TABLE test_01047.`.inner.wv`; + SELECT '||---IDENTIFIER---'; DROP TABLE IF EXISTS test_01047.wv; DROP TABLE IF EXISTS 
test_01047.`.inner.wv`; diff --git a/tests/queries/0_stateless/01064_incremental_streaming_from_2_src_with_feedback.sql b/tests/queries/0_stateless/01064_incremental_streaming_from_2_src_with_feedback.sql index a653206fe18..0bc5fcd1db8 100644 --- a/tests/queries/0_stateless/01064_incremental_streaming_from_2_src_with_feedback.sql +++ b/tests/queries/0_stateless/01064_incremental_streaming_from_2_src_with_feedback.sql @@ -89,8 +89,11 @@ INSERT INTO checkouts SELECT number as id, '2000-01-01 10:00:00' from numbers(50 -- by this time we should have 3 parts for target_table because of prev inserts -- and we plan to make two more inserts. With index_granularity=128 and max id=1000 -- we expect to read not more than: +-- 1000 rows read from numbers(1000) in the INSERT itself +-- 1000 rows in the `IN (SELECT id FROM table)` in the mat views -- (1000/128) marks per part * (3 + 2) parts * 128 granularity = 5120 rows -set max_rows_to_read = 5120; +-- Total: 7120 +set max_rows_to_read = 7120; INSERT INTO logins SELECT number as id, '2000-01-01 11:00:00' from numbers(1000); INSERT INTO checkouts SELECT number as id, '2000-01-01 11:10:00' from numbers(1000); @@ -98,8 +101,8 @@ INSERT INTO checkouts SELECT number as id, '2000-01-01 11:10:00' from numbers(10 -- by this time we should have 5 parts for target_table because of prev inserts -- and we plan to make two more inserts. With index_granularity=128 and max id=1 -- we expect to read not more than: --- 1 mark per part * (5 + 2) parts * 128 granularity = 896 rows -set max_rows_to_read = 896; +-- 1 mark per part * (5 + 2) parts * 128 granularity + 1 (numbers(1)) = 897 rows +set max_rows_to_read = 897; INSERT INTO logins SELECT number+2 as id, '2001-01-01 11:10:01' from numbers(1); INSERT INTO checkouts SELECT number+2 as id, '2001-01-01 11:10:02' from numbers(1); diff --git a/tests/queries/0_stateless/01074_h3_range_check.sql b/tests/queries/0_stateless/01074_h3_range_check.sql index acf59b16d75..4c655f44a8b 100644 --- a/tests/queries/0_stateless/01074_h3_range_check.sql +++ b/tests/queries/0_stateless/01074_h3_range_check.sql @@ -2,3 +2,4 @@ SELECT h3EdgeLengthM(100); -- { serverError 69 } SELECT h3HexAreaM2(100); -- { serverError 69 } +SELECT h3HexAreaKm2(100); -- { serverError 69 } diff --git a/tests/queries/0_stateless/01176_mysql_client_interactive.expect b/tests/queries/0_stateless/01176_mysql_client_interactive.expect index 37087dd85f7..5bbc77ccf14 100755 --- a/tests/queries/0_stateless/01176_mysql_client_interactive.expect +++ b/tests/queries/0_stateless/01176_mysql_client_interactive.expect @@ -5,11 +5,12 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail + expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01179_insert_values_semicolon.expect b/tests/queries/0_stateless/01179_insert_values_semicolon.expect index 0e65e5c4cbf..bf937c3a6a4 100755 --- a/tests/queries/0_stateless/01179_insert_values_semicolon.expect +++ b/tests/queries/0_stateless/01179_insert_values_semicolon.expect @@ -1,13 +1,14 @@ #!/usr/bin/expect -f +# Tags: long log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, 
change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01180_client_syntax_errors.expect b/tests/queries/0_stateless/01180_client_syntax_errors.expect index c20982b2991..6e4e975988e 100755 --- a/tests/queries/0_stateless/01180_client_syntax_errors.expect +++ b/tests/queries/0_stateless/01180_client_syntax_errors.expect @@ -3,11 +3,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01213_alter_rename_with_default_zookeeper_long.reference b/tests/queries/0_stateless/01213_alter_rename_with_default_zookeeper_long.reference index 2a6b00cdddb..968247bd35b 100644 --- a/tests/queries/0_stateless/01213_alter_rename_with_default_zookeeper_long.reference +++ b/tests/queries/0_stateless/01213_alter_rename_with_default_zookeeper_long.reference @@ -8,10 +8,10 @@ Hello 1 Word 1 date1 date2 value1 value2 2019-10-02 2018-10-02 1 1 -CREATE TABLE default.table_rename_with_ttl\n(\n `date1` Date,\n `date2` Date,\n `value1` String,\n `value2` String TTL date1 + toIntervalMonth(10000)\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/default/test_01213/table_rename_with_ttl\', \'1\')\nORDER BY tuple()\nTTL date2 + toIntervalMonth(10000)\nSETTINGS index_granularity = 8192 +CREATE TABLE default.table_rename_with_ttl\n(\n `date1` Date,\n `date2` Date,\n `value1` String,\n `value2` String TTL date1 + toIntervalMonth(500)\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/default/test_01213/table_rename_with_ttl\', \'1\')\nORDER BY tuple()\nTTL date2 + toIntervalMonth(500)\nSETTINGS index_granularity = 8192 renamed_date1 date2 value1 value2 2019-10-02 2018-10-02 1 1 -CREATE TABLE default.table_rename_with_ttl\n(\n `renamed_date1` Date,\n `date2` Date,\n `value1` String,\n `value2` String TTL renamed_date1 + toIntervalMonth(10000)\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/default/test_01213/table_rename_with_ttl\', \'1\')\nORDER BY tuple()\nTTL date2 + toIntervalMonth(10000)\nSETTINGS index_granularity = 8192 +CREATE TABLE default.table_rename_with_ttl\n(\n `renamed_date1` Date,\n `date2` Date,\n `value1` String,\n `value2` String TTL renamed_date1 + toIntervalMonth(500)\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/default/test_01213/table_rename_with_ttl\', \'1\')\nORDER BY tuple()\nTTL date2 + toIntervalMonth(500)\nSETTINGS index_granularity = 8192 renamed_date1 renamed_date2 value1 value2 2019-10-02 2018-10-02 1 1 -CREATE TABLE default.table_rename_with_ttl\n(\n `renamed_date1` Date,\n `renamed_date2` Date,\n `value1` String,\n `value2` String TTL renamed_date1 + toIntervalMonth(10000)\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/default/test_01213/table_rename_with_ttl\', \'1\')\nORDER BY tuple()\nTTL renamed_date2 + toIntervalMonth(10000)\nSETTINGS index_granularity = 8192 +CREATE TABLE default.table_rename_with_ttl\n(\n `renamed_date1` Date,\n `renamed_date2` Date,\n `value1` String,\n `value2` String TTL renamed_date1 + toIntervalMonth(500)\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/default/test_01213/table_rename_with_ttl\', \'1\')\nORDER BY tuple()\nTTL renamed_date2 + toIntervalMonth(500)\nSETTINGS index_granularity = 8192 diff --git a/tests/queries/0_stateless/01213_alter_rename_with_default_zookeeper_long.sql 
b/tests/queries/0_stateless/01213_alter_rename_with_default_zookeeper_long.sql index 986947d5979..a831fd18bfe 100644 --- a/tests/queries/0_stateless/01213_alter_rename_with_default_zookeeper_long.sql +++ b/tests/queries/0_stateless/01213_alter_rename_with_default_zookeeper_long.sql @@ -38,11 +38,11 @@ CREATE TABLE table_rename_with_ttl date1 Date, date2 Date, value1 String, - value2 String TTL date1 + INTERVAL 10000 MONTH + value2 String TTL date1 + INTERVAL 500 MONTH ) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_01213/table_rename_with_ttl', '1') ORDER BY tuple() -TTL date2 + INTERVAL 10000 MONTH; +TTL date2 + INTERVAL 500 MONTH; INSERT INTO table_rename_with_ttl SELECT toDateTime(toDate('2019-10-01') + number % 3, 'Europe/Moscow'), toDateTime(toDate('2018-10-01') + number % 3, 'Europe/Moscow'), toString(number), toString(number) from numbers(9); diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index d347f149230..cc237a40a3f 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -114,6 +114,7 @@ SYSTEM RESTORE REPLICA ['RESTORE REPLICA'] TABLE SYSTEM SYSTEM FLUSH DISTRIBUTED ['FLUSH DISTRIBUTED'] TABLE SYSTEM FLUSH SYSTEM FLUSH LOGS ['FLUSH LOGS'] GLOBAL SYSTEM FLUSH SYSTEM FLUSH [] \N SYSTEM +SYSTEM THREAD FUZZER ['SYSTEM START THREAD FUZZER','SYSTEM STOP THREAD FUZZER','START THREAD FUZZER','STOP THREAD FUZZER'] GLOBAL SYSTEM SYSTEM [] \N ALL dictGet ['dictHas','dictGetHierarchy','dictIsIn'] DICTIONARY ALL addressToLine [] GLOBAL INTROSPECTION diff --git a/tests/queries/0_stateless/01281_parseDateTime64BestEffort.reference b/tests/queries/0_stateless/01281_parseDateTime64BestEffort.reference index 5d2507d2a08..b76db01a8ab 100644 --- a/tests/queries/0_stateless/01281_parseDateTime64BestEffort.reference +++ b/tests/queries/0_stateless/01281_parseDateTime64BestEffort.reference @@ -13,3 +13,7 @@ Formats 2020-05-14 03:37:03.253 2020-05-14 03:37:03.000 2020-05-14 03:37:03.000 +Unix Timestamp with Milliseconds +2021-12-28 00:00:00.123 +2021-12-28 00:00:00.1 +2021-12-28 00:00:00.123000 diff --git a/tests/queries/0_stateless/01281_parseDateTime64BestEffort.sql b/tests/queries/0_stateless/01281_parseDateTime64BestEffort.sql index 5c0bbe1b4c2..ac1186284be 100644 --- a/tests/queries/0_stateless/01281_parseDateTime64BestEffort.sql +++ b/tests/queries/0_stateless/01281_parseDateTime64BestEffort.sql @@ -30,4 +30,9 @@ SELECT parseDateTime64BestEffort('2020-05-14T03:37:03.253184Z', 3, 'Europe/Minsk SELECT 'Formats'; SELECT parseDateTime64BestEffort('2020-05-14T03:37:03.253184', 3, 'UTC'); SELECT parseDateTime64BestEffort('2020-05-14T03:37:03', 3, 'UTC'); -SELECT parseDateTime64BestEffort('2020-05-14 03:37:03', 3, 'UTC'); \ No newline at end of file +SELECT parseDateTime64BestEffort('2020-05-14 03:37:03', 3, 'UTC'); + +SELECT 'Unix Timestamp with Milliseconds'; +SELECT parseDateTime64BestEffort('1640649600123', 3, 'UTC'); +SELECT parseDateTime64BestEffort('1640649600123', 1, 'UTC'); +SELECT parseDateTime64BestEffort('1640649600123', 6, 'UTC'); diff --git a/tests/queries/0_stateless/01282_system_parts_ttl_info.sql b/tests/queries/0_stateless/01282_system_parts_ttl_info.sql index dfa340636b3..ede5350ddd4 100644 --- a/tests/queries/0_stateless/01282_system_parts_ttl_info.sql +++ b/tests/queries/0_stateless/01282_system_parts_ttl_info.sql @@ -1,5 +1,5 @@ DROP TABLE IF EXISTS ttl; -CREATE TABLE ttl (d DateTime) ENGINE = MergeTree 
ORDER BY tuple() TTL d + INTERVAL 10 DAY; +CREATE TABLE ttl (d DateTime) ENGINE = MergeTree ORDER BY tuple() TTL d + INTERVAL 10 DAY SETTINGS remove_empty_parts=0; SYSTEM STOP MERGES ttl; INSERT INTO ttl VALUES ('2000-01-01 01:02:03'), ('2000-02-03 04:05:06'); SELECT rows, delete_ttl_info_min, delete_ttl_info_max, move_ttl_info.expression, move_ttl_info.min, move_ttl_info.max FROM system.parts WHERE database = currentDatabase() AND table = 'ttl'; diff --git a/tests/queries/0_stateless/01293_client_interactive_vertical_multiline.expect b/tests/queries/0_stateless/01293_client_interactive_vertical_multiline.expect index 5e845754402..e4442047c87 100755 --- a/tests/queries/0_stateless/01293_client_interactive_vertical_multiline.expect +++ b/tests/queries/0_stateless/01293_client_interactive_vertical_multiline.expect @@ -4,11 +4,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01293_client_interactive_vertical_singleline.expect b/tests/queries/0_stateless/01293_client_interactive_vertical_singleline.expect index c68b153d5d3..2f871ab46d8 100755 --- a/tests/queries/0_stateless/01293_client_interactive_vertical_singleline.expect +++ b/tests/queries/0_stateless/01293_client_interactive_vertical_singleline.expect @@ -3,11 +3,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01300_client_save_history_when_terminated_long.expect b/tests/queries/0_stateless/01300_client_save_history_when_terminated_long.expect index 05d9d408228..ad5b7625929 100755 --- a/tests/queries/0_stateless/01300_client_save_history_when_terminated_long.expect +++ b/tests/queries/0_stateless/01300_client_save_history_when_terminated_long.expect @@ -4,11 +4,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01370_client_autocomplete_word_break_characters.expect b/tests/queries/0_stateless/01370_client_autocomplete_word_break_characters.expect index f3a28bbee9b..9c20b7c517e 100755 --- a/tests/queries/0_stateless/01370_client_autocomplete_word_break_characters.expect +++ b/tests/queries/0_stateless/01370_client_autocomplete_word_break_characters.expect @@ -3,11 +3,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01410_nullable_key_and_index.sql b/tests/queries/0_stateless/01410_nullable_key_and_index.sql index fd1712b5d24..46a58152700 100644 --- 
a/tests/queries/0_stateless/01410_nullable_key_and_index.sql +++ b/tests/queries/0_stateless/01410_nullable_key_and_index.sql @@ -65,3 +65,12 @@ CREATE TABLE xxxx_null (`ts` Nullable(DateTime)) ENGINE = MergeTree ORDER BY toS INSERT INTO xxxx_null SELECT '2021-11-11 00:00:00'; SELECT * FROM xxxx_null WHERE ts > '2021-10-11 00:00:00'; DROP TABLE xxxx_null; + +-- nullable keys are forbidden when `allow_nullable_key = 0` +CREATE TABLE invalid_null (id Nullable(String)) ENGINE = MergeTree ORDER BY id; -- { serverError 44 } +CREATE TABLE invalid_lc_null (id LowCardinality(Nullable(String))) ENGINE = MergeTree ORDER BY id; -- { serverError 44 } +CREATE TABLE invalid_array_null (id Array(Nullable(String))) ENGINE = MergeTree ORDER BY id; -- { serverError 44 } +CREATE TABLE invalid_tuple_null (id Tuple(Nullable(String), UInt8)) ENGINE = MergeTree ORDER BY id; -- { serverError 44 } +CREATE TABLE invalid_map_null (id Map(UInt8, Nullable(String))) ENGINE = MergeTree ORDER BY id; -- { serverError 44 } +CREATE TABLE invalid_simple_agg_state_null (id SimpleAggregateFunction(sum, Nullable(UInt64))) ENGINE = MergeTree ORDER BY id; -- { serverError 44 } +-- AggregateFunctions are not comparable and cannot be used in key expressions. No need to test it. diff --git a/tests/queries/0_stateless/01410_nullable_key_more_tests.reference b/tests/queries/0_stateless/01410_nullable_key_more_tests.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01410_nullable_key_more_tests.sh b/tests/queries/0_stateless/01410_nullable_key_more_tests.sh new file mode 100755 index 00000000000..03bebed324b --- /dev/null +++ b/tests/queries/0_stateless/01410_nullable_key_more_tests.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +# Tags: no-parallel + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +test_func() +{ + engine=$1 + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "drop table if exists table_with_nullable_keys" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "create table table_with_nullable_keys (nullable_int Nullable(UInt32), nullable_str Nullable(String), nullable_lc LowCardinality(Nullable(String)), nullable_ints Array(Nullable(UInt32)), nullable_misc Tuple(Nullable(String), Nullable(UInt32)), nullable_val Map(UInt32, Nullable(String)), value UInt8) engine $engine order by (nullable_int, nullable_str, nullable_lc, nullable_ints, nullable_misc, nullable_val) settings allow_nullable_key = 1, index_granularity = 1" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "insert into table_with_nullable_keys select * replace (cast(nullable_val as Map(UInt32, Nullable(String))) as nullable_val) from generateRandom('nullable_int Nullable(UInt32), nullable_str Nullable(String), nullable_lc Nullable(String), nullable_ints Array(Nullable(UInt32)), nullable_misc Tuple(Nullable(String), Nullable(UInt32)), nullable_val Array(Tuple(UInt32, Nullable(String))), value UInt8', 1, 30, 30) limit 1024" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "select * from table_with_nullable_keys where nullable_str = (select randomPrintableASCII(30)) or nullable_str in (select randomPrintableASCII(30) from numbers(3)) format Null" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "select * from table_with_nullable_keys where nullable_lc = (select randomPrintableASCII(30)) or nullable_lc in (select randomPrintableASCII(30) from numbers(3)) format Null" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "select * from table_with_nullable_keys where nullable_ints = [1, 2, null] or nullable_ints in (select * from generateRandom('nullable_ints Array(Nullable(UInt32))', 1, 30, 30) limit 3) format Null" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "select * from table_with_nullable_keys where nullable_misc = (select (randomPrintableASCII(30), rand())) or nullable_misc in (select arrayJoin([(randomPrintableASCII(30), null), (null, rand())]))" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "select * from table_with_nullable_keys where nullable_val = (select map(rand(), randomPrintableASCII(10), rand(2), randomPrintableASCII(20), rand(3), null)) or nullable_val in (select cast(nullable_ints as Map(UInt32, Nullable(String))) from generateRandom('nullable_ints Array(Tuple(UInt32, Nullable(String)))', 1, 30, 30) limit 3) format Null" + + curl -d@- -sS "${CLICKHOUSE_URL}" <<< "drop table table_with_nullable_keys" +} + +test_func MergeTree +test_func AggregatingMergeTree +test_func ReplacingMergeTree diff --git a/tests/queries/0_stateless/01414_low_cardinality_nullable.sql b/tests/queries/0_stateless/01414_low_cardinality_nullable.sql index 596e90adfd6..c11e990cea8 100644 --- a/tests/queries/0_stateless/01414_low_cardinality_nullable.sql +++ b/tests/queries/0_stateless/01414_low_cardinality_nullable.sql @@ -19,7 +19,7 @@ CREATE TABLE lc_nullable ( str Array(LowCardinality(Nullable(String))), fixed_string Array(LowCardinality(Nullable(FixedString(5)))) -) ENGINE = MergeTree() ORDER BY order_key; +) ENGINE = MergeTree() ORDER BY order_key SETTINGS allow_nullable_key = 1; INSERT INTO lc_nullable SELECT groupArray(number) AS order_key, diff --git a/tests/queries/0_stateless/01442_date_time_with_params.reference b/tests/queries/0_stateless/01442_date_time_with_params.reference index 19f78c83f82..726e59d4d35 100644 --- a/tests/queries/0_stateless/01442_date_time_with_params.reference +++ 
b/tests/queries/0_stateless/01442_date_time_with_params.reference @@ -13,6 +13,8 @@ parseDateTimeBestEffort 2020-05-14 03:37:03.253 DateTime64(3, \'UTC\') 2020-05-14 06:37:03.253 DateTime64(3, \'Europe/Minsk\') 2020-05-14 03:37:03.253 DateTime64(3, \'UTC\') +2021-12-28 00:00:00.123 DateTime64(3, \'UTC\') +2021-12-28 00:00:00 DateTime(\'UTC\') parseDateTimeBestEffortOrNull \N Nullable(DateTime64(3)) 2020-05-14 03:37:03.000 Nullable(DateTime64(3, \'UTC\')) @@ -25,6 +27,8 @@ parseDateTimeBestEffortOrNull 2020-05-14 03:37:03.253 Nullable(DateTime64(3, \'UTC\')) 2020-05-14 06:37:03.253 Nullable(DateTime64(3, \'Europe/Minsk\')) 2020-05-14 03:37:03.253 Nullable(DateTime64(3, \'UTC\')) +2021-12-28 00:00:00.123 Nullable(DateTime64(3, \'UTC\')) +2021-12-28 00:00:00 Nullable(DateTime(\'UTC\')) parseDateTimeBestEffortOrZero 1970-01-01 00:00:00.000 DateTime64(3, \'UTC\') 2020-05-14 03:37:03.000 DateTime64(3, \'UTC\') @@ -37,6 +41,8 @@ parseDateTimeBestEffortOrZero 2020-05-14 03:37:03.253 DateTime64(3, \'UTC\') 2020-05-14 06:37:03.253 DateTime64(3, \'Europe/Minsk\') 2020-05-14 03:37:03.253 DateTime64(3, \'UTC\') +2021-12-28 00:00:00.123 DateTime64(3, \'UTC\') +2021-12-28 00:00:00 DateTime(\'UTC\') parseDateTime32BestEffort 2020-05-14 03:37:03 DateTime(\'UTC\') 2020-05-14 03:37:03 DateTime(\'UTC\') @@ -48,6 +54,7 @@ parseDateTime32BestEffort 2020-05-14 03:37:03 DateTime(\'UTC\') 2020-05-14 06:37:03 DateTime(\'Europe/Minsk\') 2020-05-14 03:37:03 DateTime(\'UTC\') +2021-12-28 00:00:00 DateTime(\'UTC\') parseDateTime32BestEffortOrNull \N Nullable(DateTime) 2020-05-14 03:37:03 Nullable(DateTime(\'UTC\')) @@ -60,6 +67,7 @@ parseDateTime32BestEffortOrNull 2020-05-14 03:37:03 Nullable(DateTime(\'UTC\')) 2020-05-14 06:37:03 Nullable(DateTime(\'Europe/Minsk\')) 2020-05-14 03:37:03 Nullable(DateTime(\'UTC\')) +2021-12-28 00:00:00 Nullable(DateTime(\'UTC\')) parseDateTime32BestEffortOrZero 1970-01-01 00:00:00 DateTime(\'UTC\') 2020-05-14 03:37:03 DateTime(\'UTC\') @@ -72,3 +80,4 @@ parseDateTime32BestEffortOrZero 2020-05-14 03:37:03 DateTime(\'UTC\') 2020-05-14 06:37:03 DateTime(\'Europe/Minsk\') 2020-05-14 03:37:03 DateTime(\'UTC\') +2021-12-28 00:00:00 DateTime(\'UTC\') diff --git a/tests/queries/0_stateless/01442_date_time_with_params.sql b/tests/queries/0_stateless/01442_date_time_with_params.sql index 52815460245..5a57aabdb0c 100644 --- a/tests/queries/0_stateless/01442_date_time_with_params.sql +++ b/tests/queries/0_stateless/01442_date_time_with_params.sql @@ -24,6 +24,8 @@ SELECT parseDateTimeBestEffort('2020-05-14T03:37:03.253184', 3, 'UTC') AS a, toT SELECT parseDateTimeBestEffort('2020-05-14T03:37:03.253184Z', 3, 'UTC') AS a, toTypeName(a); SELECT parseDateTimeBestEffort('2020-05-14T03:37:03.253184Z', 3, 'Europe/Minsk') AS a, toTypeName(a); SELECT parseDateTimeBestEffort(materialize('2020-05-14T03:37:03.253184Z'), 3, 'UTC') AS a, toTypeName(a); +SELECT parseDateTimeBestEffort('1640649600123', 3, 'UTC') AS a, toTypeName(a); +SELECT parseDateTimeBestEffort('1640649600123', 'UTC') AS a, toTypeName(a); SELECT 'parseDateTimeBestEffortOrNull'; SELECT parseDateTimeBestEffortOrNull('', 3) AS a, toTypeName(a); @@ -37,6 +39,8 @@ SELECT parseDateTimeBestEffortOrNull('2020-05-14T03:37:03.253184', 3, 'UTC') AS SELECT parseDateTimeBestEffortOrNull('2020-05-14T03:37:03.253184Z', 3, 'UTC') AS a, toTypeName(a); SELECT parseDateTimeBestEffortOrNull('2020-05-14T03:37:03.253184Z', 3, 'Europe/Minsk') AS a, toTypeName(a); SELECT parseDateTimeBestEffortOrNull(materialize('2020-05-14T03:37:03.253184Z'), 3, 'UTC') AS a, 
toTypeName(a); +SELECT parseDateTimeBestEffortOrNull('1640649600123', 3, 'UTC') AS a, toTypeName(a); +SELECT parseDateTimeBestEffortOrNull('1640649600123', 'UTC') AS a, toTypeName(a); SELECT 'parseDateTimeBestEffortOrZero'; SELECT parseDateTimeBestEffortOrZero('', 3, 'UTC') AS a, toTypeName(a); @@ -50,6 +54,8 @@ SELECT parseDateTimeBestEffortOrZero('2020-05-14T03:37:03.253184', 3, 'UTC') AS SELECT parseDateTimeBestEffortOrZero('2020-05-14T03:37:03.253184Z', 3, 'UTC') AS a, toTypeName(a); SELECT parseDateTimeBestEffortOrZero('2020-05-14T03:37:03.253184Z', 3, 'Europe/Minsk') AS a, toTypeName(a); SELECT parseDateTimeBestEffortOrZero(materialize('2020-05-14T03:37:03.253184Z'), 3, 'UTC') AS a, toTypeName(a); +SELECT parseDateTimeBestEffortOrZero('1640649600123', 3, 'UTC') AS a, toTypeName(a); +SELECT parseDateTimeBestEffortOrZero('1640649600123', 'UTC') AS a, toTypeName(a); SELECT 'parseDateTime32BestEffort'; SELECT parseDateTime32BestEffort('') AS a, toTypeName(a); -- {serverError 41} @@ -63,6 +69,7 @@ SELECT parseDateTime32BestEffort('2020-05-14T03:37:03.253184', 'UTC') AS a, toTy SELECT parseDateTime32BestEffort('2020-05-14T03:37:03.253184Z', 'UTC') AS a, toTypeName(a); SELECT parseDateTime32BestEffort('2020-05-14T03:37:03.253184Z', 'Europe/Minsk') AS a, toTypeName(a); SELECT parseDateTime32BestEffort(materialize('2020-05-14T03:37:03.253184Z'), 'UTC') AS a, toTypeName(a); +SELECT parseDateTime32BestEffort('1640649600123', 'UTC') AS a, toTypeName(a); SELECT 'parseDateTime32BestEffortOrNull'; SELECT parseDateTime32BestEffortOrNull('') AS a, toTypeName(a); @@ -76,6 +83,7 @@ SELECT parseDateTime32BestEffortOrNull('2020-05-14T03:37:03.253184', 'UTC') AS a SELECT parseDateTime32BestEffortOrNull('2020-05-14T03:37:03.253184Z', 'UTC') AS a, toTypeName(a); SELECT parseDateTime32BestEffortOrNull('2020-05-14T03:37:03.253184Z', 'Europe/Minsk') AS a, toTypeName(a); SELECT parseDateTime32BestEffortOrNull(materialize('2020-05-14T03:37:03.253184Z'), 'UTC') AS a, toTypeName(a); +SELECT parseDateTime32BestEffortOrNull('1640649600123', 'UTC') AS a, toTypeName(a); SELECT 'parseDateTime32BestEffortOrZero'; SELECT parseDateTime32BestEffortOrZero('', 'UTC') AS a, toTypeName(a); @@ -89,6 +97,6 @@ SELECT parseDateTime32BestEffortOrZero('2020-05-14T03:37:03.253184', 'UTC') AS a SELECT parseDateTime32BestEffortOrZero('2020-05-14T03:37:03.253184Z', 'UTC') AS a, toTypeName(a); SELECT parseDateTime32BestEffortOrZero('2020-05-14T03:37:03.253184Z', 'Europe/Minsk') AS a, toTypeName(a); SELECT parseDateTime32BestEffortOrZero(materialize('2020-05-14T03:37:03.253184Z'), 'UTC') AS a, toTypeName(a); - +SELECT parseDateTime32BestEffortOrZero('1640649600123', 'UTC') AS a, toTypeName(a); DROP TABLE IF EXISTS test; diff --git a/tests/queries/0_stateless/01503_if_const_optimization.reference b/tests/queries/0_stateless/01503_if_const_optimization.reference index e69de29bb2d..dec7d2fabd2 100644 --- a/tests/queries/0_stateless/01503_if_const_optimization.reference +++ b/tests/queries/0_stateless/01503_if_const_optimization.reference @@ -0,0 +1 @@ +\N diff --git a/tests/queries/0_stateless/01503_if_const_optimization.sql b/tests/queries/0_stateless/01503_if_const_optimization.sql index 047f6f757e8..a64be6bc80b 100644 --- a/tests/queries/0_stateless/01503_if_const_optimization.sql +++ b/tests/queries/0_stateless/01503_if_const_optimization.sql @@ -1 +1 @@ -SELECT if(CAST(NULL), '2.55', NULL) AS x; -- { serverError 42 } +SELECT if(CAST(NULL AS Nullable(UInt8)), '2.55', NULL) AS x; diff --git a/tests/queries/0_stateless/01504_rocksdb.sql 
b/tests/queries/0_stateless/01504_rocksdb.sql index 9f9e6c3b1ac..f79f31139fe 100644 --- a/tests/queries/0_stateless/01504_rocksdb.sql +++ b/tests/queries/0_stateless/01504_rocksdb.sql @@ -34,7 +34,7 @@ INSERT INTO 01504_test_memory SELECT number % 77 AS k, SUM(number) AS value, (1, SELECT A.a = B.a, A.b = B.b, A.c = B.c, A.d = B.d, A.e = B.e FROM ( SELECT 0 AS a, groupBitmapMerge(bm) AS b , SUM(k) AS c, SUM(value) AS d, SUM(dummy.1) AS e FROM 01504_test) A ANY LEFT JOIN (SELECT 0 AS a, groupBitmapMerge(bm) AS b , SUM(k) AS c, SUM(value) AS d, SUM(dummy.1) AS e FROM 01504_test_memory) B USING a ORDER BY a; -CREATE TEMPORARY TABLE keys AS SELECT * FROM numbers(1000); +CREATE TEMPORARY TABLE keys AS SELECT * FROM system.numbers LIMIT 1 OFFSET 4; SET max_rows_to_read = 2; SELECT dummy == (1,1.2) FROM 01504_test WHERE k IN (1, 3) OR k IN (1) OR k IN (3, 1) OR k IN [1] OR k IN [1, 3] ; diff --git a/tests/queries/0_stateless/01520_client_print_query_id.expect b/tests/queries/0_stateless/01520_client_print_query_id.expect index b0ff5d9d165..8b6e0e17a85 100755 --- a/tests/queries/0_stateless/01520_client_print_query_id.expect +++ b/tests/queries/0_stateless/01520_client_print_query_id.expect @@ -3,11 +3,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01534_lambda_array_join.sql b/tests/queries/0_stateless/01534_lambda_array_join.sql index aee9dd1411a..092c569b25f 100644 --- a/tests/queries/0_stateless/01534_lambda_array_join.sql +++ b/tests/queries/0_stateless/01534_lambda_array_join.sql @@ -6,7 +6,7 @@ SELECT count() AS c FROM numbers(10) GROUP BY - arrayMap(x -> reinterpretAsUInt8(substring(randomString(randomString(range(randomString(255), NULL)), NULL))), range(3)), + arrayMap(x -> reinterpretAsUInt8(substring(randomString(randomString(range(randomString(255), NULL)), NULL), NULL)), range(3)), randomString(range(randomString(1048577), NULL), NULL), byte ORDER BY byte ASC; diff --git a/tests/queries/0_stateless/01565_reconnect_after_client_error.expect b/tests/queries/0_stateless/01565_reconnect_after_client_error.expect index 712fe4ff64a..819450ffd30 100755 --- a/tests/queries/0_stateless/01565_reconnect_after_client_error.expect +++ b/tests/queries/0_stateless/01565_reconnect_after_client_error.expect @@ -1,4 +1,5 @@ #!/usr/bin/expect -f +# Tags: long # This is a separate test, because we want to test the interactive mode. 
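The max_rows_to_read bumps that recur through these test hunks (for example 00945_bloom_filter_index, 01504_rocksdb, 01583_const_column_in_set_index, 01585_use_index_for_global_in and 01748_partition_id_pruning) share one rationale, stated in the hunks' own comments: the limit now also has to cover the rows scanned while evaluating an IN (SELECT ...) subquery, not only the rows read from the indexed table, so each budget grows by the subquery's row count. Below is a minimal sketch of that accounting, modelled on the 01583_const_column_in_set_index change; the table name insub_sketch is hypothetical, and the exact number of rows read from the table depends on index_granularity and granule layout.

drop table if exists insub_sketch;
create table insub_sketch (i int, j int) engine MergeTree order by i settings index_granularity = 1;
insert into insub_sketch select number, number + 2 from numbers(10);
-- 10 rows are read by the subquery over numbers(10), plus roughly 2 more from the
-- table lookup itself, so a budget of 2 that only counted the table read would now
-- be rejected for reading too many rows.
set max_rows_to_read = 12;
select * from insub_sketch where i in (select toInt32(3) from numbers(10));
drop table insub_sketch;
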
# https://github.com/ClickHouse/ClickHouse/issues/19353 @@ -7,11 +8,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01583_const_column_in_set_index.sql b/tests/queries/0_stateless/01583_const_column_in_set_index.sql index e40249eaf08..b781efb0f13 100644 --- a/tests/queries/0_stateless/01583_const_column_in_set_index.sql +++ b/tests/queries/0_stateless/01583_const_column_in_set_index.sql @@ -3,7 +3,7 @@ drop table if exists insub; create table insub (i int, j int) engine MergeTree order by i settings index_granularity = 1; insert into insub select number a, a + 2 from numbers(10); -SET max_rows_to_read = 2; +SET max_rows_to_read = 12; -- 10 from numbers + 2 from table select * from insub where i in (select toInt32(3) from numbers(10)); drop table if exists insub; diff --git a/tests/queries/0_stateless/01585_use_index_for_global_in.sql b/tests/queries/0_stateless/01585_use_index_for_global_in.sql index a0a5b90ac1f..1dd7609350f 100644 --- a/tests/queries/0_stateless/01585_use_index_for_global_in.sql +++ b/tests/queries/0_stateless/01585_use_index_for_global_in.sql @@ -8,10 +8,12 @@ create table xp_d as xp engine Distributed(test_shard_localhost, currentDatabase insert into xp select number, number + 2 from numbers(10); -set max_rows_to_read = 2; +set max_rows_to_read = 4; -- 2 from numbers, 2 from tables select * from xp where i in (select * from numbers(2)); select * from xp where i global in (select * from numbers(2)); select * from xp_d where i in (select * from numbers(2)); + +set max_rows_to_read = 6; -- 2 from numbers, 2 from GLOBAL temp table (pushed from numbers), 2 from local xp select * from xp_d where i global in (select * from numbers(2)); drop table if exists xp; diff --git a/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.reference b/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.reference index de0116f9eaa..0cb1993057f 100644 --- a/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.reference +++ b/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.reference @@ -14,6 +14,14 @@ 1 3 0 2 1 3 +0 2 +1 3 +0 2 +1 3 +0 2 +1 3 +0 2 +1 3 \N 100 \N 100 \N 100 diff --git a/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.sql b/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.sql index 6129c92c888..d4147a445ec 100644 --- a/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.sql +++ b/tests/queries/0_stateless/01585_use_index_for_global_in_with_null.sql @@ -12,17 +12,29 @@ insert into xp select null, 100; optimize table xp final; set max_rows_to_read = 2; +select * from xp where i in [0, 1]; +select * from xp where i global in [0, 1]; +select * from xp_d where i in [0, 1]; +select * from xp_d where i global in [0, 1]; + +set max_rows_to_read = 4; -- 2 in the subquery, 2 in the query itself select * from xp where i in (select * from numbers(2)); select * from xp where i global in (select * from numbers(2)); select * from xp_d where i in (select * from numbers(2)); + +set max_rows_to_read = 6; -- 2 subquery, 2 from global temp table (GLOBAL IN), 2 from local xp table select * from xp_d where i global in (select * from numbers(2)); set transform_null_in = 1; +set max_rows_to_read = 4; -- 2 
in the subquery, 2 in the query itself select * from xp where i in (select * from numbers(2)); select * from xp where i global in (select * from numbers(2)); select * from xp_d where i in (select * from numbers(2)); + +set max_rows_to_read = 6; -- 2 subquery, 2 from global temp table (GLOBAL IN), 2 from local xp table select * from xp_d where i global in (select * from numbers(2)); +set max_rows_to_read = 0; -- No rows should be read select * from xp where i in (null); select * from xp where i global in (null); select * from xp_d where i in (null); diff --git a/tests/queries/0_stateless/01600_parts_states_metrics_long.sh b/tests/queries/0_stateless/01600_parts_states_metrics_long.sh index 9c0d28fdd91..f47d0863e69 100755 --- a/tests/queries/0_stateless/01600_parts_states_metrics_long.sh +++ b/tests/queries/0_stateless/01600_parts_states_metrics_long.sh @@ -7,7 +7,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # NOTE: database = $CLICKHOUSE_DATABASE is unwanted verify_sql="SELECT - (SELECT sumIf(value, metric = 'PartsCommitted'), sumIf(value, metric = 'PartsOutdated') FROM system.metrics) + (SELECT sumIf(value, metric = 'PartsActive'), sumIf(value, metric = 'PartsOutdated') FROM system.metrics) = (SELECT sum(active), sum(NOT active) FROM system.parts)" # The query is not atomic - it can compare states between system.parts and system.metrics from different points in time. diff --git a/tests/queries/0_stateless/01604_explain_ast_of_nonselect_query.reference b/tests/queries/0_stateless/01604_explain_ast_of_nonselect_query.reference index 887c701a5e4..4cc67aa517c 100644 --- a/tests/queries/0_stateless/01604_explain_ast_of_nonselect_query.reference +++ b/tests/queries/0_stateless/01604_explain_ast_of_nonselect_query.reference @@ -1,6 +1,6 @@ AlterQuery t1 (children 2) ExpressionList (children 1) - AlterCommand 33 (children 1) + AlterCommand DELETE (children 1) Function equals (children 1) ExpressionList (children 2) Identifier date diff --git a/tests/queries/0_stateless/01660_system_parts_smoke.reference b/tests/queries/0_stateless/01660_system_parts_smoke.reference index f21fab8e539..36550f31bd0 100644 --- a/tests/queries/0_stateless/01660_system_parts_smoke.reference +++ b/tests/queries/0_stateless/01660_system_parts_smoke.reference @@ -1,13 +1,13 @@ # two parts -Committed -Committed -all_1_1_0 Committed -all_2_2_0 Committed +Active +Active +all_1_1_0 Active +all_2_2_0 Active all_1_1_0 1 all_2_2_0 1 # optimize +1 Active 2 Outdated -1 Committed # truncate Outdated Outdated diff --git a/tests/queries/0_stateless/01676_long_clickhouse_client_autocomplete.sh b/tests/queries/0_stateless/01676_long_clickhouse_client_autocomplete.sh index e029d90a686..1be082a6aae 100755 --- a/tests/queries/0_stateless/01676_long_clickhouse_client_autocomplete.sh +++ b/tests/queries/0_stateless/01676_long_clickhouse_client_autocomplete.sh @@ -20,11 +20,11 @@ function test_completion_word_client() log_user 0 set timeout 3 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } spawn bash -c "$CLICKHOUSE_CLIENT_BINARY $CLICKHOUSE_CLIENT_OPT" @@ -104,11 +104,11 @@ function test_completion_word_local() log_user 0 set timeout 3 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # 
A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } spawn bash -c "$CLICKHOUSE_LOCAL" diff --git a/tests/queries/0_stateless/01748_partition_id_pruning.sql b/tests/queries/0_stateless/01748_partition_id_pruning.sql index 17a405e17ad..e0d45884c60 100644 --- a/tests/queries/0_stateless/01748_partition_id_pruning.sql +++ b/tests/queries/0_stateless/01748_partition_id_pruning.sql @@ -8,12 +8,12 @@ set max_rows_to_read = 3; select * from x where _partition_id = partitionId(1); -set max_rows_to_read = 4; -- one row for subquery +set max_rows_to_read = 5; -- one row for subquery + subquery select * from x where _partition_id in (select partitionId(number + 1) from numbers(1)); -- trivial count optimization test -set max_rows_to_read = 1; -- one row for subquery +set max_rows_to_read = 2; -- one row for subquery + subquery itself select count() from x where _partition_id in (select partitionId(number + 1) from numbers(1)); drop table x; diff --git a/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect b/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect index 5543af4dd05..022320e2d4b 100755 --- a/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect +++ b/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect @@ -3,11 +3,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 2 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01910_client_replxx_container_overflow_long.expect b/tests/queries/0_stateless/01910_client_replxx_container_overflow_long.expect index 138727d296d..d5ce4c3cbf2 100755 --- a/tests/queries/0_stateless/01910_client_replxx_container_overflow_long.expect +++ b/tests/queries/0_stateless/01910_client_replxx_container_overflow_long.expect @@ -4,11 +4,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01922_sum_null_for_remote.reference b/tests/queries/0_stateless/01922_sum_null_for_remote.reference new file mode 100644 index 00000000000..dec7d2fabd2 --- /dev/null +++ b/tests/queries/0_stateless/01922_sum_null_for_remote.reference @@ -0,0 +1 @@ +\N diff --git a/tests/queries/0_stateless/01922_sum_null_for_remote.sql b/tests/queries/0_stateless/01922_sum_null_for_remote.sql new file mode 100644 index 00000000000..a19740364a1 --- /dev/null +++ b/tests/queries/0_stateless/01922_sum_null_for_remote.sql @@ -0,0 +1 @@ +select sum(null) from remote('127.0.0.{1,2}', 'system', 'one') diff --git a/tests/queries/0_stateless/01927_query_views_log_current_database.reference b/tests/queries/0_stateless/01927_query_views_log_current_database.reference index ff9eca2d97f..eaa1e98c55c 100644 --- a/tests/queries/0_stateless/01927_query_views_log_current_database.reference +++ b/tests/queries/0_stateless/01927_query_views_log_current_database.reference @@ -1,70 +1,94 @@ Row 1: ────── -stage: Query log rows -read_rows: 100 -written_rows: 201 -databases: 
['_table_function','default'] -tables: ['_table_function.numbers','default.table_a','default.table_b','default.table_b_live_view','default.table_c'] -views: ['default.matview_a_to_b','default.matview_b_to_c','default.table_b_live_view'] -sleep_calls: 200 -sleep_us: 298 +stage: Query log rows +read_rows: 400 +written_rows: 201 +databases: ['_table_function','default'] +tables: ['_table_function.numbers','default.table_a','default.table_b','default.table_b_live_view','default.table_c'] +views: ['default.matview_a_to_b','default.matview_b_to_c','default.table_b_live_view'] +sleep_calls: 200 +sleep_us: 298 +profile_select_rows: 400 +profile_select_bytes: 5200 +profile_insert_rows: 201 +profile_insert_bytes: 2808 Row 1: ────── -stage: Depending views -view_name: default.matview_a_to_b -view_type: Materialized -status: QueryFinish -view_target: default.table_b -view_query: SELECT toFloat64(a) AS a, b + sleepEachRow(0.000001) AS count FROM default.table_a -read_rows: 100 -written_rows: 100 -sleep_calls: 100 -sleep_us: 99 +stage: Depending views +view_name: default.matview_a_to_b +view_type: Materialized +status: QueryFinish +view_target: default.table_b +view_query: SELECT toFloat64(a) AS a, b + sleepEachRow(0.000001) AS count FROM default.table_a +read_rows: 100 +written_rows: 100 +sleep_calls: 100 +sleep_us: 99 +profile_select_rows: 100 +profile_select_bytes: 2000 +profile_insert_rows: 100 +profile_insert_bytes: 800 Row 2: ────── -stage: Depending views -view_name: default.matview_b_to_c -view_type: Materialized -status: QueryFinish -view_target: default.table_c -view_query: SELECT sum(a + sleepEachRow(0.000002)) AS a FROM default.table_b -read_rows: 100 -written_rows: 1 -sleep_calls: 100 -sleep_us: 199 +stage: Depending views +view_name: default.matview_b_to_c +view_type: Materialized +status: QueryFinish +view_target: default.table_c +view_query: SELECT sum(a + sleepEachRow(0.000002)) AS a FROM default.table_b +read_rows: 100 +written_rows: 1 +sleep_calls: 100 +sleep_us: 199 +profile_select_rows: 100 +profile_select_bytes: 800 +profile_insert_rows: 1 +profile_insert_bytes: 8 Row 3: ────── -stage: Depending views -view_name: default.table_b_live_view -view_type: Live -status: QueryFinish -view_target: default.table_b_live_view -view_query: SELECT sum(a + b) FROM default.table_b -read_rows: 100 -written_rows: 0 -sleep_calls: 0 -sleep_us: 0 +stage: Depending views +view_name: default.table_b_live_view +view_type: Live +status: QueryFinish +view_target: default.table_b_live_view +view_query: SELECT sum(a + b) FROM default.table_b +read_rows: 100 +written_rows: 0 +sleep_calls: 0 +sleep_us: 0 +profile_select_rows: 100 +profile_select_bytes: 1600 +profile_insert_rows: 0 +profile_insert_bytes: 0 Row 1: ────── -stage: Query log rows 2 -read_rows: 50 -written_rows: 100 -databases: ['_table_function','default'] -tables: ['_table_function.numbers','default.table_d','default.table_e','default.table_f'] -views: ['default.matview_join_d_e'] -sleep_calls: 50 -sleep_us: 150 +stage: Query log rows 2 +read_rows: 100 +written_rows: 100 +databases: ['_table_function','default'] +tables: ['_table_function.numbers','default.table_d','default.table_e','default.table_f'] +views: ['default.matview_join_d_e'] +sleep_calls: 50 +sleep_us: 150 +profile_select_rows: 100 +profile_select_bytes: 800 +profile_insert_rows: 100 +profile_insert_bytes: 1600 Row 1: ────── -stage: Depending views 2 -view_name: default.matview_join_d_e -view_type: Materialized -status: QueryFinish -view_target: default.table_f -view_query: SELECT 
table_d.a AS a, table_e.count + sleepEachRow(0.000003) AS count FROM default.table_d LEFT JOIN default.table_e ON table_d.a = table_e.a -read_rows: 50 -written_rows: 50 -sleep_calls: 50 -sleep_us: 150 +stage: Depending views 2 +view_name: default.matview_join_d_e +view_type: Materialized +status: QueryFinish +view_target: default.table_f +view_query: SELECT table_d.a AS a, table_e.count + sleepEachRow(0.000003) AS count FROM default.table_d LEFT JOIN default.table_e ON table_d.a = table_e.a +read_rows: 50 +written_rows: 50 +sleep_calls: 50 +sleep_us: 150 +profile_select_rows: 50 +profile_select_bytes: 400 +profile_insert_rows: 50 +profile_insert_bytes: 800 diff --git a/tests/queries/0_stateless/01927_query_views_log_current_database.sql b/tests/queries/0_stateless/01927_query_views_log_current_database.sql index 40ab8c8e16a..fbfbeab0167 100644 --- a/tests/queries/0_stateless/01927_query_views_log_current_database.sql +++ b/tests/queries/0_stateless/01927_query_views_log_current_database.sql @@ -45,7 +45,11 @@ SELECT arraySort(tables) as tables, arraySort(views) as views, ProfileEvents['SleepFunctionCalls'] as sleep_calls, - ProfileEvents['SleepFunctionMicroseconds'] as sleep_us + ProfileEvents['SleepFunctionMicroseconds'] as sleep_us, + ProfileEvents['SelectedRows'] as profile_select_rows, + ProfileEvents['SelectedBytes'] as profile_select_bytes, + ProfileEvents['InsertedRows'] as profile_insert_rows, + ProfileEvents['InsertedBytes'] as profile_insert_bytes FROM system.query_log WHERE query like '-- INSERT 1%INSERT INTO table_a%' AND current_database = currentDatabase() @@ -62,7 +66,11 @@ SELECT read_rows, written_rows, ProfileEvents['SleepFunctionCalls'] as sleep_calls, - ProfileEvents['SleepFunctionMicroseconds'] as sleep_us + ProfileEvents['SleepFunctionMicroseconds'] as sleep_us, + ProfileEvents['SelectedRows'] as profile_select_rows, + ProfileEvents['SelectedBytes'] as profile_select_bytes, + ProfileEvents['InsertedRows'] as profile_insert_rows, + ProfileEvents['InsertedBytes'] as profile_insert_bytes FROM system.query_views_log WHERE initial_query_id = ( @@ -85,7 +93,11 @@ SELECT arraySort(tables) as tables, arraySort(views) as views, ProfileEvents['SleepFunctionCalls'] as sleep_calls, - ProfileEvents['SleepFunctionMicroseconds'] as sleep_us + ProfileEvents['SleepFunctionMicroseconds'] as sleep_us, + ProfileEvents['SelectedRows'] as profile_select_rows, + ProfileEvents['SelectedBytes'] as profile_select_bytes, + ProfileEvents['InsertedRows'] as profile_insert_rows, + ProfileEvents['InsertedBytes'] as profile_insert_bytes FROM system.query_log WHERE query like '-- INSERT 2%INSERT INTO table_d%' AND current_database = currentDatabase() @@ -102,7 +114,11 @@ SELECT read_rows, written_rows, ProfileEvents['SleepFunctionCalls'] as sleep_calls, - ProfileEvents['SleepFunctionMicroseconds'] as sleep_us + ProfileEvents['SleepFunctionMicroseconds'] as sleep_us, + ProfileEvents['SelectedRows'] as profile_select_rows, + ProfileEvents['SelectedBytes'] as profile_select_bytes, + ProfileEvents['InsertedRows'] as profile_insert_rows, + ProfileEvents['InsertedBytes'] as profile_insert_bytes FROM system.query_views_log WHERE initial_query_id = ( diff --git a/tests/queries/0_stateless/01933_client_replxx_convert_history.expect b/tests/queries/0_stateless/01933_client_replxx_convert_history.expect index 59231161d91..c5645179ab3 100755 --- a/tests/queries/0_stateless/01933_client_replxx_convert_history.expect +++ b/tests/queries/0_stateless/01933_client_replxx_convert_history.expect @@ -5,11 +5,11 @@ 
log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/01945_show_debug_warning.expect b/tests/queries/0_stateless/01945_show_debug_warning.expect index 402ad9a1f35..2f74b6e33ae 100755 --- a/tests/queries/0_stateless/01945_show_debug_warning.expect +++ b/tests/queries/0_stateless/01945_show_debug_warning.expect @@ -7,11 +7,11 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/02003_memory_limit_in_client.expect b/tests/queries/0_stateless/02003_memory_limit_in_client.expect index 47ac4926537..a3d6d04110a 100755 --- a/tests/queries/0_stateless/02003_memory_limit_in_client.expect +++ b/tests/queries/0_stateless/02003_memory_limit_in_client.expect @@ -8,14 +8,18 @@ log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] + +# +# Check that the query will fail in clickhouse-client +# spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --max_memory_usage_in_client=1" expect ":) " @@ -28,7 +32,24 @@ expect ":) " send -- "\4" expect eof -set basedir [file dirname $argv0] +# +# Check that the query will fail in clickhouse-client +# +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --max_memory_usage_in_client=1" +expect ":) " + +send -- "SELECT arrayMap(x -> range(x), range(number)) FROM numbers(1000)\r" +expect "Code: 241" + +expect ":) " + +# Exit. 
+send -- "\4" +expect eof + +# +# Check that the query will not fail (due to max_untracked_memory) +# spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion --max_memory_usage_in_client=1" expect ":) " diff --git a/tests/queries/0_stateless/02047_client_exception.expect b/tests/queries/0_stateless/02047_client_exception.expect index 57a38c4f6aa..f7d4bfb555d 100755 --- a/tests/queries/0_stateless/02047_client_exception.expect +++ b/tests/queries/0_stateless/02047_client_exception.expect @@ -4,11 +4,11 @@ log_user 0 set timeout 20 match_max 100000 -# A default timeout action is to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/02049_clickhouse_local_merge_tree.expect b/tests/queries/0_stateless/02049_clickhouse_local_merge_tree.expect index 89271805fb3..ffa25b964db 100755 --- a/tests/queries/0_stateless/02049_clickhouse_local_merge_tree.expect +++ b/tests/queries/0_stateless/02049_clickhouse_local_merge_tree.expect @@ -4,12 +4,11 @@ log_user 0 set timeout 20 match_max 100000 -# A default timeout action is to fail expect_after { - timeout { - exit 1 - } - + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/02051_read_settings.reference.j2 b/tests/queries/0_stateless/02051_read_settings.reference.j2 index 86aa67a9d2d..391cf3adf35 100644 --- a/tests/queries/0_stateless/02051_read_settings.reference.j2 +++ b/tests/queries/0_stateless/02051_read_settings.reference.j2 @@ -1,3 +1,4 @@ +{% for index_granularity_bytes in [0, 10 * 1024 * 1024] -%} {% for read_method in ['read', 'mmap', 'pread_threadpool', 'pread_fake_async'] -%} {% for direct_io in [0, 1] -%} {% for prefetch in [0, 1] -%} @@ -9,3 +10,4 @@ {% endfor -%} {% endfor -%} {% endfor -%} +{% endfor -%} diff --git a/tests/queries/0_stateless/02051_read_settings.sql.j2 b/tests/queries/0_stateless/02051_read_settings.sql.j2 index 9f02274e732..fa19fbd3036 100644 --- a/tests/queries/0_stateless/02051_read_settings.sql.j2 +++ b/tests/queries/0_stateless/02051_read_settings.sql.j2 @@ -4,7 +4,15 @@ drop table if exists data_02051; -create table data_02051 (key Int, value String) engine=MergeTree() order by key +{# check each index_granularity_bytes #} +{% for index_granularity_bytes in [0, 10 * 1024 * 1024] %} +create table data_02051 (key Int, value String) +engine=MergeTree() +order by key +settings + index_granularity_bytes={{ index_granularity_bytes }}, + /* to suppress "Table can't create parts with adaptive granularity, but settings ..." 
warning */ + min_bytes_for_wide_part=0 as select number, repeat(toString(number), 5) from numbers(1e6); {# check each local_filesystem_read_method #} @@ -29,3 +37,7 @@ select count(ignore(*)) from data_02051 settings {% endfor %} {% endfor %} {% endfor %} + +drop table data_02051; +{# index_granularity_bytes #} +{% endfor %} diff --git a/tests/queries/0_stateless/02096_date_time_1970_saturation.reference b/tests/queries/0_stateless/02096_date_time_1970_saturation.reference new file mode 100644 index 00000000000..3c073b9262e --- /dev/null +++ b/tests/queries/0_stateless/02096_date_time_1970_saturation.reference @@ -0,0 +1,30 @@ +1970-01-01 +1970-01-01 03:00:00 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-02 03:00:00 +1970-01-01 03:00:00 +1970-01-01 03:00:00 +1970-01-01 03:00:00 +1970-01-01 03:00:00 +1970-01-01 03:00:00 +1969-12-31 16:00:00 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-01 +1970-01-02 16:00:00 +1969-12-31 16:00:00 +1969-12-31 16:00:00 +1969-12-31 16:00:00 +1969-12-31 16:00:00 +1969-12-31 16:00:00 diff --git a/tests/queries/0_stateless/02096_date_time_1970_saturation.sql b/tests/queries/0_stateless/02096_date_time_1970_saturation.sql new file mode 100644 index 00000000000..e0c401443a7 --- /dev/null +++ b/tests/queries/0_stateless/02096_date_time_1970_saturation.sql @@ -0,0 +1,31 @@ +select toDate(0); +select toDateTime(0, 'Europe/Moscow'); +select toMonday(toDate(0)); +select toMonday(toDateTime(0, 'Europe/Moscow')); +select toStartOfWeek(toDate(0)); +select toStartOfWeek(toDateTime(0, 'Europe/Moscow')); +select toStartOfMonth(toDate(0)); +select toStartOfMonth(toDateTime(0, 'Europe/Moscow')); +select toStartOfQuarter(toDate(0)); +select toStartOfQuarter(toDateTime(0, 'Europe/Moscow')); +select toStartOfYear(toDate(0)); +select toStartOfYear(toDateTime(0, 'Europe/Moscow')); +select toTime(toDateTime(0, 'Europe/Moscow')); +select toStartOfMinute(toDateTime(0, 'Europe/Moscow')); +select toStartOfFiveMinute(toDateTime(0, 'Europe/Moscow')); +select toStartOfTenMinutes(toDateTime(0, 'Europe/Moscow')); +select toStartOfFifteenMinutes(toDateTime(0, 'Europe/Moscow')); +select toStartOfHour(toDateTime(0, 'Europe/Moscow')); + +select toDateTime(0, 'America/Los_Angeles'); +select toMonday(toDateTime(0, 'America/Los_Angeles')); +select toStartOfWeek(toDateTime(0, 'America/Los_Angeles')); +select toStartOfMonth(toDateTime(0, 'America/Los_Angeles')); +select toStartOfQuarter(toDateTime(0, 'America/Los_Angeles')); +select toStartOfYear(toDateTime(0, 'America/Los_Angeles')); +select toTime(toDateTime(0, 'America/Los_Angeles'), 'America/Los_Angeles'); +select toStartOfMinute(toDateTime(0, 'America/Los_Angeles')); +select toStartOfFiveMinute(toDateTime(0, 'America/Los_Angeles')); +select toStartOfTenMinutes(toDateTime(0, 'America/Los_Angeles')); +select toStartOfFifteenMinutes(toDateTime(0, 'America/Los_Angeles')); +select toStartOfHour(toDateTime(0, 'America/Los_Angeles')); diff --git a/tests/queries/0_stateless/02105_backslash_letter_commands.expect b/tests/queries/0_stateless/02105_backslash_letter_commands.expect index 89d896fdedc..e67d60912fa 100755 --- a/tests/queries/0_stateless/02105_backslash_letter_commands.expect +++ b/tests/queries/0_stateless/02105_backslash_letter_commands.expect @@ -3,11 +3,11 @@ log_user 0 set timeout 02 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { 
exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/02112_delayed_clickhouse_client_with_queries_file.expect b/tests/queries/0_stateless/02112_delayed_clickhouse_client_with_queries_file.expect index 73b12637906..0abe25e60f4 100755 --- a/tests/queries/0_stateless/02112_delayed_clickhouse_client_with_queries_file.expect +++ b/tests/queries/0_stateless/02112_delayed_clickhouse_client_with_queries_file.expect @@ -5,23 +5,24 @@ log_user 0 set timeout 20 match_max 100000 -# A default timeout action is to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } - -spawn bash -c "\$CLICKHOUSE_TESTS_DIR/helpers/02112_prepare.sh" - set basedir [file dirname $argv0] -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT --disable_suggestion --interactive --queries-file \$CURDIR/file_02112" + +system "$basedir/helpers/02112_prepare.sh" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT --disable_suggestion --interactive --queries-file $basedir/file_02112" expect ":) " send -- "select * from t format TSV\r" expect "1" expect ":) " -spawn bash -c "\$CLICKHOUSE_TESTS_DIR/helpers/02112_clean.sh" +send "" +expect eof +system "$basedir/helpers/02112_clean.sh" diff --git a/tests/queries/0_stateless/02112_delayed_clickhouse_local.expect b/tests/queries/0_stateless/02112_delayed_clickhouse_local.expect index fa146577234..c846464b011 100755 --- a/tests/queries/0_stateless/02112_delayed_clickhouse_local.expect +++ b/tests/queries/0_stateless/02112_delayed_clickhouse_local.expect @@ -4,11 +4,11 @@ log_user 0 set timeout 20 match_max 100000 -# A default timeout action is to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/02112_delayed_clickhouse_local_with_queries_file.expect b/tests/queries/0_stateless/02112_delayed_clickhouse_local_with_queries_file.expect index fbf79629f71..c64f149a93c 100755 --- a/tests/queries/0_stateless/02112_delayed_clickhouse_local_with_queries_file.expect +++ b/tests/queries/0_stateless/02112_delayed_clickhouse_local_with_queries_file.expect @@ -5,23 +5,24 @@ log_user 0 set timeout 20 match_max 100000 -# A default timeout action is to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } - -spawn bash -c "\$CLICKHOUSE_TESTS_DIR/helpers/02112_prepare.sh" - set basedir [file dirname $argv0] -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_LOCAL --disable_suggestion --interactive --queries-file \$CURDIR/file_02112" + +system "$basedir/helpers/02112_prepare.sh" +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_LOCAL --disable_suggestion --interactive --queries-file $basedir/file_02112" expect ":) " send -- "select * from t format TSV\r" expect "1" expect ":) " -spawn bash -c "\$CLICKHOUSE_TESTS_DIR/helpers/02112_clean.sh" +send "" +expect eof +system "$basedir/helpers/02112_clean.sh" diff --git a/tests/queries/0_stateless/02116_interactive_hello.expect b/tests/queries/0_stateless/02116_interactive_hello.expect index 
49a167e5a6e..e659cf8703c 100755 --- a/tests/queries/0_stateless/02116_interactive_hello.expect +++ b/tests/queries/0_stateless/02116_interactive_hello.expect @@ -1,14 +1,15 @@ #!/usr/bin/expect -f +# Tags: long log_user 0 set timeout 60 match_max 100000 -# A default timeout action is to do nothing, change it to fail expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } set basedir [file dirname $argv0] diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference b/tests/queries/0_stateless/02117_show_create_table_system.reference index 35de7f8e82c..234804f1078 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.reference +++ b/tests/queries/0_stateless/02117_show_create_table_system.reference @@ -20,8 +20,8 @@ CREATE TABLE system.errors\n(\n `name` String,\n `code` Int32,\n `value CREATE TABLE system.events\n(\n `event` String,\n `value` UInt64,\n `description` String\n)\nENGINE = SystemEvents()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.formats\n(\n `name` String,\n `is_input` UInt8,\n `is_output` UInt8\n)\nENGINE = SystemFormats()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.functions\n(\n `name` String,\n `is_aggregate` UInt8,\n `case_insensitive` UInt8,\n `alias_to` String,\n `create_query` String,\n `origin` Enum8(\'System\' = 0, \'SQLUserDefined\' = 1, \'ExecutableUserDefined\' = 2)\n)\nENGINE = SystemFunctions()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' -CREATE TABLE system.grants\n(\n `user_name` Nullable(String),\n `role_name` Nullable(String),\n `access_type` Enum8(\'SQLITE\' = -128, \'ODBC\' = -127, \'JDBC\' = -126, \'HDFS\' = -125, \'S3\' = -124, \'SOURCES\' = -123, \'ALL\' = -122, \'NONE\' = -121, \'SHOW DATABASES\' = 0, \'SHOW TABLES\' = 1, \'SHOW COLUMNS\' = 2, \'SHOW DICTIONARIES\' = 3, \'SHOW\' = 4, \'SELECT\' = 5, \'INSERT\' = 6, \'ALTER UPDATE\' = 7, \'ALTER DELETE\' = 8, \'ALTER ADD COLUMN\' = 9, \'ALTER MODIFY COLUMN\' = 10, \'ALTER DROP COLUMN\' = 11, \'ALTER COMMENT COLUMN\' = 12, \'ALTER CLEAR COLUMN\' = 13, \'ALTER RENAME COLUMN\' = 14, \'ALTER MATERIALIZE COLUMN\' = 15, \'ALTER COLUMN\' = 16, \'ALTER MODIFY COMMENT\' = 17, \'ALTER ORDER BY\' = 18, \'ALTER SAMPLE BY\' = 19, \'ALTER ADD INDEX\' = 20, \'ALTER DROP INDEX\' = 21, \'ALTER MATERIALIZE INDEX\' = 22, \'ALTER CLEAR INDEX\' = 23, \'ALTER INDEX\' = 24, \'ALTER ADD PROJECTION\' = 25, \'ALTER DROP PROJECTION\' = 26, \'ALTER MATERIALIZE PROJECTION\' = 27, \'ALTER CLEAR PROJECTION\' = 28, \'ALTER PROJECTION\' = 29, \'ALTER ADD CONSTRAINT\' = 30, \'ALTER DROP CONSTRAINT\' = 31, \'ALTER CONSTRAINT\' = 32, \'ALTER TTL\' = 33, \'ALTER MATERIALIZE TTL\' = 34, \'ALTER SETTINGS\' = 35, \'ALTER MOVE PARTITION\' = 36, \'ALTER FETCH PARTITION\' = 37, \'ALTER FREEZE PARTITION\' = 38, \'ALTER DATABASE SETTINGS\' = 39, \'ALTER TABLE\' = 40, \'ALTER DATABASE\' = 41, \'ALTER VIEW REFRESH\' = 42, \'ALTER VIEW MODIFY QUERY\' = 43, \'ALTER VIEW\' = 44, \'ALTER\' = 45, \'CREATE DATABASE\' = 46, \'CREATE TABLE\' = 47, \'CREATE VIEW\' = 48, \'CREATE DICTIONARY\' = 49, \'CREATE TEMPORARY TABLE\' = 50, \'CREATE FUNCTION\' = 51, \'CREATE\' = 52, \'DROP DATABASE\' = 53, \'DROP TABLE\' = 54, \'DROP VIEW\' = 55, \'DROP DICTIONARY\' = 56, \'DROP FUNCTION\' = 57, \'DROP\' = 58, \'TRUNCATE\' = 59, \'OPTIMIZE\' = 60, \'KILL QUERY\' = 61, \'MOVE PARTITION BETWEEN SHARDS\' = 62, \'CREATE USER\' = 63, \'ALTER USER\' = 64, \'DROP 
USER\' = 65, \'CREATE ROLE\' = 66, \'ALTER ROLE\' = 67, \'DROP ROLE\' = 68, \'ROLE ADMIN\' = 69, \'CREATE ROW POLICY\' = 70, \'ALTER ROW POLICY\' = 71, \'DROP ROW POLICY\' = 72, \'CREATE QUOTA\' = 73, \'ALTER QUOTA\' = 74, \'DROP QUOTA\' = 75, \'CREATE SETTINGS PROFILE\' = 76, \'ALTER SETTINGS PROFILE\' = 77, \'DROP SETTINGS PROFILE\' = 78, \'SHOW USERS\' = 79, \'SHOW ROLES\' = 80, \'SHOW ROW POLICIES\' = 81, \'SHOW QUOTAS\' = 82, \'SHOW SETTINGS PROFILES\' = 83, \'SHOW ACCESS\' = 84, \'ACCESS MANAGEMENT\' = 85, \'SYSTEM SHUTDOWN\' = 86, \'SYSTEM DROP DNS CACHE\' = 87, \'SYSTEM DROP MARK CACHE\' = 88, \'SYSTEM DROP UNCOMPRESSED CACHE\' = 89, \'SYSTEM DROP MMAP CACHE\' = 90, \'SYSTEM DROP COMPILED EXPRESSION CACHE\' = 91, \'SYSTEM DROP CACHE\' = 92, \'SYSTEM RELOAD CONFIG\' = 93, \'SYSTEM RELOAD SYMBOLS\' = 94, \'SYSTEM RELOAD DICTIONARY\' = 95, \'SYSTEM RELOAD MODEL\' = 96, \'SYSTEM RELOAD FUNCTION\' = 97, \'SYSTEM RELOAD EMBEDDED DICTIONARIES\' = 98, \'SYSTEM RELOAD\' = 99, \'SYSTEM RESTART DISK\' = 100, \'SYSTEM MERGES\' = 101, \'SYSTEM TTL MERGES\' = 102, \'SYSTEM FETCHES\' = 103, \'SYSTEM MOVES\' = 104, \'SYSTEM DISTRIBUTED SENDS\' = 105, \'SYSTEM REPLICATED SENDS\' = 106, \'SYSTEM SENDS\' = 107, \'SYSTEM REPLICATION QUEUES\' = 108, \'SYSTEM DROP REPLICA\' = 109, \'SYSTEM SYNC REPLICA\' = 110, \'SYSTEM RESTART REPLICA\' = 111, \'SYSTEM RESTORE REPLICA\' = 112, \'SYSTEM FLUSH DISTRIBUTED\' = 113, \'SYSTEM FLUSH LOGS\' = 114, \'SYSTEM FLUSH\' = 115, \'SYSTEM\' = 116, \'dictGet\' = 117, \'addressToLine\' = 118, \'addressToSymbol\' = 119, \'demangle\' = 120, \'INTROSPECTION\' = 121, \'FILE\' = 122, \'URL\' = 123, \'REMOTE\' = 124, \'MONGO\' = 125, \'MYSQL\' = 126, \'POSTGRES\' = 127),\n `database` Nullable(String),\n `table` Nullable(String),\n `column` Nullable(String),\n `is_partial_revoke` UInt8,\n `grant_option` UInt8\n)\nENGINE = SystemGrants()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' -CREATE TABLE system.graphite_retentions\n(\n `config_name` String,\n `regexp` String,\n `function` String,\n `age` UInt64,\n `precision` UInt64,\n `priority` UInt16,\n `is_default` UInt8,\n `Tables.database` Array(String),\n `Tables.table` Array(String)\n)\nENGINE = SystemGraphite()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' +CREATE TABLE system.grants\n(\n `user_name` Nullable(String),\n `role_name` Nullable(String),\n `access_type` Enum8(\'POSTGRES\' = -128, \'SQLITE\' = -127, \'ODBC\' = -126, \'JDBC\' = -125, \'HDFS\' = -124, \'S3\' = -123, \'SOURCES\' = -122, \'ALL\' = -121, \'NONE\' = -120, \'SHOW DATABASES\' = 0, \'SHOW TABLES\' = 1, \'SHOW COLUMNS\' = 2, \'SHOW DICTIONARIES\' = 3, \'SHOW\' = 4, \'SELECT\' = 5, \'INSERT\' = 6, \'ALTER UPDATE\' = 7, \'ALTER DELETE\' = 8, \'ALTER ADD COLUMN\' = 9, \'ALTER MODIFY COLUMN\' = 10, \'ALTER DROP COLUMN\' = 11, \'ALTER COMMENT COLUMN\' = 12, \'ALTER CLEAR COLUMN\' = 13, \'ALTER RENAME COLUMN\' = 14, \'ALTER MATERIALIZE COLUMN\' = 15, \'ALTER COLUMN\' = 16, \'ALTER MODIFY COMMENT\' = 17, \'ALTER ORDER BY\' = 18, \'ALTER SAMPLE BY\' = 19, \'ALTER ADD INDEX\' = 20, \'ALTER DROP INDEX\' = 21, \'ALTER MATERIALIZE INDEX\' = 22, \'ALTER CLEAR INDEX\' = 23, \'ALTER INDEX\' = 24, \'ALTER ADD PROJECTION\' = 25, \'ALTER DROP PROJECTION\' = 26, \'ALTER MATERIALIZE PROJECTION\' = 27, \'ALTER CLEAR PROJECTION\' = 28, \'ALTER PROJECTION\' = 29, \'ALTER ADD CONSTRAINT\' = 30, \'ALTER DROP CONSTRAINT\' = 31, \'ALTER CONSTRAINT\' = 32, \'ALTER TTL\' = 33, \'ALTER MATERIALIZE TTL\' = 34, \'ALTER SETTINGS\' = 35, \'ALTER MOVE PARTITION\' = 36, \'ALTER FETCH 
PARTITION\' = 37, \'ALTER FREEZE PARTITION\' = 38, \'ALTER DATABASE SETTINGS\' = 39, \'ALTER TABLE\' = 40, \'ALTER DATABASE\' = 41, \'ALTER VIEW REFRESH\' = 42, \'ALTER VIEW MODIFY QUERY\' = 43, \'ALTER VIEW\' = 44, \'ALTER\' = 45, \'CREATE DATABASE\' = 46, \'CREATE TABLE\' = 47, \'CREATE VIEW\' = 48, \'CREATE DICTIONARY\' = 49, \'CREATE TEMPORARY TABLE\' = 50, \'CREATE FUNCTION\' = 51, \'CREATE\' = 52, \'DROP DATABASE\' = 53, \'DROP TABLE\' = 54, \'DROP VIEW\' = 55, \'DROP DICTIONARY\' = 56, \'DROP FUNCTION\' = 57, \'DROP\' = 58, \'TRUNCATE\' = 59, \'OPTIMIZE\' = 60, \'KILL QUERY\' = 61, \'MOVE PARTITION BETWEEN SHARDS\' = 62, \'CREATE USER\' = 63, \'ALTER USER\' = 64, \'DROP USER\' = 65, \'CREATE ROLE\' = 66, \'ALTER ROLE\' = 67, \'DROP ROLE\' = 68, \'ROLE ADMIN\' = 69, \'CREATE ROW POLICY\' = 70, \'ALTER ROW POLICY\' = 71, \'DROP ROW POLICY\' = 72, \'CREATE QUOTA\' = 73, \'ALTER QUOTA\' = 74, \'DROP QUOTA\' = 75, \'CREATE SETTINGS PROFILE\' = 76, \'ALTER SETTINGS PROFILE\' = 77, \'DROP SETTINGS PROFILE\' = 78, \'SHOW USERS\' = 79, \'SHOW ROLES\' = 80, \'SHOW ROW POLICIES\' = 81, \'SHOW QUOTAS\' = 82, \'SHOW SETTINGS PROFILES\' = 83, \'SHOW ACCESS\' = 84, \'ACCESS MANAGEMENT\' = 85, \'SYSTEM SHUTDOWN\' = 86, \'SYSTEM DROP DNS CACHE\' = 87, \'SYSTEM DROP MARK CACHE\' = 88, \'SYSTEM DROP UNCOMPRESSED CACHE\' = 89, \'SYSTEM DROP MMAP CACHE\' = 90, \'SYSTEM DROP COMPILED EXPRESSION CACHE\' = 91, \'SYSTEM DROP CACHE\' = 92, \'SYSTEM RELOAD CONFIG\' = 93, \'SYSTEM RELOAD SYMBOLS\' = 94, \'SYSTEM RELOAD DICTIONARY\' = 95, \'SYSTEM RELOAD MODEL\' = 96, \'SYSTEM RELOAD FUNCTION\' = 97, \'SYSTEM RELOAD EMBEDDED DICTIONARIES\' = 98, \'SYSTEM RELOAD\' = 99, \'SYSTEM RESTART DISK\' = 100, \'SYSTEM MERGES\' = 101, \'SYSTEM TTL MERGES\' = 102, \'SYSTEM FETCHES\' = 103, \'SYSTEM MOVES\' = 104, \'SYSTEM DISTRIBUTED SENDS\' = 105, \'SYSTEM REPLICATED SENDS\' = 106, \'SYSTEM SENDS\' = 107, \'SYSTEM REPLICATION QUEUES\' = 108, \'SYSTEM DROP REPLICA\' = 109, \'SYSTEM SYNC REPLICA\' = 110, \'SYSTEM RESTART REPLICA\' = 111, \'SYSTEM RESTORE REPLICA\' = 112, \'SYSTEM FLUSH DISTRIBUTED\' = 113, \'SYSTEM FLUSH LOGS\' = 114, \'SYSTEM FLUSH\' = 115, \'SYSTEM THREAD FUZZER\' = 116, \'SYSTEM\' = 117, \'dictGet\' = 118, \'addressToLine\' = 119, \'addressToSymbol\' = 120, \'demangle\' = 121, \'INTROSPECTION\' = 122, \'FILE\' = 123, \'URL\' = 124, \'REMOTE\' = 125, \'MONGO\' = 126, \'MYSQL\' = 127),\n `database` Nullable(String),\n `table` Nullable(String),\n `column` Nullable(String),\n `is_partial_revoke` UInt8,\n `grant_option` UInt8\n)\nENGINE = SystemGrants()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' +CREATE TABLE system.graphite_retentions\n(\n `config_name` String,\n `rule_type` String,\n `regexp` String,\n `function` String,\n `age` UInt64,\n `precision` UInt64,\n `priority` UInt16,\n `is_default` UInt8,\n `Tables.database` Array(String),\n `Tables.table` Array(String)\n)\nENGINE = SystemGraphite()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.licenses\n(\n `library_name` String,\n `license_type` String,\n `license_path` String,\n `license_text` String\n)\nENGINE = SystemLicenses()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.macros\n(\n `macro` String,\n `substitution` String\n)\nENGINE = SystemMacros()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.merge_tree_settings\n(\n `name` String,\n `value` String,\n `changed` UInt8,\n `description` String,\n `type` String\n)\nENGINE = SystemMergeTreeSettings()\nCOMMENT \'SYSTEM TABLE is built on the 
fly.\' @@ -35,7 +35,7 @@ CREATE TABLE system.one\n(\n `dummy` UInt8\n)\nENGINE = SystemOne()\nCOMMENT CREATE TABLE system.part_moves_between_shards\n(\n `database` String,\n `table` String,\n `task_name` String,\n `task_uuid` UUID,\n `create_time` DateTime,\n `part_name` String,\n `part_uuid` UUID,\n `to_shard` String,\n `dst_part_name` String,\n `update_time` DateTime,\n `state` String,\n `rollback` UInt8,\n `num_tries` UInt32,\n `last_exception` String\n)\nENGINE = SystemShardMoves()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.parts\n(\n `partition` String,\n `name` String,\n `uuid` UUID,\n `part_type` String,\n `active` UInt8,\n `marks` UInt64,\n `rows` UInt64,\n `bytes_on_disk` UInt64,\n `data_compressed_bytes` UInt64,\n `data_uncompressed_bytes` UInt64,\n `marks_bytes` UInt64,\n `secondary_indices_compressed_bytes` UInt64,\n `secondary_indices_uncompressed_bytes` UInt64,\n `secondary_indices_marks_bytes` UInt64,\n `modification_time` DateTime,\n `remove_time` DateTime,\n `refcount` UInt32,\n `min_date` Date,\n `max_date` Date,\n `min_time` DateTime,\n `max_time` DateTime,\n `partition_id` String,\n `min_block_number` Int64,\n `max_block_number` Int64,\n `level` UInt32,\n `data_version` UInt64,\n `primary_key_bytes_in_memory` UInt64,\n `primary_key_bytes_in_memory_allocated` UInt64,\n `is_frozen` UInt8,\n `database` String,\n `table` String,\n `engine` String,\n `disk_name` String,\n `path` String,\n `hash_of_all_files` String,\n `hash_of_uncompressed_files` String,\n `uncompressed_hash_of_compressed_files` String,\n `delete_ttl_info_min` DateTime,\n `delete_ttl_info_max` DateTime,\n `move_ttl_info.expression` Array(String),\n `move_ttl_info.min` Array(DateTime),\n `move_ttl_info.max` Array(DateTime),\n `default_compression_codec` String,\n `recompression_ttl_info.expression` Array(String),\n `recompression_ttl_info.min` Array(DateTime),\n `recompression_ttl_info.max` Array(DateTime),\n `group_by_ttl_info.expression` Array(String),\n `group_by_ttl_info.min` Array(DateTime),\n `group_by_ttl_info.max` Array(DateTime),\n `rows_where_ttl_info.expression` Array(String),\n `rows_where_ttl_info.min` Array(DateTime),\n `rows_where_ttl_info.max` Array(DateTime),\n `projections` Array(String),\n `bytes` UInt64,\n `marks_size` UInt64\n)\nENGINE = SystemParts()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.parts_columns\n(\n `partition` String,\n `name` String,\n `uuid` UUID,\n `part_type` String,\n `active` UInt8,\n `marks` UInt64,\n `rows` UInt64,\n `bytes_on_disk` UInt64,\n `data_compressed_bytes` UInt64,\n `data_uncompressed_bytes` UInt64,\n `marks_bytes` UInt64,\n `modification_time` DateTime,\n `remove_time` DateTime,\n `refcount` UInt32,\n `min_date` Date,\n `max_date` Date,\n `min_time` DateTime,\n `max_time` DateTime,\n `partition_id` String,\n `min_block_number` Int64,\n `max_block_number` Int64,\n `level` UInt32,\n `data_version` UInt64,\n `primary_key_bytes_in_memory` UInt64,\n `primary_key_bytes_in_memory_allocated` UInt64,\n `database` String,\n `table` String,\n `engine` String,\n `disk_name` String,\n `path` String,\n `column` String,\n `type` String,\n `column_position` UInt64,\n `default_kind` String,\n `default_expression` String,\n `column_bytes_on_disk` UInt64,\n `column_data_compressed_bytes` UInt64,\n `column_data_uncompressed_bytes` UInt64,\n `column_marks_bytes` UInt64,\n `serialization_kind` String,\n `subcolumns.names` Array(String),\n `subcolumns.types` Array(String),\n `subcolumns.serializations` Array(String),\n `bytes` 
UInt64,\n `marks_size` UInt64\n)\nENGINE = SystemPartsColumns()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' -CREATE TABLE system.privileges\n(\n `privilege` Enum8(\'SQLITE\' = -128, \'ODBC\' = -127, \'JDBC\' = -126, \'HDFS\' = -125, \'S3\' = -124, \'SOURCES\' = -123, \'ALL\' = -122, \'NONE\' = -121, \'SHOW DATABASES\' = 0, \'SHOW TABLES\' = 1, \'SHOW COLUMNS\' = 2, \'SHOW DICTIONARIES\' = 3, \'SHOW\' = 4, \'SELECT\' = 5, \'INSERT\' = 6, \'ALTER UPDATE\' = 7, \'ALTER DELETE\' = 8, \'ALTER ADD COLUMN\' = 9, \'ALTER MODIFY COLUMN\' = 10, \'ALTER DROP COLUMN\' = 11, \'ALTER COMMENT COLUMN\' = 12, \'ALTER CLEAR COLUMN\' = 13, \'ALTER RENAME COLUMN\' = 14, \'ALTER MATERIALIZE COLUMN\' = 15, \'ALTER COLUMN\' = 16, \'ALTER MODIFY COMMENT\' = 17, \'ALTER ORDER BY\' = 18, \'ALTER SAMPLE BY\' = 19, \'ALTER ADD INDEX\' = 20, \'ALTER DROP INDEX\' = 21, \'ALTER MATERIALIZE INDEX\' = 22, \'ALTER CLEAR INDEX\' = 23, \'ALTER INDEX\' = 24, \'ALTER ADD PROJECTION\' = 25, \'ALTER DROP PROJECTION\' = 26, \'ALTER MATERIALIZE PROJECTION\' = 27, \'ALTER CLEAR PROJECTION\' = 28, \'ALTER PROJECTION\' = 29, \'ALTER ADD CONSTRAINT\' = 30, \'ALTER DROP CONSTRAINT\' = 31, \'ALTER CONSTRAINT\' = 32, \'ALTER TTL\' = 33, \'ALTER MATERIALIZE TTL\' = 34, \'ALTER SETTINGS\' = 35, \'ALTER MOVE PARTITION\' = 36, \'ALTER FETCH PARTITION\' = 37, \'ALTER FREEZE PARTITION\' = 38, \'ALTER DATABASE SETTINGS\' = 39, \'ALTER TABLE\' = 40, \'ALTER DATABASE\' = 41, \'ALTER VIEW REFRESH\' = 42, \'ALTER VIEW MODIFY QUERY\' = 43, \'ALTER VIEW\' = 44, \'ALTER\' = 45, \'CREATE DATABASE\' = 46, \'CREATE TABLE\' = 47, \'CREATE VIEW\' = 48, \'CREATE DICTIONARY\' = 49, \'CREATE TEMPORARY TABLE\' = 50, \'CREATE FUNCTION\' = 51, \'CREATE\' = 52, \'DROP DATABASE\' = 53, \'DROP TABLE\' = 54, \'DROP VIEW\' = 55, \'DROP DICTIONARY\' = 56, \'DROP FUNCTION\' = 57, \'DROP\' = 58, \'TRUNCATE\' = 59, \'OPTIMIZE\' = 60, \'KILL QUERY\' = 61, \'MOVE PARTITION BETWEEN SHARDS\' = 62, \'CREATE USER\' = 63, \'ALTER USER\' = 64, \'DROP USER\' = 65, \'CREATE ROLE\' = 66, \'ALTER ROLE\' = 67, \'DROP ROLE\' = 68, \'ROLE ADMIN\' = 69, \'CREATE ROW POLICY\' = 70, \'ALTER ROW POLICY\' = 71, \'DROP ROW POLICY\' = 72, \'CREATE QUOTA\' = 73, \'ALTER QUOTA\' = 74, \'DROP QUOTA\' = 75, \'CREATE SETTINGS PROFILE\' = 76, \'ALTER SETTINGS PROFILE\' = 77, \'DROP SETTINGS PROFILE\' = 78, \'SHOW USERS\' = 79, \'SHOW ROLES\' = 80, \'SHOW ROW POLICIES\' = 81, \'SHOW QUOTAS\' = 82, \'SHOW SETTINGS PROFILES\' = 83, \'SHOW ACCESS\' = 84, \'ACCESS MANAGEMENT\' = 85, \'SYSTEM SHUTDOWN\' = 86, \'SYSTEM DROP DNS CACHE\' = 87, \'SYSTEM DROP MARK CACHE\' = 88, \'SYSTEM DROP UNCOMPRESSED CACHE\' = 89, \'SYSTEM DROP MMAP CACHE\' = 90, \'SYSTEM DROP COMPILED EXPRESSION CACHE\' = 91, \'SYSTEM DROP CACHE\' = 92, \'SYSTEM RELOAD CONFIG\' = 93, \'SYSTEM RELOAD SYMBOLS\' = 94, \'SYSTEM RELOAD DICTIONARY\' = 95, \'SYSTEM RELOAD MODEL\' = 96, \'SYSTEM RELOAD FUNCTION\' = 97, \'SYSTEM RELOAD EMBEDDED DICTIONARIES\' = 98, \'SYSTEM RELOAD\' = 99, \'SYSTEM RESTART DISK\' = 100, \'SYSTEM MERGES\' = 101, \'SYSTEM TTL MERGES\' = 102, \'SYSTEM FETCHES\' = 103, \'SYSTEM MOVES\' = 104, \'SYSTEM DISTRIBUTED SENDS\' = 105, \'SYSTEM REPLICATED SENDS\' = 106, \'SYSTEM SENDS\' = 107, \'SYSTEM REPLICATION QUEUES\' = 108, \'SYSTEM DROP REPLICA\' = 109, \'SYSTEM SYNC REPLICA\' = 110, \'SYSTEM RESTART REPLICA\' = 111, \'SYSTEM RESTORE REPLICA\' = 112, \'SYSTEM FLUSH DISTRIBUTED\' = 113, \'SYSTEM FLUSH LOGS\' = 114, \'SYSTEM FLUSH\' = 115, \'SYSTEM\' = 116, \'dictGet\' = 117, \'addressToLine\' = 118, 
\'addressToSymbol\' = 119, \'demangle\' = 120, \'INTROSPECTION\' = 121, \'FILE\' = 122, \'URL\' = 123, \'REMOTE\' = 124, \'MONGO\' = 125, \'MYSQL\' = 126, \'POSTGRES\' = 127),\n `aliases` Array(String),\n `level` Nullable(Enum8(\'GLOBAL\' = 0, \'DATABASE\' = 1, \'TABLE\' = 2, \'DICTIONARY\' = 3, \'VIEW\' = 4, \'COLUMN\' = 5)),\n `parent_group` Nullable(Enum8(\'SQLITE\' = -128, \'ODBC\' = -127, \'JDBC\' = -126, \'HDFS\' = -125, \'S3\' = -124, \'SOURCES\' = -123, \'ALL\' = -122, \'NONE\' = -121, \'SHOW DATABASES\' = 0, \'SHOW TABLES\' = 1, \'SHOW COLUMNS\' = 2, \'SHOW DICTIONARIES\' = 3, \'SHOW\' = 4, \'SELECT\' = 5, \'INSERT\' = 6, \'ALTER UPDATE\' = 7, \'ALTER DELETE\' = 8, \'ALTER ADD COLUMN\' = 9, \'ALTER MODIFY COLUMN\' = 10, \'ALTER DROP COLUMN\' = 11, \'ALTER COMMENT COLUMN\' = 12, \'ALTER CLEAR COLUMN\' = 13, \'ALTER RENAME COLUMN\' = 14, \'ALTER MATERIALIZE COLUMN\' = 15, \'ALTER COLUMN\' = 16, \'ALTER MODIFY COMMENT\' = 17, \'ALTER ORDER BY\' = 18, \'ALTER SAMPLE BY\' = 19, \'ALTER ADD INDEX\' = 20, \'ALTER DROP INDEX\' = 21, \'ALTER MATERIALIZE INDEX\' = 22, \'ALTER CLEAR INDEX\' = 23, \'ALTER INDEX\' = 24, \'ALTER ADD PROJECTION\' = 25, \'ALTER DROP PROJECTION\' = 26, \'ALTER MATERIALIZE PROJECTION\' = 27, \'ALTER CLEAR PROJECTION\' = 28, \'ALTER PROJECTION\' = 29, \'ALTER ADD CONSTRAINT\' = 30, \'ALTER DROP CONSTRAINT\' = 31, \'ALTER CONSTRAINT\' = 32, \'ALTER TTL\' = 33, \'ALTER MATERIALIZE TTL\' = 34, \'ALTER SETTINGS\' = 35, \'ALTER MOVE PARTITION\' = 36, \'ALTER FETCH PARTITION\' = 37, \'ALTER FREEZE PARTITION\' = 38, \'ALTER DATABASE SETTINGS\' = 39, \'ALTER TABLE\' = 40, \'ALTER DATABASE\' = 41, \'ALTER VIEW REFRESH\' = 42, \'ALTER VIEW MODIFY QUERY\' = 43, \'ALTER VIEW\' = 44, \'ALTER\' = 45, \'CREATE DATABASE\' = 46, \'CREATE TABLE\' = 47, \'CREATE VIEW\' = 48, \'CREATE DICTIONARY\' = 49, \'CREATE TEMPORARY TABLE\' = 50, \'CREATE FUNCTION\' = 51, \'CREATE\' = 52, \'DROP DATABASE\' = 53, \'DROP TABLE\' = 54, \'DROP VIEW\' = 55, \'DROP DICTIONARY\' = 56, \'DROP FUNCTION\' = 57, \'DROP\' = 58, \'TRUNCATE\' = 59, \'OPTIMIZE\' = 60, \'KILL QUERY\' = 61, \'MOVE PARTITION BETWEEN SHARDS\' = 62, \'CREATE USER\' = 63, \'ALTER USER\' = 64, \'DROP USER\' = 65, \'CREATE ROLE\' = 66, \'ALTER ROLE\' = 67, \'DROP ROLE\' = 68, \'ROLE ADMIN\' = 69, \'CREATE ROW POLICY\' = 70, \'ALTER ROW POLICY\' = 71, \'DROP ROW POLICY\' = 72, \'CREATE QUOTA\' = 73, \'ALTER QUOTA\' = 74, \'DROP QUOTA\' = 75, \'CREATE SETTINGS PROFILE\' = 76, \'ALTER SETTINGS PROFILE\' = 77, \'DROP SETTINGS PROFILE\' = 78, \'SHOW USERS\' = 79, \'SHOW ROLES\' = 80, \'SHOW ROW POLICIES\' = 81, \'SHOW QUOTAS\' = 82, \'SHOW SETTINGS PROFILES\' = 83, \'SHOW ACCESS\' = 84, \'ACCESS MANAGEMENT\' = 85, \'SYSTEM SHUTDOWN\' = 86, \'SYSTEM DROP DNS CACHE\' = 87, \'SYSTEM DROP MARK CACHE\' = 88, \'SYSTEM DROP UNCOMPRESSED CACHE\' = 89, \'SYSTEM DROP MMAP CACHE\' = 90, \'SYSTEM DROP COMPILED EXPRESSION CACHE\' = 91, \'SYSTEM DROP CACHE\' = 92, \'SYSTEM RELOAD CONFIG\' = 93, \'SYSTEM RELOAD SYMBOLS\' = 94, \'SYSTEM RELOAD DICTIONARY\' = 95, \'SYSTEM RELOAD MODEL\' = 96, \'SYSTEM RELOAD FUNCTION\' = 97, \'SYSTEM RELOAD EMBEDDED DICTIONARIES\' = 98, \'SYSTEM RELOAD\' = 99, \'SYSTEM RESTART DISK\' = 100, \'SYSTEM MERGES\' = 101, \'SYSTEM TTL MERGES\' = 102, \'SYSTEM FETCHES\' = 103, \'SYSTEM MOVES\' = 104, \'SYSTEM DISTRIBUTED SENDS\' = 105, \'SYSTEM REPLICATED SENDS\' = 106, \'SYSTEM SENDS\' = 107, \'SYSTEM REPLICATION QUEUES\' = 108, \'SYSTEM DROP REPLICA\' = 109, \'SYSTEM SYNC REPLICA\' = 110, \'SYSTEM RESTART REPLICA\' = 111, 
\'SYSTEM RESTORE REPLICA\' = 112, \'SYSTEM FLUSH DISTRIBUTED\' = 113, \'SYSTEM FLUSH LOGS\' = 114, \'SYSTEM FLUSH\' = 115, \'SYSTEM\' = 116, \'dictGet\' = 117, \'addressToLine\' = 118, \'addressToSymbol\' = 119, \'demangle\' = 120, \'INTROSPECTION\' = 121, \'FILE\' = 122, \'URL\' = 123, \'REMOTE\' = 124, \'MONGO\' = 125, \'MYSQL\' = 126, \'POSTGRES\' = 127))\n)\nENGINE = SystemPrivileges()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' +CREATE TABLE system.privileges\n(\n `privilege` Enum8(\'POSTGRES\' = -128, \'SQLITE\' = -127, \'ODBC\' = -126, \'JDBC\' = -125, \'HDFS\' = -124, \'S3\' = -123, \'SOURCES\' = -122, \'ALL\' = -121, \'NONE\' = -120, \'SHOW DATABASES\' = 0, \'SHOW TABLES\' = 1, \'SHOW COLUMNS\' = 2, \'SHOW DICTIONARIES\' = 3, \'SHOW\' = 4, \'SELECT\' = 5, \'INSERT\' = 6, \'ALTER UPDATE\' = 7, \'ALTER DELETE\' = 8, \'ALTER ADD COLUMN\' = 9, \'ALTER MODIFY COLUMN\' = 10, \'ALTER DROP COLUMN\' = 11, \'ALTER COMMENT COLUMN\' = 12, \'ALTER CLEAR COLUMN\' = 13, \'ALTER RENAME COLUMN\' = 14, \'ALTER MATERIALIZE COLUMN\' = 15, \'ALTER COLUMN\' = 16, \'ALTER MODIFY COMMENT\' = 17, \'ALTER ORDER BY\' = 18, \'ALTER SAMPLE BY\' = 19, \'ALTER ADD INDEX\' = 20, \'ALTER DROP INDEX\' = 21, \'ALTER MATERIALIZE INDEX\' = 22, \'ALTER CLEAR INDEX\' = 23, \'ALTER INDEX\' = 24, \'ALTER ADD PROJECTION\' = 25, \'ALTER DROP PROJECTION\' = 26, \'ALTER MATERIALIZE PROJECTION\' = 27, \'ALTER CLEAR PROJECTION\' = 28, \'ALTER PROJECTION\' = 29, \'ALTER ADD CONSTRAINT\' = 30, \'ALTER DROP CONSTRAINT\' = 31, \'ALTER CONSTRAINT\' = 32, \'ALTER TTL\' = 33, \'ALTER MATERIALIZE TTL\' = 34, \'ALTER SETTINGS\' = 35, \'ALTER MOVE PARTITION\' = 36, \'ALTER FETCH PARTITION\' = 37, \'ALTER FREEZE PARTITION\' = 38, \'ALTER DATABASE SETTINGS\' = 39, \'ALTER TABLE\' = 40, \'ALTER DATABASE\' = 41, \'ALTER VIEW REFRESH\' = 42, \'ALTER VIEW MODIFY QUERY\' = 43, \'ALTER VIEW\' = 44, \'ALTER\' = 45, \'CREATE DATABASE\' = 46, \'CREATE TABLE\' = 47, \'CREATE VIEW\' = 48, \'CREATE DICTIONARY\' = 49, \'CREATE TEMPORARY TABLE\' = 50, \'CREATE FUNCTION\' = 51, \'CREATE\' = 52, \'DROP DATABASE\' = 53, \'DROP TABLE\' = 54, \'DROP VIEW\' = 55, \'DROP DICTIONARY\' = 56, \'DROP FUNCTION\' = 57, \'DROP\' = 58, \'TRUNCATE\' = 59, \'OPTIMIZE\' = 60, \'KILL QUERY\' = 61, \'MOVE PARTITION BETWEEN SHARDS\' = 62, \'CREATE USER\' = 63, \'ALTER USER\' = 64, \'DROP USER\' = 65, \'CREATE ROLE\' = 66, \'ALTER ROLE\' = 67, \'DROP ROLE\' = 68, \'ROLE ADMIN\' = 69, \'CREATE ROW POLICY\' = 70, \'ALTER ROW POLICY\' = 71, \'DROP ROW POLICY\' = 72, \'CREATE QUOTA\' = 73, \'ALTER QUOTA\' = 74, \'DROP QUOTA\' = 75, \'CREATE SETTINGS PROFILE\' = 76, \'ALTER SETTINGS PROFILE\' = 77, \'DROP SETTINGS PROFILE\' = 78, \'SHOW USERS\' = 79, \'SHOW ROLES\' = 80, \'SHOW ROW POLICIES\' = 81, \'SHOW QUOTAS\' = 82, \'SHOW SETTINGS PROFILES\' = 83, \'SHOW ACCESS\' = 84, \'ACCESS MANAGEMENT\' = 85, \'SYSTEM SHUTDOWN\' = 86, \'SYSTEM DROP DNS CACHE\' = 87, \'SYSTEM DROP MARK CACHE\' = 88, \'SYSTEM DROP UNCOMPRESSED CACHE\' = 89, \'SYSTEM DROP MMAP CACHE\' = 90, \'SYSTEM DROP COMPILED EXPRESSION CACHE\' = 91, \'SYSTEM DROP CACHE\' = 92, \'SYSTEM RELOAD CONFIG\' = 93, \'SYSTEM RELOAD SYMBOLS\' = 94, \'SYSTEM RELOAD DICTIONARY\' = 95, \'SYSTEM RELOAD MODEL\' = 96, \'SYSTEM RELOAD FUNCTION\' = 97, \'SYSTEM RELOAD EMBEDDED DICTIONARIES\' = 98, \'SYSTEM RELOAD\' = 99, \'SYSTEM RESTART DISK\' = 100, \'SYSTEM MERGES\' = 101, \'SYSTEM TTL MERGES\' = 102, \'SYSTEM FETCHES\' = 103, \'SYSTEM MOVES\' = 104, \'SYSTEM DISTRIBUTED SENDS\' = 105, \'SYSTEM REPLICATED SENDS\' = 106, \'SYSTEM 
SENDS\' = 107, \'SYSTEM REPLICATION QUEUES\' = 108, \'SYSTEM DROP REPLICA\' = 109, \'SYSTEM SYNC REPLICA\' = 110, \'SYSTEM RESTART REPLICA\' = 111, \'SYSTEM RESTORE REPLICA\' = 112, \'SYSTEM FLUSH DISTRIBUTED\' = 113, \'SYSTEM FLUSH LOGS\' = 114, \'SYSTEM FLUSH\' = 115, \'SYSTEM THREAD FUZZER\' = 116, \'SYSTEM\' = 117, \'dictGet\' = 118, \'addressToLine\' = 119, \'addressToSymbol\' = 120, \'demangle\' = 121, \'INTROSPECTION\' = 122, \'FILE\' = 123, \'URL\' = 124, \'REMOTE\' = 125, \'MONGO\' = 126, \'MYSQL\' = 127),\n `aliases` Array(String),\n `level` Nullable(Enum8(\'GLOBAL\' = 0, \'DATABASE\' = 1, \'TABLE\' = 2, \'DICTIONARY\' = 3, \'VIEW\' = 4, \'COLUMN\' = 5)),\n `parent_group` Nullable(Enum8(\'POSTGRES\' = -128, \'SQLITE\' = -127, \'ODBC\' = -126, \'JDBC\' = -125, \'HDFS\' = -124, \'S3\' = -123, \'SOURCES\' = -122, \'ALL\' = -121, \'NONE\' = -120, \'SHOW DATABASES\' = 0, \'SHOW TABLES\' = 1, \'SHOW COLUMNS\' = 2, \'SHOW DICTIONARIES\' = 3, \'SHOW\' = 4, \'SELECT\' = 5, \'INSERT\' = 6, \'ALTER UPDATE\' = 7, \'ALTER DELETE\' = 8, \'ALTER ADD COLUMN\' = 9, \'ALTER MODIFY COLUMN\' = 10, \'ALTER DROP COLUMN\' = 11, \'ALTER COMMENT COLUMN\' = 12, \'ALTER CLEAR COLUMN\' = 13, \'ALTER RENAME COLUMN\' = 14, \'ALTER MATERIALIZE COLUMN\' = 15, \'ALTER COLUMN\' = 16, \'ALTER MODIFY COMMENT\' = 17, \'ALTER ORDER BY\' = 18, \'ALTER SAMPLE BY\' = 19, \'ALTER ADD INDEX\' = 20, \'ALTER DROP INDEX\' = 21, \'ALTER MATERIALIZE INDEX\' = 22, \'ALTER CLEAR INDEX\' = 23, \'ALTER INDEX\' = 24, \'ALTER ADD PROJECTION\' = 25, \'ALTER DROP PROJECTION\' = 26, \'ALTER MATERIALIZE PROJECTION\' = 27, \'ALTER CLEAR PROJECTION\' = 28, \'ALTER PROJECTION\' = 29, \'ALTER ADD CONSTRAINT\' = 30, \'ALTER DROP CONSTRAINT\' = 31, \'ALTER CONSTRAINT\' = 32, \'ALTER TTL\' = 33, \'ALTER MATERIALIZE TTL\' = 34, \'ALTER SETTINGS\' = 35, \'ALTER MOVE PARTITION\' = 36, \'ALTER FETCH PARTITION\' = 37, \'ALTER FREEZE PARTITION\' = 38, \'ALTER DATABASE SETTINGS\' = 39, \'ALTER TABLE\' = 40, \'ALTER DATABASE\' = 41, \'ALTER VIEW REFRESH\' = 42, \'ALTER VIEW MODIFY QUERY\' = 43, \'ALTER VIEW\' = 44, \'ALTER\' = 45, \'CREATE DATABASE\' = 46, \'CREATE TABLE\' = 47, \'CREATE VIEW\' = 48, \'CREATE DICTIONARY\' = 49, \'CREATE TEMPORARY TABLE\' = 50, \'CREATE FUNCTION\' = 51, \'CREATE\' = 52, \'DROP DATABASE\' = 53, \'DROP TABLE\' = 54, \'DROP VIEW\' = 55, \'DROP DICTIONARY\' = 56, \'DROP FUNCTION\' = 57, \'DROP\' = 58, \'TRUNCATE\' = 59, \'OPTIMIZE\' = 60, \'KILL QUERY\' = 61, \'MOVE PARTITION BETWEEN SHARDS\' = 62, \'CREATE USER\' = 63, \'ALTER USER\' = 64, \'DROP USER\' = 65, \'CREATE ROLE\' = 66, \'ALTER ROLE\' = 67, \'DROP ROLE\' = 68, \'ROLE ADMIN\' = 69, \'CREATE ROW POLICY\' = 70, \'ALTER ROW POLICY\' = 71, \'DROP ROW POLICY\' = 72, \'CREATE QUOTA\' = 73, \'ALTER QUOTA\' = 74, \'DROP QUOTA\' = 75, \'CREATE SETTINGS PROFILE\' = 76, \'ALTER SETTINGS PROFILE\' = 77, \'DROP SETTINGS PROFILE\' = 78, \'SHOW USERS\' = 79, \'SHOW ROLES\' = 80, \'SHOW ROW POLICIES\' = 81, \'SHOW QUOTAS\' = 82, \'SHOW SETTINGS PROFILES\' = 83, \'SHOW ACCESS\' = 84, \'ACCESS MANAGEMENT\' = 85, \'SYSTEM SHUTDOWN\' = 86, \'SYSTEM DROP DNS CACHE\' = 87, \'SYSTEM DROP MARK CACHE\' = 88, \'SYSTEM DROP UNCOMPRESSED CACHE\' = 89, \'SYSTEM DROP MMAP CACHE\' = 90, \'SYSTEM DROP COMPILED EXPRESSION CACHE\' = 91, \'SYSTEM DROP CACHE\' = 92, \'SYSTEM RELOAD CONFIG\' = 93, \'SYSTEM RELOAD SYMBOLS\' = 94, \'SYSTEM RELOAD DICTIONARY\' = 95, \'SYSTEM RELOAD MODEL\' = 96, \'SYSTEM RELOAD FUNCTION\' = 97, \'SYSTEM RELOAD EMBEDDED DICTIONARIES\' = 98, \'SYSTEM RELOAD\' = 99, 
\'SYSTEM RESTART DISK\' = 100, \'SYSTEM MERGES\' = 101, \'SYSTEM TTL MERGES\' = 102, \'SYSTEM FETCHES\' = 103, \'SYSTEM MOVES\' = 104, \'SYSTEM DISTRIBUTED SENDS\' = 105, \'SYSTEM REPLICATED SENDS\' = 106, \'SYSTEM SENDS\' = 107, \'SYSTEM REPLICATION QUEUES\' = 108, \'SYSTEM DROP REPLICA\' = 109, \'SYSTEM SYNC REPLICA\' = 110, \'SYSTEM RESTART REPLICA\' = 111, \'SYSTEM RESTORE REPLICA\' = 112, \'SYSTEM FLUSH DISTRIBUTED\' = 113, \'SYSTEM FLUSH LOGS\' = 114, \'SYSTEM FLUSH\' = 115, \'SYSTEM THREAD FUZZER\' = 116, \'SYSTEM\' = 117, \'dictGet\' = 118, \'addressToLine\' = 119, \'addressToSymbol\' = 120, \'demangle\' = 121, \'INTROSPECTION\' = 122, \'FILE\' = 123, \'URL\' = 124, \'REMOTE\' = 125, \'MONGO\' = 126, \'MYSQL\' = 127))\n)\nENGINE = SystemPrivileges()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.processes\n(\n `is_initial_query` UInt8,\n `user` String,\n `query_id` String,\n `address` IPv6,\n `port` UInt16,\n `initial_user` String,\n `initial_query_id` String,\n `initial_address` IPv6,\n `initial_port` UInt16,\n `interface` UInt8,\n `os_user` String,\n `client_hostname` String,\n `client_name` String,\n `client_revision` UInt64,\n `client_version_major` UInt64,\n `client_version_minor` UInt64,\n `client_version_patch` UInt64,\n `http_method` UInt8,\n `http_user_agent` String,\n `http_referer` String,\n `forwarded_for` String,\n `quota_key` String,\n `elapsed` Float64,\n `is_cancelled` UInt8,\n `read_rows` UInt64,\n `read_bytes` UInt64,\n `total_rows_approx` UInt64,\n `written_rows` UInt64,\n `written_bytes` UInt64,\n `memory_usage` Int64,\n `peak_memory_usage` Int64,\n `query` String,\n `thread_ids` Array(UInt64),\n `ProfileEvents` Map(String, UInt64),\n `Settings` Map(String, String),\n `current_database` String,\n `ProfileEvents.Names` Array(String),\n `ProfileEvents.Values` Array(UInt64),\n `Settings.Names` Array(String),\n `Settings.Values` Array(String)\n)\nENGINE = SystemProcesses()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.projection_parts\n(\n `partition` String,\n `name` String,\n `part_type` String,\n `parent_name` String,\n `parent_uuid` UUID,\n `parent_part_type` String,\n `active` UInt8,\n `marks` UInt64,\n `rows` UInt64,\n `bytes_on_disk` UInt64,\n `data_compressed_bytes` UInt64,\n `data_uncompressed_bytes` UInt64,\n `marks_bytes` UInt64,\n `parent_marks` UInt64,\n `parent_rows` UInt64,\n `parent_bytes_on_disk` UInt64,\n `parent_data_compressed_bytes` UInt64,\n `parent_data_uncompressed_bytes` UInt64,\n `parent_marks_bytes` UInt64,\n `modification_time` DateTime,\n `remove_time` DateTime,\n `refcount` UInt32,\n `min_date` Date,\n `max_date` Date,\n `min_time` DateTime,\n `max_time` DateTime,\n `partition_id` String,\n `min_block_number` Int64,\n `max_block_number` Int64,\n `level` UInt32,\n `data_version` UInt64,\n `primary_key_bytes_in_memory` UInt64,\n `primary_key_bytes_in_memory_allocated` UInt64,\n `is_frozen` UInt8,\n `database` String,\n `table` String,\n `engine` String,\n `disk_name` String,\n `path` String,\n `hash_of_all_files` String,\n `hash_of_uncompressed_files` String,\n `uncompressed_hash_of_compressed_files` String,\n `delete_ttl_info_min` DateTime,\n `delete_ttl_info_max` DateTime,\n `move_ttl_info.expression` Array(String),\n `move_ttl_info.min` Array(DateTime),\n `move_ttl_info.max` Array(DateTime),\n `default_compression_codec` String,\n `recompression_ttl_info.expression` Array(String),\n `recompression_ttl_info.min` Array(DateTime),\n `recompression_ttl_info.max` Array(DateTime),\n 
`group_by_ttl_info.expression` Array(String),\n `group_by_ttl_info.min` Array(DateTime),\n `group_by_ttl_info.max` Array(DateTime),\n `rows_where_ttl_info.expression` Array(String),\n `rows_where_ttl_info.min` Array(DateTime),\n `rows_where_ttl_info.max` Array(DateTime),\n `bytes` UInt64,\n `marks_size` UInt64\n)\nENGINE = SystemProjectionParts()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.projection_parts_columns\n(\n `partition` String,\n `name` String,\n `part_type` String,\n `parent_name` String,\n `parent_uuid` UUID,\n `parent_part_type` String,\n `active` UInt8,\n `marks` UInt64,\n `rows` UInt64,\n `bytes_on_disk` UInt64,\n `data_compressed_bytes` UInt64,\n `data_uncompressed_bytes` UInt64,\n `marks_bytes` UInt64,\n `parent_marks` UInt64,\n `parent_rows` UInt64,\n `parent_bytes_on_disk` UInt64,\n `parent_data_compressed_bytes` UInt64,\n `parent_data_uncompressed_bytes` UInt64,\n `parent_marks_bytes` UInt64,\n `modification_time` DateTime,\n `remove_time` DateTime,\n `refcount` UInt32,\n `min_date` Date,\n `max_date` Date,\n `min_time` DateTime,\n `max_time` DateTime,\n `partition_id` String,\n `min_block_number` Int64,\n `max_block_number` Int64,\n `level` UInt32,\n `data_version` UInt64,\n `primary_key_bytes_in_memory` UInt64,\n `primary_key_bytes_in_memory_allocated` UInt64,\n `database` String,\n `table` String,\n `engine` String,\n `disk_name` String,\n `path` String,\n `column` String,\n `type` String,\n `column_position` UInt64,\n `default_kind` String,\n `default_expression` String,\n `column_bytes_on_disk` UInt64,\n `column_data_compressed_bytes` UInt64,\n `column_data_uncompressed_bytes` UInt64,\n `column_marks_bytes` UInt64,\n `bytes` UInt64,\n `marks_size` UInt64\n)\nENGINE = SystemProjectionPartsColumns()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' diff --git a/tests/queries/0_stateless/02125_query_views_log.reference b/tests/queries/0_stateless/02125_query_views_log.reference index 3ae4af9b4d0..fac70027113 100644 --- a/tests/queries/0_stateless/02125_query_views_log.reference +++ b/tests/queries/0_stateless/02125_query_views_log.reference @@ -18,7 +18,7 @@ written_bytes: 4000000 select read_rows, read_bytes, written_rows, written_bytes from system.query_log where type = 'QueryFinish' and query_kind = 'Insert' and current_database = currentDatabase() format Vertical; Row 1: ────── -read_rows: 1000000 -read_bytes: 8000000 +read_rows: 3000000 +read_bytes: 16000000 written_rows: 3000000 written_bytes: 12000000 diff --git a/tests/queries/0_stateless/02132_client_history_navigation.expect b/tests/queries/0_stateless/02132_client_history_navigation.expect index cd83454c85e..b722a0af04c 100755 --- a/tests/queries/0_stateless/02132_client_history_navigation.expect +++ b/tests/queries/0_stateless/02132_client_history_navigation.expect @@ -3,11 +3,12 @@ log_user 0 set timeout 3 match_max 100000 -# A default timeout action is to do nothing, change it to fail + expect_after { - timeout { - exit 1 - } + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } } # useful debugging configuration diff --git a/tests/queries/0_stateless/02136_kill_scalar_queries.reference b/tests/queries/0_stateless/02136_kill_scalar_queries.reference new file mode 100644 index 00000000000..a598447cff5 --- /dev/null +++ b/tests/queries/0_stateless/02136_kill_scalar_queries.reference @@ -0,0 +1,2 @@ +finished default_TEST02132KILL_QUERY1 default select (SELECT max(number) from system.numbers) + 1; 
+finished default_TEST02132KILL_QUERY2 default SELECT (SELECT number FROM system.numbers WHERE number = 1000000000000); diff --git a/tests/queries/0_stateless/02136_kill_scalar_queries.sh b/tests/queries/0_stateless/02136_kill_scalar_queries.sh new file mode 100755 index 00000000000..382f6555c66 --- /dev/null +++ b/tests/queries/0_stateless/02136_kill_scalar_queries.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +# Ref: https://github.com/ClickHouse/ClickHouse/issues/1576 +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +function wait_for_query_to_start() +{ + while [[ $($CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL" -d "SELECT count() FROM system.processes WHERE query_id = '$1'") == 0 ]]; do sleep 0.1; done +} + +QUERY_1_ID="${CLICKHOUSE_DATABASE}_TEST02132KILL_QUERY1" +(${CLICKHOUSE_CLIENT} --query_id="${QUERY_1_ID}" --query='select (SELECT max(number) from system.numbers) + 1;' 2>&1 | grep -q "Code: 394." || echo 'FAIL') & +wait_for_query_to_start "${QUERY_1_ID}" +${CLICKHOUSE_CLIENT} --query="KILL QUERY WHERE query_id='${QUERY_1_ID}' SYNC" + +QUERY_2_ID="${CLICKHOUSE_DATABASE}_TEST02132KILL_QUERY2" +(${CLICKHOUSE_CLIENT} --query_id="${QUERY_2_ID}" --query='SELECT (SELECT number FROM system.numbers WHERE number = 1000000000000);' 2>&1 | grep -q "Code: 394." || echo 'FAIL') & +wait_for_query_to_start "${QUERY_2_ID}" +${CLICKHOUSE_CLIENT} --query="KILL QUERY WHERE query_id='${QUERY_2_ID}' SYNC" + +wait diff --git a/tests/queries/0_stateless/02136_scalar_progress.reference b/tests/queries/0_stateless/02136_scalar_progress.reference new file mode 100644 index 00000000000..21f6d3e0043 --- /dev/null +++ b/tests/queries/0_stateless/02136_scalar_progress.reference @@ -0,0 +1,6 @@ +< X-ClickHouse-Progress: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} +< X-ClickHouse-Progress: {"read_rows":"65505","read_bytes":"524040","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} +< X-ClickHouse-Progress: {"read_rows":"131010","read_bytes":"1048080","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} +< X-ClickHouse-Progress: {"read_rows":"131011","read_bytes":"1048081","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} +< X-ClickHouse-Progress: {"read_rows":"131011","read_bytes":"1048081","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} +< X-ClickHouse-Summary: {"read_rows":"131011","read_bytes":"1048081","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} diff --git a/tests/queries/0_stateless/02136_scalar_progress.sh b/tests/queries/0_stateless/02136_scalar_progress.sh new file mode 100755 index 00000000000..4608031f83d --- /dev/null +++ b/tests/queries/0_stateless/02136_scalar_progress.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +# Ref: https://github.com/ClickHouse/ClickHouse/issues/1576 +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CURL -sS "${CLICKHOUSE_URL}&wait_end_of_query=1&send_progress_in_http_headers=1&http_headers_progress_interval_ms=0" -d "SELECT (SELECT max(number), count(number) FROM numbers(100000));" -v 2>&1 | grep -E "X-ClickHouse-Summary|X-ClickHouse-Progress" diff --git a/tests/queries/0_stateless/02136_scalar_read_rows_json.reference b/tests/queries/0_stateless/02136_scalar_read_rows_json.reference new file mode 100644 index 00000000000..49020a4432f --- /dev/null +++ b/tests/queries/0_stateless/02136_scalar_read_rows_json.reference @@ -0,0 +1,50 @@ +#1 +{ + "meta": + [ + { + "name": "count()", + "type": "UInt64" + } + ], + + "data": + [ + { + "count()": "100" + } + ], + + "rows": 1, + + "rows_before_limit_at_least": 100, + + "statistics": + { + "rows_read": 100, + "bytes_read": 800 + } +} +#2 +{ + "meta": + [ + { + "type": "Tuple(UInt64, UInt64)" + } + ], + + "data": + [ + { + } + ], + + "rows": 1, + + "statistics": + { + "rows_read": 131011, + "bytes_read": 1048081 + } +} diff --git a/tests/queries/0_stateless/02136_scalar_read_rows_json.sh b/tests/queries/0_stateless/02136_scalar_read_rows_json.sh new file mode 100755 index 00000000000..d589cb60086 --- /dev/null +++ b/tests/queries/0_stateless/02136_scalar_read_rows_json.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +# Ref: https://github.com/ClickHouse/ClickHouse/issues/1576 +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +echo "#1" +${CLICKHOUSE_CLIENT} --query='SELECT count() FROM numbers(100) FORMAT JSON;' | grep -a -v "elapsed" +echo "#2" +${CLICKHOUSE_CLIENT} --query='SELECT (SELECT max(number), count(number) FROM numbers(100000) as n) FORMAT JSON;' | grep -a -v "elapsed" | grep -v "_subquery" diff --git a/tests/queries/0_stateless/02136_scalar_subquery_metrics.reference b/tests/queries/0_stateless/02136_scalar_subquery_metrics.reference new file mode 100644 index 00000000000..7bef11d008f --- /dev/null +++ b/tests/queries/0_stateless/02136_scalar_subquery_metrics.reference @@ -0,0 +1,9 @@ +#02136_scalar_subquery_1 999 +#02136_scalar_subquery_2 999 0 +#02136_scalar_subquery_3 999 999 +#02136_scalar_subquery_4 999 +#02136_scalar_subquery_4 999 +1001 SELECT \'#02136_scalar_subquery_1\', (SELECT max(number) FROM numbers(1000)) as n; +2001 SELECT \'#02136_scalar_subquery_2\', (SELECT max(number) FROM numbers(1000)) as n, (SELECT min(number) FROM numbers(1000)) as n2; +1001 SELECT \'#02136_scalar_subquery_3\', (SELECT max(number) FROM numbers(1000)) as n, (SELECT max(number) FROM numbers(1000)) as n2; +1002 SELECT \'#02136_scalar_subquery_4\', (SELECT max(number) FROM numbers(1000)) as n FROM system.numbers LIMIT 2; diff --git a/tests/queries/0_stateless/02136_scalar_subquery_metrics.sql b/tests/queries/0_stateless/02136_scalar_subquery_metrics.sql new file mode 100644 index 00000000000..180610288aa --- /dev/null +++ b/tests/queries/0_stateless/02136_scalar_subquery_metrics.sql @@ -0,0 +1,13 @@ +SELECT '#02136_scalar_subquery_1', (SELECT max(number) FROM numbers(1000)) as n; +SELECT '#02136_scalar_subquery_2', (SELECT max(number) FROM numbers(1000)) as n, (SELECT min(number) FROM numbers(1000)) as n2; +SELECT '#02136_scalar_subquery_3', (SELECT max(number) FROM numbers(1000)) as n, (SELECT max(number) FROM numbers(1000)) as n2; -- Cached +SELECT '#02136_scalar_subquery_4', (SELECT max(number) FROM numbers(1000)) as n FROM system.numbers LIMIT 2; -- Cached + +SYSTEM FLUSH LOGS; +SELECT read_rows, query FROM system.query_log +WHERE 
+ event_date > yesterday() + AND type = 'QueryFinish' + AND current_database == currentDatabase() + AND query LIKE 'SELECT ''#02136_scalar_subquery_%' +ORDER BY query ASC; diff --git a/tests/queries/0_stateless/02149_external_schema_inference.reference b/tests/queries/0_stateless/02149_external_schema_inference.reference new file mode 100644 index 00000000000..875659c7fb6 --- /dev/null +++ b/tests/queries/0_stateless/02149_external_schema_inference.reference @@ -0,0 +1,168 @@ +Protobuf + +a_b_c Array(Array(Array(Int32))) + +a String +b_c Array(Array(Float64)) + +x Enum8(\'FIRST\' = 0, \'SECOND\' = 1, \'TEN\' = 10, \'HUNDRED\' = 100) + +a Map(String, UInt32) + +x_y_z Array(Array(Int32)) + +uuid String +name String +surname String +gender Enum8(\'female\' = 0, \'male\' = 1) +birthDate UInt32 +photo String +phoneNumber String +isOnline UInt8 +visitTime UInt32 +age UInt32 +zodiacSign Enum8(\'aries\' = 0, \'taurus\' = 1, \'gemini\' = 2, \'cancer\' = 3, \'leo\' = 4, \'virgo\' = 5, \'libra\' = 6, \'scorpius\' = 7, \'sagittarius\' = 8, \'capricorn\' = 9, \'aquarius\' = 10, \'pisces\' = 11) +songs Array(String) +color Array(UInt32) +hometown String +location Array(Float32) +pi Float64 +lotteryWin Float64 +someRatio Float32 +temperature Float32 +randomBigNumber Int64 +measureUnits Array(Tuple(unit String, coef Float32)) +nestiness_a_b_c Tuple(d UInt32, e Array(UInt32)) + +location Array(Int32) +pi Float32 +uuid String +newFieldBool UInt8 +name String +gender Enum8(\'male\' = 0, \'female\' = 1) +zodiacSign Int32 +birthDate Int64 +age String +isOnline Enum8(\'offline\' = 0, \'online\' = 1) +someRatio Float64 +visitTime UInt64 +newMessage Tuple(empty Array(Tuple()), z Float32) +randomBigNumber Int64 +newFieldInt Array(Int32) +color Array(Float32) +lotteryWin UInt64 +surname String +phoneNumber UInt64 +temperature Int32 +newFieldStr String +measureUnits_unit Array(String) +measureUnits_coef Array(Float32) +nestiness_a_b_c_d UInt32 +nestiness_a_b_c_e Array(UInt32) + +uuid String +name String +surname String +gender String +birthDate String +phoneNumber String +isOnline String +visitTime String +age String +zodiacSign String +songs Array(String) +color Array(String) +hometown String +location Array(String) +pi String +lotteryWin String +someRatio String +temperature String +randomBigNumber String +measureUnits Tuple(unit Array(String), coef Array(String)) +nestiness_a_b_c Tuple(d String, e Array(String)) + +uuid String +name String +surname String +gender Enum8(\'female\' = 0, \'male\' = 1) +birthDate UInt32 +photo String +phoneNumber String +isOnline UInt8 +visitTime UInt32 +age UInt32 +zodiacSign Enum8(\'aries\' = 0, \'taurus\' = 1, \'gemini\' = 2, \'cancer\' = 3, \'leo\' = 4, \'virgo\' = 5, \'libra\' = 6, \'scorpius\' = 7, \'sagittarius\' = 8, \'capricorn\' = 9, \'aquarius\' = 10, \'pisces\' = 11) +songs Array(String) +color Array(UInt32) +hometown String +location Array(Float32) +pi Float64 +lotteryWin Float64 +someRatio Float32 +temperature Float32 +randomBigNumber Int64 +measureunits Tuple(coef Array(Float32), unit Array(String)) +nestiness_a_b_c Tuple(d UInt32, e Array(UInt32)) +newFieldStr String +newFieldInt Int32 +newBool UInt8 + +identifier String +modules Array(Tuple(module_id UInt32, supply UInt32, temp UInt32, nodes Array(Tuple(node_id UInt32, opening_time UInt32, closing_time UInt32, current UInt32, coords_y Float32)))) + +Capnproto + +value Enum8(\'one\' = 0, \'two\' = 1, \'tHrEe\' = 2) + +value UInt64 +list1 Array(UInt64) +list2 Array(Array(Array(UInt64))) + +lc1 String +lc2 
Nullable(String) +lc3 Array(Nullable(String)) + +value UInt64 +nested Tuple(a Tuple(b UInt64, c Array(Array(UInt64))), d Array(Tuple(e Array(Array(Tuple(f UInt64, g UInt64))), h Array(Tuple(k Array(UInt64)))))) + +nested Tuple(value Array(UInt64), array Array(Array(UInt64)), tuple Array(Tuple(one UInt64, two UInt64))) + +a Tuple(b UInt64, c Tuple(d UInt64, e Tuple(f UInt64))) + +nullable Nullable(UInt64) +array Array(Nullable(UInt64)) +tuple Tuple(nullable Nullable(UInt64)) + +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +float32 Float32 +float64 Float64 +string String +fixed String +data String +date UInt16 +datetime UInt32 +datetime64 Int64 + +value UInt64 +tuple1 Tuple(one UInt64, two Tuple(three UInt64, four UInt64)) +tuple2 Tuple(nested1 Tuple(nested2 Tuple(x UInt64))) + +RawBLOB + +raw_blob String + +LineAsString + +line String + +JSONAsString + +json String diff --git a/tests/queries/0_stateless/02149_external_schema_inference.sh b/tests/queries/0_stateless/02149_external_schema_inference.sh new file mode 100755 index 00000000000..df2b9a43565 --- /dev/null +++ b/tests/queries/0_stateless/02149_external_schema_inference.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02149.data +DATA_FILE=$USER_FILES_PATH/$FILE_NAME + +touch $DATA_FILE + +SCHEMADIR=$(clickhouse-client --query "select * from file('$FILE_NAME', 'CapnProto', 'val1 char') settings format_schema='nonexist:Message'" 2>&1 | grep Exception | grep -oP "file \K.*(?=/nonexist.capnp)") +CLIENT_SCHEMADIR=$CURDIR/format_schemas +SERVER_SCHEMADIR=test_02149 +mkdir -p $SCHEMADIR/$SERVER_SCHEMADIR +cp -r $CLIENT_SCHEMADIR/* $SCHEMADIR/$SERVER_SCHEMADIR/ + +echo -e "Protobuf\n" +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_array_3dim:ABC'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_array_of_arrays:AA'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_enum_mapping.proto:EnumMessage'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_map:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_nested_in_nested:MessageType'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_persons:Person'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_persons:AltPerson'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_persons:StrPerson'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_persons_syntax2:Syntax2Person'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Protobuf') settings 
format_schema='$SERVER_SCHEMADIR/00825_protobuf_format_skipped_column_in_nested:UpdateMessage'" + + +echo -e "\nCapnproto\n" +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_lists:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_low_cardinality:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_lists_and_tuples:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_table:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_tuples:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_nullable:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_simple_types:Message'" + +echo +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CapnProto') settings format_schema='$SERVER_SCHEMADIR/02030_capnp_tuples:Message'" + +echo -e "\nRawBLOB\n" +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'RawBLOB')" + +echo -e "\nLineAsString\n" +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'LineAsString')" + +echo -e "\nJSONAsString\n" +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONAsString')" + + + +rm -rf ${SCHEMADIR:?}/$SERVER_SCHEMADIR +rm $DATA_FILE diff --git a/tests/queries/0_stateless/02149_schema_inference.reference b/tests/queries/0_stateless/02149_schema_inference.reference new file mode 100644 index 00000000000..f46e3bee101 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference.reference @@ -0,0 +1,170 @@ +TSV +c1 Nullable(String) +c2 Nullable(String) +c3 Nullable(String) +c4 Nullable(String) +42 Some string [1, 2, 3, 4] (1, 2, 3) +42 abcd [] (4, 5, 6) +TSVWithNames +number Nullable(String) +string Nullable(String) +array Nullable(String) +tuple Nullable(String) +42 Some string [1, 2, 3, 4] (1, 2, 3) +42 abcd [] (4, 5, 6) +CSV +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +c4 Array(Nullable(Float64)) +\N Some string [([1,2.3],'String'),([],NULL)] [1,NULL,3] +42 \N [([1,2.3],'String'),([3],'abcd')] [4,5,6] +c1 Nullable(String) +c2 Nullable(String) +42 String +String 42 +c1 Nullable(String) +c2 Nullable(String) +\N [NULL, NULL] +\N [] +CSVWithNames +a Nullable(Float64) +b Nullable(String) +c Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +d Array(Nullable(Float64)) +\N Some string [([1,2.3],'String'),([],NULL)] [1,NULL,3] +42 \N [([1,2.3],'String'),([3],'abcd')] [4,5,6] +JSONCompactEachRow +c1 Nullable(Float64) +c2 Array(Tuple(Nullable(Float64), Nullable(String))) +c3 Map(String, Nullable(Float64)) +c4 Nullable(UInt8) +42.42 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 1 +c1 Nullable(Float64) +c2 Array(Tuple(Nullable(Float64), Nullable(String))) +c3 Map(String, Nullable(Float64)) +c4 Nullable(UInt8) +\N [(1,'String'),(2,NULL)] {'key':NULL,'key2':24} \N +32 [(2,'String 2'),(3,'hello')] {'key3':4242,'key4':2424} 1 +JSONCompactEachRowWithNames +a Nullable(Float64) +b Array(Tuple(Nullable(Float64), Nullable(String))) +c 
Map(String, Nullable(Float64)) +d Nullable(UInt8) +42.42 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 1 +JSONEachRow +d Nullable(UInt8) +b Array(Tuple(Nullable(Float64), Nullable(String))) +c Map(String, Nullable(Float64)) +a Nullable(Float64) +1 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 42.42 +d Nullable(UInt8) +b Array(Tuple(Nullable(Float64), Nullable(String))) +c Map(String, Nullable(Float64)) +a Nullable(Float64) +\N [(1,'String'),(2,NULL)] {'key':NULL,'key2':24} \N +1 [(2,'String 2'),(3,'hello')] {'key3':4242,'key4':2424} 32 +b Nullable(String) +c Array(Nullable(Float64)) +a Nullable(Float64) +s1 [] 1 +\N [2] 2 +\N [] \N +\N [] \N +\N [3] \N +TSKV +b Nullable(String) +c Nullable(String) +a Nullable(String) +s1 \N 1 +} [2] 2 +\N \N \N +\N \N \N +\N [3] \N +Values +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Nullable(Float64)) +c4 Tuple(Nullable(Float64), Nullable(String)) +c5 Tuple(Array(Nullable(Float64)), Array(Tuple(Nullable(Float64), Nullable(String)))) +42.42 Some string [1,2,3] (1,'2') ([1,2],[(3,'4'),(5,'6')]) +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Nullable(Float64)) +c4 Tuple(Nullable(Float64), Nullable(Float64)) +c5 Tuple(Array(Nullable(Float64)), Array(Tuple(Nullable(Float64), Nullable(String)))) +42.42 \N [1,NULL,3] (1,NULL) ([1,2],[(3,'4'),(5,'6')]) +\N Some string [10] (1,2) ([],[]) +Regexp +c1 Nullable(String) +c2 Nullable(String) +c3 Nullable(String) +42 Some string 1 [([1, 2, 3], String 1), ([], String 1)] +2 Some string 2 [([4, 5, 6], String 2), ([], String 2)] +312 Some string 3 [([1, 2, 3], String 2), ([], String 2)] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42 Some string 1 [([1,2,3],'String 1'),([],'String 1')] +3 Some string 2 [([3,5,1],'String 2'),([],'String 2')] +244 Some string 3 [([],'String 3'),([],'String 3')] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42 Some string 1 [([1,2,3],'String 1'),([],'String 1')] +2 Some string 2 [([],'String 2'),([],'String 2')] +43 Some string 3 [([1,5,3],'String 3'),([],'String 3')] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +52 Some string 2 [([],'String 2'),([1],'String 2')] +24 Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +CustomSeparated +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +Template +column_1 Nullable(Float64) +column_2 Nullable(String) +column_3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] 
+column_1 Nullable(Float64) +column_2 Nullable(String) +column_3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +column_1 Nullable(Float64) +column_2 Nullable(String) +column_3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +42.42 Some string 1 [([1,2,3],'String 1'),([1],'String 1')] +42 Some string 2 [([],'String 2'),([],'String 2')] +\N Some string 3 [([1,2,3],'String 3'),([1],'String 3')] +MsgPack +c1 Nullable(Int64) +c2 Nullable(Int64) +c3 Nullable(Float32) +c4 Nullable(String) +c5 Array(Array(Nullable(Int64))) +c6 Map(Int64, Array(Nullable(Int64))) +\N 0 0 Str: 0 [[0,1],[0]] {0:[0,1]} +1 \N 1 Str: 1 [[1,2],[1]] {1:[1,2]} +\N 2 2 Str: 2 [[2,3],[2]] {2:[2,3]} diff --git a/tests/queries/0_stateless/02149_schema_inference.sh b/tests/queries/0_stateless/02149_schema_inference.sh new file mode 100755 index 00000000000..1ccec240627 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference.sh @@ -0,0 +1,251 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02149.data +DATA_FILE=${USER_FILES_PATH:?}/$FILE_NAME + +touch $DATA_FILE + +SCHEMADIR=$(clickhouse-client --query "select * from file('$FILE_NAME', 'Template', 'val1 char') settings format_template_row='nonexist'" 2>&1 | grep Exception | grep -oP "file \K.*(?=/nonexist)") + +echo "TSV" + +echo -e "42\tSome string\t[1, 2, 3, 4]\t(1, 2, 3) +42\tabcd\t[]\t(4, 5, 6)" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSV')" + +echo "TSVWithNames" + +echo -e "number\tstring\tarray\ttuple +42\tSome string\t[1, 2, 3, 4]\t(1, 2, 3) +42\tabcd\t[]\t(4, 5, 6)" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSVWithNames')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSVWithNames')" + +echo "CSV" + +echo -e "\N,\"Some string\",\"[([1, 2.3], 'String'), ([], NULL)]\",\"[1, NULL, 3]\" +42,\N,\"[([1, 2.3], 'String'), ([3.], 'abcd')]\",\"[4, 5, 6]\"" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo -e "42,\"String\" +\"String\",42" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo -e "\N,\"[NULL, NULL]\" +\N,[]" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSV')" + +echo "CSVWithNames" + +echo -e "a,b,c,d +\N,\"Some string\",\"[([1, 2.3], 'String'), ([], NULL)]\",\"[1, NULL, 3]\" +42,\N,\"[([1, 2.3], 'String'), ([3.], 'abcd')]\",\"[4, 5, 6]\"" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CSVWithNames')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CSVWithNames')" + +echo "JSONCompactEachRow" + +echo -e "[42.42, [[1, \"String\"], [2, \"abcd\"]], {\"key\" : 42, \"key2\" : 24}, true]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONCompactEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 
'JSONCompactEachRow')" + +echo -e "[null, [[1, \"String\"], [2, null]], {\"key\" : null, \"key2\" : 24}, null] +[32, [[2, \"String 2\"], [3, \"hello\"]], {\"key3\" : 4242, \"key4\" : 2424}, true]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONCompactEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONCompactEachRow')" + +echo "JSONCompactEachRowWithNames" + +echo -e "[\"a\", \"b\", \"c\", \"d\"] +[42.42, [[1, \"String\"], [2, \"abcd\"]], {\"key\" : 42, \"key2\" : 24}, true]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONCompactEachRowWithNames')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONCompactEachRowWithNames')" + + +echo "JSONEachRow" +echo -e '{"a" : 42.42, "b" : [[1, "String"], [2, "abcd"]], "c" : {"key" : 42, "key2" : 24}, "d" : true}' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')" + +echo -e '{"a" : null, "b" : [[1, "String"], [2, null]], "c" : {"key" : null, "key2" : 24}, "d" : null} +{"a" : 32, "b" : [[2, "String 2"], [3, "hello"]], "c" : {"key3" : 4242, "key4" : 2424}, "d" : true}' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')" + +echo -e '{"a" : 1, "b" : "s1", "c" : null} +{"c" : [2], "a" : 2, "b" : null} +{} +{"a" : null} +{"c" : [3], "a" : null}' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')" + + +echo "TSKV" + +echo -e 'a=1\tb=s1\tc=\N +c=[2]\ta=2\tb=\N} + +a=\N +c=[3]\ta=\N' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSKV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSKV')" + + +echo "Values" + +echo -e "(42.42, 'Some string', [1, 2, 3], (1, '2'), ([1, 2], [(3, '4'), (5, '6')]))" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Values')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Values')" + +echo -e "(42.42, NULL, [1, NULL, 3], (1, NULL), ([1, 2], [(3, '4'), (5, '6')])), (NULL, 'Some string', [10], (1, 2), ([], []))" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Values')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Values')" + + +echo "Regexp" + +REGEXP="^Line: value_1=(.+?), value_2=(.+?), value_3=(.+?)" + +echo "Line: value_1=42, value_2=Some string 1, value_3=[([1, 2, 3], String 1), ([], String 1)] +Line: value_1=2, value_2=Some string 2, value_3=[([4, 5, 6], String 2), ([], String 2)] +Line: value_1=312, value_2=Some string 3, value_3=[([1, 2, 3], String 2), ([], String 2)]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='Escaped'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='Escaped'" + + +echo "Line: value_1=42, value_2=\"Some string 1\", value_3=\"[([1, 2, 3], 'String 1'), ([], 'String 1')]\" +Line: value_1=3, value_2=\"Some string 2\", value_3=\"[([3, 5, 1], 'String 2'), ([], 'String 2')]\" +Line: value_1=244, value_2=\"Some string 3\", value_3=\"[([], 'String 3'), ([], 'String 3')]\"" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='CSV'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', 
format_regexp_escaping_rule='CSV'" + + +echo "Line: value_1=42, value_2='Some string 1', value_3=[([1, 2, 3], 'String 1'), ([], 'String 1')] +Line: value_1=2, value_2='Some string 2', value_3=[([], 'String 2'), ([], 'String 2')] +Line: value_1=43, value_2='Some string 3', value_3=[([1, 5, 3], 'String 3'), ([], 'String 3')]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='Quoted'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='Quoted'" + + +echo "Line: value_1=42, value_2=\"Some string 1\", value_3=[[[1, 2, 3], \"String 1\"], [[1], \"String 1\"]] +Line: value_1=52, value_2=\"Some string 2\", value_3=[[[], \"String 2\"], [[1], \"String 2\"]] +Line: value_1=24, value_2=\"Some string 3\", value_3=[[[1, 2, 3], \"String 3\"], [[1], \"String 3\"]]" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='JSON'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Regexp') settings format_regexp='$REGEXP', format_regexp_escaping_rule='JSON'" + + +echo "CustomSeparated" + +CUSTOM_SETTINGS="SETTINGS format_custom_row_before_delimiter='', format_custom_row_after_delimiter='\n', format_custom_row_between_delimiter='\n', format_custom_result_before_delimiter='\n', format_custom_result_after_delimiter='\n', format_custom_field_delimiter=''" + +echo -e " +42.42\"Some string 1\"\"[([1, 2, 3], 'String 1'), ([1], 'String 1')]\" + +42\"Some string 2\"\"[([], 'String 2'), ([], 'String 2')]\" + +\N\"Some string 3\"\"[([1, 2, 3], 'String 3'), ([1], 'String 3')]\" +" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='CSV'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='CSV'" + +echo -e " +42.42'Some string 1'[([1, 2, 3], 'String 1'), ([1], 'String 1')] + +42'Some string 2'[([], 'String 2'), ([], 'String 2')] + +NULL'Some string 3'[([1, 2, 3], 'String 3'), ([1], 'String 3')] +" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='Quoted'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='Quoted'" + +echo -e " +42.42\"Some string 1\"[[[1, 2, 3], \"String 1\"], [[1], \"String 1\"]] + +42\"Some string 2\"[[[], \"String 2\"], [[], \"String 2\"]] + +null\"Some string 3\"[[[1, 2, 3], \"String 3\"], [[1], \"String 3\"]] +" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='JSON'" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'CustomSeparated') $CUSTOM_SETTINGS, format_custom_escaping_rule='JSON'" + + +echo "Template" + +echo -e " +\${data}" > $SCHEMADIR/resultset_format_02149 + +echo -e "\${column_1:CSV}\${column_2:CSV}\${column_3:CSV}" > $SCHEMADIR/row_format_02149 + +TEMPLATE_SETTINGS="SETTINGS format_template_rows_between_delimiter='\n', format_template_row='row_format_02149', format_template_resultset='resultset_format_02149'" + +echo -e " +42.42\"Some string 1\"\"[([1, 2, 3], 'String 1'), ([1], 'String 1')]\" + +42\"Some string 2\"\"[([], 'String 2'), ([], 'String 2')]\" + +\N\"Some string 3\"\"[([1, 2, 3], 'String 3'), ([1], 'String 3')]\" +" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q 
"desc file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS" + +echo -e "\${column_1:Quoted}\${column_2:Quoted}\${column_3:Quoted}" > $SCHEMADIR/row_format_02149 + +echo -e " +42.42'Some string 1'[([1, 2, 3], 'String 1'), ([1], 'String 1')] + +42'Some string 2'[([], 'String 2'), ([], 'String 2')] + +NULL'Some string 3'[([1, 2, 3], 'String 3'), ([1], 'String 3')] +" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS" + +echo -e "\${column_1:JSON}\${column_2:JSON}\${column_3:JSON}" > $SCHEMADIR/row_format_02149 + +echo -e " +42.42\"Some string 1\"[[[1, 2, 3], \"String 1\"], [[1], \"String 1\"]] + +42\"Some string 2\"[[[], \"String 2\"], [[], \"String 2\"]] + +null\"Some string 3\"[[[1, 2, 3], \"String 3\"], [[1], \"String 3\"]] +" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Template') $TEMPLATE_SETTINGS" + + +echo "MsgPack" + +$CLICKHOUSE_CLIENT -q "select toInt32(number % 2 ? number : NULL) as int, toUInt64(number % 2 ? NULL : number) as uint, toFloat32(number) as float, concat('Str: ', toString(number)) as str, [[number, number + 1], [number]] as arr, map(number, [number, number + 1]) as map from numbers(3) format MsgPack" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'MsgPack') settings input_format_msgpack_number_of_columns=6" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'MsgPack') settings input_format_msgpack_number_of_columns=6" + + +rm $SCHEMADIR/resultset_format_02149 $SCHEMADIR/row_format_02149 +rm $DATA_FILE + diff --git a/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.reference b/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.reference new file mode 100644 index 00000000000..dae12318ce0 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.reference @@ -0,0 +1,40 @@ +0 Str: 0 [0,1] +1 Str: 1 [1,2] +2 Str: 2 [2,3] +3 Str: 3 [3,4] +4 Str: 4 [4,5] +5 Str: 5 [5,6] +6 Str: 6 [6,7] +7 Str: 7 [7,8] +8 Str: 8 [8,9] +9 Str: 9 [9,10] +0 0 [0,1] +1 1 [1,2] +2 2 [2,3] +3 3 [3,4] +4 4 [4,5] +5 5 [5,6] +6 6 [6,7] +7 7 [7,8] +8 8 [8,9] +9 9 [9,10] +0 0 [0,1] +1 1 [1,2] +2 2 [2,3] +3 3 [3,4] +4 4 [4,5] +5 5 [5,6] +6 6 [6,7] +7 7 [7,8] +8 8 [8,9] +9 9 [9,10] +0 0 [0,1] +1 1 [1,2] +2 2 [2,3] +3 3 [3,4] +4 4 [4,5] +5 5 [5,6] +6 6 [6,7] +7 7 [7,8] +8 8 [8,9] +9 9 [9,10] diff --git a/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.sh b/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.sh new file mode 100755 index 00000000000..f00f2531dd0 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference_create_table_syntax.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +mkdir $USER_FILES_PATH/test_02149 +FILE_NAME=test_02149/data.Parquet +DATA_FILE=$USER_FILES_PATH/$FILE_NAME + +$CLICKHOUSE_CLIENT -q "select number as num, concat('Str: ', toString(number)) as str, [number, number + 1] as arr from numbers(10) format Parquet" > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "drop table if exists test_02149" +$CLICKHOUSE_CLIENT -q "create table test_02149 engine=File('Parquet', '$FILE_NAME')" +$CLICKHOUSE_CLIENT -q "select * from test_02149" +$CLICKHOUSE_CLIENT -q "drop table test_02149" + +$CLICKHOUSE_CLIENT -q "create table test_02149 (x UInt32, s String, a Array(UInt32)) engine=Memory" +$CLICKHOUSE_CLIENT -q "insert into test_02149 select number, toString(number), [number, number + 1] from numbers(10)" + +$CLICKHOUSE_CLIENT -q "drop table if exists test_merge" +$CLICKHOUSE_CLIENT -q "create table test_merge engine=Merge(currentDatabase(), 'test_02149')" +$CLICKHOUSE_CLIENT -q "select * from test_merge" +$CLICKHOUSE_CLIENT -q "drop table test_merge" + +$CLICKHOUSE_CLIENT -q "drop table if exists test_distributed" +$CLICKHOUSE_CLIENT -q "create table test_distributed engine=Distributed(test_shard_localhost, currentDatabase(), 'test_02149')" +$CLICKHOUSE_CLIENT -q "select * from test_distributed" +$CLICKHOUSE_CLIENT -q "drop table test_distributed" + +$CLICKHOUSE_CLIENT -q "drop table if exists test_buffer" +$CLICKHOUSE_CLIENT -q "create table test_buffer engine=Buffer(currentDatabase(), 'test_02149', 16, 10, 100, 10000, 1000000, 10000000, 100000000)" +$CLICKHOUSE_CLIENT -q "select * from test_buffer" +$CLICKHOUSE_CLIENT -q "drop table test_buffer" + +rm -rf ${USER_FILES_PATH:?}/test_02149 + diff --git a/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.reference b/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.reference new file mode 100644 index 00000000000..d3d2d86d696 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.reference @@ -0,0 +1,435 @@ +Arrow +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date UInt16 +date32 Date32 +0 1970-01-01 +1 1970-01-02 +str String +fixed_string String +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(`tuple.0` UInt64, `tuple.1` String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64))) +nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +ArrowStream +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date UInt16 +date32 Date32 +0 1970-01-01 +1 1970-01-02 +str String 
+fixed_string String +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(`tuple.0` UInt64, `tuple.1` String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64))) +nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +Parquet +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 Int64 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date UInt16 +date32 Date32 +0 1970-01-01 +1 1970-01-02 +str String +fixed_string String +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(`tuple.0` UInt64, `tuple.1` String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(`nested1.0` Array(UInt64), `nested1.1` Map(String, UInt64))) +nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(UInt64)), `nested2.0.1` Map(UInt64, Array(Tuple(`nested2.0.1.0` UInt64, `nested2.0.1.1` String)))), `nested2.1` UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +ORC +int8 Int8 +uint8 Int8 +int16 Int16 +uint16 Int16 +int32 Int32 +uint32 Int32 +int64 Int64 +uint64 Int64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date32 +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string String +Str: 0 100 +Str: 1 200 +array Array(Int64) +tuple Tuple(`tuple.0` Int64, `tuple.1` String) +map Map(String, Int64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(`nested1.0` Array(Int64), `nested1.1` Map(String, Int64))) +nested2 Tuple(`nested2.0` Tuple(`nested2.0.0` Array(Array(Int64)), `nested2.0.1` Map(Int64, Array(Tuple(`nested2.0.1.0` Int64, `nested2.0.1.1` String)))), `nested2.1` Int8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +Native +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +TSVWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 
+uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +TSVRawWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +CSVWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +JSONCompactEachRowWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +JSONCompactStringsEachRowWithNamesAndTypes +int8 Int8 
+uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +RowBinaryWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +CustomSeparatedWithNamesAndTypes +int8 Int8 +uint8 UInt8 +int16 Int16 +uint16 UInt16 +int32 Int32 +uint32 UInt32 +int64 Int64 +uint64 UInt64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +decimal32 Decimal(9, 5) +decimal64 Decimal(18, 5) +0 0 0 0 +1.2 0.7692307692307692 3.33333 333.33333 +date Date +date32 Date32 +1970-01-01 1970-01-01 +1970-01-02 1970-01-02 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(UInt64) +tuple Tuple(UInt64, String) +map Map(String, UInt64) +[0,1] (0,'0') {'0':0} +[1,2] (1,'1') {'1':1} +nested1 Array(Tuple(Array(UInt64), Map(String, UInt64))) +nested2 Tuple(Tuple(Array(Array(UInt64)), Map(UInt64, Array(Tuple(UInt64, String)))), UInt8) +[([0,1],{'42':0}),([],{}),([42],{'42':42})] (([[0],[1],[]],{0:[(0,'42'),(1,'42')]}),42) +[([1,2],{'42':1}),([],{}),([42],{'42':42})] (([[1],[2],[]],{1:[(1,'42'),(2,'42')]}),42) +Avro +CustomSeparatedWithNamesAndTypes +int8 Int32 +uint8 Int32 +int16 Int32 +uint16 Int32 +int32 Int32 +uint32 Int32 +int64 Int64 +uint64 Int64 +0 0 0 0 0 0 0 0 +-1 1 -1 1 -1 1 -1 1 +float32 Float32 +float64 Float64 +0 0 +1.2 0.7692307692307692 +date Int32 +0 +1 +str String +fixed_string FixedString(3) +Str: 0 100 +Str: 1 200 +array Array(Int64) +nested Array(Array(Array(Int64))) +[0,1] [[[0],[1]]] +[1,2] [[[1],[2]]] diff --git a/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.sh b/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.sh new file mode 100755 index 00000000000..d263ef63681 --- /dev/null +++ b/tests/queries/0_stateless/02149_schema_inference_formats_with_schema.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh 
+. "$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02149.data +DATA_FILE=$USER_FILES_PATH/$FILE_NAME + +for format in Arrow ArrowStream Parquet ORC Native TSVWithNamesAndTypes TSVRawWithNamesAndTypes CSVWithNamesAndTypes JSONCompactEachRowWithNamesAndTypes JSONCompactStringsEachRowWithNamesAndTypes RowBinaryWithNamesAndTypes CustomSeparatedWithNamesAndTypes +do + echo $format + $CLICKHOUSE_CLIENT -q "select toInt8(-number) as int8, toUInt8(number) as uint8, toInt16(-number) as int16, toUInt16(number) as uint16, toInt32(-number) as int32, toUInt32(number) as uint32, toInt64(-number) as int64, toUInt64(number) as uint64 from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" + + $CLICKHOUSE_CLIENT -q "select toFloat32(number * 1.2) as float32, toFloat64(number / 1.3) as float64, toDecimal32(number / 0.3, 5) as decimal32, toDecimal64(number / 0.003, 5) as decimal64 from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" + + $CLICKHOUSE_CLIENT -q "select toDate(number) as date, toDate32(number) as date32 from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" + + $CLICKHOUSE_CLIENT -q "select concat('Str: ', toString(number)) as str, toFixedString(toString((number + 1) * 100 % 1000), 3) as fixed_string from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" + + $CLICKHOUSE_CLIENT -q "select [number, number + 1] as array, (number, toString(number)) as tuple, map(toString(number), number) as map from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" + + $CLICKHOUSE_CLIENT -q "select [([number, number + 1], map('42', number)), ([], map()), ([42], map('42', 42))] as nested1, (([[number], [number + 1], []], map(number, [(number, '42'), (number + 1, '42')])), 42) as nested2 from numbers(2) format $format" > $DATA_FILE + $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', '$format')" + $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', '$format')" +done + +echo "Avro" + +echo $format +$CLICKHOUSE_CLIENT -q "select toInt8(-number) as int8, toUInt8(number) as uint8, toInt16(-number) as int16, toUInt16(number) as uint16, toInt32(-number) as int32, toUInt32(number) as uint32, toInt64(-number) as int64, toUInt64(number) as uint64 from numbers(2) format Avro" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Avro')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Avro')" + +$CLICKHOUSE_CLIENT -q "select toFloat32(number * 1.2) as float32, toFloat64(number / 1.3) as float64 from numbers(2) format Avro" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Avro')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Avro')" + +$CLICKHOUSE_CLIENT -q "select toDate(number) as date from numbers(2) format Avro" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Avro')" +$CLICKHOUSE_CLIENT -q 
"select * from file('$FILE_NAME', 'Avro')" + +$CLICKHOUSE_CLIENT -q "select concat('Str: ', toString(number)) as str, toFixedString(toString((number + 1) * 100 % 1000), 3) as fixed_string from numbers(2) format Avro" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Avro')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Avro')" + +$CLICKHOUSE_CLIENT -q "select [number, number + 1] as array, [[[number], [number + 1]]] as nested from numbers(2) format Avro" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'Avro')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'Avro')" + +rm $DATA_FILE + diff --git a/tests/queries/0_stateless/02151_clickhouse_client_hints.sh b/tests/queries/0_stateless/02151_clickhouse_client_hints.sh index 3e6c6cb16a5..7221acc2504 100755 --- a/tests/queries/0_stateless/02151_clickhouse_client_hints.sh +++ b/tests/queries/0_stateless/02151_clickhouse_client_hints.sh @@ -5,4 +5,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . "$CURDIR"/../shell_config.sh -$CLICKHOUSE_CLIENT --hardware_utilization 2>&1 | grep -q "Code: 552. DB::Exception: Unrecognized option '--hardware_utilization'. Maybe you meant \['--hardware-utilization'\]. (UNRECOGNIZED_ARGUMENTS)" && echo 'OK' || echo 'FAIL' ||: +$CLICKHOUSE_CLIENT --secuer 2>&1 | grep -q "Code: 552. DB::Exception: Unrecognized option '--secuer'. Maybe you meant \['--secure'\]. (UNRECOGNIZED_ARGUMENTS)" && echo 'OK' || echo 'FAIL' ||: diff --git a/tests/queries/0_stateless/02154_parser_backtracking.reference b/tests/queries/0_stateless/02154_parser_backtracking.reference new file mode 100644 index 00000000000..23751ef6c1f --- /dev/null +++ b/tests/queries/0_stateless/02154_parser_backtracking.reference @@ -0,0 +1,14 @@ +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 diff --git a/tests/queries/0_stateless/02154_parser_backtracking.sh b/tests/queries/0_stateless/02154_parser_backtracking.sh new file mode 100755 index 00000000000..af032008069 --- /dev/null +++ b/tests/queries/0_stateless/02154_parser_backtracking.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# Should finish in reasonable time (milliseconds). +# In previous versions this query led to exponential backtracking. 
+ +echo 'SELECT '"$(perl -e 'print "CAST(" x 100')"'a b c'"$(perl -e 'print ")" x 100')" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'Syntax error' +echo 'SELECT '"$(perl -e 'print "CAST(" x 100')"'a, b'"$(perl -e 'print ")" x 100')" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'Syntax error' +echo 'SELECT '"$(perl -e 'print "CAST(" x 100')"'a AS b'"$(perl -e 'print ")" x 100')" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'Syntax error' +echo 'SELECT '"$(perl -e 'print "CAST(" x 100')"'1'"$(perl -e 'print ", '"'UInt8'"')" x 100')" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 +echo 'SELECT '"$(perl -e 'print "CAST(" x 100')"'1'"$(perl -e 'print " AS UInt8)" x 100')" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 + +echo "SELECT fo,22222?LUTAY(SELECT(NOT CAUTAY(SELECT(NOT CAST(NOTT(NOT CAST(NOT NOT LEfT(NOT coARRAYlumnsFLuTAY(SELECT(NO0?LUTAY(SELECT(NOT CAUTAY(SELECT(NOT CAST(NOTT(NOT CAST(NOT NOT LEfT(NOT coARRAYlumnsFLuTAY(SELECT(NOTAYTAY(SELECT(NOTAYEFAULT(fo,22222?LUTAY(%SELECT(NOT CAST(NOT NOTAYTAY(SELECT(NOTAYEFAULT(fo,22222?LUTAY(SELECT(NOT CAST(NOT NOT (NOe)))))))))))))))))))))))))))))))))" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'Syntax error' +echo "SELECT position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(a b))))))))))))))))))))" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'Syntax error' +echo "SELECT position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(a, b))))))))))))))))))))" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'UNKNOWN_IDENTIFIER' +echo "SELECT position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(position(a, b, c))))))))))))))))))))" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'UNKNOWN_IDENTIFIER' + +echo 'SELECT '"$(perl -e 'print "position(" x 100')"'x'"$(perl -e 'print ")" x 100')" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'UNKNOWN_IDENTIFIER' +echo 'SELECT '"$(perl -e 'print "position(" x 100')"'x y'"$(perl -e 'print ")" x 100')" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'Syntax error' +echo 'SELECT '"$(perl -e 'print "position(" x 100')"'x IN y'"$(perl -e 'print ")" x 100')" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'UNKNOWN_IDENTIFIER' +echo 'SELECT '"$(perl -e 'print "position(" x 100')"'x'"$(perl -e 'print " IN x)" x 100')" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'UNKNOWN_IDENTIFIER' +echo 'SELECT '"$(perl -e 'print "position(" x 100')"'x'"$(perl -e 'print ", x)" x 100')" | ${CLICKHOUSE_LOCAL} --max_parser_depth 10000 2>&1 | grep -cF 'UNKNOWN_IDENTIFIER' diff --git a/tests/queries/0_stateless/02155_create_table_w_timezone.reference b/tests/queries/0_stateless/02155_create_table_w_timezone.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02155_create_table_w_timezone.sql b/tests/queries/0_stateless/02155_create_table_w_timezone.sql new file mode 100644 index 00000000000..0b72122ce39 --- /dev/null +++ b/tests/queries/0_stateless/02155_create_table_w_timezone.sql @@ -0,0 +1,8 @@ +create table t02155_t64_tz ( a DateTime64(9, 
America/Chicago)) Engine = Memory; -- { clientError 62 } +create table t02155_t_tz ( a DateTime(America/Chicago)) Engine = Memory; -- { clientError 62 } + +create table t02155_t64_tz ( a DateTime64(9, 'America/Chicago')) Engine = Memory; +create table t02155_t_tz ( a DateTime('America/Chicago')) Engine = Memory; + +drop table t02155_t64_tz; +drop table t02155_t_tz; diff --git a/tests/queries/0_stateless/02155_csv_with_strings_with_slash.reference b/tests/queries/0_stateless/02155_csv_with_strings_with_slash.reference new file mode 100644 index 00000000000..db750f36364 --- /dev/null +++ b/tests/queries/0_stateless/02155_csv_with_strings_with_slash.reference @@ -0,0 +1,62 @@ +input_format_null_as_default = 1 +0 \\asdf 2000-01-01 +1 x\\x\\ 2000-01-01 +2 x\\x 2000-01-01 +3 x\\ 2000-01-01 +4 x\\ 2000-01-01 +5 \\x 2000-01-01 +6 2000-01-01 +7 \\r\\n 2000-01-01 +8 \\\\r\\\\n 2000-01-01 +9 x\\\\ 2000-01-01 +10 \\asdf 2000-01-01 +11 x\\x\\ 2000-01-01 +12 x\\x 2000-01-01 +13 x\\ 2000-01-01 +14 x\\ 2000-01-01 +15 \\x 2000-01-01 +16 \\N 2000-01-01 +17 \\r\\n 2000-01-01 +18 \\\\r\\\\n 2000-01-01 +19 x\\\\ 2000-01-01 +20 \\asdf 2000-01-01 +21 x\\x\\ 2000-01-01 +22 x\\x 2000-01-01 +23 x\\ 2000-01-01 +24 x\\ 2000-01-01 +25 \\x 2000-01-01 +26 \\N 2000-01-01 +27 \\r\\n 2000-01-01 +28 \\\\r\\\\n 2000-01-01 +29 x\\\\ 2000-01-01 +input_format_null_as_default = 0 +0 \\asdf 2000-01-01 +1 x\\x\\ 2000-01-01 +2 x\\x 2000-01-01 +3 x\\ 2000-01-01 +4 x\\ 2000-01-01 +5 \\x 2000-01-01 +6 \\N 2000-01-01 +7 \\r\\n 2000-01-01 +8 \\\\r\\\\n 2000-01-01 +9 x\\\\ 2000-01-01 +10 \\asdf 2000-01-01 +11 x\\x\\ 2000-01-01 +12 x\\x 2000-01-01 +13 x\\ 2000-01-01 +14 x\\ 2000-01-01 +15 \\x 2000-01-01 +16 \\N 2000-01-01 +17 \\r\\n 2000-01-01 +18 \\\\r\\\\n 2000-01-01 +19 x\\\\ 2000-01-01 +20 \\asdf 2000-01-01 +21 x\\x\\ 2000-01-01 +22 x\\x 2000-01-01 +23 x\\ 2000-01-01 +24 x\\ 2000-01-01 +25 \\x 2000-01-01 +26 \\N 2000-01-01 +27 \\r\\n 2000-01-01 +28 \\\\r\\\\n 2000-01-01 +29 x\\\\ 2000-01-01 diff --git a/tests/queries/0_stateless/02155_csv_with_strings_with_slash.sh b/tests/queries/0_stateless/02155_csv_with_strings_with_slash.sh new file mode 100755 index 00000000000..ab2577e6138 --- /dev/null +++ b/tests/queries/0_stateless/02155_csv_with_strings_with_slash.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test_02155_csv" + +${CLICKHOUSE_CLIENT} --query="create table test_02155_csv (A Int64, S String, D Date) Engine=Memory;" + + +echo "input_format_null_as_default = 1" +cat $CUR_DIR/data_csv/csv_with_slash.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_02155_csv FORMAT CSV SETTINGS input_format_null_as_default = 1" +${CLICKHOUSE_CLIENT} --query="SELECT * FROM test_02155_csv" + +${CLICKHOUSE_CLIENT} --query="TRUNCATE TABLE test_02155_csv" + +echo "input_format_null_as_default = 0" +cat $CUR_DIR/data_csv/csv_with_slash.csv | ${CLICKHOUSE_CLIENT} -q "INSERT INTO test_02155_csv FORMAT CSV SETTINGS input_format_null_as_default = 0" +${CLICKHOUSE_CLIENT} --query="SELECT * FROM test_02155_csv" + + +${CLICKHOUSE_CLIENT} --query="DROP TABLE test_02155_csv" + diff --git a/tests/queries/0_stateless/02155_dictionary_comment.reference b/tests/queries/0_stateless/02155_dictionary_comment.reference new file mode 100644 index 00000000000..69b871a6925 --- /dev/null +++ b/tests/queries/0_stateless/02155_dictionary_comment.reference @@ -0,0 +1,11 @@ +02155_test_dictionary +02155_test_dictionary 02155_test_dictionary_comment_0 +02155_test_dictionary 02155_test_dictionary_comment_0 +0 Value +02155_test_dictionary 02155_test_dictionary_comment_0 +02155_test_dictionary 02155_test_dictionary_comment_0 +02155_test_dictionary 02155_test_dictionary_comment_1 +02155_test_dictionary 02155_test_dictionary_comment_1 +0 Value +02155_test_dictionary_view 02155_test_dictionary_view_comment_0 +02155_test_dictionary_view 02155_test_dictionary_view_comment_0 diff --git a/tests/queries/0_stateless/02155_dictionary_comment.sql b/tests/queries/0_stateless/02155_dictionary_comment.sql new file mode 100644 index 00000000000..e31d9d28366 --- /dev/null +++ b/tests/queries/0_stateless/02155_dictionary_comment.sql @@ -0,0 +1,53 @@ +DROP TABLE IF EXISTS 02155_test_table; +CREATE TABLE 02155_test_table +( + id UInt64, + value String +) ENGINE=TinyLog; + +INSERT INTO 02155_test_table VALUES (0, 'Value'); + +DROP DICTIONARY IF EXISTS 02155_test_dictionary; +CREATE DICTIONARY 02155_test_dictionary +( + id UInt64, + value String +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE '02155_test_table')) +LAYOUT(DIRECT()); + +SELECT name, comment FROM system.dictionaries WHERE name == '02155_test_dictionary' AND database == currentDatabase(); + +ALTER TABLE 02155_test_dictionary COMMENT COLUMN value 'value_column'; --{serverError 48} + +ALTER TABLE 02155_test_dictionary MODIFY COMMENT '02155_test_dictionary_comment_0'; +SELECT name, comment FROM system.dictionaries WHERE name == '02155_test_dictionary' AND database == currentDatabase(); +SELECT name, comment FROM system.tables WHERE name == '02155_test_dictionary' AND database == currentDatabase(); + +SELECT * FROM 02155_test_dictionary; +SELECT name, comment FROM system.dictionaries WHERE name == '02155_test_dictionary' AND database == currentDatabase(); +SELECT name, comment FROM system.tables WHERE name == '02155_test_dictionary' AND database == currentDatabase(); + +ALTER TABLE 02155_test_dictionary MODIFY COMMENT '02155_test_dictionary_comment_1'; +SELECT name, comment FROM system.dictionaries WHERE name == '02155_test_dictionary' AND database == currentDatabase(); +SELECT name, comment FROM system.tables WHERE name == '02155_test_dictionary' AND database == currentDatabase(); + +DROP TABLE IF EXISTS 02155_test_dictionary_view; +CREATE TABLE 02155_test_dictionary_view +( + id UInt64, + value String +) 
ENGINE=Dictionary(concat(currentDatabase(), '.02155_test_dictionary')); + +SELECT * FROM 02155_test_dictionary_view; + +ALTER TABLE 02155_test_dictionary_view COMMENT COLUMN value 'value_column'; --{serverError 48} + +ALTER TABLE 02155_test_dictionary_view MODIFY COMMENT '02155_test_dictionary_view_comment_0'; +SELECT name, comment FROM system.tables WHERE name == '02155_test_dictionary_view' AND database == currentDatabase(); +SELECT name, comment FROM system.tables WHERE name == '02155_test_dictionary_view' AND database == currentDatabase(); + +DROP TABLE 02155_test_dictionary_view; +DROP TABLE 02155_test_table; +DROP DICTIONARY 02155_test_dictionary; diff --git a/tests/queries/0_stateless/02155_nested_lc_defalut_bug.reference b/tests/queries/0_stateless/02155_nested_lc_defalut_bug.reference new file mode 100644 index 00000000000..fe99b0a6585 --- /dev/null +++ b/tests/queries/0_stateless/02155_nested_lc_defalut_bug.reference @@ -0,0 +1 @@ +1 ['a','b'] [3,4] ['',''] diff --git a/tests/queries/0_stateless/02155_nested_lc_defalut_bug.sql b/tests/queries/0_stateless/02155_nested_lc_defalut_bug.sql new file mode 100644 index 00000000000..45cb9f96b95 --- /dev/null +++ b/tests/queries/0_stateless/02155_nested_lc_defalut_bug.sql @@ -0,0 +1,8 @@ +DROP TABLE IF EXISTS nested_test; +CREATE TABLE nested_test (x UInt32, `nest.col1` Array(String), `nest.col2` Array(Int8)) ENGINE = MergeTree ORDER BY x; + +ALTER TABLE nested_test ADD COLUMN `nest.col3` Array(LowCardinality(String)); +INSERT INTO nested_test (x, `nest.col1`, `nest.col2`) values (1, ['a', 'b'], [3, 4]); +SELECT * FROM nested_test; + +DROP TABLE IF EXISTS nested_test; diff --git a/tests/queries/0_stateless/02155_parse_date_lowcard_default_throw.reference b/tests/queries/0_stateless/02155_parse_date_lowcard_default_throw.reference new file mode 100644 index 00000000000..e599dcc71e5 --- /dev/null +++ b/tests/queries/0_stateless/02155_parse_date_lowcard_default_throw.reference @@ -0,0 +1 @@ +2016-07-15 00:00:00 diff --git a/tests/queries/0_stateless/02155_parse_date_lowcard_default_throw.sql b/tests/queries/0_stateless/02155_parse_date_lowcard_default_throw.sql new file mode 100644 index 00000000000..703cf1fed7a --- /dev/null +++ b/tests/queries/0_stateless/02155_parse_date_lowcard_default_throw.sql @@ -0,0 +1 @@ +SELECT parseDateTimeBestEffort(toLowCardinality(materialize('15-JUL-16'))); diff --git a/tests/queries/0_stateless/02156_storage_merge_prewhere.reference b/tests/queries/0_stateless/02156_storage_merge_prewhere.reference new file mode 100644 index 00000000000..2dc83f1eaa5 --- /dev/null +++ b/tests/queries/0_stateless/02156_storage_merge_prewhere.reference @@ -0,0 +1,13 @@ +SELECT count() +FROM t_02156_merge1 +PREWHERE k = 3 +WHERE (k = 3) AND notEmpty(v) +2 +SELECT count() +FROM t_02156_merge2 +WHERE (k = 3) AND notEmpty(v) +2 +SELECT count() +FROM t_02156_merge3 +WHERE (k = 3) AND notEmpty(v) +2 diff --git a/tests/queries/0_stateless/02156_storage_merge_prewhere.sql b/tests/queries/0_stateless/02156_storage_merge_prewhere.sql new file mode 100644 index 00000000000..69fa9ac5ee2 --- /dev/null +++ b/tests/queries/0_stateless/02156_storage_merge_prewhere.sql @@ -0,0 +1,38 @@ +DROP TABLE IF EXISTS t_02156_mt1; +DROP TABLE IF EXISTS t_02156_mt2; +DROP TABLE IF EXISTS t_02156_log; +DROP TABLE IF EXISTS t_02156_dist; +DROP TABLE IF EXISTS t_02156_merge1; +DROP TABLE IF EXISTS t_02156_merge2; +DROP TABLE IF EXISTS t_02156_merge3; + +CREATE TABLE t_02156_mt1 (k UInt32, v String) ENGINE = MergeTree ORDER BY k; +CREATE TABLE t_02156_mt2 (k 
UInt32, v String) ENGINE = MergeTree ORDER BY k; +CREATE TABLE t_02156_log (k UInt32, v String) ENGINE = Log; + +CREATE TABLE t_02156_dist (k UInt32, v String) ENGINE = Distributed(test_shard_localhost, currentDatabase(), t_02156_mt1); + +CREATE TABLE t_02156_merge1 (k UInt32, v String) ENGINE = Merge(currentDatabase(), 't_02156_mt1|t_02156_mt2'); +CREATE TABLE t_02156_merge2 (k UInt32, v String) ENGINE = Merge(currentDatabase(), 't_02156_mt1|t_02156_log'); +CREATE TABLE t_02156_merge3 (k UInt32, v String) ENGINE = Merge(currentDatabase(), 't_02156_mt2|t_02156_dist'); + +INSERT INTO t_02156_mt1 SELECT number, toString(number) FROM numbers(10000); +INSERT INTO t_02156_mt2 SELECT number, toString(number) FROM numbers(10000); +INSERT INTO t_02156_log SELECT number, toString(number) FROM numbers(10000); + +EXPLAIN SYNTAX SELECT count() FROM t_02156_merge1 WHERE k = 3 AND notEmpty(v); +SELECT count() FROM t_02156_merge1 WHERE k = 3 AND notEmpty(v); + +EXPLAIN SYNTAX SELECT count() FROM t_02156_merge2 WHERE k = 3 AND notEmpty(v); +SELECT count() FROM t_02156_merge2 WHERE k = 3 AND notEmpty(v); + +EXPLAIN SYNTAX SELECT count() FROM t_02156_merge3 WHERE k = 3 AND notEmpty(v); +SELECT count() FROM t_02156_merge3 WHERE k = 3 AND notEmpty(v); + +DROP TABLE IF EXISTS t_02156_mt1; +DROP TABLE IF EXISTS t_02156_mt2; +DROP TABLE IF EXISTS t_02156_log; +DROP TABLE IF EXISTS t_02156_dist; +DROP TABLE IF EXISTS t_02156_merge1; +DROP TABLE IF EXISTS t_02156_merge2; +DROP TABLE IF EXISTS t_02156_merge3; diff --git a/tests/queries/0_stateless/02157_line_as_string_output_format.reference b/tests/queries/0_stateless/02157_line_as_string_output_format.reference new file mode 100644 index 00000000000..196aafcda30 --- /dev/null +++ b/tests/queries/0_stateless/02157_line_as_string_output_format.reference @@ -0,0 +1 @@ +Hello \ World diff --git a/tests/queries/0_stateless/02157_line_as_string_output_format.sql b/tests/queries/0_stateless/02157_line_as_string_output_format.sql new file mode 100644 index 00000000000..f1c567cf41d --- /dev/null +++ b/tests/queries/0_stateless/02157_line_as_string_output_format.sql @@ -0,0 +1 @@ +SELECT 'Hello \\ World' FORMAT LineAsString; diff --git a/tests/queries/0_stateless/02157_readonly_system_suspend.reference b/tests/queries/0_stateless/02157_readonly_system_suspend.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/02157_readonly_system_suspend.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/02157_readonly_system_suspend.sh b/tests/queries/0_stateless/02157_readonly_system_suspend.sh new file mode 100755 index 00000000000..77fe7b5f291 --- /dev/null +++ b/tests/queries/0_stateless/02157_readonly_system_suspend.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL=none + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT --readonly 1 --query "SYSTEM SUSPEND FOR 1 SECOND" 2>&1 | grep -c -F 'Code: 164' diff --git a/tests/queries/0_stateless/02158_contingency.reference b/tests/queries/0_stateless/02158_contingency.reference new file mode 100644 index 00000000000..ac475c7f204 --- /dev/null +++ b/tests/queries/0_stateless/02158_contingency.reference @@ -0,0 +1,5 @@ +0 0 -0 -0 0 +1 nan -1 -1 0.7 +0.95 0.95 -1 -1 0.23 +0.89 0.87 -0.7 -1 0.14 +0.95 0.89 -1 -0.89 0.23 diff --git a/tests/queries/0_stateless/02158_contingency.sql b/tests/queries/0_stateless/02158_contingency.sql new file mode 100644 index 00000000000..d1e1c76c066 --- /dev/null +++ b/tests/queries/0_stateless/02158_contingency.sql @@ -0,0 +1,5 @@ +SELECT round(cramersV(a, b), 2), round(cramersVBiasCorrected(a, b), 2), round(theilsU(a, b), 2), round(theilsU(b, a), 2), round(contingency(a, b), 2) FROM (SELECT number % 3 AS a, number % 5 AS b FROM numbers(150)); +SELECT round(cramersV(a, b), 2), round(cramersVBiasCorrected(a, b), 2), round(theilsU(a, b), 2), round(theilsU(b, a), 2), round(contingency(a, b), 2) FROM (SELECT number AS a, number + 1 AS b FROM numbers(150)); +SELECT round(cramersV(a, b), 2), round(cramersVBiasCorrected(a, b), 2), round(theilsU(a, b), 2), round(theilsU(b, a), 2), round(contingency(a, b), 2) FROM (SELECT number % 10 AS a, number % 10 AS b FROM numbers(150)); +SELECT round(cramersV(a, b), 2), round(cramersVBiasCorrected(a, b), 2), round(theilsU(a, b), 2), round(theilsU(b, a), 2), round(contingency(a, b), 2) FROM (SELECT number % 10 AS a, number % 5 AS b FROM numbers(150)); +SELECT round(cramersV(a, b), 2), round(cramersVBiasCorrected(a, b), 2), round(theilsU(a, b), 2), round(theilsU(b, a), 2), round(contingency(a, b), 2) FROM (SELECT number % 10 AS a, number % 10 = 0 ? 
number : a AS b FROM numbers(150)); diff --git a/tests/queries/0_stateless/02158_explain_ast_alter_commands.reference b/tests/queries/0_stateless/02158_explain_ast_alter_commands.reference new file mode 100644 index 00000000000..030d5a8f5af --- /dev/null +++ b/tests/queries/0_stateless/02158_explain_ast_alter_commands.reference @@ -0,0 +1,41 @@ + AlterCommand ADD_COLUMN (children 1) + AlterCommand DROP_COLUMN (children 1) + AlterCommand MODIFY_COLUMN (children 1) + AlterCommand COMMENT_COLUMN (children 2) + AlterCommand RENAME_COLUMN (children 2) + AlterCommand MATERIALIZE_COLUMN (children 1) + AlterCommand MODIFY_ORDER_BY (children 1) + AlterCommand MODIFY_SAMPLE_BY (children 1) + AlterCommand MODIFY_TTL (children 1) + AlterCommand MATERIALIZE_TTL (children 1) + AlterCommand MODIFY_SETTING (children 1) + AlterCommand RESET_SETTING + AlterCommand MODIFY_QUERY (children 1) + AlterCommand REMOVE_TTL + AlterCommand REMOVE_SAMPLE_BY + AlterCommand ADD_INDEX (children 1) + AlterCommand DROP_INDEX (children 1) + AlterCommand MATERIALIZE_INDEX (children 1) + AlterCommand ADD_CONSTRAINT (children 1) + AlterCommand DROP_CONSTRAINT (children 1) + AlterCommand ADD_PROJECTION (children 1) + AlterCommand DROP_PROJECTION (children 1) + AlterCommand MATERIALIZE_PROJECTION (children 1) + AlterCommand DROP_PARTITION (children 1) + AlterCommand DROP_PARTITION (children 1) + AlterCommand ATTACH_PARTITION (children 1) + AlterCommand ATTACH_PARTITION (children 1) + AlterCommand REPLACE_PARTITION (children 1) + AlterCommand REPLACE_PARTITION (children 1) + AlterCommand MOVE_PARTITION (children 1) + AlterCommand DROP_COLUMN (children 2) + AlterCommand FREEZE_ALL + AlterCommand FREEZE_PARTITION (children 1) + AlterCommand UNFREEZE_ALL + AlterCommand UNFREEZE_PARTITION (children 1) + AlterCommand FETCH_PARTITION (children 1) + AlterCommand FETCH_PARTITION (children 1) + AlterCommand UPDATE (children 2) + AlterCommand UPDATE (children 3) + AlterCommand DELETE (children 1) + AlterCommand DELETE (children 2) diff --git a/tests/queries/0_stateless/02158_explain_ast_alter_commands.sh b/tests/queries/0_stateless/02158_explain_ast_alter_commands.sh new file mode 100755 index 00000000000..8dfb61eedfb --- /dev/null +++ b/tests/queries/0_stateless/02158_explain_ast_alter_commands.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash + +CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL=none + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +echo "EXPLAIN AST ALTER TABLE t ADD COLUMN c Int8; +EXPLAIN AST ALTER TABLE t DROP COLUMN c; +EXPLAIN AST ALTER TABLE t MODIFY COLUMN c Int8; +EXPLAIN AST ALTER TABLE t COMMENT COLUMN c 'comment'; +EXPLAIN AST ALTER TABLE t RENAME COLUMN c TO d; +EXPLAIN AST ALTER TABLE t MATERIALIZE COLUMN c; +EXPLAIN AST ALTER TABLE t MODIFY ORDER BY x; +EXPLAIN AST ALTER TABLE t MODIFY SAMPLE BY y; +EXPLAIN AST ALTER TABLE t MODIFY TTL z + INTERVAL 7 DAY; +EXPLAIN AST ALTER TABLE t MATERIALIZE TTL IN PARTITION 'p'; +EXPLAIN AST ALTER TABLE t MODIFY SETTING index_granularity = 4096; +EXPLAIN AST ALTER TABLE t RESET SETTING index_granularity; +EXPLAIN AST ALTER TABLE t MODIFY QUERY SELECT 42; +EXPLAIN AST ALTER TABLE t REMOVE TTL; +EXPLAIN AST ALTER TABLE t REMOVE SAMPLE BY; +EXPLAIN AST ALTER TABLE t ADD INDEX i c TYPE minmax GRANULARITY 1; +EXPLAIN AST ALTER TABLE t DROP INDEX i; +EXPLAIN AST ALTER TABLE t MATERIALIZE INDEX i; +EXPLAIN AST ALTER TABLE t ADD CONSTRAINT x CHECK 1; +EXPLAIN AST ALTER TABLE t DROP CONSTRAINT x; +EXPLAIN AST ALTER TABLE t ADD PROJECTION p (SELECT c); +EXPLAIN AST ALTER TABLE t DROP PROJECTION p; +EXPLAIN AST ALTER TABLE t MATERIALIZE PROJECTION p; +EXPLAIN AST ALTER TABLE t DETACH PARTITION 'p'; +EXPLAIN AST ALTER TABLE t DROP PARTITION 'p'; +EXPLAIN AST ALTER TABLE t ATTACH PARTITION 'p'; +EXPLAIN AST ALTER TABLE t ATTACH PART 'p'; +EXPLAIN AST ALTER TABLE t ATTACH PARTITION 'p' FROM t2; +EXPLAIN AST ALTER TABLE t REPLACE PARTITION 'p' FROM t2; +EXPLAIN AST ALTER TABLE t MOVE PARTITION 'p' TO TABLE t2; +EXPLAIN AST ALTER TABLE t CLEAR COLUMN c IN PARTITION 'p'; +EXPLAIN AST ALTER TABLE t FREEZE; +EXPLAIN AST ALTER TABLE t FREEZE PARTITION 'p'; +EXPLAIN AST ALTER TABLE t UNFREEZE WITH NAME 'n'; +EXPLAIN AST ALTER TABLE t UNFREEZE PARTITION 'p' WITH NAME 'n'; +EXPLAIN AST ALTER TABLE t FETCH PARTITION 'p' FROM '/path'; +EXPLAIN AST ALTER TABLE t FETCH PART 'p' FROM '/path'; +EXPLAIN AST ALTER TABLE t UPDATE c = 1 WHERE 1; +EXPLAIN AST ALTER TABLE t UPDATE c = 1 IN PARTITION 'p' WHERE 1; +EXPLAIN AST ALTER TABLE t DELETE WHERE c = 1; +EXPLAIN AST ALTER TABLE t DELETE IN PARTITION 'p' WHERE c = 1;" | \ + $CLICKHOUSE_CLIENT --readonly 1 --multiquery 2>&1 | grep 'AlterCommand' diff --git a/tests/queries/0_stateless/02158_interval_length_sum.reference b/tests/queries/0_stateless/02158_interval_length_sum.reference new file mode 100644 index 00000000000..b4de3947675 --- /dev/null +++ b/tests/queries/0_stateless/02158_interval_length_sum.reference @@ -0,0 +1 @@ +11 diff --git a/tests/queries/0_stateless/02158_interval_length_sum.sql b/tests/queries/0_stateless/02158_interval_length_sum.sql new file mode 100644 index 00000000000..af22a707caf --- /dev/null +++ b/tests/queries/0_stateless/02158_interval_length_sum.sql @@ -0,0 +1 @@ +SELECT intervalLengthSum(x, y) FROM values('x Int64, y Int64', (0, 10), (5, 5), (5, 6), (1, -1)); diff --git a/tests/queries/0_stateless/02159_left_right.reference b/tests/queries/0_stateless/02159_left_right.reference new file mode 100644 index 00000000000..8d7ba1686a7 --- /dev/null +++ b/tests/queries/0_stateless/02159_left_right.reference @@ -0,0 +1,230 @@ +-- { echo } + +SELECT left('Hello', 3); +Hel +SELECT left('Hello', -3); +He +SELECT left('Hello', 5); +Hello +SELECT left('Hello', -5); + +SELECT left('Hello', 6); +Hello +SELECT left('Hello', -6); + +SELECT left('Hello', 0); + +SELECT left('Hello', NULL); +\N +SELECT left(materialize('Привет'), 4); +Пр +SELECT LEFT('Привет', -4); +Прив +SELECT 
left(toNullable('Привет'), 12); +Привет +SELECT lEFT('Привет', -12); + +SELECT left(materialize(toNullable('Привет')), 13); +Привет +SELECT left('Привет', -13); + +SELECT Left('Привет', 0); + +SELECT left('Привет', NULL); +\N +SELECT leftUTF8('Привет', 4); +Прив +SELECT leftUTF8('Привет', -4); +Пр +SELECT leftUTF8('Привет', 12); +Привет +SELECT leftUTF8('Привет', -12); + +SELECT leftUTF8('Привет', 13); +Привет +SELECT leftUTF8('Привет', -13); + +SELECT leftUTF8('Привет', 0); + +SELECT leftUTF8('Привет', NULL); +\N +SELECT left('Hello', number) FROM numbers(10); + +H +He +Hel +Hell +Hello +Hello +Hello +Hello +Hello +SELECT leftUTF8('Привет', number) FROM numbers(10); + +П +Пр +При +Прив +Приве +Привет +Привет +Привет +Привет +SELECT left('Hello', -number) FROM numbers(10); + +Hell +Hel +He +H + + + + + +SELECT leftUTF8('Привет', -number) FROM numbers(10); + +Приве +Прив +При +Пр +П + + + + +SELECT leftUTF8('Привет', number % 3 = 0 ? NULL : (number % 2 ? toInt64(number) : -number)) FROM numbers(10); +\N +П +Прив +\N +Пр +Приве +\N +Привет + +\N +SELECT leftUTF8(number < 5 ? 'Hello' : 'Привет', number % 3 = 0 ? NULL : (number % 2 ? toInt64(number) : -number)) FROM numbers(10); +\N +H +Hel +\N +H +Приве +\N +Привет + +\N +SELECT right('Hello', 3); +llo +SELECT right('Hello', -3); +lo +SELECT right('Hello', 5); +Hello +SELECT right('Hello', -5); + +SELECT right('Hello', 6); +Hello +SELECT right('Hello', -6); + +SELECT right('Hello', 0); + +SELECT right('Hello', NULL); +\N +SELECT RIGHT(materialize('Привет'), 4); +ет +SELECT right('Привет', -4); +ивет +SELECT Right(toNullable('Привет'), 12); +Привет +SELECT right('Привет', -12); + +SELECT rIGHT(materialize(toNullable('Привет')), 13); +Привет +SELECT right('Привет', -13); + +SELECT rIgHt('Привет', 0); + +SELECT RiGhT('Привет', NULL); +\N +SELECT rightUTF8('Привет', 4); +ивет +SELECT rightUTF8('Привет', -4); +ет +SELECT rightUTF8('Привет', 12); +Привет +SELECT rightUTF8('Привет', -12); + +SELECT rightUTF8('Привет', 13); +Привет +SELECT rightUTF8('Привет', -13); + +SELECT rightUTF8('Привет', 0); + +SELECT rightUTF8('Привет', NULL); +\N +SELECT right('Hello', number) FROM numbers(10); + +o +lo +llo +ello +Hello +Hello +Hello +Hello +Hello +SELECT rightUTF8('Привет', number) FROM numbers(10); + +т +ет +вет +ивет +ривет +Привет +Привет +Привет +Привет +SELECT right('Hello', -number) FROM numbers(10); + +ello +llo +lo +o + + + + + +SELECT rightUTF8('Привет', -number) FROM numbers(10); + +ривет +ивет +вет +ет +т + + + + +SELECT rightUTF8('Привет', number % 3 = 0 ? NULL : (number % 2 ? toInt64(number) : -number)) FROM numbers(10); +\N +т +ивет +\N +ет +ривет +\N +Привет + +\N +SELECT rightUTF8(number < 5 ? 'Hello' : 'Привет', number % 3 = 0 ? NULL : (number % 2 ? 
toInt64(number) : -number)) FROM numbers(10); +\N +o +llo +\N +o +ривет +\N +Привет + +\N diff --git a/tests/queries/0_stateless/02159_left_right.sql b/tests/queries/0_stateless/02159_left_right.sql new file mode 100644 index 00000000000..a45ca3db961 --- /dev/null +++ b/tests/queries/0_stateless/02159_left_right.sql @@ -0,0 +1,71 @@ +-- { echo } + +SELECT left('Hello', 3); +SELECT left('Hello', -3); +SELECT left('Hello', 5); +SELECT left('Hello', -5); +SELECT left('Hello', 6); +SELECT left('Hello', -6); +SELECT left('Hello', 0); +SELECT left('Hello', NULL); + +SELECT left(materialize('Привет'), 4); +SELECT LEFT('Привет', -4); +SELECT left(toNullable('Привет'), 12); +SELECT lEFT('Привет', -12); +SELECT left(materialize(toNullable('Привет')), 13); +SELECT left('Привет', -13); +SELECT Left('Привет', 0); +SELECT left('Привет', NULL); + +SELECT leftUTF8('Привет', 4); +SELECT leftUTF8('Привет', -4); +SELECT leftUTF8('Привет', 12); +SELECT leftUTF8('Привет', -12); +SELECT leftUTF8('Привет', 13); +SELECT leftUTF8('Привет', -13); +SELECT leftUTF8('Привет', 0); +SELECT leftUTF8('Привет', NULL); + +SELECT left('Hello', number) FROM numbers(10); +SELECT leftUTF8('Привет', number) FROM numbers(10); +SELECT left('Hello', -number) FROM numbers(10); +SELECT leftUTF8('Привет', -number) FROM numbers(10); + +SELECT leftUTF8('Привет', number % 3 = 0 ? NULL : (number % 2 ? toInt64(number) : -number)) FROM numbers(10); +SELECT leftUTF8(number < 5 ? 'Hello' : 'Привет', number % 3 = 0 ? NULL : (number % 2 ? toInt64(number) : -number)) FROM numbers(10); + +SELECT right('Hello', 3); +SELECT right('Hello', -3); +SELECT right('Hello', 5); +SELECT right('Hello', -5); +SELECT right('Hello', 6); +SELECT right('Hello', -6); +SELECT right('Hello', 0); +SELECT right('Hello', NULL); + +SELECT RIGHT(materialize('Привет'), 4); +SELECT right('Привет', -4); +SELECT Right(toNullable('Привет'), 12); +SELECT right('Привет', -12); +SELECT rIGHT(materialize(toNullable('Привет')), 13); +SELECT right('Привет', -13); +SELECT rIgHt('Привет', 0); +SELECT RiGhT('Привет', NULL); + +SELECT rightUTF8('Привет', 4); +SELECT rightUTF8('Привет', -4); +SELECT rightUTF8('Привет', 12); +SELECT rightUTF8('Привет', -12); +SELECT rightUTF8('Привет', 13); +SELECT rightUTF8('Привет', -13); +SELECT rightUTF8('Привет', 0); +SELECT rightUTF8('Привет', NULL); + +SELECT right('Hello', number) FROM numbers(10); +SELECT rightUTF8('Привет', number) FROM numbers(10); +SELECT right('Hello', -number) FROM numbers(10); +SELECT rightUTF8('Привет', -number) FROM numbers(10); + +SELECT rightUTF8('Привет', number % 3 = 0 ? NULL : (number % 2 ? toInt64(number) : -number)) FROM numbers(10); +SELECT rightUTF8(number < 5 ? 'Hello' : 'Привет', number % 3 = 0 ? NULL : (number % 2 ? 
toInt64(number) : -number)) FROM numbers(10); diff --git a/tests/queries/0_stateless/02160_h3_cell_area_m2.reference b/tests/queries/0_stateless/02160_h3_cell_area_m2.reference new file mode 100644 index 00000000000..e8727e05cf9 --- /dev/null +++ b/tests/queries/0_stateless/02160_h3_cell_area_m2.reference @@ -0,0 +1,16 @@ +4106166334463.9233 +666617118882.2277 +85294486110.07852 +12781831077.715292 +1730585103.2965515 +302748289.6422262 +30296673.089799587 +4984621.68910725 +644257.1047199412 +113498.17901913072 +16692.536464980716 +2335.8824226249617 +324.4496823479308 +48.63220901355471 +7.442732649761864 +0.5977527784258132 diff --git a/tests/queries/0_stateless/02160_h3_cell_area_m2.sql b/tests/queries/0_stateless/02160_h3_cell_area_m2.sql new file mode 100644 index 00000000000..55c6ef45542 --- /dev/null +++ b/tests/queries/0_stateless/02160_h3_cell_area_m2.sql @@ -0,0 +1,30 @@ +-- Tags: no-fasttest + +DROP TABLE IF EXISTS h3_indexes; + +CREATE TABLE h3_indexes (h3_index UInt64) ENGINE = Memory; + +-- Random geo coordinates were generated using the H3 tool: https://github.com/ClickHouse-Extras/h3/blob/master/src/apps/testapps/mkRandGeo.c at various resolutions from 0 to 15. +-- Corresponding H3 index values were in turn generated with those geo coordinates using `geoToH3(lon, lat, res)` ClickHouse function for the following test. + +INSERT INTO h3_indexes VALUES (579205133326352383); +INSERT INTO h3_indexes VALUES (581263419093549055); +INSERT INTO h3_indexes VALUES (589753847883235327); +INSERT INTO h3_indexes VALUES (594082350283882495); +INSERT INTO h3_indexes VALUES (598372386957426687); +INSERT INTO h3_indexes VALUES (599542359671177215); +INSERT INTO h3_indexes VALUES (604296355086598143); +INSERT INTO h3_indexes VALUES (608785214872748031); +INSERT INTO h3_indexes VALUES (615732192485572607); +INSERT INTO h3_indexes VALUES (617056794467368959); +INSERT INTO h3_indexes VALUES (624586477873168383); +INSERT INTO h3_indexes VALUES (627882919484481535); +INSERT INTO h3_indexes VALUES (634600058503392255); +INSERT INTO h3_indexes VALUES (635544851677385791); +INSERT INTO h3_indexes VALUES (639763125756281263); +INSERT INTO h3_indexes VALUES (644178757620501158); + + +SELECT h3CellAreaM2(h3_index) FROM h3_indexes ORDER BY h3_index; + +DROP TABLE h3_indexes; diff --git a/tests/queries/0_stateless/02160_h3_cell_area_rads2.reference b/tests/queries/0_stateless/02160_h3_cell_area_rads2.reference new file mode 100644 index 00000000000..d74c3f77f97 --- /dev/null +++ b/tests/queries/0_stateless/02160_h3_cell_area_rads2.reference @@ -0,0 +1,16 @@ +0.10116268528089567 +0.01642329421346843 +0.002101380838405832 +0.00031490306268786255 +0.000042636031250655976 +0.000007458740696242262 +7.464122383736096e-7 +1.2280498988731694e-7 +1.587241563444197e-8 +2.7962288004989136e-9 +4.112502211061015e-10 +5.754860352096175e-11 +7.99339296836726e-12 +1.1981406631437076e-12 +1.8336491007639705e-13 +1.4726699133479243e-14 diff --git a/tests/queries/0_stateless/02160_h3_cell_area_rads2.sql b/tests/queries/0_stateless/02160_h3_cell_area_rads2.sql new file mode 100644 index 00000000000..038a0cabd50 --- /dev/null +++ b/tests/queries/0_stateless/02160_h3_cell_area_rads2.sql @@ -0,0 +1,30 @@ +-- Tags: no-fasttest + +DROP TABLE IF EXISTS h3_indexes; + +CREATE TABLE h3_indexes (h3_index UInt64) ENGINE = Memory; + +-- Random geo coordinates were generated using the H3 tool: https://github.com/ClickHouse-Extras/h3/blob/master/src/apps/testapps/mkRandGeo.c at various resolutions from 0 to 15. 
+-- Corresponding H3 index values were in turn generated with those geo coordinates using `geoToH3(lon, lat, res)` ClickHouse function for the following test. + +INSERT INTO h3_indexes VALUES (579205133326352383); +INSERT INTO h3_indexes VALUES (581263419093549055); +INSERT INTO h3_indexes VALUES (589753847883235327); +INSERT INTO h3_indexes VALUES (594082350283882495); +INSERT INTO h3_indexes VALUES (598372386957426687); +INSERT INTO h3_indexes VALUES (599542359671177215); +INSERT INTO h3_indexes VALUES (604296355086598143); +INSERT INTO h3_indexes VALUES (608785214872748031); +INSERT INTO h3_indexes VALUES (615732192485572607); +INSERT INTO h3_indexes VALUES (617056794467368959); +INSERT INTO h3_indexes VALUES (624586477873168383); +INSERT INTO h3_indexes VALUES (627882919484481535); +INSERT INTO h3_indexes VALUES (634600058503392255); +INSERT INTO h3_indexes VALUES (635544851677385791); +INSERT INTO h3_indexes VALUES (639763125756281263); +INSERT INTO h3_indexes VALUES (644178757620501158); + + +SELECT h3CellAreaRads2(h3_index) FROM h3_indexes ORDER BY h3_index; + +DROP TABLE h3_indexes; diff --git a/tests/queries/0_stateless/02160_h3_hex_area_Km2.reference b/tests/queries/0_stateless/02160_h3_hex_area_Km2.reference new file mode 100644 index 00000000000..4d33b49f257 --- /dev/null +++ b/tests/queries/0_stateless/02160_h3_hex_area_Km2.reference @@ -0,0 +1,16 @@ +4250546.848 +607220.9782 +86745.85403 +12392.26486 +1770.323552 +252.9033645 +36.1290521 +5.1612932 +0.7373276 +0.1053325 +0.0150475 +0.0021496 +0.0003071 +0.0000439 +0.0000063 +9e-7 diff --git a/tests/queries/0_stateless/02160_h3_hex_area_Km2.sql b/tests/queries/0_stateless/02160_h3_hex_area_Km2.sql new file mode 100644 index 00000000000..e6c73fa9bda --- /dev/null +++ b/tests/queries/0_stateless/02160_h3_hex_area_Km2.sql @@ -0,0 +1,18 @@ +-- Tags: no-fasttest + +SELECT h3HexAreaKm2(0); +SELECT h3HexAreaKm2(1); +SELECT h3HexAreaKm2(2); +SELECT h3HexAreaKm2(3); +SELECT h3HexAreaKm2(4); +SELECT h3HexAreaKm2(5); +SELECT h3HexAreaKm2(6); +SELECT h3HexAreaKm2(7); +SELECT h3HexAreaKm2(8); +SELECT h3HexAreaKm2(9); +SELECT h3HexAreaKm2(10); +SELECT h3HexAreaKm2(11); +SELECT h3HexAreaKm2(12); +SELECT h3HexAreaKm2(13); +SELECT h3HexAreaKm2(14); +SELECT h3HexAreaKm2(15); diff --git a/tests/queries/0_stateless/02160_h3_rads_to_degs_degs_to_rads.reference b/tests/queries/0_stateless/02160_h3_rads_to_degs_degs_to_rads.reference new file mode 100644 index 00000000000..3c26be9d9b2 --- /dev/null +++ b/tests/queries/0_stateless/02160_h3_rads_to_degs_degs_to_rads.reference @@ -0,0 +1,9 @@ +-360 +-180.6 +-180 +-1 +0 +1 +180 +180.5 +360 diff --git a/tests/queries/0_stateless/02160_h3_rads_to_degs_degs_to_rads.sql b/tests/queries/0_stateless/02160_h3_rads_to_degs_degs_to_rads.sql new file mode 100644 index 00000000000..b30fc68725b --- /dev/null +++ b/tests/queries/0_stateless/02160_h3_rads_to_degs_degs_to_rads.sql @@ -0,0 +1,21 @@ +-- Tags: no-fasttest + +DROP TABLE IF EXISTS h3_indexes; + + +CREATE TABLE h3_indexes (degrees Float64) ENGINE = Memory; + + +INSERT INTO h3_indexes VALUES (-1); +INSERT INTO h3_indexes VALUES (-180); +INSERT INTO h3_indexes VALUES (-180.6); +INSERT INTO h3_indexes VALUES (-360); +INSERT INTO h3_indexes VALUES (0); +INSERT INTO h3_indexes VALUES (1); +INSERT INTO h3_indexes VALUES (180); +INSERT INTO h3_indexes VALUES (180.5); +INSERT INTO h3_indexes VALUES (360); + +select h3RadsToDegs(h3DegsToRads(degrees)) from h3_indexes order by degrees; + +DROP TABLE h3_indexes; diff --git 
a/tests/queries/0_stateless/02160_monthname.reference b/tests/queries/0_stateless/02160_monthname.reference new file mode 100644 index 00000000000..a3386cb33c7 --- /dev/null +++ b/tests/queries/0_stateless/02160_monthname.reference @@ -0,0 +1,12 @@ +January January January +February February February +March March March +April April April +May May May +June June June +July July July +August August August +September September September +October October October +November November November +December December December diff --git a/tests/queries/0_stateless/02160_monthname.sql b/tests/queries/0_stateless/02160_monthname.sql new file mode 100644 index 00000000000..2c5bd5b576b --- /dev/null +++ b/tests/queries/0_stateless/02160_monthname.sql @@ -0,0 +1,71 @@ +WITH + toDate('2021-01-14') AS date_value, + toDateTime('2021-01-14 11:22:33') AS date_time_value, + toDateTime64('2021-01-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); + +WITH + toDate('2021-02-14') AS date_value, + toDateTime('2021-02-14 11:22:33') AS date_time_value, + toDateTime64('2021-02-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); + +WITH + toDate('2021-03-14') AS date_value, + toDateTime('2021-03-14 11:22:33') AS date_time_value, + toDateTime64('2021-03-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); + +WITH + toDate('2021-04-14') AS date_value, + toDateTime('2021-04-14 11:22:33') AS date_time_value, + toDateTime64('2021-04-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); + +WITH + toDate('2021-05-14') AS date_value, + toDateTime('2021-05-14 11:22:33') AS date_time_value, + toDateTime64('2021-05-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); + +WITH + toDate('2021-06-14') AS date_value, + toDateTime('2021-06-14 11:22:33') AS date_time_value, + toDateTime64('2021-06-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); + +WITH + toDate('2021-07-14') AS date_value, + toDateTime('2021-07-14 11:22:33') AS date_time_value, + toDateTime64('2021-07-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); + +WITH + toDate('2021-08-14') AS date_value, + toDateTime('2021-08-14 11:22:33') AS date_time_value, + toDateTime64('2021-08-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); + +WITH + toDate('2021-09-14') AS date_value, + toDateTime('2021-09-14 11:22:33') AS date_time_value, + toDateTime64('2021-09-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); + +WITH + toDate('2021-10-14') AS date_value, + toDateTime('2021-10-14 11:22:33') AS date_time_value, + toDateTime64('2021-10-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); + +WITH + toDate('2021-11-14') AS date_value, + toDateTime('2021-11-14 11:22:33') AS date_time_value, + toDateTime64('2021-11-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), 
monthName(date_time_64_value); + +WITH + toDate('2021-12-14') AS date_value, + toDateTime('2021-12-14 11:22:33') AS date_time_value, + toDateTime64('2021-12-14 11:22:33', 3) AS date_time_64_value +SELECT monthName(date_value), monthName(date_time_value), monthName(date_time_64_value); diff --git a/tests/queries/0_stateless/02160_special_functions.reference b/tests/queries/0_stateless/02160_special_functions.reference new file mode 100644 index 00000000000..3a1dcd88902 --- /dev/null +++ b/tests/queries/0_stateless/02160_special_functions.reference @@ -0,0 +1,36 @@ +1 +[] +1 +world +world +world +world +def +abc +bcde +abcdef +abcdef + abcdef +2022 +Hello +3 +3 +2023-01-01 +2023-01-01 +2023-01-01 +2023-01-01 +2023-01-01 +2023-01-01 +2023-01-01 +2023-01-01 +2021-01-01 +2021-01-01 +2021-01-01 +2021-01-01 +2021-01-01 +2021-01-01 +2021-01-01 +2021-01-01 +1 +1 +1 diff --git a/tests/queries/0_stateless/02160_special_functions.sql b/tests/queries/0_stateless/02160_special_functions.sql new file mode 100644 index 00000000000..6d18e7d0d25 --- /dev/null +++ b/tests/queries/0_stateless/02160_special_functions.sql @@ -0,0 +1,44 @@ +SELECT CAST(1 AS UInt8); +SELECT CAST([] AS Array(UInt8)); +SELECT CAST(1, 'UInt8'); + +SELECT SUBSTRING('Hello, world' FROM 8); +SELECT SUBSTRING('Hello, world' FROM 8 FOR 5); +SELECT SUBSTRING('Hello, world', 8); +SELECT SUBSTRING('Hello, world', 8, 5); + +SELECT TRIM(LEADING 'abc' FROM 'abcdef'); +SELECT TRIM(TRAILING 'def' FROM 'abcdef'); +SELECT TRIM(BOTH 'af' FROM 'abcdef'); +SELECT TRIM(' abcdef '); +SELECT LTRIM(' abcdef '); +SELECT RTRIM(' abcdef '); + +SELECT EXTRACT(YEAR FROM DATE '2022-01-01'); +SELECT EXTRACT('Hello, world', '^\w+'); + +SELECT POSITION('ll' IN 'Hello'); +SELECT POSITION('Hello', 'll'); + +SELECT DATE_ADD(YEAR, 1, DATE '2022-01-01'); +SELECT DATE_ADD(INTERVAL 1 YEAR, DATE '2022-01-01'); +SELECT DATEADD(YEAR, 1, DATE '2022-01-01'); +SELECT DATEADD(INTERVAL 1 YEAR, DATE '2022-01-01'); +SELECT TIMESTAMP_ADD(YEAR, 1, DATE '2022-01-01'); +SELECT TIMESTAMP_ADD(INTERVAL 1 YEAR, DATE '2022-01-01'); +SELECT TIMESTAMPADD(YEAR, 1, DATE '2022-01-01'); +SELECT TIMESTAMPADD(INTERVAL 1 YEAR, DATE '2022-01-01'); + +SELECT DATE_SUB(YEAR, 1, DATE '2022-01-01'); +SELECT DATE_SUB(DATE '2022-01-01', INTERVAL 1 YEAR); +SELECT DATESUB(YEAR, 1, DATE '2022-01-01'); +SELECT DATESUB(DATE '2022-01-01', INTERVAL 1 YEAR); +SELECT TIMESTAMP_SUB(YEAR, 1, DATE '2022-01-01'); +SELECT TIMESTAMP_SUB(DATE '2022-01-01', INTERVAL 1 YEAR); +SELECT TIMESTAMPSUB(YEAR, 1, DATE '2022-01-01'); +SELECT TIMESTAMPSUB(DATE '2022-01-01', INTERVAL 1 YEAR); + +SELECT DATE_DIFF(YEAR, DATE '2021-01-01', DATE '2022-01-01'); +SELECT DATEDIFF(YEAR, DATE '2021-01-01', DATE '2022-01-01'); + +SELECT EXISTS (SELECT 1); diff --git a/tests/queries/0_stateless/02160_untuple_exponential_growth.reference b/tests/queries/0_stateless/02160_untuple_exponential_growth.reference new file mode 100644 index 00000000000..6ed281c757a --- /dev/null +++ b/tests/queries/0_stateless/02160_untuple_exponential_growth.reference @@ -0,0 +1,2 @@ +1 +1 diff --git a/tests/queries/0_stateless/02160_untuple_exponential_growth.sh b/tests/queries/0_stateless/02160_untuple_exponential_growth.sh new file mode 100755 index 00000000000..9ec6594af69 --- /dev/null +++ b/tests/queries/0_stateless/02160_untuple_exponential_growth.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +# Should finish in reasonable time (milliseconds). +# In previous versions this query led to exponential complexity of query analysis. + +${CLICKHOUSE_LOCAL} --query "SELECT untuple(tuple(untuple((1, untuple((untuple(tuple(untuple(tuple(untuple((untuple((1, 1, 1, 1)), 1, 1, 1)))))), 1, 1))))))" 2>&1 | grep -cF 'TOO_BIG_AST' +${CLICKHOUSE_LOCAL} --query "SELECT untuple(tuple(untuple(tuple(untuple(tuple(untuple(tuple(untuple(tuple(untuple(tuple(untuple(tuple(untuple(tuple(untuple(tuple(untuple(tuple(untuple(tuple(untuple(tuple(untuple((1, 1, 1, 1, 1))))))))))))))))))))))))))" 2>&1 | grep -cF 'TOO_BIG_AST' diff --git a/tests/queries/0_stateless/02161_array_first_last.reference b/tests/queries/0_stateless/02161_array_first_last.reference new file mode 100644 index 00000000000..25734a6f01c --- /dev/null +++ b/tests/queries/0_stateless/02161_array_first_last.reference @@ -0,0 +1,18 @@ +ArrayFirst constant predicate +0 +0 +1 +0 +ArrayFirst non constant predicate +0 +2 +2 +ArrayLast constant predicate +0 +0 +3 +0 +ArrayLast non constant predicate +0 +3 +3 diff --git a/tests/queries/0_stateless/02161_array_first_last.sql b/tests/queries/0_stateless/02161_array_first_last.sql new file mode 100644 index 00000000000..f5be8cd26df --- /dev/null +++ b/tests/queries/0_stateless/02161_array_first_last.sql @@ -0,0 +1,21 @@ +SELECT 'ArrayFirst constant predicate'; +SELECT arrayFirst(x -> 1, emptyArrayUInt8()); +SELECT arrayFirst(x -> 0, emptyArrayUInt8()); +SELECT arrayFirst(x -> 1, [1, 2, 3]); +SELECT arrayFirst(x -> 0, [1, 2, 3]); + +SELECT 'ArrayFirst non constant predicate'; +SELECT arrayFirst(x -> x >= 2, emptyArrayUInt8()); +SELECT arrayFirst(x -> x >= 2, [1, 2, 3]); +SELECT arrayFirst(x -> x >= 2, materialize([1, 2, 3])); + +SELECT 'ArrayLast constant predicate'; +SELECT arrayLast(x -> 1, emptyArrayUInt8()); +SELECT arrayLast(x -> 0, emptyArrayUInt8()); +SELECT arrayLast(x -> 1, [1, 2, 3]); +SELECT arrayLast(x -> 0, [1, 2, 3]); + +SELECT 'ArrayLast non constant predicate'; +SELECT arrayLast(x -> x >= 2, emptyArrayUInt8()); +SELECT arrayLast(x -> x >= 2, [1, 2, 3]); +SELECT arrayLast(x -> x >= 2, materialize([1, 2, 3])); diff --git a/tests/queries/0_stateless/02162_array_first_last_index.reference b/tests/queries/0_stateless/02162_array_first_last_index.reference new file mode 100644 index 00000000000..24bd1442598 --- /dev/null +++ b/tests/queries/0_stateless/02162_array_first_last_index.reference @@ -0,0 +1,18 @@ +ArrayFirstIndex constant predicate +0 +0 +1 +0 +ArrayFirstIndex non constant predicate +0 +2 +2 +ArrayLastIndex constant predicate +0 +0 +3 +0 +ArrayLastIndex non constant predicate +0 +3 +3 diff --git a/tests/queries/0_stateless/02162_array_first_last_index.sql b/tests/queries/0_stateless/02162_array_first_last_index.sql new file mode 100644 index 00000000000..af107f0f4c9 --- /dev/null +++ b/tests/queries/0_stateless/02162_array_first_last_index.sql @@ -0,0 +1,21 @@ +SELECT 'ArrayFirstIndex constant predicate'; +SELECT arrayFirstIndex(x -> 1, emptyArrayUInt8()); +SELECT arrayFirstIndex(x -> 0, emptyArrayUInt8()); +SELECT arrayFirstIndex(x -> 1, [1, 2, 3]); +SELECT arrayFirstIndex(x -> 0, [1, 2, 3]); + +SELECT 'ArrayFirstIndex non constant predicate'; +SELECT arrayFirstIndex(x -> x >= 2, emptyArrayUInt8()); +SELECT arrayFirstIndex(x -> x >= 2, [1, 2, 3]); +SELECT arrayFirstIndex(x -> x >= 2, [1, 2, 3]); + +SELECT 'ArrayLastIndex constant predicate'; +SELECT arrayLastIndex(x -> 1, emptyArrayUInt8()); +SELECT arrayLastIndex(x -> 0, emptyArrayUInt8()); +SELECT 
arrayLastIndex(x -> 1, [1, 2, 3]); +SELECT arrayLastIndex(x -> 0, materialize([1, 2, 3])); + +SELECT 'ArrayLastIndex non constant predicate'; +SELECT arrayLastIndex(x -> x >= 2, emptyArrayUInt8()); +SELECT arrayLastIndex(x -> x >= 2, [1, 2, 3]); +SELECT arrayLastIndex(x -> x >= 2, materialize([1, 2, 3])); diff --git a/tests/queries/0_stateless/02162_range_hashed_dictionary_ddl_expression.reference b/tests/queries/0_stateless/02162_range_hashed_dictionary_ddl_expression.reference new file mode 100644 index 00000000000..d366ce64c27 --- /dev/null +++ b/tests/queries/0_stateless/02162_range_hashed_dictionary_ddl_expression.reference @@ -0,0 +1 @@ +0 1 1 Value 1 diff --git a/tests/queries/0_stateless/02162_range_hashed_dictionary_ddl_expression.sql b/tests/queries/0_stateless/02162_range_hashed_dictionary_ddl_expression.sql new file mode 100644 index 00000000000..24eb08137e1 --- /dev/null +++ b/tests/queries/0_stateless/02162_range_hashed_dictionary_ddl_expression.sql @@ -0,0 +1,29 @@ +DROP TABLE IF EXISTS 02162_test_table; +CREATE TABLE 02162_test_table +( + id UInt64, + value String, + range_value UInt64 +) ENGINE=TinyLog; + +INSERT INTO 02162_test_table VALUES (0, 'Value', 1); + +DROP DICTIONARY IF EXISTS 02162_test_dictionary; +CREATE DICTIONARY 02162_test_dictionary +( + id UInt64, + value String, + range_value UInt64, + start UInt64 EXPRESSION range_value, + end UInt64 EXPRESSION range_value +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE '02162_test_table')) +LAYOUT(RANGE_HASHED()) +RANGE(MIN start MAX end) +LIFETIME(0); + +SELECT * FROM 02162_test_dictionary; + +DROP DICTIONARY 02162_test_dictionary; +DROP TABLE 02162_test_table; diff --git a/tests/queries/0_stateless/02163_operators.reference b/tests/queries/0_stateless/02163_operators.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/02163_operators.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/02163_operators.sql b/tests/queries/0_stateless/02163_operators.sql new file mode 100644 index 00000000000..4968e448ab2 --- /dev/null +++ b/tests/queries/0_stateless/02163_operators.sql @@ -0,0 +1,2 @@ +WITH 2 AS `b.c`, [4, 5] AS a, 6 AS u, 3 AS v, 2 AS d, TRUE AS e, 1 AS f, 0 AS g, 2 AS h, 'Hello' AS i, 'World' AS j, TIMESTAMP '2022-02-02 02:02:02' AS w, [] AS k, (1, 2) AS l, 2 AS m, 3 AS n, [] AS o, [1] AS p, 1 AS q, q AS r, 1 AS s, 1 AS t +SELECT INTERVAL CASE CASE WHEN NOT -a[b.c] * u DIV v + d IS NOT NULL AND e OR f BETWEEN g AND h THEN i ELSE j END WHEN w THEN k END || [l, (m, n)] MINUTE IS NULL OR NOT o::Array(INT) = p <> q < r > s != t AS upyachka; diff --git a/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.expect b/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.expect new file mode 100755 index 00000000000..4f006b926bd --- /dev/null +++ b/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.expect @@ -0,0 +1,21 @@ +#!/usr/bin/expect -f + +log_user 0 +set timeout 20 +match_max 100000 + +expect_after { + eof { exp_continue } + timeout { exit 1 } +} + +set basedir [file dirname $argv0] +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_LOCAL --disable_suggestion" + +expect ":) " +send -- "insert into table function null() format TSV some trash here 123 \n 456\r" +expect -re ".*DB::Exception: Table function 'null' requires 'structure'.*\r" +expect ":) " + +send -- "" +expect eof diff --git a/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.reference 
b/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02164_materialized_view_support_virtual_column.reference b/tests/queries/0_stateless/02164_materialized_view_support_virtual_column.reference new file mode 100644 index 00000000000..00750edc07d --- /dev/null +++ b/tests/queries/0_stateless/02164_materialized_view_support_virtual_column.reference @@ -0,0 +1 @@ +3 diff --git a/tests/queries/0_stateless/02164_materialized_view_support_virtual_column.sql b/tests/queries/0_stateless/02164_materialized_view_support_virtual_column.sql new file mode 100644 index 00000000000..ad48a7507da --- /dev/null +++ b/tests/queries/0_stateless/02164_materialized_view_support_virtual_column.sql @@ -0,0 +1,9 @@ +DROP TABLE IF EXISTS test_tb; +CREATE TABLE test_tb (a UInt64, s String) ENGINE = MergeTree() ORDER BY a; + +DROP VIEW IF EXISTS test_view_tb; +CREATE MATERIALIZED VIEW test_view_tb ENGINE = MergeTree() ORDER BY a AS SELECT * FROM test_tb; + +INSERT INTO test_tb VALUES (1, '1'), (2, '2'), (3, '3'); + +SELECT count(_part) FROM test_view_tb; diff --git a/tests/queries/0_stateless/02165_auto_format_by_file_extension.reference b/tests/queries/0_stateless/02165_auto_format_by_file_extension.reference new file mode 100644 index 00000000000..ca3d2dd1d80 --- /dev/null +++ b/tests/queries/0_stateless/02165_auto_format_by_file_extension.reference @@ -0,0 +1,40 @@ +1 one +2 tow +1 one +2 tow +1 one +2 tow +1 one +2 tow +1 one +2 tow +1 one +2 tow +1 one +2 tow +{ + "meta": + [ + { + "name": "id", + "type": "UInt64" + }, + { + "name": "name", + "type": "String" + } + ], + + "data": + [ + { + "id": "1", + "name": "one" + }, + { + "id": "2", + "name": "tow" + } + ], + + "rows": 2, diff --git a/tests/queries/0_stateless/02165_auto_format_by_file_extension.sh b/tests/queries/0_stateless/02165_auto_format_by_file_extension.sh new file mode 100755 index 00000000000..d2e16d9ec0b --- /dev/null +++ b/tests/queries/0_stateless/02165_auto_format_by_file_extension.sh @@ -0,0 +1,72 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +set -e + +[ -e "${CLICKHOUSE_TMP}"/hello.csv ] && rm "${CLICKHOUSE_TMP}"/hello.csv +[ -e "${CLICKHOUSE_TMP}"/world.csv.gz ] && rm "${CLICKHOUSE_TMP}"/world.csv.gz +[ -e "${CLICKHOUSE_TMP}"/hello.world.csv ] && rm "${CLICKHOUSE_TMP}"/hello.world.csv +[ -e "${CLICKHOUSE_TMP}"/hello.world.csv.xz ] && rm "${CLICKHOUSE_TMP}"/hello.world.csv.xz +[ -e "${CLICKHOUSE_TMP}"/.htaccess.json ] && rm "${CLICKHOUSE_TMP}"/.htaccess.json +[ -e "${CLICKHOUSE_TMP}"/example.com. ] && rm "${CLICKHOUSE_TMP}"/example.com. 
+[ -e "${CLICKHOUSE_TMP}"/museum...protobuf ] && rm "${CLICKHOUSE_TMP}"/museum...protobuf + +${CLICKHOUSE_CLIENT} --query "DROP TABLE IF EXISTS 02165_out_tb;" +${CLICKHOUSE_CLIENT} --query "CREATE TABLE 02165_out_tb (id UInt64, name String) Engine=Memory;" +${CLICKHOUSE_CLIENT} --query "INSERT INTO 02165_out_tb Values(1, 'one'), (2, 'tow');" + +${CLICKHOUSE_CLIENT} --query "DROP TABLE IF EXISTS 02165_in_tb;" +${CLICKHOUSE_CLIENT} --query "CREATE TABLE 02165_in_tb (id UInt64, name String) Engine=Memory;" + + +${CLICKHOUSE_CLIENT} --query "SELECT * FROM 02165_out_tb INTO OUTFILE '${CLICKHOUSE_TMP}/hello.csv';" +${CLICKHOUSE_CLIENT} --query "INSERT INTO TABLE 02165_in_tb FROM INFILE '${CLICKHOUSE_TMP}/hello.csv' FORMAT CSV;" +${CLICKHOUSE_CLIENT} --query "SELECT * FROM 02165_in_tb;" +${CLICKHOUSE_CLIENT} --query "TRUNCATE TABLE 02165_in_tb;" + +${CLICKHOUSE_CLIENT} --query "SELECT * FROM 02165_out_tb INTO OUTFILE '${CLICKHOUSE_TMP}/world.csv.gz';" +${CLICKHOUSE_CLIENT} --query "INSERT INTO TABLE 02165_in_tb FROM INFILE '${CLICKHOUSE_TMP}/world.csv.gz' COMPRESSION 'gz' FORMAT CSV;" +${CLICKHOUSE_CLIENT} --query "SELECT * FROM 02165_in_tb;" +${CLICKHOUSE_CLIENT} --query "TRUNCATE TABLE 02165_in_tb;" + +${CLICKHOUSE_CLIENT} --query "SELECT * FROM 02165_out_tb INTO OUTFILE '${CLICKHOUSE_TMP}/hello.world.csv';" +${CLICKHOUSE_CLIENT} --query "INSERT INTO TABLE 02165_in_tb FROM INFILE '${CLICKHOUSE_TMP}/hello.world.csv' FORMAT CSV;" +${CLICKHOUSE_CLIENT} --query "SELECT * FROM 02165_in_tb;" +${CLICKHOUSE_CLIENT} --query "TRUNCATE TABLE 02165_in_tb;" + +${CLICKHOUSE_CLIENT} --query "SELECT * FROM 02165_out_tb INTO OUTFILE '${CLICKHOUSE_TMP}/hello.world.csv.xz';" +${CLICKHOUSE_CLIENT} --query "INSERT INTO TABLE 02165_in_tb FROM INFILE '${CLICKHOUSE_TMP}/hello.world.csv.xz' COMPRESSION 'xz' FORMAT CSV;" +${CLICKHOUSE_CLIENT} --query "SELECT * FROM 02165_in_tb;" +${CLICKHOUSE_CLIENT} --query "TRUNCATE TABLE 02165_in_tb;" + +${CLICKHOUSE_CLIENT} --query "SELECT * FROM 02165_out_tb INTO OUTFILE '${CLICKHOUSE_TMP}/example.com.';" +${CLICKHOUSE_CLIENT} --query "INSERT INTO TABLE 02165_in_tb FROM INFILE '${CLICKHOUSE_TMP}/example.com.' FORMAT TabSeparated;" +${CLICKHOUSE_CLIENT} --query "SELECT * FROM 02165_in_tb;" +${CLICKHOUSE_CLIENT} --query "TRUNCATE TABLE 02165_in_tb;" + +${CLICKHOUSE_CLIENT} --query "SELECT * FROM 02165_out_tb INTO OUTFILE '${CLICKHOUSE_TMP}/museum...protobuf';" +${CLICKHOUSE_CLIENT} --query "INSERT INTO TABLE 02165_in_tb FROM INFILE '${CLICKHOUSE_TMP}/museum...protobuf' FORMAT TabSeparated;" +${CLICKHOUSE_CLIENT} --query "SELECT * FROM 02165_in_tb;" +${CLICKHOUSE_CLIENT} --query "TRUNCATE TABLE 02165_in_tb;" + +${CLICKHOUSE_CLIENT} --query "INSERT INTO TABLE 02165_in_tb FROM INFILE '${CLICKHOUSE_TMP}/world.csv.gz';" +${CLICKHOUSE_CLIENT} --query "SELECT * FROM 02165_in_tb;" +${CLICKHOUSE_CLIENT} --query "TRUNCATE TABLE 02165_in_tb;" + + +${CLICKHOUSE_CLIENT} --query "SELECT * FROM 02165_out_tb INTO OUTFILE '${CLICKHOUSE_TMP}/.htaccess.json';" +head -n 26 ${CLICKHOUSE_TMP}/.htaccess.json + +${CLICKHOUSE_CLIENT} --query "DROP TABLE IF EXISTS 02165_out_tb;" +${CLICKHOUSE_CLIENT} --query "DROP TABLE IF EXISTS 02165_in_tb;" + +rm "${CLICKHOUSE_TMP}"/hello.csv +rm "${CLICKHOUSE_TMP}"/world.csv.gz +rm "${CLICKHOUSE_TMP}"/hello.world.csv +rm "${CLICKHOUSE_TMP}"/hello.world.csv.xz +rm "${CLICKHOUSE_TMP}"/.htaccess.json +rm "${CLICKHOUSE_TMP}"/example.com. 
+rm "${CLICKHOUSE_TMP}"/museum...protobuf diff --git a/tests/queries/0_stateless/02165_insert_from_infile.reference b/tests/queries/0_stateless/02165_insert_from_infile.reference new file mode 100644 index 00000000000..2a00a8faa31 --- /dev/null +++ b/tests/queries/0_stateless/02165_insert_from_infile.reference @@ -0,0 +1,5 @@ +INSERT INTO test FROM INFILE data.file SELECT x +FROM input(\'x UInt32\') +INSERT INTO test FROM INFILE data.file WITH number AS x +SELECT number +FROM input(\'number UInt32\') diff --git a/tests/queries/0_stateless/02165_insert_from_infile.sql b/tests/queries/0_stateless/02165_insert_from_infile.sql new file mode 100644 index 00000000000..8cc851fa4e5 --- /dev/null +++ b/tests/queries/0_stateless/02165_insert_from_infile.sql @@ -0,0 +1,4 @@ +EXPLAIN SYNTAX INSERT INTO test FROM INFILE 'data.file' SELECT x from input('x UInt32') FORMAT TSV; +EXPLAIN SYNTAX INSERT INTO test FROM INFILE 'data.file' WATCH view; -- { clientError SYNTAX_ERROR } +EXPLAIN SYNTAX INSERT INTO test FROM INFILE 'data.file' VALUES (1) -- { clientError SYNTAX_ERROR } +EXPLAIN SYNTAX INSERT INTO test FROM INFILE 'data.file' WITH number AS x SELECT number FROM input('number UInt32'); diff --git a/tests/queries/0_stateless/02166_arrow_dictionary_inference.reference b/tests/queries/0_stateless/02166_arrow_dictionary_inference.reference new file mode 100644 index 00000000000..46f448cfba7 --- /dev/null +++ b/tests/queries/0_stateless/02166_arrow_dictionary_inference.reference @@ -0,0 +1 @@ +x LowCardinality(UInt64) diff --git a/tests/queries/0_stateless/02166_arrow_dictionary_inference.sh b/tests/queries/0_stateless/02166_arrow_dictionary_inference.sh new file mode 100755 index 00000000000..e560dc10d2c --- /dev/null +++ b/tests/queries/0_stateless/02166_arrow_dictionary_inference.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "insert into table function file('arrow.dict', 'Arrow', 'x LowCardinality(UInt64)') select number from numbers(10) settings output_format_arrow_low_cardinality_as_dictionary=1" + +$CLICKHOUSE_CLIENT -q "desc file('arrow.dict', 'Arrow')" + diff --git a/tests/queries/0_stateless/02169_fix_view_offset_limit_setting.reference b/tests/queries/0_stateless/02169_fix_view_offset_limit_setting.reference new file mode 100644 index 00000000000..32c54e3eeea --- /dev/null +++ b/tests/queries/0_stateless/02169_fix_view_offset_limit_setting.reference @@ -0,0 +1,12 @@ +5 +6 +7 +8 +9 +10 +0 +1 +2 +3 +4 +5 diff --git a/tests/queries/0_stateless/02169_fix_view_offset_limit_setting.sql b/tests/queries/0_stateless/02169_fix_view_offset_limit_setting.sql new file mode 100644 index 00000000000..8ac88ebc5c0 --- /dev/null +++ b/tests/queries/0_stateless/02169_fix_view_offset_limit_setting.sql @@ -0,0 +1,12 @@ +DROP TABLE IF EXISTS counter; +CREATE TABLE counter (id UInt64, createdAt DateTime) ENGINE = MergeTree() ORDER BY id; +INSERT INTO counter SELECT number, now() FROM numbers(500); + +DROP TABLE IF EXISTS vcounter; +CREATE VIEW vcounter AS SELECT intDiv(id, 10) AS tens, max(createdAt) AS maxid FROM counter GROUP BY tens; + +SELECT tens FROM vcounter ORDER BY tens ASC LIMIT 100 SETTINGS limit = 6, offset = 5; + +SELECT tens FROM vcounter ORDER BY tens ASC LIMIT 100 SETTINGS limit = 6, offset = 0; +DROP TABLE vcounter; +DROP TABLE counter; diff --git a/tests/queries/0_stateless/data_csv/csv_with_slash.csv b/tests/queries/0_stateless/data_csv/csv_with_slash.csv new file mode 100644 index 00000000000..0f2c166faa8 --- /dev/null +++ b/tests/queries/0_stateless/data_csv/csv_with_slash.csv @@ -0,0 +1,30 @@ +0,\asdf,2000-01-01 +1,x\x\,2000-01-01 +2,x\x,2000-01-01 +3,x\,2000-01-01 +4,x\,2000-01-01 +5,\x,2000-01-01 +6,\N,2000-01-01 +7,\r\n,2000-01-01 +8,\\r\\n,2000-01-01 +9,x\\,2000-01-01 +10,'\asdf',2000-01-01 +11,'x\x\',2000-01-01 +12,'x\x',2000-01-01 +13,'x\',2000-01-01 +14,'x\',2000-01-01 +15,'\x',2000-01-01 +16,'\N',2000-01-01 +17,'\r\n',2000-01-01 +18,"\\r\\n",2000-01-01 +19,"x\\",2000-01-01 +20,"\asdf",2000-01-01 +21,"x\x\",2000-01-01 +22,"x\x",2000-01-01 +23,"x\",2000-01-01 +24,"x\",2000-01-01 +25,"\x",2000-01-01 +26,"\N",2000-01-01 +27,"\r\n",2000-01-01 +28,"\\r\\n",2000-01-01 +29,"x\\",2000-01-01 diff --git a/tests/queries/0_stateless/format_schemas/00825_protobuf_format_enum_mapping.proto b/tests/queries/0_stateless/format_schemas/00825_protobuf_format_enum_mapping.proto index ba558dbbadb..048a689d021 100644 --- a/tests/queries/0_stateless/format_schemas/00825_protobuf_format_enum_mapping.proto +++ b/tests/queries/0_stateless/format_schemas/00825_protobuf_format_enum_mapping.proto @@ -1,6 +1,6 @@ syntax = "proto3"; -message Message +message EnumMessage { enum Enum { @@ -10,4 +10,4 @@ message Message HUNDRED = 100; }; Enum x = 1; -}; \ No newline at end of file +}; diff --git a/tests/queries/0_stateless/helpers/02112_clean.sh b/tests/queries/0_stateless/helpers/02112_clean.sh index 910c0709955..95af0cede9c 100755 --- a/tests/queries/0_stateless/helpers/02112_clean.sh +++ b/tests/queries/0_stateless/helpers/02112_clean.sh @@ -1,6 +1,5 @@ #!/usr/bin/env bash -FILE=${CURDIR}/file_02112 -if [ -f $FILE ]; then - rm $FILE -fi +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +FILE=${CURDIR}/../file_02112 +rm "$FILE" diff --git a/tests/queries/0_stateless/helpers/02112_prepare.sh b/tests/queries/0_stateless/helpers/02112_prepare.sh index 
1f371789f86..c2791b01140 100755 --- a/tests/queries/0_stateless/helpers/02112_prepare.sh +++ b/tests/queries/0_stateless/helpers/02112_prepare.sh @@ -1,7 +1,5 @@ #!/usr/bin/env bash -FILE=${CURDIR}/file_02112 -if [ -f $FILE ]; then - rm $FILE -fi -echo "drop table if exists t;create table t(i Int32) engine=Memory; insert into t select 1" >> $FILE +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +FILE=${CURDIR}/../file_02112 +echo "drop table if exists t;create table t(i Int32) engine=Memory; insert into t select 1" > "$FILE" diff --git a/tests/queries/1_stateful/00168_parallel_processing_on_replicas_part_1.sh b/tests/queries/1_stateful/00168_parallel_processing_on_replicas_part_1.sh index d025dae5b2e..699700bcd3e 100755 --- a/tests/queries/1_stateful/00168_parallel_processing_on_replicas_part_1.sh +++ b/tests/queries/1_stateful/00168_parallel_processing_on_replicas_part_1.sh @@ -48,6 +48,10 @@ SkipList=( for TESTPATH in "$CURDIR"/*.sql; do TESTNAME=$(basename $TESTPATH) + NUM=$(echo "${TESTNAME}" | grep -o -P '^\d+' | sed 's/^0*//') + if [[ "${NUM}" -ge 168 ]]; then + continue + fi if [[ " ${SkipList[*]} " =~ ${TESTNAME} ]]; then echo "Skipping $TESTNAME " diff --git a/tests/queries/1_stateful/00169_contingency.reference b/tests/queries/1_stateful/00169_contingency.reference new file mode 100644 index 00000000000..8e881081445 --- /dev/null +++ b/tests/queries/1_stateful/00169_contingency.reference @@ -0,0 +1,5 @@ +1 1 -1 -1 0.09 +0.49 0.49 -0.45 -0.69 0.03 +0.81 0.81 -0.91 -0.85 0.08 +0.96 0.96 -0.9 -0.98 0.14 +0.6 0.6 -0.78 -0.8 0.01 diff --git a/tests/queries/1_stateful/00169_contingency.sql b/tests/queries/1_stateful/00169_contingency.sql new file mode 100644 index 00000000000..cc44bba8509 --- /dev/null +++ b/tests/queries/1_stateful/00169_contingency.sql @@ -0,0 +1,14 @@ +WITH URLDomain AS a, URLDomain AS b +SELECT round(cramersV(a, b), 2), round(cramersVBiasCorrected(a, b), 2), round(theilsU(a, b), 2), round(theilsU(b, a), 2), round(contingency(a, b), 2) FROM test.hits; + +WITH URLDomain AS a, RefererDomain AS b +SELECT round(cramersV(a, b), 2), round(cramersVBiasCorrected(a, b), 2), round(theilsU(a, b), 2), round(theilsU(b, a), 2), round(contingency(a, b), 2) FROM test.hits; + +WITH URLDomain AS a, CounterID AS b +SELECT round(cramersV(a, b), 2), round(cramersVBiasCorrected(a, b), 2), round(theilsU(a, b), 2), round(theilsU(b, a), 2), round(contingency(a, b), 2) FROM test.hits; + +WITH ClientIP AS a, RemoteIP AS b +SELECT round(cramersV(a, b), 2), round(cramersVBiasCorrected(a, b), 2), round(theilsU(a, b), 2), round(theilsU(b, a), 2), round(contingency(a, b), 2) FROM test.hits; + +WITH ResolutionWidth AS a, ResolutionHeight AS b +SELECT round(cramersV(a, b), 2), round(cramersVBiasCorrected(a, b), 2), round(theilsU(a, b), 2), round(theilsU(b, a), 2), round(contingency(a, b), 2) FROM test.hits; diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt index 8309b6bcb53..a930e7db3fc 100644 --- a/utils/CMakeLists.txt +++ b/utils/CMakeLists.txt @@ -32,6 +32,7 @@ if (NOT DEFINED ENABLE_UTILS OR ENABLE_UTILS) add_subdirectory (wal-dump) add_subdirectory (check-mysql-binlog) add_subdirectory (keeper-bench) + add_subdirectory (graphite-rollup) if (USE_NURAFT) add_subdirectory (keeper-data-dumper) diff --git a/utils/graphite-rollup/CMakeLists.txt b/utils/graphite-rollup/CMakeLists.txt new file mode 100644 index 00000000000..3cc0d3e756f --- /dev/null +++ b/utils/graphite-rollup/CMakeLists.txt @@ -0,0 +1,23 @@ +add_executable(graphite-rollup-bench graphite-rollup-bench.cpp) 
+target_link_libraries( + graphite-rollup-bench + PRIVATE + clickhouse_storages_system + clickhouse_aggregate_functions + clickhouse_common_config + dbms +) +target_include_directories( + graphite-rollup-bench + SYSTEM PRIVATE + ${ClickHouse_SOURCE_DIR}/src ${CMAKE_BINARY_DIR}/src + ${ClickHouse_SOURCE_DIR}/base ${ClickHouse_SOURCE_DIR}/base/pcg-random + ${CMAKE_BINARY_DIR}/src/Core/include + ${POCO_INCLUDE_DIR} + ${ClickHouse_SOURCE_DIR}/contrib/double-conversion ${ClickHouse_SOURCE_DIR}/contrib/dragonbox/include + ${ClickHouse_SOURCE_DIR}/contrib/fmtlib/include + ${ClickHouse_SOURCE_DIR}/contrib/cityhash102/include + ${RE2_INCLUDE_DIR} ${CMAKE_BINARY_DIR}/contrib/re2_st +) + +target_compile_definitions(graphite-rollup-bench PRIVATE RULES_DIR="${CMAKE_CURRENT_SOURCE_DIR}") diff --git a/utils/graphite-rollup/graphite-rollup-bench.cpp b/utils/graphite-rollup/graphite-rollup-bench.cpp new file mode 100644 index 00000000000..dabe0353b0f --- /dev/null +++ b/utils/graphite-rollup/graphite-rollup-bench.cpp @@ -0,0 +1,147 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace DB; + +static SharedContextHolder shared_context = Context::createShared(); + +std::vector loadMetrics(const std::string & metrics_file) +{ + std::vector metrics; + + FILE * stream; + char * line = nullptr; + size_t len = 0; + ssize_t nread; + + stream = fopen(metrics_file.c_str(), "r"); + if (stream == nullptr) + { + throw std::runtime_error(strerror(errno)); + } + + while ((nread = getline(&line, &len, stream)) != -1) + { + size_t l = strlen(line); + if (l > 0) + { + if (line[l - 1] == '\n') + { + line[l - 1] = '\0'; + l--; + } + if (l > 0) + { + metrics.push_back(StringRef(strdup(line), l)); + } + } + } + free(line); + if (ferror(stream)) + { + fclose(stream); + throw std::runtime_error(strerror(errno)); + } + + fclose(stream); + + return metrics; +} + +ConfigProcessor::LoadedConfig loadConfiguration(const std::string & config_path) +{ + ConfigProcessor config_processor(config_path, true, true); + ConfigProcessor::LoadedConfig config = config_processor.loadConfig(false); + return config; +} + +void bench(const std::string & config_path, const std::string & metrics_file, size_t n, bool verbose) +{ + auto config = loadConfiguration(config_path); + + auto context = Context::createGlobal(shared_context.get()); + context->setConfig(config.configuration.get()); + + Graphite::Params params; + setGraphitePatternsFromConfig(context, "graphite_rollup", params); + + std::vector metrics = loadMetrics(metrics_file); + + std::vector durations(metrics.size()); + size_t j, i; + for (j = 0; j < n; j++) + { + for (i = 0; i < metrics.size(); i++) + { + auto start = std::chrono::high_resolution_clock::now(); + + auto rule = DB::Graphite::selectPatternForPath(params, metrics[i]); + (void)rule; + + auto end = std::chrono::high_resolution_clock::now(); + double duration = (duration_cast>(end - start)).count() * 1E9; + durations[i] += duration; + + if (j == 0 && verbose) + { + std::cout << metrics[i].data << ": rule with regexp '" << rule.second->regexp_str << "' found\n"; + } + } + } + + for (i = 0; i < metrics.size(); i++) + { + std::cout << metrics[i].data << " " << durations[i] / n << " ns\n"; + free(const_cast(static_cast(metrics[i].data))); + } +} + +int main(int argc, char ** argv) +{ + registerAggregateFunctions(); + + std::string config_file, metrics_file; + + using namespace std::literals; + + std::string 
config_default = RULES_DIR + "/rollup.xml"s; + std::string metrics_default = RULES_DIR + "/metrics.txt"s; + + namespace po = boost::program_options; + po::variables_map vm; + + po::options_description desc; + desc.add_options()("help,h", "produce help")( + "config,c", po::value()->default_value(config_default), "XML config with rollup rules")( + "metrics,m", po::value()->default_value(metrics_default), "metrcis files (one metric per line) for run benchmark")( + "verbose,V", po::bool_switch()->default_value(false), "verbose output (print found rule)"); + + po::parsed_options parsed = po::command_line_parser(argc, argv).options(desc).run(); + po::store(parsed, vm); + po::notify(vm); + + if (vm.count("help")) + { + std::cout << desc << '\n'; + exit(1); + } + + bench(vm["config"].as(), vm["metrics"].as(), 10000, vm["verbose"].as()); + + return 0; +} diff --git a/utils/graphite-rollup/metrics.txt b/utils/graphite-rollup/metrics.txt new file mode 100644 index 00000000000..199c3791310 --- /dev/null +++ b/utils/graphite-rollup/metrics.txt @@ -0,0 +1,11 @@ +test.sum +sum?env=test&tag=Fake3 +test.max +max?env=test&tag=Fake4 +test.min +min?env=test&tag=Fake5 +fake5?env=test&tag=Fake5 +test.p95 +p95?env=test&tag=FakeNo +default +default?env=test&tag=FakeNo diff --git a/utils/graphite-rollup/rollup-tag-list.xml b/utils/graphite-rollup/rollup-tag-list.xml new file mode 100644 index 00000000000..ef28f2089ad --- /dev/null +++ b/utils/graphite-rollup/rollup-tag-list.xml @@ -0,0 +1,167 @@ + + + + plain + \.sum$ + sum + + 0 + 60 + + + 86400 + 3600 + + + + tagged + ^((.*)|.)sum\? + sum + + 0 + 60 + + + 86400 + 3600 + + + + plain + \.max$ + max + + 0 + 60 + + + 86400 + 3600 + + + + tagged + ^((.*)|.)max\? + max + + 0 + 60 + + + 86400 + 3600 + + + + plain + \.min$ + min + + 0 + 60 + + + 86400 + 3600 + + + + tagged + ^((.*)|.)min\? + min + + 0 + 60 + + + 86400 + 3600 + + + + plain + \.fake1\..*\.Fake1\. + sum + + + tag_list + fake1;tag=Fake1 + sum + + + plain + \.fake2\..*\.Fake2\. + sum + + + tag_list + fake2;tag=Fake2 + sum + + + plain + \.fake3\..*\.Fake3\. + sum + + + tag_list + fake3;tag=Fake3 + sum + + + plain + \.fake4\..*\.Fake4\. + sum + + + tag_list + fake4;tag=Fake4 + sum + + + plain + \.fake5\..*\.Fake5\. + sum + + + tag_list + fake5;tag=Fake5 + sum + + + plain + \.fake6\..*\.Fake6\. + sum + + + tag_list + fake6;tag=Fake6 + sum + + + plain + \.fake7\..*\.Fake7\. + sum + + + tag_list + fake7;tag=Fake7 + sum + + + avg + + 0 + 60 + + + 3600 + 300 + + + 86400 + 3600 + + + + diff --git a/utils/graphite-rollup/rollup-typed.xml b/utils/graphite-rollup/rollup-typed.xml new file mode 100644 index 00000000000..0b27d43ece9 --- /dev/null +++ b/utils/graphite-rollup/rollup-typed.xml @@ -0,0 +1,167 @@ + + + + plain + \.sum$ + sum + + 0 + 60 + + + 86400 + 3600 + + + + tagged + ^((.*)|.)sum\? + sum + + 0 + 60 + + + 86400 + 3600 + + + + plain + \.max$ + max + + 0 + 60 + + + 86400 + 3600 + + + + tagged + ^((.*)|.)max\? + max + + 0 + 60 + + + 86400 + 3600 + + + + plain + \.min$ + min + + 0 + 60 + + + 86400 + 3600 + + + + tagged + ^((.*)|.)min\? + min + + 0 + 60 + + + 86400 + 3600 + + + + plain + \.fake1\..*\.Fake1\. + sum + + + tagged + + sum + + + plain + \.fake2\..*\.Fake2\. + sum + + + tagged + + sum + + + plain + \.fake3\..*\.Fake3\. + sum + + + tagged + + sum + + + plain + \.fake4\..*\.Fake4\. + sum + + + tagged + + sum + + + plain + \.fake5\..*\.Fake5\. + sum + + + tagged + + sum + + + plain + \.fake6\..*\.Fake6\. + sum + + + tagged + + sum + + + plain + \.fake7\..*\.Fake7\. 
+ sum + + + tagged + + sum + + + avg + + 0 + 60 + + + 3600 + 300 + + + 86400 + 3600 + + + + diff --git a/utils/graphite-rollup/rollup.xml b/utils/graphite-rollup/rollup.xml new file mode 100644 index 00000000000..641b0130509 --- /dev/null +++ b/utils/graphite-rollup/rollup.xml @@ -0,0 +1,147 @@ + + + + \.sum$ + sum + + 0 + 60 + + + 86400 + 3600 + + + + ^((.*)|.)sum\? + sum + + 0 + 60 + + + 86400 + 3600 + + + + \.max$ + max + + 0 + 60 + + + 86400 + 3600 + + + + ^((.*)|.)max\? + max + + 0 + 60 + + + 86400 + 3600 + + + + \.min$ + min + + 0 + 60 + + + 86400 + 3600 + + + + ^((.*)|.)min\? + min + + 0 + 60 + + + 86400 + 3600 + + + + \.fake1\..*\.Fake1\. + sum + + + + sum + + + \.fake2\..*\.Fake2\. + sum + + + + sum + + + \.fake3\..*\.Fake3\. + sum + + + + sum + + + \.fake4\..*\.Fake4\. + sum + + + + sum + + + \.fake5\..*\.Fake5\. + sum + + + + sum + + + \.fake6\..*\.Fake6\. + sum + + + + sum + + + \.fake7\..*\.Fake7\. + sum + + + + sum + + + avg + + 0 + 60 + + + 3600 + 300 + + + 86400 + 3600 + + + + diff --git a/utils/tests-visualizer/index.html b/utils/tests-visualizer/index.html index a15b09ea58e..13f8daaa151 100644 --- a/utils/tests-visualizer/index.html +++ b/utils/tests-visualizer/index.html @@ -1,16 +1,85 @@ - + + - -

Loading (10 seconds, 20 MB)...

- - + +

+

Data not loaded

+
+ +

Loading (~10 seconds, ~20 MB)

+
+ diff --git a/utils/zero_copy/zero_copy_schema_converter.py b/utils/zero_copy/zero_copy_schema_converter.py new file mode 100755 index 00000000000..6fdd03add5a --- /dev/null +++ b/utils/zero_copy/zero_copy_schema_converter.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +import argparse +import socket +import uuid +from kazoo.client import KazooClient + +def parse_args(): + """ + Parse command-line arguments. + """ + parser = argparse.ArgumentParser() + parser.add_argument('--hosts', default=socket.getfqdn() + ':2181', help='ZooKeeper hosts (host:port,host:port,...)') + parser.add_argument('-s', '--secure', default=False, action='store_true', help='Use secure connection') + parser.add_argument('--cert', default='', help='Client TLS certificate file') + parser.add_argument('--key', default='', help='Client TLS key file') + parser.add_argument('--ca', default='', help='Client TLS ca file') + parser.add_argument('-u', '--user', default='', help='ZooKeeper ACL user') + parser.add_argument('-p', '--password', default='', help='ZooKeeper ACL password') + parser.add_argument('-r', '--root', default='/clickhouse', help='ZooKeeper root path for ClickHouse') + parser.add_argument('-z', '--zcroot', default='zero_copy', help='ZooKeeper node for new zero-copy data') + parser.add_argument('--dryrun', default=False, action='store_true', help='Do not perform any actions') + parser.add_argument('--cleanup', default=False, action='store_true', help='Clean old nodes') + parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Verbose mode') + + return parser.parse_args() + + +# Several folders to heuristic that zookeepr node is folder node +# May be false positive when someone creates set of tables with same paths +table_nodes = ['alter_partition_version', 'block_numbers', 'blocks', 'columns', 'leader_election'] +zc_nodes = ['zero_copy_s3', 'zero_copy_hdfs'] + + +def convert_node(client, args, path, zc_node): + base_path = f'{path}/{zc_node}/shared' + parts = client.get_children(base_path) + table_id_path = f'{path}/table_id' + table_id = '' + if client.exists(table_id_path): + table_id = client.get(table_id_path)[0].decode('UTF-8') + else: + table_id = str(uuid.uuid4()) + if args.verbose: + print(f'Make table_id "{table_id_path}" = "{table_id}"') + if not args.dryrun: + client.create(table_id_path, bytes(table_id, 'UTF-8')) + for part in parts: + part_path = f'{base_path}/{part}' + uniq_ids = client.get_children(part_path) + for uniq_id in uniq_ids: + uniq_path = f'{part_path}/{uniq_id}' + replicas = client.get_children(uniq_path) + for replica in replicas: + replica_path = f'{uniq_path}/{replica}' + new_path = f'{args.root}/{args.zcroot}/{zc_node}/{table_id}/{part}/{uniq_id}/{replica}' + if not client.exists(new_path): + if args.verbose: + print(f'Make node "{new_path}"') + if not args.dryrun: + client.ensure_path(f'{args.root}/{args.zcroot}/{zc_node}/{table_id}/{part}/{uniq_id}') + client.create(new_path, value=b'lock') + if args.cleanup: + if args.verbose: + print(f'Remove node "{replica_path}"') + if not args.dryrun: + client.delete(replica_path) + if args.cleanup and not args.dryrun: + client.delete(uniq_path) + if args.cleanup and not args.dryrun: + client.delete(part_path) + if args.cleanup and not args.dryrun: + client.delete(base_path) + client.delete(f'{path}/{zc_node}') + + +def convert_table(client, args, path, nodes): + print(f'Convert table nodes by path "{path}"') + for zc_node in zc_nodes: + if zc_node in nodes: + convert_node(client, args, path, zc_node) + + +def 
is_like_a_table(nodes):
+    for tn in table_nodes:
+        if tn not in nodes:
+            return False
+    return True
+
+
+def scan_recursive(client, args, path):
+    nodes = client.get_children(path)
+    if is_like_a_table(nodes):
+        convert_table(client, args, path, nodes)
+    else:
+        for node in nodes:
+            scan_recursive(client, args, f'{path}/{node}')
+
+
+def scan(client, args):
+    nodes = client.get_children(args.root)
+    for node in nodes:
+        if node != args.zcroot:
+            scan_recursive(client, args, f'{args.root}/{node}')
+
+
+def get_client(args):
+    client = KazooClient(connection_retry=3,
+                         command_retry=3,
+                         timeout=1,
+                         hosts=args.hosts,
+                         use_ssl=args.secure,
+                         certfile=args.cert,
+                         keyfile=args.key,
+                         ca=args.ca
+                         )
+    client.start()
+    if (args.user and args.password):
+        client.add_auth('digest', f'{args.user}:{args.password}')
+    return client
+
+
+def main():
+    args = parse_args()
+    client = get_client(args)
+    scan(client, args)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/website/blog/en/2022/admixer-aggregates-over-1-billion-unique-users-a-day-using-clickhouse.md b/website/blog/en/2022/admixer-aggregates-over-1-billion-unique-users-a-day-using-clickhouse.md
new file mode 100644
index 00000000000..3f38d31b2f7
--- /dev/null
+++ b/website/blog/en/2022/admixer-aggregates-over-1-billion-unique-users-a-day-using-clickhouse.md
@@ -0,0 +1,135 @@
+---
+title: 'Admixer Aggregates Over 1 Billion Unique Users a Day using ClickHouse'
+image: 'https://blog-images.clickhouse.com/en/2022/admixer-case-study/featured.jpg'
+date: '2022-01-11'
+author: 'Vladimir Zakrevsky'
+tags: ['company']
+---
+
+## Highlights
+
+* Inserting around 100 billion records per day, over 1 million records per second
+* Able to aggregate over 1 billion unique users a day
+* Moved from MSSQL to Azure Table Storage to ClickHouse
+* ClickHouse is deployed on 15 servers with 2 TB total RAM
+
+Admixer is an Ad-Tech company that provides all the components to build infrastructure for advertising products for brands, ad agencies, media houses, publishers, ad networks, and other buy- and sell-side industry players looking for effective ad management. A distinctive feature of Admixer is its technology, which makes it possible to:
+
+* Let agencies place advertising campaigns with specified execution conditions (terms, budget, creative display settings)
+* Set the rules for distributing advertising campaign budgets among thousands of publishers
+* Provide accounts for publishers, where they can not only see income statistics and withdraw money but also create their own advertising campaigns, as well as connect other sources of monetization in addition to Network advertising campaigns.
+
+Admixer's products include:
+
+* SSP - Supply-side platform where publishers/websites offer advertising space
+* DSP - Demand-side platform where advertisers buy advertising space
+* ADX - Ad exchange (connects SSPs and DSPs - buyers and sellers of advertisements and advertising space)
+* DMP - Data management platform (used by advertisers to configure the audience they want to target)
+
+ Admixer provides not only access to these products but also allows customers to build an entire ecosystem.
+
+## Why We Chose ClickHouse
+
+To enable this ecosystem, Admixer began developing an Advertising Exchange. Initially, AdExchange was based on the sale of local inventory by external DSPs. Then it began to aggregate the traffic of external SSPs to place local advertisements on it and later redirect this traffic to external DSPs. Thus, ADX was created.
+
+In 2015-2016, the share of external inventory was 3% (100 million requests); by the end of 2016, it was more than 90% (3 billion requests). With the sharp increase in requests, the processing load increased, and, most importantly, so did the load of storing and serving online analytics. Relational databases could not handle that many inserts for statistics records. Before migrating to Azure, we used an MSSQL server, which stored the object structure and statistics.
+
+In 2011, when migrating to Azure, we used Azure Table Storage to store and serve statistics. But with the increase in the number of transactions and the amount of data, this solution was no longer optimal, since Azure Table Storage charges for both the number of transactions and the amount of data.
+
+Thus, we needed to:
+
+* Display statistics on advertising transactions in the user interface in real-time;
+* Accept a significant volume of data for insertion (1 million records per second);
+* Aggregate the received data for different sections (40 operations and the same number of metrics);
+* Be able to scale the data warehouse as the number of requests grew;
+* Have full control over our costs.
+
+![Profile Report](https://blog-images.clickhouse.com/en/2022/admixer-case-study/profile-report.png)
+
+This image shows the Profile Report. Any Ad Campaign in Admixer is split into Line Items (Profiles). It is possible to view detailed reports for each Profile, including Date-Time Statistics, Geo, Domains, and SSPs. This report is also updated in real time.
+
+## The Advantages of Using ClickHouse
+
+ClickHouse helps us cope with the challenges above and provides the following benefits:
+
+* Not tied to a particular platform (we decided to migrate away from the cloud);
+* The cluster we built allows us to receive up to a million inserts per second (and we know how to scale up on demand);
+* Has built-in mechanisms for aggregating and distributing data across tables (materialized views);
+* Excellent data compression;
+* Reading speed makes it possible to display statistics directly in the user interface in real-time;
+* Has a SQL dialect that makes it possible to build any report;
+* Has several advanced functions (and allows you to write your own) for processing statistics;
+* Built-in HyperLogLog for storing approximate data;
+* Data sampling;
+* Open source / community / good documentation;
+* Constant additions of new features, bug fixes, and improvements to the current functionality;
+* Convenient operations.
+
+## ClickHouse Architecture
+
+Our architecture changed between 2016 and 2020. The two diagrams below show the state we started with and the state we arrived at.
+
+![Architecture 2016](https://blog-images.clickhouse.com/en/2022/admixer-case-study/architecture-2016.png)
+
+_Architecture 2016_
+
+![Architecture 2020](https://blog-images.clickhouse.com/en/2022/admixer-case-study/architecture-2020.png)
+
+_Architecture 2020_
+
+The Requests Handler is the component that accepts a request for an advertisement and determines which banner to display. After the banner is selected, it records this in the statistics. Since 2020, these components have been receiving over 1 million requests per second. Statistics were recorded through an intermediate element named the Global Events Queue. Events were read from the GlobalEventsQueue by the EventsProcessor components, additionally validated/enriched, and then written to the ClickHouse cluster.
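+
+As an aside, here is a minimal sketch of the kind of insert pipeline described in the next paragraph: a Null staging table with a materialized view attached, optionally fronted by a Buffer table. The table and column names are invented for illustration only and are not our actual schema.
+
+```sql
+-- Events land in a Null "staging" table: nothing is stored here,
+-- but every INSERT is pushed through the attached materialized views.
+CREATE TABLE events_null
+(
+    event_time  DateTime,
+    campaign_id UInt64,
+    impressions UInt64,
+    clicks      UInt64
+) ENGINE = Null;
+
+-- Aggregated target table that dashboards read from.
+CREATE TABLE campaign_stats
+(
+    event_date  Date,
+    campaign_id UInt64,
+    impressions UInt64,
+    clicks      UInt64
+) ENGINE = SummingMergeTree()
+ORDER BY (event_date, campaign_id);
+
+-- The materialized view aggregates each inserted block and writes it to campaign_stats.
+CREATE MATERIALIZED VIEW campaign_stats_mv TO campaign_stats AS
+SELECT
+    toDate(event_time) AS event_date,
+    campaign_id,
+    sum(impressions)   AS impressions,
+    sum(clicks)        AS clicks
+FROM events_null
+GROUP BY event_date, campaign_id;
+
+-- A Buffer table in front accumulates many small inserts in memory
+-- and flushes them into events_null in larger blocks.
+CREATE TABLE events_buffer AS events_null
+ENGINE = Buffer(currentDatabase(), events_null, 16, 10, 100, 10000, 1000000, 10000000, 100000000);
+```
+
+With such a chain, the writers only ever insert into a single entry point, and new aggregations can be added later by attaching more materialized views to the same Null table.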
+
+Initially, the EventsProcessor wrote into several ClickHouse tables in parallel, but we then switched to a Buffer -> Null table -> materialized views (MatViews) chain. We will next investigate whether the new [asynchronous insert feature](https://clickhouse.com/blog/en/2021/clickhouse-v21.11-released/#async-inserts) in version 21.11 would be an alternative to using a buffer table.
+
+We also reviewed the implementation of the event queue. Initially, we used Redis, but Redis is in-memory storage, thus:
+
+* On server restart, there was a risk of losing events;
+* The amount of RAM is relatively small, and if we planned to stop the EventsProcessor or ClickHouse, there was a risk of overflowing the event queue, so a very fast reaction to EventsProcessor problems was required.
+
+We tried to replace Redis with Kafka, but the Kafka driver for ClickHouse at the time had issues with arrays (which have since been fixed).
+
+Therefore, we implemented our own event queue, which was stored on the disk of each EventHandler component, with the local EventsProcessor located on the same server. The number of EventsProcessor components increased, which meant that the number of insert requests to ClickHouse also increased, but this was not a problem.
+
+Since financial optimization was also an essential factor for us, this scheme proved to be excellent in this regard as well. To receive, process, and store the data from ADX, we assembled a cluster of 15 servers (40 threads, 128 GB RAM, SSD storage each), and we sized this with a margin. For storing unique users, we used a cluster of 6 of the same servers.
+
+Another important point was how data is read from the clusters. A carelessly written query can create a significant load on the cluster and slow down other processes. But ClickHouse has settings for limiting resources and allocating quotas to specific users, which allowed us to solve this quickly. All configuration files can easily be kept in a configuration management system and managed from there.
+
+## ClickHouse Handles Over 1 Billion Unique Users Per Day
+
+In addition to statistics aggregation, which sums up metrics by dimension, Admixer provides information on how many unique users have watched ads over an arbitrary period of time. The number of uniques cannot simply be summed up. In our system, the user ID is a UUID. When we want the number of unique UUIDs for some arbitrary period, we need to recalculate the unique UUIDs for that period each time. We cannot precompute all possible combinations in advance, since there would be far too many of them.
+
+Before using ClickHouse, we could count uniques only for predefined periods: day, week, month, and all time. The number of slices was also limited, and constant bulk requests to Aerospike slowed down the event processor.
+
+AggregatingMergeTree allowed us, at minimal cost, to count unique users by a large number of keys in one report. In the beginning, with a cluster of three servers, we could easily count 1 billion uniques per day in ~12 slices. There are nuances: large slices cannot be served to the interface directly, since simultaneously scanning large tables takes a lot of CPU time. The solution to this problem was a report generation service, which has its own internal queue and sends the already generated CSV files to the interface. Small slices, on the other hand, can be served to the interface with a limited date range.
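+
+To make the idea concrete, here is a minimal, hypothetical sketch of this AggregatingMergeTree approach; the table and column names are invented and do not reflect our real schema.
+
+```sql
+-- Partial uniq() states are stored per day and per slice key.
+CREATE TABLE uniques_by_slice
+(
+    event_date  Date,
+    campaign_id UInt64,
+    users       AggregateFunction(uniq, UUID)
+) ENGINE = AggregatingMergeTree()
+ORDER BY (event_date, campaign_id);
+
+-- States are produced with uniqState(), typically by a materialized view
+-- over the raw event stream:
+--   SELECT toDate(event_time), campaign_id, uniqState(user_id) ... GROUP BY ...
+
+-- At query time the states for an arbitrary period are merged, not summed,
+-- so a user seen on several days is still counted once.
+SELECT
+    campaign_id,
+    uniqMerge(users) AS unique_users
+FROM uniques_by_slice
+WHERE event_date BETWEEN '2021-12-01' AND '2021-12-31'
+GROUP BY campaign_id;
+```
+
+Because uniqMerge() combines the stored states instead of adding counters, the result stays correct for any date range, which is exactly what plain summation of per-day counts cannot provide.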
+
+ClickHouse was also perfect as big data storage for our ML models.
+
+## Advice To Others Who Might Be Considering ClickHouse
+
+The Devil is in the details!
+
+ClickHouse technical tips:
+
+* If you do not need high data accuracy, use HyperLogLog and sampling;
+* Before assembling the cluster, run load tests to determine the number of operations it can withstand given your data structure;
+* Buffer tables are a great way to insert data, but watch out for memory;
+* Use the Native format for inserts;
+* Avoid large numbers of small parts with continuous-flow insertion: too many small inserts generate a lot of background merges and can end with a Too many parts (300) error;
+* It is necessary to decide on the replication scheme at the beginning. One option is to use ZooKeeper and let tables replicate themselves using ReplicatedMergeTree and other replicated table engines. Because we had many tables and wanted to choose which parts of the data to replicate to which servers, we chose not to use ZooKeeper and instead have our client spread the writes - each write goes to two servers.
+
+Over the past five years, the Admixer Core team has been working with high-load systems and big data aggregation. Any work has its subtleties; do not step on your own rake - use our experience instead.
+
+We offer customers specialized audits and consulting, and we create ready-made solutions using ClickHouse to solve high-load tasks. These speciality services are now offered via our new initiative [LoadFighters](https://loadfighters.com).
+
+### About Admixer
+
+Admixer is an independent adtech company that develops an ecosystem of full-stack programmatic solutions. Admixer has its own line of adtech products for brands, ad agencies, media houses, publishers, ad networks, and other buy- and sell-side industry players looking for effective ad management. Our customizable technology, in-depth expertise, and a personal approach help businesses turn programmatic advertising into a scalable revenue channel.
+
+Since our start in 2008, we’ve been on a mission to build an ecosystem with effective and transparent relationships between all of the players in the digital advertising industry.
+
+Today, the company has over 100 supply and demand partners, 3,000+ customers, and 200+ employees worldwide, with offices in Ukraine, Belarus, Kazakhstan, Moldova, and Georgia, and legal entities in the UK and Germany.
+ +For more information please visit: +[https://admixer.com/](https://admixer.com/) + + diff --git a/website/images/photos/anne-carlhoff.jpg b/website/images/photos/anne-carlhoff.jpg new file mode 100644 index 00000000000..4bbc9265585 Binary files /dev/null and b/website/images/photos/anne-carlhoff.jpg differ diff --git a/website/images/photos/baird-garrett.jpg b/website/images/photos/baird-garrett.jpg new file mode 100644 index 00000000000..3400431e379 Binary files /dev/null and b/website/images/photos/baird-garrett.jpg differ diff --git a/website/images/photos/dale-mcdiarmid.jpg b/website/images/photos/dale-mcdiarmid.jpg new file mode 100644 index 00000000000..bf590696a87 Binary files /dev/null and b/website/images/photos/dale-mcdiarmid.jpg differ diff --git a/website/images/photos/geoffrey-genz.jpg b/website/images/photos/geoffrey-genz.jpg new file mode 100644 index 00000000000..6d86aca47f7 Binary files /dev/null and b/website/images/photos/geoffrey-genz.jpg differ diff --git a/website/images/photos/marcel-birkner.jpg b/website/images/photos/marcel-birkner.jpg new file mode 100644 index 00000000000..6ec821cfb66 Binary files /dev/null and b/website/images/photos/marcel-birkner.jpg differ diff --git a/website/images/photos/melvyn-peignon.jpg b/website/images/photos/melvyn-peignon.jpg new file mode 100644 index 00000000000..532c1759c65 Binary files /dev/null and b/website/images/photos/melvyn-peignon.jpg differ diff --git a/website/images/photos/michael-lex.jpg b/website/images/photos/michael-lex.jpg new file mode 100644 index 00000000000..0e6de27a14e Binary files /dev/null and b/website/images/photos/michael-lex.jpg differ diff --git a/website/images/photos/nihat-hosgur.jpg b/website/images/photos/nihat-hosgur.jpg new file mode 100644 index 00000000000..ad47b4aba50 Binary files /dev/null and b/website/images/photos/nihat-hosgur.jpg differ diff --git a/website/images/photos/nikolay-degterinsky.jpg b/website/images/photos/nikolay-degterinsky.jpg new file mode 100644 index 00000000000..620c2d83f51 Binary files /dev/null and b/website/images/photos/nikolay-degterinsky.jpg differ diff --git a/website/images/photos/nir-peled.jpg b/website/images/photos/nir-peled.jpg new file mode 100644 index 00000000000..a8952465164 Binary files /dev/null and b/website/images/photos/nir-peled.jpg differ diff --git a/website/images/photos/sergei-trifonov.jpg b/website/images/photos/sergei-trifonov.jpg new file mode 100644 index 00000000000..87ce88a3b1b Binary files /dev/null and b/website/images/photos/sergei-trifonov.jpg differ diff --git a/website/images/photos/tanya-bragin.jpg b/website/images/photos/tanya-bragin.jpg new file mode 100644 index 00000000000..0b5a6972b01 Binary files /dev/null and b/website/images/photos/tanya-bragin.jpg differ diff --git a/website/images/photos/tom-schreiber.jpg b/website/images/photos/tom-schreiber.jpg new file mode 100644 index 00000000000..ec227de6122 Binary files /dev/null and b/website/images/photos/tom-schreiber.jpg differ diff --git a/website/images/photos/yuko-takagi.jpg b/website/images/photos/yuko-takagi.jpg new file mode 100644 index 00000000000..eb44e414256 Binary files /dev/null and b/website/images/photos/yuko-takagi.jpg differ diff --git a/website/templates/company/team.html b/website/templates/company/team.html index b4ed1c26a29..e8cc07751dd 100644 --- a/website/templates/company/team.html +++ b/website/templates/company/team.html @@ -19,6 +19,20 @@ {{ _('Principal Sofware Engineer') }}

+ +
+ + + + +

+ {{ _('Marcel Birkner') }} +

+

+ {{ _(' + Cloud SWE') }} +

+
@@ -32,6 +46,33 @@ {{ _('VP, Product') }}

+
+
+ + + + +

+ {{ _('Tanya Bragin') }} +

+

+ {{ _('VP, Product') }} +

+ +
+
+ + + + +

+ {{ _('Anne Carlhoff') }} +

+

+ {{ _(' + Sr Recruiter') }} +

+
@@ -58,6 +99,19 @@ {{ _('Software Engineer') }}

+
+
+ + + + +

+ {{ _('Nikolay Degterinsky') }} +

+

+ {{ _('Core SWE') }} +

+
@@ -71,6 +125,32 @@ {{ _('Senior Director, Business Technology') }}

+
+
+ + + + +

+ {{ _('Baird Garrett') }} +

+

+ {{ _('General Counsel') }} +

+ +
+
+ + + + +

+ {{ _('Geoffrey Genz') }} +

+

+ {{ _('Principal Support Engineer') }} +

+
@@ -97,6 +177,19 @@ {{ _('VP, Sales') }}

+
+
+ + + + +

+ {{ _('Nihat Hosgur') }} +

+

+ {{ _('Principal Cloud SWE') }} +

+
@@ -162,6 +255,19 @@ {{ _('Software Engineer') }}

+
+
+ + + + +

+ {{ _('Michael Lex') }} +

+

+ {{ _('Cloud SWE') }} +

+
@@ -201,6 +307,19 @@ {{ _('Executive Assistant') }}

+
+
+ + + + +

+ {{ _('Dale McDiarmid') }} +

+

+ {{ _('Consulting Architect') }} +

+
@@ -240,6 +359,32 @@ {{ _('VP, Support & Services') }}

+
+
+ + + + +

+ {{ _('Melvyn Peignon') }} +

+

+ {{ _('Manager, Support Services – EMEA') }} +

+ +
+
+ + + + +

+ {{ _('Nir Peled') }} +

+

+ {{ _('Principal UX/UI Engineer') }} +

+
@@ -279,6 +424,19 @@ {{ _('Engineering Team Lead') }}

+
+
+ + + + +

+ {{ _('Tom Schreiber') }} +

+

+ {{ _('Consulting Architect – EMEA') }} +

+
@@ -318,6 +476,19 @@ {{ _('VP, Operations') }}

+
+
+ + + + +

+ {{ _('Yuko Takagi') }} +

+

+ {{ _('Director, Go To Market Technology') }} +

+
@@ -344,6 +515,19 @@ {{ _('Software Engineer') }}

+
+
+ +
+ +
+

+ {{ _('Sergei Trifonov') }} +

+

+ {{ _('Principal Core SWE') }} +

+
diff --git a/website/templates/index/success.html b/website/templates/index/success.html index e09274c3a6f..7d70f4367b2 100644 --- a/website/templates/index/success.html +++ b/website/templates/index/success.html @@ -62,7 +62,7 @@
-

{{ _('Uber moved it’s logging platform to ClickHouse increasing developer productivity and overall reliability of the platform while seeing 3x data compression, 10x performance increase, and ½ the reduction in hardware cost.') }}

+

{{ _('Uber moved its logging platform to ClickHouse, increasing developer productivity and overall reliability of the platform while seeing 3x data compression, 10x performance increase, and a 50% reduction in hardware cost.') }}

{{ _('Read the Case Study') }}