diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index ce135846dd5..8ac3271676c 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -580,6 +580,47 @@ jobs: docker ps --quiet | xargs --no-run-if-empty docker kill ||: docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + BuilderFuzzers: + needs: [DockerHubPush, FastTest, StyleCheck] + runs-on: [self-hosted, builder] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + BUILD_NAME=fuzzers + EOF + - name: Download changed images + uses: actions/download-artifact@v3 + with: + name: changed_images + path: ${{ env.IMAGES_PATH }} + - name: Check out repository code + uses: ClickHouse/checkout@v1 + with: + clear-repository: true + submodules: true + - name: Build + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" + - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ env.BUILD_URLS }} + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json + - name: Cleanup + if: always() + run: | + docker ps --quiet | xargs --no-run-if-empty docker kill ||: + docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: + sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" ########################################################################################## ##################################### SPECIAL BUILDS ##################################### ########################################################################################## diff --git a/CMakeLists.txt b/CMakeLists.txt index 55bcf5fbf3c..65ff9dc5384 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -118,7 +118,11 @@ endif() # - sanitize.cmake add_library(global-libs INTERFACE) -include (cmake/fuzzer.cmake) +# We don't want to instrument everything with fuzzer, but only specific targets (see below), +# also, since we build our own llvm, we specifically don't want to instrument +# libFuzzer library itself - it would result in infinite recursion +#include (cmake/fuzzer.cmake) + include (cmake/sanitize.cmake) option(ENABLE_COLORED_BUILD "Enable colors in compiler output" ON) @@ -558,6 +562,46 @@ add_subdirectory (programs) add_subdirectory (tests) add_subdirectory (utils) +# Function get_all_targets collects all targets recursively +function(get_all_targets var) + macro(get_all_targets_recursive targets dir) + get_property(subdirectories DIRECTORY ${dir} PROPERTY SUBDIRECTORIES) + foreach(subdir ${subdirectories}) + get_all_targets_recursive(${targets} ${subdir}) + endforeach() + get_property(current_targets DIRECTORY ${dir} PROPERTY BUILDSYSTEM_TARGETS) + list(APPEND ${targets} ${current_targets}) + endmacro() + + set(targets) + get_all_targets_recursive(targets ${CMAKE_CURRENT_SOURCE_DIR}) + set(${var} ${targets} PARENT_SCOPE) +endfunction() + +if (FUZZER) + # Bundle fuzzers target + add_custom_target(fuzzers) + # Instrument all targets fuzzer and link with libfuzzer + get_all_targets(all_targets) + foreach(target ${all_targets}) + if (NOT(target STREQUAL "_fuzzer" OR target STREQUAL "_fuzzer_no_main")) + get_target_property(target_type ${target} TYPE) + if (NOT(target_type STREQUAL 
"INTERFACE_LIBRARY" OR target_type STREQUAL "UTILITY")) + target_compile_options(${target} PRIVATE "-fsanitize=fuzzer-no-link") + endif() + # clickhouse fuzzer isn't working correctly + # initial PR https://github.com/ClickHouse/ClickHouse/pull/27526 + #if (target MATCHES ".+_fuzzer" OR target STREQUAL "clickhouse") + if (target MATCHES ".+_fuzzer") + message(STATUS "${target} instrumented with fuzzer") + target_link_libraries(${target} PUBLIC ch_contrib::fuzzer) + # Add to fuzzers bundle + add_dependencies(fuzzers ${target}) + endif() + endif() + endforeach() +endif() + include (cmake/sanitize_targets.cmake) # Build native targets if necessary diff --git a/docker/keeper/Dockerfile b/docker/keeper/Dockerfile index 601cb8cf06d..63dbac6a995 100644 --- a/docker/keeper/Dockerfile +++ b/docker/keeper/Dockerfile @@ -32,7 +32,7 @@ RUN arch=${TARGETARCH:-amd64} \ esac ARG REPOSITORY="https://s3.amazonaws.com/clickhouse-builds/22.4/31c367d3cd3aefd316778601ff6565119fe36682/package_release" -ARG VERSION="23.8.1.2992" +ARG VERSION="23.8.2.7" ARG PACKAGES="clickhouse-keeper" # user/group precreated explicitly with fixed uid/gid on purpose. diff --git a/docker/packager/binary/build.sh b/docker/packager/binary/build.sh index 38b2ef75d24..39d299e1794 100755 --- a/docker/packager/binary/build.sh +++ b/docker/packager/binary/build.sh @@ -97,9 +97,11 @@ if [ -n "$MAKE_DEB" ]; then bash -x /build/packages/build fi -mv ./programs/clickhouse* /output -[ -x ./programs/self-extracting/clickhouse ] && mv ./programs/self-extracting/clickhouse /output -mv ./src/unit_tests_dbms /output ||: # may not exist for some binary builds +if [ "$BUILD_TARGET" != "fuzzers" ]; then + mv ./programs/clickhouse* /output + [ -x ./programs/self-extracting/clickhouse ] && mv ./programs/self-extracting/clickhouse /output + mv ./src/unit_tests_dbms /output ||: # may not exist for some binary builds +fi prepare_combined_output () { local OUTPUT diff --git a/docker/packager/packager b/docker/packager/packager index 1d0ccae8a69..c1bb839193f 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -149,7 +149,10 @@ def parse_env_variables( result = [] result.append("OUTPUT_DIR=/output") cmake_flags = ["$CMAKE_FLAGS"] - build_target = "clickhouse-bundle" + if package_type == "fuzzers": + build_target = "fuzzers" + else: + build_target = "clickhouse-bundle" is_cross_darwin = compiler.endswith(DARWIN_SUFFIX) is_cross_darwin_arm = compiler.endswith(DARWIN_ARM_SUFFIX) @@ -258,6 +261,17 @@ def parse_env_variables( cmake_flags.append("-DBUILD_STANDALONE_KEEPER=1") else: result.append("BUILD_MUSL_KEEPER=1") + elif package_type == "fuzzers": + cmake_flags.append("-DENABLE_FUZZING=1") + cmake_flags.append("-DENABLE_PROTOBUF=1") + cmake_flags.append("-DUSE_INTERNAL_PROTOBUF_LIBRARY=1") + cmake_flags.append("-DWITH_COVERAGE=1") + cmake_flags.append("-DCMAKE_AUTOGEN_VERBOSE=ON") + # cmake_flags.append("-DCMAKE_INSTALL_PREFIX=/usr") + # cmake_flags.append("-DCMAKE_INSTALL_SYSCONFDIR=/etc") + # cmake_flags.append("-DCMAKE_INSTALL_LOCALSTATEDIR=/var") + # Reduce linking and building time by avoid *install/all dependencies + cmake_flags.append("-DCMAKE_SKIP_INSTALL_ALL_DEPENDENCY=ON") result.append(f"CC={cc}") result.append(f"CXX={cxx}") @@ -365,7 +379,7 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument( "--package-type", - choices=["deb", "binary"], + choices=["deb", "binary", "fuzzers"], required=True, ) parser.add_argument( diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index 
1afaa3c623f..36dacd781bc 100644 --- a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -33,7 +33,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="23.8.1.2992" +ARG VERSION="23.8.2.7" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" # user/group precreated explicitly with fixed uid/gid on purpose. diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu index 409fb082617..752adf67229 100644 --- a/docker/server/Dockerfile.ubuntu +++ b/docker/server/Dockerfile.ubuntu @@ -23,7 +23,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list ARG REPO_CHANNEL="stable" ARG REPOSITORY="deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb ${REPO_CHANNEL} main" -ARG VERSION="23.8.1.2992" +ARG VERSION="23.8.2.7" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" # set non-empty deb_location_url url to create a docker image diff --git a/docs/changelogs/v23.3.12.11-lts.md b/docs/changelogs/v23.3.12.11-lts.md new file mode 100644 index 00000000000..2eaaa575f60 --- /dev/null +++ b/docs/changelogs/v23.3.12.11-lts.md @@ -0,0 +1,20 @@ +--- +sidebar_position: 1 +sidebar_label: 2023 +--- + +# 2023 Changelog + +### ClickHouse release v23.3.12.11-lts (414317bed21) FIXME as compared to v23.3.11.5-lts (5762a23a76d) + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Fix async insert with deduplication for ReplicatedMergeTree using merging algorithms [#51676](https://github.com/ClickHouse/ClickHouse/pull/51676) ([Antonio Andelic](https://github.com/antonio2368)). +* Fix deadlock on DatabaseCatalog shutdown [#51908](https://github.com/ClickHouse/ClickHouse/pull/51908) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Fix crash in join on sparse column [#53548](https://github.com/ClickHouse/ClickHouse/pull/53548) ([vdimir](https://github.com/vdimir)). +* Fix rows_before_limit_at_least for DelayedSource. [#54122](https://github.com/ClickHouse/ClickHouse/pull/54122) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Fix broken `02862_sorted_distinct_sparse_fix` [#53738](https://github.com/ClickHouse/ClickHouse/pull/53738) ([Antonio Andelic](https://github.com/antonio2368)). + diff --git a/docs/changelogs/v23.8.2.7-lts.md b/docs/changelogs/v23.8.2.7-lts.md new file mode 100644 index 00000000000..317e2c6d56a --- /dev/null +++ b/docs/changelogs/v23.8.2.7-lts.md @@ -0,0 +1,18 @@ +--- +sidebar_position: 1 +sidebar_label: 2023 +--- + +# 2023 Changelog + +### ClickHouse release v23.8.2.7-lts (f73c8f37874) FIXME as compared to v23.8.1.2992-lts (ebc7d9a9f3b) + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Fix: parallel replicas over distributed don't read from all replicas [#54199](https://github.com/ClickHouse/ClickHouse/pull/54199) ([Igor Nikonov](https://github.com/devcrafter)). +* Fix: allow IPv6 for bloom filter [#54200](https://github.com/ClickHouse/ClickHouse/pull/54200) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* S3Queue is experimental [#54214](https://github.com/ClickHouse/ClickHouse/pull/54214) ([Alexey Milovidov](https://github.com/alexey-milovidov)). 
+ diff --git a/docs/en/operations/system-tables/distributed_ddl_queue.md b/docs/en/operations/system-tables/distributed_ddl_queue.md index 8cccf946621..a552fd548a8 100644 --- a/docs/en/operations/system-tables/distributed_ddl_queue.md +++ b/docs/en/operations/system-tables/distributed_ddl_queue.md @@ -8,17 +8,21 @@ Contains information about [distributed ddl queries (ON CLUSTER clause)](../../s Columns: - `entry` ([String](../../sql-reference/data-types/string.md)) — Query id. -- `host_name` ([String](../../sql-reference/data-types/string.md)) — Hostname. -- `host_address` ([String](../../sql-reference/data-types/string.md)) — IP address that the Hostname resolves to. -- `port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — Host Port. -- `status` ([Enum8](../../sql-reference/data-types/enum.md)) — Status of the query. +- `entry_version` ([Nullable(UInt8)](../../sql-reference/data-types/int-uint.md)) - Version of the entry +- `initiator_host` ([Nullable(String)](../../sql-reference/data-types/string.md)) - Host that initiated the DDL operation +- `initiator_port` ([Nullable(UInt16)](../../sql-reference/data-types/int-uint.md)) - Port used by the initiator - `cluster` ([String](../../sql-reference/data-types/string.md)) — Cluster name. - `query` ([String](../../sql-reference/data-types/string.md)) — Query executed. -- `initiator` ([String](../../sql-reference/data-types/string.md)) — Node that executed the query. -- `query_start_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Query start time. +- `settings` ([Map(String, String)](../../sql-reference/data-types/map.md)) - Settings used in the DDL operation +- `query_create_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Query created time. +- `host` ([String](../../sql-reference/data-types/string.md)) — Hostname +- `port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — Host Port. +- `status` ([Enum8](../../sql-reference/data-types/enum.md)) — Status of the query. +- `exception_code` ([Enum8](../../sql-reference/data-types/enum.md)) — Exception code. +- `exception_text` ([Nullable(String)](../../sql-reference/data-types/string.md)) - Exception message - `query_finish_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Query finish time. - `query_duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Duration of query execution (in milliseconds). -- `exception_code` ([Enum8](../../sql-reference/data-types/enum.md)) — Exception code from [ClickHouse Keeper](../../operations/tips.md#zookeeper). 
+ **Example** @@ -34,32 +38,38 @@ Query id: f544e72a-6641-43f1-836b-24baa1c9632a Row 1: ────── entry: query-0000000000 -host_name: clickhouse01 -host_address: 172.23.0.11 -port: 9000 -status: Finished +entry_version: 5 +initiator_host: clickhouse01 +initiator_port: 9000 cluster: test_cluster query: CREATE DATABASE test_db UUID '4a82697e-c85e-4e5b-a01e-a36f2a758456' ON CLUSTER test_cluster -initiator: clickhouse01:9000 -query_start_time: 2020-12-30 13:07:51 -query_finish_time: 2020-12-30 13:07:51 -query_duration_ms: 6 -exception_code: ZOK +settings: {'max_threads':'16','use_uncompressed_cache':'0'} +query_create_time: 2023-09-01 16:15:14 +host: clickhouse-01 +port: 9000 +status: Finished +exception_code: 0 +exception_text: +query_finish_time: 2023-09-01 16:15:14 +query_duration_ms: 154 Row 2: ────── -entry: query-0000000000 -host_name: clickhouse02 -host_address: 172.23.0.12 -port: 9000 -status: Finished +entry: query-0000000001 +entry_version: 5 +initiator_host: clickhouse01 +initiator_port: 9000 cluster: test_cluster query: CREATE DATABASE test_db UUID '4a82697e-c85e-4e5b-a01e-a36f2a758456' ON CLUSTER test_cluster -initiator: clickhouse01:9000 -query_start_time: 2020-12-30 13:07:51 -query_finish_time: 2020-12-30 13:07:51 -query_duration_ms: 6 -exception_code: ZOK +settings: {'max_threads':'16','use_uncompressed_cache':'0'} +query_create_time: 2023-09-01 16:15:14 +host: clickhouse-01 +port: 9000 +status: Finished +exception_code: 630 +exception_text: Code: 630. DB::Exception: Cannot drop or rename test_db, because some tables depend on it: +query_finish_time: 2023-09-01 16:15:14 +query_duration_ms: 154 2 rows in set. Elapsed: 0.025 sec. ``` diff --git a/docs/en/operations/utilities/clickhouse-disks.md b/docs/en/operations/utilities/clickhouse-disks.md new file mode 100644 index 00000000000..76db9e41836 --- /dev/null +++ b/docs/en/operations/utilities/clickhouse-disks.md @@ -0,0 +1,38 @@ +--- +slug: /en/operations/utilities/clickhouse-disks +sidebar_position: 59 +sidebar_label: clickhouse-disks +--- + +# clickhouse-disks + +A utility providing filesystem-like operations for ClickHouse disks. + +Program-wide options: + +* `--config-file, -C` -- path to ClickHouse config, defaults to `/etc/clickhouse-server/config.xml`. +* `--save-logs` -- Log progress of invoked commands to `/var/log/clickhouse-server/clickhouse-disks.log`. +* `--log-level` -- What [type](../server-configuration-parameters/settings#server_configuration_parameters-logger) of events to log, defaults to `none`. +* `--disk` -- what disk to use for `mkdir, move, read, write, remove` commands. Defaults to `default`. + +## Commands + +* `copy [--disk-from d1] [--disk-to d2] `. + Recursively copy data from `FROM_PATH` at disk `d1` (defaults to `disk` value if not provided) + to `TO_PATH` at disk `d2` (defaults to `disk` value if not provided). +* `move `. + Move file or directory from `FROM_PATH` to `TO_PATH`. +* `remove `. + Remove `PATH` recursively. +* `link `. + Create a hardlink from `FROM_PATH` to `TO_PATH`. +* `list [--recursive] ...` + List files at `PATH`s. Non-recursive by default. +* `list-disks`. + List disks names. +* `mkdir [--recursive] `. + Create a directory. Non-recursive by default. +* `read: []` + Read a file from `FROM_PATH` to `TO_PATH` (`stdout` if not supplied). +* `write [FROM_PATH] `. + Write a file from `FROM_PATH` (`stdin` if not supplied) to `TO_PATH`. 
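+
+A hypothetical session combining the commands above (the `s3` disk name and all paths are placeholders, not part of any shipped configuration; the tool is assumed to be invoked as `clickhouse-disks`):
+
+```bash
+# List the disks defined in the server config
+clickhouse-disks --config-file /etc/clickhouse-server/config.xml list-disks
+
+# Create a directory on the s3 disk, then copy a path there from the default disk
+clickhouse-disks --disk s3 mkdir --recursive backup/2023-09
+clickhouse-disks copy --disk-from default --disk-to s3 store/abc backup/2023-09
+
+# Inspect the result and stream one file to stdout
+clickhouse-disks --disk s3 list --recursive backup
+clickhouse-disks --disk s3 read backup/2023-09/abc.bin
+```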
diff --git a/docs/en/operations/utilities/index.md b/docs/en/operations/utilities/index.md index 112a51cfa97..5667f99b6fa 100644 --- a/docs/en/operations/utilities/index.md +++ b/docs/en/operations/utilities/index.md @@ -13,4 +13,6 @@ pagination_next: 'en/operations/utilities/clickhouse-copier' - [clickhouse-format](../../operations/utilities/clickhouse-format.md) — Enables formatting input queries. - [ClickHouse obfuscator](../../operations/utilities/clickhouse-obfuscator.md) — Obfuscates data. - [ClickHouse compressor](../../operations/utilities/clickhouse-compressor.md) — Compresses and decompresses data. +- [clickhouse-disks](../../operations/utilities/clickhouse-disks.md) -- Provides filesystem-like operations + on files among different ClickHouse disks. - [clickhouse-odbc-bridge](../../operations/utilities/odbc-bridge.md) — A proxy server for ODBC driver. diff --git a/docs/en/sql-reference/statements/optimize.md b/docs/en/sql-reference/statements/optimize.md index 45d336c42f2..49843eaff9a 100644 --- a/docs/en/sql-reference/statements/optimize.md +++ b/docs/en/sql-reference/statements/optimize.md @@ -94,8 +94,10 @@ Result: │ 1 │ 1 │ 3 │ 3 │ └─────────────┴───────────────┴───────┴───────────────┘ ``` +All following examples are executed against this state with 5 rows. -When columns for deduplication are not specified, all of them are taken into account. Row is removed only if all values in all columns are equal to corresponding values in previous row: +#### `DEDUPLICATE` +When columns for deduplication are not specified, all of them are taken into account. The row is removed only if all values in all columns are equal to corresponding values in the previous row: ``` sql OPTIMIZE TABLE example FINAL DEDUPLICATE; @@ -116,7 +118,7 @@ Result: │ 1 │ 1 │ 3 │ 3 │ └─────────────┴───────────────┴───────┴───────────────┘ ``` - +#### `DEDUPLICATE BY *` When columns are specified implicitly, the table is deduplicated by all columns that are not `ALIAS` or `MATERIALIZED`. Considering the table above, these are `primary_key`, `secondary_key`, `value`, and `partition_key` columns: ```sql OPTIMIZE TABLE example FINAL DEDUPLICATE BY *; @@ -137,7 +139,7 @@ Result: │ 1 │ 1 │ 3 │ 3 │ └─────────────┴───────────────┴───────┴───────────────┘ ``` - +#### `DEDUPLICATE BY * EXCEPT` Deduplicate by all columns that are not `ALIAS` or `MATERIALIZED` and explicitly not `value`: `primary_key`, `secondary_key`, and `partition_key` columns. 
``` sql @@ -158,7 +160,7 @@ Result: │ 1 │ 1 │ 2 │ 3 │ └─────────────┴───────────────┴───────┴───────────────┘ ``` - +#### `DEDUPLICATE BY ` Deduplicate explicitly by `primary_key`, `secondary_key`, and `partition_key` columns: ```sql OPTIMIZE TABLE example FINAL DEDUPLICATE BY primary_key, secondary_key, partition_key; @@ -178,8 +180,8 @@ Result: │ 1 │ 1 │ 2 │ 3 │ └─────────────┴───────────────┴───────┴───────────────┘ ``` - -Deduplicate by any column matching a regex: `primary_key`, `secondary_key`, and `partition_key` columns: +#### `DEDUPLICATE BY COLUMNS()` +Deduplicate by all columns matching a regex: `primary_key`, `secondary_key`, and `partition_key` columns: ```sql OPTIMIZE TABLE example FINAL DEDUPLICATE BY COLUMNS('.*_key'); ``` diff --git a/programs/disks/CommandCopy.cpp b/programs/disks/CommandCopy.cpp index 02c11bc5aa9..4a7af1ced29 100644 --- a/programs/disks/CommandCopy.cpp +++ b/programs/disks/CommandCopy.cpp @@ -17,23 +17,21 @@ public: { command_name = "copy"; command_option_description.emplace(createOptionsDescription("Allowed options", getTerminalWidth())); - description = "Recursively copy data containing at `from_path` to `to_path`\nPath should be in format './' or './path' or 'path'"; + description = "Recursively copy data from `FROM_PATH` to `TO_PATH`"; usage = "copy [OPTION]... "; command_option_description->add_options() - ("diskFrom", po::value(), "set name for disk from which we do operations") - ("diskTo", po::value(), "set name for disk to which we do operations") - ; - + ("disk-from", po::value(), "disk from which we copy") + ("disk-to", po::value(), "disk to which we copy"); } void processOptions( Poco::Util::LayeredConfiguration & config, po::variables_map & options) const override { - if (options.count("diskFrom")) - config.setString("diskFrom", options["diskFrom"].as()); - if (options.count("diskTo")) - config.setString("diskTo", options["diskTo"].as()); + if (options.count("disk-from")) + config.setString("disk-from", options["disk-from"].as()); + if (options.count("disk-to")) + config.setString("disk-to", options["disk-to"].as()); } void execute( @@ -47,8 +45,8 @@ public: throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Bad Arguments"); } - String disk_name_from = config.getString("diskFrom", config.getString("disk", "default")); - String disk_name_to = config.getString("diskTo", config.getString("disk", "default")); + String disk_name_from = config.getString("disk-from", config.getString("disk", "default")); + String disk_name_to = config.getString("disk-to", config.getString("disk", "default")); const String & path_from = command_arguments[0]; const String & path_to = command_arguments[1]; diff --git a/programs/disks/CommandLink.cpp b/programs/disks/CommandLink.cpp index 0e94eb87c04..357832865fb 100644 --- a/programs/disks/CommandLink.cpp +++ b/programs/disks/CommandLink.cpp @@ -15,7 +15,7 @@ public: CommandLink() { command_name = "link"; - description = "Create hardlink from `from_path` to `to_path`\nPath should be in format './' or './path' or 'path'"; + description = "Create hardlink from `from_path` to `to_path`"; usage = "link [OPTION]... 
"; } diff --git a/programs/disks/CommandList.cpp b/programs/disks/CommandList.cpp index 470784bff00..48b54b70014 100644 --- a/programs/disks/CommandList.cpp +++ b/programs/disks/CommandList.cpp @@ -17,11 +17,10 @@ public: { command_name = "list"; command_option_description.emplace(createOptionsDescription("Allowed options", getTerminalWidth())); - description = "List files (the default disk is used by default)\nPath should be in format './' or './path' or 'path'"; + description = "List files at path[s]"; usage = "list [OPTION]... ..."; command_option_description->add_options() - ("recursive", "recursively list all directories") - ; + ("recursive", "recursively list all directories"); } void processOptions( diff --git a/programs/disks/CommandMkDir.cpp b/programs/disks/CommandMkDir.cpp index c938cc52132..e5df982d896 100644 --- a/programs/disks/CommandMkDir.cpp +++ b/programs/disks/CommandMkDir.cpp @@ -18,11 +18,10 @@ public: { command_name = "mkdir"; command_option_description.emplace(createOptionsDescription("Allowed options", getTerminalWidth())); - description = "Create directory or directories recursively"; + description = "Create a directory"; usage = "mkdir [OPTION]... "; command_option_description->add_options() - ("recursive", "recursively create directories") - ; + ("recursive", "recursively create directories"); } void processOptions( diff --git a/programs/disks/CommandMove.cpp b/programs/disks/CommandMove.cpp index 3c564f3bcd3..654090b2138 100644 --- a/programs/disks/CommandMove.cpp +++ b/programs/disks/CommandMove.cpp @@ -15,7 +15,7 @@ public: CommandMove() { command_name = "move"; - description = "Move file or directory from `from_path` to `to_path`\nPath should be in format './' or './path' or 'path'"; + description = "Move file or directory from `from_path` to `to_path`"; usage = "move [OPTION]... "; } diff --git a/programs/disks/CommandRead.cpp b/programs/disks/CommandRead.cpp index 2dd5c191d10..b6cacdd2c61 100644 --- a/programs/disks/CommandRead.cpp +++ b/programs/disks/CommandRead.cpp @@ -20,11 +20,10 @@ public: { command_name = "read"; command_option_description.emplace(createOptionsDescription("Allowed options", getTerminalWidth())); - description = "read File `from_path` to `to_path` or to stdout\nPath should be in format './' or './path' or 'path'"; - usage = "read [OPTION]... \nor\nread [OPTION]... "; + description = "Read a file from `FROM_PATH` to `TO_PATH`"; + usage = "read [OPTION]... []"; command_option_description->add_options() - ("output", po::value(), "set path to file to which we are read") - ; + ("output", po::value(), "file to which we are reading, defaults to `stdout`"); } void processOptions( diff --git a/programs/disks/CommandWrite.cpp b/programs/disks/CommandWrite.cpp index b055c6f9343..d075daf3215 100644 --- a/programs/disks/CommandWrite.cpp +++ b/programs/disks/CommandWrite.cpp @@ -21,11 +21,10 @@ public: { command_name = "write"; command_option_description.emplace(createOptionsDescription("Allowed options", getTerminalWidth())); - description = "Write File `from_path` or stdin to `to_path`"; - usage = "write [OPTION]... \nor\nstdin | write [OPTION]... \nPath should be in format './' or './path' or 'path'"; + description = "Write a file from `FROM_PATH` to `TO_PATH`"; + usage = "write [OPTION]... 
[] "; command_option_description->add_options() - ("input", po::value(), "set path to file to which we are write") - ; + ("input", po::value(), "file from which we are reading, defaults to `stdin`"); } void processOptions( diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 5c522b678ef..f5814926914 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -948,48 +948,66 @@ int mainEntryClickHouseLocal(int argc, char ** argv) #if defined(FUZZING_MODE) +// linked from programs/main.cpp +bool isClickhouseApp(const std::string & app_suffix, std::vector & argv); + std::optional fuzz_app; extern "C" int LLVMFuzzerInitialize(int * pargc, char *** pargv) { - int & argc = *pargc; - char ** argv = *pargv; + std::vector argv(*pargv, *pargv + (*pargc + 1)); + + if (!isClickhouseApp("local", argv)) + { + std::cerr << "\033[31m" << "ClickHouse compiled in fuzzing mode, only clickhouse local is available." << "\033[0m" << std::endl; + exit(1); + } /// As a user you can add flags to clickhouse binary in fuzzing mode as follows - /// clickhouse -- + /// clickhouse local -- - /// Calculate the position of delimiter "--" that separates arguments - /// of clickhouse-local and libfuzzer - int pos_delim = argc; - for (int i = 0; i < argc; ++i) - { - if (strcmp(argv[i], "--") == 0) + char **p = &(*pargv)[1]; + + auto it = argv.begin() + 1; + for (; *it; ++it) + if (strcmp(*it, "--") == 0) { - pos_delim = i; + ++it; break; } - } + + while (*it) + if (strncmp(*it, "--", 2) != 0) + { + *(p++) = *it; + it = argv.erase(it); + } + else + ++it; + + *pargc = static_cast(p - &(*pargv)[0]); + *p = nullptr; /// Initialize clickhouse-local app fuzz_app.emplace(); - fuzz_app->init(pos_delim, argv); + fuzz_app->init(static_cast(argv.size() - 1), argv.data()); - /// We will leave clickhouse-local specific arguments as is, because libfuzzer will ignore - /// all keys starting with -- return 0; } extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) -try { - auto input = String(reinterpret_cast(data), size); - DB::FunctionGetFuzzerData::update(input); - fuzz_app->run(); + try + { + auto input = String(reinterpret_cast(data), size); + DB::FunctionGetFuzzerData::update(input); + fuzz_app->run(); + } + catch (...) + { + } + return 0; } -catch (...) -{ - return 1; -} #endif diff --git a/programs/main.cpp b/programs/main.cpp index 4af9e3a3067..5857e8d5ee4 100644 --- a/programs/main.cpp +++ b/programs/main.cpp @@ -165,26 +165,6 @@ int printHelp(int, char **) std::cerr << "clickhouse " << application.first << " [args] " << std::endl; return -1; } - -bool isClickhouseApp(const std::string & app_suffix, std::vector & argv) -{ - /// Use app if the first arg 'app' is passed (the arg should be quietly removed) - if (argv.size() >= 2) - { - auto first_arg = argv.begin() + 1; - - /// 'clickhouse --client ...' and 'clickhouse client ...' are Ok - if (*first_arg == "--" + app_suffix || *first_arg == app_suffix) - { - argv.erase(first_arg); - return true; - } - } - - /// Use app if clickhouse binary is run through symbolic link with name clickhouse-app - std::string app_name = "clickhouse-" + app_suffix; - return !argv.empty() && (app_name == argv[0] || endsWith(argv[0], "/" + app_name)); -} #endif @@ -351,7 +331,7 @@ struct Checker ; -#if !defined(USE_MUSL) +#if !defined(FUZZING_MODE) && !defined(USE_MUSL) /// NOTE: We will migrate to full static linking or our own dynamic loader to make this code obsolete. 
void checkHarmfulEnvironmentVariables(char ** argv) { @@ -407,6 +387,25 @@ void checkHarmfulEnvironmentVariables(char ** argv) } +bool isClickhouseApp(const std::string & app_suffix, std::vector & argv) +{ + /// Use app if the first arg 'app' is passed (the arg should be quietly removed) + if (argv.size() >= 2) + { + auto first_arg = argv.begin() + 1; + + /// 'clickhouse --client ...' and 'clickhouse client ...' are Ok + if (*first_arg == "--" + app_suffix || *first_arg == app_suffix) + { + argv.erase(first_arg); + return true; + } + } + + /// Use app if clickhouse binary is run through symbolic link with name clickhouse-app + std::string app_name = "clickhouse-" + app_suffix; + return !argv.empty() && (app_name == argv[0] || endsWith(argv[0], "/" + app_name)); +} /// Don't allow dlopen in the main ClickHouse binary, because it is harmful and insecure. /// We don't use it. But it can be used by some libraries for implementation of "plugins". diff --git a/programs/server/CMakeLists.txt b/programs/server/CMakeLists.txt index b8241afa1eb..81440b03690 100644 --- a/programs/server/CMakeLists.txt +++ b/programs/server/CMakeLists.txt @@ -30,3 +30,7 @@ endif() clickhouse_program_add(server) install(FILES config.xml users.xml DESTINATION "${CLICKHOUSE_ETC_DIR}/clickhouse-server" COMPONENT clickhouse) + +if (ENABLE_FUZZING) + add_subdirectory(fuzzers) +endif() diff --git a/programs/server/fuzzers/CMakeLists.txt b/programs/server/fuzzers/CMakeLists.txt new file mode 100644 index 00000000000..1f25fa3ceb3 --- /dev/null +++ b/programs/server/fuzzers/CMakeLists.txt @@ -0,0 +1,15 @@ +clickhouse_add_executable(tcp_protocol_fuzzer tcp_protocol_fuzzer.cpp ../Server.cpp ../MetricsTransmitter.cpp) + +set (TCP_PROTOCOL_FUZZER_LINK + PRIVATE + daemon + clickhouse_aggregate_functions + clickhouse_functions + clickhouse_table_functions +) + +if (TARGET ch_contrib::jemalloc) + list(APPEND TCP_PROTOCOL_FUZZER_LINK PRIVATE ch_contrib::jemalloc) +endif() + +target_link_libraries(tcp_protocol_fuzzer ${TCP_PROTOCOL_FUZZER_LINK}) diff --git a/programs/server/fuzzers/tcp_protocol_fuzzer.cpp b/programs/server/fuzzers/tcp_protocol_fuzzer.cpp new file mode 100644 index 00000000000..950ea09669a --- /dev/null +++ b/programs/server/fuzzers/tcp_protocol_fuzzer.cpp @@ -0,0 +1,119 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + + +int mainEntryClickHouseServer(int argc, char ** argv); + +static std::string clickhouse("clickhouse-server"); +static std::vector args{clickhouse.data()}; +static std::future main_app; + +static std::string s_host("0.0.0.0"); +static char * host = s_host.data(); +static int64_t port = 9000; + +using namespace std::chrono_literals; + +extern "C" +int LLVMFuzzerInitialize(int * argc, char ***argv) +{ + for (int i = 1; i < *argc; ++i) + { + if ((*argv)[i][0] == '-') + { + if ((*argv)[i][1] == '-') + args.push_back((*argv)[i]); + else + { + if (strncmp((*argv)[i], "-host=", 6) == 0) + { + host = (*argv)[i] + 6; + } + else if (strncmp((*argv)[i], "-port=", 6) == 0) + { + char * p_end = nullptr; + port = strtol((*argv)[i] + 6, &p_end, 10); + } + } + } + } + + args.push_back(nullptr); + + main_app = std::async(std::launch::async, mainEntryClickHouseServer, args.size() - 1, args.data()); + + while (!DB::Context::getGlobalContextInstance() || !DB::Context::getGlobalContextInstance()->isServerCompletelyStarted()) + { + std::this_thread::sleep_for(100ms); + if (main_app.wait_for(0s) == std::future_status::ready) + exit(-1); + } + + return 0; +} 
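+
+// The harness works in two phases: LLVMFuzzerInitialize above parses the optional
+// -host=/-port= arguments, forwards any "--"-prefixed options to the server, starts
+// clickhouse-server on a background thread via std::async, and blocks until the global
+// context reports the server is completely started (bailing out if the server exits first).
+// LLVMFuzzerTestOneInput below then opens a TCP connection to that server for every input,
+// sends the raw fuzz bytes as native-protocol traffic, and drains whatever the server replies.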
+ +extern "C" +int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) +{ + try + { + if (main_app.wait_for(0s) == std::future_status::ready) + return -1; + + if (size == 0) + return -1; + + Poco::Net::SocketAddress address(host, port); + Poco::Net::StreamSocket socket; + + socket.connectNB(address); + + Poco::Net::PollSet ps; + ps.add(socket, Poco::Net::PollSet::POLL_READ | Poco::Net::PollSet::POLL_WRITE); + + std::vector buf(1048576); + size_t sent = 0; + while (true) + { + auto m = ps.poll(Poco::Timespan(1000000)); + if (m.empty()) + continue; + if (m.begin()->second & Poco::Net::PollSet::POLL_READ) + { + if (int n = socket.receiveBytes(buf.data(), static_cast(buf.size())); n == 0) + { + socket.close(); + break; + } + + continue; + } + + if (sent < size && m.begin()->second & Poco::Net::PollSet::POLL_WRITE) + { + sent += socket.sendBytes(data + sent, static_cast(size - sent)); + if (sent == size) + { + socket.shutdownSend(); + continue; + } + } + } + } + catch (...) + { + } + + return 0; +} diff --git a/src/AggregateFunctions/fuzzers/CMakeLists.txt b/src/AggregateFunctions/fuzzers/CMakeLists.txt index 3876ffac7ab..907a275b4b3 100644 --- a/src/AggregateFunctions/fuzzers/CMakeLists.txt +++ b/src/AggregateFunctions/fuzzers/CMakeLists.txt @@ -1,2 +1,2 @@ clickhouse_add_executable(aggregate_function_state_deserialization_fuzzer aggregate_function_state_deserialization_fuzzer.cpp ${SRCS}) -target_link_libraries(aggregate_function_state_deserialization_fuzzer PRIVATE dbms clickhouse_aggregate_functions ${LIB_FUZZING_ENGINE}) +target_link_libraries(aggregate_function_state_deserialization_fuzzer PRIVATE dbms clickhouse_aggregate_functions) diff --git a/src/AggregateFunctions/fuzzers/aggregate_function_state_deserialization_fuzzer.cpp b/src/AggregateFunctions/fuzzers/aggregate_function_state_deserialization_fuzzer.cpp index 2ea01e1d5bc..290da81944d 100644 --- a/src/AggregateFunctions/fuzzers/aggregate_function_state_deserialization_fuzzer.cpp +++ b/src/AggregateFunctions/fuzzers/aggregate_function_state_deserialization_fuzzer.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -16,68 +17,69 @@ #include extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) -try { - using namespace DB; - - static SharedContextHolder shared_context; - static ContextMutablePtr context; - - auto initialize = [&]() mutable + try { - shared_context = Context::createShared(); - context = Context::createGlobal(shared_context.get()); - context->makeGlobalContext(); - context->setApplicationType(Context::ApplicationType::LOCAL); + using namespace DB; - MainThreadStatus::getInstance(); + static SharedContextHolder shared_context; + static ContextMutablePtr context; - registerAggregateFunctions(); - return true; - }; + auto initialize = [&]() mutable + { + shared_context = Context::createShared(); + context = Context::createGlobal(shared_context.get()); + context->makeGlobalContext(); + context->setApplicationType(Context::ApplicationType::LOCAL); - static bool initialized = initialize(); - (void) initialized; + MainThreadStatus::getInstance(); - total_memory_tracker.resetCounters(); - total_memory_tracker.setHardLimit(1_GiB); - CurrentThread::get().memory_tracker.resetCounters(); - CurrentThread::get().memory_tracker.setHardLimit(1_GiB); + registerAggregateFunctions(); + return true; + }; - /// The input format is as follows: - /// - the aggregate function name on the first line, possible with parameters, then data types of the arguments, - /// example: quantile(0.5), Float64 - /// 
- the serialized aggregation state for the rest of the input. + static bool initialized = initialize(); + (void) initialized; - /// Compile the code as follows: - /// mkdir build_asan_fuzz - /// cd build_asan_fuzz - /// CC=clang CXX=clang++ cmake -D SANITIZE=address -D ENABLE_FUZZING=1 -D WITH_COVERAGE=1 .. - /// - /// The corpus is located here: - /// https://github.com/ClickHouse/fuzz-corpus/tree/main/aggregate_function_state_deserialization - /// - /// The fuzzer can be run as follows: - /// ../../../build_asan_fuzz/src/DataTypes/fuzzers/aggregate_function_state_deserialization corpus -jobs=64 -rss_limit_mb=8192 + total_memory_tracker.resetCounters(); + total_memory_tracker.setHardLimit(1_GiB); + CurrentThread::get().memory_tracker.resetCounters(); + CurrentThread::get().memory_tracker.setHardLimit(1_GiB); - DB::ReadBufferFromMemory in(data, size); + /// The input format is as follows: + /// - the aggregate function name on the first line, possible with parameters, then data types of the arguments, + /// example: quantile(0.5), Float64 + /// - the serialized aggregation state for the rest of the input. - String args; - readStringUntilNewlineInto(args, in); - assertChar('\n', in); + /// Compile the code as follows: + /// mkdir build_asan_fuzz + /// cd build_asan_fuzz + /// CC=clang CXX=clang++ cmake -D SANITIZE=address -D ENABLE_FUZZING=1 -D WITH_COVERAGE=1 .. + /// + /// The corpus is located here: + /// https://github.com/ClickHouse/fuzz-corpus/tree/main/aggregate_function_state_deserialization + /// + /// The fuzzer can be run as follows: + /// ../../../build_asan_fuzz/src/DataTypes/fuzzers/aggregate_function_state_deserialization corpus -jobs=64 -rss_limit_mb=8192 - DataTypePtr type = DataTypeFactory::instance().get(fmt::format("AggregateFunction({})", args)); - AggregateFunctionPtr func = assert_cast(*type).getFunction(); + DB::ReadBufferFromMemory in(data, size); - Arena arena; - char * place = arena.alignedAlloc(func->sizeOfData(), func->alignOfData()); - func->create(place); - SCOPE_EXIT(func->destroy(place)); - func->deserialize(place, in, {}, &arena); + String args; + readStringUntilNewlineInto(args, in); + assertChar('\n', in); + + DataTypePtr type = DataTypeFactory::instance().get(fmt::format("AggregateFunction({})", args)); + AggregateFunctionPtr func = assert_cast(*type).getFunction(); + + Arena arena; + char * place = arena.alignedAlloc(func->sizeOfData(), func->alignOfData()); + func->create(place); + SCOPE_EXIT(func->destroy(place)); + func->deserialize(place, in, {}, &arena); + } + catch (...) + { + } return 0; } -catch (...) -{ - return 1; -} diff --git a/src/Analyzer/Passes/UniqToCountPass.cpp b/src/Analyzer/Passes/UniqToCountPass.cpp new file mode 100644 index 00000000000..4373918a8cc --- /dev/null +++ b/src/Analyzer/Passes/UniqToCountPass.cpp @@ -0,0 +1,195 @@ +#include "UniqToCountPass.h" + +#include +#include + +#include +#include +#include +#include + +namespace DB +{ + +namespace +{ + +bool matchFnUniq(String func_name) +{ + auto name = Poco::toLower(func_name); + return name == "uniq" || name == "uniqHLL12" || name == "uniqExact" || name == "uniqTheta" || name == "uniqCombined" + || name == "uniqCombined64"; +} + +/// Extract the corresponding projection columns for group by node list. 
+/// For example: +/// SELECT a as aa, any(b) FROM table group by a; -> aa(ColumnNode) +NamesAndTypes extractProjectionColumnsForGroupBy(const QueryNode * query_node) +{ + if (!query_node->hasGroupBy()) + return {}; + + NamesAndTypes result; + for (const auto & group_by_ele : query_node->getGroupByNode()->getChildren()) + { + const auto & projection_columns = query_node->getProjectionColumns(); + const auto & projection_nodes = query_node->getProjection().getNodes(); + + assert(projection_columns.size() == projection_nodes.size()); + + for (size_t i = 0; i < projection_columns.size(); i++) + { + if (projection_nodes[i]->isEqual(*group_by_ele)) + result.push_back(projection_columns[i]); + } + } + return result; +} + +/// Whether query_columns equals subquery_columns. +/// query_columns: query columns from query +/// subquery_columns: projection columns from subquery +bool nodeListEquals(const QueryTreeNodes & query_columns, const NamesAndTypes & subquery_columns) +{ + if (query_columns.size() != subquery_columns.size()) + return false; + + for (const auto & query_column : query_columns) + { + auto find = std::find_if( + subquery_columns.begin(), + subquery_columns.end(), + [&](const auto & subquery_column) -> bool + { + if (auto * column_node = query_column->as()) + { + return subquery_column == column_node->getColumn(); + } + return false; + }); + + if (find == subquery_columns.end()) + return false; + } + return true; +} + +/// Whether subquery_columns contains all columns in subquery_columns. +/// query_columns: query columns from query +/// subquery_columns: projection columns from subquery +bool nodeListContainsAll(const QueryTreeNodes & query_columns, const NamesAndTypes & subquery_columns) +{ + if (query_columns.size() > subquery_columns.size()) + return false; + + for (const auto & query_column : query_columns) + { + auto find = std::find_if( + subquery_columns.begin(), + subquery_columns.end(), + [&](const auto & subquery_column) -> bool + { + if (auto * column_node = query_column->as()) + { + return subquery_column == column_node->getColumn(); + } + return false; + }); + + if (find == subquery_columns.end()) + return false; + } + return true; +} + +} + +class UniqToCountVisitor : public InDepthQueryTreeVisitorWithContext +{ +public: + using Base = InDepthQueryTreeVisitorWithContext; + using Base::Base; + + void enterImpl(QueryTreeNodePtr & node) + { + if (!getSettings().optimize_uniq_to_count) + return; + + auto * query_node = node->as(); + if (!query_node) + return; + + /// Check that query has only single table expression which is subquery + auto * subquery_node = query_node->getJoinTree()->as(); + if (!subquery_node) + return; + + /// Check that query has only single node in projection + auto & projection_nodes = query_node->getProjection().getNodes(); + if (projection_nodes.size() != 1) + return; + + /// Check that projection_node is a function + auto & projection_node = projection_nodes[0]; + auto * function_node = projection_node->as(); + if (!function_node) + return; + + /// Check that query single projection node is `uniq` or its variants + if (!matchFnUniq(function_node->getFunctionName())) + return; + + auto & uniq_arguments_nodes = function_node->getArguments().getNodes(); + + /// Whether query matches 'SELECT uniq(x ...) 
FROM (SELECT DISTINCT x ...)' + auto match_subquery_with_distinct = [&]() -> bool + { + if (!subquery_node->isDistinct()) + return false; + + /// uniq expression list == subquery projection columns + if (!nodeListEquals(uniq_arguments_nodes, subquery_node->getProjectionColumns())) + return false; + + return true; + }; + + /// Whether query matches 'SELECT uniq(x ...) FROM (SELECT x ... GROUP BY x ...)' + auto match_subquery_with_group_by = [&]() -> bool + { + if (!subquery_node->hasGroupBy()) + return false; + + /// uniq argument node list == subquery group by node list + auto group_by_columns = extractProjectionColumnsForGroupBy(subquery_node); + + if (!nodeListEquals(uniq_arguments_nodes, group_by_columns)) + return false; + + /// subquery projection columns must contain all columns in uniq argument node list + if (!nodeListContainsAll(uniq_arguments_nodes, subquery_node->getProjectionColumns())) + return false; + + return true; + }; + + /// Replace uniq of initial query to count + if (match_subquery_with_distinct() || match_subquery_with_group_by()) + { + AggregateFunctionProperties properties; + auto aggregate_function = AggregateFunctionFactory::instance().get("count", {}, {}, properties); + + function_node->getArguments().getNodes().clear(); + function_node->resolveAsAggregateFunction(std::move(aggregate_function)); + } + } +}; + + +void UniqToCountPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +{ + UniqToCountVisitor visitor(context); + visitor.visit(query_tree_node); +} + +} diff --git a/src/Analyzer/Passes/UniqToCountPass.h b/src/Analyzer/Passes/UniqToCountPass.h new file mode 100644 index 00000000000..4992d524e5e --- /dev/null +++ b/src/Analyzer/Passes/UniqToCountPass.h @@ -0,0 +1,30 @@ +#pragma once + +#include + +namespace DB +{ + +/** Optimize `uniq` and its variants(except uniqUpTo) into `count` over subquery. + * Example: 'SELECT uniq(x ...) FROM (SELECT DISTINCT x ...)' to + * Result: 'SELECT count() FROM (SELECT DISTINCT x ...)' + * + * Example: 'SELECT uniq(x ...) FROM (SELECT x ... GROUP BY x ...)' to + * Result: 'SELECT count() FROM (SELECT x ... GROUP BY x ...)' + * + * Note that we can rewrite all uniq variants except uniqUpTo. 
+ */ +class UniqToCountPass final : public IQueryTreePass +{ +public: + String getName() override { return "UniqToCount"; } + + String getDescription() override + { + return "Rewrite uniq and its variants(except uniqUpTo) to count if subquery has distinct or group by clause."; + } + + void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; +}; + +} diff --git a/src/Analyzer/QueryTreePassManager.cpp b/src/Analyzer/QueryTreePassManager.cpp index cd3abd9593e..2e4a32bddf6 100644 --- a/src/Analyzer/QueryTreePassManager.cpp +++ b/src/Analyzer/QueryTreePassManager.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -247,6 +248,7 @@ void addQueryTreePasses(QueryTreePassManager & manager) manager.addPass(std::make_unique()); manager.addPass(std::make_unique()); + manager.addPass(std::make_unique()); manager.addPass(std::make_unique()); manager.addPass(std::make_unique()); manager.addPass(std::make_unique()); diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 19244134617..fc116346946 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -1147,7 +1147,18 @@ void ClientBase::onEndOfStream() bool is_running = false; output_format->setStartTime( clock_gettime_ns(CLOCK_MONOTONIC) - static_cast(progress_indication.elapsedSeconds() * 1000000000), is_running); - output_format->finalize(); + + try + { + output_format->finalize(); + } + catch (...) + { + /// Format should be reset to make it work for subsequent query + /// (otherwise it will throw again in resetOutput()) + output_format.reset(); + throw; + } } resetOutput(); diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index d1b3388b2fb..8aa36bb9349 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -1013,8 +1013,8 @@ Packet Connection::receivePacket() case Protocol::Server::ReadTaskRequest: return res; - case Protocol::Server::MergeTreeAllRangesAnnounecement: - res.announcement = receiveInitialParallelReadAnnounecement(); + case Protocol::Server::MergeTreeAllRangesAnnouncement: + res.announcement = receiveInitialParallelReadAnnouncement(); return res; case Protocol::Server::MergeTreeReadTaskRequest: @@ -1181,7 +1181,7 @@ ParallelReadRequest Connection::receiveParallelReadRequest() const return ParallelReadRequest::deserialize(*in); } -InitialAllRangesAnnouncement Connection::receiveInitialParallelReadAnnounecement() const +InitialAllRangesAnnouncement Connection::receiveInitialParallelReadAnnouncement() const { return InitialAllRangesAnnouncement::deserialize(*in); } diff --git a/src/Client/Connection.h b/src/Client/Connection.h index f4daf8e3aeb..fcd009fa389 100644 --- a/src/Client/Connection.h +++ b/src/Client/Connection.h @@ -276,7 +276,7 @@ private: std::unique_ptr receiveException() const; Progress receiveProgress() const; ParallelReadRequest receiveParallelReadRequest() const; - InitialAllRangesAnnouncement receiveInitialParallelReadAnnounecement() const; + InitialAllRangesAnnouncement receiveInitialParallelReadAnnouncement() const; ProfileInfo receiveProfileInfo() const; void initInputBuffers(); diff --git a/src/Client/MultiplexedConnections.cpp b/src/Client/MultiplexedConnections.cpp index 1a26c4609c7..c0218568d67 100644 --- a/src/Client/MultiplexedConnections.cpp +++ b/src/Client/MultiplexedConnections.cpp @@ -260,7 +260,7 @@ Packet MultiplexedConnections::drain() switch (packet.type) { case Protocol::Server::TimezoneUpdate: - case Protocol::Server::MergeTreeAllRangesAnnounecement: + case 
Protocol::Server::MergeTreeAllRangesAnnouncement: case Protocol::Server::MergeTreeReadTaskRequest: case Protocol::Server::ReadTaskRequest: case Protocol::Server::PartUUIDs: @@ -339,7 +339,7 @@ Packet MultiplexedConnections::receivePacketUnlocked(AsyncCallback async_callbac switch (packet.type) { case Protocol::Server::TimezoneUpdate: - case Protocol::Server::MergeTreeAllRangesAnnounecement: + case Protocol::Server::MergeTreeAllRangesAnnouncement: case Protocol::Server::MergeTreeReadTaskRequest: case Protocol::Server::ReadTaskRequest: case Protocol::Server::PartUUIDs: diff --git a/src/Common/IntervalTree.h b/src/Common/IntervalTree.h index 1543275233a..fbd1de3197e 100644 --- a/src/Common/IntervalTree.h +++ b/src/Common/IntervalTree.h @@ -119,7 +119,7 @@ public: return true; } - template + template requires(!std::is_same_v) ALWAYS_INLINE bool emplace(Interval interval, Args &&... args) { diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index 886522687bd..5220aba1d27 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -1516,7 +1516,7 @@ bool ZooKeeper::hasReachedDeadline() const void ZooKeeper::maybeInjectSendFault() { if (unlikely(inject_setup.test() && send_inject_fault && send_inject_fault.value()(thread_local_rng))) - throw Exception::fromMessage(Error::ZSESSIONEXPIRED, "Session expired (fault injected on recv)"); + throw Exception::fromMessage(Error::ZSESSIONEXPIRED, "Session expired (fault injected on send)"); } void ZooKeeper::maybeInjectRecvFault() diff --git a/src/Compression/fuzzers/CMakeLists.txt b/src/Compression/fuzzers/CMakeLists.txt index 6c0e36afdf7..33a0c2d78ce 100644 --- a/src/Compression/fuzzers/CMakeLists.txt +++ b/src/Compression/fuzzers/CMakeLists.txt @@ -5,16 +5,16 @@ # If you want really small size of the resulted binary, just link with fuzz_compression and clickhouse_common_io clickhouse_add_executable (compressed_buffer_fuzzer compressed_buffer_fuzzer.cpp) -target_link_libraries (compressed_buffer_fuzzer PRIVATE dbms ${LIB_FUZZING_ENGINE}) +target_link_libraries (compressed_buffer_fuzzer PRIVATE dbms) clickhouse_add_executable (lz4_decompress_fuzzer lz4_decompress_fuzzer.cpp) -target_link_libraries (lz4_decompress_fuzzer PUBLIC dbms ch_contrib::lz4 ${LIB_FUZZING_ENGINE}) +target_link_libraries (lz4_decompress_fuzzer PUBLIC dbms ch_contrib::lz4) clickhouse_add_executable (delta_decompress_fuzzer delta_decompress_fuzzer.cpp) -target_link_libraries (delta_decompress_fuzzer PRIVATE dbms ${LIB_FUZZING_ENGINE}) +target_link_libraries (delta_decompress_fuzzer PRIVATE dbms) clickhouse_add_executable (double_delta_decompress_fuzzer double_delta_decompress_fuzzer.cpp) -target_link_libraries (double_delta_decompress_fuzzer PRIVATE dbms ${LIB_FUZZING_ENGINE}) +target_link_libraries (double_delta_decompress_fuzzer PRIVATE dbms) clickhouse_add_executable (encrypted_decompress_fuzzer encrypted_decompress_fuzzer.cpp) -target_link_libraries (encrypted_decompress_fuzzer PRIVATE dbms ${LIB_FUZZING_ENGINE}) +target_link_libraries (encrypted_decompress_fuzzer PRIVATE dbms) diff --git a/src/Compression/fuzzers/compressed_buffer_fuzzer.cpp b/src/Compression/fuzzers/compressed_buffer_fuzzer.cpp index bdab11ef8ab..aea749900db 100644 --- a/src/Compression/fuzzers/compressed_buffer_fuzzer.cpp +++ b/src/Compression/fuzzers/compressed_buffer_fuzzer.cpp @@ -4,17 +4,18 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) -try { - DB::ReadBufferFromMemory from(data, size); - 
DB::CompressedReadBuffer in{from}; + try + { + DB::ReadBufferFromMemory from(data, size); + DB::CompressedReadBuffer in{from}; - while (!in.eof()) - in.next(); + while (!in.eof()) + in.next(); + } + catch (...) + { + } return 0; } -catch (...) -{ - return 1; -} diff --git a/src/Compression/fuzzers/delta_decompress_fuzzer.cpp b/src/Compression/fuzzers/delta_decompress_fuzzer.cpp index eaef1d1896c..267c6014461 100644 --- a/src/Compression/fuzzers/delta_decompress_fuzzer.cpp +++ b/src/Compression/fuzzers/delta_decompress_fuzzer.cpp @@ -15,29 +15,30 @@ struct AuxiliaryRandomData }; extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) -try { - if (size < sizeof(AuxiliaryRandomData)) - return 0; + try + { + if (size < sizeof(AuxiliaryRandomData)) + return 0; - const auto * p = reinterpret_cast(data); - auto codec = DB::getCompressionCodecDelta(p->delta_size_bytes); + const auto * p = reinterpret_cast(data); + auto codec = DB::getCompressionCodecDelta(p->delta_size_bytes); - size_t output_buffer_size = p->decompressed_size % 65536; - size -= sizeof(AuxiliaryRandomData); - data += sizeof(AuxiliaryRandomData) / sizeof(uint8_t); + size_t output_buffer_size = p->decompressed_size % 65536; + size -= sizeof(AuxiliaryRandomData); + data += sizeof(AuxiliaryRandomData) / sizeof(uint8_t); - // std::string input = std::string(reinterpret_cast(data), size); - // fmt::print(stderr, "Using input {} of size {}, output size is {}. \n", input, size, output_buffer_size); + // std::string input = std::string(reinterpret_cast(data), size); + // fmt::print(stderr, "Using input {} of size {}, output size is {}. \n", input, size, output_buffer_size); - DB::Memory<> memory; - memory.resize(output_buffer_size + codec->getAdditionalSizeAtTheEndOfBuffer()); + DB::Memory<> memory; + memory.resize(output_buffer_size + codec->getAdditionalSizeAtTheEndOfBuffer()); - codec->doDecompressData(reinterpret_cast(data), size, memory.data(), output_buffer_size); + codec->doDecompressData(reinterpret_cast(data), static_cast(size), memory.data(), static_cast(output_buffer_size)); + } + catch (...) + { + } return 0; } -catch (...) -{ - return 1; -} diff --git a/src/Compression/fuzzers/double_delta_decompress_fuzzer.cpp b/src/Compression/fuzzers/double_delta_decompress_fuzzer.cpp index c32120bacbf..22d89a1282b 100644 --- a/src/Compression/fuzzers/double_delta_decompress_fuzzer.cpp +++ b/src/Compression/fuzzers/double_delta_decompress_fuzzer.cpp @@ -15,29 +15,30 @@ struct AuxiliaryRandomData }; extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) -try { - if (size < sizeof(AuxiliaryRandomData)) - return 0; + try + { + if (size < sizeof(AuxiliaryRandomData)) + return 0; - const auto * p = reinterpret_cast(data); - auto codec = DB::getCompressionCodecDoubleDelta(p->data_bytes_size); + const auto * p = reinterpret_cast(data); + auto codec = DB::getCompressionCodecDoubleDelta(p->data_bytes_size); - size_t output_buffer_size = p->decompressed_size % 65536; - size -= sizeof(AuxiliaryRandomData); - data += sizeof(AuxiliaryRandomData) / sizeof(uint8_t); + size_t output_buffer_size = p->decompressed_size % 65536; + size -= sizeof(AuxiliaryRandomData); + data += sizeof(AuxiliaryRandomData) / sizeof(uint8_t); - // std::string input = std::string(reinterpret_cast(data), size); - // fmt::print(stderr, "Using input {} of size {}, output size is {}. 
\n", input, size, output_buffer_size); + // std::string input = std::string(reinterpret_cast(data), size); + // fmt::print(stderr, "Using input {} of size {}, output size is {}. \n", input, size, output_buffer_size); - DB::Memory<> memory; - memory.resize(output_buffer_size + codec->getAdditionalSizeAtTheEndOfBuffer()); + DB::Memory<> memory; + memory.resize(output_buffer_size + codec->getAdditionalSizeAtTheEndOfBuffer()); - codec->doDecompressData(reinterpret_cast(data), size, memory.data(), output_buffer_size); + codec->doDecompressData(reinterpret_cast(data), static_cast(size), memory.data(), static_cast(output_buffer_size)); + } + catch (...) + { + } return 0; } -catch (...) -{ - return 1; -} diff --git a/src/Compression/fuzzers/encrypted_decompress_fuzzer.cpp b/src/Compression/fuzzers/encrypted_decompress_fuzzer.cpp index eb95c83e1d7..4131ab43c59 100644 --- a/src/Compression/fuzzers/encrypted_decompress_fuzzer.cpp +++ b/src/Compression/fuzzers/encrypted_decompress_fuzzer.cpp @@ -271,33 +271,35 @@ void XMLGenerator::generate() extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) -try { - XMLGenerator generator(data, size); + try + { + XMLGenerator generator(data, size); - generator.generate(); - if (generator.hasError()) - return 0; + generator.generate(); + if (generator.hasError()) + return 0; - auto config = generator.getResult(); - auto codec_128 = getCompressionCodecEncrypted(DB::AES_128_GCM_SIV); - auto codec_256 = getCompressionCodecEncrypted(DB::AES_256_GCM_SIV); - DB::CompressionCodecEncrypted::Configuration::instance().tryLoad(*config, ""); + auto config = generator.getResult(); + auto codec_128 = getCompressionCodecEncrypted(DB::AES_128_GCM_SIV); + auto codec_256 = getCompressionCodecEncrypted(DB::AES_256_GCM_SIV); + DB::CompressionCodecEncrypted::Configuration::instance().tryLoad(*config, ""); - size_t data_size = size - generator.keySize(); + size_t data_size = size - generator.keySize(); - std::string input = std::string(reinterpret_cast(data), data_size); - fmt::print(stderr, "Using input {} of size {}, output size is {}. \n", input, data_size, input.size() - 31); + std::string input = std::string(reinterpret_cast(data), data_size); + fmt::print(stderr, "Using input {} of size {}, output size is {}. \n", input, data_size, input.size() - 31); - DB::Memory<> memory; - memory.resize(input.size() + codec_128->getAdditionalSizeAtTheEndOfBuffer()); - codec_128->doDecompressData(input.data(), input.size(), memory.data(), input.size() - 31); + DB::Memory<> memory; + memory.resize(input.size() + codec_128->getAdditionalSizeAtTheEndOfBuffer()); + codec_128->doDecompressData(input.data(), static_cast(input.size()), memory.data(), static_cast(input.size()) - 31); + + memory.resize(input.size() + codec_128->getAdditionalSizeAtTheEndOfBuffer()); + codec_256->doDecompressData(input.data(), static_cast(input.size()), memory.data(), static_cast(input.size()) - 31); + } + catch (...) + { + } - memory.resize(input.size() + codec_128->getAdditionalSizeAtTheEndOfBuffer()); - codec_256->doDecompressData(input.data(), input.size(), memory.data(), input.size() - 31); return 0; } -catch (...) 
-{ - return 1; -} diff --git a/src/Compression/fuzzers/lz4_decompress_fuzzer.cpp b/src/Compression/fuzzers/lz4_decompress_fuzzer.cpp index f03fc716c2c..e2275990b72 100644 --- a/src/Compression/fuzzers/lz4_decompress_fuzzer.cpp +++ b/src/Compression/fuzzers/lz4_decompress_fuzzer.cpp @@ -16,31 +16,31 @@ struct AuxiliaryRandomData }; extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) -try { + try + { + if (size < sizeof(AuxiliaryRandomData) + LZ4::ADDITIONAL_BYTES_AT_END_OF_BUFFER) + return 0; - if (size < sizeof(AuxiliaryRandomData) + LZ4::ADDITIONAL_BYTES_AT_END_OF_BUFFER) - return 0; + const auto * p = reinterpret_cast(data); + auto codec = DB::getCompressionCodecLZ4(static_cast(p->level)); - const auto * p = reinterpret_cast(data); - auto codec = DB::getCompressionCodecLZ4(p->level); + size_t output_buffer_size = p->decompressed_size % 65536; + size -= sizeof(AuxiliaryRandomData); + size -= LZ4::ADDITIONAL_BYTES_AT_END_OF_BUFFER; + data += sizeof(AuxiliaryRandomData) / sizeof(uint8_t); - size_t output_buffer_size = p->decompressed_size % 65536; - size -= sizeof(AuxiliaryRandomData); - size -= LZ4::ADDITIONAL_BYTES_AT_END_OF_BUFFER; - data += sizeof(AuxiliaryRandomData) / sizeof(uint8_t); + // std::string input = std::string(reinterpret_cast(data), size); + // fmt::print(stderr, "Using input {} of size {}, output size is {}. \n", input, size, output_buffer_size); - // std::string input = std::string(reinterpret_cast(data), size); - // fmt::print(stderr, "Using input {} of size {}, output size is {}. \n", input, size, output_buffer_size); + DB::Memory<> memory; + memory.resize(output_buffer_size + LZ4::ADDITIONAL_BYTES_AT_END_OF_BUFFER); - DB::Memory<> memory; - memory.resize(output_buffer_size + LZ4::ADDITIONAL_BYTES_AT_END_OF_BUFFER); - - codec->doDecompressData(reinterpret_cast(data), size, memory.data(), output_buffer_size); + codec->doDecompressData(reinterpret_cast(data), static_cast(size), memory.data(), static_cast(output_buffer_size)); + } + catch (...) + { + } return 0; } -catch (...) -{ - return 1; -} diff --git a/src/Core/Protocol.h b/src/Core/Protocol.h index 97a2831ffe8..7f2b3cfa26a 100644 --- a/src/Core/Protocol.h +++ b/src/Core/Protocol.h @@ -81,7 +81,7 @@ namespace Protocol /// This is such an inverted logic, where server sends requests /// And client returns back response ProfileEvents = 14, /// Packet with profile events from server. - MergeTreeAllRangesAnnounecement = 15, + MergeTreeAllRangesAnnouncement = 15, MergeTreeReadTaskRequest = 16, /// Request from a MergeTree replica to a coordinator TimezoneUpdate = 17, /// Receive server's (session-wide) default timezone MAX = TimezoneUpdate, @@ -110,7 +110,7 @@ namespace Protocol "PartUUIDs", "ReadTaskRequest", "ProfileEvents", - "MergeTreeAllRangesAnnounecement", + "MergeTreeAllRangesAnnouncement", "MergeTreeReadTaskRequest", "TimezoneUpdate", }; diff --git a/src/Core/Settings.h b/src/Core/Settings.h index a8bb200c8ca..c969ef187c4 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -775,6 +775,7 @@ class IColumn; M(Bool, function_json_value_return_type_allow_nullable, false, "Allow function JSON_VALUE to return nullable type.", 0) \ M(Bool, function_json_value_return_type_allow_complex, false, "Allow function JSON_VALUE to return complex type, such as: struct, array, map.", 0) \ M(Bool, use_with_fill_by_sorting_prefix, true, "Columns preceding WITH FILL columns in ORDER BY clause form sorting prefix. 
Rows with different values in sorting prefix are filled independently", 0) \ + M(Bool, optimize_uniq_to_count, true, "Rewrite uniq and its variants(except uniqUpTo) to count if subquery has distinct or group by clause.", 0) \ \ /** Experimental functions */ \ M(Bool, allow_experimental_funnel_functions, false, "Enable experimental functions for funnel analysis.", 0) \ diff --git a/src/Core/fuzzers/CMakeLists.txt b/src/Core/fuzzers/CMakeLists.txt index 269217392e7..51db6fa0b53 100644 --- a/src/Core/fuzzers/CMakeLists.txt +++ b/src/Core/fuzzers/CMakeLists.txt @@ -1,2 +1,2 @@ clickhouse_add_executable (names_and_types_fuzzer names_and_types_fuzzer.cpp) -target_link_libraries (names_and_types_fuzzer PRIVATE dbms ${LIB_FUZZING_ENGINE}) +target_link_libraries (names_and_types_fuzzer PRIVATE dbms) diff --git a/src/Core/fuzzers/names_and_types_fuzzer.cpp b/src/Core/fuzzers/names_and_types_fuzzer.cpp index 94f0872fff6..6fdd8703014 100644 --- a/src/Core/fuzzers/names_and_types_fuzzer.cpp +++ b/src/Core/fuzzers/names_and_types_fuzzer.cpp @@ -3,15 +3,16 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) -try { - DB::ReadBufferFromMemory in(data, size); - DB::NamesAndTypesList res; - res.readText(in); + try + { + DB::ReadBufferFromMemory in(data, size); + DB::NamesAndTypesList res; + res.readText(in); + } + catch (...) + { + } return 0; } -catch (...) -{ - return 1; -} diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index 774b86472be..15203bdc9fa 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -189,7 +189,7 @@ void SerializationNullable::serializeBinary(const IColumn & column, size_t row_n /// Deserialize value into ColumnNullable. /// We need to insert both to nested column and to null byte map, or, in case of exception, to not insert at all. -template +template requires std::same_as static ReturnType safeDeserialize(IColumn & column, const ISerialization &, CheckForNull && check_for_null, DeserializeNested && deserialize_nested) @@ -217,7 +217,7 @@ safeDeserialize(IColumn & column, const ISerialization &, CheckForNull && check_ } /// Deserialize value into non-nullable column. In case of NULL, insert default value and return false. 
-template +template requires std::same_as static ReturnType safeDeserialize(IColumn & column, const ISerialization &, CheckForNull && check_for_null, DeserializeNested && deserialize_nested) diff --git a/src/DataTypes/fuzzers/CMakeLists.txt b/src/DataTypes/fuzzers/CMakeLists.txt index d9c19cb7d01..939bf5f5e3f 100644 --- a/src/DataTypes/fuzzers/CMakeLists.txt +++ b/src/DataTypes/fuzzers/CMakeLists.txt @@ -1,2 +1,2 @@ clickhouse_add_executable(data_type_deserialization_fuzzer data_type_deserialization_fuzzer.cpp ${SRCS}) -target_link_libraries(data_type_deserialization_fuzzer PRIVATE dbms clickhouse_aggregate_functions ${LIB_FUZZING_ENGINE}) +target_link_libraries(data_type_deserialization_fuzzer PRIVATE dbms clickhouse_aggregate_functions) diff --git a/src/DataTypes/fuzzers/data_type_deserialization_fuzzer.cpp b/src/DataTypes/fuzzers/data_type_deserialization_fuzzer.cpp index 31e4c470ee7..e40734e0a57 100644 --- a/src/DataTypes/fuzzers/data_type_deserialization_fuzzer.cpp +++ b/src/DataTypes/fuzzers/data_type_deserialization_fuzzer.cpp @@ -14,69 +14,70 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) -try { - using namespace DB; - - static SharedContextHolder shared_context; - static ContextMutablePtr context; - - auto initialize = [&]() mutable + try { - shared_context = Context::createShared(); - context = Context::createGlobal(shared_context.get()); - context->makeGlobalContext(); - context->setApplicationType(Context::ApplicationType::LOCAL); + using namespace DB; - MainThreadStatus::getInstance(); + static SharedContextHolder shared_context; + static ContextMutablePtr context; - registerAggregateFunctions(); - return true; - }; + auto initialize = [&]() mutable + { + shared_context = Context::createShared(); + context = Context::createGlobal(shared_context.get()); + context->makeGlobalContext(); + context->setApplicationType(Context::ApplicationType::LOCAL); - static bool initialized = initialize(); - (void) initialized; + MainThreadStatus::getInstance(); - total_memory_tracker.resetCounters(); - total_memory_tracker.setHardLimit(1_GiB); - CurrentThread::get().memory_tracker.resetCounters(); - CurrentThread::get().memory_tracker.setHardLimit(1_GiB); + registerAggregateFunctions(); + return true; + }; - /// The input format is as follows: - /// - data type name on the first line, - /// - the data for the rest of the input. + static bool initialized = initialize(); + (void) initialized; - /// Compile the code as follows: - /// mkdir build_asan_fuzz - /// cd build_asan_fuzz - /// CC=clang CXX=clang++ cmake -D SANITIZE=address -D ENABLE_FUZZING=1 -D WITH_COVERAGE=1 .. - /// - /// The corpus is located here: - /// https://github.com/ClickHouse/fuzz-corpus/tree/main/data_type_deserialization - /// - /// The fuzzer can be run as follows: - /// ../../../build_asan_fuzz/src/DataTypes/fuzzers/data_type_deserialization_fuzzer corpus -jobs=64 -rss_limit_mb=8192 + total_memory_tracker.resetCounters(); + total_memory_tracker.setHardLimit(1_GiB); + CurrentThread::get().memory_tracker.resetCounters(); + CurrentThread::get().memory_tracker.setHardLimit(1_GiB); - /// clickhouse-local --query "SELECT toJSONString(*) FROM (SELECT name FROM system.functions UNION ALL SELECT name FROM system.data_type_families)" > dictionary + /// The input format is as follows: + /// - data type name on the first line, + /// - the data for the rest of the input. 
- DB::ReadBufferFromMemory in(data, size); + /// Compile the code as follows: + /// mkdir build_asan_fuzz + /// cd build_asan_fuzz + /// CC=clang CXX=clang++ cmake -D SANITIZE=address -D ENABLE_FUZZING=1 -D WITH_COVERAGE=1 .. + /// + /// The corpus is located here: + /// https://github.com/ClickHouse/fuzz-corpus/tree/main/data_type_deserialization + /// + /// The fuzzer can be run as follows: + /// ../../../build_asan_fuzz/src/DataTypes/fuzzers/data_type_deserialization_fuzzer corpus -jobs=64 -rss_limit_mb=8192 - String data_type; - readStringUntilNewlineInto(data_type, in); - assertChar('\n', in); + /// clickhouse-local --query "SELECT toJSONString(*) FROM (SELECT name FROM system.functions UNION ALL SELECT name FROM system.data_type_families)" > dictionary - DataTypePtr type = DataTypeFactory::instance().get(data_type); + DB::ReadBufferFromMemory in(data, size); - FormatSettings settings; - settings.max_binary_string_size = 100; - settings.max_binary_array_size = 100; + String data_type; + readStringUntilNewlineInto(data_type, in); + assertChar('\n', in); - Field field; - type->getDefaultSerialization()->deserializeBinary(field, in, settings); + DataTypePtr type = DataTypeFactory::instance().get(data_type); + + FormatSettings settings; + settings.max_binary_string_size = 100; + settings.max_binary_array_size = 100; + + Field field; + type->getDefaultSerialization()->deserializeBinary(field, in, settings); + } + catch (...) + { + } return 0; } -catch (...) -{ - return 1; -} diff --git a/src/Formats/fuzzers/CMakeLists.txt b/src/Formats/fuzzers/CMakeLists.txt index 984823f3360..38009aeec1d 100644 --- a/src/Formats/fuzzers/CMakeLists.txt +++ b/src/Formats/fuzzers/CMakeLists.txt @@ -1,2 +1,2 @@ clickhouse_add_executable(format_fuzzer format_fuzzer.cpp ${SRCS}) -target_link_libraries(format_fuzzer PRIVATE dbms clickhouse_aggregate_functions ${LIB_FUZZING_ENGINE}) +target_link_libraries(format_fuzzer PRIVATE dbms clickhouse_aggregate_functions) diff --git a/src/Formats/fuzzers/format_fuzzer.cpp b/src/Formats/fuzzers/format_fuzzer.cpp index e84d0913d0d..583d1173a01 100644 --- a/src/Formats/fuzzers/format_fuzzer.cpp +++ b/src/Formats/fuzzers/format_fuzzer.cpp @@ -22,112 +22,113 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) -try { - using namespace DB; - - static SharedContextHolder shared_context; - static ContextMutablePtr context; - - auto initialize = [&]() mutable + try { - shared_context = Context::createShared(); - context = Context::createGlobal(shared_context.get()); - context->makeGlobalContext(); - context->setApplicationType(Context::ApplicationType::LOCAL); + using namespace DB; - MainThreadStatus::getInstance(); + static SharedContextHolder shared_context; + static ContextMutablePtr context; - registerAggregateFunctions(); - registerFormats(); + auto initialize = [&]() mutable + { + shared_context = Context::createShared(); + context = Context::createGlobal(shared_context.get()); + context->makeGlobalContext(); + context->setApplicationType(Context::ApplicationType::LOCAL); - return true; - }; + MainThreadStatus::getInstance(); - static bool initialized = initialize(); - (void) initialized; + registerAggregateFunctions(); + registerFormats(); - total_memory_tracker.resetCounters(); - total_memory_tracker.setHardLimit(1_GiB); - CurrentThread::get().memory_tracker.resetCounters(); - CurrentThread::get().memory_tracker.setHardLimit(1_GiB); + return true; + }; - /// The input format is as follows: - /// - format name on the first line, - /// - table 
structure on the second line, - /// - the data for the rest of the input. + static bool initialized = initialize(); + (void) initialized; - /** The corpus was generated as follows: + total_memory_tracker.resetCounters(); + total_memory_tracker.setHardLimit(1_GiB); + CurrentThread::get().memory_tracker.resetCounters(); + CurrentThread::get().memory_tracker.setHardLimit(1_GiB); - i=0; find ../../../../tests/queries -name '*.sql' | - xargs -I{} bash -c "tr '\n' ' ' <{}; echo" | - rg -o -i 'CREATE TABLE\s+\w+\s+\(.+?\) ENGINE' | - sed -r -e 's/CREATE TABLE\s+\w+\s+\((.+?)\) ENGINE/\1/i' | sort | uniq | - while read line; do - i=$((i+1)); - clickhouse-local --query "SELECT name FROM system.formats ORDER BY rand() LIMIT 1" >> $i; - echo "$line" >> $i; - echo $RANDOM >> $i; - echo $i; + /// The input format is as follows: + /// - format name on the first line, + /// - table structure on the second line, + /// - the data for the rest of the input. + + /** The corpus was generated as follows: + + i=0; find ../../../../tests/queries -name '*.sql' | + xargs -I{} bash -c "tr '\n' ' ' <{}; echo" | + rg -o -i 'CREATE TABLE\s+\w+\s+\(.+?\) ENGINE' | + sed -r -e 's/CREATE TABLE\s+\w+\s+\((.+?)\) ENGINE/\1/i' | sort | uniq | + while read line; do + i=$((i+1)); + clickhouse-local --query "SELECT name FROM system.formats ORDER BY rand() LIMIT 1" >> $i; + echo "$line" >> $i; + echo $RANDOM >> $i; + echo $i; + done + */ + + /** And: + + for format in $(clickhouse-client --query "SELECT name FROM system.formats WHERE is_output"); do + echo $format; + echo $format >> $format; + echo "WatchID Int64, JavaEnable Int16, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID Int32, ClientIP Int32, RegionID Int32, UserID Int64, CounterClass Int16, OS Int16, UserAgent Int16, URL String, Referer String, IsRefresh Int16, RefererCategoryID Int16, RefererRegionID Int32, URLCategoryID Int16, URLRegionID Int32, ResolutionWidth Int16, ResolutionHeight Int16, ResolutionDepth Int16, FlashMajor Int16, FlashMinor Int16, FlashMinor2 String, NetMajor Int16, NetMinor Int16, UserAgentMajor Int16, UserAgentMinor String, CookieEnable Int16, JavascriptEnable Int16, IsMobile Int16, MobilePhone Int16, MobilePhoneModel String, Params String, IPNetworkID Int32, TraficSourceID Int16, SearchEngineID Int16, SearchPhrase String, AdvEngineID Int16, IsArtifical Int16, WindowClientWidth Int16, WindowClientHeight Int16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 Int16, SilverlightVersion2 Int16, SilverlightVersion3 Int32, SilverlightVersion4 Int16, PageCharset String, CodeVersion Int32, IsLink Int16, IsDownload Int16, IsNotBounce Int16, FUniqID Int64, OriginalURL String, HID Int32, IsOldCounter Int16, IsEvent Int16, IsParameter Int16, DontCountHits Int16, WithHash Int16, HitColor String, LocalEventTime DateTime, Age Int16, Sex Int16, Income Int16, Interests Int16, Robotness Int16, RemoteIP Int32, WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage String, BrowserCountry String, SocialNetwork String, SocialAction String, HTTPError Int16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, SocialSourceNetworkID Int16, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency String, ParamCurrencyID Int16, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm 
String, FromTag String, HasGCLID Int16, RefererHash Int64, URLHash Int64, CLID Int32" >> $format; + clickhouse-client --query "SELECT * FROM hits LIMIT 10 FORMAT $format" >> $format || rm $format; done - */ - /** And: + */ - for format in $(clickhouse-client --query "SELECT name FROM system.formats WHERE is_output"); do - echo $format; - echo $format >> $format; - echo "WatchID Int64, JavaEnable Int16, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID Int32, ClientIP Int32, RegionID Int32, UserID Int64, CounterClass Int16, OS Int16, UserAgent Int16, URL String, Referer String, IsRefresh Int16, RefererCategoryID Int16, RefererRegionID Int32, URLCategoryID Int16, URLRegionID Int32, ResolutionWidth Int16, ResolutionHeight Int16, ResolutionDepth Int16, FlashMajor Int16, FlashMinor Int16, FlashMinor2 String, NetMajor Int16, NetMinor Int16, UserAgentMajor Int16, UserAgentMinor String, CookieEnable Int16, JavascriptEnable Int16, IsMobile Int16, MobilePhone Int16, MobilePhoneModel String, Params String, IPNetworkID Int32, TraficSourceID Int16, SearchEngineID Int16, SearchPhrase String, AdvEngineID Int16, IsArtifical Int16, WindowClientWidth Int16, WindowClientHeight Int16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 Int16, SilverlightVersion2 Int16, SilverlightVersion3 Int32, SilverlightVersion4 Int16, PageCharset String, CodeVersion Int32, IsLink Int16, IsDownload Int16, IsNotBounce Int16, FUniqID Int64, OriginalURL String, HID Int32, IsOldCounter Int16, IsEvent Int16, IsParameter Int16, DontCountHits Int16, WithHash Int16, HitColor String, LocalEventTime DateTime, Age Int16, Sex Int16, Income Int16, Interests Int16, Robotness Int16, RemoteIP Int32, WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage String, BrowserCountry String, SocialNetwork String, SocialAction String, HTTPError Int16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, SocialSourceNetworkID Int16, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency String, ParamCurrencyID Int16, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID Int16, RefererHash Int64, URLHash Int64, CLID Int32" >> $format; - clickhouse-client --query "SELECT * FROM hits LIMIT 10 FORMAT $format" >> $format || rm $format; - done + /// Compile the code as follows: + /// mkdir build_asan_fuzz + /// cd build_asan_fuzz + /// CC=clang CXX=clang++ cmake -D SANITIZE=address -D ENABLE_FUZZING=1 -D WITH_COVERAGE=1 .. + /// + /// The corpus is located here: + /// https://github.com/ClickHouse/fuzz-corpus/tree/main/format_fuzzer + /// + /// The fuzzer can be run as follows: + /// ../../../build_asan_fuzz/src/Formats/fuzzers/format_fuzzer corpus -jobs=64 -rss_limit_mb=8192 - */ + DB::ReadBufferFromMemory in(data, size); - /// Compile the code as follows: - /// mkdir build_asan_fuzz - /// cd build_asan_fuzz - /// CC=clang CXX=clang++ cmake -D SANITIZE=address -D ENABLE_FUZZING=1 -D WITH_COVERAGE=1 .. 
- /// - /// The corpus is located here: - /// https://github.com/ClickHouse/fuzz-corpus/tree/main/format_fuzzer - /// - /// The fuzzer can be run as follows: - /// ../../../build_asan_fuzz/src/Formats/fuzzers/format_fuzzer corpus -jobs=64 -rss_limit_mb=8192 + String format; + readStringUntilNewlineInto(format, in); + assertChar('\n', in); - DB::ReadBufferFromMemory in(data, size); + String structure; + readStringUntilNewlineInto(structure, in); + assertChar('\n', in); - String format; - readStringUntilNewlineInto(format, in); - assertChar('\n', in); + ColumnsDescription description = parseColumnsListFromString(structure, context); + auto columns_info = description.getOrdinary(); - String structure; - readStringUntilNewlineInto(structure, in); - assertChar('\n', in); + Block header; + for (const auto & info : columns_info) + { + ColumnWithTypeAndName column; + column.name = info.name; + column.type = info.type; + column.column = column.type->createColumn(); + header.insert(std::move(column)); + } - ColumnsDescription description = parseColumnsListFromString(structure, context); - auto columns_info = description.getOrdinary(); + InputFormatPtr input_format = context->getInputFormat(format, in, header, 13 /* small block size */); - Block header; - for (const auto & info : columns_info) - { - ColumnWithTypeAndName column; - column.name = info.name; - column.type = info.type; - column.column = column.type->createColumn(); - header.insert(std::move(column)); + QueryPipeline pipeline(Pipe(std::move(input_format))); + PullingPipelineExecutor executor(pipeline); + Block res; + while (executor.pull(res)) + ; + } + catch (...) + { } - - InputFormatPtr input_format = context->getInputFormat(format, in, header, 13 /* small block size */); - - QueryPipeline pipeline(Pipe(std::move(input_format))); - PullingPipelineExecutor executor(pipeline); - Block res; - while (executor.pull(res)) - ; return 0; } -catch (...) -{ - return 1; -} diff --git a/src/Functions/FunctionsHashing.h b/src/Functions/FunctionsHashing.h index 3c0fd2f5555..345b9a11e0d 100644 --- a/src/Functions/FunctionsHashing.h +++ b/src/Functions/FunctionsHashing.h @@ -594,14 +594,14 @@ struct JavaHashImpl static_cast(x) ^ static_cast(static_cast(x) >> 32)); } - template + template requires std::same_as || std::same_as || std::same_as static ReturnType apply(T x) { return x; } - template + template requires(!std::same_as && !std::same_as && !std::same_as) static ReturnType apply(T x) { diff --git a/src/Functions/TransformDateTime64.h b/src/Functions/TransformDateTime64.h index 8484846ddc5..896e9d8ca48 100644 --- a/src/Functions/TransformDateTime64.h +++ b/src/Functions/TransformDateTime64.h @@ -89,7 +89,7 @@ public: } template - requires (!std::same_as) + requires(!std::same_as) inline auto execute(const T & t, Args &&... 
args) const { return wrapped_transform.execute(t, std::forward(args)...); diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index ca731b11dbd..c3601ee9d90 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -2741,6 +2741,8 @@ zkutil::ZooKeeperPtr Context::getZooKeeper() const Stopwatch watch; LOG_DEBUG(shared->log, "Trying to establish a new connection with ZooKeeper"); shared->zookeeper = shared->zookeeper->startNewSession(); + if (isServerCompletelyStarted()) + shared->zookeeper->setServerCompletelyStarted(); LOG_DEBUG(shared->log, "Establishing a new connection with ZooKeeper took {} ms", watch.elapsedMilliseconds()); } diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index de2d34162a8..cc0f2bf7283 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -421,6 +422,12 @@ InterpreterSelectQuery::InterpreterSelectQuery( RewriteCountDistinctFunctionVisitor(data_rewrite_countdistinct).visit(query_ptr); } + if (settings.optimize_uniq_to_count) + { + RewriteUniqToCountMatcher::Data data_rewrite_uniq_count; + RewriteUniqToCountVisitor(data_rewrite_uniq_count).visit(query_ptr); + } + JoinedTables joined_tables(getSubqueryContext(context), getSelectQuery(), options.with_all_cols, options_.is_create_parameterized_view); bool got_storage_from_query = false; diff --git a/src/Interpreters/RewriteUniqToCountVisitor.cpp b/src/Interpreters/RewriteUniqToCountVisitor.cpp new file mode 100644 index 00000000000..7445068207a --- /dev/null +++ b/src/Interpreters/RewriteUniqToCountVisitor.cpp @@ -0,0 +1,163 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +using Aliases = std::unordered_map; + +namespace +{ + +bool matchFnUniq(String func_name) +{ + auto name = Poco::toLower(func_name); + return name == "uniq" || name == "uniqHLL12" || name == "uniqExact" || name == "uniqTheta" || name == "uniqCombined" + || name == "uniqCombined64"; +} + +bool expressionEquals(const ASTPtr & lhs, const ASTPtr & rhs, const Aliases & alias) +{ + if (lhs->getTreeHash() == rhs->getTreeHash()) + { + return true; + } + else + { + auto * lhs_idf = lhs->as(); + auto * rhs_idf = rhs->as(); + if (lhs_idf && rhs_idf) + { + /// compound identifiers, such as: + if (lhs_idf->shortName() == rhs_idf->shortName()) + return true; + + /// translate alias + if (alias.find(lhs_idf->shortName()) != alias.end()) + lhs_idf = alias.find(lhs_idf->shortName())->second->as(); + + if (alias.find(rhs_idf->shortName()) != alias.end()) + rhs_idf = alias.find(rhs_idf->shortName())->second->as(); + + if (lhs_idf->shortName() == rhs_idf->shortName()) + return true; + } + } + return false; +} + +bool expressionListEquals(ASTExpressionList * lhs, ASTExpressionList * rhs, const Aliases & alias) +{ + if (!lhs || !rhs) + return false; + if (lhs->children.size() != rhs->children.size()) + return false; + for (size_t i = 0; i < lhs->children.size(); i++) + { + if (!expressionEquals(lhs->children[i], rhs->children[i], alias)) + return false; + } + return true; +} + +/// Test whether lhs contains all expressions in rhs. 
+bool expressionListContainsAll(ASTExpressionList * lhs, ASTExpressionList * rhs, const Aliases & alias) +{ + if (!lhs || !rhs) + return false; + if (lhs->children.size() < rhs->children.size()) + return false; + for (const auto & re : rhs->children) + { + auto predicate = [&re, &alias](ASTPtr & le) { return expressionEquals(le, re, alias); }; + if (std::find_if(lhs->children.begin(), lhs->children.end(), predicate) == lhs->children.end()) + return false; + } + return true; +} + +} + +void RewriteUniqToCountMatcher::visit(ASTPtr & ast, Data & /*data*/) +{ + auto * selectq = ast->as(); + if (!selectq || !selectq->tables() || selectq->tables()->children.size() != 1) + return; + auto expr_list = selectq->select(); + if (!expr_list || expr_list->children.size() != 1) + return; + auto * func = expr_list->children[0]->as(); + if (!func || !matchFnUniq(func->name)) + return; + if (selectq->tables()->as()->children[0]->as()->children.size() != 1) + return; + auto * table_expr = selectq->tables() + ->as() + ->children[0] + ->as() + ->children[0] + ->as(); + if (!table_expr || table_expr->children.size() != 1 || !table_expr->subquery) + return; + auto * subquery = table_expr->subquery->as(); + if (!subquery) + return; + auto * sub_selectq = subquery->children[0] + ->as()->children[0] + ->as()->children[0] + ->as(); + if (!sub_selectq) + return; + auto sub_expr_list = sub_selectq->select(); + if (!sub_expr_list) + return; + + /// collect subquery select expressions alias + Aliases alias; + for (const auto & expr : sub_expr_list->children) + { + if (!expr->tryGetAlias().empty()) + alias.insert({expr->tryGetAlias(), expr}); + } + + /// Whether query matches 'SELECT uniq(x ...) FROM (SELECT DISTINCT x ...)' + auto match_subquery_with_distinct = [&]() -> bool + { + if (!sub_selectq->distinct) + return false; + /// uniq expression list == subquery group by expression list + if (!expressionListEquals(func->children[0]->as(), sub_expr_list->as(), alias)) + return false; + return true; + }; + + /// Whether query matches 'SELECT uniq(x ...) FROM (SELECT x ... GROUP BY x ...)' + auto match_subquery_with_group_by = [&]() -> bool + { + auto group_by = sub_selectq->groupBy(); + if (!group_by) + return false; + /// uniq expression list == subquery group by expression list + if (!expressionListEquals(func->children[0]->as(), group_by->as(), alias)) + return false; + /// subquery select expression list must contain all columns in uniq expression list + if (!expressionListContainsAll(sub_expr_list->as(), func->children[0]->as(), alias)) + return false; + return true; + }; + + if (match_subquery_with_distinct() || match_subquery_with_group_by()) + expr_list->children[0] = makeASTFunction("count"); +} + +} diff --git a/src/Interpreters/RewriteUniqToCountVisitor.h b/src/Interpreters/RewriteUniqToCountVisitor.h new file mode 100644 index 00000000000..94528ccf2ee --- /dev/null +++ b/src/Interpreters/RewriteUniqToCountVisitor.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include +#include "Interpreters/TreeRewriter.h" + +namespace DB +{ + +class ASTFunction; + +/** Optimize `uniq` into `count` over subquery. + * Example: 'SELECT uniq(x ...) FROM (SELECT DISTINCT x ...)' to + * Result: 'SELECT count() FROM (SELECT DISTINCT x ...)' + * + * Example: 'SELECT uniq(x ...) FROM (SELECT x ... GROUP BY x ...)' to + * Result: 'SELECT count() FROM (SELECT x ... GROUP BY x ...)' + * + * Note that we can rewrite all uniq variants except uniqUpTo. 
+ */ +class RewriteUniqToCountMatcher +{ +public: + struct Data {}; + static void visit(ASTPtr & ast, Data &); + static bool needChildVisit(const ASTPtr &, const ASTPtr &) { return true; } +}; + +using RewriteUniqToCountVisitor = InDepthNodeVisitor; +} diff --git a/src/Interpreters/fuzzers/CMakeLists.txt b/src/Interpreters/fuzzers/CMakeLists.txt index 8e301470de2..4ac002d3d4a 100644 --- a/src/Interpreters/fuzzers/CMakeLists.txt +++ b/src/Interpreters/fuzzers/CMakeLists.txt @@ -5,5 +5,4 @@ target_link_libraries(execute_query_fuzzer PRIVATE clickhouse_table_functions clickhouse_aggregate_functions clickhouse_dictionaries - clickhouse_dictionaries_embedded - ${LIB_FUZZING_ENGINE}) + clickhouse_dictionaries_embedded) diff --git a/src/Interpreters/fuzzers/execute_query_fuzzer.cpp b/src/Interpreters/fuzzers/execute_query_fuzzer.cpp index 284e780ed1f..f12c01120cf 100644 --- a/src/Interpreters/fuzzers/execute_query_fuzzer.cpp +++ b/src/Interpreters/fuzzers/execute_query_fuzzer.cpp @@ -13,43 +13,44 @@ using namespace DB; extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) -try { - std::string input = std::string(reinterpret_cast(data), size); - - static SharedContextHolder shared_context; - static ContextMutablePtr context; - - auto initialize = [&]() mutable + try { - shared_context = Context::createShared(); - context = Context::createGlobal(shared_context.get()); - context->makeGlobalContext(); - context->setApplicationType(Context::ApplicationType::LOCAL); + std::string input = std::string(reinterpret_cast(data), size); - registerFunctions(); - registerAggregateFunctions(); - registerTableFunctions(); - registerStorages(); - registerDictionaries(); - registerDisks(/* global_skip_access_check= */ true); - registerFormats(); + static SharedContextHolder shared_context; + static ContextMutablePtr context; - return true; - }; + auto initialize = [&]() mutable + { + shared_context = Context::createShared(); + context = Context::createGlobal(shared_context.get()); + context->makeGlobalContext(); + context->setApplicationType(Context::ApplicationType::LOCAL); - static bool initialized = initialize(); - (void) initialized; + registerFunctions(); + registerAggregateFunctions(); + registerTableFunctions(); + registerStorages(); + registerDictionaries(); + registerDisks(/* global_skip_access_check= */ true); + registerFormats(); - auto io = DB::executeQuery(input, context, true, QueryProcessingStage::Complete); + return true; + }; - PullingPipelineExecutor executor(io.pipeline); - Block res; - while (!res && executor.pull(res)); + static bool initialized = initialize(); + (void) initialized; + + auto io = DB::executeQuery(input, context, true, QueryProcessingStage::Complete); + + PullingPipelineExecutor executor(io.pipeline); + Block res; + while (!res && executor.pull(res)); + } + catch (...) + { + } return 0; } -catch (...) 
-{ - return 1; -} diff --git a/src/Parsers/fuzzers/CMakeLists.txt b/src/Parsers/fuzzers/CMakeLists.txt index c3aa21e2a04..903319d733c 100644 --- a/src/Parsers/fuzzers/CMakeLists.txt +++ b/src/Parsers/fuzzers/CMakeLists.txt @@ -1,11 +1,11 @@ clickhouse_add_executable(lexer_fuzzer lexer_fuzzer.cpp ${SRCS}) -target_link_libraries(lexer_fuzzer PRIVATE clickhouse_parsers ${LIB_FUZZING_ENGINE}) +target_link_libraries(lexer_fuzzer PRIVATE clickhouse_parsers) clickhouse_add_executable(select_parser_fuzzer select_parser_fuzzer.cpp ${SRCS}) -target_link_libraries(select_parser_fuzzer PRIVATE clickhouse_parsers ${LIB_FUZZING_ENGINE}) +target_link_libraries(select_parser_fuzzer PRIVATE clickhouse_parsers dbms) clickhouse_add_executable(create_parser_fuzzer create_parser_fuzzer.cpp ${SRCS}) -target_link_libraries(create_parser_fuzzer PRIVATE clickhouse_parsers ${LIB_FUZZING_ENGINE}) +target_link_libraries(create_parser_fuzzer PRIVATE clickhouse_parsers dbms) add_subdirectory(codegen_fuzzer) diff --git a/src/Parsers/fuzzers/codegen_fuzzer/CMakeLists.txt b/src/Parsers/fuzzers/codegen_fuzzer/CMakeLists.txt index 727c49cfc4d..20fd951d390 100644 --- a/src/Parsers/fuzzers/codegen_fuzzer/CMakeLists.txt +++ b/src/Parsers/fuzzers/codegen_fuzzer/CMakeLists.txt @@ -41,5 +41,10 @@ clickhouse_add_executable(codegen_select_fuzzer ${FUZZER_SRCS}) set_source_files_properties("${PROTO_SRCS}" "out.cpp" PROPERTIES COMPILE_FLAGS "-Wno-reserved-identifier") +# contrib/libprotobuf-mutator/src/libfuzzer/libfuzzer_macro.h:143:44: error: no newline at end of file [-Werror,-Wnewline-eof] +target_compile_options (codegen_select_fuzzer PRIVATE -Wno-newline-eof) + +target_link_libraries(protoc ch_contrib::fuzzer) + target_include_directories(codegen_select_fuzzer SYSTEM BEFORE PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") -target_link_libraries(codegen_select_fuzzer PRIVATE ch_contrib::protobuf_mutator ch_contrib::protoc dbms ${LIB_FUZZING_ENGINE}) +target_link_libraries(codegen_select_fuzzer PRIVATE ch_contrib::protobuf_mutator ch_contrib::protoc dbms) diff --git a/src/Parsers/fuzzers/create_parser_fuzzer.cpp b/src/Parsers/fuzzers/create_parser_fuzzer.cpp index 13cb1dfd36e..854885ad33b 100644 --- a/src/Parsers/fuzzers/create_parser_fuzzer.cpp +++ b/src/Parsers/fuzzers/create_parser_fuzzer.cpp @@ -8,27 +8,28 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) -try { - std::string input = std::string(reinterpret_cast(data), size); + try + { + std::string input = std::string(reinterpret_cast(data), size); - DB::ParserCreateQuery parser; - DB::ASTPtr ast = parseQuery(parser, input.data(), input.data() + input.size(), "", 0, 1000); + DB::ParserCreateQuery parser; + DB::ASTPtr ast = parseQuery(parser, input.data(), input.data() + input.size(), "", 0, 1000); - const UInt64 max_ast_depth = 1000; - ast->checkDepth(max_ast_depth); + const UInt64 max_ast_depth = 1000; + ast->checkDepth(max_ast_depth); - const UInt64 max_ast_elements = 50000; - ast->checkSize(max_ast_elements); + const UInt64 max_ast_elements = 50000; + ast->checkSize(max_ast_elements); - DB::WriteBufferFromOwnString wb; - DB::formatAST(*ast, wb); + DB::WriteBufferFromOwnString wb; + DB::formatAST(*ast, wb); - std::cerr << wb.str() << std::endl; + std::cerr << wb.str() << std::endl; + } + catch (...) + { + } return 0; } -catch (...) 
-{ - return 1; -} diff --git a/src/Parsers/fuzzers/lexer_fuzzer.cpp b/src/Parsers/fuzzers/lexer_fuzzer.cpp index d7dd2cfe970..0f9471a8e4f 100644 --- a/src/Parsers/fuzzers/lexer_fuzzer.cpp +++ b/src/Parsers/fuzzers/lexer_fuzzer.cpp @@ -8,21 +8,27 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) { - DB::String query; - DB::ReadBufferFromMemory in(data, size); - readStringUntilEOF(query, in); - - DB::Lexer lexer(query.data(), query.data() + query.size()); - - while (true) + try { - DB::Token token = lexer.nextToken(); + DB::String query; + DB::ReadBufferFromMemory in(data, size); + readStringUntilEOF(query, in); - if (token.isEnd()) - break; + DB::Lexer lexer(query.data(), query.data() + query.size()); - if (token.isError()) - return 1; + while (true) + { + DB::Token token = lexer.nextToken(); + + if (token.isEnd()) + break; + + if (token.isError()) + return 0; + } + } + catch (...) + { } return 0; diff --git a/src/Parsers/fuzzers/select_parser_fuzzer.cpp b/src/Parsers/fuzzers/select_parser_fuzzer.cpp index 3f712834c55..ae490ed4e56 100644 --- a/src/Parsers/fuzzers/select_parser_fuzzer.cpp +++ b/src/Parsers/fuzzers/select_parser_fuzzer.cpp @@ -7,29 +7,30 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) -try { - std::string input = std::string(reinterpret_cast(data), size); + try + { + std::string input = std::string(reinterpret_cast(data), size); - DB::ParserQueryWithOutput parser(input.data() + input.size()); + DB::ParserQueryWithOutput parser(input.data() + input.size()); - const UInt64 max_parser_depth = 1000; - DB::ASTPtr ast = parseQuery(parser, input.data(), input.data() + input.size(), "", 0, max_parser_depth); + const UInt64 max_parser_depth = 1000; + DB::ASTPtr ast = parseQuery(parser, input.data(), input.data() + input.size(), "", 0, max_parser_depth); - const UInt64 max_ast_depth = 1000; - ast->checkDepth(max_ast_depth); + const UInt64 max_ast_depth = 1000; + ast->checkDepth(max_ast_depth); - const UInt64 max_ast_elements = 50000; - ast->checkSize(max_ast_elements); + const UInt64 max_ast_elements = 50000; + ast->checkSize(max_ast_elements); - DB::WriteBufferFromOwnString wb; - DB::formatAST(*ast, wb); + DB::WriteBufferFromOwnString wb; + DB::formatAST(*ast, wb); - std::cerr << wb.str() << std::endl; + std::cerr << wb.str() << std::endl; + } + catch (...) + { + } return 0; } -catch (...) 
-{ - return 1; -} diff --git a/src/Processors/Executors/PipelineExecutor.cpp b/src/Processors/Executors/PipelineExecutor.cpp index 4dd65def123..37af391fba3 100644 --- a/src/Processors/Executors/PipelineExecutor.cpp +++ b/src/Processors/Executors/PipelineExecutor.cpp @@ -206,6 +206,27 @@ void PipelineExecutor::finalizeExecution() all_processors_finished = false; break; } + else if (node->processor && read_progress_callback) + { + /// Some executors might have reported progress as part of their finish() call + /// For example, when reading from parallel replicas the coordinator will cancel the queries as soon as it + /// enough data (on LIMIT), but as the progress report is asynchronous it might not be reported until the + /// connection is cancelled and all packets drained + /// To cover these cases we check if there is any pending progress in the processors to report + if (auto read_progress = node->processor->getReadProgress()) + { + if (read_progress->counters.total_rows_approx) + read_progress_callback->addTotalRowsApprox(read_progress->counters.total_rows_approx); + + if (read_progress->counters.total_bytes) + read_progress_callback->addTotalBytes(read_progress->counters.total_bytes); + + /// We are finalizing the execution, so no need to call onProgress if there is nothing to report + if (read_progress->counters.read_rows || read_progress->counters.read_bytes) + read_progress_callback->onProgress( + read_progress->counters.read_rows, read_progress->counters.read_bytes, read_progress->limits); + } + } } if (!all_processors_finished) diff --git a/src/Processors/ISource.cpp b/src/Processors/ISource.cpp index 6a88d3973a1..68749c47453 100644 --- a/src/Processors/ISource.cpp +++ b/src/Processors/ISource.cpp @@ -66,12 +66,14 @@ void ISource::progress(size_t read_rows, size_t read_bytes) { //std::cerr << "========= Progress " << read_rows << " from " << getName() << std::endl << StackTrace().toString() << std::endl; read_progress_was_set = true; + std::lock_guard lock(read_progress_mutex); read_progress.read_rows += read_rows; read_progress.read_bytes += read_bytes; } std::optional ISource::getReadProgress() { + std::lock_guard lock(read_progress_mutex); if (finished && read_progress.read_bytes == 0 && read_progress.total_rows_approx == 0) return {}; @@ -85,6 +87,18 @@ std::optional ISource::getReadProgress() return ReadProgress{res_progress, empty_limits}; } +void ISource::addTotalRowsApprox(size_t value) +{ + std::lock_guard lock(read_progress_mutex); + read_progress.total_rows_approx += value; +} + +void ISource::addTotalBytes(size_t value) +{ + std::lock_guard lock(read_progress_mutex); + read_progress.total_bytes += value; +} + void ISource::work() { try diff --git a/src/Processors/ISource.h b/src/Processors/ISource.h index 2593a241c63..767a73d0924 100644 --- a/src/Processors/ISource.h +++ b/src/Processors/ISource.h @@ -2,6 +2,9 @@ #include +#include +#include + namespace DB { @@ -9,8 +12,9 @@ namespace DB class ISource : public IProcessor { private: + std::mutex read_progress_mutex; ReadProgressCounters read_progress; - bool read_progress_was_set = false; + std::atomic_bool read_progress_was_set = false; bool auto_progress; protected: @@ -42,8 +46,8 @@ public: /// Default implementation for all the sources. 
std::optional getReadProgress() final; - void addTotalRowsApprox(size_t value) { read_progress.total_rows_approx += value; } - void addTotalBytes(size_t value) { read_progress.total_bytes += value; } + void addTotalRowsApprox(size_t value); + void addTotalBytes(size_t value); }; using SourcePtr = std::shared_ptr; diff --git a/src/QueryPipeline/RemoteQueryExecutor.cpp b/src/QueryPipeline/RemoteQueryExecutor.cpp index eebe9797051..b834870c334 100644 --- a/src/QueryPipeline/RemoteQueryExecutor.cpp +++ b/src/QueryPipeline/RemoteQueryExecutor.cpp @@ -444,9 +444,9 @@ RemoteQueryExecutor::ReadResult RemoteQueryExecutor::processPacket(Packet packet processMergeTreeReadTaskRequest(packet.request.value()); return ReadResult(ReadResult::Type::ParallelReplicasToken); - case Protocol::Server::MergeTreeAllRangesAnnounecement: + case Protocol::Server::MergeTreeAllRangesAnnouncement: chassert(packet.announcement.has_value()); - processMergeTreeInitialReadAnnounecement(packet.announcement.value()); + processMergeTreeInitialReadAnnouncement(packet.announcement.value()); return ReadResult(ReadResult::Type::ParallelReplicasToken); case Protocol::Server::ReadTaskRequest: @@ -568,7 +568,7 @@ void RemoteQueryExecutor::processMergeTreeReadTaskRequest(ParallelReadRequest re connections->sendMergeTreeReadTaskResponse(response); } -void RemoteQueryExecutor::processMergeTreeInitialReadAnnounecement(InitialAllRangesAnnouncement announcement) +void RemoteQueryExecutor::processMergeTreeInitialReadAnnouncement(InitialAllRangesAnnouncement announcement) { if (!extension || !extension->parallel_reading_coordinator) throw Exception(ErrorCodes::LOGICAL_ERROR, "Coordinator for parallel reading from replicas is not initialized"); @@ -602,39 +602,51 @@ void RemoteQueryExecutor::finish() return; /// Get the remaining packets so that there is no out of sync in the connections to the replicas. 
- Packet packet = connections->drain(); - switch (packet.type) + /// We do this manually instead of calling drain() because we want to process Log, ProfileEvents and Progress + /// packets that had been sent before the connection is fully finished in order to have final statistics of what + /// was executed in the remote queries + while (connections->hasActiveConnections() && !finished) { - case Protocol::Server::EndOfStream: - finished = true; - break; + Packet packet = connections->receivePacket(); - case Protocol::Server::Log: - /// Pass logs from remote server to client - if (auto log_queue = CurrentThread::getInternalTextLogsQueue()) - log_queue->pushBlock(std::move(packet.block)); - break; + switch (packet.type) + { + case Protocol::Server::EndOfStream: + finished = true; + break; - case Protocol::Server::Exception: - got_exception_from_replica = true; - packet.exception->rethrow(); - break; + case Protocol::Server::Exception: + got_exception_from_replica = true; + packet.exception->rethrow(); + break; - case Protocol::Server::ProfileEvents: - /// Pass profile events from remote server to client - if (auto profile_queue = CurrentThread::getInternalProfileEventsQueue()) - if (!profile_queue->emplace(std::move(packet.block))) - throw Exception(ErrorCodes::SYSTEM_ERROR, "Could not push into profile queue"); - break; + case Protocol::Server::Log: + /// Pass logs from remote server to client + if (auto log_queue = CurrentThread::getInternalTextLogsQueue()) + log_queue->pushBlock(std::move(packet.block)); + break; - case Protocol::Server::TimezoneUpdate: - break; + case Protocol::Server::ProfileEvents: + /// Pass profile events from remote server to client + if (auto profile_queue = CurrentThread::getInternalProfileEventsQueue()) + if (!profile_queue->emplace(std::move(packet.block))) + throw Exception(ErrorCodes::SYSTEM_ERROR, "Could not push into profile queue"); + break; - default: - got_unknown_packet_from_replica = true; - throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_SERVER, "Unknown packet {} from one of the following replicas: {}", - toString(packet.type), - connections->dumpAddresses()); + case Protocol::Server::ProfileInfo: + /// Use own (client-side) info about read bytes, it is more correct info than server-side one. + if (profile_info_callback) + profile_info_callback(packet.profile_info); + break; + + case Protocol::Server::Progress: + if (progress_callback) + progress_callback(packet.progress); + break; + + default: + break; + } } } diff --git a/src/QueryPipeline/RemoteQueryExecutor.h b/src/QueryPipeline/RemoteQueryExecutor.h index e5094b4705d..8d834eb3f81 100644 --- a/src/QueryPipeline/RemoteQueryExecutor.h +++ b/src/QueryPipeline/RemoteQueryExecutor.h @@ -285,7 +285,7 @@ private: void processReadTaskRequest(); void processMergeTreeReadTaskRequest(ParallelReadRequest request); - void processMergeTreeInitialReadAnnounecement(InitialAllRangesAnnouncement announcement); + void processMergeTreeInitialReadAnnouncement(InitialAllRangesAnnouncement announcement); /// Cancel query and restart it with info about duplicate UUIDs /// only for `allow_experimental_query_deduplication`. 
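Aside (editorial illustration, not part of the patch): the ISource change above replaces unsynchronized progress counters with a mutex-guarded structure plus an atomic flag, so that finalizeExecution() can flush pending progress from another thread without a data race. Below is a minimal, self-contained sketch of that locking pattern; the class and member names are illustrative placeholders, not the actual ClickHouse interfaces.

    #include <atomic>
    #include <cstddef>
    #include <mutex>
    #include <optional>

    /// Simplified stand-in for the read-progress counters.
    struct ProgressCounters
    {
        size_t read_rows = 0;
        size_t read_bytes = 0;
        size_t total_rows_approx = 0;
        size_t total_bytes = 0;
    };

    /// Sketch of a source whose progress is updated and drained from different threads.
    class ProgressTrackingSource
    {
        std::mutex mutex;                          /// guards every field of `counters`
        ProgressCounters counters;
        std::atomic_bool progress_was_set{false};  /// cheap flag readable without the lock

    public:
        void progress(size_t rows, size_t bytes)
        {
            progress_was_set = true;
            std::lock_guard<std::mutex> lock(mutex);
            counters.read_rows += rows;
            counters.read_bytes += bytes;
        }

        /// The adders take the lock instead of mutating the counters inline.
        void addTotalRowsApprox(size_t value)
        {
            std::lock_guard<std::mutex> lock(mutex);
            counters.total_rows_approx += value;
        }

        void addTotalBytes(size_t value)
        {
            std::lock_guard<std::mutex> lock(mutex);
            counters.total_bytes += value;
        }

        /// Returns the accumulated progress and resets it, or nothing if there is nothing to report.
        std::optional<ProgressCounters> getReadProgress()
        {
            std::lock_guard<std::mutex> lock(mutex);
            if (counters.read_rows == 0 && counters.read_bytes == 0
                && counters.total_rows_approx == 0 && counters.total_bytes == 0)
                return std::nullopt;

            ProgressCounters res = counters;
            counters = {};
            return res;
        }
    };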
diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 136f2dd9537..96c449b7e17 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -471,7 +471,7 @@ void TCPHandler::runImpl() if (state.cancellation_status == CancellationStatus::FULLY_CANCELLED) return; - sendMergeTreeAllRangesAnnounecementAssumeLocked(announcement); + sendMergeTreeAllRangesAnnouncementAssumeLocked(announcement); ProfileEvents::increment(ProfileEvents::MergeTreeAllRangesAnnouncementsSent); ProfileEvents::increment(ProfileEvents::MergeTreeAllRangesAnnouncementsSentElapsedMicroseconds, watch.elapsedMicroseconds()); }); @@ -1044,9 +1044,9 @@ void TCPHandler::sendReadTaskRequestAssumeLocked() } -void TCPHandler::sendMergeTreeAllRangesAnnounecementAssumeLocked(InitialAllRangesAnnouncement announcement) +void TCPHandler::sendMergeTreeAllRangesAnnouncementAssumeLocked(InitialAllRangesAnnouncement announcement) { - writeVarUInt(Protocol::Server::MergeTreeAllRangesAnnounecement, *out); + writeVarUInt(Protocol::Server::MergeTreeAllRangesAnnouncement, *out); announcement.serialize(*out); out->next(); } diff --git a/src/Server/TCPHandler.h b/src/Server/TCPHandler.h index 235f634afec..cfb17ce6ae6 100644 --- a/src/Server/TCPHandler.h +++ b/src/Server/TCPHandler.h @@ -264,7 +264,7 @@ private: void sendEndOfStream(); void sendPartUUIDs(); void sendReadTaskRequestAssumeLocked(); - void sendMergeTreeAllRangesAnnounecementAssumeLocked(InitialAllRangesAnnouncement announcement); + void sendMergeTreeAllRangesAnnouncementAssumeLocked(InitialAllRangesAnnouncement announcement); void sendMergeTreeReadTaskRequestAssumeLocked(ParallelReadRequest request); void sendProfileInfo(const ProfileInfo & info); void sendTotals(const Block & totals); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 12cafb62859..ea138d74ab2 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -4852,17 +4852,18 @@ void MergeTreeData::removePartContributionToColumnAndSecondaryIndexSizes(const D log_subtract(total_column_size.marks, part_column_size.marks, ".marks"); } - auto indexes_descriptions = getInMemoryMetadataPtr()->secondary_indices; - for (const auto & index : indexes_descriptions) + for (auto & [secondary_index_name, total_secondary_index_size] : secondary_index_sizes) { - IndexSize & total_secondary_index_size = secondary_index_sizes[index.name]; - IndexSize part_secondary_index_size = part->getSecondaryIndexSize(index.name); + if (!part->hasSecondaryIndex(secondary_index_name)) + continue; + + IndexSize part_secondary_index_size = part->getSecondaryIndexSize(secondary_index_name); auto log_subtract = [&](size_t & from, size_t value, const char * field) { if (value > from) LOG_ERROR(log, "Possibly incorrect index size subtraction: {} - {} = {}, index: {}, field: {}", - from, value, from - value, index.name, field); + from, value, from - value, secondary_index_name, field); from -= value; }; @@ -8604,7 +8605,7 @@ std::pair MergeTreeData::createE bool MergeTreeData::allowRemoveStaleMovingParts() const { - return ConfigHelper::getBool(getContext()->getConfigRef(), "allow_remove_stale_moving_parts"); + return ConfigHelper::getBool(getContext()->getConfigRef(), "allow_remove_stale_moving_parts", /* default_ = */ true); } CurrentlySubmergingEmergingTagger::~CurrentlySubmergingEmergingTagger() diff --git a/src/Storages/fuzzers/CMakeLists.txt b/src/Storages/fuzzers/CMakeLists.txt index 98f490c5984..719b9b77cd9 100644 --- 
a/src/Storages/fuzzers/CMakeLists.txt +++ b/src/Storages/fuzzers/CMakeLists.txt @@ -1,7 +1,7 @@ clickhouse_add_executable (mergetree_checksum_fuzzer mergetree_checksum_fuzzer.cpp) # Look at comment around fuzz_compression target declaration -target_link_libraries (mergetree_checksum_fuzzer PRIVATE dbms ${LIB_FUZZING_ENGINE}) +target_link_libraries (mergetree_checksum_fuzzer PRIVATE dbms) clickhouse_add_executable (columns_description_fuzzer columns_description_fuzzer.cpp) -target_link_libraries (columns_description_fuzzer PRIVATE dbms ${LIB_FUZZING_ENGINE}) +target_link_libraries (columns_description_fuzzer PRIVATE dbms) diff --git a/src/Storages/fuzzers/columns_description_fuzzer.cpp b/src/Storages/fuzzers/columns_description_fuzzer.cpp index 44fd667ff1c..b703a1e7051 100644 --- a/src/Storages/fuzzers/columns_description_fuzzer.cpp +++ b/src/Storages/fuzzers/columns_description_fuzzer.cpp @@ -2,14 +2,16 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) -try { - using namespace DB; - ColumnsDescription columns = ColumnsDescription::parse(std::string(reinterpret_cast(data), size)); - std::cerr << columns.toString() << "\n"; + try + { + using namespace DB; + ColumnsDescription columns = ColumnsDescription::parse(std::string(reinterpret_cast(data), size)); + std::cerr << columns.toString() << "\n"; + } + catch (...) + { + } + return 0; } -catch (...) -{ - return 1; -} diff --git a/src/Storages/fuzzers/mergetree_checksum_fuzzer.cpp b/src/Storages/fuzzers/mergetree_checksum_fuzzer.cpp index 0f2ce8a2e44..e046a73b1f9 100644 --- a/src/Storages/fuzzers/mergetree_checksum_fuzzer.cpp +++ b/src/Storages/fuzzers/mergetree_checksum_fuzzer.cpp @@ -5,19 +5,20 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) -try { - DB::ReadBufferFromMemory in(data, size); - DB::MergeTreeDataPartChecksums res; - DB::WriteBufferFromFileDescriptor out(STDOUT_FILENO); + try + { + DB::ReadBufferFromMemory in(data, size); + DB::MergeTreeDataPartChecksums res; + DB::WriteBufferFromFileDescriptor out(STDOUT_FILENO); - if (!res.read(in)) - return 1; - res.write(out); + if (!res.read(in)) + return 0; + res.write(out); + } + catch (...) + { + } return 0; } -catch (...) 
-{ - return 1; -} diff --git a/tests/ci/build_check.py b/tests/ci/build_check.py index bd7c94a85ba..b840d28a541 100644 --- a/tests/ci/build_check.py +++ b/tests/ci/build_check.py @@ -96,8 +96,17 @@ def get_packager_cmd( return cmd +def _expect_artifacts(build_config: BuildConfig) -> bool: + if build_config.package_type == "fuzzers": + return False + return True + + def build_clickhouse( - packager_cmd: str, logs_path: Path, build_output_path: Path + packager_cmd: str, + logs_path: Path, + build_output_path: Path, + expect_artifacts: bool = True, ) -> Tuple[Path, bool]: build_log_path = logs_path / BUILD_LOG_NAME success = False @@ -109,7 +118,7 @@ def build_clickhouse( build_results = [] if retcode == 0: - if len(build_results) > 0: + if not expect_artifacts or len(build_results) > 0: success = True logging.info("Built successfully") else: @@ -312,7 +321,9 @@ def main(): os.makedirs(logs_path, exist_ok=True) start = time.time() - log_path, success = build_clickhouse(packager_cmd, logs_path, build_output_path) + log_path, success = build_clickhouse( + packager_cmd, logs_path, build_output_path, _expect_artifacts(build_config) + ) elapsed = int(time.time() - start) subprocess.check_call( f"sudo chown -R ubuntu:ubuntu {build_output_path}", shell=True diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index a2f3a3ab401..b9ccc23cb2e 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -9,7 +9,7 @@ from typing import Callable, Dict, List, Literal @dataclass class BuildConfig: compiler: str - package_type: Literal["deb", "binary"] + package_type: Literal["deb", "binary", "fuzzers"] additional_pkgs: bool = False debug_build: bool = False sanitizer: str = "" @@ -182,6 +182,10 @@ CI_CONFIG = CiConfig( package_type="binary", static_binary_name="s390x", ), + "fuzzers": BuildConfig( + compiler="clang-16", + package_type="fuzzers", + ), }, builds_report_config={ "ClickHouse build check": [ @@ -193,6 +197,7 @@ CI_CONFIG = CiConfig( "package_msan", "package_debug", "binary_release", + "fuzzers", ], "ClickHouse special build check": [ "binary_tidy", diff --git a/utils/ci-slack-bot/ci-slack-bot.py b/tests/ci/slack_bot_ci_lambda/app.py similarity index 88% rename from utils/ci-slack-bot/ci-slack-bot.py rename to tests/ci/slack_bot_ci_lambda/app.py index ea883e3cda3..45e14138335 100755 --- a/utils/ci-slack-bot/ci-slack-bot.py +++ b/tests/ci/slack_bot_ci_lambda/app.py @@ -1,28 +1,28 @@ #!/usr/bin/env python3 -# A trivial stateless slack bot that notifies about new broken tests in ClickHouse CI. -# It checks what happened to our CI during the last check_period hours (1 hour) and notifies us in slack if necessary. -# This script should be executed once each check_period hours (1 hour). -# It will post duplicate messages if you run it more often; it will lose some messages if you run it less often. -# -# You can run it locally with no arguments, it will work in a dry-run mode. Or you can set your own SLACK_URL_DEFAULT. -# Feel free to add more checks, more details to messages, or better heuristics. -# NOTE There's no deployment automation for now, -# an AWS Lambda (slack-ci-bot-test lambda in CI-CD) has to be updated manually after changing this script. -# -# See also: https://aretestsgreenyet.com/ +""" +A trivial stateless slack bot that notifies about new broken tests in ClickHouse CI. +It checks what happened to our CI during the last check_period hours (1 hour) and + notifies us in slack if necessary. +This script should be executed once each check_period hours (1 hour). 
+It will post duplicate messages if you run it more often; it will lose some messages + if you run it less often. + +You can run it locally with no arguments, it will work in a dry-run mode. + Or you can set your own SLACK_URL_DEFAULT. +Feel free to add more checks, more details to messages, or better heuristics. + +It's deployed to slack-bot-ci-lambda in CI/CD account + +See also: https://aretestsgreenyet.com/ +""" import os import json import base64 import random -if os.environ.get("AWS_LAMBDA_ENV", "0") == "1": - # For AWS labmda (python 3.7) - from botocore.vendored import requests -else: - # For running locally - import requests +import requests # type: ignore DRY_RUN_MARK = "" @@ -34,7 +34,8 @@ REPORT_NO_FAILURES_PROBABILITY = 0.99 MAX_TESTS_TO_REPORT = 4 -# Slack has a stupid limitation on message size, it splits long messages into multiple ones breaking formatting +# Slack has a stupid limitation on message size, it splits long messages into multiple, +# ones breaking formatting MESSAGE_LENGTH_LIMIT = 4000 # Find tests that failed in master during the last check_period * 24 hours, @@ -61,7 +62,7 @@ WHERE 1 AND test_name NOT IN ( SELECT test_name FROM checks WHERE 1 AND check_start_time >= now - INTERVAL 1 MONTH - AND (check_start_time + check_duration_ms / 1000) BETWEEN now - INTERVAL 2 WEEK AND now - INTERVAL extended_check_period HOUR + AND (check_start_time + check_duration_ms / 1000) BETWEEN now - INTERVAL 2 WEEK AND now - INTERVAL extended_check_period HOUR AND pull_request_number = 0 AND check_status != 'success' AND test_status LIKE 'F%') @@ -95,11 +96,11 @@ FAILED_CHECKS_PERCENTAGE_QUERY = """ SELECT if(toHour(now('Europe/Amsterdam')) = 12, v, 0) FROM ( - SELECT - countDistinctIf((commit_sha, check_name), (test_status LIKE 'F%') AND (check_status != 'success')) + SELECT + countDistinctIf((commit_sha, check_name), (test_status LIKE 'F%') AND (check_status != 'success')) / countDistinct((commit_sha, check_name)) AS v FROM checks - WHERE 1 + WHERE 1 AND (pull_request_number = 0) AND (test_status != 'SKIPPED') AND (check_start_time > (now() - toIntervalDay(1))) @@ -111,7 +112,7 @@ ALL_RECENT_FAILURES_QUERY = """ WITH '{}' AS name_substr, 90 AS interval_days, - ('Stateless tests (asan)', 'Stateless tests (address)', 'Stateless tests (address, actions)') AS backport_and_release_specific_checks + ('Stateless tests (asan)', 'Stateless tests (address)', 'Stateless tests (address, actions)', 'Integration tests (asan) [1/3]', 'Stateless tests (tsan) [1/3]') AS backport_and_release_specific_checks SELECT toStartOfDay(check_start_time) AS d, count(), @@ -315,14 +316,14 @@ def check_and_alert(): ) -def lambda_handler(event, context): +def handler(event, context): try: check_and_alert() return {"statusCode": 200, "body": "OK"} except Exception as e: send_to_slack( - "I failed, please help me (see ClickHouse/utils/ci-slack-bot/ci-slack-bot.py): " - + str(e) + "I failed, please help me " + f"(see ClickHouse/ClickHouse/tests/ci/slack_bot_ci_lambda/app.py): {e}" ) return {"statusCode": 200, "body": "FAIL"} diff --git a/tests/ci/slack_bot_ci_lambda/build_and_deploy_archive.sh b/tests/ci/slack_bot_ci_lambda/build_and_deploy_archive.sh new file mode 120000 index 00000000000..96ba3fa024e --- /dev/null +++ b/tests/ci/slack_bot_ci_lambda/build_and_deploy_archive.sh @@ -0,0 +1 @@ +../team_keys_lambda/build_and_deploy_archive.sh \ No newline at end of file diff --git a/tests/ci/slack_bot_ci_lambda/requirements.txt b/tests/ci/slack_bot_ci_lambda/requirements.txt new file mode 100644 index 
00000000000..098e04a9798 --- /dev/null +++ b/tests/ci/slack_bot_ci_lambda/requirements.txt @@ -0,0 +1 @@ +../lambda_shared_package diff --git a/tests/ci/team_keys_lambda/build_and_deploy_archive.sh b/tests/ci/team_keys_lambda/build_and_deploy_archive.sh index 02d5638cf18..a55c1bb2b3b 100644 --- a/tests/ci/team_keys_lambda/build_and_deploy_archive.sh +++ b/tests/ci/team_keys_lambda/build_and_deploy_archive.sh @@ -23,7 +23,7 @@ cp app.py "$PACKAGE" if [ -f requirements.txt ]; then VENV=lambda-venv rm -rf "$VENV" lambda-package.zip - docker run --rm --user="${UID}" -e HOME=/tmp --entrypoint=/bin/bash \ + docker run --net=host --rm --user="${UID}" -e HOME=/tmp --entrypoint=/bin/bash \ --volume="${WORKDIR}/..:/ci" --workdir="/ci/${DIR_NAME}" "${DOCKER_IMAGE}" \ -exc " '$PY_EXEC' -m venv '$VENV' && diff --git a/tests/integration/test_disks_app_func/test.py b/tests/integration/test_disks_app_func/test.py index 2428c53854e..97d5da787cd 100644 --- a/tests/integration/test_disks_app_func/test.py +++ b/tests/integration/test_disks_app_func/test.py @@ -114,9 +114,9 @@ def test_disks_app_func_cp(started_cluster): "/usr/bin/clickhouse", "disks", "copy", - "--diskFrom", + "--disk-from", "test1", - "--diskTo", + "--disk-to", "test2", ".", ".", diff --git a/tests/performance/uniq_to_count.xml b/tests/performance/uniq_to_count.xml new file mode 100644 index 00000000000..64e4cf1cc0d --- /dev/null +++ b/tests/performance/uniq_to_count.xml @@ -0,0 +1,8 @@ + + select uniq(number) from (select DISTINCT number from numbers(1000000)) + select uniq(number) from (select number from numbers(1000000) group by number) + + + select uniq(number) from (select DISTINCT number from numbers(1000000)) SETTINGS allow_experimental_analyzer=1 + select uniq(number) from (select number from numbers(1000000) group by number) SETTINGS allow_experimental_analyzer=1 + diff --git a/tests/queries/0_stateless/02028_system_data_skipping_indices_size.reference b/tests/queries/0_stateless/02028_system_data_skipping_indices_size.reference index e455643c01e..456f9d113be 100644 --- a/tests/queries/0_stateless/02028_system_data_skipping_indices_size.reference +++ b/tests/queries/0_stateless/02028_system_data_skipping_indices_size.reference @@ -1 +1,2 @@ default test_table value_index minmax minmax value 1 38 12 24 +default test_table value_index minmax minmax value 1 38 12 24 diff --git a/tests/queries/0_stateless/02028_system_data_skipping_indices_size.sql b/tests/queries/0_stateless/02028_system_data_skipping_indices_size.sql index 1efb9cff6a4..07237c43bea 100644 --- a/tests/queries/0_stateless/02028_system_data_skipping_indices_size.sql +++ b/tests/queries/0_stateless/02028_system_data_skipping_indices_size.sql @@ -12,4 +12,10 @@ ORDER BY key SETTINGS compress_marks=false; INSERT INTO test_table VALUES (0, 'Value'); SELECT * FROM system.data_skipping_indices WHERE database = currentDatabase(); +ALTER TABLE test_table DROP INDEX value_index; +ALTER TABLE test_table ADD INDEX value_index value TYPE minmax GRANULARITY 1; +ALTER TABLE test_table MATERIALIZE INDEX value_index SETTINGS mutations_sync=1; + +SELECT * FROM system.data_skipping_indices WHERE database = currentDatabase(); + DROP TABLE test_table; diff --git a/tests/queries/0_stateless/02841_parallel_replicas_summary.reference b/tests/queries/0_stateless/02841_parallel_replicas_summary.reference new file mode 100644 index 00000000000..21f0b24c10d --- /dev/null +++ b/tests/queries/0_stateless/02841_parallel_replicas_summary.reference @@ -0,0 +1,4 @@ +1 +1 
+02841_summary_default_interactive_0 2 +02841_summary_default_interactive_high 2 diff --git a/tests/queries/0_stateless/02841_parallel_replicas_summary.sh b/tests/queries/0_stateless/02841_parallel_replicas_summary.sh new file mode 100755 index 00000000000..56dd26726ef --- /dev/null +++ b/tests/queries/0_stateless/02841_parallel_replicas_summary.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +function involved_parallel_replicas () { + # Not using current_database = '$CLICKHOUSE_DATABASE' as nested parallel queries aren't run with it + $CLICKHOUSE_CLIENT --query " + SELECT + initial_query_id, + (count() - 2) / 2 as number_of_parallel_replicas + FROM system.query_log + WHERE event_date >= yesterday() + AND initial_query_id LIKE '$1%' + GROUP BY initial_query_id + ORDER BY min(event_time_microseconds) ASC + FORMAT TSV" +} + +$CLICKHOUSE_CLIENT --query "CREATE TABLE replicas_summary (n Int64) ENGINE = MergeTree() ORDER BY n AS Select * from numbers(100_000)" + +# Note that we are not verifying the exact read rows and bytes (apart from not being 0) for 2 reasons: +# - Different block sizes lead to different read rows +# - Depending on how fast the replicas are they might need data that ends up being discarded because the coordinator +# already has enough (but it has been read in parallel, so it's reported). + +query_id_base="02841_summary_$CLICKHOUSE_DATABASE" + +echo " + SELECT * + FROM replicas_summary + LIMIT 100 + SETTINGS + max_parallel_replicas = 2, + cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', + allow_experimental_parallel_reading_from_replicas = 1, + parallel_replicas_for_non_replicated_merge_tree = 1, + use_hedged_requests = 0, + interactive_delay=0 + "\ + | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&wait_end_of_query=1&query_id=${query_id_base}_interactive_0" --data-binary @- -vvv 2>&1 \ + | grep "Summary" | grep -cv '"read_rows":"0"' + +echo " + SELECT * + FROM replicas_summary + LIMIT 100 + SETTINGS + max_parallel_replicas = 2, + cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', + allow_experimental_parallel_reading_from_replicas = 1, + parallel_replicas_for_non_replicated_merge_tree = 1, + use_hedged_requests = 0, + interactive_delay=99999999999 + "\ + | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&wait_end_of_query=1&query_id=${query_id_base}_interactive_high" --data-binary @- -vvv 2>&1 \ + | grep "Summary" | grep -cv '"read_rows":"0"' + +$CLICKHOUSE_CLIENT --query "SYSTEM FLUSH LOGS" +involved_parallel_replicas "${query_id_base}" diff --git a/tests/queries/0_stateless/02842_vertical_merge_after_add_drop_column.reference b/tests/queries/0_stateless/02842_vertical_merge_after_add_drop_column.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02842_vertical_merge_after_add_drop_column.sql b/tests/queries/0_stateless/02842_vertical_merge_after_add_drop_column.sql new file mode 100644 index 00000000000..0a06eb05431 --- /dev/null +++ b/tests/queries/0_stateless/02842_vertical_merge_after_add_drop_column.sql @@ -0,0 +1,25 @@ +-- In some versions vertical merges after DROP COLUMN was broken in some cases + +drop table if exists data; + +create table data ( + key Int, + `legacy_features_Map.id` Array(UInt8), + `legacy_features_Map.count` Array(UInt32), +) engine=MergeTree() +order by key +settings + min_bytes_for_wide_part=0, + 
min_rows_for_wide_part=0, + vertical_merge_algorithm_min_rows_to_activate=0, + vertical_merge_algorithm_min_columns_to_activate=0; + +insert into data (key) values (1); +insert into data (key) values (2); + +alter table data add column `features_legacy_Map.id` Array(UInt8), add column `features_legacy_Map.count` Array(UInt32); + +alter table data drop column legacy_features_Map settings mutations_sync=2; + +optimize table data final; +DROP TABLE data; diff --git a/tests/queries/0_stateless/02868_distinct_to_count_optimization.reference b/tests/queries/0_stateless/02868_distinct_to_count_optimization.reference new file mode 100644 index 00000000000..b2b15f92199 --- /dev/null +++ b/tests/queries/0_stateless/02868_distinct_to_count_optimization.reference @@ -0,0 +1,252 @@ +1. test simple distinct +3 +SELECT count() +FROM +( + SELECT DISTINCT a + FROM test_rewrite_uniq_to_count +) +SETTINGS allow_experimental_analyzer = 0 +3 +QUERY id: 0 + PROJECTION COLUMNS + uniq(a) UInt64 + PROJECTION + LIST id: 1, nodes: 1 + FUNCTION id: 2, function_name: count, function_type: aggregate, result_type: UInt64 + JOIN TREE + QUERY id: 3, is_subquery: 1, is_distinct: 1 + PROJECTION COLUMNS + a UInt8 + PROJECTION + LIST id: 4, nodes: 1 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + JOIN TREE + TABLE id: 6, table_name: default.test_rewrite_uniq_to_count + SETTINGS allow_experimental_analyzer=1 +2. test distinct with subquery alias +3 +SELECT count() +FROM +( + SELECT DISTINCT a + FROM test_rewrite_uniq_to_count +) AS t +SETTINGS allow_experimental_analyzer = 0 +3 +QUERY id: 0 + PROJECTION COLUMNS + uniq(a) UInt64 + PROJECTION + LIST id: 1, nodes: 1 + FUNCTION id: 2, function_name: count, function_type: aggregate, result_type: UInt64 + JOIN TREE + QUERY id: 3, alias: t, is_subquery: 1, is_distinct: 1 + PROJECTION COLUMNS + a UInt8 + PROJECTION + LIST id: 4, nodes: 1 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + JOIN TREE + TABLE id: 6, table_name: default.test_rewrite_uniq_to_count + SETTINGS allow_experimental_analyzer=1 +3. test distinct with compound column name +3 +SELECT count() +FROM +( + SELECT DISTINCT a + FROM test_rewrite_uniq_to_count +) AS t +SETTINGS allow_experimental_analyzer = 0 +3 +QUERY id: 0 + PROJECTION COLUMNS + uniq(a) UInt64 + PROJECTION + LIST id: 1, nodes: 1 + FUNCTION id: 2, function_name: count, function_type: aggregate, result_type: UInt64 + JOIN TREE + QUERY id: 3, alias: t, is_subquery: 1, is_distinct: 1 + PROJECTION COLUMNS + a UInt8 + PROJECTION + LIST id: 4, nodes: 1 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + JOIN TREE + TABLE id: 6, table_name: default.test_rewrite_uniq_to_count + SETTINGS allow_experimental_analyzer=1 +4. test distinct with select expression alias +3 +SELECT count() +FROM +( + SELECT DISTINCT a AS alias_of_a + FROM test_rewrite_uniq_to_count +) AS t +SETTINGS allow_experimental_analyzer = 0 +3 +QUERY id: 0 + PROJECTION COLUMNS + uniq(alias_of_a) UInt64 + PROJECTION + LIST id: 1, nodes: 1 + FUNCTION id: 2, function_name: count, function_type: aggregate, result_type: UInt64 + JOIN TREE + QUERY id: 3, alias: t, is_subquery: 1, is_distinct: 1 + PROJECTION COLUMNS + alias_of_a UInt8 + PROJECTION + LIST id: 4, nodes: 1 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + JOIN TREE + TABLE id: 6, table_name: default.test_rewrite_uniq_to_count + SETTINGS allow_experimental_analyzer=1 +5. 
test simple group by +3 +SELECT count() +FROM +( + SELECT + a, + sum(b) + FROM test_rewrite_uniq_to_count + GROUP BY a +) +SETTINGS allow_experimental_analyzer = 0 +3 +QUERY id: 0 + PROJECTION COLUMNS + uniq(a) UInt64 + PROJECTION + LIST id: 1, nodes: 1 + FUNCTION id: 2, function_name: count, function_type: aggregate, result_type: UInt64 + JOIN TREE + QUERY id: 3, is_subquery: 1 + PROJECTION COLUMNS + a UInt8 + sum(b) UInt64 + PROJECTION + LIST id: 4, nodes: 2 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + FUNCTION id: 7, function_name: sum, function_type: aggregate, result_type: UInt64 + ARGUMENTS + LIST id: 8, nodes: 1 + COLUMN id: 9, column_name: b, result_type: UInt8, source_id: 6 + JOIN TREE + TABLE id: 6, table_name: default.test_rewrite_uniq_to_count + GROUP BY + LIST id: 10, nodes: 1 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + SETTINGS allow_experimental_analyzer=1 +6. test group by with subquery alias +3 +SELECT count() +FROM +( + SELECT + a, + sum(b) + FROM test_rewrite_uniq_to_count + GROUP BY a +) AS t +SETTINGS allow_experimental_analyzer = 0 +3 +QUERY id: 0 + PROJECTION COLUMNS + uniq(a) UInt64 + PROJECTION + LIST id: 1, nodes: 1 + FUNCTION id: 2, function_name: count, function_type: aggregate, result_type: UInt64 + JOIN TREE + QUERY id: 3, alias: t, is_subquery: 1 + PROJECTION COLUMNS + a UInt8 + sum(b) UInt64 + PROJECTION + LIST id: 4, nodes: 2 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + FUNCTION id: 7, function_name: sum, function_type: aggregate, result_type: UInt64 + ARGUMENTS + LIST id: 8, nodes: 1 + COLUMN id: 9, column_name: b, result_type: UInt8, source_id: 6 + JOIN TREE + TABLE id: 6, table_name: default.test_rewrite_uniq_to_count + GROUP BY + LIST id: 10, nodes: 1 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + SETTINGS allow_experimental_analyzer=1 +7. test group by with compound column name +3 +SELECT count() +FROM +( + SELECT + a AS alias_of_a, + sum(b) + FROM test_rewrite_uniq_to_count + GROUP BY a +) AS t +SETTINGS allow_experimental_analyzer = 0 +3 +QUERY id: 0 + PROJECTION COLUMNS + uniq(alias_of_a) UInt64 + PROJECTION + LIST id: 1, nodes: 1 + FUNCTION id: 2, function_name: count, function_type: aggregate, result_type: UInt64 + JOIN TREE + QUERY id: 3, alias: t, is_subquery: 1 + PROJECTION COLUMNS + alias_of_a UInt8 + sum(b) UInt64 + PROJECTION + LIST id: 4, nodes: 2 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + FUNCTION id: 7, function_name: sum, function_type: aggregate, result_type: UInt64 + ARGUMENTS + LIST id: 8, nodes: 1 + COLUMN id: 9, column_name: b, result_type: UInt8, source_id: 6 + JOIN TREE + TABLE id: 6, table_name: default.test_rewrite_uniq_to_count + GROUP BY + LIST id: 10, nodes: 1 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + SETTINGS allow_experimental_analyzer=1 +8. 
test group by with select expression alias +3 +SELECT count() +FROM +( + SELECT + a AS alias_of_a, + sum(b) + FROM test_rewrite_uniq_to_count + GROUP BY alias_of_a +) AS t +SETTINGS allow_experimental_analyzer = 0 +3 +QUERY id: 0 + PROJECTION COLUMNS + uniq(alias_of_a) UInt64 + PROJECTION + LIST id: 1, nodes: 1 + FUNCTION id: 2, function_name: count, function_type: aggregate, result_type: UInt64 + JOIN TREE + QUERY id: 3, alias: t, is_subquery: 1 + PROJECTION COLUMNS + alias_of_a UInt8 + sum(b) UInt64 + PROJECTION + LIST id: 4, nodes: 2 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + FUNCTION id: 7, function_name: sum, function_type: aggregate, result_type: UInt64 + ARGUMENTS + LIST id: 8, nodes: 1 + COLUMN id: 9, column_name: b, result_type: UInt8, source_id: 6 + JOIN TREE + TABLE id: 6, table_name: default.test_rewrite_uniq_to_count + GROUP BY + LIST id: 10, nodes: 1 + COLUMN id: 5, column_name: a, result_type: UInt8, source_id: 6 + SETTINGS allow_experimental_analyzer=1 diff --git a/tests/queries/0_stateless/02868_distinct_to_count_optimization.sql b/tests/queries/0_stateless/02868_distinct_to_count_optimization.sql new file mode 100644 index 00000000000..2bf249e08f3 --- /dev/null +++ b/tests/queries/0_stateless/02868_distinct_to_count_optimization.sql @@ -0,0 +1,69 @@ +drop table if exists test_rewrite_uniq_to_count; + +CREATE TABLE test_rewrite_uniq_to_count +( + `a` UInt8, + `b` UInt8, + `c` UInt8 +) ENGINE = MergeTree ORDER BY `a`; + + +INSERT INTO test_rewrite_uniq_to_count values ('1', '1', '1'), ('1', '1', '1'); +INSERT INTO test_rewrite_uniq_to_count values ('2', '2', '2'), ('2', '2', '2'); +INSERT INTO test_rewrite_uniq_to_count values ('3', '3', '3'), ('3', '3', '3'); + +set optimize_uniq_to_count=true; + + +SELECT '1. test simple distinct'; +SELECT uniq(a) FROM (SELECT DISTINCT a FROM test_rewrite_uniq_to_count) settings allow_experimental_analyzer=0; +EXPLAIN SYNTAX SELECT uniq(a) FROM (SELECT DISTINCT a FROM test_rewrite_uniq_to_count) settings allow_experimental_analyzer=0; +SELECT uniq(a) FROM (SELECT DISTINCT a FROM test_rewrite_uniq_to_count) settings allow_experimental_analyzer=1; +EXPLAIN QUERY TREE SELECT uniq(a) FROM (SELECT DISTINCT a FROM test_rewrite_uniq_to_count) settings allow_experimental_analyzer=1; + + +SELECT '2. test distinct with subquery alias'; +SELECT uniq(t.a) FROM (SELECT DISTINCT a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=0; +EXPLAIN SYNTAX SELECT uniq(a) FROM (SELECT DISTINCT a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=0; +SELECT uniq(t.a) FROM (SELECT DISTINCT a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=1; +EXPLAIN QUERY TREE SELECT uniq(t.a) FROM (SELECT DISTINCT a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=1; + +SELECT '3. test distinct with compound column name'; +SELECT uniq(a) FROM (SELECT DISTINCT test_rewrite_uniq_to_count.a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=0; +EXPLAIN SYNTAX SELECT uniq(a) FROM (SELECT DISTINCT test_rewrite_uniq_to_count.a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=0; +SELECT uniq(a) FROM (SELECT DISTINCT test_rewrite_uniq_to_count.a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=1; +EXPLAIN QUERY TREE SELECT uniq(a) FROM (SELECT DISTINCT test_rewrite_uniq_to_count.a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=1; + +SELECT '4. 
test distinct with select expression alias'; +SELECT uniq(alias_of_a) FROM (SELECT DISTINCT a as alias_of_a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=0; +EXPLAIN SYNTAX SELECT uniq(alias_of_a) FROM (SELECT DISTINCT a as alias_of_a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=0; +SELECT uniq(alias_of_a) FROM (SELECT DISTINCT a as alias_of_a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=1; +EXPLAIN QUERY TREE SELECT uniq(alias_of_a) FROM (SELECT DISTINCT a as alias_of_a FROM test_rewrite_uniq_to_count) t settings allow_experimental_analyzer=1; + + +SELECT '5. test simple group by'; +SELECT uniq(a) FROM (SELECT a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) settings allow_experimental_analyzer=0; +EXPLAIN SYNTAX SELECT uniq(a) FROM (SELECT a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) settings allow_experimental_analyzer=0; +SELECT uniq(a) FROM (SELECT a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) settings allow_experimental_analyzer=1; +EXPLAIN QUERY TREE SELECT uniq(a) FROM (SELECT a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) settings allow_experimental_analyzer=1; + +SELECT '6. test group by with subquery alias'; +SELECT uniq(t.a) FROM (SELECT a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) t settings allow_experimental_analyzer=0; +EXPLAIN SYNTAX SELECT uniq(t.a) FROM (SELECT a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) t settings allow_experimental_analyzer=0; +SELECT uniq(t.a) FROM (SELECT a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) t settings allow_experimental_analyzer=1; +EXPLAIN QUERY TREE SELECT uniq(t.a) FROM (SELECT a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) t settings allow_experimental_analyzer=1; + +SELECT '7. test group by with compound column name'; +SELECT uniq(t.alias_of_a) FROM (SELECT a as alias_of_a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) t settings allow_experimental_analyzer=0; +EXPLAIN SYNTAX SELECT uniq(t.alias_of_a) FROM (SELECT a as alias_of_a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) t settings allow_experimental_analyzer=0; +SELECT uniq(t.alias_of_a) FROM (SELECT a as alias_of_a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) t settings allow_experimental_analyzer=1; +EXPLAIN QUERY TREE SELECT uniq(t.alias_of_a) FROM (SELECT a as alias_of_a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY a) t settings allow_experimental_analyzer=1; + +SELECT '8. 
test group by with select expression alias'; +SELECT uniq(t.alias_of_a) FROM (SELECT a as alias_of_a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY alias_of_a) t settings allow_experimental_analyzer=0; +EXPLAIN SYNTAX SELECT uniq(t.alias_of_a) FROM (SELECT a as alias_of_a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY alias_of_a) t settings allow_experimental_analyzer=0; +SELECT uniq(t.alias_of_a) FROM (SELECT a as alias_of_a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY alias_of_a) t settings allow_experimental_analyzer=1; +EXPLAIN QUERY TREE SELECT uniq(t.alias_of_a) FROM (SELECT a as alias_of_a, sum(b) FROM test_rewrite_uniq_to_count GROUP BY alias_of_a) t settings allow_experimental_analyzer=1; + +drop table if exists test_rewrite_uniq_to_count; + diff --git a/tests/queries/0_stateless/02871_clickhouse_client_restart_pager.reference b/tests/queries/0_stateless/02871_clickhouse_client_restart_pager.reference new file mode 100644 index 00000000000..7b36cc96f5e --- /dev/null +++ b/tests/queries/0_stateless/02871_clickhouse_client_restart_pager.reference @@ -0,0 +1,20 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 diff --git a/tests/queries/0_stateless/02871_clickhouse_client_restart_pager.sh b/tests/queries/0_stateless/02871_clickhouse_client_restart_pager.sh new file mode 100755 index 00000000000..cc4ce9b122e --- /dev/null +++ b/tests/queries/0_stateless/02871_clickhouse_client_restart_pager.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +# head by default prints 10 rows, but querying 11 rows is not enough, since +# we need to overflow the default pipe size, hence 1 million rows (it +# should be around 6 MiB in text representation, definitely enough). +$CLICKHOUSE_CLIENT --ignore-error -nm --pager head -q " + select * from numbers(1e6); -- { clientError CANNOT_WRITE_TO_FILE_DESCRIPTOR } + select * from numbers(1e6); -- { clientError CANNOT_WRITE_TO_FILE_DESCRIPTOR } +" + +exit 0 diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 2c29fd9369e..2e45b885afd 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -2582,3 +2582,4 @@ znode znodes zookeeperSessionUptime zstd +hardlink diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 868eb7d89c4..084075dee79 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,3 +1,4 @@ +v23.8.2.7-lts 2023-09-04 v23.8.1.2992-lts 2023-09-01 v23.7.5.30-stable 2023-08-28 v23.7.4.5-stable 2023-08-08 @@ -18,6 +19,7 @@ v23.4.4.16-stable 2023-06-17 v23.4.3.48-stable 2023-06-12 v23.4.2.11-stable 2023-05-02 v23.4.1.1943-stable 2023-04-27 +v23.3.12.11-lts 2023-09-04 v23.3.11.5-lts 2023-08-28 v23.3.10.5-lts 2023-08-23 v23.3.9.55-lts 2023-08-21