diff --git a/.github/workflows/docs_check.yml b/.github/workflows/docs_check.yml index 2b02e7c23ae..0c657a245cb 100644 --- a/.github/workflows/docs_check.yml +++ b/.github/workflows/docs_check.yml @@ -13,9 +13,9 @@ on: # yamllint disable-line rule:truthy branches: - master paths: + - 'docker/docs/**' - 'docs/**' - 'website/**' - - 'docker/docs/**' jobs: CheckLabels: runs-on: [self-hosted, style-checker] diff --git a/.github/workflows/docs_release.yml b/.github/workflows/docs_release.yml index b697fb78738..aed691844da 100644 --- a/.github/workflows/docs_release.yml +++ b/.github/workflows/docs_release.yml @@ -7,16 +7,17 @@ env: concurrency: group: master-release cancel-in-progress: true -on: # yamllint disable-line rule:truthy +'on': push: branches: - master paths: - - 'docs/**' - - 'website/**' - - 'benchmark/**' - - 'docker/**' - '.github/**' + - 'benchmark/**' + - 'docker/docs/release/**' + - 'docs/**' + - 'utils/list-versions/version_date.tsv' + - 'website/**' workflow_dispatch: jobs: DockerHubPushAarch64: diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 6482ddebe06..01490dff59e 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -13,6 +13,7 @@ on: # yamllint disable-line rule:truthy branches: - master paths-ignore: + - 'docker/docs/**' - 'docs/**' - 'website/**' ########################################################################################## diff --git a/.gitmodules b/.gitmodules index 8b30973951f..55fd684fddb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -265,10 +265,6 @@ [submodule "contrib/wyhash"] path = contrib/wyhash url = https://github.com/wangyi-fudan/wyhash.git -[submodule "contrib/eigen"] - path = contrib/eigen - url = https://github.com/eigen-mirror/eigen [submodule "contrib/hashidsxx"] path = contrib/hashidsxx url = https://github.com/schoentoon/hashidsxx.git - diff --git a/base/base/defines.h b/base/base/defines.h index bd98e99f5b9..084e710abf6 100644 --- a/base/base/defines.h +++ b/base/base/defines.h @@ -105,6 +105,25 @@ # define ASAN_POISON_MEMORY_REGION(a, b) #endif +#if !defined(ABORT_ON_LOGICAL_ERROR) + #if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) || defined(THREAD_SANITIZER) || defined(MEMORY_SANITIZER) || defined(UNDEFINED_BEHAVIOR_SANITIZER) + #define ABORT_ON_LOGICAL_ERROR + #endif +#endif + +/// chassert(x) is similar to assert(x), but: +/// - works in builds with sanitizers, not only in debug builds +/// - tries to print failed assertion into server log +/// It can be used for all assertions except heavy ones. +/// Heavy assertions (that run loops or call complex functions) are allowed in debug builds only. +#if !defined(chassert) + #if defined(ABORT_ON_LOGICAL_ERROR) + #define chassert(x) static_cast(x) ? void(0) : abortOnFailedAssertion(#x) + #else + #define chassert(x) ((void)0) + #endif +#endif + /// A template function for suppressing warnings about unused variables or function results. template constexpr void UNUSED(Args &&... args [[maybe_unused]]) diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index a7f1a908474..943e0e0ebc1 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -153,7 +153,6 @@ endif() add_contrib (sqlite-cmake sqlite-amalgamation) add_contrib (s2geometry-cmake s2geometry) -add_contrib (eigen-cmake eigen) # Put all targets defined here and in subdirectories under "contrib/" folders in GUI-based IDEs. 
# Some of third-party projects may override CMAKE_FOLDER or FOLDER property of their targets, so they would not appear diff --git a/contrib/eigen b/contrib/eigen deleted file mode 160000 index 3147391d946..00000000000 --- a/contrib/eigen +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 3147391d946bb4b6c68edd901f2add6ac1f31f8c diff --git a/contrib/eigen-cmake/CMakeLists.txt b/contrib/eigen-cmake/CMakeLists.txt deleted file mode 100644 index a37d341109c..00000000000 --- a/contrib/eigen-cmake/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -set(EIGEN_LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/eigen") - -add_library (_eigen INTERFACE) - -# Only include MPL2 code from Eigen library -target_compile_definitions(_eigen INTERFACE EIGEN_MPL2_ONLY) - -# Clang by default mimics gcc 4.2.1 compatibility but Eigen checks __GNUC__ version to enable -# a workaround for bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 fixed in 6.3 -# So we fake gcc > 6.3 when building with clang -if (COMPILER_CLANG AND ARCH_PPC64LE) - target_compile_options(_eigen INTERFACE -fgnuc-version=6.4) -endif() - -target_include_directories (_eigen SYSTEM INTERFACE ${EIGEN_LIBRARY_DIR}) -add_library(ch_contrib::eigen ALIAS _eigen) diff --git a/contrib/jemalloc-cmake/CMakeLists.txt b/contrib/jemalloc-cmake/CMakeLists.txt index 711a24369de..c59b4da890b 100644 --- a/contrib/jemalloc-cmake/CMakeLists.txt +++ b/contrib/jemalloc-cmake/CMakeLists.txt @@ -170,7 +170,13 @@ endif () target_compile_definitions(_jemalloc PRIVATE -DJEMALLOC_PROF=1) if (USE_UNWIND) - target_compile_definitions (_jemalloc PRIVATE -DJEMALLOC_PROF_LIBUNWIND=1) + # jemalloc provides support for two different libunwind flavors: the original HP libunwind and the one coming with gcc / g++ / libstdc++. + # The latter is identified by `JEMALLOC_PROF_LIBGCC` and uses `_Unwind_Backtrace` method instead of `unw_backtrace`. + # At the time ClickHouse uses LLVM libunwind which follows libgcc's way of backtracing. + + # ClickHouse has to provide `unw_backtrace` method by the means of [commit 8e2b31e](https://github.com/ClickHouse/libunwind/commit/8e2b31e766dd502f6df74909e04a7dbdf5182eb1). + + target_compile_definitions (_jemalloc PRIVATE -DJEMALLOC_PROF_LIBGCC=1) target_link_libraries (_jemalloc PRIVATE unwind) endif () diff --git a/docker/docs/release/Dockerfile b/docker/docs/release/Dockerfile new file mode 100644 index 00000000000..89536889746 --- /dev/null +++ b/docker/docs/release/Dockerfile @@ -0,0 +1,44 @@ +# docker build -t clickhouse/docs-release . 
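+# The image is registered in docker/images.json as "clickhouse/docs-release" and is used by the docs release CI (.github/workflows/docs_release.yml) to build and publish the website.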
+FROM ubuntu:20.04 + +# ARG for quick switch to a given ubuntu mirror +ARG apt_archive="http://archive.ubuntu.com" +RUN sed -i "s|http://archive.ubuntu.com|$apt_archive|g" /etc/apt/sources.list + +ENV LANG=C.UTF-8 + +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ + wget \ + bash \ + python \ + curl \ + python3-requests \ + sudo \ + git \ + openssl \ + python3-pip \ + software-properties-common \ + fonts-arphic-ukai \ + fonts-arphic-uming \ + fonts-ipafont-mincho \ + fonts-ipafont-gothic \ + fonts-unfonts-core \ + xvfb \ + ssh-client \ + && apt-get autoremove --yes \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN pip3 install --ignore-installed --upgrade setuptools pip virtualenv + +# We create the most popular default 1000:1000 ubuntu user to not have ssh issues when running with UID==1000 +RUN useradd --create-home --uid 1000 --user-group ubuntu \ + && ssh-keyscan -t rsa github.com >> /etc/ssh/ssh_known_hosts + +COPY run.sh / + +ENV REPO_PATH=/repo_path +ENV OUTPUT_PATH=/output_path + +CMD ["/bin/bash", "/run.sh"] diff --git a/docker/docs/release/run.sh b/docker/docs/release/run.sh new file mode 100644 index 00000000000..e5a9f2101aa --- /dev/null +++ b/docker/docs/release/run.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail + +cd "$REPO_PATH/docs/tools" +if ! [ -d venv ]; then + mkdir -p venv + virtualenv -p "$(which python3)" venv + source venv/bin/activate + python3 -m pip install --ignore-installed -r requirements.txt +fi +source venv/bin/activate +./release.sh 2>&1 | tee "$OUTPUT_PATH/output.log" diff --git a/docker/images.json b/docker/images.json index 9b7d44bc990..181452f17bc 100644 --- a/docker/images.json +++ b/docker/images.json @@ -146,5 +146,9 @@ "name": "clickhouse/docs-builder", "dependent": [ ] + }, + "docker/docs/release": { + "name": "clickhouse/docs-release", + "dependent": [] } } diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index 3a660d9cf15..cafc62b365e 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -177,7 +177,6 @@ function clone_submodules contrib/jemalloc contrib/replxx contrib/wyhash - contrib/eigen contrib/hashidsxx ) diff --git a/docker/test/stress/stress b/docker/test/stress/stress index d78de84f60d..94fdfd536a7 100755 --- a/docker/test/stress/stress +++ b/docker/test/stress/stress @@ -3,8 +3,6 @@ from multiprocessing import cpu_count from subprocess import Popen, call, check_output, STDOUT import os -import sys -import shutil import argparse import logging import time @@ -31,6 +29,9 @@ def get_options(i, backward_compatibility_check): if i % 5 == 1: client_options.append("join_use_nulls=1") + if i % 15 == 1: + client_options.append("join_algorithm='parallel_hash'") + if i % 15 == 6: client_options.append("join_algorithm='partial_merge'") diff --git a/docs/_includes/cmake_in_clickhouse_header.md b/docs/_includes/cmake_in_clickhouse_header.md index de4b1ef7af5..2f2e0421946 100644 --- a/docs/_includes/cmake_in_clickhouse_header.md +++ b/docs/_includes/cmake_in_clickhouse_header.md @@ -9,11 +9,6 @@ cmake .. 
\ -DCMAKE_C_COMPILER=$(which clang-14) \ -DCMAKE_CXX_COMPILER=$(which clang++-14) \ -DCMAKE_BUILD_TYPE=Debug \ - -DENABLE_CLICKHOUSE_ALL=OFF \ - -DENABLE_CLICKHOUSE_SERVER=ON \ - -DENABLE_CLICKHOUSE_CLIENT=ON \ - -DENABLE_LIBRARIES=OFF \ - -DUSE_UNWIND=ON \ -DENABLE_UTILS=OFF \ -DENABLE_TESTS=OFF ``` diff --git a/docs/en/development/adding_test_queries.md b/docs/en/development/adding_test_queries.md index 9b993a96ed5..ca47818dad7 100644 --- a/docs/en/development/adding_test_queries.md +++ b/docs/en/development/adding_test_queries.md @@ -106,7 +106,7 @@ vim tests/queries/0_stateless/01521_dummy_test.sql 4) run the test, and put the result of that into the reference file: ``` -clickhouse-client -nmT < tests/queries/0_stateless/01521_dummy_test.sql | tee tests/queries/0_stateless/01521_dummy_test.reference +clickhouse-client -nm < tests/queries/0_stateless/01521_dummy_test.sql | tee tests/queries/0_stateless/01521_dummy_test.reference ``` 5) ensure everything is correct, if the test output is incorrect (due to some bug for example), adjust the reference file using text editor. diff --git a/docs/en/development/cmake-in-clickhouse.md b/docs/en/development/cmake-in-clickhouse.md index 65d280df902..a2ea99ecb67 100644 --- a/docs/en/development/cmake-in-clickhouse.md +++ b/docs/en/development/cmake-in-clickhouse.md @@ -13,11 +13,6 @@ cmake .. \ -DCMAKE_C_COMPILER=$(which clang-13) \ -DCMAKE_CXX_COMPILER=$(which clang++-13) \ -DCMAKE_BUILD_TYPE=Debug \ - -DENABLE_CLICKHOUSE_ALL=OFF \ - -DENABLE_CLICKHOUSE_SERVER=ON \ - -DENABLE_CLICKHOUSE_CLIENT=ON \ - -DENABLE_LIBRARIES=OFF \ - -DUSE_UNWIND=ON \ -DENABLE_UTILS=OFF \ -DENABLE_TESTS=OFF ``` diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index e382bbcddd8..31f948cbb00 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -31,8 +31,11 @@ The supported formats are: | [JSON](#json) | ✗ | ✔ | | [JSONAsString](#jsonasstring) | ✔ | ✗ | | [JSONStrings](#jsonstrings) | ✗ | ✔ | +| [JSONColumns](#jsoncolumns) | ✔ | ✔ | +| [JSONColumnsWithMetadata](#jsoncolumnswithmetadata) | ✗ | ✔ | | [JSONCompact](#jsoncompact) | ✗ | ✔ | | [JSONCompactStrings](#jsoncompactstrings) | ✗ | ✔ | +| [JSONCompactColumns](#jsoncompactcolumns) | ✔ | ✔ | | [JSONEachRow](#jsoneachrow) | ✔ | ✔ | | [JSONEachRowWithProgress](#jsoneachrowwithprogress) | ✗ | ✔ | | [JSONStringsEachRow](#jsonstringseachrow) | ✔ | ✔ | @@ -400,6 +403,8 @@ Both data output and parsing are supported in this format. For parsing, any orde Parsing allows the presence of the additional field `tskv` without the equal sign or a value. This field is ignored. +During import, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](../operations/settings/settings.md#settings-input-format-skip-unknown-fields) is set to 1. + ## CSV {#csv} Comma Separated Values format ([RFC](https://tools.ietf.org/html/rfc4180)). 
@@ -459,15 +464,15 @@ SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase WITH TOTA "meta": [ { - "name": "'hello'", + "name": "num", + "type": "Int32" + }, + { + "name": "str", "type": "String" }, { - "name": "multiply(42, number)", - "type": "UInt64" - }, - { - "name": "range(5)", + "name": "arr", "type": "Array(UInt8)" } ], @@ -475,25 +480,32 @@ SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase WITH TOTA "data": [ { - "'hello'": "hello", - "multiply(42, number)": "0", - "range(5)": [0,1,2,3,4] + "num": 42, + "str": "hello", + "arr": [0,1] }, { - "'hello'": "hello", - "multiply(42, number)": "42", - "range(5)": [0,1,2,3,4] + "num": 43, + "str": "hello", + "arr": [0,1,2] }, { - "'hello'": "hello", - "multiply(42, number)": "84", - "range(5)": [0,1,2,3,4] + "num": 44, + "str": "hello", + "arr": [0,1,2,3] } ], "rows": 3, - "rows_before_limit_at_least": 3 + "rows_before_limit_at_least": 3, + + "statistics": + { + "elapsed": 0.001137687, + "rows_read": 3, + "bytes_read": 24 + } } ``` @@ -528,15 +540,15 @@ Example: "meta": [ { - "name": "'hello'", + "name": "num", + "type": "Int32" + }, + { + "name": "str", "type": "String" }, { - "name": "multiply(42, number)", - "type": "UInt64" - }, - { - "name": "range(5)", + "name": "arr", "type": "Array(UInt8)" } ], @@ -544,25 +556,95 @@ Example: "data": [ { - "'hello'": "hello", - "multiply(42, number)": "0", - "range(5)": "[0,1,2,3,4]" + "num": "42", + "str": "hello", + "arr": "[0,1]" }, { - "'hello'": "hello", - "multiply(42, number)": "42", - "range(5)": "[0,1,2,3,4]" + "num": "43", + "str": "hello", + "arr": "[0,1,2]" }, { - "'hello'": "hello", - "multiply(42, number)": "84", - "range(5)": "[0,1,2,3,4]" + "num": "44", + "str": "hello", + "arr": "[0,1,2,3]" } ], "rows": 3, - "rows_before_limit_at_least": 3 + "rows_before_limit_at_least": 3, + + "statistics": + { + "elapsed": 0.001403233, + "rows_read": 3, + "bytes_read": 24 + } +} +``` + +## JSONColumns {#jsoncolumns} + +In this format, all data is represented as a single JSON Object. +Note that JSONColumns output format buffers all data in memory to output it as a single block and it can lead to high memory consumption. + +Example: +```json +{ + "num": [42, 43, 44], + "str": ["hello", "hello", "hello"], + "arr": [[0,1], [0,1,2], [0,1,2,3]] +} +``` + +During import, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](../operations/settings/settings.md#settings-input-format-skip-unknown-fields) is set to 1. +Columns that are not present in the block will be filled with default values (you can use [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields) setting here) + + +## JSONColumnsWithMetadata {#jsoncolumnsmonoblock} + +Differs from JSONColumns output format in that it also outputs some metadata and statistics (similar to JSON output format). +This format buffers all data in memory and then outputs them as a single block, so, it can lead to high memory consumption. 
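+A minimal sketch of how such output can be produced (the table `t` with columns `num`, `str` and `arr` is hypothetical); a query of this shape yields the output shown in the example below:
+
+```sql
+SELECT num, str, arr FROM t FORMAT JSONColumnsWithMetadata;
+```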
+ +Example: +```json +{ + "meta": + [ + { + "name": "num", + "type": "Int32" + }, + { + "name": "str", + "type": "String" + }, + + { + "name": "arr", + "type": "Array(UInt8)" + } + ], + + "data": + { + "num": [42, 43, 44], + "str": ["hello", "hello", "hello"], + "arr": [[0,1], [0,1,2], [0,1,2,3]] + }, + + "rows": 3, + + "rows_before_limit_at_least": 3, + + "statistics": + { + "elapsed": 0.000272376, + "rows_read": 3, + "bytes_read": 24 + } } ``` @@ -618,71 +700,101 @@ Result: Differs from JSON only in that data rows are output in arrays, not in objects. +Examples: + +1) JSONCompact: +```json +{ + "meta": + [ + { + "name": "num", + "type": "Int32" + }, + { + "name": "str", + "type": "String" + }, + { + "name": "arr", + "type": "Array(UInt8)" + } + ], + + "data": + [ + [42, "hello", [0,1]], + [43, "hello", [0,1,2]], + [44, "hello", [0,1,2,3]] + ], + + "rows": 3, + + "rows_before_limit_at_least": 3, + + "statistics": + { + "elapsed": 0.001222069, + "rows_read": 3, + "bytes_read": 24 + } +} +``` + +2) JSONCompactStrings +```json +{ + "meta": + [ + { + "name": "num", + "type": "Int32" + }, + { + "name": "str", + "type": "String" + }, + { + "name": "arr", + "type": "Array(UInt8)" + } + ], + + "data": + [ + ["42", "hello", "[0,1]"], + ["43", "hello", "[0,1,2]"], + ["44", "hello", "[0,1,2,3]"] + ], + + "rows": 3, + + "rows_before_limit_at_least": 3, + + "statistics": + { + "elapsed": 0.001572097, + "rows_read": 3, + "bytes_read": 24 + } +} +``` + +## JSONCompactColumns {#jsoncompactcolumns} + +In this format, all data is represented as a single JSON Array. +Note that JSONCompactColumns output format buffers all data in memory to output it as a single block and it can lead to high memory consumption + Example: - -``` -// JSONCompact -{ - "meta": - [ - { - "name": "'hello'", - "type": "String" - }, - { - "name": "multiply(42, number)", - "type": "UInt64" - }, - { - "name": "range(5)", - "type": "Array(UInt8)" - } - ], - - "data": - [ - ["hello", "0", [0,1,2,3,4]], - ["hello", "42", [0,1,2,3,4]], - ["hello", "84", [0,1,2,3,4]] - ], - - "rows": 3, - - "rows_before_limit_at_least": 3 -} +```json +[ + [42, 43, 44], + ["hello", "hello", "hello"], + [[0,1], [0,1,2], [0,1,2,3]] +] ``` -``` -// JSONCompactStrings -{ - "meta": - [ - { - "name": "'hello'", - "type": "String" - }, - { - "name": "multiply(42, number)", - "type": "UInt64" - }, - { - "name": "range(5)", - "type": "Array(UInt8)" - } - ], - - "data": - [ - ["hello", "0", "[0,1,2,3,4]"], - ["hello", "42", "[0,1,2,3,4]"], - ["hello", "84", "[0,1,2,3,4]"] - ], - - "rows": 3, - - "rows_before_limit_at_least": 3 -} -``` +Columns that are not present in the block will be filled with default values (you can use [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields) setting here) ## JSONEachRow {#jsoneachrow} ## JSONStringsEachRow {#jsonstringseachrow} @@ -699,15 +811,17 @@ When using these formats, ClickHouse outputs rows as separated, newline-delimite When inserting the data, you should provide a separate JSON value for each row. +In JSONEachRow/JSONStringsEachRow input formats columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](../operations/settings/settings.md#settings-input-format-skip-unknown-fields) is set to 1. 
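+As an illustration of the note above (the table `t(num Int32, str String, arr Array(UInt8))` is hypothetical), the unknown `extra` field is silently dropped instead of causing an error:
+
+```sql
+SET input_format_skip_unknown_fields = 1;
+INSERT INTO t FORMAT JSONEachRow {"num": 42, "str": "hello", "arr": [0,1], "extra": "ignored"};
+```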
+

## JSONEachRowWithProgress {#jsoneachrowwithprogress}
## JSONStringsEachRowWithProgress {#jsonstringseachrowwithprogress}

Differs from `JSONEachRow`/`JSONStringsEachRow` in that ClickHouse will also yield progress information as JSON values.

```json
-{"row":{"'hello'":"hello","multiply(42, number)":"0","range(5)":[0,1,2,3,4]}}
-{"row":{"'hello'":"hello","multiply(42, number)":"42","range(5)":[0,1,2,3,4]}}
-{"row":{"'hello'":"hello","multiply(42, number)":"84","range(5)":[0,1,2,3,4]}}
+{"row":{"num":42,"str":"hello","arr":[0,1]}}
+{"row":{"num":43,"str":"hello","arr":[0,1,2]}}
+{"row":{"num":44,"str":"hello","arr":[0,1,2,3]}}
{"progress":{"read_rows":"3","read_bytes":"24","written_rows":"0","written_bytes":"0","total_rows_to_read":"3"}}
```
@@ -728,11 +842,11 @@ Differs from `JSONCompactStringsEachRow` in that it also prints the head

Differs from `JSONCompactStringsEachRow` in that it also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes).

```json
-["'hello'", "multiply(42, number)", "range(5)"]
-["String", "UInt64", "Array(UInt8)"]
-["hello", "0", [0,1,2,3,4]]
-["hello", "42", [0,1,2,3,4]]
-["hello", "84", [0,1,2,3,4]]
+["num", "str", "arr"]
+["Int32", "String", "Array(UInt8)"]
+[42, "hello", [0,1]]
+[43, "hello", [0,1,2]]
+[44, "hello", [0,1,2,3]]
```

### Inserting Data {#inserting-data}
diff --git a/docs/en/sql-reference/functions/uuid-functions.md b/docs/en/sql-reference/functions/uuid-functions.md
index d23b505a93f..08f281ba281 100644
--- a/docs/en/sql-reference/functions/uuid-functions.md
+++ b/docs/en/sql-reference/functions/uuid-functions.md
@@ -11,10 +11,16 @@ The functions for working with UUID are listed below.

Generates the [UUID](../data-types/uuid.md) of [version 4](https://tools.ietf.org/html/rfc4122#section-4.4).

+**Syntax**
+
``` sql
-generateUUIDv4()
+generateUUIDv4([x])
```

+**Arguments**
+
+- `x` — [Expression](../../sql-reference/syntax.md#syntax-expressions) resulting in any of the [supported data types](../../sql-reference/data-types/index.md#data_types). The resulting value is discarded, but the expression itself is used to bypass [common subexpression elimination](../../sql-reference/functions/index.md#common-subexpression-elimination) if the function is called multiple times in one query. Optional parameter.
+
**Returned value**

The UUID type value.
@@ -37,6 +43,15 @@ SELECT * FROM t_uuid
└──────────────────────────────────────┘
```

+**Usage example when multiple values need to be generated in one row**
+
+```sql
+SELECT generateUUIDv4(1), generateUUIDv4(2)
+┌─generateUUIDv4(1)────────────────────┬─generateUUIDv4(2)────────────────────┐
+│ 2d49dc6e-ddce-4cd0-afb8-790956df54c1 │ 8abf8c13-7dea-4fdf-af3e-0e18767770e6 │
+└──────────────────────────────────────┴──────────────────────────────────────┘
+```
+
## empty {#empty}

Checks whether the input UUID is empty.
diff --git a/docs/en/sql-reference/functions/ym-dict-functions.md b/docs/en/sql-reference/functions/ym-dict-functions.md
index 85215957443..4fc727844e7 100644
--- a/docs/en/sql-reference/functions/ym-dict-functions.md
+++ b/docs/en/sql-reference/functions/ym-dict-functions.md
@@ -105,7 +105,7 @@ Example: `regionToCountry(toUInt32(213)) = 225` converts Moscow (213) to Russia

Converts a region to a continent. In every other way, this function is the same as ‘regionToCity’.

Example: `regionToContinent(toUInt32(213)) = 10001` converts Moscow (213) to Eurasia (10001).
-### regionToTopContinent (#regiontotopcontinent) {#regiontotopcontinent-regiontotopcontinent} +### regionToTopContinent(id\[, geobase\]) {#regiontotopcontinentid-geobase} Finds the highest continent in the hierarchy for the region. diff --git a/docs/ru/sql-reference/functions/uuid-functions.md b/docs/ru/sql-reference/functions/uuid-functions.md index babeb0d2693..554e78002b8 100644 --- a/docs/ru/sql-reference/functions/uuid-functions.md +++ b/docs/ru/sql-reference/functions/uuid-functions.md @@ -9,10 +9,16 @@ sidebar_label: "Функции для работы с UUID" Генерирует идентификатор [UUID версии 4](https://tools.ietf.org/html/rfc4122#section-4.4). +**Синтаксис** + ``` sql -generateUUIDv4() +generateUUIDv4([x]) ``` +**Аргументы** + +- `x` — [выражение](../syntax.md#syntax-expressions), возвращающее значение одного из [поддерживаемых типов данных](../data-types/index.md#data_types). Значение используется, чтобы избежать [склейки одинаковых выражений](index.md#common-subexpression-elimination), если функция вызывается несколько раз в одном запросе. Необязательный параметр. + **Возвращаемое значение** Значение типа [UUID](../../sql-reference/functions/uuid-functions.md). @@ -35,6 +41,15 @@ SELECT * FROM t_uuid └──────────────────────────────────────┘ ``` +**Пример использования, для генерации нескольких значений в одной строке** + +```sql +SELECT generateUUIDv4(1), generateUUIDv4(2) +┌─generateUUIDv4(1)────────────────────┬─generateUUIDv4(2)────────────────────┐ +│ 2d49dc6e-ddce-4cd0-afb8-790956df54c1 │ 8abf8c13-7dea-4fdf-af3e-0e18767770e6 │ +└──────────────────────────────────────┴──────────────────────────────────────┘ +``` + ## empty {#empty} Проверяет, является ли входной UUID пустым. diff --git a/docs/tools/blog.py b/docs/tools/blog.py deleted file mode 100644 index 9bb6beae972..00000000000 --- a/docs/tools/blog.py +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/env python3 -import datetime -import logging -import os -import time - -import nav # monkey patches mkdocs - -import mkdocs.commands -from mkdocs import config -from mkdocs import exceptions - -import mdx_clickhouse -import redirects - -import util - - -def build_for_lang(lang, args): - logging.info(f"Building {lang} blog") - - try: - theme_cfg = { - "name": None, - "custom_dir": os.path.join(os.path.dirname(__file__), "..", args.theme_dir), - "language": lang, - "direction": "ltr", - "static_templates": ["404.html"], - "extra": { - "now": int( - time.mktime(datetime.datetime.now().timetuple()) - ) # TODO better way to avoid caching - }, - } - - # the following list of languages is sorted according to - # https://en.wikipedia.org/wiki/List_of_languages_by_total_number_of_speakers - languages = {"en": "English"} - - site_names = {"en": "ClickHouse Blog"} - - assert len(site_names) == len(languages) - - site_dir = os.path.join(args.blog_output_dir, lang) - - plugins = ["macros"] - if args.htmlproofer: - plugins.append("htmlproofer") - - website_url = "https://clickhouse.com" - site_name = site_names.get(lang, site_names["en"]) - blog_nav, post_meta = nav.build_blog_nav(lang, args) - raw_config = dict( - site_name=site_name, - site_url=f"{website_url}/blog/{lang}/", - docs_dir=os.path.join(args.blog_dir, lang), - site_dir=site_dir, - strict=True, - theme=theme_cfg, - nav=blog_nav, - copyright="©2016–2022 ClickHouse, Inc.", - use_directory_urls=True, - repo_name="ClickHouse/ClickHouse", - repo_url="https://github.com/ClickHouse/ClickHouse/", - edit_uri=f"edit/master/website/blog/{lang}", - markdown_extensions=mdx_clickhouse.MARKDOWN_EXTENSIONS, 
- plugins=plugins, - extra=dict( - now=datetime.datetime.now().isoformat(), - rev=args.rev, - rev_short=args.rev_short, - rev_url=args.rev_url, - website_url=website_url, - events=args.events, - languages=languages, - includes_dir=os.path.join(os.path.dirname(__file__), "..", "_includes"), - is_blog=True, - post_meta=post_meta, - today=datetime.date.today().isoformat(), - ), - ) - - cfg = config.load_config(**raw_config) - mkdocs.commands.build.build(cfg) - - redirects.build_blog_redirects(args) - - env = util.init_jinja2_env(args) - with open( - os.path.join(args.website_dir, "templates", "blog", "rss.xml"), "rb" - ) as f: - rss_template_string = f.read().decode("utf-8").strip() - rss_template = env.from_string(rss_template_string) - with open(os.path.join(args.blog_output_dir, lang, "rss.xml"), "w") as f: - f.write(rss_template.render({"config": raw_config})) - - logging.info(f"Finished building {lang} blog") - - except exceptions.ConfigurationError as e: - raise SystemExit("\n" + str(e)) - - -def build_blog(args): - tasks = [] - for lang in args.blog_lang.split(","): - if lang: - tasks.append( - ( - lang, - args, - ) - ) - util.run_function_in_parallel(build_for_lang, tasks, threads=False) diff --git a/docs/tools/build.py b/docs/tools/build.py index f084a8e5c0c..3756cf66794 100755 --- a/docs/tools/build.py +++ b/docs/tools/build.py @@ -1,144 +1,17 @@ #!/usr/bin/env python3 import argparse -import datetime import logging import os import shutil import subprocess import sys -import time -import jinja2 import livereload -import markdown.util -import nav # monkey patches mkdocs - -from mkdocs import config -from mkdocs import exceptions -import mkdocs.commands.build - -import blog -import mdx_clickhouse import redirects -import util import website -from cmake_in_clickhouse_generator import generate_cmake_flags_files - - -class ClickHouseMarkdown(markdown.extensions.Extension): - class ClickHousePreprocessor(markdown.util.Processor): - def run(self, lines): - for line in lines: - if "" not in line: - yield line - - def extendMarkdown(self, md): - md.preprocessors.register( - self.ClickHousePreprocessor(), "clickhouse_preprocessor", 31 - ) - - -markdown.extensions.ClickHouseMarkdown = ClickHouseMarkdown - - -def build_for_lang(lang, args): - logging.info(f"Building {lang} docs") - - try: - theme_cfg = { - "name": None, - "custom_dir": os.path.join(os.path.dirname(__file__), "..", args.theme_dir), - "language": lang, - "direction": "rtl" if lang == "fa" else "ltr", - "static_templates": ["404.html"], - "extra": { - "now": int( - time.mktime(datetime.datetime.now().timetuple()) - ) # TODO better way to avoid caching - }, - } - - # the following list of languages is sorted according to - # https://en.wikipedia.org/wiki/List_of_languages_by_total_number_of_speakers - languages = {"en": "English", "zh": "中文", "ru": "Русский", "ja": "日本語"} - - site_names = { - "en": "ClickHouse %s Documentation", - "zh": "ClickHouse文档 %s", - "ru": "Документация ClickHouse %s", - "ja": "ClickHouseドキュメント %s", - } - - assert len(site_names) == len(languages) - - site_dir = os.path.join(args.docs_output_dir, lang) - - plugins = ["macros"] - if args.htmlproofer: - plugins.append("htmlproofer") - - website_url = "https://clickhouse.com" - site_name = site_names.get(lang, site_names["en"]) % "" - site_name = site_name.replace(" ", " ") - - raw_config = dict( - site_name=site_name, - site_url=f"{website_url}/docs/{lang}/", - docs_dir=os.path.join(args.docs_dir, lang), - site_dir=site_dir, - strict=True, - theme=theme_cfg, 
- copyright="©2016–2022 ClickHouse, Inc.", - use_directory_urls=True, - repo_name="ClickHouse/ClickHouse", - repo_url="https://github.com/ClickHouse/ClickHouse/", - edit_uri=f"edit/master/docs/{lang}", - markdown_extensions=mdx_clickhouse.MARKDOWN_EXTENSIONS, - plugins=plugins, - extra=dict( - now=datetime.datetime.now().isoformat(), - rev=args.rev, - rev_short=args.rev_short, - rev_url=args.rev_url, - website_url=website_url, - events=args.events, - languages=languages, - includes_dir=os.path.join(os.path.dirname(__file__), "..", "_includes"), - is_blog=False, - ), - ) - - raw_config["nav"] = nav.build_docs_nav(lang, args) - - cfg = config.load_config(**raw_config) - - if not args.skip_multi_page: - mkdocs.commands.build.build(cfg) - - mdx_clickhouse.PatchedMacrosPlugin.disabled = False - - logging.info(f"Finished building {lang} docs") - - except exceptions.ConfigurationError as e: - raise SystemExit("\n" + str(e)) - - -def build_docs(args): - tasks = [] - for lang in args.lang.split(","): - if lang: - tasks.append( - ( - lang, - args, - ) - ) - util.run_function_in_parallel(build_for_lang, tasks, threads=False) - redirects.build_docs_redirects(args) - def build(args): if os.path.exists(args.output_dir): @@ -147,14 +20,6 @@ def build(args): if not args.skip_website: website.build_website(args) - if not args.skip_docs: - generate_cmake_flags_files() - - build_docs(args) - - if not args.skip_blog: - blog.build_blog(args) - if not args.skip_website: website.process_benchmark_results(args) website.minify_website(args) @@ -171,20 +36,14 @@ if __name__ == "__main__": arg_parser = argparse.ArgumentParser() arg_parser.add_argument("--lang", default="en,ru,zh,ja") - arg_parser.add_argument("--blog-lang", default="en") - arg_parser.add_argument("--docs-dir", default=".") arg_parser.add_argument("--theme-dir", default=website_dir) arg_parser.add_argument("--website-dir", default=website_dir) arg_parser.add_argument("--src-dir", default=src_dir) - arg_parser.add_argument("--blog-dir", default=os.path.join(website_dir, "blog")) arg_parser.add_argument("--output-dir", default="build") arg_parser.add_argument("--nav-limit", type=int, default="0") arg_parser.add_argument("--skip-multi-page", action="store_true") arg_parser.add_argument("--skip-website", action="store_true") - arg_parser.add_argument("--skip-blog", action="store_true") - arg_parser.add_argument("--skip-docs", action="store_true") arg_parser.add_argument("--htmlproofer", action="store_true") - arg_parser.add_argument("--no-docs-macros", action="store_true") arg_parser.add_argument("--livereload", type=int, default="0") arg_parser.add_argument("--verbose", action="store_true") @@ -196,11 +55,6 @@ if __name__ == "__main__": logging.getLogger("MARKDOWN").setLevel(logging.INFO) - args.docs_output_dir = os.path.join(os.path.abspath(args.output_dir), "docs") - args.blog_output_dir = os.path.join(os.path.abspath(args.output_dir), "blog") - - from github import get_events - args.rev = ( subprocess.check_output("git rev-parse HEAD", shell=True) .decode("utf-8") @@ -212,9 +66,6 @@ if __name__ == "__main__": .strip() ) args.rev_url = f"https://github.com/ClickHouse/ClickHouse/commit/{args.rev}" - args.events = get_events(args) - - from build import build build(args) @@ -223,9 +74,6 @@ if __name__ == "__main__": new_args = sys.executable + " " + " ".join(new_args) server = livereload.Server() - server.watch( - args.docs_dir + "**/*", livereload.shell(new_args, cwd="tools", shell=True) - ) server.watch( args.website_dir + "**/*", 
livereload.shell(new_args, cwd="tools", shell=True), diff --git a/docs/tools/cmake_in_clickhouse_generator.py b/docs/tools/cmake_in_clickhouse_generator.py deleted file mode 100644 index 9bbc94fd206..00000000000 --- a/docs/tools/cmake_in_clickhouse_generator.py +++ /dev/null @@ -1,181 +0,0 @@ -import re -import os -from typing import TextIO, List, Tuple, Optional, Dict - -# name, default value, description -Entity = Tuple[str, str, str] - -# https://regex101.com/r/R6iogw/12 -cmake_option_regex: str = ( - r"^\s*option\s*\(([A-Z_0-9${}]+)\s*(?:\"((?:.|\n)*?)\")?\s*(.*)?\).*$" -) - -ch_master_url: str = "https://github.com/clickhouse/clickhouse/blob/master/" - -name_str: str = '[`{name}`](' + ch_master_url + "{path}#L{line})" -default_anchor_str: str = "[`{name}`](#{anchor})" - -comment_var_regex: str = r"\${(.+)}" -comment_var_replace: str = "`\\1`" - -table_header: str = """ -| Name | Default value | Description | Comment | -|------|---------------|-------------|---------| -""" - -# Needed to detect conditional variables (those which are defined twice) -# name -> (path, values) -entities: Dict[str, Tuple[str, str]] = {} - - -def make_anchor(t: str) -> str: - return "".join( - ["-" if i == "_" else i.lower() for i in t if i.isalpha() or i == "_"] - ) - - -def process_comment(comment: str) -> str: - return re.sub(comment_var_regex, comment_var_replace, comment, flags=re.MULTILINE) - - -def build_entity(path: str, entity: Entity, line_comment: Tuple[int, str]) -> None: - (line, comment) = line_comment - (name, description, default) = entity - - if name in entities: - return - - if len(default) == 0: - formatted_default: str = "`OFF`" - elif default[0] == "$": - formatted_default: str = "`{}`".format(default[2:-1]) - else: - formatted_default: str = "`" + default + "`" - - formatted_name: str = name_str.format( - anchor=make_anchor(name), name=name, path=path, line=line - ) - - formatted_description: str = "".join(description.split("\n")) - - formatted_comment: str = process_comment(comment) - - formatted_entity: str = "| {} | {} | {} | {} |".format( - formatted_name, formatted_default, formatted_description, formatted_comment - ) - - entities[name] = path, formatted_entity - - -def process_file(root_path: str, file_path: str, file_name: str) -> None: - with open(os.path.join(file_path, file_name), "r") as cmake_file: - contents: str = cmake_file.read() - - def get_line_and_comment(target: str) -> Tuple[int, str]: - contents_list: List[str] = contents.split("\n") - comment: str = "" - - for n, line in enumerate(contents_list): - if "option" not in line.lower() or target not in line: - continue - - for maybe_comment_line in contents_list[n - 1 :: -1]: - if not re.match("\s*#\s*", maybe_comment_line): - break - - comment = re.sub("\s*#\s*", "", maybe_comment_line) + " " + comment - - # line numbering starts with 1 - return n + 1, comment - - matches: Optional[List[Entity]] = re.findall( - cmake_option_regex, contents, re.MULTILINE - ) - - file_rel_path_with_name: str = os.path.join( - file_path[len(root_path) :], file_name - ) - if file_rel_path_with_name.startswith("/"): - file_rel_path_with_name = file_rel_path_with_name[1:] - - if matches: - for entity in matches: - build_entity( - file_rel_path_with_name, entity, get_line_and_comment(entity[0]) - ) - - -def process_folder(root_path: str, name: str) -> None: - for root, _, files in os.walk(os.path.join(root_path, name)): - for f in files: - if f == "CMakeLists.txt" or ".cmake" in f: - process_file(root_path, root, f) - - -def 
generate_cmake_flags_files() -> None: - root_path: str = os.path.join(os.path.dirname(__file__), "..", "..") - - output_file_name: str = os.path.join( - root_path, "docs/en/development/cmake-in-clickhouse.md" - ) - header_file_name: str = os.path.join( - root_path, "docs/_includes/cmake_in_clickhouse_header.md" - ) - footer_file_name: str = os.path.join( - root_path, "docs/_includes/cmake_in_clickhouse_footer.md" - ) - - process_file(root_path, root_path, "CMakeLists.txt") - process_file(root_path, os.path.join(root_path, "programs"), "CMakeLists.txt") - - process_folder(root_path, "base") - process_folder(root_path, "cmake") - process_folder(root_path, "src") - - with open(output_file_name, "w") as f: - with open(header_file_name, "r") as header: - f.write(header.read()) - - sorted_keys: List[str] = sorted(entities.keys()) - ignored_keys: List[str] = [] - - f.write("### ClickHouse modes\n" + table_header) - - for k in sorted_keys: - if k.startswith("ENABLE_CLICKHOUSE_"): - f.write(entities[k][1] + "\n") - ignored_keys.append(k) - - f.write( - "\n### External libraries\nNote that ClickHouse uses forks of these libraries, see https://github.com/ClickHouse-Extras.\n" - + table_header - ) - - for k in sorted_keys: - if k.startswith("ENABLE_") and ".cmake" in entities[k][0]: - f.write(entities[k][1] + "\n") - ignored_keys.append(k) - - f.write("\n\n### Other flags\n" + table_header) - - for k in sorted(set(sorted_keys).difference(set(ignored_keys))): - f.write(entities[k][1] + "\n") - - with open(footer_file_name, "r") as footer: - f.write(footer.read()) - - other_languages = [ - "docs/ja/development/cmake-in-clickhouse.md", - "docs/zh/development/cmake-in-clickhouse.md", - "docs/ru/development/cmake-in-clickhouse.md", - ] - - for lang in other_languages: - other_file_name = os.path.join(root_path, lang) - if os.path.exists(other_file_name): - os.unlink(other_file_name) - os.symlink(output_file_name, other_file_name) - - -if __name__ == "__main__": - generate_cmake_flags_files() diff --git a/docs/tools/deploy-to-test.sh b/docs/tools/deploy-to-test.sh index 30771052535..a7a922137d5 100755 --- a/docs/tools/deploy-to-test.sh +++ b/docs/tools/deploy-to-test.sh @@ -12,12 +12,11 @@ # set -ex -BASE_DIR=$(dirname $(readlink -f $0)) +BASE_DIR=$(dirname "$(readlink -f "$0")") GIT_USER=${GIT_USER:-$USER} -GIT_TEST_URI=git@github.com:${GIT_USER}/clickhouse.github.io.git \ +GIT_PROD_URI=git@github.com:${GIT_USER}/clickhouse.github.io.git \ BASE_DOMAIN=${GIT_USER}-test.clickhouse.com \ - EXTRA_BUILD_ARGS="${@}" \ + EXTRA_BUILD_ARGS="${*}" \ CLOUDFLARE_TOKEN="" \ - HISTORY_SIZE=3 \ - ${BASE_DIR}/release.sh + "${BASE_DIR}/release.sh" diff --git a/docs/tools/easy_diff.py b/docs/tools/easy_diff.py deleted file mode 100755 index 14e3ca91776..00000000000 --- a/docs/tools/easy_diff.py +++ /dev/null @@ -1,186 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import os, sys -import argparse -import subprocess -import contextlib -from git import cmd -from tempfile import NamedTemporaryFile - -SCRIPT_DESCRIPTION = """ - usage: ./easy_diff.py language/document path - - Show the difference between a language document and an English document. - - This script is based on the assumption that documents in other languages are fully synchronized with the en document at a commit. 
- - For example: - Execute: - ./easy_diff.py --no-pager zh/data_types - Output: - Need translate document:~/ClickHouse/docs/en/data_types/uuid.md - Need link document:~/ClickHouse/docs/en/data_types/decimal.md to ~/ClickHouse/docs/zh/data_types/decimal.md - diff --git a/docs/en/data_types/domains/ipv6.md b/docs/en/data_types/domains/ipv6.md - index 1bfbe3400b..e2abaff017 100644 - --- a/docs/en/data_types/domains/ipv6.md - +++ b/docs/en/data_types/domains/ipv6.md - @@ -4,13 +4,13 @@ - - ### Basic Usage - - -``` sql - +```sql - CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY url; - - DESCRIBE TABLE hits; - ``` - - -``` - +```text - ┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐ - │ url │ String │ │ │ │ │ - │ from │ IPv6 │ │ │ │ │ - @@ -19,19 +19,19 @@ DESCRIBE TABLE hits; - - OR you can use `IPv6` domain as a key: - - -``` sql - +```sql - CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY from; - ... MORE - - OPTIONS: - -h, --help show this help message and exit - --no-pager use stdout as difference result output -""" - -SCRIPT_PATH = os.path.abspath(__file__) -CLICKHOUSE_REPO_HOME = os.path.join(os.path.dirname(SCRIPT_PATH), "..", "..") -SCRIPT_COMMAND_EXECUTOR = cmd.Git(CLICKHOUSE_REPO_HOME) - -SCRIPT_COMMAND_PARSER = argparse.ArgumentParser(add_help=False) -SCRIPT_COMMAND_PARSER.add_argument("path", type=bytes, nargs="?", default=None) -SCRIPT_COMMAND_PARSER.add_argument("--no-pager", action="store_true", default=False) -SCRIPT_COMMAND_PARSER.add_argument("-h", "--help", action="store_true", default=False) - - -def execute(commands): - return SCRIPT_COMMAND_EXECUTOR.execute(commands) - - -def get_hash(file_name): - return execute(["git", "log", "-n", "1", '--pretty=format:"%H"', file_name]) - - -def diff_file(reference_file, working_file, out): - if not os.path.exists(reference_file): - raise RuntimeError( - "reference file [" + os.path.abspath(reference_file) + "] is not exists." - ) - - if os.path.islink(working_file): - out.writelines(["Need translate document:" + os.path.abspath(reference_file)]) - elif not os.path.exists(working_file): - out.writelines( - [ - "Need link document " - + os.path.abspath(reference_file) - + " to " - + os.path.abspath(working_file) - ] - ) - elif get_hash(working_file) != get_hash(reference_file): - out.writelines( - [ - ( - execute( - [ - "git", - "diff", - get_hash(working_file).strip('"'), - reference_file, - ] - ).encode("utf-8") - ) - ] - ) - - return 0 - - -def diff_directory(reference_directory, working_directory, out): - if not os.path.isdir(reference_directory): - return diff_file(reference_directory, working_directory, out) - - for list_item in os.listdir(reference_directory): - working_item = os.path.join(working_directory, list_item) - reference_item = os.path.join(reference_directory, list_item) - if ( - diff_file(reference_item, working_item, out) - if os.path.isfile(reference_item) - else diff_directory(reference_item, working_item, out) != 0 - ): - return 1 - - return 0 - - -def find_language_doc(custom_document, other_language="en", children=[]): - if len(custom_document) == 0: - raise RuntimeError( - "The " - + os.path.join(custom_document, *children) - + " is not in docs directory." 
- ) - - if os.path.samefile(os.path.join(CLICKHOUSE_REPO_HOME, "docs"), custom_document): - return os.path.join(CLICKHOUSE_REPO_HOME, "docs", other_language, *children[1:]) - children.insert(0, os.path.split(custom_document)[1]) - return find_language_doc( - os.path.split(custom_document)[0], other_language, children - ) - - -class ToPager: - def __init__(self, temp_named_file): - self.temp_named_file = temp_named_file - - def writelines(self, lines): - self.temp_named_file.writelines(lines) - - def close(self): - self.temp_named_file.flush() - git_pager = execute(["git", "var", "GIT_PAGER"]) - subprocess.check_call([git_pager, self.temp_named_file.name]) - self.temp_named_file.close() - - -class ToStdOut: - def writelines(self, lines): - self.system_stdout_stream.writelines(lines) - - def close(self): - self.system_stdout_stream.flush() - - def __init__(self, system_stdout_stream): - self.system_stdout_stream = system_stdout_stream - - -if __name__ == "__main__": - arguments = SCRIPT_COMMAND_PARSER.parse_args() - if arguments.help or not arguments.path: - sys.stdout.write(SCRIPT_DESCRIPTION) - sys.exit(0) - - working_language = os.path.join(CLICKHOUSE_REPO_HOME, "docs", arguments.path) - with contextlib.closing( - ToStdOut(sys.stdout) - if arguments.no_pager - else ToPager(NamedTemporaryFile("r+")) - ) as writer: - exit( - diff_directory( - find_language_doc(working_language), working_language, writer - ) - ) diff --git a/docs/tools/github.py b/docs/tools/github.py deleted file mode 100644 index 3a6f155e25d..00000000000 --- a/docs/tools/github.py +++ /dev/null @@ -1,41 +0,0 @@ -import collections -import copy -import io -import logging -import os -import random -import sys -import tarfile -import time - -import requests - -import util - - -def get_events(args): - events = [] - skip = True - with open(os.path.join(args.docs_dir, "..", "README.md")) as f: - for line in f: - if skip: - if "Upcoming Events" in line: - skip = False - else: - if not line: - continue - line = line.strip().split("](") - if len(line) == 2: - tail = line[1].split(") ") - events.append( - { - "signup_link": tail[0], - "event_name": line[0].replace("* [", ""), - "event_date": tail[1].replace("on ", "").replace(".", ""), - } - ) - return events - - -if __name__ == "__main__": - logging.basicConfig(level=logging.DEBUG, stream=sys.stderr) diff --git a/docs/tools/nav.py b/docs/tools/nav.py deleted file mode 100644 index e3df85bbe4e..00000000000 --- a/docs/tools/nav.py +++ /dev/null @@ -1,190 +0,0 @@ -import collections -import datetime -import hashlib -import logging -import os - -import mkdocs.structure.nav - -import util - - -def find_first_header(content): - for line in content.split("\n"): - if line.startswith("#"): - no_hash = line.lstrip("#") - return no_hash.split("{", 1)[0].strip() - - -def build_nav_entry(root, args): - if root.endswith("images"): - return None, None, None - result_items = [] - index_meta, index_content = util.read_md_file(os.path.join(root, "index.md")) - current_title = index_meta.get("toc_folder_title", index_meta.get("toc_title")) - current_title = current_title or index_meta.get( - "title", find_first_header(index_content) - ) - for filename in os.listdir(root): - path = os.path.join(root, filename) - if os.path.isdir(path): - prio, title, payload = build_nav_entry(path, args) - if title and payload: - result_items.append((prio, title, payload)) - elif filename.endswith(".md"): - path = os.path.join(root, filename) - - meta = "" - content = "" - - try: - meta, content = 
util.read_md_file(path) - except: - print("Error in file: {}".format(path)) - raise - - path = path.split("/", 2)[-1] - title = meta.get("toc_title", find_first_header(content)) - if title: - title = title.strip().rstrip(".") - else: - title = meta.get("toc_folder_title", "hidden") - prio = meta.get("toc_priority", 9999) - logging.debug(f"Nav entry: {prio}, {title}, {path}") - if meta.get("toc_hidden") or not content.strip(): - title = "hidden" - if title == "hidden": - title = "hidden-" + hashlib.sha1(content.encode("utf-8")).hexdigest() - if args.nav_limit and len(result_items) >= args.nav_limit: - break - result_items.append((prio, title, path)) - result_items = sorted(result_items, key=lambda x: (x[0], x[1])) - result = collections.OrderedDict([(item[1], item[2]) for item in result_items]) - if index_meta.get("toc_hidden_folder"): - current_title += "|hidden-folder" - return index_meta.get("toc_priority", 10000), current_title, result - - -def build_docs_nav(lang, args): - docs_dir = os.path.join(args.docs_dir, lang) - _, _, nav = build_nav_entry(docs_dir, args) - result = [] - index_key = None - for key, value in list(nav.items()): - if key and value: - if value == "index.md": - index_key = key - continue - result.append({key: value}) - if args.nav_limit and len(result) >= args.nav_limit: - break - if index_key: - key = list(result[0].keys())[0] - result[0][key][index_key] = "index.md" - result[0][key].move_to_end(index_key, last=False) - return result - - -def build_blog_nav(lang, args): - blog_dir = os.path.join(args.blog_dir, lang) - years = sorted(os.listdir(blog_dir), reverse=True) - result_nav = [{"hidden": "index.md"}] - post_meta = collections.OrderedDict() - for year in years: - year_dir = os.path.join(blog_dir, year) - if not os.path.isdir(year_dir): - continue - result_nav.append({year: collections.OrderedDict()}) - posts = [] - post_meta_items = [] - for post in os.listdir(year_dir): - post_path = os.path.join(year_dir, post) - if not post.endswith(".md"): - raise RuntimeError( - f"Unexpected non-md file in posts folder: {post_path}" - ) - meta, _ = util.read_md_file(post_path) - post_date = meta["date"] - post_title = meta["title"] - if datetime.date.fromisoformat(post_date) > datetime.date.today(): - continue - posts.append( - ( - post_date, - post_title, - os.path.join(year, post), - ) - ) - if post_title in post_meta: - raise RuntimeError(f"Duplicate post title: {post_title}") - if not post_date.startswith(f"{year}-"): - raise RuntimeError( - f"Post date {post_date} doesn't match the folder year {year}: {post_title}" - ) - post_url_part = post.replace(".md", "") - post_meta_items.append( - ( - post_date, - { - "date": post_date, - "title": post_title, - "image": meta.get("image"), - "url": f"/blog/{lang}/{year}/{post_url_part}/", - }, - ) - ) - for _, title, path in sorted(posts, reverse=True): - result_nav[-1][year][title] = path - for _, post_meta_item in sorted( - post_meta_items, reverse=True, key=lambda item: item[0] - ): - post_meta[post_meta_item["title"]] = post_meta_item - return result_nav, post_meta - - -def _custom_get_navigation(files, config): - nav_config = config["nav"] or mkdocs.structure.nav.nest_paths( - f.src_path for f in files.documentation_pages() - ) - items = mkdocs.structure.nav._data_to_navigation(nav_config, files, config) - if not isinstance(items, list): - items = [items] - - pages = mkdocs.structure.nav._get_by_type(items, mkdocs.structure.nav.Page) - - mkdocs.structure.nav._add_previous_and_next_links(pages) - 
mkdocs.structure.nav._add_parent_links(items) - - missing_from_config = [ - file for file in files.documentation_pages() if file.page is None - ] - if missing_from_config: - files._files = [ - file for file in files._files if file not in missing_from_config - ] - - links = mkdocs.structure.nav._get_by_type(items, mkdocs.structure.nav.Link) - for link in links: - scheme, netloc, path, params, query, fragment = mkdocs.structure.nav.urlparse( - link.url - ) - if scheme or netloc: - mkdocs.structure.nav.log.debug( - "An external link to '{}' is included in " - "the 'nav' configuration.".format(link.url) - ) - elif link.url.startswith("/"): - mkdocs.structure.nav.log.debug( - "An absolute path to '{}' is included in the 'nav' configuration, " - "which presumably points to an external resource.".format(link.url) - ) - else: - msg = ( - "A relative path to '{}' is included in the 'nav' configuration, " - "which is not found in the documentation files".format(link.url) - ) - mkdocs.structure.nav.log.warning(msg) - return mkdocs.structure.nav.Navigation(items, pages) - - -mkdocs.structure.nav.get_navigation = _custom_get_navigation diff --git a/docs/tools/redirects.py b/docs/tools/redirects.py index 5d222376683..1b5490a040f 100644 --- a/docs/tools/redirects.py +++ b/docs/tools/redirects.py @@ -27,45 +27,6 @@ def write_redirect_html(out_path, to_url): ) -def build_redirect_html(args, base_prefix, lang, output_dir, from_path, to_path): - out_path = os.path.join( - output_dir, - lang, - from_path.replace("/index.md", "/index.html").replace(".md", "/index.html"), - ) - target_path = to_path.replace("/index.md", "/").replace(".md", "/") - - if target_path[0:7] != "http://" and target_path[0:8] != "https://": - to_url = f"/{base_prefix}/{lang}/{target_path}" - else: - to_url = target_path - - to_url = to_url.strip() - write_redirect_html(out_path, to_url) - - -def build_docs_redirects(args): - with open(os.path.join(args.docs_dir, "redirects.txt"), "r") as f: - for line in f: - for lang in args.lang.split(","): - from_path, to_path = line.split(" ", 1) - build_redirect_html( - args, "docs", lang, args.docs_output_dir, from_path, to_path - ) - - -def build_blog_redirects(args): - for lang in args.blog_lang.split(","): - redirects_path = os.path.join(args.blog_dir, lang, "redirects.txt") - if os.path.exists(redirects_path): - with open(redirects_path, "r") as f: - for line in f: - from_path, to_path = line.split(" ", 1) - build_redirect_html( - args, "blog", lang, args.blog_output_dir, from_path, to_path - ) - - def build_static_redirects(args): for static_redirect in [ ("benchmark.html", "/benchmark/dbms/"), diff --git a/docs/tools/release.sh b/docs/tools/release.sh index 3482a0fbcc1..b55841f9da2 100755 --- a/docs/tools/release.sh +++ b/docs/tools/release.sh @@ -1,24 +1,24 @@ #!/usr/bin/env bash set -ex -BASE_DIR=$(dirname $(readlink -f $0)) +BASE_DIR=$(dirname "$(readlink -f "$0")") BUILD_DIR="${BASE_DIR}/../build" PUBLISH_DIR="${BASE_DIR}/../publish" BASE_DOMAIN="${BASE_DOMAIN:-content.clickhouse.com}" -GIT_TEST_URI="${GIT_TEST_URI:-git@github.com:ClickHouse/clickhouse-com-content.git}" -GIT_PROD_URI="git@github.com:ClickHouse/clickhouse-website-content.git" +GIT_PROD_URI="${GIT_PROD_URI:-git@github.com:ClickHouse/clickhouse-com-content.git}" EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS:---verbose}" if [[ -z "$1" ]] then source "${BASE_DIR}/venv/bin/activate" + # shellcheck disable=2086 python3 "${BASE_DIR}/build.py" ${EXTRA_BUILD_ARGS} rm -rf "${PUBLISH_DIR}" mkdir "${PUBLISH_DIR}" && cd "${PUBLISH_DIR}" # 
Will make a repository with website content as the only commit. git init - git remote add origin "${GIT_TEST_URI}" + git remote add origin "${GIT_PROD_URI}" git config user.email "robot-clickhouse@clickhouse.com" git config user.name "robot-clickhouse" @@ -28,7 +28,7 @@ then echo -n "" > README.md echo -n "" > ".nojekyll" cp "${BASE_DIR}/../../LICENSE" . - git add * + git add ./* git add ".nojekyll" git commit --quiet -m "Add new release at $(date)" @@ -40,7 +40,7 @@ then # Turn off logging. set +x - if [[ ! -z "${CLOUDFLARE_TOKEN}" ]] + if [[ -n "${CLOUDFLARE_TOKEN}" ]] then sleep 1m # https://api.cloudflare.com/#zone-purge-files-by-cache-tags,-host-or-prefix diff --git a/docs/tools/requirements.txt b/docs/tools/requirements.txt index dd641c13629..afd6b1a889d 100644 --- a/docs/tools/requirements.txt +++ b/docs/tools/requirements.txt @@ -1,39 +1,30 @@ Babel==2.9.1 -backports-abc==0.5 -backports.functools-lru-cache==1.6.1 -beautifulsoup4==4.9.1 -certifi==2020.4.5.2 -chardet==3.0.4 -click==7.1.2 -closure==20191111 -cssmin==0.2.0 -future==0.18.2 -htmlmin==0.1.12 -idna==2.10 Jinja2==3.0.3 -jinja2-highlight==0.6.1 -jsmin==3.0.0 -livereload==2.6.3 Markdown==3.3.2 -MarkupSafe==2.1.0 -mkdocs==1.3.0 -mkdocs-htmlproofer-plugin==0.0.3 -mkdocs-macros-plugin==0.4.20 -nltk==3.7 -nose==1.3.7 -protobuf==3.14.0 -numpy==1.21.2 -pymdown-extensions==8.0 -python-slugify==4.0.1 +MarkupSafe==2.1.1 PyYAML==6.0 -repackage==0.7.3 -requests==2.25.1 -singledispatch==3.4.0.3 +Pygments>=2.12.0 +beautifulsoup4==4.9.1 +click==7.1.2 +ghp_import==2.1.0 +importlib_metadata==4.11.4 +jinja2-highlight==0.6.1 +livereload==2.6.3 +mergedeep==1.3.4 +mkdocs-macros-plugin==0.4.20 +mkdocs-macros-test==0.1.0 +mkdocs-material==8.2.15 +mkdocs==1.3.0 +mkdocs_material_extensions==1.0.3 +packaging==21.3 +pymdown_extensions==9.4 +pyparsing==3.0.9 +python-slugify==4.0.1 +python_dateutil==2.8.2 +pytz==2022.1 six==1.15.0 -soupsieve==2.0.1 +soupsieve==2.3.2 termcolor==1.1.0 +text_unidecode==1.3 tornado==6.1 -Unidecode==1.1.1 -urllib3>=1.26.8 -Pygments>=2.11.2 - +zipp==3.8.0 diff --git a/docs/tools/util.py b/docs/tools/util.py index ec670725122..a5ebb1b11b2 100644 --- a/docs/tools/util.py +++ b/docs/tools/util.py @@ -124,7 +124,7 @@ def init_jinja2_env(args): env = jinja2.Environment( loader=jinja2.FileSystemLoader( - [args.website_dir, os.path.join(args.docs_dir, "_includes")] + [args.website_dir, os.path.join(args.src_dir, "docs", "_includes")] ), extensions=["jinja2.ext.i18n", "jinja2_highlight.HighlightExtension"], ) diff --git a/docs/tools/webpack.config.js b/docs/tools/webpack.config.js deleted file mode 100644 index e0dea964101..00000000000 --- a/docs/tools/webpack.config.js +++ /dev/null @@ -1,81 +0,0 @@ -const path = require('path'); -const jsPath = path.resolve(__dirname, '../../website/src/js'); -const scssPath = path.resolve(__dirname, '../../website/src/scss'); - -console.log(path.resolve(__dirname, 'node_modules/bootstrap', require('bootstrap/package.json').sass)); - -module.exports = { - - mode: ('development' === process.env.NODE_ENV) && 'development' || 'production', - - ...(('development' === process.env.NODE_ENV) && { - watch: true, - }), - - entry: [ - path.resolve(scssPath, 'bootstrap.scss'), - path.resolve(scssPath, 'main.scss'), - path.resolve(jsPath, 'main.js'), - ], - - output: { - path: path.resolve(__dirname, '../../website'), - filename: 'js/main.js', - }, - - resolve: { - alias: { - bootstrap: path.resolve(__dirname, 'node_modules/bootstrap', require('bootstrap/package.json').sass), - }, - }, - - module: { - rules: 
[{ - test: /\.js$/, - exclude: /(node_modules)/, - use: [{ - loader: 'babel-loader', - options: { - presets: ['@babel/preset-env'], - }, - }], - }, { - test: /\.scss$/, - use: [{ - loader: 'file-loader', - options: { - sourceMap: true, - outputPath: (url, entryPath, context) => { - if (0 === entryPath.indexOf(scssPath)) { - const outputFile = entryPath.slice(entryPath.lastIndexOf('/') + 1, -5) - const outputPath = entryPath.slice(0, entryPath.lastIndexOf('/')).slice(scssPath.length + 1) - return `./css/${outputPath}/${outputFile}.css` - } - return `./css/${url}` - }, - }, - }, { - loader: 'postcss-loader', - options: { - options: {}, - plugins: () => ([ - require('autoprefixer'), - ('production' === process.env.NODE_ENV) && require('cssnano'), - ].filter(plugin => plugin)), - } - }, { - loader: 'sass-loader', - options: { - implementation: require('sass'), - implementation: require('sass'), - sourceMap: ('development' === process.env.NODE_ENV), - sassOptions: { - importer: require('node-sass-glob-importer')(), - precision: 10, - }, - }, - }], - }], - }, - -}; diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index defc66b0ed9..4a964b81694 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1314,7 +1314,7 @@ int Server::main(const std::vector & /*args*/) global_context->setConfigReloadCallback([&]() { main_config_reloader->reload(); - access_control.reloadUsersConfigs(); + access_control.reload(); }); /// Limit on total number of concurrently executed queries. @@ -1405,6 +1405,7 @@ int Server::main(const std::vector & /*args*/) /// Stop reloading of the main config. This must be done before `global_context->shutdown()` because /// otherwise the reloading may pass a changed config to some destroyed parts of ContextSharedPart. main_config_reloader.reset(); + access_control.stopPeriodicReloading(); async_metrics.stop(); @@ -1628,7 +1629,7 @@ int Server::main(const std::vector & /*args*/) buildLoggers(config(), logger()); main_config_reloader->start(); - access_control.startPeriodicReloadingUsersConfigs(); + access_control.startPeriodicReloading(); if (dns_cache_updater) dns_cache_updater->start(); diff --git a/programs/server/play.html b/programs/server/play.html index 06fc5d8de9a..6b530790ad0 100644 --- a/programs/server/play.html +++ b/programs/server/play.html @@ -81,6 +81,8 @@ { height: 100%; margin: 0; + /* This enables position: sticky on controls */ + overflow: auto; } html @@ -89,9 +91,26 @@ font-family: Liberation Sans, DejaVu Sans, sans-serif, Noto Color Emoji, Apple Color Emoji, Segoe UI Emoji; background: var(--background-color); color: var(--text-color); + } + + body + { + /* This element will show scroll-bar on overflow, and the scroll-bar will be outside of the padding. */ padding: 0.5rem; } + #controls + { + /* Make enough space for even huge queries. */ + height: 20%; + /* When a page will be scrolled horizontally due to large table size, keep controls in place. */ + position: sticky; + left: 0; + /* This allows query textarea to occupy the remaining height while other elements have fixed height. */ + display: flex; + flex-direction: column; + } + /* Otherwise Webkit based browsers will display ugly border on focus. */ textarea, input, button { @@ -129,8 +148,7 @@ #query_div { - /* Make enough space for even huge queries. */ - height: 20%; + height: 100%; } #query @@ -380,19 +398,21 @@ -
[markup garbled in extraction: the removed hunk deleted the old input row, the #query_div block with the query textarea, and the run-button block with its "(Ctrl/Cmd+Enter)" hint and the 🌑/🌞 theme toggles; the added hunk restores the same elements wrapped in a new <div id="controls"> element so the CSS changes above can pin them in place.]
diff --git a/src/Access/AccessChangesNotifier.cpp b/src/Access/AccessChangesNotifier.cpp new file mode 100644 index 00000000000..05516285efb --- /dev/null +++ b/src/Access/AccessChangesNotifier.cpp @@ -0,0 +1,122 @@ +#include +#include + + +namespace DB +{ + +AccessChangesNotifier::AccessChangesNotifier() : handlers(std::make_shared()) +{ +} + +AccessChangesNotifier::~AccessChangesNotifier() = default; + +void AccessChangesNotifier::onEntityAdded(const UUID & id, const AccessEntityPtr & new_entity) +{ + std::lock_guard lock{queue_mutex}; + Event event; + event.id = id; + event.entity = new_entity; + event.type = new_entity->getType(); + queue.push(std::move(event)); +} + +void AccessChangesNotifier::onEntityUpdated(const UUID & id, const AccessEntityPtr & changed_entity) +{ + std::lock_guard lock{queue_mutex}; + Event event; + event.id = id; + event.entity = changed_entity; + event.type = changed_entity->getType(); + queue.push(std::move(event)); +} + +void AccessChangesNotifier::onEntityRemoved(const UUID & id, AccessEntityType type) +{ + std::lock_guard lock{queue_mutex}; + Event event; + event.id = id; + event.type = type; + queue.push(std::move(event)); +} + +scope_guard AccessChangesNotifier::subscribeForChanges(AccessEntityType type, const OnChangedHandler & handler) +{ + std::lock_guard lock{handlers->mutex}; + auto & list = handlers->by_type[static_cast(type)]; + list.push_back(handler); + auto handler_it = std::prev(list.end()); + + return [handlers=handlers, type, handler_it] + { + std::lock_guard lock2{handlers->mutex}; + auto & list2 = handlers->by_type[static_cast(type)]; + list2.erase(handler_it); + }; +} + +scope_guard AccessChangesNotifier::subscribeForChanges(const UUID & id, const OnChangedHandler & handler) +{ + std::lock_guard lock{handlers->mutex}; + auto it = handlers->by_id.emplace(id, std::list{}).first; + auto & list = it->second; + list.push_back(handler); + auto handler_it = std::prev(list.end()); + + return [handlers=handlers, it, handler_it] + { + std::lock_guard lock2{handlers->mutex}; + auto & list2 = it->second; + list2.erase(handler_it); + if (list2.empty()) + handlers->by_id.erase(it); + }; +} + + +scope_guard AccessChangesNotifier::subscribeForChanges(const std::vector & ids, const OnChangedHandler & handler) +{ + scope_guard subscriptions; + for (const auto & id : ids) + subscriptions.join(subscribeForChanges(id, handler)); + return subscriptions; +} + +void AccessChangesNotifier::sendNotifications() +{ + /// Only one thread can send notification at any time. + std::lock_guard sending_notifications_lock{sending_notifications}; + + std::unique_lock queue_lock{queue_mutex}; + while (!queue.empty()) + { + auto event = std::move(queue.front()); + queue.pop(); + queue_lock.unlock(); + + std::vector current_handlers; + { + std::lock_guard handlers_lock{handlers->mutex}; + boost::range::copy(handlers->by_type[static_cast(event.type)], std::back_inserter(current_handlers)); + auto it = handlers->by_id.find(event.id); + if (it != handlers->by_id.end()) + boost::range::copy(it->second, std::back_inserter(current_handlers)); + } + + for (const auto & handler : current_handlers) + { + try + { + handler(event.id, event.entity); + } + catch (...) 
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + } + + queue_lock.lock(); + } +} + +} diff --git a/src/Access/AccessChangesNotifier.h b/src/Access/AccessChangesNotifier.h new file mode 100644 index 00000000000..46a7cdf26b6 --- /dev/null +++ b/src/Access/AccessChangesNotifier.h @@ -0,0 +1,73 @@ +#pragma once + +#include +#include +#include +#include +#include + + +namespace DB +{ + +/// Helper class implementing subscriptions and notifications in access management. +class AccessChangesNotifier +{ +public: + AccessChangesNotifier(); + ~AccessChangesNotifier(); + + using OnChangedHandler + = std::function; + + /// Subscribes for all changes. + /// Can return nullptr if cannot subscribe (identifier not found) or if it doesn't make sense (the storage is read-only). + scope_guard subscribeForChanges(AccessEntityType type, const OnChangedHandler & handler); + + template + scope_guard subscribeForChanges(OnChangedHandler handler) + { + return subscribeForChanges(EntityClassT::TYPE, handler); + } + + /// Subscribes for changes of a specific entry. + /// Can return nullptr if cannot subscribe (identifier not found) or if it doesn't make sense (the storage is read-only). + scope_guard subscribeForChanges(const UUID & id, const OnChangedHandler & handler); + scope_guard subscribeForChanges(const std::vector & ids, const OnChangedHandler & handler); + + /// Called by access storages after a new access entity has been added. + void onEntityAdded(const UUID & id, const AccessEntityPtr & new_entity); + + /// Called by access storages after an access entity has been changed. + void onEntityUpdated(const UUID & id, const AccessEntityPtr & changed_entity); + + /// Called by access storages after an access entity has been removed. + void onEntityRemoved(const UUID & id, AccessEntityType type); + + /// Sends notifications to subscribers about changes in access entities + /// (added with previous calls onEntityAdded(), onEntityUpdated(), onEntityRemoved()). + void sendNotifications(); + +private: + struct Handlers + { + std::unordered_map> by_id; + std::list by_type[static_cast(AccessEntityType::MAX)]; + std::mutex mutex; + }; + + /// shared_ptr is here for safety because AccessChangesNotifier can be destroyed before all subscriptions are removed. 
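// The comment above describes the lifetime trick used throughout this patch:
// the guard returned to the subscriber captures the shared_ptr to the handler
// list rather than `this`, so unsubscribing after the notifier has been
// destroyed is still safe. A minimal sketch of the pattern (hypothetical,
// simplified types; the real code returns scope_guard):

#include <functional>
#include <list>
#include <memory>
#include <mutex>

struct Handlers
{
    std::list<std::function<void()>> list;
    std::mutex mutex;
};

class Notifier
{
public:
    Notifier() : handlers(std::make_shared<Handlers>()) {}

    std::function<void()> subscribe(std::function<void()> handler)
    {
        std::lock_guard lock{handlers->mutex};
        handlers->list.push_back(std::move(handler));
        auto it = std::prev(handlers->list.end());

        // The returned callable owns a copy of the shared_ptr, so the handler
        // list outlives the Notifier if any subscription guard is still alive.
        return [handlers = handlers, it]
        {
            std::lock_guard lock2{handlers->mutex};
            handlers->list.erase(it);
        };
    }

private:
    std::shared_ptr<Handlers> handlers;
};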
+ std::shared_ptr handlers; + + struct Event + { + UUID id; + AccessEntityPtr entity; + AccessEntityType type; + }; + std::queue queue; + std::mutex queue_mutex; + std::mutex sending_notifications; +}; + +} diff --git a/src/Access/AccessControl.cpp b/src/Access/AccessControl.cpp index d74695e645e..5cf283ba803 100644 --- a/src/Access/AccessControl.cpp +++ b/src/Access/AccessControl.cpp @@ -14,9 +14,10 @@ #include #include #include +#include #include #include -#include +#include #include #include #include @@ -82,7 +83,7 @@ public: private: const AccessControl & access_control; - Poco::ExpireCache> cache; + Poco::AccessExpireCache> cache; std::mutex mutex; }; @@ -142,7 +143,8 @@ AccessControl::AccessControl() quota_cache(std::make_unique(*this)), settings_profiles_cache(std::make_unique(*this)), external_authenticators(std::make_unique()), - custom_settings_prefixes(std::make_unique()) + custom_settings_prefixes(std::make_unique()), + changes_notifier(std::make_unique()) { } @@ -231,35 +233,6 @@ void AccessControl::addUsersConfigStorage( LOG_DEBUG(getLogger(), "Added {} access storage '{}', path: {}", String(new_storage->getStorageType()), new_storage->getStorageName(), new_storage->getPath()); } -void AccessControl::reloadUsersConfigs() -{ - auto storages = getStoragesPtr(); - for (const auto & storage : *storages) - { - if (auto users_config_storage = typeid_cast>(storage)) - users_config_storage->reload(); - } -} - -void AccessControl::startPeriodicReloadingUsersConfigs() -{ - auto storages = getStoragesPtr(); - for (const auto & storage : *storages) - { - if (auto users_config_storage = typeid_cast>(storage)) - users_config_storage->startPeriodicReloading(); - } -} - -void AccessControl::stopPeriodicReloadingUsersConfigs() -{ - auto storages = getStoragesPtr(); - for (const auto & storage : *storages) - { - if (auto users_config_storage = typeid_cast>(storage)) - users_config_storage->stopPeriodicReloading(); - } -} void AccessControl::addReplicatedStorage( const String & storage_name_, @@ -272,10 +245,9 @@ void AccessControl::addReplicatedStorage( if (auto replicated_storage = typeid_cast>(storage)) return; } - auto new_storage = std::make_shared(storage_name_, zookeeper_path_, get_zookeeper_function_); + auto new_storage = std::make_shared(storage_name_, zookeeper_path_, get_zookeeper_function_, *changes_notifier); addStorage(new_storage); LOG_DEBUG(getLogger(), "Added {} access storage '{}'", String(new_storage->getStorageType()), new_storage->getStorageName()); - new_storage->startup(); } void AccessControl::addDiskStorage(const String & directory_, bool readonly_) @@ -298,7 +270,7 @@ void AccessControl::addDiskStorage(const String & storage_name_, const String & } } } - auto new_storage = std::make_shared(storage_name_, directory_, readonly_); + auto new_storage = std::make_shared(storage_name_, directory_, readonly_, *changes_notifier); addStorage(new_storage); LOG_DEBUG(getLogger(), "Added {} access storage '{}', path: {}", String(new_storage->getStorageType()), new_storage->getStorageName(), new_storage->getPath()); } @@ -312,7 +284,7 @@ void AccessControl::addMemoryStorage(const String & storage_name_) if (auto memory_storage = typeid_cast>(storage)) return; } - auto new_storage = std::make_shared(storage_name_); + auto new_storage = std::make_shared(storage_name_, *changes_notifier); addStorage(new_storage); LOG_DEBUG(getLogger(), "Added {} access storage '{}'", String(new_storage->getStorageType()), new_storage->getStorageName()); } @@ -320,7 +292,7 @@ void 
AccessControl::addMemoryStorage(const String & storage_name_) void AccessControl::addLDAPStorage(const String & storage_name_, const Poco::Util::AbstractConfiguration & config_, const String & prefix_) { - auto new_storage = std::make_shared(storage_name_, this, config_, prefix_); + auto new_storage = std::make_shared(storage_name_, *this, config_, prefix_); addStorage(new_storage); LOG_DEBUG(getLogger(), "Added {} access storage '{}', LDAP server name: {}", String(new_storage->getStorageType()), new_storage->getStorageName(), new_storage->getLDAPServerName()); } @@ -423,6 +395,57 @@ void AccessControl::addStoragesFromMainConfig( } +void AccessControl::reload() +{ + MultipleAccessStorage::reload(); + changes_notifier->sendNotifications(); +} + +scope_guard AccessControl::subscribeForChanges(AccessEntityType type, const OnChangedHandler & handler) const +{ + return changes_notifier->subscribeForChanges(type, handler); +} + +scope_guard AccessControl::subscribeForChanges(const UUID & id, const OnChangedHandler & handler) const +{ + return changes_notifier->subscribeForChanges(id, handler); +} + +scope_guard AccessControl::subscribeForChanges(const std::vector & ids, const OnChangedHandler & handler) const +{ + return changes_notifier->subscribeForChanges(ids, handler); +} + +std::optional AccessControl::insertImpl(const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists) +{ + auto id = MultipleAccessStorage::insertImpl(entity, replace_if_exists, throw_if_exists); + if (id) + changes_notifier->sendNotifications(); + return id; +} + +bool AccessControl::removeImpl(const UUID & id, bool throw_if_not_exists) +{ + bool removed = MultipleAccessStorage::removeImpl(id, throw_if_not_exists); + if (removed) + changes_notifier->sendNotifications(); + return removed; +} + +bool AccessControl::updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) +{ + bool updated = MultipleAccessStorage::updateImpl(id, update_func, throw_if_not_exists); + if (updated) + changes_notifier->sendNotifications(); + return updated; +} + +AccessChangesNotifier & AccessControl::getChangesNotifier() +{ + return *changes_notifier; +} + + UUID AccessControl::authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address) const { try diff --git a/src/Access/AccessControl.h b/src/Access/AccessControl.h index 4ee29aa20c7..cbc71241316 100644 --- a/src/Access/AccessControl.h +++ b/src/Access/AccessControl.h @@ -3,8 +3,8 @@ #include #include #include +#include #include -#include #include @@ -40,6 +40,7 @@ class SettingsProfilesCache; class SettingsProfileElements; class ClientInfo; class ExternalAuthenticators; +class AccessChangesNotifier; struct Settings; @@ -50,6 +51,7 @@ public: AccessControl(); ~AccessControl() override; + /// Initializes access storage (user directories). void setUpFromMainConfig(const Poco::Util::AbstractConfiguration & config_, const String & config_path_, const zkutil::GetZooKeeper & get_zookeeper_function_); @@ -74,9 +76,6 @@ public: const String & preprocessed_dir_, const zkutil::GetZooKeeper & get_zookeeper_function_ = {}); - void reloadUsersConfigs(); - void startPeriodicReloadingUsersConfigs(); - void stopPeriodicReloadingUsersConfigs(); /// Loads access entities from the directory on the local disk. /// Use that directory to keep created users/roles/etc. 
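// The insertImpl/removeImpl/updateImpl overrides above all follow one shape:
// perform the mutation (which only enqueues events inside the storages), then
// drain the queue once all storage locks are released. A reduced sketch of
// that shape (hypothetical, simplified types; the real base class is
// MultipleAccessStorage):

#include <optional>

struct Entity { int value = 0; };

class Base
{
public:
    virtual ~Base() = default;
    virtual std::optional<int> insert(const Entity &) { return 1; }  // enqueues an "added" event in the real code
};

class NotifyingFacade : public Base
{
public:
    std::optional<int> insert(const Entity & entity) override
    {
        auto id = Base::insert(entity);    // mutation runs under the storages' own locks
        if (id)
            sendNotifications();           // handlers run here, with no storage lock held
        return id;
    }

private:
    void sendNotifications() { /* drain queued events, call subscribers */ }
};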
void addDiskStorage(const String & directory_, bool readonly_ = false); @@ -106,6 +105,26 @@ public: const String & config_path, const zkutil::GetZooKeeper & get_zookeeper_function); + /// Reloads and updates entities in this storage. This function is used to implement SYSTEM RELOAD CONFIG. + void reload() override; + + using OnChangedHandler = std::function; + + /// Subscribes for all changes. + /// Can return nullptr if cannot subscribe (identifier not found) or if it doesn't make sense (the storage is read-only). + scope_guard subscribeForChanges(AccessEntityType type, const OnChangedHandler & handler) const; + + template + scope_guard subscribeForChanges(OnChangedHandler handler) const { return subscribeForChanges(EntityClassT::TYPE, handler); } + + /// Subscribes for changes of a specific entry. + /// Can return nullptr if cannot subscribe (identifier not found) or if it doesn't make sense (the storage is read-only). + scope_guard subscribeForChanges(const UUID & id, const OnChangedHandler & handler) const; + scope_guard subscribeForChanges(const std::vector & ids, const OnChangedHandler & handler) const; + + UUID authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address) const; + void setExternalAuthenticatorsConfig(const Poco::Util::AbstractConfiguration & config); + /// Sets the default profile's name. /// The default profile's settings are always applied before any other profile's. void setDefaultProfileName(const String & default_profile_name); @@ -135,9 +154,6 @@ public: void setOnClusterQueriesRequireClusterGrant(bool enable) { on_cluster_queries_require_cluster_grant = enable; } bool doesOnClusterQueriesRequireClusterGrant() const { return on_cluster_queries_require_cluster_grant; } - UUID authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address) const; - void setExternalAuthenticatorsConfig(const Poco::Util::AbstractConfiguration & config); - std::shared_ptr getContextAccess( const UUID & user_id, const std::vector & current_roles, @@ -178,10 +194,17 @@ public: const ExternalAuthenticators & getExternalAuthenticators() const; + /// Gets manager of notifications. 
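// Usage sketch for the subscription API declared above. The handler signature
// matches the lambda passed in LDAPAccessStorage further below; receiving a
// null entity means the entity was removed. (Illustrative function, not part
// of the patch.)

void watchUsers(AccessControl & access_control)
{
    scope_guard subscription = access_control.subscribeForChanges(
        AccessEntityType::USER,
        [](const UUID & /* id */, const AccessEntityPtr & entity)
        {
            // `entity` is the new state of the changed user; nullptr on removal.
        });

    // ... while `subscription` is alive the handler keeps firing; letting the
    // scope_guard go out of scope removes the handler from the notifier.
}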
+ AccessChangesNotifier & getChangesNotifier(); + private: class ContextAccessCache; class CustomSettingsPrefixes; + std::optional insertImpl(const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists) override; + bool removeImpl(const UUID & id, bool throw_if_not_exists) override; + bool updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) override; + std::unique_ptr context_access_cache; std::unique_ptr role_cache; std::unique_ptr row_policy_cache; @@ -189,6 +212,7 @@ private: std::unique_ptr settings_profiles_cache; std::unique_ptr external_authenticators; std::unique_ptr custom_settings_prefixes; + std::unique_ptr changes_notifier; std::atomic_bool allow_plaintext_password = true; std::atomic_bool allow_no_password = true; std::atomic_bool users_without_row_policies_can_read_rows = false; diff --git a/src/Access/ContextAccess.cpp b/src/Access/ContextAccess.cpp index 28926310c20..46fdba9d65e 100644 --- a/src/Access/ContextAccess.cpp +++ b/src/Access/ContextAccess.cpp @@ -149,6 +149,21 @@ ContextAccess::ContextAccess(const AccessControl & access_control_, const Params } +ContextAccess::~ContextAccess() +{ + enabled_settings.reset(); + enabled_quota.reset(); + enabled_row_policies.reset(); + access_with_implicit.reset(); + access.reset(); + roles_info.reset(); + subscription_for_roles_changes.reset(); + enabled_roles.reset(); + subscription_for_user_change.reset(); + user.reset(); +} + + void ContextAccess::initialize() { std::lock_guard lock{mutex}; diff --git a/src/Access/ContextAccess.h b/src/Access/ContextAccess.h index 5742b6a3222..f1c215a4029 100644 --- a/src/Access/ContextAccess.h +++ b/src/Access/ContextAccess.h @@ -155,6 +155,8 @@ public: /// without any limitations. This is used for the global context. static std::shared_ptr getFullAccess(); + ~ContextAccess(); + private: friend class AccessControl; ContextAccess() {} /// NOLINT diff --git a/src/Access/DiskAccessStorage.cpp b/src/Access/DiskAccessStorage.cpp index 95d58f9da87..57e09d40b35 100644 --- a/src/Access/DiskAccessStorage.cpp +++ b/src/Access/DiskAccessStorage.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -164,13 +165,8 @@ namespace } -DiskAccessStorage::DiskAccessStorage(const String & directory_path_, bool readonly_) - : DiskAccessStorage(STORAGE_TYPE, directory_path_, readonly_) -{ -} - -DiskAccessStorage::DiskAccessStorage(const String & storage_name_, const String & directory_path_, bool readonly_) - : IAccessStorage(storage_name_) +DiskAccessStorage::DiskAccessStorage(const String & storage_name_, const String & directory_path_, bool readonly_, AccessChangesNotifier & changes_notifier_) + : IAccessStorage(storage_name_), changes_notifier(changes_notifier_) { directory_path = makeDirectoryPathCanonical(directory_path_); readonly = readonly_; @@ -199,7 +195,15 @@ DiskAccessStorage::DiskAccessStorage(const String & storage_name_, const String DiskAccessStorage::~DiskAccessStorage() { stopListsWritingThread(); - writeLists(); + + try + { + writeLists(); + } + catch (...) 
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + } } @@ -470,19 +474,16 @@ std::optional DiskAccessStorage::readNameImpl(const UUID & id, bool thro std::optional DiskAccessStorage::insertImpl(const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists) { - Notifications notifications; - SCOPE_EXIT({ notify(notifications); }); - UUID id = generateRandomID(); std::lock_guard lock{mutex}; - if (insertNoLock(id, new_entity, replace_if_exists, throw_if_exists, notifications)) + if (insertNoLock(id, new_entity, replace_if_exists, throw_if_exists)) return id; return std::nullopt; } -bool DiskAccessStorage::insertNoLock(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists, Notifications & notifications) +bool DiskAccessStorage::insertNoLock(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists) { const String & name = new_entity->getName(); AccessEntityType type = new_entity->getType(); @@ -514,7 +515,7 @@ bool DiskAccessStorage::insertNoLock(const UUID & id, const AccessEntityPtr & ne writeAccessEntityToDisk(id, *new_entity); if (name_collision && replace_if_exists) - removeNoLock(it_by_name->second->id, /* throw_if_not_exists = */ false, notifications); + removeNoLock(it_by_name->second->id, /* throw_if_not_exists = */ false); /// Do insertion. auto & entry = entries_by_id[id]; @@ -523,22 +524,20 @@ bool DiskAccessStorage::insertNoLock(const UUID & id, const AccessEntityPtr & ne entry.name = name; entry.entity = new_entity; entries_by_name[entry.name] = &entry; - prepareNotifications(id, entry, false, notifications); + + changes_notifier.onEntityAdded(id, new_entity); return true; } bool DiskAccessStorage::removeImpl(const UUID & id, bool throw_if_not_exists) { - Notifications notifications; - SCOPE_EXIT({ notify(notifications); }); - std::lock_guard lock{mutex}; - return removeNoLock(id, throw_if_not_exists, notifications); + return removeNoLock(id, throw_if_not_exists); } -bool DiskAccessStorage::removeNoLock(const UUID & id, bool throw_if_not_exists, Notifications & notifications) +bool DiskAccessStorage::removeNoLock(const UUID & id, bool throw_if_not_exists) { auto it = entries_by_id.find(id); if (it == entries_by_id.end()) @@ -559,25 +558,24 @@ bool DiskAccessStorage::removeNoLock(const UUID & id, bool throw_if_not_exists, deleteAccessEntityOnDisk(id); /// Do removing. 
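// The destructor change above follows the usual "never let a destructor throw"
// rule: writeLists() can fail on I/O, and an exception escaping a destructor
// during stack unwinding would terminate the server. The pattern in isolation
// (hypothetical class):

struct ListWriter
{
    ~ListWriter()
    {
        try
        {
            flush();  // stands in for writeLists(); may throw on I/O errors
        }
        catch (...)
        {
            // Log and swallow; propagating from a destructor risks std::terminate.
        }
    }

    void flush() { /* write pending data to disk */ }
};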
- prepareNotifications(id, entry, true, notifications); + UUID removed_id = id; auto & entries_by_name = entries_by_name_and_type[static_cast(type)]; entries_by_name.erase(entry.name); entries_by_id.erase(it); + + changes_notifier.onEntityRemoved(removed_id, type); return true; } bool DiskAccessStorage::updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) { - Notifications notifications; - SCOPE_EXIT({ notify(notifications); }); - std::lock_guard lock{mutex}; - return updateNoLock(id, update_func, throw_if_not_exists, notifications); + return updateNoLock(id, update_func, throw_if_not_exists); } -bool DiskAccessStorage::updateNoLock(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists, Notifications & notifications) +bool DiskAccessStorage::updateNoLock(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) { auto it = entries_by_id.find(id); if (it == entries_by_id.end()) @@ -626,7 +624,8 @@ bool DiskAccessStorage::updateNoLock(const UUID & id, const UpdateFunc & update_ entries_by_name[entry.name] = &entry; } - prepareNotifications(id, entry, false, notifications); + changes_notifier.onEntityUpdated(id, new_entity); + return true; } @@ -650,74 +649,4 @@ void DiskAccessStorage::deleteAccessEntityOnDisk(const UUID & id) const throw Exception("Couldn't delete " + file_path, ErrorCodes::FILE_DOESNT_EXIST); } - -void DiskAccessStorage::prepareNotifications(const UUID & id, const Entry & entry, bool remove, Notifications & notifications) const -{ - if (!remove && !entry.entity) - return; - - const AccessEntityPtr entity = remove ? nullptr : entry.entity; - for (const auto & handler : entry.handlers_by_id) - notifications.push_back({handler, id, entity}); - - for (const auto & handler : handlers_by_type[static_cast(entry.type)]) - notifications.push_back({handler, id, entity}); -} - - -scope_guard DiskAccessStorage::subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const -{ - std::lock_guard lock{mutex}; - auto it = entries_by_id.find(id); - if (it == entries_by_id.end()) - return {}; - const Entry & entry = it->second; - auto handler_it = entry.handlers_by_id.insert(entry.handlers_by_id.end(), handler); - - return [this, id, handler_it] - { - std::lock_guard lock2{mutex}; - auto it2 = entries_by_id.find(id); - if (it2 != entries_by_id.end()) - { - const Entry & entry2 = it2->second; - entry2.handlers_by_id.erase(handler_it); - } - }; -} - -scope_guard DiskAccessStorage::subscribeForChangesImpl(AccessEntityType type, const OnChangedHandler & handler) const -{ - std::lock_guard lock{mutex}; - auto & handlers = handlers_by_type[static_cast(type)]; - handlers.push_back(handler); - auto handler_it = std::prev(handlers.end()); - - return [this, type, handler_it] - { - std::lock_guard lock2{mutex}; - auto & handlers2 = handlers_by_type[static_cast(type)]; - handlers2.erase(handler_it); - }; -} - -bool DiskAccessStorage::hasSubscription(const UUID & id) const -{ - std::lock_guard lock{mutex}; - auto it = entries_by_id.find(id); - if (it != entries_by_id.end()) - { - const Entry & entry = it->second; - return !entry.handlers_by_id.empty(); - } - return false; -} - -bool DiskAccessStorage::hasSubscription(AccessEntityType type) const -{ - std::lock_guard lock{mutex}; - const auto & handlers = handlers_by_type[static_cast(type)]; - return !handlers.empty(); -} - } diff --git a/src/Access/DiskAccessStorage.h b/src/Access/DiskAccessStorage.h index 20390dabfa0..7784a80e779 100644 --- 
a/src/Access/DiskAccessStorage.h +++ b/src/Access/DiskAccessStorage.h @@ -7,14 +7,15 @@ namespace DB { +class AccessChangesNotifier; + /// Loads and saves access entities on a local disk to a specified directory. class DiskAccessStorage : public IAccessStorage { public: static constexpr char STORAGE_TYPE[] = "local directory"; - DiskAccessStorage(const String & storage_name_, const String & directory_path_, bool readonly_ = false); - DiskAccessStorage(const String & directory_path_, bool readonly_ = false); + DiskAccessStorage(const String & storage_name_, const String & directory_path_, bool readonly_, AccessChangesNotifier & changes_notifier_); ~DiskAccessStorage() override; const char * getStorageType() const override { return STORAGE_TYPE; } @@ -27,8 +28,6 @@ public: bool isReadOnly() const override { return readonly; } bool exists(const UUID & id) const override; - bool hasSubscription(const UUID & id) const override; - bool hasSubscription(AccessEntityType type) const override; private: std::optional findImpl(AccessEntityType type, const String & name) const override; @@ -38,8 +37,6 @@ private: std::optional insertImpl(const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists) override; bool removeImpl(const UUID & id, bool throw_if_not_exists) override; bool updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) override; - scope_guard subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const override; - scope_guard subscribeForChangesImpl(AccessEntityType type, const OnChangedHandler & handler) const override; void clear(); bool readLists(); @@ -50,9 +47,9 @@ private: void listsWritingThreadFunc(); void stopListsWritingThread(); - bool insertNoLock(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists, Notifications & notifications); - bool removeNoLock(const UUID & id, bool throw_if_not_exists, Notifications & notifications); - bool updateNoLock(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists, Notifications & notifications); + bool insertNoLock(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists); + bool removeNoLock(const UUID & id, bool throw_if_not_exists); + bool updateNoLock(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists); AccessEntityPtr readAccessEntityFromDisk(const UUID & id) const; void writeAccessEntityToDisk(const UUID & id, const IAccessEntity & entity) const; @@ -65,11 +62,8 @@ private: String name; AccessEntityType type; mutable AccessEntityPtr entity; /// may be nullptr, if the entity hasn't been loaded yet. - mutable std::list handlers_by_id; }; - void prepareNotifications(const UUID & id, const Entry & entry, bool remove, Notifications & notifications) const; - String directory_path; std::atomic readonly; std::unordered_map entries_by_id; @@ -79,7 +73,7 @@ private: ThreadFromGlobalPool lists_writing_thread; /// List files are written in a separate thread. std::condition_variable lists_writing_thread_should_exit; /// Signals `lists_writing_thread` to exit. 
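// The insert/remove/updateNoLock trio above follows a common locking idiom:
// the public entry point takes the mutex exactly once, and siblings that are
// already under the lock call the NoLock variant, which is what lets
// insertNoLock reuse removeNoLock for the replace_if_exists case. In
// miniature (hypothetical class):

#include <mutex>
#include <string>
#include <unordered_map>

class Registry
{
public:
    bool remove(int id)
    {
        std::lock_guard lock{mutex};
        return removeNoLock(id);
    }

    bool insertOrReplace(int id, std::string value)
    {
        std::lock_guard lock{mutex};
        removeNoLock(id);                       // safe: we already hold the mutex
        entries.emplace(id, std::move(value));
        return true;
    }

private:
    bool removeNoLock(int id) { return entries.erase(id) > 0; }  // caller must hold `mutex`

    std::unordered_map<int, std::string> entries;
    std::mutex mutex;
};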
bool lists_writing_thread_is_waiting = false; - mutable std::list handlers_by_type[static_cast(AccessEntityType::MAX)]; + AccessChangesNotifier & changes_notifier; mutable std::mutex mutex; }; } diff --git a/src/Access/EnabledRoles.cpp b/src/Access/EnabledRoles.cpp index 282c52a9544..456529da942 100644 --- a/src/Access/EnabledRoles.cpp +++ b/src/Access/EnabledRoles.cpp @@ -6,7 +6,7 @@ namespace DB { -EnabledRoles::EnabledRoles(const Params & params_) : params(params_) +EnabledRoles::EnabledRoles(const Params & params_) : params(params_), handlers(std::make_shared()) { } @@ -15,42 +15,50 @@ EnabledRoles::~EnabledRoles() = default; std::shared_ptr EnabledRoles::getRolesInfo() const { - std::lock_guard lock{mutex}; + std::lock_guard lock{info_mutex}; return info; } scope_guard EnabledRoles::subscribeForChanges(const OnChangeHandler & handler) const { - std::lock_guard lock{mutex}; - handlers.push_back(handler); - auto it = std::prev(handlers.end()); + std::lock_guard lock{handlers->mutex}; + handlers->list.push_back(handler); + auto it = std::prev(handlers->list.end()); - return [this, it] + return [handlers=handlers, it] { - std::lock_guard lock2{mutex}; - handlers.erase(it); + std::lock_guard lock2{handlers->mutex}; + handlers->list.erase(it); }; } -void EnabledRoles::setRolesInfo(const std::shared_ptr & info_, scope_guard & notifications) +void EnabledRoles::setRolesInfo(const std::shared_ptr & info_, scope_guard * notifications) { - std::lock_guard lock{mutex}; - - if (info && info_ && *info == *info_) - return; - - info = info_; - - std::vector handlers_to_notify; - boost::range::copy(handlers, std::back_inserter(handlers_to_notify)); - - notifications.join(scope_guard([info = info, handlers_to_notify = std::move(handlers_to_notify)] { - for (const auto & handler : handlers_to_notify) - handler(info); - })); + std::lock_guard lock{info_mutex}; + if (info && info_ && *info == *info_) + return; + + info = info_; + } + + if (notifications) + { + std::vector handlers_to_notify; + { + std::lock_guard lock{handlers->mutex}; + boost::range::copy(handlers->list, std::back_inserter(handlers_to_notify)); + } + + notifications->join(scope_guard( + [info = info, handlers_to_notify = std::move(handlers_to_notify)] + { + for (const auto & handler : handlers_to_notify) + handler(info); + })); + } } } diff --git a/src/Access/EnabledRoles.h b/src/Access/EnabledRoles.h index 28d1f9ea376..e0d773db343 100644 --- a/src/Access/EnabledRoles.h +++ b/src/Access/EnabledRoles.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -43,12 +44,21 @@ private: friend class RoleCache; explicit EnabledRoles(const Params & params_); - void setRolesInfo(const std::shared_ptr & info_, scope_guard & notifications); + void setRolesInfo(const std::shared_ptr & info_, scope_guard * notifications); const Params params; - mutable std::shared_ptr info; - mutable std::list handlers; - mutable std::mutex mutex; + + std::shared_ptr info; + mutable std::mutex info_mutex; + + struct Handlers + { + std::list list; + std::mutex mutex; + }; + + /// shared_ptr is here for safety because EnabledRoles can be destroyed before all subscriptions are removed. 
+ std::shared_ptr handlers; }; } diff --git a/src/Access/IAccessStorage.cpp b/src/Access/IAccessStorage.cpp index 8c53216c638..6b04355099d 100644 --- a/src/Access/IAccessStorage.cpp +++ b/src/Access/IAccessStorage.cpp @@ -410,34 +410,6 @@ bool IAccessStorage::updateImpl(const UUID & id, const UpdateFunc &, bool throw_ } -scope_guard IAccessStorage::subscribeForChanges(AccessEntityType type, const OnChangedHandler & handler) const -{ - return subscribeForChangesImpl(type, handler); -} - - -scope_guard IAccessStorage::subscribeForChanges(const UUID & id, const OnChangedHandler & handler) const -{ - return subscribeForChangesImpl(id, handler); -} - - -scope_guard IAccessStorage::subscribeForChanges(const std::vector & ids, const OnChangedHandler & handler) const -{ - scope_guard subscriptions; - for (const auto & id : ids) - subscriptions.join(subscribeForChangesImpl(id, handler)); - return subscriptions; -} - - -void IAccessStorage::notify(const Notifications & notifications) -{ - for (const auto & [fn, id, new_entity] : notifications) - fn(id, new_entity); -} - - UUID IAccessStorage::authenticate( const Credentials & credentials, const Poco::Net::IPAddress & address, diff --git a/src/Access/IAccessStorage.h b/src/Access/IAccessStorage.h index 428a0e8f052..5de20cad286 100644 --- a/src/Access/IAccessStorage.h +++ b/src/Access/IAccessStorage.h @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include @@ -22,7 +21,7 @@ enum class AuthenticationType; /// Contains entities, i.e. instances of classes derived from IAccessEntity. /// The implementations of this class MUST be thread-safe. -class IAccessStorage +class IAccessStorage : public boost::noncopyable { public: explicit IAccessStorage(const String & storage_name_) : storage_name(storage_name_) {} @@ -41,6 +40,15 @@ public: /// Returns true if this entity is readonly. virtual bool isReadOnly(const UUID &) const { return isReadOnly(); } + /// Reloads and updates entities in this storage. This function is used to implement SYSTEM RELOAD CONFIG. + virtual void reload() {} + + /// Starts periodic reloading and update of entities in this storage. + virtual void startPeriodicReloading() {} + + /// Stops periodic reloading and update of entities in this storage. + virtual void stopPeriodicReloading() {} + /// Returns the identifiers of all the entities of a specified type contained in the storage. std::vector findAll(AccessEntityType type) const; @@ -130,23 +138,6 @@ public: /// Updates multiple entities in the storage. Returns the list of successfully updated. std::vector tryUpdate(const std::vector & ids, const UpdateFunc & update_func); - using OnChangedHandler = std::function; - - /// Subscribes for all changes. - /// Can return nullptr if cannot subscribe (identifier not found) or if it doesn't make sense (the storage is read-only). - scope_guard subscribeForChanges(AccessEntityType type, const OnChangedHandler & handler) const; - - template - scope_guard subscribeForChanges(OnChangedHandler handler) const { return subscribeForChanges(EntityClassT::TYPE, handler); } - - /// Subscribes for changes of a specific entry. - /// Can return nullptr if cannot subscribe (identifier not found) or if it doesn't make sense (the storage is read-only). 
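// Two small idioms land in IAccessStorage here: the base class becomes
// noncopyable, and reload()/startPeriodicReloading()/stopPeriodicReloading()
// get empty virtual default bodies, so only storages that actually reload
// anything (e.g. the users.xml storage) have to override them. Sketched
// without the boost dependency (hypothetical type):

struct StorageBase
{
    StorageBase() = default;
    StorageBase(const StorageBase &) = delete;             // what boost::noncopyable provides
    StorageBase & operator=(const StorageBase &) = delete;
    virtual ~StorageBase() = default;

    virtual void reload() {}                   // default: nothing to reload
    virtual void startPeriodicReloading() {}   // overridden only where periodic reloading makes sense
    virtual void stopPeriodicReloading() {}
};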
- scope_guard subscribeForChanges(const UUID & id, const OnChangedHandler & handler) const; - scope_guard subscribeForChanges(const std::vector & ids, const OnChangedHandler & handler) const; - - virtual bool hasSubscription(AccessEntityType type) const = 0; - virtual bool hasSubscription(const UUID & id) const = 0; - /// Finds a user, check the provided credentials and returns the ID of the user if they are valid. /// Throws an exception if no such user or credentials are invalid. UUID authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool allow_no_password, bool allow_plaintext_password) const; @@ -160,8 +151,6 @@ protected: virtual std::optional insertImpl(const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists); virtual bool removeImpl(const UUID & id, bool throw_if_not_exists); virtual bool updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists); - virtual scope_guard subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const = 0; - virtual scope_guard subscribeForChangesImpl(AccessEntityType type, const OnChangedHandler & handler) const = 0; virtual std::optional authenticateImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool throw_if_user_not_exists, bool allow_no_password, bool allow_plaintext_password) const; virtual bool areCredentialsValid(const User & user, const Credentials & credentials, const ExternalAuthenticators & external_authenticators) const; virtual bool isAddressAllowed(const User & user, const Poco::Net::IPAddress & address) const; @@ -181,9 +170,6 @@ protected: [[noreturn]] static void throwAddressNotAllowed(const Poco::Net::IPAddress & address); [[noreturn]] static void throwInvalidCredentials(); [[noreturn]] static void throwAuthenticationTypeNotAllowed(AuthenticationType auth_type); - using Notification = std::tuple; - using Notifications = std::vector; - static void notify(const Notifications & notifications); private: const String storage_name; diff --git a/src/Access/LDAPAccessStorage.cpp b/src/Access/LDAPAccessStorage.cpp index 0fe9e6a1605..480d0050e2a 100644 --- a/src/Access/LDAPAccessStorage.cpp +++ b/src/Access/LDAPAccessStorage.cpp @@ -27,10 +27,10 @@ namespace ErrorCodes } -LDAPAccessStorage::LDAPAccessStorage(const String & storage_name_, AccessControl * access_control_, const Poco::Util::AbstractConfiguration & config, const String & prefix) - : IAccessStorage(storage_name_) +LDAPAccessStorage::LDAPAccessStorage(const String & storage_name_, AccessControl & access_control_, const Poco::Util::AbstractConfiguration & config, const String & prefix) + : IAccessStorage(storage_name_), access_control(access_control_), memory_storage(storage_name_, access_control.getChangesNotifier()) { - setConfiguration(access_control_, config, prefix); + setConfiguration(config, prefix); } @@ -40,7 +40,7 @@ String LDAPAccessStorage::getLDAPServerName() const } -void LDAPAccessStorage::setConfiguration(AccessControl * access_control_, const Poco::Util::AbstractConfiguration & config, const String & prefix) +void LDAPAccessStorage::setConfiguration(const Poco::Util::AbstractConfiguration & config, const String & prefix) { std::scoped_lock lock(mutex); @@ -80,7 +80,6 @@ void LDAPAccessStorage::setConfiguration(AccessControl * access_control_, const } } - access_control = access_control_; ldap_server_name = ldap_server_name_cfg; 
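// LDAPAccessStorage now receives AccessControl by reference instead of by
// pointer: the dependency is mandatory, so storing a reference removes the
// null state and the late wiring through setConfiguration(). The shape of the
// change (hypothetical types):

struct Control
{
    void doWork() {}
};

class DependentStorage
{
public:
    explicit DependentStorage(Control & control_) : control(control_) {}

    void use() { control.doWork(); }   // before: if (control) control->doWork();

private:
    Control & control;                 // bound at construction, never null
};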
role_search_params.swap(role_search_params_cfg); common_role_names.swap(common_roles_cfg); @@ -91,7 +90,7 @@ void LDAPAccessStorage::setConfiguration(AccessControl * access_control_, const granted_role_names.clear(); granted_role_ids.clear(); - role_change_subscription = access_control->subscribeForChanges( + role_change_subscription = access_control.subscribeForChanges( [this] (const UUID & id, const AccessEntityPtr & entity) { return this->processRoleChange(id, entity); @@ -215,7 +214,7 @@ void LDAPAccessStorage::assignRolesNoLock(User & user, const LDAPClient::SearchR auto it = granted_role_ids.find(role_name); if (it == granted_role_ids.end()) { - if (const auto role_id = access_control->find(role_name)) + if (const auto role_id = access_control.find(role_name)) { granted_role_names.insert_or_assign(*role_id, role_name); it = granted_role_ids.insert_or_assign(role_name, *role_id).first; @@ -450,33 +449,6 @@ std::optional LDAPAccessStorage::readNameImpl(const UUID & id, bool thro } -scope_guard LDAPAccessStorage::subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const -{ - std::scoped_lock lock(mutex); - return memory_storage.subscribeForChanges(id, handler); -} - - -scope_guard LDAPAccessStorage::subscribeForChangesImpl(AccessEntityType type, const OnChangedHandler & handler) const -{ - std::scoped_lock lock(mutex); - return memory_storage.subscribeForChanges(type, handler); -} - - -bool LDAPAccessStorage::hasSubscription(const UUID & id) const -{ - std::scoped_lock lock(mutex); - return memory_storage.hasSubscription(id); -} - - -bool LDAPAccessStorage::hasSubscription(AccessEntityType type) const -{ - std::scoped_lock lock(mutex); - return memory_storage.hasSubscription(type); -} - std::optional LDAPAccessStorage::authenticateImpl( const Credentials & credentials, const Poco::Net::IPAddress & address, diff --git a/src/Access/LDAPAccessStorage.h b/src/Access/LDAPAccessStorage.h index a86c2fcd35c..df13eff179b 100644 --- a/src/Access/LDAPAccessStorage.h +++ b/src/Access/LDAPAccessStorage.h @@ -32,7 +32,7 @@ class LDAPAccessStorage : public IAccessStorage public: static constexpr char STORAGE_TYPE[] = "ldap"; - explicit LDAPAccessStorage(const String & storage_name_, AccessControl * access_control_, const Poco::Util::AbstractConfiguration & config, const String & prefix); + explicit LDAPAccessStorage(const String & storage_name_, AccessControl & access_control_, const Poco::Util::AbstractConfiguration & config, const String & prefix); virtual ~LDAPAccessStorage() override = default; String getLDAPServerName() const; @@ -42,19 +42,15 @@ public: virtual String getStorageParamsJSON() const override; virtual bool isReadOnly() const override { return true; } virtual bool exists(const UUID & id) const override; - virtual bool hasSubscription(const UUID & id) const override; - virtual bool hasSubscription(AccessEntityType type) const override; private: // IAccessStorage implementations. 
virtual std::optional findImpl(AccessEntityType type, const String & name) const override; virtual std::vector findAllImpl(AccessEntityType type) const override; virtual AccessEntityPtr readImpl(const UUID & id, bool throw_if_not_exists) const override; virtual std::optional readNameImpl(const UUID & id, bool throw_if_not_exists) const override; - virtual scope_guard subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const override; - virtual scope_guard subscribeForChangesImpl(AccessEntityType type, const OnChangedHandler & handler) const override; virtual std::optional authenticateImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool throw_if_user_not_exists, bool allow_no_password, bool allow_plaintext_password) const override; - void setConfiguration(AccessControl * access_control_, const Poco::Util::AbstractConfiguration & config, const String & prefix); + void setConfiguration(const Poco::Util::AbstractConfiguration & config, const String & prefix); void processRoleChange(const UUID & id, const AccessEntityPtr & entity); void applyRoleChangeNoLock(bool grant, const UUID & role_id, const String & role_name); @@ -66,7 +62,7 @@ private: // IAccessStorage implementations. const ExternalAuthenticators & external_authenticators, LDAPClient::SearchResultsList & role_search_results) const; mutable std::recursive_mutex mutex; - AccessControl * access_control = nullptr; + AccessControl & access_control; String ldap_server_name; LDAPClient::RoleSearchParamsList role_search_params; std::set common_role_names; // role name that should be granted to all users at all times diff --git a/src/Access/MemoryAccessStorage.cpp b/src/Access/MemoryAccessStorage.cpp index 6aa0688ee3e..9ed80f4a64d 100644 --- a/src/Access/MemoryAccessStorage.cpp +++ b/src/Access/MemoryAccessStorage.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -7,8 +8,8 @@ namespace DB { -MemoryAccessStorage::MemoryAccessStorage(const String & storage_name_) - : IAccessStorage(storage_name_) +MemoryAccessStorage::MemoryAccessStorage(const String & storage_name_, AccessChangesNotifier & changes_notifier_) + : IAccessStorage(storage_name_), changes_notifier(changes_notifier_) { } @@ -63,19 +64,16 @@ AccessEntityPtr MemoryAccessStorage::readImpl(const UUID & id, bool throw_if_not std::optional MemoryAccessStorage::insertImpl(const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists) { - Notifications notifications; - SCOPE_EXIT({ notify(notifications); }); - UUID id = generateRandomID(); std::lock_guard lock{mutex}; - if (insertNoLock(id, new_entity, replace_if_exists, throw_if_exists, notifications)) + if (insertNoLock(id, new_entity, replace_if_exists, throw_if_exists)) return id; return std::nullopt; } -bool MemoryAccessStorage::insertNoLock(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists, Notifications & notifications) +bool MemoryAccessStorage::insertNoLock(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists) { const String & name = new_entity->getName(); AccessEntityType type = new_entity->getType(); @@ -103,7 +101,7 @@ bool MemoryAccessStorage::insertNoLock(const UUID & id, const AccessEntityPtr & if (name_collision && replace_if_exists) { const auto & existing_entry = *(it_by_name->second); - removeNoLock(existing_entry.id, /* throw_if_not_exists = */ false, notifications); + 
removeNoLock(existing_entry.id, /* throw_if_not_exists = */ false); } /// Do insertion. @@ -111,22 +109,19 @@ bool MemoryAccessStorage::insertNoLock(const UUID & id, const AccessEntityPtr & entry.id = id; entry.entity = new_entity; entries_by_name[name] = &entry; - prepareNotifications(entry, false, notifications); + changes_notifier.onEntityAdded(id, new_entity); return true; } bool MemoryAccessStorage::removeImpl(const UUID & id, bool throw_if_not_exists) { - Notifications notifications; - SCOPE_EXIT({ notify(notifications); }); - std::lock_guard lock{mutex}; - return removeNoLock(id, throw_if_not_exists, notifications); + return removeNoLock(id, throw_if_not_exists); } -bool MemoryAccessStorage::removeNoLock(const UUID & id, bool throw_if_not_exists, Notifications & notifications) +bool MemoryAccessStorage::removeNoLock(const UUID & id, bool throw_if_not_exists) { auto it = entries_by_id.find(id); if (it == entries_by_id.end()) @@ -141,27 +136,25 @@ bool MemoryAccessStorage::removeNoLock(const UUID & id, bool throw_if_not_exists const String & name = entry.entity->getName(); AccessEntityType type = entry.entity->getType(); - prepareNotifications(entry, true, notifications); - /// Do removing. + UUID removed_id = id; auto & entries_by_name = entries_by_name_and_type[static_cast(type)]; entries_by_name.erase(name); entries_by_id.erase(it); + + changes_notifier.onEntityRemoved(removed_id, type); return true; } bool MemoryAccessStorage::updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) { - Notifications notifications; - SCOPE_EXIT({ notify(notifications); }); - std::lock_guard lock{mutex}; - return updateNoLock(id, update_func, throw_if_not_exists, notifications); + return updateNoLock(id, update_func, throw_if_not_exists); } -bool MemoryAccessStorage::updateNoLock(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists, Notifications & notifications) +bool MemoryAccessStorage::updateNoLock(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) { auto it = entries_by_id.find(id); if (it == entries_by_id.end()) @@ -195,7 +188,7 @@ bool MemoryAccessStorage::updateNoLock(const UUID & id, const UpdateFunc & updat entries_by_name[new_entity->getName()] = &entry; } - prepareNotifications(entry, false, notifications); + changes_notifier.onEntityUpdated(id, new_entity); return true; } @@ -212,16 +205,8 @@ void MemoryAccessStorage::setAll(const std::vector & all_entiti void MemoryAccessStorage::setAll(const std::vector> & all_entities) { - Notifications notifications; - SCOPE_EXIT({ notify(notifications); }); - std::lock_guard lock{mutex}; - setAllNoLock(all_entities, notifications); -} - -void MemoryAccessStorage::setAllNoLock(const std::vector> & all_entities, Notifications & notifications) -{ boost::container::flat_set not_used_ids; std::vector conflicting_ids; @@ -256,7 +241,7 @@ void MemoryAccessStorage::setAllNoLock(const std::vector ids_to_remove = std::move(not_used_ids); boost::range::copy(conflicting_ids, std::inserter(ids_to_remove, ids_to_remove.end())); for (const auto & id : ids_to_remove) - removeNoLock(id, /* throw_if_not_exists = */ false, notifications); + removeNoLock(id, /* throw_if_not_exists = */ false); /// Insert or update entities. 
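// setAll() above reconciles the in-memory state against a full snapshot:
// first drop every id that is absent from (or conflicts with) the snapshot,
// then insert or update the rest — all under one lock, with every step routed
// through the same NoLock primitives so the notifier sees each change.
// Reduced sketch (hypothetical types), covering the loop that follows:

#include <iterator>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

using Snapshot = std::vector<std::pair<int, std::string>>;

void setAll(std::unordered_map<int, std::string> & current, const Snapshot & snapshot)
{
    std::unordered_set<int> keep;
    for (const auto & [id, value] : snapshot)
        keep.insert(id);

    for (auto it = current.begin(); it != current.end();)   // remove ids not in the snapshot
        it = keep.contains(it->first) ? std::next(it) : current.erase(it);

    for (const auto & [id, value] : snapshot)               // insert or update everything else
        current[id] = value;
}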
for (const auto & [id, entity] : all_entities) @@ -269,84 +254,14 @@ void MemoryAccessStorage::setAllNoLock(const std::vector(entry.entity->getType())]) - notifications.push_back({handler, entry.id, entity}); -} - - -scope_guard MemoryAccessStorage::subscribeForChangesImpl(AccessEntityType type, const OnChangedHandler & handler) const -{ - std::lock_guard lock{mutex}; - auto & handlers = handlers_by_type[static_cast(type)]; - handlers.push_back(handler); - auto handler_it = std::prev(handlers.end()); - - return [this, type, handler_it] - { - std::lock_guard lock2{mutex}; - auto & handlers2 = handlers_by_type[static_cast(type)]; - handlers2.erase(handler_it); - }; -} - - -scope_guard MemoryAccessStorage::subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const -{ - std::lock_guard lock{mutex}; - auto it = entries_by_id.find(id); - if (it == entries_by_id.end()) - return {}; - const Entry & entry = it->second; - auto handler_it = entry.handlers_by_id.insert(entry.handlers_by_id.end(), handler); - - return [this, id, handler_it] - { - std::lock_guard lock2{mutex}; - auto it2 = entries_by_id.find(id); - if (it2 != entries_by_id.end()) - { - const Entry & entry2 = it2->second; - entry2.handlers_by_id.erase(handler_it); - } - }; -} - - -bool MemoryAccessStorage::hasSubscription(const UUID & id) const -{ - std::lock_guard lock{mutex}; - auto it = entries_by_id.find(id); - if (it != entries_by_id.end()) - { - const Entry & entry = it->second; - return !entry.handlers_by_id.empty(); - } - return false; -} - - -bool MemoryAccessStorage::hasSubscription(AccessEntityType type) const -{ - std::lock_guard lock{mutex}; - const auto & handlers = handlers_by_type[static_cast(type)]; - return !handlers.empty(); -} } diff --git a/src/Access/MemoryAccessStorage.h b/src/Access/MemoryAccessStorage.h index f497067bd50..690383c6941 100644 --- a/src/Access/MemoryAccessStorage.h +++ b/src/Access/MemoryAccessStorage.h @@ -9,13 +9,15 @@ namespace DB { +class AccessChangesNotifier; + /// Implementation of IAccessStorage which keeps all data in memory. 
class MemoryAccessStorage : public IAccessStorage { public: static constexpr char STORAGE_TYPE[] = "memory"; - explicit MemoryAccessStorage(const String & storage_name_ = STORAGE_TYPE); + explicit MemoryAccessStorage(const String & storage_name_, AccessChangesNotifier & changes_notifier_); const char * getStorageType() const override { return STORAGE_TYPE; } @@ -24,8 +26,6 @@ public: void setAll(const std::vector> & all_entities); bool exists(const UUID & id) const override; - bool hasSubscription(const UUID & id) const override; - bool hasSubscription(AccessEntityType type) const override; private: std::optional findImpl(AccessEntityType type, const String & name) const override; @@ -34,25 +34,20 @@ private: std::optional insertImpl(const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists) override; bool removeImpl(const UUID & id, bool throw_if_not_exists) override; bool updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) override; - scope_guard subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const override; - scope_guard subscribeForChangesImpl(AccessEntityType type, const OnChangedHandler & handler) const override; + + bool insertNoLock(const UUID & id, const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists); + bool removeNoLock(const UUID & id, bool throw_if_not_exists); + bool updateNoLock(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists); struct Entry { UUID id; AccessEntityPtr entity; - mutable std::list handlers_by_id; }; - bool insertNoLock(const UUID & id, const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists, Notifications & notifications); - bool removeNoLock(const UUID & id, bool throw_if_not_exists, Notifications & notifications); - bool updateNoLock(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists, Notifications & notifications); - void setAllNoLock(const std::vector> & all_entities, Notifications & notifications); - void prepareNotifications(const Entry & entry, bool remove, Notifications & notifications) const; - - mutable std::recursive_mutex mutex; + mutable std::mutex mutex; std::unordered_map entries_by_id; /// We want to search entries both by ID and by the pair of name and type. 
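// Entry lost its per-entry handler list above, which is also why the storage
// can drop its recursive_mutex for a plain std::mutex: handlers are no longer
// invoked while the storage lock is held. The remaining two-index layout in
// miniature — the by-id map owns the entries, the per-type name maps hold
// non-owning pointers into it (hypothetical, simplified types):

#include <string>
#include <unordered_map>

struct Entry
{
    int id = 0;
    std::string name;
};

std::unordered_map<int, Entry> entries_by_id;                  // owning index
std::unordered_map<std::string, Entry *> entries_by_name[2];   // one non-owning index per entity type

void insert(int id, const std::string & name, int type)
{
    auto & entry = entries_by_id[id];
    entry.id = id;
    entry.name = name;
    entries_by_name[type][entry.name] = &entry;   // pointer stays valid: unordered_map never relocates nodes
}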
std::unordered_map entries_by_name_and_type[static_cast(AccessEntityType::MAX)]; - mutable std::list handlers_by_type[static_cast(AccessEntityType::MAX)]; + AccessChangesNotifier & changes_notifier; }; } diff --git a/src/Access/MultipleAccessStorage.cpp b/src/Access/MultipleAccessStorage.cpp index d71e46c8523..ce4c9f3fd01 100644 --- a/src/Access/MultipleAccessStorage.cpp +++ b/src/Access/MultipleAccessStorage.cpp @@ -45,7 +45,6 @@ void MultipleAccessStorage::setStorages(const std::vector & storages std::unique_lock lock{mutex}; nested_storages = std::make_shared(storages); ids_cache.reset(); - updateSubscriptionsToNestedStorages(lock); } void MultipleAccessStorage::addStorage(const StoragePtr & new_storage) @@ -56,7 +55,6 @@ void MultipleAccessStorage::addStorage(const StoragePtr & new_storage) auto new_storages = std::make_shared(*nested_storages); new_storages->push_back(new_storage); nested_storages = new_storages; - updateSubscriptionsToNestedStorages(lock); } void MultipleAccessStorage::removeStorage(const StoragePtr & storage_to_remove) @@ -70,7 +68,6 @@ void MultipleAccessStorage::removeStorage(const StoragePtr & storage_to_remove) new_storages->erase(new_storages->begin() + index); nested_storages = new_storages; ids_cache.reset(); - updateSubscriptionsToNestedStorages(lock); } std::vector MultipleAccessStorage::getStorages() @@ -225,6 +222,28 @@ bool MultipleAccessStorage::isReadOnly(const UUID & id) const } +void MultipleAccessStorage::reload() +{ + auto storages = getStoragesInternal(); + for (const auto & storage : *storages) + storage->reload(); +} + +void MultipleAccessStorage::startPeriodicReloading() +{ + auto storages = getStoragesInternal(); + for (const auto & storage : *storages) + storage->startPeriodicReloading(); +} + +void MultipleAccessStorage::stopPeriodicReloading() +{ + auto storages = getStoragesInternal(); + for (const auto & storage : *storages) + storage->stopPeriodicReloading(); +} + + std::optional MultipleAccessStorage::insertImpl(const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists) { std::shared_ptr storage_for_insertion; @@ -310,145 +329,6 @@ bool MultipleAccessStorage::updateImpl(const UUID & id, const UpdateFunc & updat } -scope_guard MultipleAccessStorage::subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const -{ - auto storage = findStorage(id); - if (!storage) - return {}; - return storage->subscribeForChanges(id, handler); -} - - -bool MultipleAccessStorage::hasSubscription(const UUID & id) const -{ - auto storages = getStoragesInternal(); - for (const auto & storage : *storages) - { - if (storage->hasSubscription(id)) - return true; - } - return false; -} - - -scope_guard MultipleAccessStorage::subscribeForChangesImpl(AccessEntityType type, const OnChangedHandler & handler) const -{ - std::unique_lock lock{mutex}; - auto & handlers = handlers_by_type[static_cast(type)]; - handlers.push_back(handler); - auto handler_it = std::prev(handlers.end()); - if (handlers.size() == 1) - updateSubscriptionsToNestedStorages(lock); - - return [this, type, handler_it] - { - std::unique_lock lock2{mutex}; - auto & handlers2 = handlers_by_type[static_cast(type)]; - handlers2.erase(handler_it); - if (handlers2.empty()) - updateSubscriptionsToNestedStorages(lock2); - }; -} - - -bool MultipleAccessStorage::hasSubscription(AccessEntityType type) const -{ - std::lock_guard lock{mutex}; - const auto & handlers = handlers_by_type[static_cast(type)]; - return !handlers.empty(); -} - - -/// Updates subscriptions to nested 
storages. -/// We need the subscriptions to the nested storages if someone has subscribed to us. -/// If any of the nested storages is changed we call our subscribers. -void MultipleAccessStorage::updateSubscriptionsToNestedStorages(std::unique_lock & lock) const -{ - /// lock is already locked. - - std::vector> added_subscriptions[static_cast(AccessEntityType::MAX)]; - std::vector removed_subscriptions; - - for (auto type : collections::range(AccessEntityType::MAX)) - { - auto & handlers = handlers_by_type[static_cast(type)]; - auto & subscriptions = subscriptions_to_nested_storages[static_cast(type)]; - if (handlers.empty()) - { - /// None has subscribed to us, we need no subscriptions to the nested storages. - for (auto & subscription : subscriptions | boost::adaptors::map_values) - removed_subscriptions.push_back(std::move(subscription)); - subscriptions.clear(); - } - else - { - /// Someone has subscribed to us, now we need to have a subscription to each nested storage. - for (auto it = subscriptions.begin(); it != subscriptions.end();) - { - const auto & storage = it->first; - auto & subscription = it->second; - if (boost::range::find(*nested_storages, storage) == nested_storages->end()) - { - removed_subscriptions.push_back(std::move(subscription)); - it = subscriptions.erase(it); - } - else - ++it; - } - - for (const auto & storage : *nested_storages) - { - if (!subscriptions.contains(storage)) - added_subscriptions[static_cast(type)].push_back({storage, nullptr}); - } - } - } - - /// Unlock the mutex temporarily because it's much better to subscribe to the nested storages - /// with the mutex unlocked. - lock.unlock(); - removed_subscriptions.clear(); - - for (auto type : collections::range(AccessEntityType::MAX)) - { - if (!added_subscriptions[static_cast(type)].empty()) - { - auto on_changed = [this, type](const UUID & id, const AccessEntityPtr & entity) - { - Notifications notifications; - SCOPE_EXIT({ notify(notifications); }); - std::lock_guard lock2{mutex}; - for (const auto & handler : handlers_by_type[static_cast(type)]) - notifications.push_back({handler, id, entity}); - }; - for (auto & [storage, subscription] : added_subscriptions[static_cast(type)]) - subscription = storage->subscribeForChanges(type, on_changed); - } - } - - /// Lock the mutex again to store added subscriptions to the nested storages. 
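// Everything removed above — per-type handler lists, per-storage subscriptions
// and the re-subscription dance on every storage change — is replaced by the
// single shared AccessChangesNotifier. What remains in the composite is plain
// forwarding, sketched here (hypothetical, simplified types):

#include <memory>
#include <vector>

struct Storage
{
    virtual ~Storage() = default;
    virtual void reload() {}
};

struct CompositeStorage : Storage
{
    void reload() override
    {
        auto snapshot = storages;   // the real code snapshots the list under a mutex
        for (const auto & storage : snapshot)
            storage->reload();
    }

    std::vector<std::shared_ptr<Storage>> storages;
};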
- lock.lock(); - - for (auto type : collections::range(AccessEntityType::MAX)) - { - if (!added_subscriptions[static_cast(type)].empty()) - { - auto & subscriptions = subscriptions_to_nested_storages[static_cast(type)]; - for (auto & [storage, subscription] : added_subscriptions[static_cast(type)]) - { - if (!subscriptions.contains(storage) && (boost::range::find(*nested_storages, storage) != nested_storages->end()) - && !handlers_by_type[static_cast(type)].empty()) - { - subscriptions.emplace(std::move(storage), std::move(subscription)); - } - } - } - } - - lock.unlock(); -} - - std::optional MultipleAccessStorage::authenticateImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, diff --git a/src/Access/MultipleAccessStorage.h b/src/Access/MultipleAccessStorage.h index 3a47163af6f..61a975050b6 100644 --- a/src/Access/MultipleAccessStorage.h +++ b/src/Access/MultipleAccessStorage.h @@ -24,6 +24,10 @@ public: bool isReadOnly() const override; bool isReadOnly(const UUID & id) const override; + void reload() override; + void startPeriodicReloading() override; + void stopPeriodicReloading() override; + void setStorages(const std::vector & storages); void addStorage(const StoragePtr & new_storage); void removeStorage(const StoragePtr & storage_to_remove); @@ -37,8 +41,6 @@ public: StoragePtr getStorage(const UUID & id); bool exists(const UUID & id) const override; - bool hasSubscription(const UUID & id) const override; - bool hasSubscription(AccessEntityType type) const override; protected: std::optional findImpl(AccessEntityType type, const String & name) const override; @@ -48,19 +50,14 @@ protected: std::optional insertImpl(const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists) override; bool removeImpl(const UUID & id, bool throw_if_not_exists) override; bool updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) override; - scope_guard subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const override; - scope_guard subscribeForChangesImpl(AccessEntityType type, const OnChangedHandler & handler) const override; std::optional authenticateImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool throw_if_user_not_exists, bool allow_no_password, bool allow_plaintext_password) const override; private: using Storages = std::vector; std::shared_ptr getStoragesInternal() const; - void updateSubscriptionsToNestedStorages(std::unique_lock & lock) const; std::shared_ptr nested_storages; mutable LRUCache ids_cache; - mutable std::list handlers_by_type[static_cast(AccessEntityType::MAX)]; - mutable std::unordered_map subscriptions_to_nested_storages[static_cast(AccessEntityType::MAX)]; mutable std::mutex mutex; }; diff --git a/src/Access/ReplicatedAccessStorage.cpp b/src/Access/ReplicatedAccessStorage.cpp index e56fad720be..d3d1ee3fb6b 100644 --- a/src/Access/ReplicatedAccessStorage.cpp +++ b/src/Access/ReplicatedAccessStorage.cpp @@ -1,12 +1,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include @@ -30,11 +32,13 @@ static UUID parseUUID(const String & text) ReplicatedAccessStorage::ReplicatedAccessStorage( const String & storage_name_, const String & zookeeper_path_, - zkutil::GetZooKeeper get_zookeeper_) + zkutil::GetZooKeeper get_zookeeper_, + AccessChangesNotifier & changes_notifier_) : 
IAccessStorage(storage_name_) , zookeeper_path(zookeeper_path_) , get_zookeeper(get_zookeeper_) - , refresh_queue(std::numeric_limits::max()) + , watched_queue(std::make_shared>(std::numeric_limits::max())) + , changes_notifier(changes_notifier_) { if (zookeeper_path.empty()) throw Exception("ZooKeeper path must be non-empty", ErrorCodes::BAD_ARGUMENTS); @@ -45,29 +49,30 @@ ReplicatedAccessStorage::ReplicatedAccessStorage( /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. if (zookeeper_path.front() != '/') zookeeper_path = "/" + zookeeper_path; + + initializeZookeeper(); } ReplicatedAccessStorage::~ReplicatedAccessStorage() { - ReplicatedAccessStorage::shutdown(); + stopWatchingThread(); } - -void ReplicatedAccessStorage::startup() +void ReplicatedAccessStorage::startWatchingThread() { - initializeZookeeper(); - worker_thread = ThreadFromGlobalPool(&ReplicatedAccessStorage::runWorkerThread, this); + bool prev_watching_flag = watching.exchange(true); + if (!prev_watching_flag) + watching_thread = ThreadFromGlobalPool(&ReplicatedAccessStorage::runWatchingThread, this); } -void ReplicatedAccessStorage::shutdown() +void ReplicatedAccessStorage::stopWatchingThread() { - bool prev_stop_flag = stop_flag.exchange(true); - if (!prev_stop_flag) + bool prev_watching_flag = watching.exchange(false); + if (prev_watching_flag) { - refresh_queue.finish(); - - if (worker_thread.joinable()) - worker_thread.join(); + watched_queue->finish(); + if (watching_thread.joinable()) + watching_thread.join(); } } @@ -105,10 +110,8 @@ std::optional ReplicatedAccessStorage::insertImpl(const AccessEntityPtr & if (!ok) return std::nullopt; - Notifications notifications; - SCOPE_EXIT({ notify(notifications); }); std::lock_guard lock{mutex}; - refreshEntityNoLock(zookeeper, id, notifications); + refreshEntityNoLock(zookeeper, id); return id; } @@ -207,10 +210,8 @@ bool ReplicatedAccessStorage::removeImpl(const UUID & id, bool throw_if_not_exis if (!ok) return false; - Notifications notifications; - SCOPE_EXIT({ notify(notifications); }); std::lock_guard lock{mutex}; - removeEntityNoLock(id, notifications); + removeEntityNoLock(id); return true; } @@ -261,10 +262,8 @@ bool ReplicatedAccessStorage::updateImpl(const UUID & id, const UpdateFunc & upd if (!ok) return false; - Notifications notifications; - SCOPE_EXIT({ notify(notifications); }); std::lock_guard lock{mutex}; - refreshEntityNoLock(zookeeper, id, notifications); + refreshEntityNoLock(zookeeper, id); return true; } @@ -328,16 +327,18 @@ bool ReplicatedAccessStorage::updateZooKeeper(const zkutil::ZooKeeperPtr & zooke } -void ReplicatedAccessStorage::runWorkerThread() +void ReplicatedAccessStorage::runWatchingThread() { - LOG_DEBUG(getLogger(), "Started worker thread"); - while (!stop_flag) + LOG_DEBUG(getLogger(), "Started watching thread"); + setThreadName("ReplACLWatch"); + while (watching) { try { if (!initialized) initializeZookeeper(); - refresh(); + if (refresh()) + changes_notifier.sendNotifications(); } catch (...) 
{ @@ -353,7 +354,7 @@ void ReplicatedAccessStorage::resetAfterError() initialized = false; UUID id; - while (refresh_queue.tryPop(id)) {} + while (watched_queue->tryPop(id)) {} std::lock_guard lock{mutex}; for (const auto type : collections::range(AccessEntityType::MAX)) @@ -389,21 +390,20 @@ void ReplicatedAccessStorage::createRootNodes(const zkutil::ZooKeeperPtr & zooke } } -void ReplicatedAccessStorage::refresh() +bool ReplicatedAccessStorage::refresh() { UUID id; - if (refresh_queue.tryPop(id, /* timeout_ms: */ 10000)) - { - if (stop_flag) - return; + if (!watched_queue->tryPop(id, /* timeout_ms: */ 10000)) + return false; - auto zookeeper = get_zookeeper(); + auto zookeeper = get_zookeeper(); - if (id == UUIDHelpers::Nil) - refreshEntities(zookeeper); - else - refreshEntity(zookeeper, id); - } + if (id == UUIDHelpers::Nil) + refreshEntities(zookeeper); + else + refreshEntity(zookeeper, id); + + return true; } @@ -412,9 +412,9 @@ void ReplicatedAccessStorage::refreshEntities(const zkutil::ZooKeeperPtr & zooke LOG_DEBUG(getLogger(), "Refreshing entities list"); const String zookeeper_uuids_path = zookeeper_path + "/uuid"; - auto watch_entities_list = [this](const Coordination::WatchResponse &) + auto watch_entities_list = [watched_queue = watched_queue](const Coordination::WatchResponse &) { - [[maybe_unused]] bool push_result = refresh_queue.push(UUIDHelpers::Nil); + [[maybe_unused]] bool push_result = watched_queue->push(UUIDHelpers::Nil); }; Coordination::Stat stat; const auto entity_uuid_strs = zookeeper->getChildrenWatch(zookeeper_uuids_path, &stat, watch_entities_list); @@ -424,8 +424,6 @@ void ReplicatedAccessStorage::refreshEntities(const zkutil::ZooKeeperPtr & zooke for (const String & entity_uuid_str : entity_uuid_strs) entity_uuids.insert(parseUUID(entity_uuid_str)); - Notifications notifications; - SCOPE_EXIT({ notify(notifications); }); std::lock_guard lock{mutex}; std::vector entities_to_remove; @@ -437,14 +435,14 @@ void ReplicatedAccessStorage::refreshEntities(const zkutil::ZooKeeperPtr & zooke entities_to_remove.push_back(entity_uuid); } for (const auto & entity_uuid : entities_to_remove) - removeEntityNoLock(entity_uuid, notifications); + removeEntityNoLock(entity_uuid); /// Locally add entities that were added to ZooKeeper for (const auto & entity_uuid : entity_uuids) { const auto it = entries_by_id.find(entity_uuid); if (it == entries_by_id.end()) - refreshEntityNoLock(zookeeper, entity_uuid, notifications); + refreshEntityNoLock(zookeeper, entity_uuid); } LOG_DEBUG(getLogger(), "Refreshing entities list finished"); @@ -452,21 +450,18 @@ void ReplicatedAccessStorage::refreshEntities(const zkutil::ZooKeeperPtr & zooke void ReplicatedAccessStorage::refreshEntity(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id) { - Notifications notifications; - SCOPE_EXIT({ notify(notifications); }); std::lock_guard lock{mutex}; - - refreshEntityNoLock(zookeeper, id, notifications); + refreshEntityNoLock(zookeeper, id); } -void ReplicatedAccessStorage::refreshEntityNoLock(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, Notifications & notifications) +void ReplicatedAccessStorage::refreshEntityNoLock(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id) { LOG_DEBUG(getLogger(), "Refreshing entity {}", toString(id)); - const auto watch_entity = [this, id](const Coordination::WatchResponse & response) + const auto watch_entity = [watched_queue = watched_queue, id](const Coordination::WatchResponse & response) { if (response.type == Coordination::Event::CHANGED) - 
[[maybe_unused]] bool push_result = refresh_queue.push(id); + [[maybe_unused]] bool push_result = watched_queue->push(id); }; Coordination::Stat entity_stat; const String entity_path = zookeeper_path + "/uuid/" + toString(id); @@ -475,16 +470,16 @@ void ReplicatedAccessStorage::refreshEntityNoLock(const zkutil::ZooKeeperPtr & z if (exists) { const AccessEntityPtr entity = deserializeAccessEntity(entity_definition, entity_path); - setEntityNoLock(id, entity, notifications); + setEntityNoLock(id, entity); } else { - removeEntityNoLock(id, notifications); + removeEntityNoLock(id); } } -void ReplicatedAccessStorage::setEntityNoLock(const UUID & id, const AccessEntityPtr & entity, Notifications & notifications) +void ReplicatedAccessStorage::setEntityNoLock(const UUID & id, const AccessEntityPtr & entity) { LOG_DEBUG(getLogger(), "Setting id {} to entity named {}", toString(id), entity->getName()); const AccessEntityType type = entity->getType(); @@ -494,12 +489,14 @@ void ReplicatedAccessStorage::setEntityNoLock(const UUID & id, const AccessEntit auto & entries_by_name = entries_by_name_and_type[static_cast(type)]; if (auto it = entries_by_name.find(name); it != entries_by_name.end() && it->second->id != id) { - removeEntityNoLock(it->second->id, notifications); + removeEntityNoLock(it->second->id); } /// If the entity already exists under a different type+name, remove old type+name + bool existed_before = false; if (auto it = entries_by_id.find(id); it != entries_by_id.end()) { + existed_before = true; const AccessEntityPtr & existing_entity = it->second.entity; const AccessEntityType existing_type = existing_entity->getType(); const String & existing_name = existing_entity->getName(); @@ -514,11 +511,18 @@ void ReplicatedAccessStorage::setEntityNoLock(const UUID & id, const AccessEntit entry.id = id; entry.entity = entity; entries_by_name[name] = &entry; - prepareNotifications(entry, false, notifications); + + if (initialized) + { + if (existed_before) + changes_notifier.onEntityUpdated(id, entity); + else + changes_notifier.onEntityAdded(id, entity); + } } -void ReplicatedAccessStorage::removeEntityNoLock(const UUID & id, Notifications & notifications) +void ReplicatedAccessStorage::removeEntityNoLock(const UUID & id) { LOG_DEBUG(getLogger(), "Removing entity with id {}", toString(id)); const auto it = entries_by_id.find(id); @@ -531,7 +535,6 @@ void ReplicatedAccessStorage::removeEntityNoLock(const UUID & id, Notifications const Entry & entry = it->second; const AccessEntityType type = entry.entity->getType(); const String & name = entry.entity->getName(); - prepareNotifications(entry, true, notifications); auto & entries_by_name = entries_by_name_and_type[static_cast(type)]; const auto name_it = entries_by_name.find(name); @@ -542,8 +545,11 @@ void ReplicatedAccessStorage::removeEntityNoLock(const UUID & id, Notifications else entries_by_name.erase(name); + UUID removed_id = id; entries_by_id.erase(id); LOG_DEBUG(getLogger(), "Removed entity with id {}", toString(id)); + + changes_notifier.onEntityRemoved(removed_id, type); } @@ -594,73 +600,4 @@ AccessEntityPtr ReplicatedAccessStorage::readImpl(const UUID & id, bool throw_if return entry.entity; } - -void ReplicatedAccessStorage::prepareNotifications(const Entry & entry, bool remove, Notifications & notifications) const -{ - const AccessEntityPtr entity = remove ? 
nullptr : entry.entity; - for (const auto & handler : entry.handlers_by_id) - notifications.push_back({handler, entry.id, entity}); - - for (const auto & handler : handlers_by_type[static_cast(entry.entity->getType())]) - notifications.push_back({handler, entry.id, entity}); -} - - -scope_guard ReplicatedAccessStorage::subscribeForChangesImpl(AccessEntityType type, const OnChangedHandler & handler) const -{ - std::lock_guard lock{mutex}; - auto & handlers = handlers_by_type[static_cast(type)]; - handlers.push_back(handler); - auto handler_it = std::prev(handlers.end()); - - return [this, type, handler_it] - { - std::lock_guard lock2{mutex}; - auto & handlers2 = handlers_by_type[static_cast(type)]; - handlers2.erase(handler_it); - }; -} - - -scope_guard ReplicatedAccessStorage::subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const -{ - std::lock_guard lock{mutex}; - const auto it = entries_by_id.find(id); - if (it == entries_by_id.end()) - return {}; - const Entry & entry = it->second; - auto handler_it = entry.handlers_by_id.insert(entry.handlers_by_id.end(), handler); - - return [this, id, handler_it] - { - std::lock_guard lock2{mutex}; - auto it2 = entries_by_id.find(id); - if (it2 != entries_by_id.end()) - { - const Entry & entry2 = it2->second; - entry2.handlers_by_id.erase(handler_it); - } - }; -} - - -bool ReplicatedAccessStorage::hasSubscription(const UUID & id) const -{ - std::lock_guard lock{mutex}; - const auto & it = entries_by_id.find(id); - if (it != entries_by_id.end()) - { - const Entry & entry = it->second; - return !entry.handlers_by_id.empty(); - } - return false; -} - - -bool ReplicatedAccessStorage::hasSubscription(AccessEntityType type) const -{ - std::lock_guard lock{mutex}; - const auto & handlers = handlers_by_type[static_cast(type)]; - return !handlers.empty(); -} } diff --git a/src/Access/ReplicatedAccessStorage.h b/src/Access/ReplicatedAccessStorage.h index 8fdd24b6d54..f9f579e2ba7 100644 --- a/src/Access/ReplicatedAccessStorage.h +++ b/src/Access/ReplicatedAccessStorage.h @@ -18,32 +18,33 @@ namespace DB { +class AccessChangesNotifier; + /// Implementation of IAccessStorage which keeps all data in zookeeper. 
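Note that in the ReplicatedAccessStorage.cpp hunks above the watch lambdas now capture watched_queue, a shared_ptr, by value instead of capturing this: a ZooKeeper watch can fire after the storage object is destroyed, and the shared pointer keeps the queue alive on its own. A minimal sketch of the idea, with simplified stand-in types rather than the real zkutil API:

#include <functional>
#include <memory>
#include <queue>

/// Simplified stand-in for the bounded event queue the storage watches.
using WatchQueue = std::queue<int>;

/// The returned callback owns a shared reference to the queue, so it remains
/// safe to invoke even after the object that registered it has been destroyed.
std::function<void()> makeWatch(std::shared_ptr<WatchQueue> queue)
{
    return [queue = std::move(queue)] { queue->push(0); };
}

int main()
{
    auto queue = std::make_shared<WatchQueue>();
    auto watch = makeWatch(queue);
    watch();  /// delivers one event
    return queue->empty() ? 1 : 0;
}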
class ReplicatedAccessStorage : public IAccessStorage { public: static constexpr char STORAGE_TYPE[] = "replicated"; - ReplicatedAccessStorage(const String & storage_name, const String & zookeeper_path, zkutil::GetZooKeeper get_zookeeper); + ReplicatedAccessStorage(const String & storage_name, const String & zookeeper_path, zkutil::GetZooKeeper get_zookeeper, AccessChangesNotifier & changes_notifier_); virtual ~ReplicatedAccessStorage() override; const char * getStorageType() const override { return STORAGE_TYPE; } - virtual void startup(); - virtual void shutdown(); + void startPeriodicReloading() override { startWatchingThread(); } + void stopPeriodicReloading() override { stopWatchingThread(); } bool exists(const UUID & id) const override; - bool hasSubscription(const UUID & id) const override; - bool hasSubscription(AccessEntityType type) const override; private: String zookeeper_path; zkutil::GetZooKeeper get_zookeeper; std::atomic initialized = false; - std::atomic stop_flag = false; - ThreadFromGlobalPool worker_thread; - ConcurrentBoundedQueue refresh_queue; + + std::atomic watching = false; + ThreadFromGlobalPool watching_thread; + std::shared_ptr> watched_queue; std::optional insertImpl(const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists) override; bool removeImpl(const UUID & id, bool throw_if_not_exists) override; @@ -53,37 +54,36 @@ private: bool removeZooKeeper(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, bool throw_if_not_exists); bool updateZooKeeper(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists); - void runWorkerThread(); - void resetAfterError(); void initializeZookeeper(); void createRootNodes(const zkutil::ZooKeeperPtr & zookeeper); - void refresh(); + void startWatchingThread(); + void stopWatchingThread(); + + void runWatchingThread(); + void resetAfterError(); + + bool refresh(); void refreshEntities(const zkutil::ZooKeeperPtr & zookeeper); void refreshEntity(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id); - void refreshEntityNoLock(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, Notifications & notifications); + void refreshEntityNoLock(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id); - void setEntityNoLock(const UUID & id, const AccessEntityPtr & entity, Notifications & notifications); - void removeEntityNoLock(const UUID & id, Notifications & notifications); + void setEntityNoLock(const UUID & id, const AccessEntityPtr & entity); + void removeEntityNoLock(const UUID & id); struct Entry { UUID id; AccessEntityPtr entity; - mutable std::list handlers_by_id; }; std::optional findImpl(AccessEntityType type, const String & name) const override; std::vector findAllImpl(AccessEntityType type) const override; AccessEntityPtr readImpl(const UUID & id, bool throw_if_not_exists) const override; - void prepareNotifications(const Entry & entry, bool remove, Notifications & notifications) const; - scope_guard subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const override; - scope_guard subscribeForChangesImpl(AccessEntityType type, const OnChangedHandler & handler) const override; - mutable std::mutex mutex; std::unordered_map entries_by_id; std::unordered_map entries_by_name_and_type[static_cast(AccessEntityType::MAX)]; - mutable std::list handlers_by_type[static_cast(AccessEntityType::MAX)]; + AccessChangesNotifier & changes_notifier; }; } diff --git a/src/Access/RoleCache.cpp b/src/Access/RoleCache.cpp index 
f0e1435e299..308b771243e 100644 --- a/src/Access/RoleCache.cpp +++ b/src/Access/RoleCache.cpp @@ -66,9 +66,6 @@ RoleCache::~RoleCache() = default; std::shared_ptr RoleCache::getEnabledRoles(const std::vector & roles, const std::vector & roles_with_admin_option) { - /// Declared before `lock` to send notifications after the mutex will be unlocked. - scope_guard notifications; - std::lock_guard lock{mutex}; EnabledRoles::Params params; params.current_roles.insert(roles.begin(), roles.end()); @@ -83,13 +80,13 @@ RoleCache::getEnabledRoles(const std::vector & roles, const std::vector(new EnabledRoles(params)); - collectEnabledRoles(*res, notifications); + collectEnabledRoles(*res, nullptr); enabled_roles.emplace(std::move(params), res); return res; } -void RoleCache::collectEnabledRoles(scope_guard & notifications) +void RoleCache::collectEnabledRoles(scope_guard * notifications) { /// `mutex` is already locked. @@ -107,7 +104,7 @@ void RoleCache::collectEnabledRoles(scope_guard & notifications) } -void RoleCache::collectEnabledRoles(EnabledRoles & enabled, scope_guard & notifications) +void RoleCache::collectEnabledRoles(EnabledRoles & enabled, scope_guard * notifications) { /// `mutex` is already locked. @@ -170,7 +167,7 @@ void RoleCache::roleChanged(const UUID & role_id, const RolePtr & changed_role) return; role_from_cache->first = changed_role; cache.update(role_id, role_from_cache); - collectEnabledRoles(notifications); + collectEnabledRoles(¬ifications); } @@ -181,7 +178,7 @@ void RoleCache::roleRemoved(const UUID & role_id) std::lock_guard lock{mutex}; cache.remove(role_id); - collectEnabledRoles(notifications); + collectEnabledRoles(¬ifications); } } diff --git a/src/Access/RoleCache.h b/src/Access/RoleCache.h index e9c731f1342..51c415d4d1d 100644 --- a/src/Access/RoleCache.h +++ b/src/Access/RoleCache.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include #include #include @@ -24,14 +24,14 @@ public: const std::vector & current_roles_with_admin_option); private: - void collectEnabledRoles(scope_guard & notifications); - void collectEnabledRoles(EnabledRoles & enabled, scope_guard & notifications); + void collectEnabledRoles(scope_guard * notifications); + void collectEnabledRoles(EnabledRoles & enabled, scope_guard * notifications); RolePtr getRole(const UUID & role_id); void roleChanged(const UUID & role_id, const RolePtr & changed_role); void roleRemoved(const UUID & role_id); const AccessControl & access_control; - Poco::ExpireCache> cache; + Poco::AccessExpireCache> cache; std::map> enabled_roles; mutable std::mutex mutex; }; diff --git a/src/Access/UsersConfigAccessStorage.cpp b/src/Access/UsersConfigAccessStorage.cpp index 712e5393ce7..a6c4388fef8 100644 --- a/src/Access/UsersConfigAccessStorage.cpp +++ b/src/Access/UsersConfigAccessStorage.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -14,9 +15,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -525,8 +523,8 @@ namespace } } -UsersConfigAccessStorage::UsersConfigAccessStorage(const String & storage_name_, const AccessControl & access_control_) - : IAccessStorage(storage_name_), access_control(access_control_) +UsersConfigAccessStorage::UsersConfigAccessStorage(const String & storage_name_, AccessControl & access_control_) + : IAccessStorage(storage_name_), access_control(access_control_), memory_storage(storage_name_, access_control.getChangesNotifier()) { } @@ -605,9 +603,9 @@ void UsersConfigAccessStorage::load( 
std::make_shared(), [&](Poco::AutoPtr new_config, bool /*initial_loading*/) { - parseFromConfig(*new_config); - Settings::checkNoSettingNamesAtTopLevel(*new_config, users_config_path); + parseFromConfig(*new_config); + access_control.getChangesNotifier().sendNotifications(); }, /* already_loaded = */ false); } @@ -662,27 +660,4 @@ std::optional UsersConfigAccessStorage::readNameImpl(const UUID & id, bo return memory_storage.readName(id, throw_if_not_exists); } - -scope_guard UsersConfigAccessStorage::subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const -{ - return memory_storage.subscribeForChanges(id, handler); -} - - -scope_guard UsersConfigAccessStorage::subscribeForChangesImpl(AccessEntityType type, const OnChangedHandler & handler) const -{ - return memory_storage.subscribeForChanges(type, handler); -} - - -bool UsersConfigAccessStorage::hasSubscription(const UUID & id) const -{ - return memory_storage.hasSubscription(id); -} - - -bool UsersConfigAccessStorage::hasSubscription(AccessEntityType type) const -{ - return memory_storage.hasSubscription(type); -} } diff --git a/src/Access/UsersConfigAccessStorage.h b/src/Access/UsersConfigAccessStorage.h index e21eb17f462..5c99bf30160 100644 --- a/src/Access/UsersConfigAccessStorage.h +++ b/src/Access/UsersConfigAccessStorage.h @@ -22,7 +22,7 @@ public: static constexpr char STORAGE_TYPE[] = "users.xml"; - UsersConfigAccessStorage(const String & storage_name_, const AccessControl & access_control_); + UsersConfigAccessStorage(const String & storage_name_, AccessControl & access_control_); ~UsersConfigAccessStorage() override; const char * getStorageType() const override { return STORAGE_TYPE; } @@ -37,13 +37,12 @@ public: const String & include_from_path = {}, const String & preprocessed_dir = {}, const zkutil::GetZooKeeper & get_zookeeper_function = {}); - void reload(); - void startPeriodicReloading(); - void stopPeriodicReloading(); + + void reload() override; + void startPeriodicReloading() override; + void stopPeriodicReloading() override; bool exists(const UUID & id) const override; - bool hasSubscription(const UUID & id) const override; - bool hasSubscription(AccessEntityType type) const override; private: void parseFromConfig(const Poco::Util::AbstractConfiguration & config); @@ -51,10 +50,8 @@ private: std::vector findAllImpl(AccessEntityType type) const override; AccessEntityPtr readImpl(const UUID & id, bool throw_if_not_exists) const override; std::optional readNameImpl(const UUID & id, bool throw_if_not_exists) const override; - scope_guard subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const override; - scope_guard subscribeForChangesImpl(AccessEntityType type, const OnChangedHandler & handler) const override; - const AccessControl & access_control; + AccessControl & access_control; MemoryAccessStorage memory_storage; String path; std::unique_ptr config_reloader; diff --git a/src/Access/tests/gtest_replicated_access_storage.cpp b/src/Access/tests/gtest_replicated_access_storage.cpp index f2052e91749..c780e598b64 100644 --- a/src/Access/tests/gtest_replicated_access_storage.cpp +++ b/src/Access/tests/gtest_replicated_access_storage.cpp @@ -1,5 +1,6 @@ #include #include +#include using namespace DB; @@ -12,18 +13,6 @@ namespace ErrorCodes } -TEST(ReplicatedAccessStorage, ShutdownWithoutStartup) -{ - auto get_zk = []() - { - return std::shared_ptr(); - }; - - auto storage = ReplicatedAccessStorage("replicated", "/clickhouse/access", get_zk); - storage.shutdown(); -} - - 
TEST(ReplicatedAccessStorage, ShutdownWithFailedStartup) { auto get_zk = []() @@ -31,16 +20,16 @@ TEST(ReplicatedAccessStorage, ShutdownWithFailedStartup) return std::shared_ptr(); }; - auto storage = ReplicatedAccessStorage("replicated", "/clickhouse/access", get_zk); + AccessChangesNotifier changes_notifier; + try { - storage.startup(); + auto storage = ReplicatedAccessStorage("replicated", "/clickhouse/access", get_zk, changes_notifier); } catch (Exception & e) { if (e.code() != ErrorCodes::NO_ZOOKEEPER) throw; } - storage.shutdown(); } diff --git a/src/AggregateFunctions/AggregateFunctionSum.h b/src/AggregateFunctions/AggregateFunctionSum.h index acff8e7b90f..03aeda1bb9b 100644 --- a/src/AggregateFunctions/AggregateFunctionSum.h +++ b/src/AggregateFunctions/AggregateFunctionSum.h @@ -59,11 +59,11 @@ struct AggregateFunctionSumData } /// Vectorized version - MULTITARGET_FUNCTION_WRAPPER_AVX2_SSE42(addManyImpl, - MULTITARGET_FH( + MULTITARGET_FUNCTION_AVX2_SSE42( + MULTITARGET_FUNCTION_HEADER( template void NO_SANITIZE_UNDEFINED NO_INLINE - ), /*addManyImpl*/ MULTITARGET_FB((const Value * __restrict ptr, size_t start, size_t end) /// NOLINT + ), addManyImpl, MULTITARGET_FUNCTION_BODY((const Value * __restrict ptr, size_t start, size_t end) /// NOLINT { ptr += start; size_t count = end - start; @@ -122,11 +122,11 @@ struct AggregateFunctionSumData addManyImpl(ptr, start, end); } - MULTITARGET_FUNCTION_WRAPPER_AVX2_SSE42(addManyConditionalInternalImpl, - MULTITARGET_FH( + MULTITARGET_FUNCTION_AVX2_SSE42( + MULTITARGET_FUNCTION_HEADER( template void NO_SANITIZE_UNDEFINED NO_INLINE - ), /*addManyConditionalInternalImpl*/ MULTITARGET_FB((const Value * __restrict ptr, const UInt8 * __restrict condition_map, size_t start, size_t end) /// NOLINT + ), addManyConditionalInternalImpl, MULTITARGET_FUNCTION_BODY((const Value * __restrict ptr, const UInt8 * __restrict condition_map, size_t start, size_t end) /// NOLINT { ptr += start; size_t count = end - start; diff --git a/src/Columns/IColumnImpl.h b/src/Columns/IColumnImpl.h index 7e7ff3c32d4..e90503cbad2 100644 --- a/src/Columns/IColumnImpl.h +++ b/src/Columns/IColumnImpl.h @@ -81,7 +81,8 @@ void IColumn::compareImpl(const Derived & rhs, size_t rhs_row_num, if constexpr (use_indexes) { num_indexes = row_indexes->size(); - next_index = indexes = row_indexes->data(); + indexes = row_indexes->data(); + next_index = indexes; } compare_results.resize(num_rows); @@ -100,15 +101,9 @@ void IColumn::compareImpl(const Derived & rhs, size_t rhs_row_num, if constexpr (use_indexes) row = indexes[i]; - int res = compareAt(row, rhs_row_num, rhs, nan_direction_hint); - - /// We need to convert int to Int8. Sometimes comparison return values which do not fit in one byte. 
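The IColumnImpl.h change that follows is easiest to see side by side: the removed branches below normalized an arbitrary int comparison result (for example a raw memcmp value) into {-1, 0, 1}, while the new code assumes compareAt already returns a normalized value and casts it directly. A small standalone sketch of the two behaviours, under that assumption:

#include <cassert>
#include <cstdint>

/// Old behaviour: accept any int (e.g. a raw memcmp result) and clamp it to {-1, 0, 1}.
inline std::int8_t normalizeCompareResult(int res)
{
    return res < 0 ? -1 : (res > 0 ? 1 : 0);
}

/// New behaviour: require the comparison to return a normalized value already,
/// which removes two branches from the hot loop.
inline std::int8_t castCompareResult(int res)
{
    assert(res == -1 || res == 0 || res == 1);
    return static_cast<std::int8_t>(res);
}

int main()
{
    return normalizeCompareResult(-42) == castCompareResult(-1) ? 0 : 1;
}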
- if (res < 0) - compare_results[row] = -1; - else if (res > 0) - compare_results[row] = 1; - else - compare_results[row] = 0; + int res = static_cast(this)->compareAt(row, rhs_row_num, rhs, nan_direction_hint); + assert(res == 1 || res == -1 || res == 0); + compare_results[row] = static_cast(res); if constexpr (reversed) compare_results[row] = -compare_results[row]; @@ -124,7 +119,10 @@ void IColumn::compareImpl(const Derived & rhs, size_t rhs_row_num, } if constexpr (use_indexes) - row_indexes->resize(next_index - row_indexes->data()); + { + size_t equal_row_indexes_size = next_index - row_indexes->data(); + row_indexes->resize(equal_row_indexes_size); + } } template diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 5f78c79f606..5d5ce052aaa 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -627,6 +627,7 @@ M(656, MEILISEARCH_EXCEPTION) \ M(657, UNSUPPORTED_MEILISEARCH_TYPE) \ M(658, MEILISEARCH_MISSING_SOME_COLUMNS) \ + M(659, UNKNOWN_STATUS_OF_TRANSACTION) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Common/Exception.cpp b/src/Common/Exception.cpp index 21f605ad353..d0f7af2da6b 100644 --- a/src/Common/Exception.cpp +++ b/src/Common/Exception.cpp @@ -35,6 +35,18 @@ namespace ErrorCodes extern const int CANNOT_MREMAP; } +void abortOnFailedAssertion(const String & description) +{ + LOG_FATAL(&Poco::Logger::root(), "Logical error: '{}'.", description); + + /// This is to suppress -Wmissing-noreturn + volatile bool always_false = false; + if (always_false) + return; + + abort(); +} + /// - Aborts the process if error code is LOGICAL_ERROR. /// - Increments error codes statistics. void handle_error_code([[maybe_unused]] const std::string & msg, int code, bool remote, const Exception::FramePointers & trace) @@ -44,8 +56,7 @@ void handle_error_code([[maybe_unused]] const std::string & msg, int code, bool #ifdef ABORT_ON_LOGICAL_ERROR if (code == ErrorCodes::LOGICAL_ERROR) { - LOG_FATAL(&Poco::Logger::root(), "Logical error: '{}'.", msg); - abort(); + abortOnFailedAssertion(msg); } #endif diff --git a/src/Common/Exception.h b/src/Common/Exception.h index 086b64bf5f9..253dbe6d65c 100644 --- a/src/Common/Exception.h +++ b/src/Common/Exception.h @@ -12,16 +12,14 @@ #include -#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) || defined(THREAD_SANITIZER) || defined(MEMORY_SANITIZER) || defined(UNDEFINED_BEHAVIOR_SANITIZER) -#define ABORT_ON_LOGICAL_ERROR -#endif - namespace Poco { class Logger; } namespace DB { +void abortOnFailedAssertion(const String & description); + class Exception : public Poco::Exception { public: diff --git a/src/Common/OptimizedRegularExpression.h b/src/Common/OptimizedRegularExpression.h index 53f3a7d34b1..a3d38f27c07 100644 --- a/src/Common/OptimizedRegularExpression.h +++ b/src/Common/OptimizedRegularExpression.h @@ -107,3 +107,4 @@ private: }; using OptimizedRegularExpression = OptimizedRegularExpressionImpl; +using OptimizedRegularExpressionSingleThreaded = OptimizedRegularExpressionImpl; diff --git a/src/Common/TargetSpecific.h b/src/Common/TargetSpecific.h index 89c0f467fe3..67d9eb4831d 100644 --- a/src/Common/TargetSpecific.h +++ b/src/Common/TargetSpecific.h @@ -233,8 +233,8 @@ DECLARE_AVX512F_SPECIFIC_CODE( * class TestClass * { * public: - * MULTITARGET_FUNCTION_WRAPPER_AVX2_SSE42(testFunctionImpl, - * MULTITARGET_FH(int), /\*testFunction*\/ MULTITARGET_FB((int value) + * MULTITARGET_FUNCTION_AVX2_SSE42( + * MULTITARGET_FUNCTION_HEADER(int), testFunctionImpl, MULTITARGET_FUNCTION_BODY((int 
value) * { * return value; * }) * ) * @@ -259,15 +259,15 @@ DECLARE_AVX512F_SPECIFIC_CODE( */ /// Function header -#define MULTITARGET_FH(...) __VA_ARGS__ +#define MULTITARGET_FUNCTION_HEADER(...) __VA_ARGS__ /// Function body -#define MULTITARGET_FB(...) __VA_ARGS__ +#define MULTITARGET_FUNCTION_BODY(...) __VA_ARGS__ #if ENABLE_MULTITARGET_CODE && defined(__GNUC__) && defined(__x86_64__) /// NOLINTNEXTLINE -#define MULTITARGET_FUNCTION_WRAPPER_AVX2_SSE42(name, FUNCTION_HEADER, FUNCTION_BODY) \ +#define MULTITARGET_FUNCTION_AVX2_SSE42(FUNCTION_HEADER, name, FUNCTION_BODY) \ FUNCTION_HEADER \ \ AVX2_FUNCTION_SPECIFIC_ATTRIBUTE \ @@ -288,7 +288,7 @@ DECLARE_AVX512F_SPECIFIC_CODE( #else /// NOLINTNEXTLINE -#define MULTITARGET_FUNCTION_WRAPPER_AVX2_SSE42(name, FUNCTION_HEADER, FUNCTION_BODY) \ +#define MULTITARGET_FUNCTION_AVX2_SSE42(FUNCTION_HEADER, name, FUNCTION_BODY) \ FUNCTION_HEADER \ \ name \ diff --git a/src/Compression/CompressedWriteBuffer.cpp b/src/Compression/CompressedWriteBuffer.cpp index 93f163dc1af..6c1dbd9e00c 100644 --- a/src/Compression/CompressedWriteBuffer.cpp +++ b/src/Compression/CompressedWriteBuffer.cpp @@ -22,15 +22,22 @@ void CompressedWriteBuffer::nextImpl() if (!offset()) return; - UInt32 compressed_size = 0; size_t decompressed_size = offset(); UInt32 compressed_reserve_size = codec->getCompressedReserveSize(decompressed_size); - if (out.available() > compressed_reserve_size + CHECKSUM_SIZE) + /** During compression we need a buffer with capacity >= compressed_reserve_size + CHECKSUM_SIZE. + * + * If the output buffer has the necessary capacity, we can compress the data directly into it. + * Then we can write the checksum at the beginning of the output buffer. + * + * If the output buffer does not have the necessary capacity, we compress the data into a temporary buffer. + * Then we can write the checksum and the temporary buffer into the output buffer. + */ + if (out.available() >= compressed_reserve_size + CHECKSUM_SIZE) { char * out_checksum_ptr = out.position(); char * out_compressed_ptr = out.position() + CHECKSUM_SIZE; - compressed_size = codec->compress(working_buffer.begin(), decompressed_size, out_compressed_ptr); + UInt32 compressed_size = codec->compress(working_buffer.begin(), decompressed_size, out_compressed_ptr); CityHash_v1_0_2::uint128 checksum = CityHash_v1_0_2::CityHash128(out_compressed_ptr, compressed_size); memcpy(out_checksum_ptr, reinterpret_cast<const char *>(&checksum), CHECKSUM_SIZE); @@ -39,7 +46,7 @@ void CompressedWriteBuffer::nextImpl() else { compressed_buffer.resize(compressed_reserve_size); - compressed_size = codec->compress(working_buffer.begin(), decompressed_size, compressed_buffer.data()); + UInt32 compressed_size = codec->compress(working_buffer.begin(), decompressed_size, compressed_buffer.data()); CityHash_v1_0_2::uint128 checksum = CityHash_v1_0_2::CityHash128(compressed_buffer.data(), compressed_size); out.write(reinterpret_cast<const char *>(&checksum), CHECKSUM_SIZE); diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 1f089ba2cb7..d74ad173811 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -466,20 +466,23 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ { if (is_recovering) { + const auto finish_recovering = [&] + { + auto new_params = raft_instance->get_current_params(); + new_params.custom_commit_quorum_size_ = 0; + new_params.custom_election_quorum_size_ = 0; + raft_instance->update_params(new_params); + + LOG_INFO(log, "Recovery is done.
You can continue using cluster normally."); + is_recovering = false; + }; + switch (type) { case nuraft::cb_func::HeartBeat: { if (raft_instance->isClusterHealthy()) - { - auto new_params = raft_instance->get_current_params(); - new_params.custom_commit_quorum_size_ = 0; - new_params.custom_election_quorum_size_ = 0; - raft_instance->update_params(new_params); - - LOG_INFO(log, "Recovery is done. You can continue using cluster normally."); - is_recovering = false; - } + finish_recovering(); break; } case nuraft::cb_func::NewConfig: @@ -490,8 +493,19 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ // Because we manually set the config to commit // we need to call the reconfigure also uint64_t log_idx = *static_cast(param->ctx); - if (log_idx == state_manager->load_config()->get_log_idx()) - raft_instance->forceReconfigure(state_manager->load_config()); + + auto config = state_manager->load_config(); + if (log_idx == config->get_log_idx()) + { + raft_instance->forceReconfigure(config); + + // Single node cluster doesn't need to wait for any other nodes + // so we can finish recovering immediately after applying + // new configuration + if (config->get_servers().size() == 1) + finish_recovering(); + } + break; } case nuraft::cb_func::ProcessReq: diff --git a/src/Core/Block.cpp b/src/Core/Block.cpp index 33f5095d385..34e6c08c718 100644 --- a/src/Core/Block.cpp +++ b/src/Core/Block.cpp @@ -601,6 +601,15 @@ NamesAndTypesList Block::getNamesAndTypesList() const return res; } +NamesAndTypes Block::getNamesAndTypes() const +{ + NamesAndTypes res; + + for (const auto & elem : data) + res.emplace_back(elem.name, elem.type); + + return res; +} Names Block::getNames() const { @@ -756,6 +765,17 @@ void Block::updateHash(SipHash & hash) const col.column->updateHashWithValue(row_no, hash); } +Serializations Block::getSerializations() const +{ + Serializations res; + res.reserve(data.size()); + + for (const auto & column : data) + res.push_back(column.type->getDefaultSerialization()); + + return res; +} + void convertToFullIfSparse(Block & block) { for (auto & column : block) diff --git a/src/Core/Block.h b/src/Core/Block.h index 85bbc5005df..5a5458cc8f7 100644 --- a/src/Core/Block.h +++ b/src/Core/Block.h @@ -89,11 +89,14 @@ public: const ColumnsWithTypeAndName & getColumnsWithTypeAndName() const; NamesAndTypesList getNamesAndTypesList() const; + NamesAndTypes getNamesAndTypes() const; Names getNames() const; DataTypes getDataTypes() const; Names getDataTypeNames() const; std::unordered_map getNamesToIndexesMap() const; + Serializations getSerializations() const; + /// Returns number of rows from first column in block, not equal to nullptr. If no columns, returns 0. 
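The KeeperServer change a little further up is a small refactor worth spelling out: the recovery epilogue that used to live inline in the HeartBeat branch is hoisted into a local finish_recovering lambda so the new single-node path in the NewConfig branch can reuse it. A simplified standalone sketch of that shape, with placeholder types instead of the NuRaft API:

#include <iostream>

/// Placeholder event kinds standing in for the nuraft callback types.
enum class Event { HeartBeat, NewConfig };

void onRecoveryEvent(Event type, bool cluster_healthy, int server_count, bool & is_recovering)
{
    /// The shared epilogue is hoisted into a local lambda (capturing by reference)
    /// so both exit paths from recovery mode stay identical.
    const auto finish_recovering = [&]
    {
        std::cout << "Recovery is done.\n";
        is_recovering = false;
    };

    switch (type)
    {
        case Event::HeartBeat:
            if (cluster_healthy)
                finish_recovering();
            break;
        case Event::NewConfig:
            /// A single-node cluster has no peers to wait for, so it can
            /// finish recovering as soon as the new configuration applies.
            if (server_count == 1)
                finish_recovering();
            break;
    }
}

int main()
{
    bool is_recovering = true;
    onRecoveryEvent(Event::NewConfig, /* cluster_healthy = */ false, /* server_count = */ 1, is_recovering);
    return is_recovering ? 1 : 0;
}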
size_t rows() const; diff --git a/src/Core/BlockInfo.cpp b/src/Core/BlockInfo.cpp index e9aee871be1..81064dec733 100644 --- a/src/Core/BlockInfo.cpp +++ b/src/Core/BlockInfo.cpp @@ -65,6 +65,13 @@ void BlockMissingValues::setBit(size_t column_idx, size_t row_idx) mask[row_idx] = true; } +void BlockMissingValues::setBits(size_t column_idx, size_t rows) +{ + RowsBitMask & mask = rows_mask_by_column_id[column_idx]; + mask.resize(rows); + std::fill(mask.begin(), mask.end(), true); +} + const BlockMissingValues::RowsBitMask & BlockMissingValues::getDefaultsBitmask(size_t column_idx) const { static RowsBitMask none; diff --git a/src/Core/BlockInfo.h b/src/Core/BlockInfo.h index 82d790bbc8e..d431303ca39 100644 --- a/src/Core/BlockInfo.h +++ b/src/Core/BlockInfo.h @@ -56,7 +56,10 @@ public: const RowsBitMask & getDefaultsBitmask(size_t column_idx) const; /// Check that we have to replace default value at least in one of columns bool hasDefaultBits(size_t column_idx) const; + /// Set bit for a specified row in a single column. void setBit(size_t column_idx, size_t row_idx); + /// Set bits for all rows in a single column. + void setBits(size_t column_idx, size_t rows); bool empty() const { return rows_mask_by_column_id.empty(); } size_t size() const { return rows_mask_by_column_id.size(); } void clear() { rows_mask_by_column_id.clear(); } diff --git a/src/Core/DecimalFunctions.h b/src/Core/DecimalFunctions.h index f08527ee4d5..331df9aa637 100644 --- a/src/Core/DecimalFunctions.h +++ b/src/Core/DecimalFunctions.h @@ -156,7 +156,7 @@ inline DecimalComponents splitWithScaleMultiplier( using T = typename DecimalType::NativeType; const auto whole = decimal.value / scale_multiplier; auto fractional = decimal.value % scale_multiplier; - if (fractional < T(0)) + if (whole && fractional < T(0)) fractional *= T(-1); return {whole, fractional}; @@ -199,7 +199,7 @@ inline typename DecimalType::NativeType getFractionalPartWithScaleMultiplier( /// Anycase we make modulo before compare to make scale_multiplier > 1 unaffected. T result = decimal.value % scale_multiplier; if constexpr (!keep_sign) - if (result < T(0)) + if (decimal.value / scale_multiplier && result < T(0)) result = -result; return result; diff --git a/src/Core/Defines.h b/src/Core/Defines.h index 4ff48b8ff63..9665a20a397 100644 --- a/src/Core/Defines.h +++ b/src/Core/Defines.h @@ -33,8 +33,6 @@ #define DEFAULT_TEMPORARY_LIVE_VIEW_TIMEOUT_SEC 5 #define DEFAULT_PERIODIC_LIVE_VIEW_REFRESH_SEC 60 -#define DEFAULT_WINDOW_VIEW_CLEAN_INTERVAL_SEC 5 -#define DEFAULT_WINDOW_VIEW_HEARTBEAT_INTERVAL_SEC 15 #define SHOW_CHARS_ON_SYNTAX_ERROR ptrdiff_t(160) #define DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES 3 /// each period reduces the error counter by 2 times diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 2038b385b16..bf9785fcc00 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -435,8 +435,9 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Seconds, live_view_heartbeat_interval, 15, "The heartbeat interval in seconds to indicate live query is alive.", 0) \ M(UInt64, max_live_view_insert_blocks_before_refresh, 64, "Limit maximum number of inserted blocks after which mergeable blocks are dropped and query is re-executed.", 0) \ M(Bool, allow_experimental_window_view, false, "Enable WINDOW VIEW. 
Not mature enough.", 0) \ - M(Seconds, window_view_clean_interval, DEFAULT_WINDOW_VIEW_CLEAN_INTERVAL_SEC, "The clean interval of window view in seconds to free outdated data.", 0) \ - M(Seconds, window_view_heartbeat_interval, DEFAULT_WINDOW_VIEW_HEARTBEAT_INTERVAL_SEC, "The heartbeat interval in seconds to indicate watch query is alive.", 0) \ + M(Seconds, window_view_clean_interval, 60, "The clean interval of window view in seconds to free outdated data.", 0) \ + M(Seconds, window_view_heartbeat_interval, 15, "The heartbeat interval in seconds to indicate watch query is alive.", 0) \ + M(Seconds, wait_for_window_view_fire_signal_timeout, 10, "Timeout for waiting for window view fire signal in event time processing", 0) \ M(UInt64, min_free_disk_space_for_temporary_data, 0, "The minimum disk space to keep while writing temporary data used in external sorting and aggregation.", 0) \ \ M(DefaultDatabaseEngine, default_database_engine, DefaultDatabaseEngine::Atomic, "Default database engine.", 0) \ @@ -591,6 +592,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(String, insert_deduplication_token, "", "If not empty, used for duplicate detection instead of data digest", 0) \ M(Bool, count_distinct_optimization, false, "Rewrite count distinct to subquery of group by", 0) \ M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \ + M(TransactionsWaitCSNMode, wait_changes_become_visible_after_commit_mode, TransactionsWaitCSNMode::WAIT_UNKNOWN, "Wait for committed changes to become actually visible in the latest snapshot", 0) \ M(Bool, throw_if_no_data_to_insert, true, "Enables or disables empty INSERTs, enabled by default", 0) \ M(Bool, compatibility_ignore_auto_increment_in_create_table, false, "Ignore AUTO_INCREMENT keyword in column declaration if true, otherwise return error. It simplifies migration from MySQL", 0) \ // End of COMMON_SETTINGS @@ -636,7 +638,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, output_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n.", 0) \ M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices \\N", 0) \ M(Bool, input_format_csv_arrays_as_nested_csv, false, R"(When reading Array from CSV, expect that its elements were serialized in nested CSV and then put into string. Example: "[""Hello"", ""world"", ""42"""" TV""]". 
Braces around array can be omitted.)", 0) \ - M(Bool, input_format_skip_unknown_fields, false, "Skip columns with unknown names from input data (it works for JSONEachRow, -WithNames, -WithNamesAndTypes and TSKV formats).", 0) \ + M(Bool, input_format_skip_unknown_fields, true, "Skip columns with unknown names from input data (it works for JSONEachRow, -WithNames, -WithNamesAndTypes and TSKV formats).", 0) \ M(Bool, input_format_with_names_use_header, true, "For -WithNames input formats this controls whether format parser is to assume that column data appear in the input exactly as they are specified in the header.", 0) \ M(Bool, input_format_with_types_use_header, true, "For -WithNamesAndTypes input formats this controls whether format parser should check if data types from the input match data types from the header.", 0) \ M(Bool, input_format_import_nested_json, false, "Map nested JSON data to nested tables (it works for JSONEachRow format).", 0) \ @@ -699,6 +701,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, output_format_pretty_color, true, "Use ANSI escape sequences to paint colors in Pretty formats", 0) \ M(String, output_format_pretty_grid_charset, "UTF-8", "Charset for printing grid borders. Available charsets: ASCII, UTF-8 (default one).", 0) \ M(UInt64, output_format_parquet_row_group_size, 1000000, "Row group size in rows.", 0) \ + M(Bool, output_format_parquet_string_as_string, false, "Use Parquet String type instead of Binary for String columns.", 0) \ M(String, output_format_avro_codec, "", "Compression codec used for output. Possible values: 'null', 'deflate', 'snappy'.", 0) \ M(UInt64, output_format_avro_sync_interval, 16 * 1024, "Sync interval in bytes.", 0) \ M(String, output_format_avro_string_column_pattern, "", "For Avro format: regexp of String columns to select as AVRO string.", 0) \ @@ -736,6 +739,9 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(UInt64, cross_to_inner_join_rewrite, 1, "Use inner join instead of comma/cross join if possible. 
Possible values: 0 - no rewrite, 1 - apply if possible, 2 - force rewrite all cross joins", 0) \ \ M(Bool, output_format_arrow_low_cardinality_as_dictionary, false, "Enable output LowCardinality type as Dictionary Arrow type", 0) \ + M(Bool, output_format_arrow_string_as_string, false, "Use Arrow String type instead of Binary for String columns", 0) \ + \ + M(Bool, output_format_orc_string_as_string, false, "Use ORC String type instead of Binary for String columns", 0) \ \ M(EnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::EnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0) \ \ diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp index a37c1e9be86..bff1971bad9 100644 --- a/src/Core/SettingsEnums.cpp +++ b/src/Core/SettingsEnums.cpp @@ -131,6 +131,11 @@ IMPLEMENT_SETTING_ENUM(ShortCircuitFunctionEvaluation, ErrorCodes::BAD_ARGUMENTS {"force_enable", ShortCircuitFunctionEvaluation::FORCE_ENABLE}, {"disable", ShortCircuitFunctionEvaluation::DISABLE}}) +IMPLEMENT_SETTING_ENUM(TransactionsWaitCSNMode, ErrorCodes::BAD_ARGUMENTS, + {{"async", TransactionsWaitCSNMode::ASYNC}, + {"wait", TransactionsWaitCSNMode::WAIT}, + {"wait_unknown", TransactionsWaitCSNMode::WAIT_UNKNOWN}}) + IMPLEMENT_SETTING_ENUM(EnumComparingMode, ErrorCodes::BAD_ARGUMENTS, {{"by_names", FormatSettings::EnumComparingMode::BY_NAMES}, {"by_values", FormatSettings::EnumComparingMode::BY_VALUES}, diff --git a/src/Core/SettingsEnums.h b/src/Core/SettingsEnums.h index 08091da6d6c..83a65f2a320 100644 --- a/src/Core/SettingsEnums.h +++ b/src/Core/SettingsEnums.h @@ -183,6 +183,15 @@ enum class ShortCircuitFunctionEvaluation DECLARE_SETTING_ENUM(ShortCircuitFunctionEvaluation) +enum class TransactionsWaitCSNMode +{ + ASYNC, + WAIT, + WAIT_UNKNOWN, +}; + +DECLARE_SETTING_ENUM(TransactionsWaitCSNMode) + DECLARE_SETTING_ENUM_WITH_RENAME(EnumComparingMode, FormatSettings::EnumComparingMode) DECLARE_SETTING_ENUM_WITH_RENAME(EscapingRule, FormatSettings::EscapingRule) diff --git a/src/Core/tests/gtest_DecimalFunctions.cpp b/src/Core/tests/gtest_DecimalFunctions.cpp index 7517edda937..1712785488e 100644 --- a/src/Core/tests/gtest_DecimalFunctions.cpp +++ b/src/Core/tests/gtest_DecimalFunctions.cpp @@ -176,7 +176,7 @@ INSTANTIATE_TEST_SUITE_P(Basic, } }, { - "When scale is not 0 and whole part is 0.", + "For positive Decimal value, with scale not 0, and whole part is 0.", 123, 3, { @@ -184,6 +184,16 @@ INSTANTIATE_TEST_SUITE_P(Basic, 123 } }, + { + "For negative Decimal value, with scale not 0, and whole part is 0.", + -123, + 3, + { + 0, + -123 + } + }, + { "For negative Decimal value whole part is negative, fractional is non-negative.", -1234567'89, @@ -216,6 +226,24 @@ INSTANTIATE_TEST_SUITE_P(Basic, 187618332, 123 } + }, + { + "Negative timestamp 1969-12-31 23:59:59.123 UTC", + DateTime64(-877), + 3, + { + 0, + -877 + } + }, + { + "Positive timestamp 1970-01-01 00:00:00.123 UTC", + DateTime64(123), + 3, + { + 0, + 123 + } } }) ); diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 1875caf1855..5aab8909a0c 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include @@ -71,7 +71,7 @@ String escapingRuleToString(FormatSettings::EscapingRule escaping_rule) void skipFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings) { - String tmp; + NullOutput out; constexpr const 
char * field_name = ""; constexpr size_t field_name_len = 16; switch (escaping_rule) @@ -80,19 +80,19 @@ void skipFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule esca /// Empty field, just skip spaces break; case FormatSettings::EscapingRule::Escaped: - readEscapedString(tmp, buf); + readEscapedStringInto(out, buf); break; case FormatSettings::EscapingRule::Quoted: - readQuotedFieldIntoString(tmp, buf); + readQuotedFieldInto(out, buf); break; case FormatSettings::EscapingRule::CSV: - readCSVString(tmp, buf, format_settings.csv); + readCSVStringInto(out, buf, format_settings.csv); break; case FormatSettings::EscapingRule::JSON: skipJSONField(buf, StringRef(field_name, field_name_len)); break; case FormatSettings::EscapingRule::Raw: - readString(tmp, buf); + readStringInto(out, buf); break; default: __builtin_unreachable(); @@ -219,13 +219,13 @@ String readByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escapin if constexpr (read_string) readQuotedString(result, buf); else - readQuotedFieldIntoString(result, buf); + readQuotedField(result, buf); break; case FormatSettings::EscapingRule::JSON: if constexpr (read_string) readJSONString(result, buf); else - readJSONFieldIntoString(result, buf); + readJSONField(result, buf); break; case FormatSettings::EscapingRule::Raw: readString(result, buf); @@ -452,7 +452,7 @@ DataTypePtr determineDataTypeByEscapingRule(const String & field, const FormatSe return buf.eof() ? type : nullptr; } case FormatSettings::EscapingRule::JSON: - return getDataTypeFromJSONField(field); + return JSONUtils::getDataTypeFromField(field); case FormatSettings::EscapingRule::CSV: { if (!format_settings.csv.input_format_use_best_effort_in_schema_inference) diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 96b52cd2423..644e4d3ecfd 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -99,6 +99,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.parquet.case_insensitive_column_matching = settings.input_format_parquet_case_insensitive_column_matching; format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns; format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference; + format_settings.parquet.output_string_as_string = settings.output_format_parquet_string_as_string; format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? 
FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8; format_settings.pretty.color = settings.output_format_pretty_color; format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width; @@ -132,17 +133,19 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.arrow.import_nested = settings.input_format_arrow_import_nested; format_settings.arrow.allow_missing_columns = settings.input_format_arrow_allow_missing_columns; format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference; + format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference; + format_settings.arrow.case_insensitive_column_matching = settings.input_format_arrow_case_insensitive_column_matching; + format_settings.arrow.output_string_as_string = settings.output_format_arrow_string_as_string; format_settings.orc.import_nested = settings.input_format_orc_import_nested; format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns; format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size; format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference; - format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference; - format_settings.arrow.case_insensitive_column_matching = settings.input_format_arrow_case_insensitive_column_matching; format_settings.orc.import_nested = settings.input_format_orc_import_nested; format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns; format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size; format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_orc_skip_columns_with_unsupported_types_in_schema_inference; format_settings.orc.case_insensitive_column_matching = settings.input_format_orc_case_insensitive_column_matching; + format_settings.orc.output_string_as_string = settings.output_format_orc_string_as_string; format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields; format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode; format_settings.seekable_read = settings.input_format_allow_seeks; @@ -538,19 +541,19 @@ void FormatFactory::markOutputFormatSupportsParallelFormatting(const String & na } -void FormatFactory::markFormatAsColumnOriented(const String & name) +void FormatFactory::markFormatSupportsSubsetOfColumns(const String & name) { - auto & target = dict[name].is_column_oriented; + auto & target = dict[name].supports_subset_of_columns; if (target) - throw Exception("FormatFactory: Format " + name + " is already marked as column oriented", ErrorCodes::LOGICAL_ERROR); + throw Exception("FormatFactory: Format " + name + " is already marked as supporting subset of columns", ErrorCodes::LOGICAL_ERROR); target = true; } -bool FormatFactory::checkIfFormatIsColumnOriented(const String & name) +bool FormatFactory::checkIfFormatSupportsSubsetOfColumns(const String & name) const { const auto & target = getCreators(name); - return target.is_column_oriented; + return 
target.supports_subset_of_columns; } bool FormatFactory::isInputFormat(const String & name) const @@ -565,19 +568,19 @@ bool FormatFactory::isOutputFormat(const String & name) const return it != dict.end() && it->second.output_creator; } -bool FormatFactory::checkIfFormatHasSchemaReader(const String & name) +bool FormatFactory::checkIfFormatHasSchemaReader(const String & name) const { const auto & target = getCreators(name); return bool(target.schema_reader_creator); } -bool FormatFactory::checkIfFormatHasExternalSchemaReader(const String & name) +bool FormatFactory::checkIfFormatHasExternalSchemaReader(const String & name) const { const auto & target = getCreators(name); return bool(target.external_schema_reader_creator); } -bool FormatFactory::checkIfFormatHasAnySchemaReader(const String & name) +bool FormatFactory::checkIfFormatHasAnySchemaReader(const String & name) const { return checkIfFormatHasSchemaReader(name) || checkIfFormatHasExternalSchemaReader(name); } diff --git a/src/Formats/FormatFactory.h b/src/Formats/FormatFactory.h index f7d3c23d3b4..8e949a3e367 100644 --- a/src/Formats/FormatFactory.h +++ b/src/Formats/FormatFactory.h @@ -108,7 +108,7 @@ private: SchemaReaderCreator schema_reader_creator; ExternalSchemaReaderCreator external_schema_reader_creator; bool supports_parallel_formatting{false}; - bool is_column_oriented{false}; + bool supports_subset_of_columns{false}; NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker; AppendSupportChecker append_support_checker; }; @@ -194,13 +194,13 @@ public: void registerExternalSchemaReader(const String & name, ExternalSchemaReaderCreator external_schema_reader_creator); void markOutputFormatSupportsParallelFormatting(const String & name); - void markFormatAsColumnOriented(const String & name); + void markFormatSupportsSubsetOfColumns(const String & name); - bool checkIfFormatIsColumnOriented(const String & name); + bool checkIfFormatSupportsSubsetOfColumns(const String & name) const; - bool checkIfFormatHasSchemaReader(const String & name); - bool checkIfFormatHasExternalSchemaReader(const String & name); - bool checkIfFormatHasAnySchemaReader(const String & name); + bool checkIfFormatHasSchemaReader(const String & name) const; + bool checkIfFormatHasExternalSchemaReader(const String & name) const; + bool checkIfFormatHasAnySchemaReader(const String & name) const; const FormatsDictionary & getAllFormats() const { diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 4f77fe099e1..e6f0a7d229e 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -81,6 +81,7 @@ struct FormatSettings bool allow_missing_columns = false; bool skip_columns_with_unsupported_types_in_schema_inference = false; bool case_insensitive_column_matching = false; + bool output_string_as_string = false; } arrow; struct @@ -148,6 +149,7 @@ struct FormatSettings bool skip_columns_with_unsupported_types_in_schema_inference = false; bool case_insensitive_column_matching = false; std::unordered_set skip_row_groups = {}; + bool output_string_as_string = false; } parquet; struct Pretty @@ -234,6 +236,7 @@ struct FormatSettings bool skip_columns_with_unsupported_types_in_schema_inference = false; bool case_insensitive_column_matching = false; std::unordered_set skip_stripes = {}; + bool output_string_as_string = false; } orc; /// For capnProto format we should determine how to diff --git a/src/Formats/JSONEachRowUtils.cpp b/src/Formats/JSONEachRowUtils.cpp deleted file mode 100644 index 
534237c900c..00000000000 --- a/src/Formats/JSONEachRowUtils.cpp +++ /dev/null @@ -1,387 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace DB -{ -namespace ErrorCodes -{ - extern const int INCORRECT_DATA; - extern const int LOGICAL_ERROR; -} - -template -static std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, size_t min_rows) -{ - skipWhitespaceIfAny(in); - - char * pos = in.position(); - size_t balance = 0; - bool quotes = false; - size_t number_of_rows = 0; - - while (loadAtPosition(in, memory, pos) && (balance || memory.size() + static_cast(pos - in.position()) < min_chunk_size || number_of_rows < min_rows)) - { - const auto current_object_size = memory.size() + static_cast(pos - in.position()); - if (min_chunk_size != 0 && current_object_size > 10 * min_chunk_size) - throw ParsingException("Size of JSON object is extremely large. Expected not greater than " + - std::to_string(min_chunk_size) + " bytes, but current is " + std::to_string(current_object_size) + - " bytes per row. Increase the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually, most likely JSON is malformed", ErrorCodes::INCORRECT_DATA); - - if (quotes) - { - pos = find_first_symbols<'\\', '"'>(pos, in.buffer().end()); - - if (pos > in.buffer().end()) - throw Exception("Position in buffer is out of bounds. There must be a bug.", ErrorCodes::LOGICAL_ERROR); - else if (pos == in.buffer().end()) - continue; - - if (*pos == '\\') - { - ++pos; - if (loadAtPosition(in, memory, pos)) - ++pos; - } - else if (*pos == '"') - { - ++pos; - quotes = false; - } - } - else - { - pos = find_first_symbols(pos, in.buffer().end()); - - if (pos > in.buffer().end()) - throw Exception("Position in buffer is out of bounds. There must be a bug.", ErrorCodes::LOGICAL_ERROR); - else if (pos == in.buffer().end()) - continue; - - else if (*pos == opening_bracket) - { - ++balance; - ++pos; - } - else if (*pos == closing_bracket) - { - --balance; - ++pos; - } - else if (*pos == '\\') - { - ++pos; - if (loadAtPosition(in, memory, pos)) - ++pos; - } - else if (*pos == '"') - { - quotes = true; - ++pos; - } - - if (balance == 0) - ++number_of_rows; - } - } - - saveUpToPosition(in, memory, pos); - return {loadAtPosition(in, memory, pos), number_of_rows}; -} - -template -static String readJSONEachRowLineIntoStringImpl(ReadBuffer & in) -{ - Memory memory; - fileSegmentationEngineJSONEachRowImpl(in, memory, 0, 1); - return String(memory.data(), memory.size()); -} - -template -DataTypePtr getDataTypeFromJSONFieldImpl(const Element & field) -{ - if (field.isNull()) - return nullptr; - - if (field.isBool()) - return DataTypeFactory::instance().get("Nullable(Bool)"); - - if (field.isInt64() || field.isUInt64() || field.isDouble()) - return makeNullable(std::make_shared()); - - if (field.isString()) - return makeNullable(std::make_shared()); - - if (field.isArray()) - { - auto array = field.getArray(); - - /// Return nullptr in case of empty array because we cannot determine nested type. - if (array.size() == 0) - return nullptr; - - DataTypes nested_data_types; - /// If this array contains fields with different types we will treat it as Tuple. 
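/// An illustrative aside (hypothetical calls, assuming the numeric branch above infers Nullable(Float64), as in upstream ClickHouse):
///     getDataTypeFromJSONField("[1, 2, 3]")->getName();
///     /// -> "Array(Nullable(Float64))" : all elements share one type
///     getDataTypeFromJSONField("[1, \"a\"]")->getName();
///     /// -> "Tuple(Nullable(Float64), Nullable(String))" : mixed element types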
- bool is_tuple = false; - for (const auto element : array) - { - auto type = getDataTypeFromJSONFieldImpl(element); - if (!type) - return nullptr; - - if (!nested_data_types.empty() && type->getName() != nested_data_types.back()->getName()) - is_tuple = true; - - nested_data_types.push_back(std::move(type)); - } - - if (is_tuple) - return std::make_shared(nested_data_types); - - return std::make_shared(nested_data_types.back()); - } - - if (field.isObject()) - { - auto object = field.getObject(); - DataTypePtr value_type; - bool is_object = false; - for (const auto key_value_pair : object) - { - auto type = getDataTypeFromJSONFieldImpl(key_value_pair.second); - if (!type) - continue; - - if (isObject(type)) - { - is_object = true; - break; - } - - if (!value_type) - { - value_type = type; - } - else if (!value_type->equals(*type)) - { - is_object = true; - break; - } - } - - if (is_object) - return std::make_shared("json", true); - - if (value_type) - return std::make_shared(std::make_shared(), value_type); - - return nullptr; - } - - throw Exception{ErrorCodes::INCORRECT_DATA, "Unexpected JSON type"}; -} - -auto getJSONParserAndElement() -{ -#if USE_SIMDJSON - return std::pair(); -#elif USE_RAPIDJSON - return std::pair(); -#else - return std::pair(); -#endif -} - -DataTypePtr getDataTypeFromJSONField(const String & field) -{ - auto [parser, element] = getJSONParserAndElement(); - bool parsed = parser.parse(field, element); - if (!parsed) - throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object"); - - return getDataTypeFromJSONFieldImpl(element); -} - -template -static DataTypes determineColumnDataTypesFromJSONEachRowDataImpl(ReadBuffer & in, bool /*json_strings*/, Extractor & extractor) -{ - String line = readJSONEachRowLineIntoStringImpl(in); - auto [parser, element] = getJSONParserAndElement(); - bool parsed = parser.parse(line, element); - if (!parsed) - throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object"); - - auto fields = extractor.extract(element); - - DataTypes data_types; - data_types.reserve(fields.size()); - for (const auto & field : fields) - data_types.push_back(getDataTypeFromJSONFieldImpl(field)); - - /// TODO: For JSONStringsEachRow/JSONCompactStringsEach all types will be strings. - /// Should we try to parse data inside strings somehow in this case? 
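/// For intuition, a minimal hypothetical use of the row-level helper built on top of
/// this function (names and expected types illustrative, following the rules above):
///     ReadBufferFromString in("{\"id\": 1, \"name\": \"abc\"}");
///     auto schema = readRowAndGetNamesAndDataTypesForJSONEachRow(in, /*json_strings=*/ false);
///     /// expected: id -> Nullable(Float64), name -> Nullable(String)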
- - return data_types; -} - -std::pair fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size) -{ - return fileSegmentationEngineJSONEachRowImpl<'{', '}'>(in, memory, min_chunk_size, 1); -} - -std::pair fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, size_t min_rows) -{ - return fileSegmentationEngineJSONEachRowImpl<'[', ']'>(in, memory, min_chunk_size, min_rows); -} - -struct JSONEachRowFieldsExtractor -{ - template - std::vector extract(const Element & element) - { - /// {..., "" : , ...} - - if (!element.isObject()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Root JSON value is not an object"); - - auto object = element.getObject(); - std::vector fields; - fields.reserve(object.size()); - column_names.reserve(object.size()); - for (const auto & key_value_pair : object) - { - column_names.emplace_back(key_value_pair.first); - fields.push_back(key_value_pair.second); - } - - return fields; - } - - std::vector column_names; -}; - -NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings) -{ - JSONEachRowFieldsExtractor extractor; - auto data_types = determineColumnDataTypesFromJSONEachRowDataImpl(in, json_strings, extractor); - NamesAndTypesList result; - for (size_t i = 0; i != extractor.column_names.size(); ++i) - result.emplace_back(extractor.column_names[i], data_types[i]); - return result; -} - -struct JSONCompactEachRowFieldsExtractor -{ - template - std::vector extract(const Element & element) - { - /// [..., , ...] - if (!element.isArray()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Root JSON value is not an array"); - - auto array = element.getArray(); - std::vector fields; - fields.reserve(array.size()); - for (size_t i = 0; i != array.size(); ++i) - fields.push_back(array[i]); - return fields; - } -}; - -DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, bool json_strings) -{ - JSONCompactEachRowFieldsExtractor extractor; - return determineColumnDataTypesFromJSONEachRowDataImpl(in, json_strings, extractor); -} - - -bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf) -{ - /// For JSONEachRow we can safely skip whitespace characters - skipWhitespaceIfAny(buf); - return buf.eof() || *buf.position() == '['; -} - -bool readFieldImpl(ReadBuffer & in, IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, const String & column_name, const FormatSettings & format_settings, bool yield_strings) -{ - try - { - bool as_nullable = format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable(); - - if (yield_strings) - { - String str; - readJSONString(str, in); - - ReadBufferFromString buf(str); - - if (as_nullable) - return SerializationNullable::deserializeWholeTextImpl(column, buf, format_settings, serialization); - - serialization->deserializeWholeText(column, buf, format_settings); - return true; - } - - if (as_nullable) - return SerializationNullable::deserializeTextJSONImpl(column, in, format_settings, serialization); - - serialization->deserializeTextJSON(column, in, format_settings); - return true; - } - catch (Exception & e) - { - e.addMessage("(while reading the value of key " + column_name + ")"); - throw; - } -} - -DataTypePtr getCommonTypeForJSONFormats(const DataTypePtr & first, const DataTypePtr & second, bool allow_bools_as_numbers) -{ - if (allow_bools_as_numbers) - { - auto not_nullable_first = removeNullable(first); - auto 
not_nullable_second = removeNullable(second); - /// Check if we have Bool and Number and if so make the result type Number - bool bool_type_presents = isBool(not_nullable_first) || isBool(not_nullable_second); - bool number_type_presents = isNumber(not_nullable_first) || isNumber(not_nullable_second); - if (bool_type_presents && number_type_presents) - { - if (isBool(not_nullable_first)) - return second; - return first; - } - } - - /// If we have Map and Object, make result type Object - bool object_type_presents = isObject(first) || isObject(second); - bool map_type_presents = isMap(first) || isMap(second); - if (object_type_presents && map_type_presents) - { - if (isObject(first)) - return first; - return second; - } - - /// If we have different Maps, make result type Object - if (isMap(first) && isMap(second) && !first->equals(*second)) - return std::make_shared("json", true); - - return nullptr; -} - -} diff --git a/src/Formats/JSONEachRowUtils.h b/src/Formats/JSONEachRowUtils.h deleted file mode 100644 index 46c343f356a..00000000000 --- a/src/Formats/JSONEachRowUtils.h +++ /dev/null @@ -1,37 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -std::pair fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size); -std::pair fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, size_t min_rows); - - -/// Parse JSON from string and convert it's type to ClickHouse type. Make the result type always Nullable. -/// JSON array with different nested types is treated as Tuple. -/// If cannot convert (for example when field contains null), return nullptr. -DataTypePtr getDataTypeFromJSONField(const String & field); - -/// Read row in JSONEachRow format and try to determine type for each field. -/// Return list of names and types. -/// If cannot determine the type of some field, return nullptr for it. -NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings); - -/// Read row in JSONCompactEachRow format and try to determine type for each field. -/// If cannot determine the type of some field, return nullptr for it. 
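/// An aside on getCommonTypeForJSONFormats, declared below (a hedged reading of the
/// implementation above): with allow_bools_as_numbers set, Bool vs. Number resolves to
/// the Number type; Map vs. Object resolves to the Object type; two unequal Map types
/// resolve to the JSON Object type; any other pair yields nullptr. E.g., for
/// hypothetical DataTypePtr values bool_type and number_type:
///     getCommonTypeForJSONFormats(bool_type, number_type, true);   /// -> number_type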
-DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, bool json_strings); - -bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf); - -bool readFieldImpl(ReadBuffer & in, IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, const String & column_name, const FormatSettings & format_settings, bool yield_strings); - -DataTypePtr getCommonTypeForJSONFormats(const DataTypePtr & first, const DataTypePtr & second, bool allow_bools_as_numbers); - -} diff --git a/src/Formats/JSONUtils.cpp b/src/Formats/JSONUtils.cpp new file mode 100644 index 00000000000..1ac58760516 --- /dev/null +++ b/src/Formats/JSONUtils.cpp @@ -0,0 +1,603 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int INCORRECT_DATA; + extern const int LOGICAL_ERROR; +} + +namespace JSONUtils +{ + + template + static std::pair + fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, size_t min_rows) + { + skipWhitespaceIfAny(in); + + char * pos = in.position(); + size_t balance = 0; + bool quotes = false; + size_t number_of_rows = 0; + + while (loadAtPosition(in, memory, pos) + && (balance || memory.size() + static_cast(pos - in.position()) < min_chunk_size || number_of_rows < min_rows)) + { + const auto current_object_size = memory.size() + static_cast(pos - in.position()); + if (min_chunk_size != 0 && current_object_size > 10 * min_chunk_size) + throw ParsingException( + "Size of JSON object is extremely large. Expected not greater than " + std::to_string(min_chunk_size) + + " bytes, but current is " + std::to_string(current_object_size) + + " bytes per row. Increase the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually, most likely JSON is malformed", + ErrorCodes::INCORRECT_DATA); + + if (quotes) + { + pos = find_first_symbols<'\\', '"'>(pos, in.buffer().end()); + + if (pos > in.buffer().end()) + throw Exception("Position in buffer is out of bounds. There must be a bug.", ErrorCodes::LOGICAL_ERROR); + else if (pos == in.buffer().end()) + continue; + + if (*pos == '\\') + { + ++pos; + if (loadAtPosition(in, memory, pos)) + ++pos; + } + else if (*pos == '"') + { + ++pos; + quotes = false; + } + } + else + { + pos = find_first_symbols(pos, in.buffer().end()); + + if (pos > in.buffer().end()) + throw Exception("Position in buffer is out of bounds. 
There must be a bug.", ErrorCodes::LOGICAL_ERROR); + else if (pos == in.buffer().end()) + continue; + + else if (*pos == opening_bracket) + { + ++balance; + ++pos; + } + else if (*pos == closing_bracket) + { + --balance; + ++pos; + } + else if (*pos == '\\') + { + ++pos; + if (loadAtPosition(in, memory, pos)) + ++pos; + } + else if (*pos == '"') + { + quotes = true; + ++pos; + } + + if (balance == 0) + ++number_of_rows; + } + } + + saveUpToPosition(in, memory, pos); + return {loadAtPosition(in, memory, pos), number_of_rows}; + } + + template + static String readJSONEachRowLineIntoStringImpl(ReadBuffer & in) + { + Memory memory; + fileSegmentationEngineJSONEachRowImpl(in, memory, 0, 1); + return String(memory.data(), memory.size()); + } + + template + DataTypePtr getDataTypeFromFieldImpl(const Element & field) + { + if (field.isNull()) + return nullptr; + + if (field.isBool()) + return DataTypeFactory::instance().get("Nullable(Bool)"); + + if (field.isInt64() || field.isUInt64() || field.isDouble()) + return makeNullable(std::make_shared()); + + if (field.isString()) + return makeNullable(std::make_shared()); + + if (field.isArray()) + { + auto array = field.getArray(); + + /// Return nullptr in case of empty array because we cannot determine nested type. + if (array.size() == 0) + return nullptr; + + DataTypes nested_data_types; + /// If this array contains fields with different types we will treat it as Tuple. + bool is_tuple = false; + for (const auto element : array) + { + auto type = getDataTypeFromFieldImpl(element); + if (!type) + return nullptr; + + if (!nested_data_types.empty() && type->getName() != nested_data_types.back()->getName()) + is_tuple = true; + + nested_data_types.push_back(std::move(type)); + } + + if (is_tuple) + return std::make_shared(nested_data_types); + + return std::make_shared(nested_data_types.back()); + } + + if (field.isObject()) + { + auto object = field.getObject(); + DataTypePtr value_type; + bool is_object = false; + for (const auto key_value_pair : object) + { + auto type = getDataTypeFromFieldImpl(key_value_pair.second); + if (!type) + continue; + + if (isObject(type)) + { + is_object = true; + break; + } + + if (!value_type) + { + value_type = type; + } + else if (!value_type->equals(*type)) + { + is_object = true; + break; + } + } + + if (is_object) + return std::make_shared("json", true); + + if (value_type) + return std::make_shared(std::make_shared(), value_type); + + return nullptr; + } + + throw Exception{ErrorCodes::INCORRECT_DATA, "Unexpected JSON type"}; + } + + auto getJSONParserAndElement() + { +#if USE_SIMDJSON + return std::pair(); +#elif USE_RAPIDJSON + return std::pair(); +#else + return std::pair(); +#endif + } + + DataTypePtr getDataTypeFromField(const String & field) + { + auto [parser, element] = getJSONParserAndElement(); + bool parsed = parser.parse(field, element); + if (!parsed) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object here: {}", field); + + return getDataTypeFromFieldImpl(element); + } + + template + static DataTypes determineColumnDataTypesFromJSONEachRowDataImpl(ReadBuffer & in, bool /*json_strings*/, Extractor & extractor) + { + String line = readJSONEachRowLineIntoStringImpl(in); + auto [parser, element] = getJSONParserAndElement(); + bool parsed = parser.parse(line, element); + if (!parsed) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON object here: {}", line); + + auto fields = extractor.extract(element); + + DataTypes data_types; + data_types.reserve(fields.size()); + for 
(const auto & field : fields) + data_types.push_back(getDataTypeFromFieldImpl(field)); + + /// TODO: For JSONStringsEachRow/JSONCompactStringsEach all types will be strings. + /// Should we try to parse data inside strings somehow in this case? + + return data_types; + } + + std::pair fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size) + { + return fileSegmentationEngineJSONEachRowImpl<'{', '}'>(in, memory, min_chunk_size, 1); + } + + std::pair + fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, size_t min_rows) + { + return fileSegmentationEngineJSONEachRowImpl<'[', ']'>(in, memory, min_chunk_size, min_rows); + } + + struct JSONEachRowFieldsExtractor + { + template + std::vector extract(const Element & element) + { + /// {..., "" : , ...} + + if (!element.isObject()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Root JSON value is not an object"); + + auto object = element.getObject(); + std::vector fields; + fields.reserve(object.size()); + column_names.reserve(object.size()); + for (const auto & key_value_pair : object) + { + column_names.emplace_back(key_value_pair.first); + fields.push_back(key_value_pair.second); + } + + return fields; + } + + std::vector column_names; + }; + + NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings) + { + JSONEachRowFieldsExtractor extractor; + auto data_types + = determineColumnDataTypesFromJSONEachRowDataImpl(in, json_strings, extractor); + NamesAndTypesList result; + for (size_t i = 0; i != extractor.column_names.size(); ++i) + result.emplace_back(extractor.column_names[i], data_types[i]); + return result; + } + + struct JSONCompactEachRowFieldsExtractor + { + template + std::vector extract(const Element & element) + { + /// [..., , ...] 
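/// A hedged sketch of the compact layout handled here: for an input row such as
/// [1, "a"], the extractor returns the elements in order, so a caller like
///     ReadBufferFromString in("[1, \"a\"]");
///     auto types = readRowAndGetDataTypesForJSONCompactEachRow(in, false);
/// would infer types along the lines of {Nullable(Float64), Nullable(String)}.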
+ if (!element.isArray()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Root JSON value is not an array"); + + auto array = element.getArray(); + std::vector fields; + fields.reserve(array.size()); + for (size_t i = 0; i != array.size(); ++i) + fields.push_back(array[i]); + return fields; + } + }; + + DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, bool json_strings) + { + JSONCompactEachRowFieldsExtractor extractor; + return determineColumnDataTypesFromJSONEachRowDataImpl(in, json_strings, extractor); + } + + + bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf) + { + /// For JSONEachRow we can safely skip whitespace characters + skipWhitespaceIfAny(buf); + return buf.eof() || *buf.position() == '['; + } + + bool readField( + ReadBuffer & in, + IColumn & column, + const DataTypePtr & type, + const SerializationPtr & serialization, + const String & column_name, + const FormatSettings & format_settings, + bool yield_strings) + { + try + { + bool as_nullable = format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable(); + + if (yield_strings) + { + String str; + readJSONString(str, in); + + ReadBufferFromString buf(str); + + if (as_nullable) + return SerializationNullable::deserializeWholeTextImpl(column, buf, format_settings, serialization); + + serialization->deserializeWholeText(column, buf, format_settings); + return true; + } + + if (as_nullable) + return SerializationNullable::deserializeTextJSONImpl(column, in, format_settings, serialization); + + serialization->deserializeTextJSON(column, in, format_settings); + return true; + } + catch (Exception & e) + { + e.addMessage("(while reading the value of key " + column_name + ")"); + throw; + } + } + + DataTypePtr getCommonTypeForJSONFormats(const DataTypePtr & first, const DataTypePtr & second, bool allow_bools_as_numbers) + { + if (allow_bools_as_numbers) + { + auto not_nullable_first = removeNullable(first); + auto not_nullable_second = removeNullable(second); + /// Check if we have Bool and Number and if so make the result type Number + bool bool_type_presents = isBool(not_nullable_first) || isBool(not_nullable_second); + bool number_type_presents = isNumber(not_nullable_first) || isNumber(not_nullable_second); + if (bool_type_presents && number_type_presents) + { + if (isBool(not_nullable_first)) + return second; + return first; + } + } + + /// If we have Map and Object, make result type Object + bool object_type_presents = isObject(first) || isObject(second); + bool map_type_presents = isMap(first) || isMap(second); + if (object_type_presents && map_type_presents) + { + if (isObject(first)) + return first; + return second; + } + + /// If we have different Maps, make result type Object + if (isMap(first) && isMap(second) && !first->equals(*second)) + return std::make_shared("json", true); + + return nullptr; + } + + void writeFieldDelimiter(WriteBuffer & out, size_t new_lines) + { + writeChar(',', out); + writeChar('\n', new_lines, out); + } + + void writeFieldCompactDelimiter(WriteBuffer & out) { writeCString(", ", out); } + + template + void writeTitle(const char * title, WriteBuffer & out, size_t indent) + { + writeChar('\t', indent, out); + writeChar('"', out); + writeCString(title, out); + if constexpr (with_space) + writeCString("\": ", out); + else + writeCString("\":\n", out); + } + + void writeObjectStart(WriteBuffer & out, size_t indent, const char * title) + { + if (title) + writeTitle(title, out, indent); + writeChar('\t', indent, out); + 
writeCString("{\n", out); + } + + void writeObjectEnd(WriteBuffer & out, size_t indent) + { + writeChar('\n', out); + writeChar('\t', indent, out); + writeChar('}', out); + } + + void writeArrayStart(WriteBuffer & out, size_t indent, const char * title) + { + if (title) + writeTitle(title, out, indent); + writeChar('\t', indent, out); + writeCString("[\n", out); + } + + void writeCompactArrayStart(WriteBuffer & out, size_t indent, const char * title) + { + if (title) + writeTitle(title, out, indent); + else + writeChar('\t', indent, out); + writeCString("[", out); + } + + void writeArrayEnd(WriteBuffer & out, size_t indent) + { + writeChar('\n', out); + writeChar('\t', indent, out); + writeChar(']', out); + } + + void writeCompactArrayEnd(WriteBuffer & out) { writeChar(']', out); } + + void writeFieldFromColumn( + const IColumn & column, + const ISerialization & serialization, + size_t row_num, + bool yield_strings, + const FormatSettings & settings, + WriteBuffer & out, + const std::optional & name, + size_t indent) + { + if (name.has_value()) + writeTitle(name->data(), out, indent); + + if (yield_strings) + { + WriteBufferFromOwnString buf; + + serialization.serializeText(column, row_num, buf, settings); + writeJSONString(buf.str(), out, settings); + } + else + serialization.serializeTextJSON(column, row_num, out, settings); + } + + void writeColumns( + const Columns & columns, + const NamesAndTypes & fields, + const Serializations & serializations, + size_t row_num, + bool yield_strings, + const FormatSettings & settings, + WriteBuffer & out, + size_t indent) + { + for (size_t i = 0; i < columns.size(); ++i) + { + if (i != 0) + writeFieldDelimiter(out); + writeFieldFromColumn(*columns[i], *serializations[i], row_num, yield_strings, settings, out, fields[i].name, indent); + } + } + + void writeCompactColumns( + const Columns & columns, + const Serializations & serializations, + size_t row_num, + bool yield_strings, + const FormatSettings & settings, + WriteBuffer & out) + { + for (size_t i = 0; i < columns.size(); ++i) + { + if (i != 0) + writeFieldCompactDelimiter(out); + writeFieldFromColumn(*columns[i], *serializations[i], row_num, yield_strings, settings, out); + } + } + + void writeMetadata(const NamesAndTypes & fields, const FormatSettings & settings, WriteBuffer & out) + { + writeArrayStart(out, 1, "meta"); + + for (size_t i = 0; i < fields.size(); ++i) + { + writeObjectStart(out, 2); + + writeTitle("name", out, 3); + writeDoubleQuoted(fields[i].name, out); + writeFieldDelimiter(out); + writeTitle("type", out, 3); + writeJSONString(fields[i].type->getName(), out, settings); + writeObjectEnd(out, 2); + + if (i + 1 < fields.size()) + writeFieldDelimiter(out); + } + + writeArrayEnd(out, 1); + } + + void writeAdditionalInfo( + size_t rows, + size_t rows_before_limit, + bool applied_limit, + const Stopwatch & watch, + const Progress & progress, + bool write_statistics, + WriteBuffer & out) + { + writeFieldDelimiter(out, 2); + writeTitle("rows", out, 1); + writeIntText(rows, out); + + if (applied_limit) + { + writeFieldDelimiter(out, 2); + writeTitle("rows_before_limit_at_least", out, 1); + writeIntText(rows_before_limit, out); + } + + if (write_statistics) + { + writeFieldDelimiter(out, 2); + writeObjectStart(out, 1, "statistics"); + + writeTitle("elapsed", out, 2); + writeText(watch.elapsedSeconds(), out); + writeFieldDelimiter(out); + + writeTitle("rows_read", out, 2); + writeText(progress.read_rows.load(), out); + writeFieldDelimiter(out); + + writeTitle("bytes_read", out, 2); + 
writeText(progress.read_bytes.load(), out); + + writeObjectEnd(out, 1); + } + } + + void makeNamesAndTypesWithValidUTF8(NamesAndTypes & fields, const FormatSettings & settings, bool & need_validate_utf8) + { + for (auto & field : fields) + { + if (!field.type->textCanContainOnlyValidUTF8()) + need_validate_utf8 = true; + + WriteBufferFromOwnString buf; + { + WriteBufferValidUTF8 validating_buf(buf); + writeJSONString(field.name, validating_buf, settings); + } + field.name = buf.str().substr(1, buf.str().size() - 2); + } + } + +} + +} diff --git a/src/Formats/JSONUtils.h b/src/Formats/JSONUtils.h new file mode 100644 index 00000000000..f2aba3cbcb5 --- /dev/null +++ b/src/Formats/JSONUtils.h @@ -0,0 +1,109 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace JSONUtils +{ + std::pair<bool, size_t> fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size); + std::pair<bool, size_t> + fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size, size_t min_rows); + + /// Parse JSON from string and convert its type to ClickHouse type. Make the result type always Nullable. + /// JSON array with different nested types is treated as Tuple. + /// If cannot convert (for example when field contains null), return nullptr. + DataTypePtr getDataTypeFromField(const String & field); + + /// Read row in JSONEachRow format and try to determine type for each field. + /// Return list of names and types. + /// If cannot determine the type of some field, return nullptr for it. + NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings); + + /// Read row in JSONCompactEachRow format and try to determine type for each field. + /// If cannot determine the type of some field, return nullptr for it. + DataTypes readRowAndGetDataTypesForJSONCompactEachRow(ReadBuffer & in, bool json_strings); + + bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf); + + bool readField( + ReadBuffer & in, + IColumn & column, + const DataTypePtr & type, + const SerializationPtr & serialization, + const String & column_name, + const FormatSettings & format_settings, + bool yield_strings); + + DataTypePtr getCommonTypeForJSONFormats(const DataTypePtr & first, const DataTypePtr & second, bool allow_bools_as_numbers); + + void makeNamesAndTypesWithValidUTF8(NamesAndTypes & fields, const FormatSettings & settings, bool & need_validate_utf8); + + /// Helper functions for writing JSON data to WriteBuffer.
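/// As a hedged illustration, output composed from these helpers (writeMetadata,
/// writeAdditionalInfo and the object/array helpers declared below) has roughly the
/// shape of the classic JSON output envelope; the values here are made up:
///
///     {
///         "meta": [{"name": "x", "type": "UInt32"}],
///         "data": [...],
///         "rows": 1,
///         "statistics": {"elapsed": 0.001, "rows_read": 1, "bytes_read": 4}
///     }
///
/// The per-row values go through writeFieldFromColumn/writeColumns above; these
/// helpers also cover the delimiters, metadata and statistics around them.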
+ + void writeFieldDelimiter(WriteBuffer & out, size_t new_lines = 1); + + void writeFieldCompactDelimiter(WriteBuffer & out); + + void writeObjectStart(WriteBuffer & out, size_t indent = 0, const char * title = nullptr); + + void writeObjectEnd(WriteBuffer & out, size_t indent = 0); + + void writeArrayStart(WriteBuffer & out, size_t indent = 0, const char * title = nullptr); + + void writeCompactArrayStart(WriteBuffer & out, size_t indent = 0, const char * title = nullptr); + + void writeArrayEnd(WriteBuffer & out, size_t indent = 0); + + void writeCompactArrayEnd(WriteBuffer & out); + + void writeFieldFromColumn( + const IColumn & column, + const ISerialization & serialization, + size_t row_num, + bool yield_strings, + const FormatSettings & settings, + WriteBuffer & out, + const std::optional & name = std::nullopt, + size_t indent = 0); + + void writeColumns( + const Columns & columns, + const NamesAndTypes & fields, + const Serializations & serializations, + size_t row_num, + bool yield_strings, + const FormatSettings & settings, + WriteBuffer & out, + size_t indent = 0); + + void writeCompactColumns( + const Columns & columns, + const Serializations & serializations, + size_t row_num, + bool yield_strings, + const FormatSettings & settings, + WriteBuffer & out); + + void writeMetadata(const NamesAndTypes & fields, const FormatSettings & settings, WriteBuffer & out); + + void writeAdditionalInfo( + size_t rows, + size_t rows_before_limit, + bool applied_limit, + const Stopwatch & watch, + const Progress & progress, + bool write_statistics, + WriteBuffer & out); +} + +} diff --git a/src/Formats/NativeReader.cpp b/src/Formats/NativeReader.cpp index ed3aca43d52..3ad0ce5cfc4 100644 --- a/src/Formats/NativeReader.cpp +++ b/src/Formats/NativeReader.cpp @@ -23,6 +23,7 @@ namespace ErrorCodes extern const int INCORRECT_INDEX; extern const int LOGICAL_ERROR; extern const int CANNOT_READ_ALL_DATA; + extern const int INCORRECT_DATA; } @@ -31,8 +32,8 @@ NativeReader::NativeReader(ReadBuffer & istr_, UInt64 server_revision_) { } -NativeReader::NativeReader(ReadBuffer & istr_, const Block & header_, UInt64 server_revision_) - : istr(istr_), header(header_), server_revision(server_revision_) +NativeReader::NativeReader(ReadBuffer & istr_, const Block & header_, UInt64 server_revision_, bool skip_unknown_columns_) + : istr(istr_), header(header_), server_revision(server_revision_), skip_unknown_columns(skip_unknown_columns_) { } @@ -186,18 +187,29 @@ Block NativeReader::read() column.column = std::move(read_column); + bool use_in_result = true; if (header) { - /// Support insert from old clients without low cardinality type. - auto & header_column = header.getByName(column.name); - if (!header_column.type->equals(*column.type)) + if (header.has(column.name)) { - column.column = recursiveTypeConversion(column.column, column.type, header.safeGetByPosition(i).type); - column.type = header.safeGetByPosition(i).type; + /// Support insert from old clients without low cardinality type. 
+ auto & header_column = header.getByName(column.name); + if (!header_column.type->equals(*column.type)) + { + column.column = recursiveTypeConversion(column.column, column.type, header.safeGetByPosition(i).type); + column.type = header.safeGetByPosition(i).type; + } + } + else + { + if (!skip_unknown_columns) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unknown column with name {} found while reading data in Native format", column.name); + use_in_result = false; } } - res.insert(std::move(column)); + if (use_in_result) + res.insert(std::move(column)); if (use_index) ++index_column_it; diff --git a/src/Formats/NativeReader.h b/src/Formats/NativeReader.h index 1f9eb8b9764..3ae53d45faf 100644 --- a/src/Formats/NativeReader.h +++ b/src/Formats/NativeReader.h @@ -24,7 +24,7 @@ public: /// For cases when data structure (header) is known in advance. /// NOTE We may use header for data validation and/or type conversions. It is not implemented. - NativeReader(ReadBuffer & istr_, const Block & header_, UInt64 server_revision_); + NativeReader(ReadBuffer & istr_, const Block & header_, UInt64 server_revision_, bool skip_unknown_columns_ = false); /// For cases when we have an index. It allows to skip columns. Only columns specified in the index will be read. NativeReader(ReadBuffer & istr_, UInt64 server_revision_, @@ -43,6 +43,7 @@ private: ReadBuffer & istr; Block header; UInt64 server_revision; + bool skip_unknown_columns; bool use_index = false; IndexForNativeFormat::Blocks::const_iterator index_block_it; diff --git a/src/Formats/registerFormats.cpp b/src/Formats/registerFormats.cpp index 6797b967baa..8493c84173d 100644 --- a/src/Formats/registerFormats.cpp +++ b/src/Formats/registerFormats.cpp @@ -38,6 +38,10 @@ void registerInputFormatJSONEachRow(FormatFactory & factory); void registerOutputFormatJSONEachRow(FormatFactory & factory); void registerInputFormatJSONCompactEachRow(FormatFactory & factory); void registerOutputFormatJSONCompactEachRow(FormatFactory & factory); +void registerInputFormatJSONColumns(FormatFactory & factory); +void registerOutputFormatJSONColumns(FormatFactory & factory); +void registerInputFormatJSONCompactColumns(FormatFactory & factory); +void registerOutputFormatJSONCompactColumns(FormatFactory & factory); void registerInputFormatProtobuf(FormatFactory & factory); void registerOutputFormatProtobuf(FormatFactory & factory); void registerInputFormatProtobufList(FormatFactory & factory); @@ -70,6 +74,7 @@ void registerOutputFormatVertical(FormatFactory & factory); void registerOutputFormatJSON(FormatFactory & factory); void registerOutputFormatJSONCompact(FormatFactory & factory); void registerOutputFormatJSONEachRowWithProgress(FormatFactory & factory); +void registerOutputFormatJSONColumnsWithMetadata(FormatFactory & factory); void registerOutputFormatXML(FormatFactory & factory); void registerOutputFormatODBCDriver2(FormatFactory & factory); void registerOutputFormatNull(FormatFactory & factory); @@ -102,14 +107,16 @@ void registerTSVSchemaReader(FormatFactory & factory); void registerCSVSchemaReader(FormatFactory & factory); void registerJSONCompactEachRowSchemaReader(FormatFactory & factory); void registerJSONEachRowSchemaReader(FormatFactory & factory); +void registerJSONAsStringSchemaReader(FormatFactory & factory); +void registerJSONAsObjectSchemaReader(FormatFactory & factory); +void registerJSONColumnsSchemaReader(FormatFactory & factory); +void registerJSONCompactColumnsSchemaReader(FormatFactory & factory); void registerNativeSchemaReader(FormatFactory & 
factory); void registerRowBinaryWithNamesAndTypesSchemaReader(FormatFactory & factory); void registerAvroSchemaReader(FormatFactory & factory); void registerProtobufSchemaReader(FormatFactory & factory); void registerProtobufListSchemaReader(FormatFactory & factory); void registerLineAsStringSchemaReader(FormatFactory & factory); -void registerJSONAsStringSchemaReader(FormatFactory & factory); -void registerJSONAsObjectSchemaReader(FormatFactory & factory); void registerRawBLOBSchemaReader(FormatFactory & factory); void registerMsgPackSchemaReader(FormatFactory & factory); void registerCapnProtoSchemaReader(FormatFactory & factory); @@ -120,6 +127,7 @@ void registerValuesSchemaReader(FormatFactory & factory); void registerTemplateSchemaReader(FormatFactory & factory); void registerMySQLSchemaReader(FormatFactory & factory); + void registerFileExtensions(FormatFactory & factory); void registerFormats() @@ -128,8 +136,8 @@ void registerFormats() registerFileSegmentationEngineTabSeparated(factory); registerFileSegmentationEngineCSV(factory); - registerFileSegmentationEngineJSONEachRow(factory); registerFileSegmentationEngineRegexp(factory); + registerFileSegmentationEngineJSONEachRow(factory); registerFileSegmentationEngineJSONAsString(factory); registerFileSegmentationEngineJSONAsObject(factory); registerFileSegmentationEngineJSONCompactEachRow(factory); @@ -155,6 +163,10 @@ void registerFormats() registerOutputFormatJSONEachRow(factory); registerInputFormatJSONCompactEachRow(factory); registerOutputFormatJSONCompactEachRow(factory); + registerInputFormatJSONColumns(factory); + registerOutputFormatJSONColumns(factory); + registerInputFormatJSONCompactColumns(factory); + registerOutputFormatJSONCompactColumns(factory); registerInputFormatProtobuf(factory); registerOutputFormatProtobufList(factory); registerInputFormatProtobufList(factory); @@ -184,6 +196,7 @@ void registerFormats() registerOutputFormatJSON(factory); registerOutputFormatJSONCompact(factory); registerOutputFormatJSONEachRowWithProgress(factory); + registerOutputFormatJSONColumnsWithMetadata(factory); registerOutputFormatXML(factory); registerOutputFormatODBCDriver2(factory); registerOutputFormatNull(factory); @@ -195,8 +208,8 @@ void registerFormats() registerInputFormatRegexp(factory); registerInputFormatJSONAsString(factory); - registerInputFormatLineAsString(factory); registerInputFormatJSONAsObject(factory); + registerInputFormatLineAsString(factory); #if USE_HIVE registerInputFormatHiveText(factory); #endif @@ -215,14 +228,16 @@ void registerFormats() registerCSVSchemaReader(factory); registerJSONCompactEachRowSchemaReader(factory); registerJSONEachRowSchemaReader(factory); + registerJSONAsStringSchemaReader(factory); + registerJSONAsObjectSchemaReader(factory); + registerJSONColumnsSchemaReader(factory); + registerJSONCompactColumnsSchemaReader(factory); registerNativeSchemaReader(factory); registerRowBinaryWithNamesAndTypesSchemaReader(factory); registerAvroSchemaReader(factory); registerProtobufSchemaReader(factory); registerProtobufListSchemaReader(factory); registerLineAsStringSchemaReader(factory); - registerJSONAsStringSchemaReader(factory); - registerJSONAsObjectSchemaReader(factory); registerRawBLOBSchemaReader(factory); registerMsgPackSchemaReader(factory); registerCapnProtoSchemaReader(factory); diff --git a/src/Formats/registerWithNamesAndTypes.cpp b/src/Formats/registerWithNamesAndTypes.cpp index cba578b08c7..2dee107844d 100644 --- a/src/Formats/registerWithNamesAndTypes.cpp +++ 
b/src/Formats/registerWithNamesAndTypes.cpp @@ -10,4 +10,10 @@ void registerWithNamesAndTypes(const std::string & base_format_name, RegisterWit register_func(base_format_name + "WithNamesAndTypes", true, true); } +void markFormatWithNamesAndTypesSupportsSamplingColumns(const std::string & base_format_name, FormatFactory & factory) +{ + factory.markFormatSupportsSubsetOfColumns(base_format_name + "WithNames"); + factory.markFormatSupportsSubsetOfColumns(base_format_name + "WithNamesAndTypes"); +} + } diff --git a/src/Formats/registerWithNamesAndTypes.h b/src/Formats/registerWithNamesAndTypes.h index d8e74e3421e..50a0eee9616 100644 --- a/src/Formats/registerWithNamesAndTypes.h +++ b/src/Formats/registerWithNamesAndTypes.h @@ -2,6 +2,7 @@ #include #include +#include namespace DB { @@ -9,4 +10,6 @@ namespace DB using RegisterWithNamesAndTypesFunc = std::function; void registerWithNamesAndTypes(const std::string & base_format_name, RegisterWithNamesAndTypesFunc register_func); +void markFormatWithNamesAndTypesSupportsSamplingColumns(const std::string & base_format_name, FormatFactory & factory); + } diff --git a/src/Functions/CountSubstringsImpl.h b/src/Functions/CountSubstringsImpl.h index fc6e4a0e671..c8cef81333a 100644 --- a/src/Functions/CountSubstringsImpl.h +++ b/src/Functions/CountSubstringsImpl.h @@ -26,19 +26,21 @@ struct CountSubstringsImpl static constexpr bool supports_start_pos = true; static constexpr auto name = Name::name; + static ColumnNumbers getArgumentsThatAreAlwaysConstant() { return {};} + using ResultType = UInt64; /// Count occurrences of one substring in many strings. static void vectorConstant( - const ColumnString::Chars & data, - const ColumnString::Offsets & offsets, + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, const std::string & needle, const ColumnPtr & start_pos, PaddedPODArray & res) { - const UInt8 * begin = data.data(); + const UInt8 * const begin = haystack_data.data(); + const UInt8 * const end = haystack_data.data() + haystack_data.size(); const UInt8 * pos = begin; - const UInt8 * end = pos + data.size(); /// FIXME: suboptimal memset(&res[0], 0, res.size() * sizeof(res[0])); @@ -52,15 +54,15 @@ struct CountSubstringsImpl while (pos < end && end != (pos = searcher.search(pos, end - pos))) { /// Determine which index it refers to. - while (begin + offsets[i] <= pos) + while (begin + haystack_offsets[i] <= pos) ++i; auto start = start_pos != nullptr ? start_pos->getUInt(i) : 0; /// We check that the entry does not pass through the boundaries of strings. - if (pos + needle.size() < begin + offsets[i]) + if (pos + needle.size() < begin + haystack_offsets[i]) { - auto res_pos = needle.size() + Impl::countChars(reinterpret_cast(begin + offsets[i - 1]), reinterpret_cast(pos)); + auto res_pos = needle.size() + Impl::countChars(reinterpret_cast(begin + haystack_offsets[i - 1]), reinterpret_cast(pos)); if (res_pos >= start) { ++res[i]; @@ -69,14 +71,14 @@ struct CountSubstringsImpl pos += needle.size(); continue; } - pos = begin + offsets[i]; + pos = begin + haystack_offsets[i]; ++i; } } /// Count number of occurrences of substring in string. 
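/// Note the non-overlapping semantics implemented below: after each hit the scan
/// resumes needle.size() bytes further on, so intersecting matches count once.
/// A hedged, hypothetical call:
///     UInt64 res = 0;
///     constantConstantScalar("aaaa", "aa", /*start_pos=*/ 0, res);
///     /// res == 2, not 3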
static void constantConstantScalar( - std::string data, + std::string haystack, std::string needle, UInt64 start_pos, UInt64 & res) @@ -87,9 +89,9 @@ struct CountSubstringsImpl return; auto start = std::max(start_pos, UInt64(1)); - size_t start_byte = Impl::advancePos(data.data(), data.data() + data.size(), start - 1) - data.data(); + size_t start_byte = Impl::advancePos(haystack.data(), haystack.data() + haystack.size(), start - 1) - haystack.data(); size_t new_start_byte; - while ((new_start_byte = data.find(needle, start_byte)) != std::string::npos) + while ((new_start_byte = haystack.find(needle, start_byte)) != std::string::npos) { ++res; /// Intersecting substrings in haystack accounted only once @@ -99,21 +101,21 @@ struct CountSubstringsImpl /// Count number of occurrences of substring in string starting from different positions. static void constantConstant( - std::string data, + std::string haystack, std::string needle, const ColumnPtr & start_pos, PaddedPODArray & res) { - Impl::toLowerIfNeed(data); + Impl::toLowerIfNeed(haystack); Impl::toLowerIfNeed(needle); if (start_pos == nullptr) { - constantConstantScalar(data, needle, 0, res[0]); + constantConstantScalar(haystack, needle, 0, res[0]); return; } - size_t haystack_size = Impl::countChars(data.data(), data.data() + data.size()); + size_t haystack_size = Impl::countChars(haystack.data(), haystack.data() + haystack.size()); size_t size = start_pos != nullptr ? start_pos->size() : 0; for (size_t i = 0; i < size; ++i) @@ -125,7 +127,7 @@ struct CountSubstringsImpl res[i] = 0; continue; } - constantConstantScalar(data, needle, start, res[i]); + constantConstantScalar(haystack, needle, start, res[i]); } } @@ -228,6 +230,12 @@ struct CountSubstringsImpl { throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support FixedString haystack argument", name); } + + template + static void vectorFixedVector(Args &&...) 
+ { + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support FixedString haystack argument", name); + } }; } diff --git a/src/Functions/FunctionMathBinaryFloat64.h b/src/Functions/FunctionMathBinaryFloat64.h index badbde280f1..aec20d30271 100644 --- a/src/Functions/FunctionMathBinaryFloat64.h +++ b/src/Functions/FunctionMathBinaryFloat64.h @@ -213,7 +213,7 @@ private: template -struct BinaryFunctionPlain +struct BinaryFunctionVectorized { static constexpr auto name = Name::name; static constexpr auto rows_per_iteration = 1; @@ -225,6 +225,4 @@ struct BinaryFunctionPlain } }; -#define BinaryFunctionVectorized BinaryFunctionPlain - } diff --git a/src/Functions/FunctionUnaryArithmetic.h b/src/Functions/FunctionUnaryArithmetic.h index 4dc769b8177..445eb45fd9d 100644 --- a/src/Functions/FunctionUnaryArithmetic.h +++ b/src/Functions/FunctionUnaryArithmetic.h @@ -42,9 +42,8 @@ struct UnaryOperationImpl using ArrayA = typename ColVecA::Container; using ArrayC = typename ColVecC::Container; - MULTITARGET_FUNCTION_WRAPPER_AVX2_SSE42(vectorImpl, - MULTITARGET_FH( - static void NO_INLINE), /*vectorImpl*/ MULTITARGET_FB((const ArrayA & a, ArrayC & c) /// NOLINT + MULTITARGET_FUNCTION_AVX2_SSE42( + MULTITARGET_FUNCTION_HEADER(static void NO_INLINE), vectorImpl, MULTITARGET_FUNCTION_BODY((const ArrayA & a, ArrayC & c) /// NOLINT { size_t size = a.size(); for (size_t i = 0; i < size; ++i) @@ -79,9 +78,9 @@ struct UnaryOperationImpl template struct FixedStringUnaryOperationImpl { - MULTITARGET_FUNCTION_WRAPPER_AVX2_SSE42(vectorImpl, - MULTITARGET_FH( - static void NO_INLINE), /*vectorImpl*/ MULTITARGET_FB((const ColumnFixedString::Chars & a, ColumnFixedString::Chars & c) /// NOLINT + MULTITARGET_FUNCTION_AVX2_SSE42( + MULTITARGET_FUNCTION_HEADER(static void NO_INLINE), vectorImpl, MULTITARGET_FUNCTION_BODY((const ColumnFixedString::Chars & a, /// NOLINT + ColumnFixedString::Chars & c) { size_t size = a.size(); for (size_t i = 0; i < size; ++i) diff --git a/src/Functions/FunctionsBinaryRepr.cpp b/src/Functions/FunctionsBinaryRepresentation.cpp similarity index 97% rename from src/Functions/FunctionsBinaryRepr.cpp rename to src/Functions/FunctionsBinaryRepresentation.cpp index 4dd11a849a0..582dd1f1049 100644 --- a/src/Functions/FunctionsBinaryRepr.cpp +++ b/src/Functions/FunctionsBinaryRepresentation.cpp @@ -253,13 +253,13 @@ struct UnbinImpl /// Encode number or string to string with binary or hexadecimal representation template -class EncodeToBinaryRepr : public IFunction +class EncodeToBinaryRepresentation : public IFunction { public: static constexpr auto name = Impl::name; static constexpr size_t word_size = Impl::word_size; - static FunctionPtr create(ContextPtr) { return std::make_shared(); } + static FunctionPtr create(ContextPtr) { return std::make_shared(); } String getName() const override { return name; } @@ -550,12 +550,12 @@ public: /// Decode number or string from string with binary or hexadecimal representation template -class DecodeFromBinaryRepr : public IFunction +class DecodeFromBinaryRepresentation : public IFunction { public: static constexpr auto name = Impl::name; static constexpr size_t word_size = Impl::word_size; - static FunctionPtr create(ContextPtr) { return std::make_shared(); } + static FunctionPtr create(ContextPtr) { return std::make_shared(); } String getName() const override { return name; } @@ -623,10 +623,10 @@ public: void registerFunctionsBinaryRepr(FunctionFactory & factory) { - factory.registerFunction>(FunctionFactory::CaseInsensitive); - 
factory.registerFunction>(FunctionFactory::CaseInsensitive); - factory.registerFunction>(FunctionFactory::CaseInsensitive); - factory.registerFunction>(FunctionFactory::CaseInsensitive); + factory.registerFunction>(FunctionFactory::CaseInsensitive); + factory.registerFunction>(FunctionFactory::CaseInsensitive); + factory.registerFunction>(FunctionFactory::CaseInsensitive); + factory.registerFunction>(FunctionFactory::CaseInsensitive); } } diff --git a/src/Functions/FunctionsComparison.h b/src/Functions/FunctionsComparison.h index 16575e551a7..7bbb1c1096c 100644 --- a/src/Functions/FunctionsComparison.h +++ b/src/Functions/FunctionsComparison.h @@ -85,8 +85,9 @@ struct NumComparisonImpl using ContainerA = PaddedPODArray; using ContainerB = PaddedPODArray; - MULTITARGET_FUNCTION_WRAPPER_AVX2_SSE42(vectorVectorImpl, - MULTITARGET_FH(static void), /*vectorVectorImpl*/ MULTITARGET_FB((const ContainerA & a, const ContainerB & b, PaddedPODArray & c) /// NOLINT + MULTITARGET_FUNCTION_AVX2_SSE42( + MULTITARGET_FUNCTION_HEADER(static void), vectorVectorImpl, MULTITARGET_FUNCTION_BODY(( /// NOLINT + const ContainerA & a, const ContainerB & b, PaddedPODArray & c) { /** GCC 4.8.2 vectorizes a loop only if it is written in this form. * In this case, if you loop through the array index (the code will look simpler), @@ -127,8 +128,9 @@ struct NumComparisonImpl } - MULTITARGET_FUNCTION_WRAPPER_AVX2_SSE42(vectorConstantImpl, - MULTITARGET_FH(static void), /*vectorConstantImpl*/ MULTITARGET_FB((const ContainerA & a, B b, PaddedPODArray & c) /// NOLINT + MULTITARGET_FUNCTION_AVX2_SSE42( + MULTITARGET_FUNCTION_HEADER(static void), vectorConstantImpl, MULTITARGET_FUNCTION_BODY(( /// NOLINT + const ContainerA & a, B b, PaddedPODArray & c) { size_t size = a.size(); const A * __restrict a_pos = a.data(); diff --git a/src/Functions/FunctionsStringSearch.h b/src/Functions/FunctionsStringSearch.h index 4aa76ceec28..68425ee496e 100644 --- a/src/Functions/FunctionsStringSearch.h +++ b/src/Functions/FunctionsStringSearch.h @@ -15,7 +15,6 @@ namespace DB { /** Search and replace functions in strings: - * * position(haystack, needle) - the normal search for a substring in a string, returns the position (in bytes) of the found substring starting with 1, or 0 if no substring is found. * positionUTF8(haystack, needle) - the same, but the position is calculated at code points, provided that the string is encoded in UTF-8. * positionCaseInsensitive(haystack, needle) @@ -24,13 +23,29 @@ namespace DB * like(haystack, pattern) - search by the regular expression LIKE; Returns 0 or 1. Case-insensitive, but only for Latin. * notLike(haystack, pattern) * + * ilike(haystack, pattern) - like 'like' but case-insensitive + * notIlike(haystack, pattern) + * * match(haystack, pattern) - search by regular expression re2; Returns 0 or 1. - * multiMatchAny(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- search by re2 regular expressions pattern_i; Returns 0 or 1 if any pattern_i matches. - * multiMatchAnyIndex(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- search by re2 regular expressions pattern_i; Returns index of any match or zero if none; - * multiMatchAllIndices(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- search by re2 regular expressions pattern_i; Returns an array of matched indices in any order; * * countSubstrings(haystack, needle) -- count number of occurrences of needle in haystack. 
* countSubstringsCaseInsensitive(haystack, needle) + * countSubstringsCaseInsensitiveUTF8(haystack, needle) + * + * hasToken() + * hasTokenCaseInsensitive() + * + * JSON stuff: + * visitParamExtractBool() + * simpleJSONExtractBool() + * visitParamExtractFloat() + * simpleJSONExtractFloat() + * visitParamExtractInt() + * simpleJSONExtractInt() + * visitParamExtractUInt() + * simpleJSONExtractUInt() + * visitParamHas() + * simpleJSONHas() * * Applies regexp re2 and pulls: * - the first subpattern, if the regexp has a subpattern; @@ -70,11 +85,7 @@ public: ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { - if (!Impl::use_default_implementation_for_constants) - return ColumnNumbers{}; - if (!Impl::supports_start_pos) - return ColumnNumbers{1, 2}; - return ColumnNumbers{1, 2, 3}; + return Impl::getArgumentsThatAreAlwaysConstant(); } DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override @@ -104,8 +115,6 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override { - using ResultType = typename Impl::ResultType; - const ColumnPtr & column_haystack = arguments[0].column; const ColumnPtr & column_needle = arguments[1].column; @@ -116,6 +125,8 @@ public: const ColumnConst * col_haystack_const = typeid_cast(&*column_haystack); const ColumnConst * col_needle_const = typeid_cast(&*column_needle); + using ResultType = typename Impl::ResultType; + if constexpr (!Impl::use_default_implementation_for_constants) { bool is_col_start_pos_const = column_start_pos == nullptr || isColumnConst(*column_start_pos); @@ -162,6 +173,14 @@ public: col_needle_const->getValue(), column_start_pos, vec_res); + else if (col_haystack_vector_fixed && col_needle_vector) + Impl::vectorFixedVector( + col_haystack_vector_fixed->getChars(), + col_haystack_vector_fixed->getN(), + col_needle_vector->getChars(), + col_needle_vector->getOffsets(), + column_start_pos, + vec_res); else if (col_haystack_vector_fixed && col_needle_const) Impl::vectorFixedConstant( col_haystack_vector_fixed->getChars(), diff --git a/src/Functions/FunctionsVisitParam.h b/src/Functions/FunctionsVisitParam.h index 09fcf8659ed..5f86923b0d1 100644 --- a/src/Functions/FunctionsVisitParam.h +++ b/src/Functions/FunctionsVisitParam.h @@ -83,10 +83,12 @@ struct ExtractParamImpl static constexpr bool supports_start_pos = false; static constexpr auto name = Name::name; + static ColumnNumbers getArgumentsThatAreAlwaysConstant() { return {1, 2};} + /// It is assumed that `res` is the correct size and initialized with zeros. static void vectorConstant( - const ColumnString::Chars & data, - const ColumnString::Offsets & offsets, + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, std::string needle, const ColumnPtr & start_pos, PaddedPODArray & res) @@ -97,9 +99,9 @@ struct ExtractParamImpl /// We are looking for a parameter simply as a substring of the form "name" needle = "\"" + needle + "\":"; - const UInt8 * begin = data.data(); + const UInt8 * const begin = haystack_data.data(); + const UInt8 * const end = haystack_data.data() + haystack_data.size(); const UInt8 * pos = begin; - const UInt8 * end = pos + data.size(); /// The current index in the string array. size_t i = 0; @@ -110,19 +112,19 @@ struct ExtractParamImpl while (pos < end && end != (pos = searcher.search(pos, end - pos))) { /// Let's determine which index it belongs to. 
- while (begin + offsets[i] <= pos) + while (begin + haystack_offsets[i] <= pos) { res[i] = 0; ++i; } /// We check that the entry does not pass through the boundaries of strings. - if (pos + needle.size() < begin + offsets[i]) - res[i] = ParamExtractor::extract(pos + needle.size(), begin + offsets[i] - 1); /// don't include terminating zero + if (pos + needle.size() < begin + haystack_offsets[i]) + res[i] = ParamExtractor::extract(pos + needle.size(), begin + haystack_offsets[i] - 1); /// don't include terminating zero else res[i] = 0; - pos = begin + offsets[i]; + pos = begin + haystack_offsets[i]; ++i; } @@ -145,6 +147,12 @@ { throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support FixedString haystack argument", name); } + + template + static void vectorFixedVector(Args &&...) + { + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support FixedString haystack argument", name); + } }; @@ -153,20 +161,20 @@ template struct ExtractParamToStringImpl { - static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, + static void vector(const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, std::string needle, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { /// Constant 5 is taken from a function that performs a similar task FunctionsStringSearch.h::ExtractImpl - res_data.reserve(data.size() / 5); - res_offsets.resize(offsets.size()); + res_data.reserve(haystack_data.size() / 5); + res_offsets.resize(haystack_offsets.size()); /// We are looking for a parameter simply as a substring of the form "name" needle = "\"" + needle + "\":"; - const UInt8 * begin = data.data(); + const UInt8 * const begin = haystack_data.data(); + const UInt8 * const end = haystack_data.data() + haystack_data.size(); const UInt8 * pos = begin; - const UInt8 * end = pos + data.size(); /// The current index in the string array. size_t i = 0; @@ -177,7 +185,7 @@ struct ExtractParamToStringImpl while (pos < end && end != (pos = searcher.search(pos, end - pos))) { /// Determine which index it belongs to. - while (begin + offsets[i] <= pos) + while (begin + haystack_offsets[i] <= pos) { res_data.push_back(0); res_offsets[i] = res_data.size(); @@ -185,10 +193,10 @@ struct ExtractParamToStringImpl } /// We check that the entry does not pass through the boundaries of strings. - if (pos + needle.size() < begin + offsets[i]) - ParamExtractor::extract(pos + needle.size(), begin + offsets[i], res_data); + if (pos + needle.size() < begin + haystack_offsets[i]) + ParamExtractor::extract(pos + needle.size(), begin + haystack_offsets[i], res_data); - pos = begin + offsets[i]; + pos = begin + haystack_offsets[i]; res_data.push_back(0); res_offsets[i] = res_data.size(); diff --git a/src/Functions/HasTokenImpl.h b/src/Functions/HasTokenImpl.h index ec33a07fce3..9328bd99139 100644 --- a/src/Functions/HasTokenImpl.h +++ b/src/Functions/HasTokenImpl.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace DB @@ -14,7 +15,7 @@ namespace ErrorCodes /** Token search in the string means that the needle must be surrounded by some separator chars, like whitespace or punctuation.
*/ -template +template struct HasTokenImpl { using ResultType = UInt8; @@ -23,9 +24,11 @@ struct HasTokenImpl static constexpr bool supports_start_pos = false; static constexpr auto name = Name::name; + static ColumnNumbers getArgumentsThatAreAlwaysConstant() { return {1, 2};} + static void vectorConstant( - const ColumnString::Chars & data, - const ColumnString::Offsets & offsets, + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, const std::string & pattern, const ColumnPtr & start_pos, PaddedPODArray & res) @@ -33,12 +36,12 @@ struct HasTokenImpl if (start_pos != nullptr) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function '{}' does not support start_pos argument", name); - if (offsets.empty()) + if (haystack_offsets.empty()) return; - const UInt8 * begin = data.data(); + const UInt8 * const begin = haystack_data.data(); + const UInt8 * const end = haystack_data.data() + haystack_data.size(); const UInt8 * pos = begin; - const UInt8 * end = pos + data.size(); /// The current index in the array of strings. size_t i = 0; @@ -49,25 +52,25 @@ struct HasTokenImpl while (pos < end && end != (pos = searcher.search(pos, end - pos))) { /// Let's determine which index it refers to. - while (begin + offsets[i] <= pos) + while (begin + haystack_offsets[i] <= pos) { - res[i] = negate_result; + res[i] = negate; ++i; } /// We check that the entry does not pass through the boundaries of strings. - if (pos + pattern.size() < begin + offsets[i]) - res[i] = !negate_result; + if (pos + pattern.size() < begin + haystack_offsets[i]) + res[i] = !negate; else - res[i] = negate_result; + res[i] = negate; - pos = begin + offsets[i]; + pos = begin + haystack_offsets[i]; ++i; } /// Tail, in which there can be no substring. if (i < res.size()) - memset(&res[i], negate_result, (res.size() - i) * sizeof(res[0])); + memset(&res[i], negate, (res.size() - i) * sizeof(res[0])); } template @@ -88,6 +91,12 @@ struct HasTokenImpl { throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support FixedString haystack argument", name); } + + template + static void vectorFixedVector(Args &&...) + { + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support FixedString haystack argument", name); + } }; } diff --git a/src/Functions/MatchImpl.h b/src/Functions/MatchImpl.h index 026b38b997b..54aaa3116fd 100644 --- a/src/Functions/MatchImpl.h +++ b/src/Functions/MatchImpl.h @@ -4,6 +4,7 @@ #include #include #include +#include #include "Regexps.h" #include "config_functions.h" @@ -17,24 +18,24 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_COLUMN; + extern const int LOGICAL_ERROR; extern const int ILLEGAL_TYPE_OF_ARGUMENT; } +namespace impl +{ /// Is the [I]LIKE expression reduced to finding a substring in a string? 
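Concretely, the rewrite that follows fires only for patterns of the form '%...%' with no unescaped metacharacter in between. A minimal standalone sketch of that predicate, kept deliberately simpler than the real function (the example patterns are illustrative assumptions, and this version assumes '\' escapes exactly one character):

#include <cassert>
#include <string>
#include <string_view>

/// A LIKE pattern collapses to a plain substring search iff it is '%...%'
/// with no bare '%' or '_' inside; escaped wildcards become literals.
static bool likeIsSubstring(std::string_view pattern, std::string & res)
{
    if (pattern.size() < 2 || pattern.front() != '%' || pattern.back() != '%')
        return false;
    res.clear();
    for (const char * pos = pattern.data() + 1, * end = pattern.data() + pattern.size() - 1; pos < end; ++pos)
    {
        switch (*pos)
        {
            case '%':
            case '_':
                return false;     /// a live metacharacter: the generic LIKE path is needed
            case '\\':
                if (++pos == end)
                    return false; /// dangling escape
                [[fallthrough]];
            default:
                res.push_back(*pos);
        }
    }
    return true;
}

int main()
{
    std::string s;
    assert(likeIsSubstring("%abc%", s) && s == "abc");   /// LIKE '%abc%' -> position-style search for "abc"
    assert(likeIsSubstring("%a\\%b%", s) && s == "a%b"); /// escaped '%' is a literal
    assert(!likeIsSubstring("%a_b%", s));                /// '_' forces the regexp/LIKE path
    assert(!likeIsSubstring("abc%", s));                 /// not enclosed in '%'
}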
-static inline bool likePatternIsStrstr(const String & pattern, String & res) +inline bool likePatternIsSubstring(std::string_view pattern, String & res) { if (pattern.size() < 2 || pattern.front() != '%' || pattern.back() != '%') return false; - res = ""; + res.clear(); res.reserve(pattern.size() - 2); - const char * pos = pattern.data(); - const char * end = pos + pattern.size(); - - ++pos; - --end; + const char * pos = pattern.data() + 1; + const char * const end = pattern.data() + pattern.size() - 1; while (pos < end) { @@ -60,17 +61,24 @@ static inline bool likePatternIsStrstr(const String & pattern, String & res) return true; } -/** 'like' - if true, treat pattern as SQL LIKE or ILIKE; if false - treat pattern as re2 regexp. +} + +/** 'like' - if true, treat pattern as SQL LIKE, otherwise as re2 regexp. + * 'negate' - if true, negate result + * 'case_insensitive' - if true, match case insensitively + * * NOTE: We want to run regexp search for whole columns by one call (as implemented in function 'position') * but for that, regexp engine must support \0 bytes and their interpretation as string boundaries. */ -template +template struct MatchImpl { static constexpr bool use_default_implementation_for_constants = true; static constexpr bool supports_start_pos = false; static constexpr auto name = Name::name; + static ColumnNumbers getArgumentsThatAreAlwaysConstant() { return {2};} + using ResultType = UInt8; using Searcher = std::conditional_t; static void vectorConstant( - const ColumnString::Chars & data, - const ColumnString::Offsets & offsets, - const String & pattern, - const ColumnPtr & start_pos, + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, + const String & needle, + const ColumnPtr & start_pos_, PaddedPODArray & res) { - if (start_pos != nullptr) + if (start_pos_ != nullptr) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function '{}' doesn't support start_pos argument", name); - if (offsets.empty()) + if (haystack_offsets.empty()) return; /// A simple case where the [I]LIKE expression reduces to finding a substring in a string String strstr_pattern; - if (like && likePatternIsStrstr(pattern, strstr_pattern)) + if (like && impl::likePatternIsSubstring(needle, strstr_pattern)) { - const UInt8 * const begin = data.data(); - const UInt8 * const end = data.data() + data.size(); + const UInt8 * const begin = haystack_data.data(); + const UInt8 * const end = haystack_data.data() + haystack_data.size(); const UInt8 * pos = begin; /// The current index in the array of strings. @@ -109,31 +117,29 @@ struct MatchImpl while (pos < end && end != (pos = searcher.search(pos, end - pos))) { /// Let's determine which index it refers to. - while (begin + offsets[i] <= pos) + while (begin + haystack_offsets[i] <= pos) { - res[i] = revert; + res[i] = negate; ++i; } /// We check that the entry does not pass through the boundaries of strings. - if (pos + strstr_pattern.size() < begin + offsets[i]) - res[i] = !revert; + if (pos + strstr_pattern.size() < begin + haystack_offsets[i]) + res[i] = !negate; else - res[i] = revert; + res[i] = negate; - pos = begin + offsets[i]; + pos = begin + haystack_offsets[i]; ++i; } /// Tail, in which there can be no substring. 
if (i < res.size()) - memset(&res[i], revert, (res.size() - i) * sizeof(res[0])); + memset(&res[i], negate, (res.size() - i) * sizeof(res[0])); } else { - size_t size = offsets.size(); - - auto regexp = Regexps::get(pattern); + auto regexp = Regexps::get(needle); String required_substring; bool is_trivial; @@ -141,37 +147,39 @@ struct MatchImpl regexp->getAnalyzeResult(required_substring, is_trivial, required_substring_is_prefix); + size_t haystack_size = haystack_offsets.size(); + if (required_substring.empty()) { if (!regexp->getRE2()) /// An empty regexp. Always matches. { - if (size) - memset(res.data(), 1, size * sizeof(res[0])); + if (haystack_size) + memset(res.data(), 1, haystack_size * sizeof(res[0])); } else { size_t prev_offset = 0; - for (size_t i = 0; i < size; ++i) + for (size_t i = 0; i < haystack_size; ++i) { - res[i] = revert + res[i] = negate ^ regexp->getRE2()->Match( - re2_st::StringPiece(reinterpret_cast(&data[prev_offset]), offsets[i] - prev_offset - 1), + {reinterpret_cast(&haystack_data[prev_offset]), haystack_offsets[i] - prev_offset - 1}, 0, - offsets[i] - prev_offset - 1, + haystack_offsets[i] - prev_offset - 1, re2_st::RE2::UNANCHORED, nullptr, 0); - prev_offset = offsets[i]; + prev_offset = haystack_offsets[i]; } } } else { - /// NOTE This almost matches with the case of LikePatternIsStrstr. + /// NOTE This almost matches with the case of impl::likePatternIsSubstring. - const UInt8 * const begin = data.data(); - const UInt8 * const end = data.begin() + data.size(); + const UInt8 * const begin = haystack_data.data(); + const UInt8 * const end = haystack_data.begin() + haystack_data.size(); const UInt8 * pos = begin; /// The current index in the array of strings. @@ -183,79 +191,78 @@ struct MatchImpl while (pos < end && end != (pos = searcher.search(pos, end - pos))) { /// Determine which index it refers to. - while (begin + offsets[i] <= pos) + while (begin + haystack_offsets[i] <= pos) { - res[i] = revert; + res[i] = negate; ++i; } /// We check that the entry does not pass through the boundaries of strings. - if (pos + required_substring.size() < begin + offsets[i]) + if (pos + required_substring.size() < begin + haystack_offsets[i]) { /// And if it does not, if necessary, we check the regexp. if (is_trivial) - res[i] = !revert; + res[i] = !negate; else { - const char * str_data = reinterpret_cast(&data[offsets[i - 1]]); - size_t str_size = offsets[i] - offsets[i - 1] - 1; + const char * str_data = reinterpret_cast(&haystack_data[haystack_offsets[i - 1]]); + size_t str_size = haystack_offsets[i] - haystack_offsets[i - 1] - 1; /** Even in the case of `required_substring_is_prefix` use UNANCHORED check for regexp, * so that it can match when `required_substring` occurs into the string several times, * and at the first occurrence, the regexp is not a match. */ + const size_t start_pos = (required_substring_is_prefix) ? 
(reinterpret_cast(pos) - str_data) : 0; + const size_t end_pos = str_size; - if (required_substring_is_prefix) - res[i] = revert - ^ regexp->getRE2()->Match( - re2_st::StringPiece(str_data, str_size), - reinterpret_cast(pos) - str_data, - str_size, - re2_st::RE2::UNANCHORED, - nullptr, - 0); - else - res[i] = revert - ^ regexp->getRE2()->Match( - re2_st::StringPiece(str_data, str_size), 0, str_size, re2_st::RE2::UNANCHORED, nullptr, 0); + res[i] = negate + ^ regexp->getRE2()->Match( + {str_data, str_size}, + start_pos, + end_pos, + re2_st::RE2::UNANCHORED, + nullptr, + 0); } } else - res[i] = revert; + res[i] = negate; - pos = begin + offsets[i]; + pos = begin + haystack_offsets[i]; ++i; } /// Tail, in which there can be no substring. if (i < res.size()) - memset(&res[i], revert, (res.size() - i) * sizeof(res[0])); + memset(&res[i], negate, (res.size() - i) * sizeof(res[0])); } } } /// Very carefully crafted copy-paste. static void vectorFixedConstant( - const ColumnString::Chars & data, size_t n, const String & pattern, + const ColumnString::Chars & haystack, + size_t N, + const String & needle, PaddedPODArray & res) { - if (data.empty()) + if (haystack.empty()) return; /// A simple case where the LIKE expression reduces to finding a substring in a string String strstr_pattern; - if (like && likePatternIsStrstr(pattern, strstr_pattern)) + if (like && impl::likePatternIsSubstring(needle, strstr_pattern)) { - const UInt8 * begin = data.data(); + const UInt8 * const begin = haystack.data(); + const UInt8 * const end = haystack.data() + haystack.size(); const UInt8 * pos = begin; - const UInt8 * end = pos + data.size(); size_t i = 0; const UInt8 * next_pos = begin; - /// If pattern is larger than string size - it cannot be found. - if (strstr_pattern.size() <= n) + /// If needle is larger than string size - it cannot be found. + if (strstr_pattern.size() <= N) { Searcher searcher(strstr_pattern.data(), strstr_pattern.size(), end - pos); @@ -263,19 +270,19 @@ struct MatchImpl while (pos < end && end != (pos = searcher.search(pos, end - pos))) { /// Let's determine which index it refers to. - while (next_pos + n <= pos) + while (next_pos + N <= pos) { - res[i] = revert; - next_pos += n; + res[i] = negate; + next_pos += N; ++i; } - next_pos += n; + next_pos += N; /// We check that the entry does not pass through the boundaries of strings. if (pos + strstr_pattern.size() <= next_pos) - res[i] = !revert; + res[i] = !negate; else - res[i] = revert; + res[i] = negate; pos = next_pos; ++i; @@ -284,13 +291,11 @@ struct MatchImpl /// Tail, in which there can be no substring. if (i < res.size()) - memset(&res[i], revert, (res.size() - i) * sizeof(res[0])); + memset(&res[i], negate, (res.size() - i) * sizeof(res[0])); } else { - size_t size = data.size() / n; - - auto regexp = Regexps::get(pattern); + auto regexp = Regexps::get(needle); String required_substring; bool is_trivial; @@ -298,44 +303,46 @@ struct MatchImpl regexp->getAnalyzeResult(required_substring, is_trivial, required_substring_is_prefix); + const size_t haystack_size = haystack.size() / N; + if (required_substring.empty()) { if (!regexp->getRE2()) /// An empty regexp. Always matches. 
{ - if (size) - memset(res.data(), 1, size * sizeof(res[0])); + if (haystack_size) + memset(res.data(), 1, haystack_size * sizeof(res[0])); } else { size_t offset = 0; - for (size_t i = 0; i < size; ++i) + for (size_t i = 0; i < haystack_size; ++i) { - res[i] = revert + res[i] = negate ^ regexp->getRE2()->Match( - re2_st::StringPiece(reinterpret_cast(&data[offset]), n), + {reinterpret_cast(&haystack[offset]), N}, 0, - n, + N, re2_st::RE2::UNANCHORED, nullptr, 0); - offset += n; + offset += N; } } } else { - /// NOTE This almost matches with the case of LikePatternIsStrstr. + /// NOTE This almost matches with the case of likePatternIsSubstring. - const UInt8 * begin = data.data(); + const UInt8 * const begin = haystack.data(); + const UInt8 * const end = haystack.data() + haystack.size(); const UInt8 * pos = begin; - const UInt8 * end = pos + data.size(); size_t i = 0; const UInt8 * next_pos = begin; /// If required substring is larger than string size - it cannot be found. - if (required_substring.size() <= n) + if (required_substring.size() <= N) { Searcher searcher(required_substring.data(), required_substring.size(), end - pos); @@ -343,46 +350,43 @@ struct MatchImpl while (pos < end && end != (pos = searcher.search(pos, end - pos))) { /// Let's determine which index it refers to. - while (next_pos + n <= pos) + while (next_pos + N <= pos) { - res[i] = revert; - next_pos += n; + res[i] = negate; + next_pos += N; ++i; } - next_pos += n; + next_pos += N; if (pos + required_substring.size() <= next_pos) { /// And if it does not, if necessary, we check the regexp. if (is_trivial) - res[i] = !revert; + res[i] = !negate; else { - const char * str_data = reinterpret_cast(next_pos - n); + const char * str_data = reinterpret_cast(next_pos - N); /** Even in the case of `required_substring_is_prefix` use UNANCHORED check for regexp, * so that it can match when `required_substring` occurs into the string several times, * and at the first occurrence, the regexp is not a match. */ + const size_t start_pos = (required_substring_is_prefix) ? (reinterpret_cast(pos) - str_data) : 0; + const size_t end_pos = N; - if (required_substring_is_prefix) - res[i] = revert - ^ regexp->getRE2()->Match( - re2_st::StringPiece(str_data, n), - reinterpret_cast(pos) - str_data, - n, - re2_st::RE2::UNANCHORED, - nullptr, - 0); - else - res[i] = revert - ^ regexp->getRE2()->Match( - re2_st::StringPiece(str_data, n), 0, n, re2_st::RE2::UNANCHORED, nullptr, 0); + res[i] = negate + ^ regexp->getRE2()->Match( + {str_data, N}, + start_pos, + end_pos, + re2_st::RE2::UNANCHORED, + nullptr, + 0); } } else - res[i] = revert; + res[i] = negate; pos = next_pos; ++i; @@ -391,22 +395,248 @@ struct MatchImpl /// Tail, in which there can be no substring. if (i < res.size()) - memset(&res[i], revert, (res.size() - i) * sizeof(res[0])); + memset(&res[i], negate, (res.size() - i) * sizeof(res[0])); } } } - template - static void vectorVector(Args &&...) 
+ static void vectorVector( + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, + const ColumnString::Chars & needle_data, + const ColumnString::Offsets & needle_offset, + const ColumnPtr & start_pos_, + PaddedPODArray & res) { - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support non-constant needle argument", name); + const size_t haystack_size = haystack_offsets.size(); + + if (haystack_size != needle_offset.size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Function '{}' unexpectedly received a different number of haystacks and needles", name); + + if (start_pos_ != nullptr) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Function '{}' doesn't support start_pos argument", name); + + if (haystack_offsets.empty()) + return; + + String required_substr; + bool is_trivial; + bool required_substring_is_prefix; /// for `anchored` execution of the regexp. + + size_t prev_haystack_offset = 0; + size_t prev_needle_offset = 0; + + for (size_t i = 0; i < haystack_size; ++i) + { + const auto * const cur_haystack_data = &haystack_data[prev_haystack_offset]; + const size_t cur_haystack_length = haystack_offsets[i] - prev_haystack_offset - 1; + + const auto * const cur_needle_data = &needle_data[prev_needle_offset]; + const size_t cur_needle_length = needle_offset[i] - prev_needle_offset - 1; + + const auto & needle = String( + reinterpret_cast(cur_needle_data), + cur_needle_length); + + if (like && impl::likePatternIsSubstring(needle, required_substr)) + { + if (required_substr.size() > cur_haystack_length) + res[i] = negate; + else + { + Searcher searcher(required_substr.data(), required_substr.size(), cur_haystack_length); + const auto * match = searcher.search(cur_haystack_data, cur_haystack_length); + res[i] = negate + ^ (match != cur_haystack_data + cur_haystack_length); + } + } + else + { + // each row is expected to contain a different like/re2 pattern + // --> bypass the regexp cache, instead construct the pattern on-the-fly + const int flags = Regexps::buildRe2Flags(); + const auto & regexp = Regexps::Regexp(Regexps::createRegexp(needle, flags)); + + regexp.getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix); + + if (required_substr.empty()) + { + if (!regexp.getRE2()) /// An empty regexp. Always matches. + { + res[i] = 1; + } + else + { + res[i] = negate + ^ regexp.getRE2()->Match( + {reinterpret_cast(cur_haystack_data), cur_haystack_length}, + 0, + cur_haystack_length, + re2_st::RE2::UNANCHORED, + nullptr, + 0); + } + } + else + { + Searcher searcher(required_substr.data(), required_substr.size(), cur_haystack_length); + const auto * match = searcher.search(cur_haystack_data, cur_haystack_length); + + if (match == cur_haystack_data + cur_haystack_length) + { + res[i] = negate; // no match + } + else + { + if (is_trivial) + { + res[i] = !negate; // no wildcards in pattern + } + else + { + const size_t start_pos = (required_substring_is_prefix) ? 
(match - cur_haystack_data) : 0; + const size_t end_pos = cur_haystack_length; + + res[i] = negate + ^ regexp.getRE2()->Match( + {reinterpret_cast(cur_haystack_data), cur_haystack_length}, + start_pos, + end_pos, + re2_st::RE2::UNANCHORED, + nullptr, + 0); + } + } + } + } + + prev_haystack_offset = haystack_offsets[i]; + prev_needle_offset = needle_offset[i]; + } + } + + static void vectorFixedVector( + const ColumnString::Chars & haystack, + size_t N, + const ColumnString::Chars & needle_data, + const ColumnString::Offsets & needle_offset, + const ColumnPtr & start_pos_, + PaddedPODArray & res) + { + const size_t haystack_size = haystack.size()/N; + + if (haystack_size != needle_offset.size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Function '{}' unexpectedly received a different number of haystacks and needles", name); + + if (start_pos_ != nullptr) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Function '{}' doesn't support start_pos argument", name); + + if (haystack.empty()) + return; + + String required_substr; + bool is_trivial; + bool required_substring_is_prefix; // for `anchored` execution of the regexp. + + size_t prev_haystack_offset = 0; + size_t prev_needle_offset = 0; + + for (size_t i = 0; i < haystack_size; ++i) + { + const auto * const cur_haystack_data = &haystack[prev_haystack_offset]; + const size_t cur_haystack_length = N; + + const auto * const cur_needle_data = &needle_data[prev_needle_offset]; + const size_t cur_needle_length = needle_offset[i] - prev_needle_offset - 1; + + const auto & needle = String( + reinterpret_cast(cur_needle_data), + cur_needle_length); + + if (like && impl::likePatternIsSubstring(needle, required_substr)) + { + if (required_substr.size() > cur_haystack_length) + res[i] = negate; + else + { + Searcher searcher(required_substr.data(), required_substr.size(), cur_haystack_length); + const auto * match = searcher.search(cur_haystack_data, cur_haystack_length); + res[i] = negate + ^ (match != cur_haystack_data + cur_haystack_length); + } + } + else + { + // each row is expected to contain a different like/re2 pattern + // --> bypass the regexp cache, instead construct the pattern on-the-fly + const int flags = Regexps::buildRe2Flags(); + const auto & regexp = Regexps::Regexp(Regexps::createRegexp(needle, flags)); + + regexp.getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix); + + if (required_substr.empty()) + { + if (!regexp.getRE2()) /// An empty regexp. Always matches. + { + res[i] = 1; + } + else + { + res[i] = negate + ^ regexp.getRE2()->Match( + {reinterpret_cast(cur_haystack_data), cur_haystack_length}, + 0, + cur_haystack_length, + re2_st::RE2::UNANCHORED, + nullptr, + 0); + } + } + else + { + Searcher searcher(required_substr.data(), required_substr.size(), cur_haystack_length); + const auto * match = searcher.search(cur_haystack_data, cur_haystack_length); + + if (match == cur_haystack_data + cur_haystack_length) + { + res[i] = negate; // no match + } + else + { + if (is_trivial) + { + res[i] = !negate; // no wildcards in pattern + } + else + { + const size_t start_pos = (required_substring_is_prefix) ? 
(match - cur_haystack_data) : 0; + const size_t end_pos = cur_haystack_length; + + res[i] = negate + ^ regexp.getRE2()->Match( + {reinterpret_cast(cur_haystack_data), cur_haystack_length}, + start_pos, + end_pos, + re2_st::RE2::UNANCHORED, + nullptr, + 0); + } + } + } + } + prev_haystack_offset += N; + prev_needle_offset = needle_offset[i]; + } } - /// Search different needles in single haystack. template static void constantVector(Args &&...) { - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support non-constant needle argument", name); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support search with non-constant needles in constant haystack", name); } }; diff --git a/src/Functions/MultiMatchAllIndicesImpl.h b/src/Functions/MultiMatchAllIndicesImpl.h index f3e67008707..adf9e9b585f 100644 --- a/src/Functions/MultiMatchAllIndicesImpl.h +++ b/src/Functions/MultiMatchAllIndicesImpl.h @@ -11,8 +11,6 @@ #if USE_HYPERSCAN # include -#else -# include "MatchImpl.h" #endif diff --git a/src/Functions/MultiMatchAnyImpl.h b/src/Functions/MultiMatchAnyImpl.h index 747c0e5e62c..595a3c8de5b 100644 --- a/src/Functions/MultiMatchAnyImpl.h +++ b/src/Functions/MultiMatchAnyImpl.h @@ -120,7 +120,7 @@ struct MultiMatchAnyImpl memset(accum.data(), 0, accum.size()); for (size_t j = 0; j < needles.size(); ++j) { - MatchImpl::vectorConstant(haystack_data, haystack_offsets, needles[j].toString(), nullptr, accum); + MatchImpl::vectorConstant(haystack_data, haystack_offsets, needles[j].toString(), nullptr, accum); for (size_t i = 0; i < res.size(); ++i) { if constexpr (FindAny) diff --git a/src/Functions/PositionImpl.h b/src/Functions/PositionImpl.h index d3b6d74c3cd..82e58cdc643 100644 --- a/src/Functions/PositionImpl.h +++ b/src/Functions/PositionImpl.h @@ -182,19 +182,21 @@ struct PositionImpl static constexpr bool supports_start_pos = true; static constexpr auto name = Name::name; + static ColumnNumbers getArgumentsThatAreAlwaysConstant() { return {};} + using ResultType = UInt64; /// Find one substring in many strings. static void vectorConstant( - const ColumnString::Chars & data, - const ColumnString::Offsets & offsets, + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, const std::string & needle, const ColumnPtr & start_pos, PaddedPODArray & res) { - const UInt8 * begin = data.data(); + const UInt8 * const begin = haystack_data.data(); + const UInt8 * const end = haystack_data.data() + haystack_data.size(); const UInt8 * pos = begin; - const UInt8 * end = pos + data.size(); /// Current index in the array of strings. size_t i = 0; @@ -205,7 +207,7 @@ struct PositionImpl while (pos < end && end != (pos = searcher.search(pos, end - pos))) { /// Determine which index it refers to. - while (begin + offsets[i] <= pos) + while (begin + haystack_offsets[i] <= pos) { res[i] = 0; ++i; @@ -213,14 +215,14 @@ struct PositionImpl auto start = start_pos != nullptr ? start_pos->getUInt(i) : 0; /// We check that the entry does not pass through the boundaries of strings. 
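/// (The searcher scans the whole flattened buffer in one pass, so a candidate occurrence may begin inside row i yet run past that row's end; the bounds check below discards such hits instead of attributing them to either row.)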
- if (pos + needle.size() < begin + offsets[i]) + if (pos + needle.size() < begin + haystack_offsets[i]) { - auto res_pos = 1 + Impl::countChars(reinterpret_cast(begin + offsets[i - 1]), reinterpret_cast(pos)); + auto res_pos = 1 + Impl::countChars(reinterpret_cast(begin + haystack_offsets[i - 1]), reinterpret_cast(pos)); if (res_pos < start) { pos = reinterpret_cast(Impl::advancePos( reinterpret_cast(pos), - reinterpret_cast(begin + offsets[i]), + reinterpret_cast(begin + haystack_offsets[i]), start - res_pos)); continue; } @@ -230,7 +232,7 @@ struct PositionImpl { res[i] = 0; } - pos = begin + offsets[i]; + pos = begin + haystack_offsets[i]; ++i; } @@ -411,6 +413,12 @@ struct PositionImpl { throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support FixedString haystack argument", name); } + + template + static void vectorFixedVector(Args &&...) + { + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support FixedString haystack argument", name); + } }; } diff --git a/src/Functions/Regexps.h b/src/Functions/Regexps.h index 9a1938a3f32..dc94b75211c 100644 --- a/src/Functions/Regexps.h +++ b/src/Functions/Regexps.h @@ -38,7 +38,7 @@ namespace ErrorCodes namespace Regexps { - using Regexp = OptimizedRegularExpressionImpl; + using Regexp = OptimizedRegularExpressionSingleThreaded; using Pool = ObjectPoolMap; template @@ -50,6 +50,17 @@ namespace Regexps return {pattern, flags}; } + template + inline int buildRe2Flags() + { + int flags = OptimizedRegularExpression::RE_DOT_NL; + if constexpr (no_capture) + flags |= OptimizedRegularExpression::RE_NO_CAPTURE; + if constexpr (case_insensitive) + flags |= OptimizedRegularExpression::RE_CASELESS; + return flags; + } + /** Returns holder of an object from Pool. * You must hold the ownership while using the object. * In destructor, it returns the object back to the Pool for further reuse. @@ -62,14 +73,7 @@ namespace Regexps return known_regexps.get(pattern, [&pattern] { - int flags = OptimizedRegularExpression::RE_DOT_NL; - - if (no_capture) - flags |= OptimizedRegularExpression::RE_NO_CAPTURE; - - if (case_insensitive) - flags |= Regexps::Regexp::RE_CASELESS; - + const int flags = buildRe2Flags(); ProfileEvents::increment(ProfileEvents::RegexpCreated); return new Regexp{createRegexp(pattern, flags)}; }); diff --git a/src/Functions/array/CMakeLists.txt b/src/Functions/array/CMakeLists.txt index c98f4430078..9762674d6e9 100644 --- a/src/Functions/array/CMakeLists.txt +++ b/src/Functions/array/CMakeLists.txt @@ -1,7 +1,7 @@ include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake") add_headers_and_sources(clickhouse_functions_array .) 
add_library(clickhouse_functions_array ${clickhouse_functions_array_sources} ${clickhouse_functions_array_headers}) -target_link_libraries(clickhouse_functions_array PRIVATE dbms clickhouse_functions_gatherutils ch_contrib::eigen) +target_link_libraries(clickhouse_functions_array PRIVATE dbms clickhouse_functions_gatherutils) if (STRIP_DEBUG_SYMBOLS_FUNCTIONS) target_compile_options(clickhouse_functions_array PRIVATE "-g0") diff --git a/src/Functions/array/arrayDistance.cpp b/src/Functions/array/arrayDistance.cpp index a533cb2c0cc..2ef1cab4647 100644 --- a/src/Functions/array/arrayDistance.cpp +++ b/src/Functions/array/arrayDistance.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -6,8 +7,7 @@ #include #include #include - -#include +#include "base/range.h" namespace DB { @@ -18,40 +18,98 @@ namespace ErrorCodes extern const int SIZES_OF_ARRAYS_DOESNT_MATCH; } -template -struct LpDistance +struct L1Distance { - static inline String name = "L" + std::to_string(N); - template - static void compute(const Eigen::MatrixX & left, const Eigen::MatrixX & right, PaddedPODArray & array) + static inline String name = "L1"; + + template + struct State { - auto norms = (left - right).colwise().template lpNorm(); - array.reserve(norms.size()); - // array.insert() failed to work with Eigen iterators - for (auto n : norms) - array.push_back(n); + FloatType sum = 0; + }; + + template + static void accumulate(State & state, FirstArgType x, SecondArgType y) + { + state.sum += fabs(x - y); + } + + template + static ResultType finalize(const State & state) + { + return state.sum; } }; -struct LinfDistance : LpDistance +struct L2Distance { - static inline String name = "Linf"; + static inline String name = "L2"; + + template + struct State + { + FloatType sum = 0; + }; + + template + static void accumulate(State & state, FirstArgType x, SecondArgType y) + { + state.sum += (x - y) * (x - y); + } + + template + static ResultType finalize(const State & state) + { + return sqrt(state.sum); + } }; +struct LinfDistance +{ + static inline String name = "Linf"; + + template + struct State + { + FloatType dist = 0; + }; + + template + static void accumulate(State & state, FirstArgType x, SecondArgType y) + { + state.dist = fmax(state.dist, fabs(x - y)); + } + + template + static ResultType finalize(const State & state) + { + return state.dist; + } +}; struct CosineDistance { static inline String name = "Cosine"; - template - static void compute(const Eigen::MatrixX & left, const Eigen::MatrixX & right, PaddedPODArray & array) + + template + struct State { - auto prod = left.cwiseProduct(right).colwise().sum(); - auto nx = left.colwise().norm(); - auto ny = right.colwise().norm(); - auto nm = nx.cwiseProduct(ny).cwiseInverse(); - auto dist = 1.0 - prod.cwiseProduct(nm).array(); - array.reserve(dist.size()); - for (auto d : dist) - array.push_back(d); + FloatType dot_prod = 0; + FloatType x_squared = 0; + FloatType y_squared = 0; + }; + + template + static void accumulate(State & state, FirstArgType x, SecondArgType y) + { + state.dot_prod += x * y; + state.x_squared += x * x; + state.y_squared += y * y; + } + + template + static ResultType finalize(const State & state) + { + return 1 - state.dot_prod / sqrt(state.x_squared * state.y_squared); } }; @@ -102,144 +160,197 @@ public: } ColumnPtr - executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override + executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & 
result_type, size_t input_rows_count) const override
     {
-        DataTypePtr type_x = typeid_cast(arguments[0].type.get())->getNestedType();
-        DataTypePtr type_y = typeid_cast(arguments[1].type.get())->getNestedType();
-
-        ColumnPtr col_x = arguments[0].column->convertToFullColumnIfConst();
-        ColumnPtr col_y = arguments[1].column->convertToFullColumnIfConst();
-
-        const auto * arr_x = assert_cast(col_x.get());
-        const auto * arr_y = assert_cast(col_y.get());
-
-        auto result = result_type->createColumn();
         switch (result_type->getTypeId())
         {
             case TypeIndex::Float32:
-                executeWithType(*arr_x, *arr_y, type_x, type_y, result);
+                return executeWithResultType(arguments, input_rows_count);
                 break;
             case TypeIndex::Float64:
-                executeWithType(*arr_x, *arr_y, type_x, type_y, result);
+                return executeWithResultType(arguments, input_rows_count);
                 break;
             default:
                 throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected result type.");
         }
-        return result;
     }
+
+#define SUPPORTED_TYPES(action) \
+    action(UInt8) \
+    action(UInt16) \
+    action(UInt32) \
+    action(UInt64) \
+    action(Int8) \
+    action(Int16) \
+    action(Int32) \
+    action(Int64) \
+    action(Float32) \
+    action(Float64)
+
+
 private:
-    template
-    void executeWithType(
-        const ColumnArray & array_x,
-        const ColumnArray & array_y,
-        const DataTypePtr & type_x,
-        const DataTypePtr & type_y,
-        MutableColumnPtr & column) const
+    template
+    ColumnPtr executeWithResultType(const ColumnsWithTypeAndName & arguments, size_t input_rows_count) const
     {
-        Eigen::MatrixX mx, my;
-        columnToMatrix(array_x, type_x, mx);
-        columnToMatrix(array_y, type_y, my);
+        DataTypePtr type_x = typeid_cast(arguments[0].type.get())->getNestedType();

-        if (mx.rows() && my.rows() && mx.rows() != my.rows())
+        /// Dynamic dispatch based on the 1st argument type
         switch (type_x->getTypeId())
         {
-            throw Exception(
-                ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH,
-                "Arguments of function {} have different array sizes: {} and {}",
-                getName(), mx.rows(), my.rows());
-        }
-        auto & data = assert_cast &>(*column).getData();
-        Kernel::compute(mx, my, data);
-    }
+        #define ON_TYPE(type) \
+            case TypeIndex::type: \
+                return executeWithFirstType(arguments, input_rows_count); \
+                break;

-    template
-    void columnToMatrix(const ColumnArray & array, const DataTypePtr & nested_type, Eigen::MatrixX & mat) const
-    {
-        const auto & offsets = array.getOffsets();
-        size_t cols = offsets.size();
-        size_t rows = cols > 0 ? offsets.front() : 0;
+        SUPPORTED_TYPES(ON_TYPE)
+        #undef ON_TYPE

-        ColumnArray::Offset prev = 0;
-        for (ColumnArray::Offset off : offsets)
-        {
-            if (off - prev != rows)
-                throw Exception(
-                    ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH,
-                    "Arrays in a column passed to function {} have different sizes: {} and {}",
-                    getName(), rows, off - prev);
-            prev = off;
-        }
-
-        switch (nested_type->getTypeId())
-        {
-            case TypeIndex::UInt8:
-                fillMatrix(mat, array, rows, cols);
-                break;
-            case TypeIndex::UInt16:
-                fillMatrix(mat, array, rows, cols);
-                break;
-            case TypeIndex::UInt32:
-                fillMatrix(mat, array, rows, cols);
-                break;
-            case TypeIndex::UInt64:
-                fillMatrix(mat, array, rows, cols);
-                break;
-            case TypeIndex::Int8:
-                fillMatrix(mat, array, rows, cols);
-                break;
-            case TypeIndex::Int16:
-                fillMatrix(mat, array, rows, cols);
-                break;
-            case TypeIndex::Int32:
-                fillMatrix(mat, array, rows, cols);
-                break;
-            case TypeIndex::Int64:
-                fillMatrix(mat, array, rows, cols);
-                break;
-            case TypeIndex::Float32:
-                fillMatrix(mat, array, rows, cols);
-                break;
-            case TypeIndex::Float64:
-                fillMatrix(mat, array, rows, cols);
                 break;
             default:
                 throw Exception(
                     ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
                     "Arguments of function {} has nested type {}. "
                     "Support: UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64.",
-                    getName(), nested_type->getName());
+                    getName(), type_x->getName());
         }
     }

-    // optimize for float/ double
-    template
-    requires std::is_same_v
-    void fillMatrix(Eigen::MatrixX & mat, const ColumnArray & array, size_t rows, size_t cols) const
+    template
+    ColumnPtr executeWithFirstType(const ColumnsWithTypeAndName & arguments, size_t input_rows_count) const
     {
-        const auto & data = typeid_cast &>(array.getData()).getData();
-        mat = Eigen::Map>(data.data(), rows, cols);
+        DataTypePtr type_y = typeid_cast(arguments[1].type.get())->getNestedType();
+
+        /// Dynamic dispatch based on the 2nd argument type
+        switch (type_y->getTypeId())
+        {
+        #define ON_TYPE(type) \
+            case TypeIndex::type: \
+                return executeWithTypes(arguments[0].column, arguments[1].column, input_rows_count); \
+                break;
+
+        SUPPORTED_TYPES(ON_TYPE)
+        #undef ON_TYPE
+
+            default:
+                throw Exception(
+                    ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                    "Arguments of function {} has nested type {}. "
+                    "Support: UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64.",
+                    getName(), type_y->getName());
         }
     }

-    template
-    void fillMatrix(Eigen::MatrixX & mat, const ColumnArray & array, size_t rows, size_t cols) const
+    template
+    ColumnPtr executeWithTypes(ColumnPtr col_x, ColumnPtr col_y, size_t input_rows_count) const
     {
-        const auto & data = typeid_cast &>(array.getData()).getData();
-        mat.resize(rows, cols);
-        for (size_t col = 0; col < cols; ++col)
+        if (typeid_cast(col_x.get()))
         {
-            for (size_t row = 0; row < rows; ++row)
+            return executeWithTypesFirstArgConst(col_x, col_y, input_rows_count);
+        }
+        else if (typeid_cast(col_y.get()))
+        {
+            return executeWithTypesFirstArgConst(col_y, col_x, input_rows_count);
+        }
+
+        col_x = col_x->convertToFullColumnIfConst();
+        col_y = col_y->convertToFullColumnIfConst();
+
+        const auto & array_x = *assert_cast(col_x.get());
+        const auto & array_y = *assert_cast(col_y.get());
+
+        const auto & data_x = typeid_cast &>(array_x.getData()).getData();
+        const auto & data_y = typeid_cast &>(array_y.getData()).getData();
+
+        const auto & offsets_x = array_x.getOffsets();
+        const auto & offsets_y = array_y.getOffsets();
+
+        /// Check that arrays in both columns are the same size
+        for (size_t row = 0; row < offsets_x.size(); ++row)
+        {
+            if (unlikely(offsets_x[row] != offsets_y[row]))
             {
-                size_t off = col * rows;
-                mat(row, col) = static_cast(data[off + row]);
+                ColumnArray::Offset prev_offset = row > 0 ? offsets_x[row - 1] : 0;
+                throw Exception(
+                    ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH,
+                    "Arguments of function {} have different array sizes: {} and {}",
+                    getName(), offsets_x[row] - prev_offset, offsets_y[row] - prev_offset);
             }
         }
+
+        auto result = ColumnVector::create(input_rows_count);
+        auto & result_data = result->getData();
+
+        /// Do the actual computation
+        ColumnArray::Offset prev = 0;
+        size_t row = 0;
+        for (auto off : offsets_x)
+        {
+            typename Kernel::template State state;
+            for (; prev < off; ++prev)
+            {
+                Kernel::accumulate(state, data_x[prev], data_y[prev]);
+            }
+            result_data[row] = Kernel::finalize(state);
+            row++;
+        }
+        return result;
     }
+
+    /// Special case when the 1st parameter is Const
+    template
+    ColumnPtr executeWithTypesFirstArgConst(ColumnPtr col_x, ColumnPtr col_y, size_t input_rows_count) const
+    {
+        col_x = assert_cast(col_x.get())->getDataColumnPtr();
+        col_y = col_y->convertToFullColumnIfConst();
+
+        const auto & array_x = *assert_cast(col_x.get());
+        const auto & array_y = *assert_cast(col_y.get());
+
+        const auto & data_x = typeid_cast &>(array_x.getData()).getData();
+        const auto & data_y = typeid_cast &>(array_y.getData()).getData();
+
+        const auto & offsets_x = array_x.getOffsets();
+        const auto & offsets_y = array_y.getOffsets();
+
+        /// Check that arrays in both columns are the same size
+        ColumnArray::Offset prev_offset = 0;
+        for (size_t row : collections::range(0, offsets_y.size()))
+        {
+            if (unlikely(offsets_x[0] != offsets_y[row] - prev_offset))
+            {
+                throw Exception(
+                    ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH,
+                    "Arguments of function {} have different array sizes: {} and {}",
+                    getName(), offsets_x[0], offsets_y[row] - prev_offset);
+            }
+            prev_offset = offsets_y[row];
+        }
+
+        auto result = ColumnVector::create(input_rows_count);
+        auto & result_data = result->getData();
+
+        /// Do the actual computation
+        ColumnArray::Offset prev = 0;
+        size_t row = 0;
+        for (auto off : offsets_y)
+        {
+            typename Kernel::template State state;
+            for (size_t i = 0; prev < off; ++i, ++prev)
+            {
Kernel::accumulate(state, data_x[i], data_y[prev]); + } + result_data[row] = Kernel::finalize(state); + row++; + } + return result; + } + }; void registerFunctionArrayDistance(FunctionFactory & factory) { - factory.registerFunction>>(); - factory.registerFunction>>(); + factory.registerFunction>(); + factory.registerFunction>(); factory.registerFunction>(); factory.registerFunction>(); } diff --git a/src/Functions/array/arrayNorm.cpp b/src/Functions/array/arrayNorm.cpp index 20fe85d7491..587c65a49ca 100644 --- a/src/Functions/array/arrayNorm.cpp +++ b/src/Functions/array/arrayNorm.cpp @@ -1,4 +1,6 @@ +#include #include +#include #include #include #include @@ -7,8 +9,6 @@ #include #include -#include - namespace DB { namespace ErrorCodes @@ -17,26 +17,59 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -template -struct LpNorm +struct L1Norm { - static inline String name = "L" + std::to_string(N); - template - static void compute(const std::vector> & vec, PaddedPODArray & array) + static inline String name = "L1"; + + template + inline static ResultType accumulate(ResultType result, ArgumentType value) { - array.reserve(vec.size()); - for (const auto & v : vec) - { - array.push_back(v.template lpNorm()); - } + return result + fabs(value); + } + + template + inline static ResultType finalize(ResultType result) + { + return result; } }; -struct LinfNorm : LpNorm +struct L2Norm +{ + static inline String name = "L2"; + + template + inline static ResultType accumulate(ResultType result, ArgumentType value) + { + return result + value * value; + } + + template + inline static ResultType finalize(ResultType result) + { + return sqrt(result); + } +}; + + +struct LinfNorm { static inline String name = "Linf"; + + template + inline static ResultType accumulate(ResultType result, ArgumentType value) + { + return fmax(result, fabs(value)); + } + + template + inline static ResultType finalize(ResultType result) + { + return result; + } }; + template class FunctionArrayNorm : public IFunction { @@ -84,72 +117,53 @@ public: } ColumnPtr - executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override + executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override { DataTypePtr type = typeid_cast(arguments[0].type.get())->getNestedType(); ColumnPtr column = arguments[0].column->convertToFullColumnIfConst(); const auto * arr = assert_cast(column.get()); - auto result = result_type->createColumn(); switch (result_type->getTypeId()) { case TypeIndex::Float32: - executeWithType(*arr, type, result); + return executeWithResultType(*arr, type, input_rows_count); break; case TypeIndex::Float64: - executeWithType(*arr, type, result); + return executeWithResultType(*arr, type, input_rows_count); break; default: throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected result type."); } - return result; } private: - template - void executeWithType(const ColumnArray & array, const DataTypePtr & type, MutableColumnPtr & column) const - { - std::vector> vec; - columnToVectors(array, type, vec); - auto & data = assert_cast &>(*column).getData(); - Kernel::compute(vec, data); - } - template - void columnToVectors(const ColumnArray & array, const DataTypePtr & nested_type, std::vector> & vec) const +#define SUPPORTED_TYPES(action) \ + action(UInt8) \ + action(UInt16) \ + action(UInt32) \ + action(UInt64) \ + action(Int8) \ + action(Int16) \ + action(Int32) \ + action(Int64) \ + 
action(Float32) \ + action(Float64) + + + template + ColumnPtr executeWithResultType(const ColumnArray & array, const DataTypePtr & nested_type, size_t input_rows_count) const { switch (nested_type->getTypeId()) { - case TypeIndex::UInt8: - fillVectors(vec, array); - break; - case TypeIndex::UInt16: - fillVectors(vec, array); - break; - case TypeIndex::UInt32: - fillVectors(vec, array); - break; - case TypeIndex::UInt64: - fillVectors(vec, array); - break; - case TypeIndex::Int8: - fillVectors(vec, array); - break; - case TypeIndex::Int16: - fillVectors(vec, array); - break; - case TypeIndex::Int32: - fillVectors(vec, array); - break; - case TypeIndex::Int64: - fillVectors(vec, array); - break; - case TypeIndex::Float32: - fillVectors(vec, array); - break; - case TypeIndex::Float64: - fillVectors(vec, array); + #define ON_TYPE(type) \ + case TypeIndex::type: \ + return executeWithTypes(array, input_rows_count); \ break; + + SUPPORTED_TYPES(ON_TYPE) + #undef ON_TYPE + default: throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, @@ -159,46 +173,35 @@ private: } } - template - requires std::is_same_v - void fillVectors(std::vector> & vec, const ColumnArray & array) const + template + static ColumnPtr executeWithTypes(const ColumnArray & array, size_t input_rows_count) { - const auto & data = typeid_cast &>(array.getData()).getData(); + const auto & data = typeid_cast &>(array.getData()).getData(); const auto & offsets = array.getOffsets(); - vec.reserve(offsets.size()); - ColumnArray::Offset prev = 0; - for (auto off : offsets) - { - vec.emplace_back(Eigen::Map>(data.data() + prev, off - prev)); - prev = off; - } - } - template - void fillVectors(std::vector> & vec, const ColumnArray & array) const - { - const auto & data = typeid_cast &>(array.getData()).getData(); - const auto & offsets = array.getOffsets(); - vec.reserve(offsets.size()); + auto result_col = ColumnVector::create(input_rows_count); + auto & result_data = result_col->getData(); ColumnArray::Offset prev = 0; + size_t row = 0; for (auto off : offsets) { - Eigen::VectorX mat(off - prev); - for (ColumnArray::Offset row = 0; row + prev < off; ++row) + Float64 result = 0; + for (; prev < off; ++prev) { - mat[row] = static_cast(data[prev + row]); + result = Kernel::accumulate(result, data[prev]); } - prev = off; - vec.emplace_back(mat); + result_data[row] = Kernel::finalize(result); + row++; } + return result_col; } }; void registerFunctionArrayNorm(FunctionFactory & factory) { - factory.registerFunction>>(); - factory.registerFunction>>(); + factory.registerFunction>(); + factory.registerFunction>(); factory.registerFunction>(); } diff --git a/src/Functions/generateUUIDv4.cpp b/src/Functions/generateUUIDv4.cpp index 659c4c2c7c6..e4ecf5358f9 100644 --- a/src/Functions/generateUUIDv4.cpp +++ b/src/Functions/generateUUIDv4.cpp @@ -5,6 +5,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + #define DECLARE_SEVERAL_IMPLEMENTATIONS(...) 
\ DECLARE_DEFAULT_CODE (__VA_ARGS__) \ DECLARE_AVX2_SPECIFIC_CODE(__VA_ARGS__) @@ -23,10 +28,18 @@ public: size_t getNumberOfArguments() const override { return 0; } + bool isDeterministicInScopeOfQuery() const override { return false; } + bool useDefaultImplementationForNulls() const override { return false; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + bool isVariadic() const override { return true; } - DataTypePtr getReturnTypeImpl(const DataTypes &) const override + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { + if (arguments.size() > 1) + throw Exception("Number of arguments for function " + getName() + " doesn't match: passed " + + toString(arguments.size()) + ", should be 0 or 1.", + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + return std::make_shared(); } diff --git a/src/Functions/ilike.cpp b/src/Functions/ilike.cpp index 116c945e04f..1222cc48d07 100644 --- a/src/Functions/ilike.cpp +++ b/src/Functions/ilike.cpp @@ -12,7 +12,7 @@ struct NameILike static constexpr auto name = "ilike"; }; -using ILikeImpl = MatchImpl; +using ILikeImpl = MatchImpl; using FunctionILike = FunctionsStringSearch; } diff --git a/src/Functions/like.h b/src/Functions/like.h index a00891ec64c..edb738d393b 100644 --- a/src/Functions/like.h +++ b/src/Functions/like.h @@ -11,7 +11,7 @@ struct NameLike static constexpr auto name = "like"; }; -using LikeImpl = MatchImpl; +using LikeImpl = MatchImpl; using FunctionLike = FunctionsStringSearch; } diff --git a/src/Functions/match.cpp b/src/Functions/match.cpp index 69dc1a3d99a..4c329701464 100644 --- a/src/Functions/match.cpp +++ b/src/Functions/match.cpp @@ -13,7 +13,7 @@ struct NameMatch static constexpr auto name = "match"; }; -using FunctionMatch = FunctionsStringSearch>; +using FunctionMatch = FunctionsStringSearch>; } diff --git a/src/Functions/normalizeString.cpp b/src/Functions/normalizeString.cpp index 8989e56d0d3..9b1d1292d2c 100644 --- a/src/Functions/normalizeString.cpp +++ b/src/Functions/normalizeString.cpp @@ -106,7 +106,7 @@ struct NormalizeUTF8Impl size_t from_size = offsets[i] - current_from_offset - 1; from_uchars.resize(from_size + 1); - int32_t from_code_points; + int32_t from_code_points = 0; u_strFromUTF8( from_uchars.data(), from_uchars.size(), @@ -133,7 +133,7 @@ struct NormalizeUTF8Impl if (res_data.size() < max_to_size) res_data.resize(max_to_size); - int32_t to_size; + int32_t to_size = 0; u_strToUTF8( reinterpret_cast(&res_data[current_to_offset]), res_data.size() - current_to_offset, @@ -151,6 +151,8 @@ struct NormalizeUTF8Impl current_from_offset = offsets[i]; } + + res_data.resize(current_to_offset); } [[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &) diff --git a/src/Functions/notILike.cpp b/src/Functions/notILike.cpp index be40e2b989e..b5e06ac55f4 100644 --- a/src/Functions/notILike.cpp +++ b/src/Functions/notILike.cpp @@ -12,7 +12,7 @@ struct NameNotILike static constexpr auto name = "notILike"; }; -using NotILikeImpl = MatchImpl; +using NotILikeImpl = MatchImpl; using FunctionNotILike = FunctionsStringSearch; } diff --git a/src/Functions/notLike.cpp b/src/Functions/notLike.cpp index 7c4ea8ab2dc..7fa1b6f9122 100644 --- a/src/Functions/notLike.cpp +++ b/src/Functions/notLike.cpp @@ -12,7 +12,7 @@ struct NameNotLike static constexpr auto name = "notLike"; }; -using FunctionNotLike = FunctionsStringSearch>; +using FunctionNotLike = FunctionsStringSearch>; } diff --git 
a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index 69500a5efe4..21e943b36ef 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -256,6 +256,7 @@ void readString(String & s, ReadBuffer & buf) template void readStringInto>(PaddedPODArray & s, ReadBuffer & buf); template void readStringInto(String & s, ReadBuffer & buf); +template void readStringInto(NullOutput & s, ReadBuffer & buf); template void readStringUntilEOFInto(Vector & s, ReadBuffer & buf) @@ -617,6 +618,12 @@ void readBackQuotedStringWithSQLStyle(String & s, ReadBuffer & buf) readBackQuotedStringInto(s, buf); } +template +concept WithResize = requires (T value) +{ + { value.resize(1) }; + { value.size() } -> std::integral<>; +}; template void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV & settings) @@ -700,16 +707,18 @@ void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV & if (!buf.hasPendingData()) continue; - /** CSV format can contain insignificant spaces and tabs. - * Usually the task of skipping them is for the calling code. - * But in this case, it will be difficult to do this, so remove the trailing whitespace by ourself. - */ - size_t size = s.size(); - while (size > 0 - && (s[size - 1] == ' ' || s[size - 1] == '\t')) - --size; + if constexpr (WithResize) + { + /** CSV format can contain insignificant spaces and tabs. + * Usually the task of skipping them is for the calling code. + * But in this case, it will be difficult to do this, so remove the trailing whitespace by ourself. + */ + size_t size = s.size(); + while (size > 0 && (s[size - 1] == ' ' || s[size - 1] == '\t')) + --size; - s.resize(size); + s.resize(size); + } return; } } @@ -741,6 +750,7 @@ void readCSVField(String & s, ReadBuffer & buf, const FormatSettings::CSV & sett } template void readCSVStringInto>(PaddedPODArray & s, ReadBuffer & buf, const FormatSettings::CSV & settings); +template void readCSVStringInto(NullOutput & s, ReadBuffer & buf, const FormatSettings::CSV & settings); template @@ -1313,8 +1323,8 @@ void skipToNextRowOrEof(PeekableReadBuffer & buf, const String & row_after_delim } // Use PeekableReadBuffer to copy field to string after parsing. 
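The comment above is the heart of this refactoring: parse the field once to find where it ends, then rewind and copy the consumed bytes verbatim. A sketch of the complete pattern, with the middle of the function (elided by the hunk below) reconstructed from the calls that are visible in this diff; treat the exact PeekableReadBuffer usage as a corroborated assumption rather than a verified API listing:

template <typename Vector, typename ParseFunc>
static void readParsedValueInto(Vector & s, ReadBuffer & buf, ParseFunc parse_func)
{
    PeekableReadBuffer peekable_buf(buf);   /// wraps buf so consumed bytes stay addressable
    peekable_buf.setCheckpoint();           /// remember where the field starts
    parse_func(peekable_buf);               /// run the real parser; it only advances the position
    auto * end = peekable_buf.position();   /// one past the last byte of the field
    peekable_buf.rollbackToCheckpoint();    /// jump back to the checkpoint...
    s.append(peekable_buf.position(), end); /// ...and copy the raw bytes as a single range
    peekable_buf.position() = end;          /// finally consume them for real
}

This also explains the new NullOutput::append(const char *, const char *) overload added to ReadHelpers.h below: with a NullOutput sink the copy step degenerates to a no-op, so the same template doubles as a "skip field" routine.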
-template -static void readParsedValueIntoString(String & s, ReadBuffer & buf, ParseFunc parse_func) +template +static void readParsedValueInto(Vector & s, ReadBuffer & buf, ParseFunc parse_func) { PeekableReadBuffer peekable_buf(buf); peekable_buf.setCheckpoint(); @@ -1326,8 +1336,8 @@ static void readParsedValueIntoString(String & s, ReadBuffer & buf, ParseFunc pa peekable_buf.position() = end; } -template -static void readQuotedFieldInBrackets(String & s, ReadBuffer & buf) +template +static void readQuotedFieldInBracketsInto(Vector & s, ReadBuffer & buf) { assertChar(opening_bracket, buf); s.push_back(opening_bracket); @@ -1363,10 +1373,9 @@ static void readQuotedFieldInBrackets(String & s, ReadBuffer & buf) } } -void readQuotedFieldIntoString(String & s, ReadBuffer & buf) +template +void readQuotedFieldInto(Vector & s, ReadBuffer & buf) { - s.clear(); - if (buf.eof()) return; @@ -1386,11 +1395,11 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf) s.push_back('\''); } else if (*buf.position() == '[') - readQuotedFieldInBrackets<'[', ']'>(s, buf); + readQuotedFieldInBracketsInto<'[', ']'>(s, buf); else if (*buf.position() == '(') - readQuotedFieldInBrackets<'(', ')'>(s, buf); + readQuotedFieldInBracketsInto<'(', ')'>(s, buf); else if (*buf.position() == '{') - readQuotedFieldInBrackets<'{', '}'>(s, buf); + readQuotedFieldInBracketsInto<'{', '}'>(s, buf); else if (checkCharCaseInsensitive('n', buf)) { /// NULL or NaN @@ -1423,14 +1432,23 @@ void readQuotedFieldIntoString(String & s, ReadBuffer & buf) Float64 tmp; readFloatText(tmp, in); }; - readParsedValueIntoString(s, buf, parse_func); + readParsedValueInto(s, buf, parse_func); } } -void readJSONFieldIntoString(String & s, ReadBuffer & buf) +template void readQuotedFieldInto(NullOutput & s, ReadBuffer & buf); + +void readQuotedField(String & s, ReadBuffer & buf) { + s.clear(); + readQuotedFieldInto(s, buf); +} + +void readJSONField(String & s, ReadBuffer & buf) +{ + s.clear(); auto parse_func = [](ReadBuffer & in) { skipJSONField(in, "json_field"); }; - readParsedValueIntoString(s, buf, parse_func); + readParsedValueInto(s, buf, parse_func); } } diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index c5ffa52c9b3..e11c57947d1 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -618,6 +618,8 @@ void readStringUntilNewlineInto(Vector & s, ReadBuffer & buf); struct NullOutput { void append(const char *, size_t) {} + void append(const char *) {} + void append(const char *, const char *) {} void push_back(char) {} /// NOLINT }; @@ -931,12 +933,29 @@ inline ReturnType readDateTimeTextImpl(DateTime64 & datetime64, UInt32 scale, Re ++buf.position(); /// Keep sign of fractional part the same with whole part if datetime64 is negative - /// 1965-12-12 12:12:12.123 => whole = -127914468, fraction = 123(sign>0) -> new whole = -127914467, new fraction = 877(sign<0) + /// Case1: + /// 1965-12-12 12:12:12.123 + /// => whole = -127914468, fractional = 123(coefficient>0) + /// => new whole = -127914467, new fractional = 877(coefficient<0) + /// + /// Case2: + /// 1969-12-31 23:59:59.123 + /// => whole = -1, fractional = 123(coefficient>0) + /// => new whole = 0, new fractional = -877(coefficient>0) if (components.whole < 0 && components.fractional != 0) { const auto scale_multiplier = DecimalUtils::scaleMultiplier(scale); ++components.whole; - components.fractional = scale_multiplier - components.fractional; + if (components.whole) + { + /// whole keep the sign, fractional should be non-negative + components.fractional = 
scale_multiplier - components.fractional; + } + else + { + /// when whole is zero, fractional should keep the sign + components.fractional = components.fractional - scale_multiplier; + } } } /// 9908870400 is time_t value for 2184-01-01 UTC (a bit over the last year supported by DateTime64) @@ -1425,8 +1444,11 @@ struct PcgDeserializer } }; -void readQuotedFieldIntoString(String & s, ReadBuffer & buf); +template +void readQuotedFieldInto(Vector & s, ReadBuffer & buf); -void readJSONFieldIntoString(String & s, ReadBuffer & buf); +void readQuotedField(String & s, ReadBuffer & buf); + +void readJSONField(String & s, ReadBuffer & buf); } diff --git a/src/IO/WriteHelpers.h b/src/IO/WriteHelpers.h index 8547a0af1cd..5eab75f14b1 100644 --- a/src/IO/WriteHelpers.h +++ b/src/IO/WriteHelpers.h @@ -805,11 +805,21 @@ inline void writeDateTimeText(DateTime64 datetime64, UInt32 scale, WriteBuffer & scale = scale > MaxScale ? MaxScale : scale; auto components = DecimalUtils::split(datetime64, scale); - /// -127914467.877 => whole = -127914467, fraction = 877 => new whole = -127914468(1965-12-12 12:12:12), new fraction = 123(.123) => 1965-12-12 12:12:12.123 - if (components.whole < 0 && components.fractional != 0) + /// Case1: + /// -127914467.877 + /// => whole = -127914467, fraction = 877(After DecimalUtils::split) + /// => new whole = -127914468(1965-12-12 12:12:12), new fraction = 1000 - 877 = 123(.123) + /// => 1965-12-12 12:12:12.123 + /// + /// Case2: + /// -0.877 + /// => whole = 0, fractional = -877(After DecimalUtils::split) + /// => whole = -1(1969-12-31 23:59:59), fractional = 1000 + (-877) = 123(.123) + using T = typename DateTime64::NativeType; + if (datetime64.value < 0 && components.fractional) { + components.fractional = DecimalUtils::scaleMultiplier(scale) + (components.whole ? 
T(-1) : T(1)) * components.fractional; --components.whole; - components.fractional = DecimalUtils::scaleMultiplier(scale) - components.fractional; } writeDateTimeText(LocalDateTime(components.whole, time_zone), buf); @@ -989,7 +999,12 @@ void writeText(Decimal x, UInt32 scale, WriteBuffer & ostr, bool trailing_zer { part = DecimalUtils::getFractionalPart(x, scale); if (part || trailing_zeros) + { + if (part < 0) + part *= T(-1); + writeDecimalFractional(part, scale, ostr, trailing_zeros); + } } } diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index f796a55ff72..2fc9b51674f 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -997,8 +997,8 @@ void ActionsDAG::addMaterializingOutputActions() const ActionsDAG::Node & ActionsDAG::materializeNode(const Node & node) { - FunctionOverloadResolverPtr func_builder_materialize = std::make_unique( - std::make_shared()); + FunctionOverloadResolverPtr func_builder_materialize + = std::make_unique(std::make_shared()); const auto & name = node.result_name; const auto * func = &addFunction(func_builder_materialize, {&node}, {}); @@ -1102,7 +1102,8 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions( const auto * left_arg = dst_node; FunctionCastBase::Diagnostic diagnostic = {dst_node->result_name, res_elem.name}; - FunctionOverloadResolverPtr func_builder_cast = CastInternalOverloadResolver::createImpl(std::move(diagnostic)); + FunctionOverloadResolverPtr func_builder_cast + = CastInternalOverloadResolver::createImpl(std::move(diagnostic)); NodeRawConstPtrs children = { left_arg, right_arg }; dst_node = &actions_dag->addFunction(func_builder_cast, std::move(children), {}); @@ -1150,7 +1151,8 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions( ActionsDAGPtr ActionsDAG::makeAddingColumnActions(ColumnWithTypeAndName column) { auto adding_column_action = std::make_shared(); - FunctionOverloadResolverPtr func_builder_materialize = std::make_unique(std::make_shared()); + FunctionOverloadResolverPtr func_builder_materialize + = std::make_unique(std::make_shared()); auto column_name = column.name; const auto * column_node = &adding_column_action->addColumn(std::move(column)); @@ -1612,7 +1614,7 @@ ConjunctionNodes getConjunctionNodes(ActionsDAG::Node * predicate, std::unordere std::stack stack; std::unordered_set visited_nodes; - stack.push(Frame{.node = predicate}); + stack.push({.node = predicate}); visited_nodes.insert(predicate); while (!stack.empty()) { @@ -1798,9 +1800,8 @@ ActionsDAGPtr ActionsDAG::cloneActionsForFilterPushDown( { Node * predicate = const_cast(tryFindInIndex(filter_name)); if (!predicate) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Index for ActionsDAG does not contain filter column name {}. DAG:\n{}", - filter_name, dumpDAG()); + throw Exception( + ErrorCodes::LOGICAL_ERROR, "Index for ActionsDAG does not contain filter column name {}. DAG:\n{}", filter_name, dumpDAG()); /// If condition is constant let's do nothing. /// It means there is nothing to push down or optimization was already applied. 
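The fractional-part sign juggling in the ReadHelpers.h and WriteHelpers.h hunks above is easiest to verify with concrete numbers. A self-contained re-derivation of the two commented cases at scale 3 (multiplier 1000); the split step mimics the behaviour described in the comments rather than calling the real DecimalUtils:

#include <cassert>
#include <cstdint>

int main()
{
    auto check = [](int64_t value, int64_t expected_whole, int64_t expected_fractional)
    {
        /// Split as documented above: the fractional part comes out as a
        /// magnitude unless the whole part is zero (assumed split semantics).
        int64_t whole = value / 1000;
        int64_t fractional = value % 1000;
        if (whole != 0 && fractional < 0)
            fractional = -fractional;

        /// The adjustment from writeDateTimeText: move one whole second down,
        /// and flip the fraction so it counts forward from that second.
        if (value < 0 && fractional)
        {
            fractional = 1000 + (whole ? -1 : 1) * fractional;
            --whole;
        }
        assert(whole == expected_whole && fractional == expected_fractional);
    };

    check(-127914467877, -127914468, 123); /// Case 1: non-zero whole part (1965-12-12 12:12:12.123)
    check(-877, -1, 123);                  /// Case 2: zero whole part (1969-12-31 23:59:59.123)
}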
@@ -1870,8 +1871,6 @@ ActionsDAGPtr ActionsDAG::cloneActionsForFilterPushDown( index_node = new_predicate; } } - - removeUnusedActions(false); } else { @@ -1926,10 +1925,9 @@ ActionsDAGPtr ActionsDAG::cloneActionsForFilterPushDown( predicate->function_base = predicate->function_builder->build(arguments); predicate->function = predicate->function_base->prepare(arguments); } - - removeUnusedActions(false); } + removeUnusedActions(false); return actions; } diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 4f951d69349..409b28a166d 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -342,8 +342,6 @@ struct ContextSharedPart /// Stop periodic reloading of the configuration files. /// This must be done first because otherwise the reloading may pass a changed config /// to some destroyed parts of ContextSharedPart. - if (access_control) - access_control->stopPeriodicReloadingUsersConfigs(); if (external_dictionaries_loader) external_dictionaries_loader->enablePeriodicUpdates(false); if (external_user_defined_executable_functions_loader) diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 0e12c5e9e5a..2b2de84c314 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -287,7 +287,7 @@ void DDLWorker::scheduleTasks(bool reinitialized) Strings queue_nodes = zookeeper->getChildren(queue_dir, &queue_node_stat, queue_updated_event); size_t size_before_filtering = queue_nodes.size(); filterAndSortQueueNodes(queue_nodes); - /// The following message is too verbose, but it can be useful too debug mysterious test failures in CI + /// The following message is too verbose, but it can be useful to debug mysterious test failures in CI LOG_TRACE(log, "scheduleTasks: initialized={}, size_before_filtering={}, queue_size={}, " "entries={}..{}, " "first_failed_task_name={}, current_tasks_size={}, " diff --git a/src/Interpreters/ExpressionActions.cpp b/src/Interpreters/ExpressionActions.cpp index 2da53a2e258..2e96ffe7947 100644 --- a/src/Interpreters/ExpressionActions.cpp +++ b/src/Interpreters/ExpressionActions.cpp @@ -1041,7 +1041,7 @@ void ExpressionActionsChain::ArrayJoinStep::finalize(const NameSet & required_ou ExpressionActionsChain::JoinStep::JoinStep( std::shared_ptr analyzed_join_, JoinPtr join_, - ColumnsWithTypeAndName required_columns_) + const ColumnsWithTypeAndName & required_columns_) : Step({}) , analyzed_join(std::move(analyzed_join_)) , join(std::move(join_)) @@ -1049,11 +1049,8 @@ ExpressionActionsChain::JoinStep::JoinStep( for (const auto & column : required_columns_) required_columns.emplace_back(column.name, column.type); - NamesAndTypesList result_names_and_types = required_columns; - analyzed_join->addJoinedColumnsAndCorrectTypes(result_names_and_types, true); - for (const auto & [name, type] : result_names_and_types) - /// `column` is `nullptr` because we don't care on constness here, it may be changed in join - result_columns.emplace_back(nullptr, type, name); + result_columns = required_columns_; + analyzed_join->addJoinedColumnsAndCorrectTypes(result_columns, true); } void ExpressionActionsChain::JoinStep::finalize(const NameSet & required_output_) diff --git a/src/Interpreters/ExpressionActions.h b/src/Interpreters/ExpressionActions.h index c942f33b6df..332ae941bba 100644 --- a/src/Interpreters/ExpressionActions.h +++ b/src/Interpreters/ExpressionActions.h @@ -233,7 +233,7 @@ struct ExpressionActionsChain : WithContext NamesAndTypesList required_columns; 
ColumnsWithTypeAndName result_columns; - JoinStep(std::shared_ptr<TableJoin> analyzed_join_, JoinPtr join_, ColumnsWithTypeAndName required_columns_); + JoinStep(std::shared_ptr<TableJoin> analyzed_join_, JoinPtr join_, const ColumnsWithTypeAndName & required_columns_); NamesAndTypesList getRequiredColumns() const override { return required_columns; } ColumnsWithTypeAndName getResultColumns() const override { return result_columns; } void finalize(const NameSet & required_output_) override; diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index df53333b635..304cfa2f3f4 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -578,7 +578,12 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( if (col_decl.type) column.type = name_type_it->type; else + { column.type = defaults_sample_block.getByName(column.name).type; + /// Set nullability for the case of a column declaration without a type but with a default expression + if ((col_decl.null_modifier && *col_decl.null_modifier) || make_columns_nullable) + column.type = makeNullable(column.type); + } column.default_desc.kind = columnDefaultKindFromString(col_decl.default_specifier); column.default_desc.expression = default_expr; diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 1b34759e9dd..d143295181e 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -1171,6 +1171,12 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional<Pipe> prepared_pipe)
{}", QueryProcessingStage::toString(from_stage), QueryProcessingStage::toString(options.to_stage)); } + if (query_info.projection && query_info.projection->input_order_info && query_info.input_order_info) + throw Exception("InputOrderInfo is set for projection and for query", ErrorCodes::LOGICAL_ERROR); + InputOrderInfoPtr input_order_info_for_order; + if (!expressions.need_aggregate) + input_order_info_for_order = query_info.projection ? query_info.projection->input_order_info : query_info.input_order_info; + if (options.to_stage > QueryProcessingStage::FetchColumns) { auto preliminary_sort = [&]() @@ -1186,10 +1192,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional

input_order_info : nullptr)); + executeOrder(query_plan, input_order_info_for_order); if (expressions.has_order_by && query.limitLength()) executeDistinct(query_plan, false, expressions.selected_columns, true); @@ -1314,16 +1317,9 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional<Pipe> prepared_pipe)
input_order_info.reset(); - } - // Now we must execute: // 1) expressions before window functions, // 2) window functions, @@ -1458,10 +1454,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional<Pipe> prepared_pipe)
input_order_info : nullptr)); + executeOrder(query_plan, input_order_info_for_order); } /** Optimization - if there are several sources and there is LIMIT, then first apply the preliminary LIMIT, @@ -2750,12 +2743,6 @@ void InterpreterSelectQuery::executeExtremes(QueryPlan & query_plan) void InterpreterSelectQuery::executeSubqueriesInSetsAndJoins(QueryPlan & query_plan, SubqueriesForSets & subqueries_for_sets) { - // const auto & input_order_info = query_info.input_order_info - // ? query_info.input_order_info - // : (query_info.projection ? query_info.projection->input_order_info : nullptr); - // if (input_order_info) - // executeMergeSorted(query_plan, input_order_info->order_key_prefix_descr, 0, "before creating sets for subqueries and joins"); - const Settings & settings = context->getSettingsRef(); SizeLimits limits(settings.max_rows_to_transfer, settings.max_bytes_to_transfer, settings.transfer_overflow_mode); diff --git a/src/Interpreters/InterpreterTransactionControlQuery.cpp b/src/Interpreters/InterpreterTransactionControlQuery.cpp index 61b2a4e865f..1e4868788ba 100644 --- a/src/Interpreters/InterpreterTransactionControlQuery.cpp +++ b/src/Interpreters/InterpreterTransactionControlQuery.cpp @@ -10,6 +10,7 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int INVALID_TRANSACTION; + extern const int UNKNOWN_STATUS_OF_TRANSACTION; } BlockIO InterpreterTransactionControlQuery::execute() @@ -55,7 +56,42 @@ BlockIO InterpreterTransactionControlQuery::executeCommit(ContextMutablePtr sess if (txn->getState() != MergeTreeTransaction::RUNNING) throw Exception(ErrorCodes::INVALID_TRANSACTION, "Transaction is not in RUNNING state"); - TransactionLog::instance().commitTransaction(txn); + TransactionsWaitCSNMode mode = query_context->getSettingsRef().wait_changes_become_visible_after_commit_mode; + CSN csn; + try + { + csn = TransactionLog::instance().commitTransaction(txn, /* throw_on_unknown_status */ mode != TransactionsWaitCSNMode::WAIT_UNKNOWN); + } + catch (const Exception & e) + { + if (e.code() == ErrorCodes::UNKNOWN_STATUS_OF_TRANSACTION) + { + /// Detach transaction from current context if connection was lost and its status is unknown + session_context->setCurrentTransaction(NO_TRANSACTION_PTR); + } + throw; + } + + if (csn == Tx::CommittingCSN) + { + chassert(mode == TransactionsWaitCSNMode::WAIT_UNKNOWN); + + /// Try to wait for connection to be restored and its status to be loaded. + /// It's useful for testing. It allows enabling fault injection (after commit) without breaking tests.
+ txn->waitStateChange(Tx::CommittingCSN); + + if (txn->getState() == MergeTreeTransaction::ROLLED_BACK) + throw Exception(ErrorCodes::INVALID_TRANSACTION, "Transaction {} was rolled back", txn->tid); + if (txn->getState() != MergeTreeTransaction::COMMITTED) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Transaction {} has invalid state {}", txn->tid, txn->getState()); + + csn = txn->getCSN(); + } + + /// Wait for committed changes to become actually visible, so the next transaction in this session will see the changes + if (mode != TransactionsWaitCSNMode::ASYNC) + TransactionLog::instance().waitForCSNLoaded(csn); + session_context->setCurrentTransaction(NO_TRANSACTION_PTR); return {}; } @@ -67,6 +103,8 @@ BlockIO InterpreterTransactionControlQuery::executeRollback(ContextMutablePtr se throw Exception(ErrorCodes::INVALID_TRANSACTION, "There is no current transaction"); if (txn->getState() == MergeTreeTransaction::COMMITTED) throw Exception(ErrorCodes::LOGICAL_ERROR, "Transaction is in COMMITTED state"); + if (txn->getState() == MergeTreeTransaction::COMMITTING) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Transaction is in COMMITTING state"); if (txn->getState() == MergeTreeTransaction::RUNNING) TransactionLog::instance().rollbackTransaction(txn); diff --git a/src/Interpreters/InterpreterTransactionControlQuery.h b/src/Interpreters/InterpreterTransactionControlQuery.h index 05d3068e095..bf2dc7891a7 100644 --- a/src/Interpreters/InterpreterTransactionControlQuery.h +++ b/src/Interpreters/InterpreterTransactionControlQuery.h @@ -22,7 +22,7 @@ public: private: BlockIO executeBegin(ContextMutablePtr session_context); - static BlockIO executeCommit(ContextMutablePtr session_context); + BlockIO executeCommit(ContextMutablePtr session_context); static BlockIO executeRollback(ContextMutablePtr session_context); static BlockIO executeSetSnapshot(ContextMutablePtr session_context, UInt64 snapshot); diff --git a/src/Interpreters/JoinSwitcher.cpp b/src/Interpreters/JoinSwitcher.cpp index 480d105ebb6..34c8bb4cfd5 100644 --- a/src/Interpreters/JoinSwitcher.cpp +++ b/src/Interpreters/JoinSwitcher.cpp @@ -66,7 +66,7 @@ void JoinSwitcher::switchJoin() for (const auto & sample_column : right_sample_block) { positions.emplace_back(tmp_block.getPositionByName(sample_column.name)); - is_nullable.emplace_back(sample_column.type->isNullable()); + is_nullable.emplace_back(JoinCommon::isNullable(sample_column.type)); } } diff --git a/src/Interpreters/MergeTreeTransaction.cpp b/src/Interpreters/MergeTreeTransaction.cpp index c0d3cdfeb62..cab40f3c6db 100644 --- a/src/Interpreters/MergeTreeTransaction.cpp +++ b/src/Interpreters/MergeTreeTransaction.cpp @@ -38,13 +38,26 @@ void MergeTreeTransaction::setSnapshot(CSN new_snapshot) MergeTreeTransaction::State MergeTreeTransaction::getState() const { CSN c = csn.load(); - if (c == Tx::UnknownCSN || c == Tx::CommittingCSN) + if (c == Tx::UnknownCSN) return RUNNING; + if (c == Tx::CommittingCSN) + return COMMITTING; if (c == Tx::RolledBackCSN) return ROLLED_BACK; return COMMITTED; } +bool MergeTreeTransaction::waitStateChange(CSN current_state_csn) const +{ + CSN current_value = current_state_csn; + while (current_value == current_state_csn && !TransactionLog::instance().isShuttingDown()) + { + csn.wait(current_value); + current_value = csn.load(); + } + return current_value != current_state_csn; +} + void MergeTreeTransaction::checkIsNotCancelled() const { CSN c = csn.load(); @@ -158,7 +171,7 @@ void MergeTreeTransaction::addMutation(const StoragePtr & table, const String 
& bool MergeTreeTransaction::isReadOnly() const { std::lock_guard lock{mutex}; - assert((creating_parts.empty() && removing_parts.empty() && mutations.empty()) == storages.empty()); + chassert((creating_parts.empty() && removing_parts.empty() && mutations.empty()) == storages.empty()); return storages.empty(); } @@ -204,7 +217,7 @@ void MergeTreeTransaction::afterCommit(CSN assigned_csn) noexcept /// and we will be able to remove old entries from transaction log in ZK. /// It's not a problem if server crash before CSN is written, because we already have TID in data part and entry in the log. [[maybe_unused]] CSN prev_value = csn.exchange(assigned_csn); - assert(prev_value == Tx::CommittingCSN); + chassert(prev_value == Tx::CommittingCSN); for (const auto & part : creating_parts) { part->version.creation_csn.store(csn); @@ -321,7 +334,7 @@ String MergeTreeTransaction::dumpDescription() const { String info = fmt::format("{} (created by {}, {})", part->name, part->version.getCreationTID(), part->version.creation_csn); std::get<1>(storage_to_changes[&(part->storage)]).push_back(std::move(info)); - assert(!part->version.creation_csn || part->version.creation_csn <= snapshot); + chassert(!part->version.creation_csn || part->version.creation_csn <= snapshot); } for (const auto & mutation : mutations) diff --git a/src/Interpreters/MergeTreeTransaction.h b/src/Interpreters/MergeTreeTransaction.h index 7ebea450dd0..309b8e3eeff 100644 --- a/src/Interpreters/MergeTreeTransaction.h +++ b/src/Interpreters/MergeTreeTransaction.h @@ -26,6 +26,7 @@ public: enum State { RUNNING, + COMMITTING, COMMITTED, ROLLED_BACK, }; @@ -55,6 +56,11 @@ public: Float64 elapsedSeconds() const { return elapsed.elapsedSeconds(); } + /// Waits for transaction state to become not equal to the state corresponding to current_state_csn + bool waitStateChange(CSN current_state_csn) const; + + CSN getCSN() const { return csn; } + private: scope_guard beforeCommit(); void afterCommit(CSN assigned_csn) noexcept; diff --git a/src/Interpreters/MergeTreeTransactionHolder.cpp b/src/Interpreters/MergeTreeTransactionHolder.cpp index bf63a471282..2944fb78b76 100644 --- a/src/Interpreters/MergeTreeTransactionHolder.cpp +++ b/src/Interpreters/MergeTreeTransactionHolder.cpp @@ -53,7 +53,7 @@ void MergeTreeTransactionHolder::onDestroy() noexcept { try { - TransactionLog::instance().commitTransaction(txn); + TransactionLog::instance().commitTransaction(txn, /* throw_on_unknown_status */ false); return; } catch (...) 
diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index 69e60e3eef7..10a27b9efc5 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -328,6 +329,21 @@ NamesAndTypesList TableJoin::correctedColumnsAddedByJoin() const void TableJoin::addJoinedColumnsAndCorrectTypes(NamesAndTypesList & left_columns, bool correct_nullability) { + addJoinedColumnsAndCorrectTypesImpl(left_columns, correct_nullability); +} + +void TableJoin::addJoinedColumnsAndCorrectTypes(ColumnsWithTypeAndName & left_columns, bool correct_nullability) +{ + addJoinedColumnsAndCorrectTypesImpl(left_columns, correct_nullability); +} + +template +void TableJoin::addJoinedColumnsAndCorrectTypesImpl(TColumns & left_columns, bool correct_nullability) +{ + static_assert(std::is_same_v || + std::is_same_v); + + constexpr bool has_column = std::is_same_v; for (auto & col : left_columns) { if (hasUsing()) @@ -342,15 +358,26 @@ void TableJoin::addJoinedColumnsAndCorrectTypes(NamesAndTypesList & left_columns inferJoinKeyCommonType(left_columns, columns_from_joined_table, !isSpecialStorage()); if (auto it = left_type_map.find(col.name); it != left_type_map.end()) + { col.type = it->second; + if constexpr (has_column) + col.column = nullptr; + } } if (correct_nullability && leftBecomeNullable(col.type)) + { col.type = JoinCommon::convertTypeToNullable(col.type); + if constexpr (has_column) + col.column = nullptr; + } } for (const auto & col : correctedColumnsAddedByJoin()) - left_columns.emplace_back(col.name, col.type); + if constexpr (has_column) + left_columns.emplace_back(nullptr, col.type, col.name); + else + left_columns.emplace_back(col.name, col.type); } bool TableJoin::sameStrictnessAndKind(ASTTableJoin::Strictness strictness_, ASTTableJoin::Kind kind_) const diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index c7bd80ff2b7..37e9417bde7 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -254,7 +254,11 @@ public: bool rightBecomeNullable(const DataTypePtr & column_type) const; void addJoinedColumn(const NameAndTypePair & joined_column); + template + void addJoinedColumnsAndCorrectTypesImpl(TColumns & left_columns, bool correct_nullability); + void addJoinedColumnsAndCorrectTypes(NamesAndTypesList & left_columns, bool correct_nullability); + void addJoinedColumnsAndCorrectTypes(ColumnsWithTypeAndName & left_columns, bool correct_nullability); /// Calculate converting actions, rename key columns in required /// For `USING` join we will convert key columns inplace and affect into types in the result table diff --git a/src/Interpreters/TransactionLog.cpp b/src/Interpreters/TransactionLog.cpp index e65630d907b..4f0e79297b8 100644 --- a/src/Interpreters/TransactionLog.cpp +++ b/src/Interpreters/TransactionLog.cpp @@ -21,6 +21,7 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int UNKNOWN_STATUS_OF_TRANSACTION; } static void tryWriteEventToSystemLog(Poco::Logger * log, ContextPtr context, @@ -52,6 +53,8 @@ TransactionLog::TransactionLog() zookeeper_path = global_context->getConfigRef().getString("transaction_log.zookeeper_path", "/clickhouse/txn"); zookeeper_path_log = zookeeper_path + "/log"; + fault_probability_before_commit = global_context->getConfigRef().getDouble("transaction_log.fault_probability_before_commit", 0); + fault_probability_after_commit = 
global_context->getConfigRef().getDouble("transaction_log.fault_probability_after_commit", 0); loadLogFromZooKeeper(); @@ -145,24 +148,29 @@ void TransactionLog::loadEntries(Strings::const_iterator beg, Strings::const_ite NOEXCEPT_SCOPE; LockMemoryExceptionInThread lock_memory_tracker(VariableContext::Global); - std::lock_guard lock{mutex}; - for (const auto & entry : loaded) { - if (entry.first == Tx::EmptyTID.getHash()) - continue; + std::lock_guard lock{mutex}; + for (const auto & entry : loaded) + { + if (entry.first == Tx::EmptyTID.getHash()) + continue; - tid_to_csn.emplace(entry.first, entry.second); + tid_to_csn.emplace(entry.first, entry.second); + } + last_loaded_entry = last_entry; + } + { + std::lock_guard lock{running_list_mutex}; + latest_snapshot = loaded.back().second.csn; + local_tid_counter = Tx::MaxReservedLocalTID; } - last_loaded_entry = last_entry; - latest_snapshot = loaded.back().second.csn; - local_tid_counter = Tx::MaxReservedLocalTID; } void TransactionLog::loadLogFromZooKeeper() { - assert(!zookeeper); - assert(tid_to_csn.empty()); - assert(last_loaded_entry.empty()); + chassert(!zookeeper); + chassert(tid_to_csn.empty()); + chassert(last_loaded_entry.empty()); zookeeper = global_context->getZooKeeper(); /// We do not write local_tid_counter to disk or zk and maintain it only in memory. @@ -172,7 +180,7 @@ void TransactionLog::loadLogFromZooKeeper() if (code != Coordination::Error::ZOK) { /// Log probably does not exist, create it - assert(code == Coordination::Error::ZNONODE); + chassert(code == Coordination::Error::ZNONODE); zookeeper->createAncestors(zookeeper_path_log); Coordination::Requests ops; ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/tail_ptr", serializeCSN(Tx::MaxReservedCSN), zkutil::CreateMode::Persistent)); @@ -192,11 +200,11 @@ void TransactionLog::loadLogFromZooKeeper() /// 2. simplify log rotation /// 3. support 64-bit CSNs on top of Apache ZooKeeper (it uses Int32 for sequential numbers) Strings entries_list = zookeeper->getChildren(zookeeper_path_log, nullptr, log_updated_event); - assert(!entries_list.empty()); + chassert(!entries_list.empty()); std::sort(entries_list.begin(), entries_list.end()); loadEntries(entries_list.begin(), entries_list.end()); - assert(!last_loaded_entry.empty()); - assert(latest_snapshot == deserializeCSN(last_loaded_entry)); + chassert(!last_loaded_entry.empty()); + chassert(latest_snapshot == deserializeCSN(last_loaded_entry)); local_tid_counter = Tx::MaxReservedLocalTID; tail_ptr = deserializeCSN(zookeeper->get(zookeeper_path + "/tail_ptr")); @@ -208,19 +216,31 @@ void TransactionLog::runUpdatingThread() { try { - log_updated_event->wait(); + /// Do not wait if we have some transactions to finalize + if (unknown_state_list_loaded.empty()) + log_updated_event->wait(); + if (stop_flag.load()) return; - if (getZooKeeper()->expired()) + bool connection_loss = getZooKeeper()->expired(); + if (connection_loss) { auto new_zookeeper = global_context->getZooKeeper(); - std::lock_guard lock{mutex}; - zookeeper = new_zookeeper; + { + std::lock_guard lock{mutex}; + zookeeper = new_zookeeper; + } + + /// It's possible that we connected to a different [Zoo]Keeper instance, + /// so we may read a bit stale state. Run a write request before loading log entries + /// to make that instance up-to-date.
+ zookeeper->set(zookeeper_path_log, ""); } loadNewEntries(); removeOldEntries(); + tryFinalizeUnknownStateTransactions(); } catch (const Coordination::Exception &) { @@ -241,12 +261,12 @@ void TransactionLog::runUpdatingThread() void TransactionLog::loadNewEntries() { Strings entries_list = zookeeper->getChildren(zookeeper_path_log, nullptr, log_updated_event); - assert(!entries_list.empty()); + chassert(!entries_list.empty()); std::sort(entries_list.begin(), entries_list.end()); auto it = std::upper_bound(entries_list.begin(), entries_list.end(), last_loaded_entry); loadEntries(it, entries_list.end()); - assert(last_loaded_entry == entries_list.back()); - assert(latest_snapshot == deserializeCSN(last_loaded_entry)); + chassert(last_loaded_entry == entries_list.back()); + chassert(latest_snapshot == deserializeCSN(last_loaded_entry)); latest_snapshot.notify_all(); } @@ -309,6 +329,46 @@ void TransactionLog::removeOldEntries() tid_to_csn.erase(tid_hash); } +void TransactionLog::tryFinalizeUnknownStateTransactions() +{ + /// We just recovered connection to [Zoo]Keeper. + /// Check if transactions in unknown state were actually committed or not and finalize or roll them back. + UnknownStateList list; + { + /// We must be sure that the corresponding CSN entry is loaded from ZK. + /// Otherwise we may accidentally roll back a committed transaction in case of a race condition like this: + /// - runUpdatingThread: loaded some entries, ready to call tryFinalizeUnknownStateTransactions() + /// - commitTransaction: creates CSN entry in the log (txn is committed) + /// - [session expires] + /// - commitTransaction: catches Coordination::Exception (maybe due to fault injection), appends txn to unknown_state_list + /// - runUpdatingThread: calls tryFinalizeUnknownStateTransactions(), fails to find CSN for this txn, rolls it back + /// So all CSN entries that might exist at the moment of appending txn to unknown_state_list + /// must be loaded from ZK before we start finalizing that txn. + /// That's why we use two lists here: + /// 1. At first we put txn into unknown_state_list + /// 2. We move it to unknown_state_list_loaded when runUpdatingThread has done at least one iteration + /// 3. Then we can safely finalize txns from unknown_state_list_loaded, because all required entries are loaded + std::lock_guard lock{running_list_mutex}; + std::swap(list, unknown_state_list); + std::swap(list, unknown_state_list_loaded); + } + + for (auto & [txn, state_guard] : list) + { + /// CSNs must be already loaded, only need to check if the corresponding mapping exists.
+ if (auto csn = getCSN(txn->tid)) + { + finalizeCommittedTransaction(txn, csn, state_guard); + } + else + { + assertTIDIsNotOutdated(txn->tid); + state_guard = {}; + rollbackTransaction(txn->shared_from_this()); + } + } +} + CSN TransactionLog::getLatestSnapshot() const { return latest_snapshot.load(); @@ -334,58 +394,117 @@ MergeTreeTransactionPtr TransactionLog::beginTransaction() return txn; } -CSN TransactionLog::commitTransaction(const MergeTreeTransactionPtr & txn) +CSN TransactionLog::commitTransaction(const MergeTreeTransactionPtr & txn, bool throw_on_unknown_status) { /// Some precommit checks, may throw - auto committing_lock = txn->beforeCommit(); + auto state_guard = txn->beforeCommit(); - CSN new_csn; + CSN allocated_csn = Tx::UnknownCSN; if (txn->isReadOnly()) { /// Don't need to allocate CSN in ZK for readonly transactions, it's safe to use snapshot/start_csn as "commit" timestamp LOG_TEST(log, "Closing readonly transaction {}", txn->tid); - new_csn = txn->snapshot; - tryWriteEventToSystemLog(log, global_context, TransactionsInfoLogElement::COMMIT, txn->tid, new_csn); } else { LOG_TEST(log, "Committing transaction {}", txn->dumpDescription()); - /// TODO handle connection loss /// TODO support batching auto current_zookeeper = getZooKeeper(); - String path_created = current_zookeeper->create(zookeeper_path_log + "/csn-", serializeTID(txn->tid), zkutil::CreateMode::PersistentSequential); /// Commit point - NOEXCEPT_SCOPE; + String csn_path_created; + try + { + if (unlikely(fault_probability_before_commit)) + { + std::bernoulli_distribution fault(fault_probability_before_commit); + if (fault(thread_local_rng)) + throw Coordination::Exception("Fault injected (before commit)", Coordination::Error::ZCONNECTIONLOSS); + } + /// Commit point + csn_path_created = current_zookeeper->create(zookeeper_path_log + "/csn-", serializeTID(txn->tid), zkutil::CreateMode::PersistentSequential); + + if (unlikely(fault_probability_after_commit)) + { + std::bernoulli_distribution fault(fault_probability_after_commit); + if (fault(thread_local_rng)) + throw Coordination::Exception("Fault injected (after commit)", Coordination::Error::ZCONNECTIONLOSS); + } + } + catch (const Coordination::Exception & e) + { + if (!Coordination::isHardwareError(e.code)) + throw; + + /// We don't know if the transaction has actually been committed or not. + /// The only thing we can do is to postpone its finalization. + { + std::lock_guard lock{running_list_mutex}; + unknown_state_list.emplace_back(txn.get(), std::move(state_guard)); + } + log_updated_event->set(); + if (throw_on_unknown_status) + throw Exception(ErrorCodes::UNKNOWN_STATUS_OF_TRANSACTION, + "Connection lost on attempt to commit transaction {}, will finalize it later: {}", + txn->tid, e.message()); + + LOG_INFO(log, "Connection lost on attempt to commit transaction {}, will finalize it later: {}", txn->tid, e.message()); + return Tx::CommittingCSN; + } + + /// Do not allow exceptions between the commit point and the end of transaction finalization + /// (otherwise it may get stuck in COMMITTING state holding the snapshot).
+ NOEXCEPT_SCOPE; /// FIXME Transactions: Sequential node numbers in ZooKeeper are Int32, but 31 bit is not enough for production use /// (overflow is possible after several weeks/months of active usage) - new_csn = deserializeCSN(path_created.substr(zookeeper_path_log.size() + 1)); + allocated_csn = deserializeCSN(csn_path_created.substr(zookeeper_path_log.size() + 1)); + } - LOG_INFO(log, "Transaction {} committed with CSN={}", txn->tid, new_csn); - tryWriteEventToSystemLog(log, global_context, TransactionsInfoLogElement::COMMIT, txn->tid, new_csn); + return finalizeCommittedTransaction(txn.get(), allocated_csn, state_guard); +} - /// Wait for committed changes to become actually visible, so the next transaction in this session will see the changes - /// TODO it's optional, add a setting for this - auto current_latest_snapshot = latest_snapshot.load(); - while (current_latest_snapshot < new_csn && !stop_flag) - { - latest_snapshot.wait(current_latest_snapshot); - current_latest_snapshot = latest_snapshot.load(); - } +CSN TransactionLog::finalizeCommittedTransaction(MergeTreeTransaction * txn, CSN allocated_csn, scope_guard & state_guard) noexcept +{ + chassert(!allocated_csn == txn->isReadOnly()); + if (allocated_csn) + { + LOG_INFO(log, "Transaction {} committed with CSN={}", txn->tid, allocated_csn); + tryWriteEventToSystemLog(log, global_context, TransactionsInfoLogElement::COMMIT, txn->tid, allocated_csn); + } + else + { + /// Transaction was readonly + allocated_csn = txn->snapshot; + tryWriteEventToSystemLog(log, global_context, TransactionsInfoLogElement::COMMIT, txn->tid, allocated_csn); } /// Write allocated CSN, so we will be able to cleanup log in ZK. This method is noexcept. - txn->afterCommit(new_csn); + txn->afterCommit(allocated_csn); + state_guard = {}; { /// Finally we can remove transaction from the list and release the snapshot std::lock_guard lock{running_list_mutex}; + snapshots_in_use.erase(txn->snapshot_in_use_it); bool removed = running_list.erase(txn->tid.getHash()); if (!removed) - throw Exception(ErrorCodes::LOGICAL_ERROR, "I's a bug: TID {} {} doesn't exist", txn->tid.getHash(), txn->tid); - snapshots_in_use.erase(txn->snapshot_in_use_it); + { + LOG_ERROR(log, "It's a bug: TID {} {} doesn't exist", txn->tid.getHash(), txn->tid); + abort(); + } } - return new_csn; + return allocated_csn; +} + +bool TransactionLog::waitForCSNLoaded(CSN csn) const +{ + auto current_latest_snapshot = latest_snapshot.load(); + while (current_latest_snapshot < csn && !stop_flag) + { + latest_snapshot.wait(current_latest_snapshot); + current_latest_snapshot = latest_snapshot.load(); + } + return csn <= current_latest_snapshot; } void TransactionLog::rollbackTransaction(const MergeTreeTransactionPtr & txn) noexcept @@ -395,8 +514,8 @@ void TransactionLog::rollbackTransaction(const MergeTreeTransactionPtr & txn) no if (!txn->rollback()) { - /// Transaction was cancelled concurrently, it's already rolled back.
- assert(txn->csn == Tx::RolledBackCSN); + /// Transaction was cancelled or committed concurrently + chassert(txn->csn != Tx::UnknownCSN); return; } @@ -438,8 +557,8 @@ CSN TransactionLog::getCSN(const TIDHash & tid) CSN TransactionLog::getCSNImpl(const TIDHash & tid_hash) const { - assert(tid_hash); - assert(tid_hash != Tx::EmptyTID.getHash()); + chassert(tid_hash); + chassert(tid_hash != Tx::EmptyTID.getHash()); std::lock_guard lock{mutex}; auto it = tid_to_csn.find(tid_hash); @@ -467,6 +586,8 @@ CSN TransactionLog::getOldestSnapshot() const std::lock_guard lock{running_list_mutex}; if (snapshots_in_use.empty()) return getLatestSnapshot(); + chassert(running_list.size() == snapshots_in_use.size()); + chassert(snapshots_in_use.size() < 2 || snapshots_in_use.front() <= *++snapshots_in_use.begin()); return snapshots_in_use.front(); } diff --git a/src/Interpreters/TransactionLog.h b/src/Interpreters/TransactionLog.h index 86584a74c68..a0268ce9b88 100644 --- a/src/Interpreters/TransactionLog.h +++ b/src/Interpreters/TransactionLog.h @@ -97,7 +97,8 @@ public: /// Tries to commit transaction. Returns Commit Sequence Number. /// Throw if transaction was concurrently killed or if some precommit check failed. /// May throw if ZK connection is lost. Transaction status is unknown in this case. - CSN commitTransaction(const MergeTreeTransactionPtr & txn); + /// Returns CommittingCSN if throw_on_unknown_status is false and connection was lost. + CSN commitTransaction(const MergeTreeTransactionPtr & txn, bool throw_on_unknown_status); /// Releases locks that were acquired by transaction, releases snapshot, removes transaction from the list of active transactions. /// Normally it should not throw, but if it does for some reason (global memory limit exceeded, disk failure, etc) @@ -119,6 +120,12 @@ public: /// Returns copy of list of running transactions. TransactionsList getTransactionsList() const; + /// Waits for provided CSN (and all previous ones) to be loaded from the log. + /// Returns false if waiting was interrupted (e.g. by shutdown) + bool waitForCSNLoaded(CSN csn) const; + + bool isShuttingDown() const { return stop_flag.load(); } + private: void loadLogFromZooKeeper(); void runUpdatingThread(); @@ -127,6 +134,10 @@ private: void loadNewEntries(); void removeOldEntries(); + CSN finalizeCommittedTransaction(MergeTreeTransaction * txn, CSN allocated_csn, scope_guard & state_guard) noexcept; + + void tryFinalizeUnknownStateTransactions(); + static UInt64 deserializeCSN(const String & csn_node_name); static String serializeCSN(CSN csn); static TransactionID deserializeTID(const String & csn_node_content); @@ -159,6 +170,10 @@ private: mutable std::mutex running_list_mutex; /// Transactions that are currently processed TransactionsList running_list; + /// If we lost the connection on attempt to create the csn- node then we don't know the transaction's state. + using UnknownStateList = std::vector<std::pair<MergeTreeTransaction *, scope_guard>>; + UnknownStateList unknown_state_list; + UnknownStateList unknown_state_list_loaded; /// Ordered list of snapshots that are currently used by some transactions. Needed for background cleanup.
std::list snapshots_in_use; @@ -175,6 +190,9 @@ private: std::atomic_bool stop_flag = false; ThreadFromGlobalPool updating_thread; + + Float64 fault_probability_before_commit = 0; + Float64 fault_probability_after_commit = 0; }; template diff --git a/src/Interpreters/TransactionVersionMetadata.cpp b/src/Interpreters/TransactionVersionMetadata.cpp index b965ade8d10..36a4fb9cc5b 100644 --- a/src/Interpreters/TransactionVersionMetadata.cpp +++ b/src/Interpreters/TransactionVersionMetadata.cpp @@ -88,8 +88,8 @@ void VersionMetadata::lockRemovalTID(const TransactionID & tid, const Transactio bool VersionMetadata::tryLockRemovalTID(const TransactionID & tid, const TransactionInfoContext & context, TIDHash * locked_by_id) { - assert(!tid.isEmpty()); - assert(!creation_tid.isEmpty()); + chassert(!tid.isEmpty()); + chassert(!creation_tid.isEmpty()); TIDHash removal_lock_value = tid.getHash(); TIDHash expected_removal_lock_value = 0; bool locked = removal_tid_lock.compare_exchange_strong(expected_removal_lock_value, removal_lock_value); @@ -115,7 +115,7 @@ bool VersionMetadata::tryLockRemovalTID(const TransactionID & tid, const Transac void VersionMetadata::unlockRemovalTID(const TransactionID & tid, const TransactionInfoContext & context) { LOG_TEST(log, "Unlocking removal_tid by {}, table: {}, part: {}", tid, context.table.getNameForLogs(), context.part_name); - assert(!tid.isEmpty()); + chassert(!tid.isEmpty()); TIDHash removal_lock_value = tid.getHash(); TIDHash locked_by = removal_tid_lock.load(); @@ -145,7 +145,7 @@ bool VersionMetadata::isRemovalTIDLocked() const void VersionMetadata::setCreationTID(const TransactionID & tid, TransactionInfoContext * context) { /// NOTE ReplicatedMergeTreeSink may add one part multiple times - assert(creation_tid.isEmpty() || creation_tid == tid); + chassert(creation_tid.isEmpty() || creation_tid == tid); creation_tid = tid; if (context) tryWriteEventToSystemLog(log, TransactionsInfoLogElement::ADD_PART, tid, *context); @@ -158,7 +158,7 @@ bool VersionMetadata::isVisible(const MergeTreeTransaction & txn) bool VersionMetadata::isVisible(CSN snapshot_version, TransactionID current_tid) { - assert(!creation_tid.isEmpty()); + chassert(!creation_tid.isEmpty()); CSN creation = creation_csn.load(std::memory_order_relaxed); TIDHash removal_lock = removal_tid_lock.load(std::memory_order_relaxed); CSN removal = removal_csn.load(std::memory_order_relaxed); @@ -166,10 +166,10 @@ bool VersionMetadata::isVisible(CSN snapshot_version, TransactionID current_tid) [[maybe_unused]] bool had_creation_csn = creation; [[maybe_unused]] bool had_removal_tid = removal_lock; [[maybe_unused]] bool had_removal_csn = removal; - assert(!had_removal_csn || had_removal_tid); - assert(!had_removal_csn || had_creation_csn); - assert(creation == Tx::UnknownCSN || creation == Tx::PrehistoricCSN || Tx::MaxReservedCSN < creation); - assert(removal == Tx::UnknownCSN || removal == Tx::PrehistoricCSN || Tx::MaxReservedCSN < removal); + chassert(!had_removal_csn || had_removal_tid); + chassert(!had_removal_csn || had_creation_csn); + chassert(creation == Tx::UnknownCSN || creation == Tx::PrehistoricCSN || Tx::MaxReservedCSN < creation); + chassert(removal == Tx::UnknownCSN || removal == Tx::PrehistoricCSN || Tx::MaxReservedCSN < removal); /// Special snapshot for introspection purposes if (unlikely(snapshot_version == Tx::EverythingVisibleCSN)) @@ -204,8 +204,8 @@ bool VersionMetadata::isVisible(CSN snapshot_version, TransactionID current_tid) /// Data part has creation_tid/removal_tid, but does not 
have creation_csn/removal_csn. /// It means that some transaction is creating/removing the part right now or has done it recently /// and we don't know if it was already committed or not. - assert(!had_creation_csn || (had_removal_tid && !had_removal_csn)); - assert(current_tid.isEmpty() || (creation_tid != current_tid && removal_lock != current_tid.getHash())); + chassert(!had_creation_csn || (had_removal_tid && !had_removal_csn)); + chassert(current_tid.isEmpty() || (creation_tid != current_tid && removal_lock != current_tid.getHash())); /// Before doing CSN lookup, let's check some extra conditions. /// If snapshot_version <= some_tid.start_csn, then changes of the transaction with some_tid @@ -347,8 +347,8 @@ void VersionMetadata::write(WriteBuffer & buf) const if (removal_tid_lock) { - assert(!removal_tid.isEmpty()); - assert(removal_tid.getHash() == removal_tid_lock); + chassert(!removal_tid.isEmpty()); + chassert(removal_tid.getHash() == removal_tid_lock); writeRemovalTID(buf); writeCSN(buf, REMOVAL, /* internal */ true); } @@ -384,21 +384,23 @@ void VersionMetadata::read(ReadBuffer & buf) if (name == CREATION_CSN_STR) { - assert(!creation_csn); + chassert(!creation_csn); creation_csn = read_csn(); } else if (name == REMOVAL_TID_STR) { /// NOTE Metadata file may actually contain multiple creation TIDs, we need the last one. removal_tid = TransactionID::read(buf); - if (!removal_tid.isEmpty()) + if (removal_tid.isEmpty()) + removal_tid_lock = 0; + else removal_tid_lock = removal_tid.getHash(); } else if (name == REMOVAL_CSN_STR) { if (removal_tid.isEmpty()) throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Found removal_csn in metadata file, but removal_tid is {}", removal_tid); - assert(!removal_csn); + chassert(!removal_csn); removal_csn = read_csn(); } else diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index 3c03bea3dd1..186c8c30cfa 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -444,9 +444,10 @@ static std::tuple executeQueryImpl( if (auto txn = context->getCurrentTransaction()) { - assert(txn->getState() != MergeTreeTransaction::COMMITTED); + chassert(txn->getState() != MergeTreeTransaction::COMMITTING); + chassert(txn->getState() != MergeTreeTransaction::COMMITTED); if (txn->getState() == MergeTreeTransaction::ROLLED_BACK && !ast->as() && !ast->as()) - throw Exception(ErrorCodes::INVALID_TRANSACTION, "Cannot execute query: transaction is rolled back"); + throw Exception(ErrorCodes::INVALID_TRANSACTION, "Cannot execute query because current transaction failed. 
Expecting ROLLBACK statement."); } /// Interpret SETTINGS clauses as early as possible (before invoking the corresponding interpreter), diff --git a/src/Parsers/ParserCreateQuery.h b/src/Parsers/ParserCreateQuery.h index 6a34e1d2700..daf27c0dc67 100644 --- a/src/Parsers/ParserCreateQuery.h +++ b/src/Parsers/ParserCreateQuery.h @@ -105,9 +105,9 @@ protected: bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; - bool require_type = true; - bool allow_null_modifiers = false; - bool check_keywords_after_name = false; + const bool require_type = true; + const bool allow_null_modifiers = false; + const bool check_keywords_after_name = false; /// just for ALTER TABLE ALTER COLUMN use bool check_type_keyword = false; }; @@ -175,7 +175,22 @@ bool IParserColumnDeclaration::parseImpl(Pos & pos, ASTPtr & node, E ASTPtr ttl_expression; ASTPtr collation_expression; - if (!s_default.checkWithoutMoving(pos, expected) + auto null_check_without_moving = [&]() -> bool + { + if (!allow_null_modifiers) + return false; + + if (s_null.checkWithoutMoving(pos, expected)) + return true; + + Pos before_null = pos; + bool res = s_not.check(pos, expected) && s_null.checkWithoutMoving(pos, expected); + pos = before_null; + return res; + }; + + if (!null_check_without_moving() + && !s_default.checkWithoutMoving(pos, expected) && !s_materialized.checkWithoutMoving(pos, expected) && !s_ephemeral.checkWithoutMoving(pos, expected) && !s_alias.checkWithoutMoving(pos, expected) @@ -195,6 +210,18 @@ bool IParserColumnDeclaration::parseImpl(Pos & pos, ASTPtr & node, E } } + if (allow_null_modifiers) + { + if (s_not.check(pos, expected)) + { + if (!s_null.check(pos, expected)) + return false; + null_modifier.emplace(false); + } + else if (s_null.check(pos, expected)) + null_modifier.emplace(true); + } + Pos pos_before_specifier = pos; if (s_default.ignore(pos, expected) || s_materialized.ignore(pos, expected) || s_alias.ignore(pos, expected)) { @@ -230,7 +257,7 @@ bool IParserColumnDeclaration::parseImpl(Pos & pos, ASTPtr & node, E if (require_type && !type && !default_expression) return false; /// reject column name without type - if (type && allow_null_modifiers) + if ((type || default_expression) && allow_null_modifiers && !null_modifier.has_value()) { if (s_not.ignore(pos, expected)) { @@ -419,7 +446,7 @@ protected: bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; -/// CREATE|ATTACH WINDOW VIEW [IF NOT EXISTS] [db.]name [TO [db.]name] [INNER ENGINE [db.]name] [ENGINE [db.]name] [WATERMARK function] [ALLOWED_LATENESS = interval_function] [POPULATE] AS SELECT ... +/// CREATE|ATTACH WINDOW VIEW [IF NOT EXISTS] [db.]name [TO [db.]name] [INNER ENGINE engine] [ENGINE engine] [WATERMARK strategy] [ALLOWED_LATENESS interval_function] [POPULATE] AS SELECT ... 
class ParserCreateWindowViewQuery : public IParserBase { protected: diff --git a/src/Processors/Chunk.cpp b/src/Processors/Chunk.cpp index 8c7c09abf01..61bd118636d 100644 --- a/src/Processors/Chunk.cpp +++ b/src/Processors/Chunk.cpp @@ -169,6 +169,17 @@ std::string Chunk::dumpStructure() const return out.str(); } +void Chunk::append(const Chunk & chunk) +{ + MutableColumns mutation = mutateColumns(); + for (size_t position = 0; position < mutation.size(); ++position) + { + auto column = chunk.getColumns()[position]; + mutation[position]->insertRangeFrom(*column, 0, column->size()); + } + size_t rows = mutation[0]->size(); + setColumns(std::move(mutation), rows); +} void ChunkMissingValues::setBit(size_t column_idx, size_t row_idx) { diff --git a/src/Processors/Chunk.h b/src/Processors/Chunk.h index da5fe0c23ed..ec514846f24 100644 --- a/src/Processors/Chunk.h +++ b/src/Processors/Chunk.h @@ -101,6 +101,8 @@ public: std::string dumpStructure() const; + void append(const Chunk & chunk); + private: Columns columns; UInt64 num_rows = 0; diff --git a/src/Processors/Formats/IRowInputFormat.cpp b/src/Processors/Formats/IRowInputFormat.cpp index 481f77c1ef8..dbe28147d8f 100644 --- a/src/Processors/Formats/IRowInputFormat.cpp +++ b/src/Processors/Formats/IRowInputFormat.cpp @@ -48,13 +48,8 @@ bool isParseError(int code) } IRowInputFormat::IRowInputFormat(Block header, ReadBuffer & in_, Params params_) - : IInputFormat(std::move(header), in_), params(params_) + : IInputFormat(std::move(header), in_), serializations(getPort().getHeader().getSerializations()), params(params_) { - const auto & port_header = getPort().getHeader(); - size_t num_columns = port_header.columns(); - serializations.resize(num_columns); - for (size_t i = 0; i < num_columns; ++i) - serializations[i] = port_header.getByPosition(i).type->getDefaultSerialization(); } diff --git a/src/Processors/Formats/IRowOutputFormat.cpp b/src/Processors/Formats/IRowOutputFormat.cpp index b48c4a2b3e6..f2f6b49ed3f 100644 --- a/src/Processors/Formats/IRowOutputFormat.cpp +++ b/src/Processors/Formats/IRowOutputFormat.cpp @@ -12,13 +12,11 @@ namespace ErrorCodes IRowOutputFormat::IRowOutputFormat(const Block & header, WriteBuffer & out_, const Params & params_) : IOutputFormat(header, out_) + , num_columns(header.columns()) , types(header.getDataTypes()) + , serializations(header.getSerializations()) , params(params_) { - num_columns = types.size(); - serializations.reserve(num_columns); - for (const auto & type : types) - serializations.push_back(type->getDefaultSerialization()); } void IRowOutputFormat::consume(DB::Chunk chunk) diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index 11b5afc78f1..5a6ebf00660 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -15,7 +15,7 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -static void chooseResultType( +void chooseResultColumnType( DataTypePtr & type, const DataTypePtr & new_type, CommonDataTypeChecker common_type_checker, @@ -49,7 +49,7 @@ static void chooseResultType( } } -static void checkTypeAndAppend(NamesAndTypesList & result, DataTypePtr & type, const String & name, const DataTypePtr & default_type, size_t rows_read) +void checkResultColumnTypeAndAppend(NamesAndTypesList & result, DataTypePtr & type, const String & name, const DataTypePtr & default_type, size_t rows_read) { if (!type) { @@ -116,7 +116,7 @@ NamesAndTypesList IRowSchemaReader::readSchema() if (!new_data_types[i]) continue; - 
chooseResultType(data_types[i], new_data_types[i], common_type_checker, getDefaultType(i), std::to_string(i + 1), rows_read); + chooseResultColumnType(data_types[i], new_data_types[i], common_type_checker, getDefaultType(i), std::to_string(i + 1), rows_read); } } @@ -141,7 +141,7 @@ NamesAndTypesList IRowSchemaReader::readSchema() for (size_t i = 0; i != data_types.size(); ++i) { /// Check that we could determine the type of this column. - checkTypeAndAppend(result, data_types[i], column_names[i], getDefaultType(i), rows_read); + checkResultColumnTypeAndAppend(result, data_types[i], column_names[i], getDefaultType(i), rows_read); } return result; @@ -200,7 +200,7 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema() } auto & type = it->second; - chooseResultType(type, new_type, common_type_checker, default_type, name, rows_read); + chooseResultColumnType(type, new_type, common_type_checker, default_type, name, rows_read); } } @@ -213,7 +213,7 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema() { auto & type = names_to_types[name]; /// Check that we could determine the type of this column. - checkTypeAndAppend(result, type, name, default_type, rows_read); + checkResultColumnTypeAndAppend(result, type, name, default_type, rows_read); } return result; diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h index f748680ed24..00987540d04 100644 --- a/src/Processors/Formats/ISchemaReader.h +++ b/src/Processors/Formats/ISchemaReader.h @@ -120,4 +120,15 @@ public: virtual ~IExternalSchemaReader() = default; }; +void chooseResultColumnType( + DataTypePtr & type, + const DataTypePtr & new_type, + CommonDataTypeChecker common_type_checker, + const DataTypePtr & default_type, + const String & column_name, + size_t row); + +void checkResultColumnTypeAndAppend( + NamesAndTypesList & result, DataTypePtr & type, const String & name, const DataTypePtr & default_type, size_t rows_read); + } diff --git a/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp b/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp index 07331d82bb8..05fc3b8ca2a 100644 --- a/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp @@ -76,9 +76,8 @@ Chunk ArrowBlockInputFormat::generate() /// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields. /// Otherwise fill the missing columns with zero values of its type. 
if (format_settings.defaults_for_omitted_fields) - for (size_t row_idx = 0; row_idx < res.getNumRows(); ++row_idx) - for (const auto & column_idx : missing_columns) - block_missing_values.setBit(column_idx, row_idx); + for (const auto & column_idx : missing_columns) + block_missing_values.setBits(column_idx, res.getNumRows()); return res; } @@ -188,7 +187,7 @@ void registerInputFormatArrow(FormatFactory & factory) { return std::make_shared(buf, sample, false, format_settings); }); - factory.markFormatAsColumnOriented("Arrow"); + factory.markFormatSupportsSubsetOfColumns("Arrow"); factory.registerInputFormat( "ArrowStream", [](ReadBuffer & buf, diff --git a/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp index 60408f13ff0..83eaefa8cf7 100644 --- a/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ArrowBlockOutputFormat.cpp @@ -34,7 +34,7 @@ void ArrowBlockOutputFormat::consume(Chunk chunk) { const Block & header = getPort(PortKind::Main).getHeader(); ch_column_to_arrow_column - = std::make_unique(header, "Arrow", format_settings.arrow.low_cardinality_as_dictionary); + = std::make_unique(header, "Arrow", format_settings.arrow.low_cardinality_as_dictionary, format_settings.arrow.output_string_as_string); } ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunk, columns_num); diff --git a/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp b/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp index 3f6a36e8e8c..e3cc896466b 100644 --- a/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp +++ b/src/Processors/Formats/Impl/CHColumnToArrowColumn.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -168,6 +167,7 @@ namespace DB String format_name, size_t start, size_t end, + bool output_string_as_string, std::unordered_map> & dictionary_values); template @@ -180,6 +180,7 @@ namespace DB String format_name, size_t start, size_t end, + bool output_string_as_string, std::unordered_map> & dictionary_values) { const auto * column_array = assert_cast(column.get()); @@ -196,7 +197,7 @@ namespace DB /// Start new array. components_status = builder.Append(); checkStatus(components_status, nested_column->getName(), format_name); - fillArrowArray(column_name, nested_column, nested_type, null_bytemap, value_builder, format_name, offsets[array_idx - 1], offsets[array_idx], dictionary_values); + fillArrowArray(column_name, nested_column, nested_type, null_bytemap, value_builder, format_name, offsets[array_idx - 1], offsets[array_idx], output_string_as_string, dictionary_values); } } @@ -209,17 +210,20 @@ namespace DB String format_name, size_t start, size_t end, + bool output_string_as_string, std::unordered_map> & dictionary_values) { const auto * column_tuple = assert_cast(column.get()); - const auto & nested_types = assert_cast(column_type.get())->getElements(); + const auto * type_tuple = assert_cast(column_type.get()); + const auto & nested_types = type_tuple->getElements(); + const auto & nested_names = type_tuple->getElementNames(); arrow::StructBuilder & builder = assert_cast(*array_builder); for (size_t i = 0; i != column_tuple->tupleSize(); ++i) { ColumnPtr nested_column = column_tuple->getColumnPtr(i); - fillArrowArray(column_name + "." + std::to_string(i), nested_column, nested_types[i], null_bytemap, builder.field_builder(i), format_name, start, end, dictionary_values); + fillArrowArray(column_name + "." 
+ nested_names[i], nested_column, nested_types[i], null_bytemap, builder.field_builder(i), format_name, start, end, output_string_as_string, dictionary_values); } for (size_t i = start; i != end; ++i) @@ -267,6 +271,7 @@ namespace DB String format_name, size_t start, size_t end, + bool output_string_as_string, std::unordered_map> & dictionary_values) { const auto * column_lc = assert_cast(column.get()); @@ -284,7 +289,7 @@ namespace DB auto dict_column = column_lc->getDictionary().getNestedColumn(); const auto & dict_type = assert_cast(column_type.get())->getDictionaryType(); - fillArrowArray(column_name, dict_column, dict_type, nullptr, values_builder.get(), format_name, 0, dict_column->size(), dictionary_values); + fillArrowArray(column_name, dict_column, dict_type, nullptr, values_builder.get(), format_name, 0, dict_column->size(), output_string_as_string, dictionary_values); status = values_builder->Finish(&dict_values); checkStatus(status, column->getName(), format_name); } @@ -321,6 +326,7 @@ namespace DB String format_name, size_t start, size_t end, + bool output_string_as_string, std::unordered_map> & dictionary_values) { auto value_type = assert_cast(array_builder->type().get())->value_type(); @@ -328,7 +334,7 @@ namespace DB #define DISPATCH(ARROW_TYPE_ID, ARROW_TYPE) \ if (arrow::Type::ARROW_TYPE_ID == value_type->id()) \ { \ - fillArrowArrayWithLowCardinalityColumnDataImpl(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); \ + fillArrowArrayWithLowCardinalityColumnDataImpl(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values); \ return; \ } @@ -338,7 +344,7 @@ namespace DB throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot fill arrow array with {} data.", column_type->getName()); } - template + template static void fillArrowArrayWithStringColumnData( ColumnPtr write_column, const PaddedPODArray * null_bytemap, @@ -348,7 +354,7 @@ namespace DB size_t end) { const auto & internal_column = assert_cast(*write_column); - arrow::BinaryBuilder & builder = assert_cast(*array_builder); + ArrowBuilder & builder = assert_cast(*array_builder); arrow::Status status; for (size_t string_i = start; string_i < end; ++string_i) @@ -442,6 +448,7 @@ namespace DB String format_name, size_t start, size_t end, + bool output_string_as_string, std::unordered_map> & dictionary_values) { const String column_type_name = column_type->getFamilyName(); @@ -453,15 +460,21 @@ namespace DB DataTypePtr nested_type = assert_cast(column_type.get())->getNestedType(); const ColumnPtr & null_column = column_nullable->getNullMapColumnPtr(); const PaddedPODArray & bytemap = assert_cast &>(*null_column).getData(); - fillArrowArray(column_name, nested_column, nested_type, &bytemap, array_builder, format_name, start, end, dictionary_values); + fillArrowArray(column_name, nested_column, nested_type, &bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values); } else if (isString(column_type)) { - fillArrowArrayWithStringColumnData(column, null_bytemap, format_name, array_builder, start, end); + if (output_string_as_string) + fillArrowArrayWithStringColumnData(column, null_bytemap, format_name, array_builder, start, end); + else + fillArrowArrayWithStringColumnData(column, null_bytemap, format_name, array_builder, start, end); } else if (isFixedString(column_type)) { - fillArrowArrayWithStringColumnData(column, null_bytemap, format_name, array_builder, 
start, end); + if (output_string_as_string) + fillArrowArrayWithStringColumnData(column, null_bytemap, format_name, array_builder, start, end); + else + fillArrowArrayWithStringColumnData(column, null_bytemap, format_name, array_builder, start, end); } else if (isDate(column_type)) { @@ -477,21 +490,21 @@ namespace DB } else if (isArray(column_type)) { - fillArrowArrayWithArrayColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); + fillArrowArrayWithArrayColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values); } else if (isTuple(column_type)) { - fillArrowArrayWithTupleColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); + fillArrowArrayWithTupleColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values); } else if (column_type->getTypeId() == TypeIndex::LowCardinality) { - fillArrowArrayWithLowCardinalityColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); + fillArrowArrayWithLowCardinalityColumnData(column_name, column, column_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values); } else if (isMap(column_type)) { ColumnPtr column_array = assert_cast(column.get())->getNestedColumnPtr(); DataTypePtr array_type = assert_cast(column_type.get())->getNestedType(); - fillArrowArrayWithArrayColumnData(column_name, column_array, array_type, null_bytemap, array_builder, format_name, start, end, dictionary_values); + fillArrowArrayWithArrayColumnData(column_name, column_array, array_type, null_bytemap, array_builder, format_name, start, end, output_string_as_string, dictionary_values); } else if (isDecimal(column_type)) { @@ -603,13 +616,13 @@ namespace DB } static std::shared_ptr getArrowType( - DataTypePtr column_type, ColumnPtr column, const std::string & column_name, const std::string & format_name, bool * out_is_column_nullable) + DataTypePtr column_type, ColumnPtr column, const std::string & column_name, const std::string & format_name, bool output_string_as_string, bool * out_is_column_nullable) { if (column_type->isNullable()) { DataTypePtr nested_type = assert_cast(column_type.get())->getNestedType(); ColumnPtr nested_column = assert_cast(column.get())->getNestedColumnPtr(); - auto arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, out_is_column_nullable); + auto arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, output_string_as_string, out_is_column_nullable); *out_is_column_nullable = true; return arrow_type; } @@ -643,20 +656,21 @@ namespace DB { auto nested_type = assert_cast(column_type.get())->getNestedType(); auto nested_column = assert_cast(column.get())->getDataPtr(); - auto nested_arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, out_is_column_nullable); + auto nested_arrow_type = getArrowType(nested_type, nested_column, column_name, format_name, output_string_as_string, out_is_column_nullable); return arrow::list(nested_arrow_type); } if (isTuple(column_type)) { - const auto & nested_types = assert_cast(column_type.get())->getElements(); + const auto & tuple_type = assert_cast(column_type.get()); + const auto & nested_types = tuple_type->getElements(); + const auto & nested_names = 
tuple_type->getElementNames(); const auto * tuple_column = assert_cast(column.get()); std::vector> nested_fields; for (size_t i = 0; i != nested_types.size(); ++i) { - String name = column_name + "." + std::to_string(i); - auto nested_arrow_type = getArrowType(nested_types[i], tuple_column->getColumnPtr(i), name, format_name, out_is_column_nullable); - nested_fields.push_back(std::make_shared(name, nested_arrow_type, *out_is_column_nullable)); + auto nested_arrow_type = getArrowType(nested_types[i], tuple_column->getColumnPtr(i), nested_names[i], format_name, output_string_as_string, out_is_column_nullable); + nested_fields.push_back(std::make_shared(nested_names[i], nested_arrow_type, *out_is_column_nullable)); } return arrow::struct_(nested_fields); } @@ -669,7 +683,7 @@ namespace DB const auto & indexes_column = lc_column->getIndexesPtr(); return arrow::dictionary( getArrowTypeForLowCardinalityIndexes(indexes_column), - getArrowType(nested_type, nested_column, column_name, format_name, out_is_column_nullable)); + getArrowType(nested_type, nested_column, column_name, format_name, output_string_as_string, out_is_column_nullable)); } if (isMap(column_type)) @@ -680,10 +694,19 @@ namespace DB const auto & columns = assert_cast(column.get())->getNestedData().getColumns(); return arrow::map( - getArrowType(key_type, columns[0], column_name, format_name, out_is_column_nullable), - getArrowType(val_type, columns[1], column_name, format_name, out_is_column_nullable)); + getArrowType(key_type, columns[0], column_name, format_name, output_string_as_string, out_is_column_nullable), + getArrowType(val_type, columns[1], column_name, format_name, output_string_as_string, out_is_column_nullable)); } + if (isDateTime64(column_type)) + { + const auto * datetime64_type = assert_cast(column_type.get()); + return arrow::timestamp(getArrowTimeUnit(datetime64_type), datetime64_type->getTimeZone().getTimeZone()); + } + + if (isStringOrFixedString(column_type) && output_string_as_string) + return arrow::utf8(); + const std::string type_name = column_type->getFamilyName(); if (const auto * arrow_type_it = std::find_if( internal_type_to_arrow_type.begin(), @@ -694,19 +717,13 @@ namespace DB return arrow_type_it->second; } - if (isDateTime64(column_type)) - { - const auto * datetime64_type = assert_cast(column_type.get()); - return arrow::timestamp(getArrowTimeUnit(datetime64_type), datetime64_type->getTimeZone().getTimeZone()); - } - throw Exception(ErrorCodes::UNKNOWN_TYPE, "The type '{}' of a column '{}' is not supported for conversion into {} data format.", column_type->getName(), column_name, format_name); } - CHColumnToArrowColumn::CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_) - : format_name(format_name_), low_cardinality_as_dictionary(low_cardinality_as_dictionary_) + CHColumnToArrowColumn::CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_, bool output_string_as_string_) + : format_name(format_name_), low_cardinality_as_dictionary(low_cardinality_as_dictionary_), output_string_as_string(output_string_as_string_) { arrow_fields.reserve(header.columns()); header_columns.reserve(header.columns()); @@ -741,7 +758,7 @@ namespace DB if (!is_arrow_fields_initialized) { bool is_column_nullable = false; - auto arrow_type = getArrowType(header_column.type, column, header_column.name, format_name, &is_column_nullable); + auto arrow_type = getArrowType(header_column.type, column, 
header_column.name, format_name, output_string_as_string, &is_column_nullable); arrow_fields.emplace_back(std::make_shared(header_column.name, arrow_type, is_column_nullable)); } @@ -751,7 +768,7 @@ namespace DB checkStatus(status, column->getName(), format_name); fillArrowArray( - header_column.name, column, header_column.type, nullptr, array_builder.get(), format_name, 0, column->size(), dictionary_values); + header_column.name, column, header_column.type, nullptr, array_builder.get(), format_name, 0, column->size(), output_string_as_string, dictionary_values); std::shared_ptr arrow_array; status = array_builder->Finish(&arrow_array); diff --git a/src/Processors/Formats/Impl/CHColumnToArrowColumn.h b/src/Processors/Formats/Impl/CHColumnToArrowColumn.h index 50de8045d5f..2896fb3642f 100644 --- a/src/Processors/Formats/Impl/CHColumnToArrowColumn.h +++ b/src/Processors/Formats/Impl/CHColumnToArrowColumn.h @@ -14,7 +14,7 @@ namespace DB class CHColumnToArrowColumn { public: - CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_); + CHColumnToArrowColumn(const Block & header, const std::string & format_name_, bool low_cardinality_as_dictionary_, bool output_string_as_string_); void chChunkToArrowTable(std::shared_ptr & res, const Chunk & chunk, size_t columns_num); @@ -32,6 +32,10 @@ private: /// because LowCardinality column from header always has indexes type UInt8, so, we should get /// proper indexes type from first chunk of data. bool is_arrow_fields_initialized = false; + + /// Output columns with String data type as Arrow::String type. + /// By default Arrow::Binary is used. + bool output_string_as_string = false; }; } diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 9990c33f0bb..0eaa02c97cb 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -112,7 +112,9 @@ String CSVFormatReader::readCSVFieldIntoString() void CSVFormatReader::skipField() { - readCSVFieldIntoString(); + skipWhitespacesAndTabs(*in); + NullOutput out; + readCSVStringInto(out, *in, format_settings.csv); } void CSVFormatReader::skipRowEndDelimiter() @@ -374,6 +376,7 @@ void registerFileSegmentationEngineCSV(FormatFactory & factory) }; registerWithNamesAndTypes("CSV", register_func); + markFormatWithNamesAndTypesSupportsSamplingColumns("CSV", factory); } void registerCSVSchemaReader(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp index 67743a04bf3..ad173e449d6 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp @@ -310,6 +310,7 @@ void registerInputFormatCapnProto(FormatFactory & factory) return std::make_shared(buf, sample, std::move(params), FormatSchemaInfo(settings, "CapnProto", true), settings); }); + factory.markFormatSupportsSubsetOfColumns("CapnProto"); factory.registerFileExtension("capnp", "CapnProto"); } diff --git a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp index 74c5fb1945a..56a639a0e30 100644 --- a/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CustomSeparatedRowInputFormat.cpp @@ -333,6 +333,7 @@ void registerInputFormatCustomSeparated(FormatFactory & factory) }); }; 
registerWithNamesAndTypes(ignore_spaces ? "CustomSeparatedIgnoreSpaces" : "CustomSeparated", register_func); + markFormatWithNamesAndTypesSupportsSamplingColumns(ignore_spaces ? "CustomSeparatedIgnoreSpaces" : "CustomSeparated", factory); } } diff --git a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp index e31006ff0f6..d369eedceea 100644 --- a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include @@ -221,12 +221,12 @@ void registerInputFormatJSONAsString(FormatFactory & factory) void registerFileSegmentationEngineJSONAsString(FormatFactory & factory) { - factory.registerFileSegmentationEngine("JSONAsString", &fileSegmentationEngineJSONEachRow); + factory.registerFileSegmentationEngine("JSONAsString", &JSONUtils::fileSegmentationEngineJSONEachRow); } void registerNonTrivialPrefixAndSuffixCheckerJSONAsString(FormatFactory & factory) { - factory.registerNonTrivialPrefixAndSuffixChecker("JSONAsString", nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); + factory.registerNonTrivialPrefixAndSuffixChecker("JSONAsString", JSONUtils::nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); } void registerJSONAsStringSchemaReader(FormatFactory & factory) @@ -251,12 +251,12 @@ void registerInputFormatJSONAsObject(FormatFactory & factory) void registerNonTrivialPrefixAndSuffixCheckerJSONAsObject(FormatFactory & factory) { - factory.registerNonTrivialPrefixAndSuffixChecker("JSONAsObject", nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); + factory.registerNonTrivialPrefixAndSuffixChecker("JSONAsObject", JSONUtils::nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); } void registerFileSegmentationEngineJSONAsObject(FormatFactory & factory) { - factory.registerFileSegmentationEngine("JSONAsObject", &fileSegmentationEngineJSONEachRow); + factory.registerFileSegmentationEngine("JSONAsObject", &JSONUtils::fileSegmentationEngineJSONEachRow); } void registerJSONAsObjectSchemaReader(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormat.cpp b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormat.cpp new file mode 100644 index 00000000000..22264d01a57 --- /dev/null +++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormat.cpp @@ -0,0 +1,71 @@ +#include +#include +#include + +namespace DB +{ + +JSONColumnsReader::JSONColumnsReader(ReadBuffer & in_) : JSONColumnsReaderBase(in_) +{ +} + +void JSONColumnsReader::readChunkStart() +{ + skipWhitespaceIfAny(*in); + assertChar('{', *in); + skipWhitespaceIfAny(*in); +} + +std::optional JSONColumnsReader::readColumnStart() +{ + skipWhitespaceIfAny(*in); + String name; + readJSONString(name, *in); + skipWhitespaceIfAny(*in); + assertChar(':', *in); + skipWhitespaceIfAny(*in); + assertChar('[', *in); + skipWhitespaceIfAny(*in); + return name; +} + +bool JSONColumnsReader::checkChunkEnd() +{ + skipWhitespaceIfAny(*in); + if (!in->eof() && *in->position() == '}') + { + ++in->position(); + skipWhitespaceIfAny(*in); + return true; + } + return false; +} + + +void registerInputFormatJSONColumns(FormatFactory & factory) +{ + factory.registerInputFormat( + "JSONColumns", + [](ReadBuffer & buf, + const Block &sample, + const RowInputFormatParams &, + const FormatSettings & settings) + { + return std::make_shared(buf, sample, settings, std::make_unique(buf)); + } + ); + factory.markFormatSupportsSubsetOfColumns("JSONColumns"); +} + 
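+/// The schema reader registered below reuses the same JSONColumnsReader. As an
+/// illustration (not produced by this code): for a stream like
+///     {"a": [1, 2], "b": ["x", null]}
+/// it would walk both columns and infer something like a -> Int64, b -> Nullable(String)
+/// (the exact types come from JSONUtils::getDataTypeFromField and may differ).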
+void registerJSONColumnsSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader( + "JSONColumns", + [](ReadBuffer & buf, const FormatSettings & settings) + { + return std::make_shared(buf, settings, std::make_unique(buf)); + } + ); +} + +} diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormat.h b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormat.h new file mode 100644 index 00000000000..f8b8a80731e --- /dev/null +++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormat.h @@ -0,0 +1,25 @@ +#pragma once + +#include + +namespace DB +{ + +/* Format JSONColumns reads each block of data in the next format: + * { + * "name1": [value1, value2, value3, ...], + * "name2": [value1, value2, value3, ...], + * ... + * } + */ +class JSONColumnsReader : public JSONColumnsReaderBase +{ +public: + JSONColumnsReader(ReadBuffer & in_); + + void readChunkStart() override; + std::optional readColumnStart() override; + bool checkChunkEnd() override; +}; + +} diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp new file mode 100644 index 00000000000..cdde87f2cf6 --- /dev/null +++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp @@ -0,0 +1,272 @@ +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INCORRECT_DATA; + extern const int EMPTY_DATA_PASSED; +} + + +JSONColumnsReaderBase::JSONColumnsReaderBase(ReadBuffer & in_) : in(&in_) +{ +} + +bool JSONColumnsReaderBase::checkColumnEnd() +{ + skipWhitespaceIfAny(*in); + if (!in->eof() && *in->position() == ']') + { + ++in->position(); + skipWhitespaceIfAny(*in); + return true; + } + return false; +} + +bool JSONColumnsReaderBase::checkColumnEndOrSkipFieldDelimiter() +{ + if (checkColumnEnd()) + return true; + skipWhitespaceIfAny(*in); + assertChar(',', *in); + skipWhitespaceIfAny(*in); + return false; +} + +bool JSONColumnsReaderBase::checkChunkEndOrSkipColumnDelimiter() +{ + if (checkChunkEnd()) + return true; + skipWhitespaceIfAny(*in); + assertChar(',', *in); + skipWhitespaceIfAny(*in); + return false; +} + +void JSONColumnsReaderBase::skipColumn() +{ + /// We assume that '[' has already been read, so we skip until the matching ']'. + size_t balance = 1; + bool inside_quotes = false; + char * pos; + while (!in->eof() && balance) + { + if (inside_quotes) + pos = find_first_symbols<'"'>(in->position(), in->buffer().end()); + else + pos = find_first_symbols<'[', ']', '"'>(in->position(), in->buffer().end()); + + in->position() = pos; + if (in->position() == in->buffer().end()) + continue; + + if (*in->position() == '"') + inside_quotes = !inside_quotes; + else if (*in->position() == ']') + --balance; + else if (*in->position() == '[') + ++balance; + ++in->position(); + } +} + +JSONColumnsBlockInputFormatBase::JSONColumnsBlockInputFormatBase( + ReadBuffer & in_, const Block & header_, const FormatSettings & format_settings_, std::unique_ptr reader_) + : IInputFormat(header_, in_) + , format_settings(format_settings_) + , fields(header_.getNamesAndTypes()) + , name_to_index(header_.getNamesToIndexesMap()) + , serializations(header_.getSerializations()) + , reader(std::move(reader_)) +{ +} + +size_t JSONColumnsBlockInputFormatBase::readColumn( + IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, const String & column_name) +{ + /// Check for empty column.
+ if (reader->checkColumnEnd()) + return 0; + + do + { + JSONUtils::readField(*in, column, type, serialization, column_name, format_settings, false); + } + while (!reader->checkColumnEndOrSkipFieldDelimiter()); + + return column.size(); +} + +void JSONColumnsBlockInputFormatBase::setReadBuffer(ReadBuffer & in_) +{ + reader->setReadBuffer(in_); + IInputFormat::setReadBuffer(in_); +} + +Chunk JSONColumnsBlockInputFormatBase::generate() +{ + MutableColumns columns = getPort().getHeader().cloneEmptyColumns(); + block_missing_values.clear(); + + skipWhitespaceIfAny(*in); + if (in->eof()) + return {}; + + reader->readChunkStart(); + /// Check for empty block. + if (reader->checkChunkEnd()) + return Chunk(std::move(columns), 0); + + std::vector seen_columns(columns.size(), 0); + Int64 rows = -1; + size_t iteration = 0; + do + { + auto column_name = reader->readColumnStart(); + size_t column_index = iteration; + if (column_name.has_value()) + { + /// Check if this name appears in the header. If not, skip this column or throw + /// an exception according to the setting input_format_skip_unknown_fields + if (!name_to_index.contains(*column_name)) + { + if (!format_settings.skip_unknown_fields) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unknown column found in input data: {}", *column_name); + + reader->skipColumn(); + continue; + } + column_index = name_to_index[*column_name]; + } + + if (column_index >= columns.size()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Input data has too many columns, expected {} columns", columns.size()); + + seen_columns[column_index] = 1; + size_t columns_size = readColumn(*columns[column_index], fields[column_index].type, serializations[column_index], fields[column_index].name); + if (rows != -1 && size_t(rows) != columns_size) + throw Exception(ErrorCodes::INCORRECT_DATA, "Number of rows differs in different columns: {} != {}", rows, columns_size); + rows = columns_size; + ++iteration; + } + while (!reader->checkChunkEndOrSkipColumnDelimiter()); + + if (rows <= 0) + return Chunk(std::move(columns), 0); + + /// Insert defaults in columns that were not present in the block and fill + /// block_missing_values accordingly if the setting input_format_defaults_for_omitted_fields is enabled + for (size_t i = 0; i != seen_columns.size(); ++i) + { + if (!seen_columns[i]) + { + columns[i]->insertManyDefaults(rows); + if (format_settings.defaults_for_omitted_fields) + block_missing_values.setBits(i, rows); + } + } + + return Chunk(std::move(columns), rows); +} + +JSONColumnsSchemaReaderBase::JSONColumnsSchemaReaderBase( + ReadBuffer & in_, const FormatSettings & format_settings_, std::unique_ptr reader_) + : ISchemaReader(in_), format_settings(format_settings_), reader(std::move(reader_)) +{ +} + +void JSONColumnsSchemaReaderBase::chooseResulType(DataTypePtr & type, const DataTypePtr & new_type, const String & column_name, size_t row) const +{ + auto common_type_checker = [&](const DataTypePtr & first, const DataTypePtr & second) + { + return JSONUtils::getCommonTypeForJSONFormats(first, second, format_settings.json.read_bools_as_numbers); + }; + chooseResultColumnType(type, new_type, common_type_checker, nullptr, column_name, row); +} + +NamesAndTypesList JSONColumnsSchemaReaderBase::readSchema() +{ + size_t total_rows_read = 0; + std::unordered_map names_to_types; + std::vector names_order; + /// Read data block by block and determine the type for each column + /// until max_rows_to_read_for_schema_inference is reached.
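+ /// Types inferred for the same column in different blocks are merged via chooseResulType() below.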
+ while (total_rows_read < format_settings.max_rows_to_read_for_schema_inference) + { + if (in.eof()) + break; + + reader->readChunkStart(); + /// Check for empty block. + if (reader->checkChunkEnd()) + continue; + + size_t iteration = 0; + size_t rows_in_block = 0; + do + { + auto column_name_opt = reader->readColumnStart(); + /// If the format doesn't have names for columns, use default names 'c1', 'c2', ... + String column_name = column_name_opt.has_value() ? *column_name_opt : "c" + std::to_string(iteration + 1); + /// Keep order of column names as it is in input data. + if (!names_to_types.contains(column_name)) + names_order.push_back(column_name); + + rows_in_block = 0; + auto column_type = readColumnAndGetDataType(column_name, rows_in_block, format_settings.max_rows_to_read_for_schema_inference - total_rows_read); + chooseResulType(names_to_types[column_name], column_type, column_name, total_rows_read + 1); + ++iteration; + } + while (!reader->checkChunkEndOrSkipColumnDelimiter()); + + total_rows_read += rows_in_block; + } + + if (names_to_types.empty()) + throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Cannot read rows from the data"); + + NamesAndTypesList result; + for (auto & name : names_order) + { + auto & type = names_to_types[name]; + /// Check that we could determine the type of this column. + checkResultColumnTypeAndAppend(result, type, name, nullptr, format_settings.max_rows_to_read_for_schema_inference); + } + + return result; +} + +DataTypePtr JSONColumnsSchemaReaderBase::readColumnAndGetDataType(const String & column_name, size_t & rows_read, size_t max_rows_to_read) +{ + /// Check for empty column. + if (reader->checkColumnEnd()) + return nullptr; + + String field; + DataTypePtr column_type; + do + { + /// If we reached max_rows_to_read, skip the rest of this column. + if (rows_read == max_rows_to_read) + { + reader->skipColumn(); + break; + } + + readJSONField(field, in); + DataTypePtr field_type = JSONUtils::getDataTypeFromField(field); + chooseResulType(column_type, field_type, column_name, rows_read); + ++rows_read; + } + while (!reader->checkColumnEndOrSkipFieldDelimiter()); + + return column_type; +} + +} diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h new file mode 100644 index 00000000000..ac746a2e2d1 --- /dev/null +++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h @@ -0,0 +1,92 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ + +class ReadBuffer; + + +/// Base class for reading data in Columnar JSON formats. +class JSONColumnsReaderBase +{ +public: + JSONColumnsReaderBase(ReadBuffer & in_); + + virtual ~JSONColumnsReaderBase() = default; + + void setReadBuffer(ReadBuffer & in_) { in = &in_; } + + virtual void readChunkStart() = 0; + virtual std::optional readColumnStart() = 0; + + virtual bool checkChunkEnd() = 0; + bool checkChunkEndOrSkipColumnDelimiter(); + + bool checkColumnEnd(); + bool checkColumnEndOrSkipFieldDelimiter(); + + void skipColumn(); + +protected: + ReadBuffer * in; +}; + + +/// Base class for Columnar JSON input formats. It works with data through the +/// JSONColumnsReaderBase interface. +/// To implement a new Columnar JSON format, implement a new JSONColumnsReaderBase +/// and provide it to JSONColumnsBlockInputFormatBase.
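+/// A sketch of such a reader (an illustration, not part of this patch: the name
+/// MyColumnsReader is invented, whitespace handling is omitted, and only helpers
+/// already used in this patch are assumed):
+///
+///     class MyColumnsReader : public JSONColumnsReaderBase
+///     {
+///     public:
+///         explicit MyColumnsReader(ReadBuffer & in_) : JSONColumnsReaderBase(in_) {}
+///
+///         void readChunkStart() override { assertChar('{', *in); }
+///
+///         std::optional<String> readColumnStart() override
+///         {
+///             String name;
+///             readJSONString(name, *in);  /// column name
+///             assertChar(':', *in);
+///             assertChar('[', *in);       /// values follow as a JSON array
+///             return name;                /// std::nullopt would mean "match columns by position"
+///         }
+///
+///         bool checkChunkEnd() override { return checkChar('}', *in); }
+///     };
+///
+/// Such a reader would be passed as std::make_unique<MyColumnsReader>(buf) to the
+/// constructors of JSONColumnsBlockInputFormatBase and JSONColumnsSchemaReaderBase.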
+class JSONColumnsBlockInputFormatBase : public IInputFormat +{ +public: + JSONColumnsBlockInputFormatBase(ReadBuffer & in_, const Block & header_, const FormatSettings & format_settings_, std::unique_ptr reader_); + + String getName() const override { return "JSONColumnsBlockInputFormatBase"; } + + void setReadBuffer(ReadBuffer & in_) override; + + const BlockMissingValues & getMissingValues() const override { return block_missing_values; } + +protected: + Chunk generate() override; + + size_t readColumn(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, const String & column_name); + + const FormatSettings format_settings; + const NamesAndTypes fields; + /// Maps column names and their positions in header. + std::unordered_map name_to_index; + Serializations serializations; + std::unique_ptr reader; + BlockMissingValues block_missing_values; +}; + + +/// Base class for schema inference from Columnar JSON input formats. It works with data using +/// JSONColumnsReaderBase interface. +/// To implement schema reader for the new Columnar JSON format you need to implement new JSONColumnsReaderBase +/// interface and provide it to JSONColumnsSchemaReaderBase. +class JSONColumnsSchemaReaderBase : public ISchemaReader +{ +public: + JSONColumnsSchemaReaderBase(ReadBuffer & in_, const FormatSettings & format_settings_, std::unique_ptr reader_); + +private: + NamesAndTypesList readSchema() override; + + /// Read whole column in the block (up to max_rows_to_read rows) and extract the data type. + DataTypePtr readColumnAndGetDataType(const String & column_name, size_t & rows_read, size_t max_rows_to_read); + + /// Choose result type for column from two inferred types from different rows. + void chooseResulType(DataTypePtr & type, const DataTypePtr & new_type, const String & column_name, size_t row) const; + + const FormatSettings format_settings; + std::unique_ptr reader; +}; + +} diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockOutputFormat.cpp b/src/Processors/Formats/Impl/JSONColumnsBlockOutputFormat.cpp new file mode 100644 index 00000000000..dd8688c655e --- /dev/null +++ b/src/Processors/Formats/Impl/JSONColumnsBlockOutputFormat.cpp @@ -0,0 +1,49 @@ +#include +#include +#include +#include +#include + +namespace DB +{ + +JSONColumnsBlockOutputFormat::JSONColumnsBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_, size_t indent_) + : JSONColumnsBlockOutputFormatBase(out_, header_, format_settings_), fields(header_.getNamesAndTypes()), indent(indent_) +{ + for (auto & field : fields) + { + WriteBufferFromOwnString buf; + writeJSONString(field.name, buf, format_settings); + field.name = buf.str().substr(1, buf.str().size() - 2); + } +} + +void JSONColumnsBlockOutputFormat::writeChunkStart() +{ + JSONUtils::writeObjectStart(*ostr, indent); +} + +void JSONColumnsBlockOutputFormat::writeColumnStart(size_t column_index) +{ + JSONUtils::writeCompactArrayStart(*ostr, indent + 1, fields[column_index].name.data()); +} + +void JSONColumnsBlockOutputFormat::writeChunkEnd() +{ + JSONUtils::writeObjectEnd(*ostr, indent); + writeChar('\n', *ostr); +} + +void registerOutputFormatJSONColumns(FormatFactory & factory) +{ + factory.registerOutputFormat("JSONColumns", []( + WriteBuffer & buf, + const Block & sample, + const RowOutputFormatParams &, + const FormatSettings & format_settings) + { + return std::make_shared(buf, sample, format_settings); + }); +} + +} diff --git 
a/src/Processors/Formats/Impl/JSONColumnsBlockOutputFormat.h b/src/Processors/Formats/Impl/JSONColumnsBlockOutputFormat.h new file mode 100644 index 00000000000..e52f5f61aec --- /dev/null +++ b/src/Processors/Formats/Impl/JSONColumnsBlockOutputFormat.h @@ -0,0 +1,32 @@ +#pragma once + +#include + +namespace DB +{ + +/* Format JSONColumns outputs all data as a single block in the next format: + * { + * "name1": [value1, value2, value3, ...], + * "name2": [value1, value2, value3, ...], + * ... + * } + */ +class JSONColumnsBlockOutputFormat : public JSONColumnsBlockOutputFormatBase +{ +public: + JSONColumnsBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_, size_t indent_ = 0); + + String getName() const override { return "JSONColumnsBlockOutputFormat"; } + +protected: + void writeChunkStart() override; + void writeChunkEnd() override; + + void writeColumnStart(size_t column_index) override; + + NamesAndTypes fields; + size_t indent; +}; + +} diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockOutputFormatBase.cpp b/src/Processors/Formats/Impl/JSONColumnsBlockOutputFormatBase.cpp new file mode 100644 index 00000000000..8e83282408b --- /dev/null +++ b/src/Processors/Formats/Impl/JSONColumnsBlockOutputFormatBase.cpp @@ -0,0 +1,66 @@ +#include +#include +#include + + +namespace DB +{ + +JSONColumnsBlockOutputFormatBase::JSONColumnsBlockOutputFormatBase( + WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_) + : IOutputFormat(header_, out_) + , format_settings(format_settings_) + , serializations(header_.getSerializations()) + , ostr(&out) +{ +} + +void JSONColumnsBlockOutputFormatBase::consume(Chunk chunk) +{ + if (!mono_chunk) + { + mono_chunk = std::move(chunk); + return; + } + + mono_chunk.append(chunk); +} + +void JSONColumnsBlockOutputFormatBase::writeSuffix() +{ + writeChunk(mono_chunk); + mono_chunk.clear(); +} + +void JSONColumnsBlockOutputFormatBase::writeChunk(Chunk & chunk) +{ + writeChunkStart(); + const auto & columns = chunk.getColumns(); + for (size_t i = 0; i != columns.size(); ++i) + { + writeColumnStart(i); + writeColumn(*columns[i], *serializations[i]); + writeColumnEnd(i == columns.size() - 1); + } + writeChunkEnd(); +} + +void JSONColumnsBlockOutputFormatBase::writeColumnEnd(bool is_last) +{ + JSONUtils::writeCompactArrayEnd(*ostr); + if (!is_last) + JSONUtils::writeFieldDelimiter(*ostr); +} + +void JSONColumnsBlockOutputFormatBase::writeColumn(const IColumn & column, const ISerialization & serialization) +{ + for (size_t i = 0; i != column.size(); ++i) + { + if (i != 0) + JSONUtils::writeFieldCompactDelimiter(*ostr); + serialization.serializeTextJSON(column, i, *ostr, format_settings); + } +} + +} diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockOutputFormatBase.h b/src/Processors/Formats/Impl/JSONColumnsBlockOutputFormatBase.h new file mode 100644 index 00000000000..133979523f9 --- /dev/null +++ b/src/Processors/Formats/Impl/JSONColumnsBlockOutputFormatBase.h @@ -0,0 +1,42 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ + +class WriteBuffer; + +/// Base class for Columnar JSON output formats. +/// It buffers all data and outputs it as a single block in the writeSuffix() method.
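+/// This buffering is what makes columnar output possible: a column can only be
+/// serialized once all of its rows are known, so consume() merges every incoming
+/// chunk into mono_chunk and writeChunk() runs exactly once, at the end of the
+/// stream. The trade-off is that the whole result stays in memory until then.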
+class JSONColumnsBlockOutputFormatBase : public IOutputFormat +{ +public: + JSONColumnsBlockOutputFormatBase(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_); + + String getName() const override { return "JSONColumnsBlockOutputFormatBase"; } + +protected: + void consume(Chunk chunk) override; + void writeSuffix() override; + + void writeChunk(Chunk & chunk); + void writeColumn(const IColumn & column, const ISerialization & serialization); + + virtual void writeChunkStart() = 0; + virtual void writeChunkEnd() = 0; + virtual void writeColumnStart(size_t /*column_index*/) = 0; + void writeColumnEnd(bool is_last); + + const FormatSettings format_settings; + const Serializations serializations; + + WriteBuffer * ostr; + + Chunk mono_chunk; +}; + +} diff --git a/src/Processors/Formats/Impl/JSONColumnsWithMetadataBlockOutputFormat.cpp b/src/Processors/Formats/Impl/JSONColumnsWithMetadataBlockOutputFormat.cpp new file mode 100644 index 00000000000..394385e548d --- /dev/null +++ b/src/Processors/Formats/Impl/JSONColumnsWithMetadataBlockOutputFormat.cpp @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +JSONColumnsWithMetadataBlockOutputFormat::JSONColumnsWithMetadataBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_) + : JSONColumnsBlockOutputFormat(out_, header_, format_settings_, 1) +{ + bool need_validate_utf8 = false; + JSONUtils::makeNamesAndTypesWithValidUTF8(fields, format_settings, need_validate_utf8); + + if (need_validate_utf8) + { + validating_ostr = std::make_unique(out); + ostr = validating_ostr.get(); + } +} + +void JSONColumnsWithMetadataBlockOutputFormat::writePrefix() +{ + JSONUtils::writeObjectStart(*ostr); + JSONUtils::writeMetadata(fields, format_settings, *ostr); +} + +void JSONColumnsWithMetadataBlockOutputFormat::writeSuffix() +{ + rows = mono_chunk.getNumRows(); + JSONColumnsBlockOutputFormatBase::writeSuffix(); +} + +void JSONColumnsWithMetadataBlockOutputFormat::writeChunkStart() +{ + JSONUtils::writeFieldDelimiter(*ostr, 2); + JSONUtils::writeObjectStart(*ostr, 1, "data"); +} + +void JSONColumnsWithMetadataBlockOutputFormat::writeChunkEnd() +{ + JSONUtils::writeObjectEnd(*ostr, indent); +} + +void JSONColumnsWithMetadataBlockOutputFormat::consumeExtremes(Chunk chunk) +{ + auto num_rows = chunk.getNumRows(); + if (num_rows != 2) + throw Exception("Got " + toString(num_rows) + " in extremes chunk, expected 2", ErrorCodes::LOGICAL_ERROR); + + const auto & columns = chunk.getColumns(); + JSONUtils::writeFieldDelimiter(*ostr, 2); + JSONUtils::writeObjectStart(*ostr, 1, "extremes"); + writeExtremesElement("min", columns, 0); + JSONUtils::writeFieldDelimiter(*ostr); + writeExtremesElement("max", columns, 1); + JSONUtils::writeObjectEnd(*ostr, 1); +} + +void JSONColumnsWithMetadataBlockOutputFormat::writeExtremesElement(const char * title, const Columns & columns, size_t row_num) +{ + JSONUtils::writeObjectStart(*ostr, 2, title); + JSONUtils::writeColumns(columns, fields, serializations, row_num, false, format_settings, *ostr, 3); + JSONUtils::writeObjectEnd(*ostr, 2); +} + +void JSONColumnsWithMetadataBlockOutputFormat::consumeTotals(Chunk chunk) +{ + auto num_rows = chunk.getNumRows(); + if (num_rows != 1) + throw Exception("Got " + toString(num_rows) + " in totals chunk, expected 1", ErrorCodes::LOGICAL_ERROR); + + const auto & columns = chunk.getColumns(); + 
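/// "totals" becomes one more top-level object in the output, written right after "data". +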
JSONUtils::writeFieldDelimiter(*ostr, 2); + JSONUtils::writeObjectStart(*ostr, 1, "totals"); + JSONUtils::writeColumns(columns, fields, serializations, 0, false, format_settings, *ostr, 2); + JSONUtils::writeObjectEnd(*ostr, 1); +} + +void JSONColumnsWithMetadataBlockOutputFormat::finalizeImpl() +{ + auto outside_statistics = getOutsideStatistics(); + if (outside_statistics) + statistics = std::move(*outside_statistics); + + JSONUtils::writeAdditionalInfo( + rows, + statistics.rows_before_limit, + statistics.applied_limit, + statistics.watch, + statistics.progress, + format_settings.write_statistics, + *ostr); + + JSONUtils::writeObjectEnd(*ostr); + writeChar('\n', *ostr); + ostr->next(); +} + +void registerOutputFormatJSONColumnsWithMetadata(FormatFactory & factory) +{ + factory.registerOutputFormat("JSONColumnsWithMetadata", []( + WriteBuffer & buf, + const Block & sample, + const RowOutputFormatParams &, + const FormatSettings & format_settings) + { + return std::make_shared(buf, sample, format_settings); + }); + + factory.markFormatHasNoAppendSupport("JSONColumnsWithMetadata"); +} + +} diff --git a/src/Processors/Formats/Impl/JSONColumnsWithMetadataBlockOutputFormat.h b/src/Processors/Formats/Impl/JSONColumnsWithMetadataBlockOutputFormat.h new file mode 100644 index 00000000000..f56a79bdf56 --- /dev/null +++ b/src/Processors/Formats/Impl/JSONColumnsWithMetadataBlockOutputFormat.h @@ -0,0 +1,67 @@ +#pragma once +#include + +namespace DB +{ + +/* Format JSONColumnsWithMetadata outputs all data as a single block in the next format: + * { + * "meta": + * [ + * { + * "name": "name1", + * "type": "type1" + * }, + * { + * "name": "name2", + * "type": "type2" + * }, + * ... + * ], + * + * "data": + * { + * "name1": [value1, value2, value3, ...], + * "name2": [value1, value2, value3, ...], + * ... + * }, + * + * "rows": ..., + * + * "statistics": + * { + * "elapsed": ..., + * "rows_read": ..., + * "bytes_read": ... + * } + * } + */ +class JSONColumnsWithMetadataBlockOutputFormat : public JSONColumnsBlockOutputFormat +{ +public: + JSONColumnsWithMetadataBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_); + + String getName() const override { return "JSONColumnsWithMetadataBlockOutputFormat"; } + + void setRowsBeforeLimit(size_t rows_before_limit_) override { statistics.rows_before_limit = rows_before_limit_; statistics.applied_limit = true; } + void onProgress(const Progress & progress_) override { statistics.progress.incrementPiecewiseAtomically(progress_); } + +protected: + void consumeTotals(Chunk chunk) override; + void consumeExtremes(Chunk chunk) override; + + void writePrefix() override; + void writeSuffix() override; + void finalizeImpl() override; + + void writeChunkStart() override; + void writeChunkEnd() override; + + void writeExtremesElement(const char * title, const Columns & columns, size_t row_num); + + Statistics statistics; + std::unique_ptr validating_ostr; /// Validates UTF-8 sequences, replaces bad sequences with the replacement character.
+ size_t rows; +}; + +} diff --git a/src/Processors/Formats/Impl/JSONCompactColumnsBlockInputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactColumnsBlockInputFormat.cpp new file mode 100644 index 00000000000..5b26ee2677b --- /dev/null +++ b/src/Processors/Formats/Impl/JSONCompactColumnsBlockInputFormat.cpp @@ -0,0 +1,65 @@ +#include +#include +#include + +namespace DB +{ + +JSONCompactColumnsReader::JSONCompactColumnsReader(ReadBuffer & in_) : JSONColumnsReaderBase(in_) +{ +} + +void JSONCompactColumnsReader::readChunkStart() +{ + skipWhitespaceIfAny(*in); + assertChar('[', *in); + skipWhitespaceIfAny(*in); +} + +std::optional JSONCompactColumnsReader::readColumnStart() +{ + skipWhitespaceIfAny(*in); + assertChar('[', *in); + skipWhitespaceIfAny(*in); + return std::nullopt; +} + +bool JSONCompactColumnsReader::checkChunkEnd() +{ + skipWhitespaceIfAny(*in); + if (!in->eof() && *in->position() == ']') + { + ++in->position(); + skipWhitespaceIfAny(*in); + return true; + } + return false; +} + + +void registerInputFormatJSONCompactColumns(FormatFactory & factory) +{ + factory.registerInputFormat( + "JSONCompactColumns", + [](ReadBuffer & buf, + const Block &sample, + const RowInputFormatParams &, + const FormatSettings & settings) + { + return std::make_shared(buf, sample, settings, std::make_unique(buf)); + } + ); +} + +void registerJSONCompactColumnsSchemaReader(FormatFactory & factory) +{ + factory.registerSchemaReader( + "JSONCompactColumns", + [](ReadBuffer & buf, const FormatSettings & settings) + { + return std::make_shared(buf, settings, std::make_unique(buf)); + } + ); +} + +} diff --git a/src/Processors/Formats/Impl/JSONCompactColumnsBlockInputFormat.h b/src/Processors/Formats/Impl/JSONCompactColumnsBlockInputFormat.h new file mode 100644 index 00000000000..7f23e127ab4 --- /dev/null +++ b/src/Processors/Formats/Impl/JSONCompactColumnsBlockInputFormat.h @@ -0,0 +1,25 @@ +#pragma once + +#include + +namespace DB +{ + +/* Format JSONCompactColumns reads each block of data in the next format: + * [ + * [value1, value2, value3, ...], + * [value1, value2, value3, ...], + * ...
+ * ] + */ +class JSONCompactColumnsReader : public JSONColumnsReaderBase +{ +public: + JSONCompactColumnsReader(ReadBuffer & in_); + + void readChunkStart() override; + std::optional readColumnStart() override; + bool checkChunkEnd() override; +}; + +} diff --git a/src/Processors/Formats/Impl/JSONCompactColumnsBlockOutputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactColumnsBlockOutputFormat.cpp new file mode 100644 index 00000000000..757345cbbe0 --- /dev/null +++ b/src/Processors/Formats/Impl/JSONCompactColumnsBlockOutputFormat.cpp @@ -0,0 +1,42 @@ +#include +#include +#include +#include + +namespace DB
{ + +JSONCompactColumnsBlockOutputFormat::JSONCompactColumnsBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_) + : JSONColumnsBlockOutputFormatBase(out_, header_, format_settings_), column_names(header_.getNames()) +{ +} + +void JSONCompactColumnsBlockOutputFormat::writeChunkStart() +{ + JSONUtils::writeArrayStart(*ostr); +} + +void JSONCompactColumnsBlockOutputFormat::writeColumnStart(size_t) +{ + JSONUtils::writeCompactArrayStart(*ostr, 1); +} + +void JSONCompactColumnsBlockOutputFormat::writeChunkEnd() +{ + JSONUtils::writeArrayEnd(*ostr); + writeChar('\n', *ostr); +} + +void registerOutputFormatJSONCompactColumns(FormatFactory & factory) +{ + factory.registerOutputFormat("JSONCompactColumns", []( + WriteBuffer & buf, + const Block & sample, + const RowOutputFormatParams &, + const FormatSettings & format_settings) + { + return std::make_shared(buf, sample, format_settings); + }); +} + +} diff --git a/src/Processors/Formats/Impl/JSONCompactColumnsBlockOutputFormat.h b/src/Processors/Formats/Impl/JSONCompactColumnsBlockOutputFormat.h new file mode 100644 index 00000000000..49612ed67f6 --- /dev/null +++ b/src/Processors/Formats/Impl/JSONCompactColumnsBlockOutputFormat.h @@ -0,0 +1,30 @@ +#pragma once +#include + +namespace DB +{ + +/* Format JSONCompactColumns outputs all data as a single block in the next format: + * [ + * [value1, value2, value3, ...], + * [value1, value2, value3, ...], + * ...
+ * ] + */ +class JSONCompactColumnsBlockOutputFormat : public JSONColumnsBlockOutputFormatBase +{ +public: + JSONCompactColumnsBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_); + + String getName() const override { return "JSONCompactColumnsBlockOutputFormat"; } + +protected: + void writeChunkStart() override; + void writeChunkEnd() override; + + void writeColumnStart(size_t column_index) override; + + const Names column_names; +}; + +} diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp index 867b56c541b..0b7cc6669be 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include #include @@ -109,7 +109,7 @@ std::vector JSONCompactEachRowFormatReader::readHeaderRow() bool JSONCompactEachRowFormatReader::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & column_name) { skipWhitespaceIfAny(*in); - return readFieldImpl(*in, column, type, serialization, column_name, format_settings, yield_strings); + return JSONUtils::readField(*in, column, type, serialization, column_name, format_settings, yield_strings); } bool JSONCompactEachRowFormatReader::parseRowStartWithDiagnosticInfo(WriteBuffer & out) @@ -189,7 +189,7 @@ JSONCompactEachRowRowSchemaReader::JSONCompactEachRowRowSchemaReader( bool allow_bools_as_numbers = format_settings_.json.read_bools_as_numbers; setCommonTypeChecker([allow_bools_as_numbers](const DataTypePtr & first, const DataTypePtr & second) { - return getCommonTypeForJSONFormats(first, second, allow_bools_as_numbers); + return JSONUtils::getCommonTypeForJSONFormats(first, second, allow_bools_as_numbers); }); } @@ -209,7 +209,7 @@ DataTypes JSONCompactEachRowRowSchemaReader::readRowAndGetDataTypes() if (in.eof()) return {}; - return readRowAndGetDataTypesForJSONCompactEachRow(in, reader.yieldStrings()); + return JSONUtils::readRowAndGetDataTypesForJSONCompactEachRow(in, reader.yieldStrings()); } void registerInputFormatJSONCompactEachRow(FormatFactory & factory) @@ -229,6 +229,7 @@ void registerInputFormatJSONCompactEachRow(FormatFactory & factory) }; registerWithNamesAndTypes(yield_strings ? "JSONCompactStringsEachRow" : "JSONCompactEachRow", register_func); + markFormatWithNamesAndTypesSupportsSamplingColumns(yield_strings ? 
"JSONCompactStringsEachRow" : "JSONCompactEachRow", factory); } } @@ -258,7 +259,7 @@ void registerFileSegmentationEngineJSONCompactEachRow(FormatFactory & factory) size_t min_rows = 1 + int(with_names) + int(with_types); factory.registerFileSegmentationEngine(format_name, [min_rows](ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size) { - return fileSegmentationEngineJSONCompactEachRow(in, memory, min_chunk_size, min_rows); + return JSONUtils::fileSegmentationEngineJSONCompactEachRow(in, memory, min_chunk_size, min_rows); }); }; diff --git a/src/Processors/Formats/Impl/JSONCompactRowOutputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactRowOutputFormat.cpp index b31c04b4554..47b79b71ae2 100644 --- a/src/Processors/Formats/Impl/JSONCompactRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONCompactRowOutputFormat.cpp @@ -1,5 +1,6 @@ #include #include +#include #include @@ -20,72 +21,50 @@ JSONCompactRowOutputFormat::JSONCompactRowOutputFormat( void JSONCompactRowOutputFormat::writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) { - if (yield_strings) - { - WriteBufferFromOwnString buf; - - serialization.serializeText(column, row_num, buf, settings); - writeJSONString(buf.str(), *ostr, settings); - } - else - serialization.serializeTextJSON(column, row_num, *ostr, settings); - + JSONUtils::writeFieldFromColumn(column, serialization, row_num, yield_strings, settings, *ostr); ++field_number; } void JSONCompactRowOutputFormat::writeFieldDelimiter() { - writeCString(", ", *ostr); + JSONUtils::writeFieldCompactDelimiter(*ostr); } -void JSONCompactRowOutputFormat::writeTotalsFieldDelimiter() -{ - writeCString(",", *ostr); -} - - void JSONCompactRowOutputFormat::writeRowStartDelimiter() { - writeCString("\t\t[", *ostr); + JSONUtils::writeCompactArrayStart(*ostr, 2); } void JSONCompactRowOutputFormat::writeRowEndDelimiter() { - writeChar(']', *ostr); + JSONUtils::writeCompactArrayEnd(*ostr); field_number = 0; ++row_count; } void JSONCompactRowOutputFormat::writeBeforeTotals() { - writeCString(",\n", *ostr); - writeChar('\n', *ostr); - writeCString("\t\"totals\": [", *ostr); + JSONUtils::writeFieldDelimiter(*ostr, 2); + JSONUtils::writeCompactArrayStart(*ostr, 1, "totals"); +} + +void JSONCompactRowOutputFormat::writeTotals(const Columns & columns, size_t row_num) +{ + JSONUtils::writeCompactColumns(columns, serializations, row_num, yield_strings, settings, *ostr); } void JSONCompactRowOutputFormat::writeAfterTotals() { - writeChar(']', *ostr); + JSONUtils::writeCompactArrayEnd(*ostr); } void JSONCompactRowOutputFormat::writeExtremesElement(const char * title, const Columns & columns, size_t row_num) { - writeCString("\t\t\"", *ostr); - writeCString(title, *ostr); - writeCString("\": [", *ostr); - - size_t extremes_columns = columns.size(); - for (size_t i = 0; i < extremes_columns; ++i) - { - if (i != 0) - writeTotalsFieldDelimiter(); - - writeField(*columns[i], *serializations[i], row_num); - } - - writeChar(']', *ostr); + JSONUtils::writeCompactArrayStart(*ostr, 2, title); + JSONUtils::writeCompactColumns(columns, serializations, row_num, yield_strings, settings, *ostr); + JSONUtils::writeCompactArrayEnd(*ostr); } void registerOutputFormatJSONCompact(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/JSONCompactRowOutputFormat.h b/src/Processors/Formats/Impl/JSONCompactRowOutputFormat.h index a0e9a2a6026..d17a6acf019 100644 --- a/src/Processors/Formats/Impl/JSONCompactRowOutputFormat.h +++ 
b/src/Processors/Formats/Impl/JSONCompactRowOutputFormat.h @@ -36,12 +36,7 @@ private: void writeExtremesElement(const char * title, const Columns & columns, size_t row_num) override; - void writeTotalsField(const IColumn & column, const ISerialization & serialization, size_t row_num) override - { - return writeField(column, serialization, row_num); - } - - void writeTotalsFieldDelimiter() override; + void writeTotals(const Columns & columns, size_t row_num) override; }; } diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp index 30dca893afa..9eef72f95da 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include @@ -140,7 +140,7 @@ void JSONEachRowRowInputFormat::readField(size_t index, MutableColumns & columns seen_columns[index] = true; const auto & type = getPort().getHeader().getByPosition(index).type; const auto & serialization = serializations[index]; - read_columns[index] = readFieldImpl(*in, *columns[index], type, serialization, columnName(index), format_settings, yield_strings); + read_columns[index] = JSONUtils::readField(*in, *columns[index], type, serialization, columnName(index), format_settings, yield_strings); } inline bool JSONEachRowRowInputFormat::advanceToNextKey(size_t key_index) @@ -313,7 +313,7 @@ JSONEachRowSchemaReader::JSONEachRowSchemaReader(ReadBuffer & in_, bool json_str bool allow_bools_as_numbers = format_settings.json.read_bools_as_numbers; setCommonTypeChecker([allow_bools_as_numbers](const DataTypePtr & first, const DataTypePtr & second) { - return getCommonTypeForJSONFormats(first, second, allow_bools_as_numbers); + return JSONUtils::getCommonTypeForJSONFormats(first, second, allow_bools_as_numbers); }); } @@ -350,7 +350,7 @@ NamesAndTypesList JSONEachRowSchemaReader::readRowAndGetNamesAndDataTypes(bool & return {}; } - return readRowAndGetNamesAndDataTypesForJSONEachRow(in, json_strings); + return JSONUtils::readRowAndGetNamesAndDataTypesForJSONEachRow(in, json_strings); } void registerInputFormatJSONEachRow(FormatFactory & factory) @@ -393,22 +393,27 @@ void registerInputFormatJSONEachRow(FormatFactory & factory) { return std::make_shared(buf, sample, std::move(params), settings, true); }); + + factory.markFormatSupportsSubsetOfColumns("JSONEachRow"); + factory.markFormatSupportsSubsetOfColumns("JSONLines"); + factory.markFormatSupportsSubsetOfColumns("NDJSON"); + factory.markFormatSupportsSubsetOfColumns("JSONStringsEachRow"); } void registerFileSegmentationEngineJSONEachRow(FormatFactory & factory) { - factory.registerFileSegmentationEngine("JSONEachRow", &fileSegmentationEngineJSONEachRow); - factory.registerFileSegmentationEngine("JSONStringsEachRow", &fileSegmentationEngineJSONEachRow); - factory.registerFileSegmentationEngine("JSONLines", &fileSegmentationEngineJSONEachRow); - factory.registerFileSegmentationEngine("NDJSON", &fileSegmentationEngineJSONEachRow); + factory.registerFileSegmentationEngine("JSONEachRow", &JSONUtils::fileSegmentationEngineJSONEachRow); + factory.registerFileSegmentationEngine("JSONStringsEachRow", &JSONUtils::fileSegmentationEngineJSONEachRow); + factory.registerFileSegmentationEngine("JSONLines", &JSONUtils::fileSegmentationEngineJSONEachRow); + factory.registerFileSegmentationEngine("NDJSON", &JSONUtils::fileSegmentationEngineJSONEachRow); } void 
registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory) { - factory.registerNonTrivialPrefixAndSuffixChecker("JSONEachRow", nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); - factory.registerNonTrivialPrefixAndSuffixChecker("JSONStringsEachRow", nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); - factory.registerNonTrivialPrefixAndSuffixChecker("JSONLines", nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); - factory.registerNonTrivialPrefixAndSuffixChecker("NDJSON", nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); + factory.registerNonTrivialPrefixAndSuffixChecker("JSONEachRow", JSONUtils::nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); + factory.registerNonTrivialPrefixAndSuffixChecker("JSONStringsEachRow", JSONUtils::nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); + factory.registerNonTrivialPrefixAndSuffixChecker("JSONLines", JSONUtils::nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); + factory.registerNonTrivialPrefixAndSuffixChecker("NDJSON", JSONUtils::nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); } void registerJSONEachRowSchemaReader(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/JSONRowOutputFormat.cpp b/src/Processors/Formats/Impl/JSONRowOutputFormat.cpp index 61ac25ca441..fc2d3cb8133 100644 --- a/src/Processors/Formats/Impl/JSONRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONRowOutputFormat.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace DB @@ -15,23 +16,9 @@ JSONRowOutputFormat::JSONRowOutputFormat( bool yield_strings_) : IRowOutputFormat(header, out_, params_), settings(settings_), yield_strings(yield_strings_) { - const auto & sample = getPort(PortKind::Main).getHeader(); - NamesAndTypesList columns(sample.getNamesAndTypesList()); - fields.assign(columns.begin(), columns.end()); - bool need_validate_utf8 = false; - for (size_t i = 0; i < sample.columns(); ++i) - { - if (!sample.getByPosition(i).type->textCanContainOnlyValidUTF8()) - need_validate_utf8 = true; - - WriteBufferFromOwnString buf; - { - WriteBufferValidUTF8 validating_buf(buf); - writeJSONString(fields[i].name, validating_buf, settings); - } - fields[i].name = buf.str(); - } + fields = header.getNamesAndTypes(); + JSONUtils::makeNamesAndTypesWithValidUTF8(fields, settings, need_validate_utf8); if (need_validate_utf8) { @@ -45,88 +32,34 @@ JSONRowOutputFormat::JSONRowOutputFormat( void JSONRowOutputFormat::writePrefix() { - writeCString("{\n", *ostr); - writeCString("\t\"meta\":\n", *ostr); - writeCString("\t[\n", *ostr); - - for (size_t i = 0; i < fields.size(); ++i) - { - writeCString("\t\t{\n", *ostr); - - writeCString("\t\t\t\"name\": ", *ostr); - writeString(fields[i].name, *ostr); - writeCString(",\n", *ostr); - writeCString("\t\t\t\"type\": ", *ostr); - writeJSONString(fields[i].type->getName(), *ostr, settings); - writeChar('\n', *ostr); - - writeCString("\t\t}", *ostr); - if (i + 1 < fields.size()) - writeChar(',', *ostr); - writeChar('\n', *ostr); - } - - writeCString("\t],\n", *ostr); - writeChar('\n', *ostr); - writeCString("\t\"data\":\n", *ostr); - writeCString("\t[\n", *ostr); + JSONUtils::writeObjectStart(*ostr); + JSONUtils::writeMetadata(fields, settings, *ostr); + JSONUtils::writeFieldDelimiter(*ostr, 2); + JSONUtils::writeArrayStart(*ostr, 1, "data"); } void JSONRowOutputFormat::writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) { - writeCString("\t\t\t", *ostr); - writeString(fields[field_number].name, *ostr); - writeCString(": ", *ostr); - - if (yield_strings) - { - 
WriteBufferFromOwnString buf; - - serialization.serializeText(column, row_num, buf, settings); - writeJSONString(buf.str(), *ostr, settings); - } - else - serialization.serializeTextJSON(column, row_num, *ostr, settings); - - ++field_number; -} - -void JSONRowOutputFormat::writeTotalsField(const IColumn & column, const ISerialization & serialization, size_t row_num) -{ - writeCString("\t\t", *ostr); - writeString(fields[field_number].name, *ostr); - writeCString(": ", *ostr); - - if (yield_strings) - { - WriteBufferFromOwnString buf; - - serialization.serializeText(column, row_num, buf, settings); - writeJSONString(buf.str(), *ostr, settings); - } - else - serialization.serializeTextJSON(column, row_num, *ostr, settings); - + JSONUtils::writeFieldFromColumn(column, serialization, row_num, yield_strings, settings, *ostr, fields[field_number].name, 3); ++field_number; } void JSONRowOutputFormat::writeFieldDelimiter() { - writeCString(",\n", *ostr); + JSONUtils::writeFieldDelimiter(*ostr); } void JSONRowOutputFormat::writeRowStartDelimiter() { - writeCString("\t\t{\n", *ostr); + JSONUtils::writeObjectStart(*ostr, 2); } void JSONRowOutputFormat::writeRowEndDelimiter() { - writeChar('\n', *ostr); - writeCString("\t\t}", *ostr); + JSONUtils::writeObjectEnd(*ostr, 2); field_number = 0; ++row_count; } @@ -134,71 +67,42 @@ void JSONRowOutputFormat::writeRowEndDelimiter() void JSONRowOutputFormat::writeRowBetweenDelimiter() { - writeCString(",\n", *ostr); + JSONUtils::writeFieldDelimiter(*ostr); } void JSONRowOutputFormat::writeSuffix() { - writeChar('\n', *ostr); - writeCString("\t]", *ostr); + JSONUtils::writeArrayEnd(*ostr, 1); } void JSONRowOutputFormat::writeBeforeTotals() { - writeCString(",\n", *ostr); - writeChar('\n', *ostr); - writeCString("\t\"totals\":\n", *ostr); - writeCString("\t{\n", *ostr); + JSONUtils::writeFieldDelimiter(*ostr, 2); + JSONUtils::writeObjectStart(*ostr, 1, "totals"); } void JSONRowOutputFormat::writeTotals(const Columns & columns, size_t row_num) { - size_t columns_size = columns.size(); - - for (size_t i = 0; i < columns_size; ++i) - { - if (i != 0) - writeTotalsFieldDelimiter(); - - writeTotalsField(*columns[i], *serializations[i], row_num); - } + JSONUtils::writeColumns(columns, fields, serializations, row_num, yield_strings, settings, *ostr, 2); } void JSONRowOutputFormat::writeAfterTotals() { - writeChar('\n', *ostr); - writeCString("\t}", *ostr); - field_number = 0; + JSONUtils::writeObjectEnd(*ostr, 1); } void JSONRowOutputFormat::writeBeforeExtremes() { - writeCString(",\n", *ostr); - writeChar('\n', *ostr); - writeCString("\t\"extremes\":\n", *ostr); - writeCString("\t{\n", *ostr); + JSONUtils::writeFieldDelimiter(*ostr, 2); + JSONUtils::writeObjectStart(*ostr, 1, "extremes"); } void JSONRowOutputFormat::writeExtremesElement(const char * title, const Columns & columns, size_t row_num) { - writeCString("\t\t\"", *ostr); - writeCString(title, *ostr); - writeCString("\":\n", *ostr); - writeCString("\t\t{\n", *ostr); - - size_t extremes_columns = columns.size(); - for (size_t i = 0; i < extremes_columns; ++i) - { - if (i != 0) - writeFieldDelimiter(); - - writeField(*columns[i], *serializations[i], row_num); - } - - writeChar('\n', *ostr); - writeCString("\t\t}", *ostr); - field_number = 0; + JSONUtils::writeObjectStart(*ostr, 2, title); + JSONUtils::writeColumns(columns, fields, serializations, row_num, yield_strings, settings, *ostr, 3); + JSONUtils::writeObjectEnd(*ostr, 2); } void JSONRowOutputFormat::writeMinExtreme(const Columns & columns, size_t row_num) 
@@ -213,58 +117,29 @@ void JSONRowOutputFormat::writeMaxExtreme(const Columns & columns, size_t row_nu void JSONRowOutputFormat::writeAfterExtremes() { - writeChar('\n', *ostr); - writeCString("\t}", *ostr); + JSONUtils::writeObjectEnd(*ostr, 1); } void JSONRowOutputFormat::finalizeImpl() { - writeCString(",\n\n", *ostr); - writeCString("\t\"rows\": ", *ostr); - writeIntText(row_count, *ostr); - auto outside_statistics = getOutsideStatistics(); if (outside_statistics) statistics = std::move(*outside_statistics); - writeRowsBeforeLimitAtLeast(); - - if (settings.write_statistics) - writeStatistics(); + JSONUtils::writeAdditionalInfo( + row_count, + statistics.rows_before_limit, + statistics.applied_limit, + statistics.watch, + statistics.progress, + settings.write_statistics, + *ostr); + JSONUtils::writeObjectEnd(*ostr); writeChar('\n', *ostr); - writeCString("}\n", *ostr); ostr->next(); } -void JSONRowOutputFormat::writeRowsBeforeLimitAtLeast() -{ - if (statistics.applied_limit) - { - writeCString(",\n\n", *ostr); - writeCString("\t\"rows_before_limit_at_least\": ", *ostr); - writeIntText(statistics.rows_before_limit, *ostr); - } -} - -void JSONRowOutputFormat::writeStatistics() -{ - writeCString(",\n\n", *ostr); - writeCString("\t\"statistics\":\n", *ostr); - writeCString("\t{\n", *ostr); - - writeCString("\t\t\"elapsed\": ", *ostr); - writeText(statistics.watch.elapsedSeconds(), *ostr); - writeCString(",\n", *ostr); - writeCString("\t\t\"rows_read\": ", *ostr); - writeText(statistics.progress.read_rows.load(), *ostr); - writeCString(",\n", *ostr); - writeCString("\t\t\"bytes_read\": ", *ostr); - writeText(statistics.progress.read_bytes.load(), *ostr); - writeChar('\n', *ostr); - - writeCString("\t}", *ostr); -} void JSONRowOutputFormat::onProgress(const Progress & value) { diff --git a/src/Processors/Formats/Impl/JSONRowOutputFormat.h b/src/Processors/Formats/Impl/JSONRowOutputFormat.h index 8561f5b4870..3459cc1b7a6 100644 --- a/src/Processors/Formats/Impl/JSONRowOutputFormat.h +++ b/src/Processors/Formats/Impl/JSONRowOutputFormat.h @@ -63,12 +63,7 @@ protected: void finalizeImpl() override; - virtual void writeTotalsField(const IColumn & column, const ISerialization & serialization, size_t row_num); virtual void writeExtremesElement(const char * title, const Columns & columns, size_t row_num); - virtual void writeTotalsFieldDelimiter() { writeFieldDelimiter(); } - - void writeRowsBeforeLimitAtLeast(); - void writeStatistics(); void onRowsReadBeforeUpdate() override { row_count = getRowsReadBefore(); } diff --git a/src/Processors/Formats/Impl/LineAsStringRowInputFormat.cpp b/src/Processors/Formats/Impl/LineAsStringRowInputFormat.cpp index dc346b4f5f5..30084804d92 100644 --- a/src/Processors/Formats/Impl/LineAsStringRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/LineAsStringRowInputFormat.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include diff --git a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp index 5f3f015a5b1..8e787edf8ab 100644 --- a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp @@ -397,8 +397,8 @@ bool MySQLDumpRowInputFormat::readField(IColumn & column, size_t column_idx) void MySQLDumpRowInputFormat::skipField() { - String tmp; - readQuotedFieldIntoString(tmp, *in); + NullOutput out; + readQuotedFieldInto(out, *in); } MySQLDumpSchemaReader::MySQLDumpSchemaReader(ReadBuffer & in_, const FormatSettings & 
format_settings_) @@ -434,7 +434,7 @@ DataTypes MySQLDumpSchemaReader::readRowAndGetDataTypes() if (!data_types.empty()) skipFieldDelimiter(in); - readQuotedFieldIntoString(value, in); + readQuotedField(value, in); auto type = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Quoted); data_types.push_back(std::move(type)); } diff --git a/src/Processors/Formats/Impl/NativeFormat.cpp b/src/Processors/Formats/Impl/NativeFormat.cpp index c1dc60022f5..423fd483712 100644 --- a/src/Processors/Formats/Impl/NativeFormat.cpp +++ b/src/Processors/Formats/Impl/NativeFormat.cpp @@ -15,9 +15,9 @@ namespace DB class NativeInputFormat final : public IInputFormat { public: - NativeInputFormat(ReadBuffer & buf, const Block & header_) + NativeInputFormat(ReadBuffer & buf, const Block & header_, const FormatSettings & settings) : IInputFormat(header_, buf) - , reader(std::make_unique(buf, header_, 0)) + , reader(std::make_unique(buf, header_, 0, settings.skip_unknown_fields)) , header(header_) {} String getName() const override { return "Native"; } @@ -112,10 +112,11 @@ void registerInputFormatNative(FormatFactory & factory) ReadBuffer & buf, const Block & sample, const RowInputFormatParams &, - const FormatSettings &) + const FormatSettings & settings) { - return std::make_shared(buf, sample); + return std::make_shared(buf, sample, settings); }); + factory.markFormatSupportsSubsetOfColumns("Native"); } void registerOutputFormatNative(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/ODBCDriver2BlockOutputFormat.cpp b/src/Processors/Formats/Impl/ODBCDriver2BlockOutputFormat.cpp index a82285c1c19..7cf133e5739 100644 --- a/src/Processors/Formats/Impl/ODBCDriver2BlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ODBCDriver2BlockOutputFormat.cpp @@ -13,7 +13,7 @@ namespace DB { ODBCDriver2BlockOutputFormat::ODBCDriver2BlockOutputFormat( WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_) - : IOutputFormat(header_, out_), format_settings(format_settings_) + : IOutputFormat(header_, out_), format_settings(format_settings_), serializations(header_.getSerializations()) { } @@ -23,7 +23,7 @@ static void writeODBCString(WriteBuffer & out, const std::string & str) out.write(str.data(), str.size()); } -void ODBCDriver2BlockOutputFormat::writeRow(const Serializations & serializations, const Columns & columns, size_t row_idx, std::string & buffer) +void ODBCDriver2BlockOutputFormat::writeRow(const Columns & columns, size_t row_idx, std::string & buffer) { size_t num_columns = columns.size(); for (size_t column_idx = 0; column_idx < num_columns; ++column_idx) @@ -46,20 +46,14 @@ void ODBCDriver2BlockOutputFormat::writeRow(const Serializations & serialization } } -void ODBCDriver2BlockOutputFormat::write(Chunk chunk, PortKind port_kind) +void ODBCDriver2BlockOutputFormat::write(Chunk chunk, PortKind) { String text_value; - const auto & header = getPort(port_kind).getHeader(); const auto & columns = chunk.getColumns(); - size_t num_columns = columns.size(); - Serializations serializations(num_columns); - for (size_t i = 0; i < num_columns; ++i) - serializations[i] = header.getByPosition(i).type->getDefaultSerialization(); - const size_t rows = chunk.getNumRows(); for (size_t i = 0; i < rows; ++i) - writeRow(serializations, columns, i, text_value); + writeRow(columns, i, text_value); } void ODBCDriver2BlockOutputFormat::consume(Chunk chunk) diff --git a/src/Processors/Formats/Impl/ODBCDriver2BlockOutputFormat.h 
b/src/Processors/Formats/Impl/ODBCDriver2BlockOutputFormat.h index de6ea22dfd7..9a0a43aa5bb 100644 --- a/src/Processors/Formats/Impl/ODBCDriver2BlockOutputFormat.h +++ b/src/Processors/Formats/Impl/ODBCDriver2BlockOutputFormat.h @@ -35,8 +35,9 @@ private: void writePrefix() override; const FormatSettings format_settings; + Serializations serializations; - void writeRow(const Serializations & serializations, const Columns & columns, size_t row_idx, std::string & buffer); + void writeRow(const Columns & columns, size_t row_idx, std::string & buffer); void write(Chunk chunk, PortKind port_kind); }; diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp index 1531c0d2794..36126c21bf1 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp @@ -65,10 +65,8 @@ Chunk ORCBlockInputFormat::generate() /// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields. /// Otherwise fill the missing columns with zero values of its type. if (format_settings.defaults_for_omitted_fields) - for (size_t row_idx = 0; row_idx < res.getNumRows(); ++row_idx) - for (const auto & column_idx : missing_columns) - block_missing_values.setBit(column_idx, row_idx); - + for (const auto & column_idx : missing_columns) + block_missing_values.setBits(column_idx, res.getNumRows()); return res; } @@ -200,7 +198,7 @@ void registerInputFormatORC(FormatFactory & factory) { return std::make_shared(buf, sample, settings); }); - factory.markFormatAsColumnOriented("ORC"); + factory.markFormatSupportsSubsetOfColumns("ORC"); } void registerORCSchemaReader(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp index 106b71a9df5..5e979c3d35a 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp @@ -55,7 +55,7 @@ ORCBlockOutputFormat::ORCBlockOutputFormat(WriteBuffer & out_, const Block & hea data_types.push_back(recursiveRemoveLowCardinality(type)); } -ORC_UNIQUE_PTR ORCBlockOutputFormat::getORCType(const DataTypePtr & type, const std::string & column_name) +ORC_UNIQUE_PTR ORCBlockOutputFormat::getORCType(const DataTypePtr & type) { switch (type->getTypeId()) { @@ -100,16 +100,18 @@ ORC_UNIQUE_PTR ORCBlockOutputFormat::getORCType(const DataTypePtr & t case TypeIndex::FixedString: [[fallthrough]]; case TypeIndex::String: { + if (format_settings.orc.output_string_as_string) + return orc::createPrimitiveType(orc::TypeKind::STRING); return orc::createPrimitiveType(orc::TypeKind::BINARY); } case TypeIndex::Nullable: { - return getORCType(removeNullable(type), column_name); + return getORCType(removeNullable(type)); } case TypeIndex::Array: { const auto * array_type = assert_cast(type.get()); - return orc::createListType(getORCType(array_type->getNestedType(), column_name)); + return orc::createListType(getORCType(array_type->getNestedType())); } case TypeIndex::Decimal32: { @@ -129,21 +131,19 @@ ORC_UNIQUE_PTR ORCBlockOutputFormat::getORCType(const DataTypePtr & t case TypeIndex::Tuple: { const auto * tuple_type = assert_cast(type.get()); + const auto & nested_names = tuple_type->getElementNames(); const auto & nested_types = tuple_type->getElements(); auto struct_type = orc::createStructType(); for (size_t i = 0; i < nested_types.size(); ++i) - { - String name = column_name + "." 
+ std::to_string(i); - struct_type->addStructField(name, getORCType(nested_types[i], name)); - } + struct_type->addStructField(nested_names[i], getORCType(nested_types[i])); return struct_type; } case TypeIndex::Map: { const auto * map_type = assert_cast(type.get()); return orc::createMapType( - getORCType(map_type->getKeyType(), column_name), - getORCType(map_type->getValueType(), column_name) + getORCType(map_type->getKeyType()), + getORCType(map_type->getValueType()) ); } default: @@ -512,7 +512,7 @@ void ORCBlockOutputFormat::prepareWriter() options.setCompression(orc::CompressionKind::CompressionKind_NONE); size_t columns_count = header.columns(); for (size_t i = 0; i != columns_count; ++i) - schema->addStructField(header.safeGetByPosition(i).name, getORCType(recursiveRemoveLowCardinality(data_types[i]), header.safeGetByPosition(i).name)); + schema->addStructField(header.safeGetByPosition(i).name, getORCType(recursiveRemoveLowCardinality(data_types[i]))); writer = orc::createWriter(*schema, &output_stream, options); } diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.h b/src/Processors/Formats/Impl/ORCBlockOutputFormat.h index f69fd1c0aab..d4a19353915 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.h +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.h @@ -42,7 +42,7 @@ private: void consume(Chunk chunk) override; void finalizeImpl() override; - ORC_UNIQUE_PTR getORCType(const DataTypePtr & type, const std::string & column_name); + ORC_UNIQUE_PTR getORCType(const DataTypePtr & type); /// ConvertFunc is needed for type UInt8, because firstly UInt8 (char8_t) must be /// converted to unsigned char (bugprone-signed-char-misuse in clang). diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 86987c665e0..12fa9710c42 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -66,9 +66,8 @@ Chunk ParquetBlockInputFormat::generate() /// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields. /// Otherwise fill the missing columns with zero values of its type. 
if (format_settings.defaults_for_omitted_fields) - for (size_t row_idx = 0; row_idx < res.getNumRows(); ++row_idx) - for (const auto & column_idx : missing_columns) - block_missing_values.setBit(column_idx, row_idx); + for (const auto & column_idx : missing_columns) + block_missing_values.setBits(column_idx, res.getNumRows()); return res; } @@ -193,7 +192,7 @@ void registerInputFormatParquet(FormatFactory & factory) { return std::make_shared(buf, sample, settings); }); - factory.markFormatAsColumnOriented("Parquet"); + factory.markFormatSupportsSubsetOfColumns("Parquet"); } void registerParquetSchemaReader(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp index 68e2ae1c6eb..c8e94311af5 100644 --- a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp @@ -29,7 +29,7 @@ void ParquetBlockOutputFormat::consume(Chunk chunk) if (!ch_column_to_arrow_column) { const Block & header = getPort(PortKind::Main).getHeader(); - ch_column_to_arrow_column = std::make_unique(header, "Parquet", false); + ch_column_to_arrow_column = std::make_unique(header, "Parquet", false, format_settings.parquet.output_string_as_string); } ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunk, columns_num); diff --git a/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp b/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp index ad65a5f707d..8fbf0a14916 100644 --- a/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp @@ -22,7 +22,7 @@ namespace ErrorCodes PrettyBlockOutputFormat::PrettyBlockOutputFormat( WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_) - : IOutputFormat(header_, out_), format_settings(format_settings_) + : IOutputFormat(header_, out_), format_settings(format_settings_), serializations(header_.getSerializations()) { struct winsize w; if (0 == ioctl(STDOUT_FILENO, TIOCGWINSZ, &w)) @@ -143,7 +143,7 @@ GridSymbols ascii_grid_symbols { } -void PrettyBlockOutputFormat::write(const Chunk & chunk, PortKind port_kind) +void PrettyBlockOutputFormat::write(Chunk chunk, PortKind port_kind) { UInt64 max_rows = format_settings.pretty.max_rows; @@ -158,10 +158,6 @@ void PrettyBlockOutputFormat::write(const Chunk & chunk, PortKind port_kind) const auto & columns = chunk.getColumns(); const auto & header = getPort(port_kind).getHeader(); - Serializations serializations(num_columns); - for (size_t i = 0; i < num_columns; ++i) - serializations[i] = header.getByPosition(i).type->getSerialization(*columns[i]->getSerializationInfo()); - WidthsPerColumn widths; Widths max_widths; Widths name_widths; @@ -371,21 +367,21 @@ void PrettyBlockOutputFormat::writeValueWithPadding( void PrettyBlockOutputFormat::consume(Chunk chunk) { - write(chunk, PortKind::Main); + write(std::move(chunk), PortKind::Main); } void PrettyBlockOutputFormat::consumeTotals(Chunk chunk) { total_rows = 0; writeCString("\nTotals:\n", out); - write(chunk, PortKind::Totals); + write(std::move(chunk), PortKind::Totals); } void PrettyBlockOutputFormat::consumeExtremes(Chunk chunk) { total_rows = 0; writeCString("\nExtremes:\n", out); - write(chunk, PortKind::Extremes); + write(std::move(chunk), PortKind::Extremes); } diff --git a/src/Processors/Formats/Impl/PrettyBlockOutputFormat.h b/src/Processors/Formats/Impl/PrettyBlockOutputFormat.h index 091010f9131..cfdd2213515 100644 
--- a/src/Processors/Formats/Impl/PrettyBlockOutputFormat.h +++ b/src/Processors/Formats/Impl/PrettyBlockOutputFormat.h @@ -33,11 +33,12 @@ protected: size_t row_number_width = 7; // "10000. " const FormatSettings format_settings; + Serializations serializations; using Widths = PODArray; using WidthsPerColumn = std::vector; - virtual void write(const Chunk & chunk, PortKind port_kind); + virtual void write(Chunk chunk, PortKind port_kind); void writeSuffix() override; void onRowsReadBeforeUpdate() override { total_rows = getRowsReadBefore(); } diff --git a/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.cpp b/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.cpp index be8751cde13..9ba358a76e1 100644 --- a/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.cpp @@ -149,7 +149,6 @@ void PrettyCompactBlockOutputFormat::writeBottom(const Widths & max_widths) void PrettyCompactBlockOutputFormat::writeRow( size_t row_num, const Block & header, - const Serializations & serializations, const Columns & columns, const WidthsPerColumn & widths, const Widths & max_widths) @@ -187,7 +186,7 @@ void PrettyCompactBlockOutputFormat::writeRow( writeCString("\n", out); } -void PrettyCompactBlockOutputFormat::write(const Chunk & chunk, PortKind port_kind) +void PrettyCompactBlockOutputFormat::write(Chunk chunk, PortKind port_kind) { UInt64 max_rows = format_settings.pretty.max_rows; @@ -202,18 +201,11 @@ void PrettyCompactBlockOutputFormat::write(const Chunk & chunk, PortKind port_ki { if (!mono_chunk) { - mono_chunk = chunk.clone(); + mono_chunk = std::move(chunk); return; } - MutableColumns mutation = mono_chunk.mutateColumns(); - for (size_t position = 0; position < mutation.size(); ++position) - { - auto column = chunk.getColumns()[position]; - mutation[position]->insertRangeFrom(*column, 0, column->size()); - } - size_t rows = mutation[0]->size(); - mono_chunk.setColumns(std::move(mutation), rows); + mono_chunk.append(chunk); return; } else @@ -241,13 +233,8 @@ void PrettyCompactBlockOutputFormat::writeChunk(const Chunk & chunk, PortKind po writeHeader(header, max_widths, name_widths); - size_t num_columns = header.columns(); - Serializations serializations(num_columns); - for (size_t i = 0; i < num_columns; ++i) - serializations[i] = header.getByPosition(i).type->getDefaultSerialization(); - for (size_t i = 0; i < num_rows && total_rows + i < max_rows; ++i) - writeRow(i, header, serializations, columns, widths, max_widths); + writeRow(i, header, columns, widths, max_widths); writeBottom(max_widths); diff --git a/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.h b/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.h index a52ffe3d70a..5c39328051c 100644 --- a/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.h +++ b/src/Processors/Formats/Impl/PrettyCompactBlockOutputFormat.h @@ -17,13 +17,12 @@ public: String getName() const override { return "PrettyCompactBlockOutputFormat"; } private: - void write(const Chunk & chunk, PortKind port_kind) override; + void write(Chunk chunk, PortKind port_kind) override; void writeHeader(const Block & block, const Widths & max_widths, const Widths & name_widths); void writeBottom(const Widths & max_widths); void writeRow( size_t row_num, const Block & header, - const Serializations & serializations, const Columns & columns, const WidthsPerColumn & widths, const Widths & max_widths); diff --git 
a/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp b/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp index 85b27a6fb57..730907ba45c 100644 --- a/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp @@ -9,7 +9,7 @@ namespace DB { -void PrettySpaceBlockOutputFormat::write(const Chunk & chunk, PortKind port_kind) +void PrettySpaceBlockOutputFormat::write(Chunk chunk, PortKind port_kind) { UInt64 max_rows = format_settings.pretty.max_rows; @@ -24,10 +24,6 @@ void PrettySpaceBlockOutputFormat::write(const Chunk & chunk, PortKind port_kind const auto & header = getPort(port_kind).getHeader(); const auto & columns = chunk.getColumns(); - Serializations serializations(num_columns); - for (size_t i = 0; i < num_columns; ++i) - serializations[i] = header.getByPosition(i).type->getSerialization(*columns[i]->getSerializationInfo()); - WidthsPerColumn widths; Widths max_widths; Widths name_widths; diff --git a/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.h b/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.h index b3090497783..6a8cb4e799c 100644 --- a/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.h +++ b/src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.h @@ -17,7 +17,7 @@ public: String getName() const override { return "PrettySpaceBlockOutputFormat"; } private: - void write(const Chunk & chunk, PortKind port_kind) override; + void write(Chunk chunk, PortKind port_kind) override; void writeSuffix() override; }; diff --git a/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp b/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp index 6fbcaa15536..4599734591f 100644 --- a/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp +++ b/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp @@ -79,7 +79,7 @@ void registerInputFormatProtobufList(FormatFactory & factory) return std::make_shared(buf, sample, std::move(params), FormatSchemaInfo(settings, "Protobuf", true), settings.protobuf.input_flatten_google_wrappers); }); - factory.markFormatAsColumnOriented("ProtobufList"); + factory.markFormatSupportsSubsetOfColumns("ProtobufList"); } void registerProtobufListSchemaReader(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp index 5c953a3fcc9..0376bf2c292 100644 --- a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp @@ -69,6 +69,7 @@ void registerInputFormatProtobuf(FormatFactory & factory) with_length_delimiter, settings.protobuf.input_flatten_google_wrappers); }); + factory.markFormatSupportsSubsetOfColumns(with_length_delimiter ? 
"Protobuf" : "ProtobufSingle"); } } diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp index 5c48062ace8..fe2c0c5ecdd 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp @@ -277,6 +277,8 @@ void registerInputFormatTSKV(FormatFactory & factory) { return std::make_shared(buf, sample, std::move(params), settings); }); + + factory.markFormatSupportsSubsetOfColumns("TSKV"); } void registerTSKVSchemaReader(FormatFactory & factory) { diff --git a/src/Processors/Formats/Impl/TSKVRowOutputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowOutputFormat.cpp index 14dec8420a8..0e29d74b419 100644 --- a/src/Processors/Formats/Impl/TSKVRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowOutputFormat.cpp @@ -8,12 +8,8 @@ namespace DB { TSKVRowOutputFormat::TSKVRowOutputFormat(WriteBuffer & out_, const Block & header, const RowOutputFormatParams & params_, const FormatSettings & format_settings_) - : TabSeparatedRowOutputFormat(out_, header, false, false, false, params_, format_settings_) + : TabSeparatedRowOutputFormat(out_, header, false, false, false, params_, format_settings_), fields(header.getNamesAndTypes()) { - const auto & sample = getPort(PortKind::Main).getHeader(); - NamesAndTypesList columns(sample.getNamesAndTypesList()); - fields.assign(columns.begin(), columns.end()); - for (auto & field : fields) { WriteBufferFromOwnString wb; diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index 5f39c7bd646..0be8257f463 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -80,7 +80,11 @@ String TabSeparatedFormatReader::readFieldIntoString() void TabSeparatedFormatReader::skipField() { - readFieldIntoString(); + NullOutput out; + if (is_raw) + readStringInto(out, *in); + else + readEscapedStringInto(out, *in); } void TabSeparatedFormatReader::skipHeaderRow() @@ -347,6 +351,8 @@ void registerFileSegmentationEngineTabSeparated(FormatFactory & factory) registerWithNamesAndTypes(is_raw ? "TSVRaw" : "TSV", register_func); registerWithNamesAndTypes(is_raw ? "TabSeparatedRaw" : "TabSeparated", register_func); + markFormatWithNamesAndTypesSupportsSamplingColumns(is_raw ? "TSVRaw" : "TSV", factory); + markFormatWithNamesAndTypesSupportsSamplingColumns(is_raw ? "TabSeparatedRaw" : "TabSeparated", factory); } // We can use the same segmentation engine for TSKV. 
diff --git a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp index 5c5b99f61da..0e7bdb259ac 100644 --- a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp @@ -3,7 +3,6 @@ #include #include #include -#include namespace DB @@ -17,15 +16,9 @@ namespace ErrorCodes TemplateBlockOutputFormat::TemplateBlockOutputFormat(const Block & header_, WriteBuffer & out_, const FormatSettings & settings_, ParsedTemplateFormatString format_, ParsedTemplateFormatString row_format_, std::string row_between_delimiter_) - : IOutputFormat(header_, out_), settings(settings_), format(std::move(format_)) + : IOutputFormat(header_, out_), settings(settings_), serializations(header_.getSerializations()), format(std::move(format_)) , row_format(std::move(row_format_)), row_between_delimiter(std::move(row_between_delimiter_)) { - const auto & sample = getPort(PortKind::Main).getHeader(); - size_t columns = sample.columns(); - serializations.resize(columns); - for (size_t i = 0; i < columns; ++i) - serializations[i] = sample.safeGetByPosition(i).type->getDefaultSerialization(); - /// Validate format string for whole output size_t data_idx = format.format_idx_to_column_idx.size() + 1; for (size_t i = 0; i < format.format_idx_to_column_idx.size(); ++i) diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp index 727b7fb0a1f..41f77f8bbf2 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp @@ -49,11 +49,8 @@ ValuesBlockInputFormat::ValuesBlockInputFormat( params(params_), format_settings(format_settings_), num_columns(header_.columns()), parser_type_for_column(num_columns, ParserType::Streaming), attempts_to_deduce_template(num_columns), attempts_to_deduce_template_cached(num_columns), - rows_parsed_using_template(num_columns), templates(num_columns), types(header_.getDataTypes()) + rows_parsed_using_template(num_columns), templates(num_columns), types(header_.getDataTypes()), serializations(header_.getSerializations()) { - serializations.resize(types.size()); - for (size_t i = 0; i < types.size(); ++i) - serializations[i] = types[i]->getDefaultSerialization(); } Chunk ValuesBlockInputFormat::generate() @@ -599,7 +596,7 @@ DataTypes ValuesSchemaReader::readRowAndGetDataTypes() skipWhitespaceIfAny(buf); } - readQuotedFieldIntoString(value, buf); + readQuotedField(value, buf); auto type = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Quoted); data_types.push_back(std::move(type)); } diff --git a/src/Processors/Formats/Impl/XMLRowOutputFormat.cpp b/src/Processors/Formats/Impl/XMLRowOutputFormat.cpp index cc2b37189f9..d5fb29874d1 100644 --- a/src/Processors/Formats/Impl/XMLRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/XMLRowOutputFormat.cpp @@ -8,11 +8,9 @@ namespace DB { XMLRowOutputFormat::XMLRowOutputFormat(WriteBuffer & out_, const Block & header_, const RowOutputFormatParams & params_, const FormatSettings & format_settings_) - : IRowOutputFormat(header_, out_, params_), format_settings(format_settings_) + : IRowOutputFormat(header_, out_, params_), fields(header_.getNamesAndTypes()), format_settings(format_settings_) { const auto & sample = getPort(PortKind::Main).getHeader(); - NamesAndTypesList columns(sample.getNamesAndTypesList()); - fields.assign(columns.begin(), columns.end()); 
field_tag_names.resize(sample.columns()); bool need_validate_utf8 = false; @@ -200,7 +198,6 @@ void XMLRowOutputFormat::onProgress(const Progress & value) void XMLRowOutputFormat::finalizeImpl() { - writeCString("\t", *ostr); writeIntText(row_count, *ostr); writeCString("\n", *ostr); diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index 9b7eed9f5ee..2625bf38bf7 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -77,8 +77,7 @@ static size_t tryAddNewFilterStep( /// New filter column is the first one. auto split_filter_column_name = (*split_filter->getIndex().begin())->result_name; node.step = std::make_unique( - node.children.at(0)->step->getOutputStream(), - std::move(split_filter), std::move(split_filter_column_name), true); + node.children.at(0)->step->getOutputStream(), std::move(split_filter), std::move(split_filter_column_name), true); return 3; } @@ -194,13 +193,13 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes /// Push down is for left table only. We need to update JoinStep for push down into right. /// Only inner and left join are supported. Other types may generate default values for left table keys. /// So, if we push down a condition like `key != 0`, not all rows may be filtered. - if (table_join.oneDisjunct() && (table_join.kind() == ASTTableJoin::Kind::Inner || table_join.kind() == ASTTableJoin::Kind::Left)) + if (table_join.kind() == ASTTableJoin::Kind::Inner || table_join.kind() == ASTTableJoin::Kind::Left) { const auto & left_header = join->getInputStreams().front().header; const auto & res_header = join->getOutputStream().header; Names allowed_keys; - const auto & key_names_left = table_join.getOnlyClause().key_names_left; - for (const auto & name : key_names_left) + const auto & source_columns = left_header.getNames(); + for (const auto & name : source_columns) { /// Skip key if it is renamed. /// I don't know if it is possible. Just in case. diff --git a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp index 9a8dd151830..ff30cfd8cf3 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp @@ -34,7 +34,7 @@ void optimizeTree(const QueryPlanOptimizationSettings & settings, QueryPlan::Nod }; std::stack stack; - stack.push(Frame{.node = &root}); + stack.push({.node = &root}); size_t max_optimizations_to_apply = settings.max_optimizations_to_apply; size_t total_applied_optimizations = 0; @@ -50,10 +50,10 @@ void optimizeTree(const QueryPlanOptimizationSettings & settings, QueryPlan::Nod /// Traverse all children first. if (frame.next_child < frame.node->children.size()) { - stack.push(Frame + stack.push( { - .node = frame.node->children[frame.next_child], - .depth_limit = frame.depth_limit ? (frame.depth_limit - 1) : 0, + .node = frame.node->children[frame.next_child], + .depth_limit = frame.depth_limit ? 
(frame.depth_limit - 1) : 0, }); ++frame.next_child; diff --git a/src/Processors/Transforms/PartialSortingTransform.cpp b/src/Processors/Transforms/PartialSortingTransform.cpp index 6a787a6cd15..3687fa770f0 100644 --- a/src/Processors/Transforms/PartialSortingTransform.cpp +++ b/src/Processors/Transforms/PartialSortingTransform.cpp @@ -81,9 +81,7 @@ size_t getFilterMask(const ColumnRawPtrs & lhs, const ColumnRawPtrs & rhs, size_ { /// Leave only rows that are less than row from rhs. filter[i] = compare_results[i] < 0; - - if (filter[i]) - ++result_size_hint; + result_size_hint += filter[i]; } return result_size_hint; diff --git a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp index 07584075097..c8ae9c6e07c 100644 --- a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp +++ b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp @@ -53,11 +53,20 @@ void WriteBufferFromHTTPServerResponse::writeHeaderProgress() *response_header_ostr << "X-ClickHouse-Progress: " << progress_string_writer.str() << "\r\n" << std::flush; } +void WriteBufferFromHTTPServerResponse::writeExceptionCode() +{ + if (headers_finished_sending || !exception_code) + return; + if (response_header_ostr) + *response_header_ostr << "X-ClickHouse-Exception-Code: " << exception_code << "\r\n" << std::flush; +} + void WriteBufferFromHTTPServerResponse::finishSendHeaders() { if (!headers_finished_sending) { writeHeaderSummary(); + writeExceptionCode(); headers_finished_sending = true; if (!is_http_method_head) @@ -150,7 +159,7 @@ void WriteBufferFromHTTPServerResponse::onProgress(const Progress & progress) accumulated_progress.incrementPiecewiseAtomically(progress); - if (progress_watch.elapsed() >= send_progress_interval_ms * 1000000) + if (send_progress && progress_watch.elapsed() >= send_progress_interval_ms * 1000000) { progress_watch.restart(); diff --git a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.h b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.h index 9f1d3e897e3..6905d5df8b5 100644 --- a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.h +++ b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.h @@ -66,12 +66,17 @@ public: add_cors_header = enable_cors; } + /// Send progress + void setSendProgress(bool send_progress_) { send_progress = send_progress_; } + /// Don't send HTTP headers with progress more frequently. void setSendProgressInterval(size_t send_progress_interval_ms_) { send_progress_interval_ms = send_progress_interval_ms_; } + void setExceptionCode(int exception_code_) { exception_code = exception_code_; } + private: /// Send at least HTTP headers if no data has been sent yet. /// Use after the data has possibly been sent and no error happened (and thus you do not plan @@ -88,6 +93,8 @@ private: void writeHeaderProgress(); // Used to write the header X-ClickHouse-Summary void writeHeaderSummary(); + // Used to write the header X-ClickHouse-Exception-Code even when progress has been sent + void writeExceptionCode(); /// This method finishes headers with \r\n, allowing to start to send body. void finishSendHeaders(); @@ -113,9 +120,12 @@ private: bool headers_finished_sending = false; /// If true, you could not add any headers. Progress accumulated_progress; + bool send_progress = false; size_t send_progress_interval_ms = 100; Stopwatch progress_watch; + int exception_code = 0; + std::mutex mutex; /// progress callback could be called from different threads.
}; diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index 4a1de7ddf2d..39870fc91dc 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -103,6 +103,8 @@ namespace ErrorCodes extern const int INVALID_SESSION_TIMEOUT; extern const int HTTP_LENGTH_REQUIRED; extern const int SUPPORT_IS_DISABLED; + + extern const int TIMEOUT_EXCEEDED; } namespace @@ -228,6 +230,10 @@ static Poco::Net::HTTPResponse::HTTPStatus exceptionCodeToHTTPStatus(int excepti { return HTTPResponse::HTTP_LENGTH_REQUIRED; } + else if (exception_code == ErrorCodes::TIMEOUT_EXCEEDED) + { + return HTTPResponse::HTTP_REQUEST_TIMEOUT; + } return HTTPResponse::HTTP_INTERNAL_SERVER_ERROR; } @@ -771,6 +777,7 @@ void HTTPHandler::processQuery( if (client_supports_http_compression) used_output.out->setCompressionLevel(settings.http_zlib_compression_level); + used_output.out->setSendProgress(settings.send_progress_in_http_headers); used_output.out->setSendProgressInterval(settings.http_headers_progress_interval_ms); /// If 'http_native_compression_disable_checksumming_on_decompress' setting is turned on, @@ -803,8 +810,8 @@ void HTTPHandler::processQuery( }; /// While still no data has been sent, we will report about query execution progress by sending HTTP headers. - if (settings.send_progress_in_http_headers) - append_callback([&used_output] (const Progress & progress) { used_output.out->onProgress(progress); }); + /// Note that we add it unconditionally so the progress is available for `X-ClickHouse-Summary` + append_callback([&used_output](const Progress & progress) { used_output.out->onProgress(progress); }); if (settings.readonly > 0 && settings.cancel_http_readonly_queries_on_client_close) { @@ -843,7 +850,12 @@ void HTTPHandler::trySendExceptionToClient( const std::string & s, int exception_code, HTTPServerRequest & request, HTTPServerResponse & response, Output & used_output) try { - response.set("X-ClickHouse-Exception-Code", toString(exception_code)); + /// In case data has already been sent, like progress headers, try using the output buffer to + /// set the exception code since it will be able to append it if it hasn't finished writing headers + if (response.sent() && used_output.out) + used_output.out->setExceptionCode(exception_code); + else + response.set("X-ClickHouse-Exception-Code", toString(exception_code)); /// FIXME: make sure that no one else is reading from the same stream at the moment. 
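[Editorial note] To make the ordering concrete: once progress headers have started streaming, response.set(...) can no longer add a header, but the write buffer can still slip one in before it terminates the header block. A simplified, self-contained sketch of that idea follows; HeaderBuffer is an illustrative stand-in, not the real WriteBufferFromHTTPServerResponse.

    #include <iostream>
    #include <sstream>
    #include <string>

    struct HeaderBuffer
    {
        std::ostringstream headers;
        bool finished = false;
        int exception_code = 0;

        // Progress headers may stream out long before the query finishes.
        void writeProgress(const std::string & json)
        {
            if (!finished)
                headers << "X-ClickHouse-Progress: " << json << "\r\n";
        }

        // Remember the code; it is emitted lazily when headers are finalized.
        void setExceptionCode(int code) { exception_code = code; }

        void finish()
        {
            if (finished)
                return;
            if (exception_code)
                headers << "X-ClickHouse-Exception-Code: " << exception_code << "\r\n";
            headers << "\r\n";
            finished = true;
        }
    };

    int main()
    {
        HeaderBuffer buf;
        buf.writeProgress("{\"read_rows\":\"100\"}");
        buf.setExceptionCode(159);   // pretend this is ErrorCodes::TIMEOUT_EXCEEDED
        buf.finish();                // the exception code still makes it into the headers
        std::cout << buf.headers.str();
        return 0;
    }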
diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 3aa5b28fed5..7a43ae7af4b 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -196,7 +196,7 @@ static auto getNameRange(const ColumnsDescription::ColumnsContainer & columns, c return std::make_pair(begin, end); } -void ColumnsDescription::add(ColumnDescription column, const String & after_column, bool first) +void ColumnsDescription::add(ColumnDescription column, const String & after_column, bool first, bool add_subcolumns) { if (has(column.name)) throw Exception("Cannot add column " + column.name + ": column with this name already exists", @@ -222,7 +222,8 @@ void ColumnsDescription::add(ColumnDescription column, const String & after_colu insert_it = range.second; } - addSubcolumns(column.name, column.type); + if (add_subcolumns) + addSubcolumns(column.name, column.type); columns.get<0>().insert(insert_it, std::move(column)); } @@ -572,6 +573,27 @@ std::optional ColumnsDescription::tryGetColumnOrSubcolumn(GetCo return tryGetColumn(GetColumnsOptions(kind).withSubcolumns(), column_name); } +std::optional ColumnsDescription::tryGetColumnDescription(const GetColumnsOptions & options, const String & column_name) const +{ + auto it = columns.get<1>().find(column_name); + if (it != columns.get<1>().end() && (defaultKindToGetKind(it->default_desc.kind) & options.kind)) + return *it; + + if (options.with_subcolumns) + { + auto jt = subcolumns.get<0>().find(column_name); + if (jt != subcolumns.get<0>().end()) + return ColumnDescription{jt->name, jt->type}; + } + + return {}; +} + +std::optional ColumnsDescription::tryGetColumnOrSubcolumnDescription(GetColumnsOptions::Kind kind, const String & column_name) const +{ + return tryGetColumnDescription(GetColumnsOptions(kind).withSubcolumns(), column_name); +} + NameAndTypePair ColumnsDescription::getColumnOrSubcolumn(GetColumnsOptions::Kind kind, const String & column_name) const { auto column = tryGetColumnOrSubcolumn(kind, column_name); diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h index 75db8b92545..c81ccb5d217 100644 --- a/src/Storages/ColumnsDescription.h +++ b/src/Storages/ColumnsDescription.h @@ -100,7 +100,7 @@ public: explicit ColumnsDescription(NamesAndTypesList ordinary, NamesAndAliases aliases); /// `after_column` can be a Nested column name; - void add(ColumnDescription column, const String & after_column = String(), bool first = false); + void add(ColumnDescription column, const String & after_column = String(), bool first = false, bool add_subcolumns = true); /// `column_name` can be a Nested column name; void remove(const String & column_name); @@ -180,6 +180,9 @@ public: std::optional tryGetColumnOrSubcolumn(GetColumnsOptions::Kind kind, const String & column_name) const; std::optional tryGetColumn(const GetColumnsOptions & options, const String & column_name) const; + std::optional tryGetColumnOrSubcolumnDescription(GetColumnsOptions::Kind kind, const String & column_name) const; + std::optional tryGetColumnDescription(const GetColumnsOptions & options, const String & column_name) const; + ColumnDefaults getDefaults() const; /// TODO: remove bool hasDefault(const String & column_name) const; bool hasDefaults() const; diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 0176487bbfe..5799100664e 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -476,9 +476,9 @@ private: }; -bool 
StorageHDFS::isColumnOriented() const +bool StorageHDFS::supportsSubsetOfColumns() const { - return format_name != "Distributed" && FormatFactory::instance().checkIfFormatIsColumnOriented(format_name); + return format_name != "Distributed" && FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name); } Pipe StorageHDFS::read( @@ -527,7 +527,7 @@ Pipe StorageHDFS::read( ColumnsDescription columns_description; Block block_for_format; - if (isColumnOriented()) + if (supportsSubsetOfColumns()) { auto fetch_columns = column_names; const auto & virtuals = getVirtuals(); @@ -539,8 +539,7 @@ Pipe StorageHDFS::read( if (fetch_columns.empty()) fetch_columns.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical())); - columns_description = ColumnsDescription{ - storage_snapshot->getSampleBlockForColumns(fetch_columns).getNamesAndTypesList()}; + columns_description = storage_snapshot->getDescriptionForColumns(fetch_columns); block_for_format = storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); } else diff --git a/src/Storages/HDFS/StorageHDFS.h b/src/Storages/HDFS/StorageHDFS.h index b431407eba5..c8ebbfcfaac 100644 --- a/src/Storages/HDFS/StorageHDFS.h +++ b/src/Storages/HDFS/StorageHDFS.h @@ -57,7 +57,7 @@ public: /// It is useful because column oriented formats could effectively skip unknown columns /// So we can create a header of only required columns in read method and ask /// format to read only them. Note: this hack cannot be done with ordinary formats like TSV. - bool isColumnOriented() const override; + bool supportsSubsetOfColumns() const override; static ColumnsDescription getTableStructureFromData( const String & format, diff --git a/src/Storages/Hive/StorageHive.cpp b/src/Storages/Hive/StorageHive.cpp index c66e1acc6e5..38c8c054a9b 100644 --- a/src/Storages/Hive/StorageHive.cpp +++ b/src/Storages/Hive/StorageHive.cpp @@ -668,7 +668,7 @@ HiveFilePtr StorageHive::getHiveFileIfNeeded( return hive_file; } -bool StorageHive::isColumnOriented() const +bool StorageHive::supportsSubsetOfColumns() const { return format_name == "Parquet" || format_name == "ORC"; } @@ -822,7 +822,7 @@ std::optional StorageHive::totalRowsImpl(const Settings & settings, const SelectQueryInfo & query_info, ContextPtr context_, PruneLevel prune_level) const { /// Row-based format like Text doesn't support totalRowsByPartitionPredicate - if (!isColumnOriented()) + if (!supportsSubsetOfColumns()) return {}; auto hive_metastore_client = HiveMetastoreClientFactory::instance().getOrCreate(hive_metastore_url); diff --git a/src/Storages/Hive/StorageHive.h b/src/Storages/Hive/StorageHive.h index 1b37a0afd15..d92d2dbd745 100644 --- a/src/Storages/Hive/StorageHive.h +++ b/src/Storages/Hive/StorageHive.h @@ -63,7 +63,7 @@ public: NamesAndTypesList getVirtuals() const override; - bool isColumnOriented() const override; + bool supportsSubsetOfColumns() const override; std::optional totalRows(const Settings & settings) const override; std::optional totalRowsByPartitionPredicate(const SelectQueryInfo & query_info, ContextPtr context_) const override; diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 005229acf4b..ed17a3af972 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -585,7 +585,7 @@ public: /// Returns true if all disks of storage are read-only.
virtual bool isStaticStorage() const; - virtual bool isColumnOriented() const { return false; } + virtual bool supportsSubsetOfColumns() const { return false; } /// If it is possible to quickly determine exact number of rows in the table at this moment of time, then return it. /// Used for: diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 2c9dd2b4934..40fba34cd03 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1282,12 +1282,12 @@ void IMergeTreeDataPart::storeVersionMetadata() const void IMergeTreeDataPart::appendCSNToVersionMetadata(VersionMetadata::WhichCSN which_csn) const { - assert(!version.creation_tid.isEmpty()); - assert(!(which_csn == VersionMetadata::WhichCSN::CREATION && version.creation_tid.isPrehistoric())); - assert(!(which_csn == VersionMetadata::WhichCSN::CREATION && version.creation_csn == 0)); - assert(!(which_csn == VersionMetadata::WhichCSN::REMOVAL && (version.removal_tid.isPrehistoric() || version.removal_tid.isEmpty()))); - assert(!(which_csn == VersionMetadata::WhichCSN::REMOVAL && version.removal_csn == 0)); - assert(isStoredOnDisk()); + chassert(!version.creation_tid.isEmpty()); + chassert(!(which_csn == VersionMetadata::WhichCSN::CREATION && version.creation_tid.isPrehistoric())); + chassert(!(which_csn == VersionMetadata::WhichCSN::CREATION && version.creation_csn == 0)); + chassert(!(which_csn == VersionMetadata::WhichCSN::REMOVAL && (version.removal_tid.isPrehistoric() || version.removal_tid.isEmpty()))); + chassert(!(which_csn == VersionMetadata::WhichCSN::REMOVAL && version.removal_csn == 0)); + chassert(isStoredOnDisk()); /// Small enough appends to file are usually atomic, /// so we append new metadata instead of rewriting file to reduce number of fsyncs. 
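[Editorial note] The comment above leans on the fact that a single short write(2) in append mode is effectively atomic, so the version metadata can grow by one record instead of being rewritten and re-fsynced wholesale. A rough POSIX illustration of that append-one-record pattern follows; the file name and record format are made up for the example, and the real code goes through the part's disk abstraction rather than raw syscalls.

    #include <fcntl.h>
    #include <unistd.h>
    #include <cstdio>
    #include <cstring>

    // Append one small record and fsync it; no rewrite of the existing contents.
    static bool appendRecord(const char * path, const char * record)
    {
        int fd = ::open(path, O_WRONLY | O_APPEND | O_CREAT, 0644);
        if (fd < 0)
            return false;
        size_t len = std::strlen(record);
        // A single small O_APPEND write is effectively atomic on POSIX filesystems.
        ssize_t written = ::write(fd, record, len);
        bool ok = written == static_cast<ssize_t>(len) && ::fsync(fd) == 0;
        ::close(fd);
        return ok;
    }

    int main()
    {
        if (appendRecord("version.txt", "creation_csn: 42\n"))
            std::puts("appended");
        return 0;
    }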
@@ -1303,10 +1303,10 @@ void IMergeTreeDataPart::appendCSNToVersionMetadata(VersionMetadata::WhichCSN wh void IMergeTreeDataPart::appendRemovalTIDToVersionMetadata(bool clear) const { - assert(!version.creation_tid.isEmpty()); - assert(version.removal_csn == 0); - assert(!version.removal_tid.isEmpty()); - assert(isStoredOnDisk()); + chassert(!version.creation_tid.isEmpty()); + chassert(version.removal_csn == 0); + chassert(!version.removal_tid.isEmpty()); + chassert(isStoredOnDisk()); if (version.creation_tid.isPrehistoric() && !clear) { @@ -1437,7 +1437,9 @@ bool IMergeTreeDataPart::assertHasValidVersionMetadata() const bool valid_removal_tid = version.removal_tid == file.removal_tid || version.removal_tid == Tx::PrehistoricTID; bool valid_creation_csn = version.creation_csn == file.creation_csn || version.creation_csn == Tx::RolledBackCSN; bool valid_removal_csn = version.removal_csn == file.removal_csn || version.removal_csn == Tx::PrehistoricCSN; - if (!valid_creation_tid || !valid_removal_tid || !valid_creation_csn || !valid_removal_csn) + bool valid_removal_tid_lock = (version.removal_tid.isEmpty() && version.removal_tid_lock == 0) + || (version.removal_tid_lock == version.removal_tid.getHash()); + if (!valid_creation_tid || !valid_removal_tid || !valid_creation_csn || !valid_removal_csn || !valid_removal_tid_lock) throw Exception(ErrorCodes::CORRUPTED_DATA, "Invalid version metadata file"); return true; } @@ -1445,7 +1447,8 @@ bool IMergeTreeDataPart::assertHasValidVersionMetadata() const { WriteBufferFromOwnString expected; version.write(expected); - tryLogCurrentException(storage.log, fmt::format("File {} contains:\n{}\nexpected:\n{}", version_file_name, content, expected.str())); + tryLogCurrentException(storage.log, fmt::format("File {} contains:\n{}\nexpected:\n{}\nlock: {}", + version_file_name, content, expected.str(), version.removal_tid_lock)); return false; } } diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 50811daa4ab..d2c757f6750 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1364,7 +1364,7 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) /// Check if CSNs were written after committing transaction, update and write if needed.
bool version_updated = false; - assert(!version.creation_tid.isEmpty()); + chassert(!version.creation_tid.isEmpty()); if (!part->version.creation_csn) { auto min = TransactionLog::getCSN(version.creation_tid); diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 5b191b37f5e..9dadab56eb5 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -316,9 +316,9 @@ ColumnsDescription StorageFile::getTableStructureFromFile( return readSchemaFromFormat(format, format_settings, read_buffer_iterator, paths.size() > 1, context); } -bool StorageFile::isColumnOriented() const +bool StorageFile::supportsSubsetOfColumns() const { - return format_name != "Distributed" && FormatFactory::instance().checkIfFormatIsColumnOriented(format_name); + return format_name != "Distributed" && FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name); } StorageFile::StorageFile(int table_fd_, CommonArguments args) @@ -465,7 +465,7 @@ public: const ColumnsDescription & columns_description, const FilesInfoPtr & files_info) { - if (storage->isColumnOriented()) + if (storage->supportsSubsetOfColumns()) return storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); else return getHeader(storage_snapshot->metadata, files_info->need_path_column, files_info->need_file_column); @@ -530,7 +530,7 @@ public: auto get_block_for_format = [&]() -> Block { - if (storage->isColumnOriented()) + if (storage->supportsSubsetOfColumns()) return storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); return storage_snapshot->metadata->getSampleBlock(); }; @@ -690,9 +690,8 @@ Pipe StorageFile::read( { const auto get_columns_for_format = [&]() -> ColumnsDescription { - if (isColumnOriented()) - return ColumnsDescription{ - storage_snapshot->getSampleBlockForColumns(column_names).getNamesAndTypesList()}; + if (supportsSubsetOfColumns()) + return storage_snapshot->getDescriptionForColumns(column_names); else return storage_snapshot->metadata->getColumns(); }; diff --git a/src/Storages/StorageFile.h b/src/Storages/StorageFile.h index 35ab185b14d..f47f6172c1c 100644 --- a/src/Storages/StorageFile.h +++ b/src/Storages/StorageFile.h @@ -69,11 +69,11 @@ public: static Strings getPathsList(const String & table_path, const String & user_files_path, ContextPtr context, size_t & total_bytes_to_read); - /// Check if the format is column-oriented. - /// Is is useful because column oriented formats could effectively skip unknown columns + /// Check if the format supports reading only some subset of columns. + /// It is useful because such formats could effectively skip unknown columns /// So we can create a header of only required columns in read method and ask /// format to read only them. Note: this hack cannot be done with ordinary formats like TSV.
- bool isColumnOriented() const override; + bool supportsSubsetOfColumns() const override; bool supportsPartitionBy() const override { return true; } diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index 171ad0bd877..99cc8a284b8 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -127,9 +127,6 @@ StorageMaterializedView::StorageMaterializedView( target_table_id = DatabaseCatalog::instance().getTable({manual_create_query->getDatabase(), manual_create_query->getTable()}, getContext())->getStorageID(); } - - if (!select.select_table_id.empty()) - DatabaseCatalog::instance().addDependency(select.select_table_id, getStorageID()); } QueryProcessingStage::Enum StorageMaterializedView::getQueryProcessingStage( @@ -400,6 +397,14 @@ void StorageMaterializedView::renameInMemory(const StorageID & new_table_id) DatabaseCatalog::instance().updateDependency(select_query.select_table_id, old_table_id, select_query.select_table_id, getStorageID()); } +void StorageMaterializedView::startup() +{ + auto metadata_snapshot = getInMemoryMetadataPtr(); + const auto & select_query = metadata_snapshot->getSelectQuery(); + if (!select_query.select_table_id.empty()) + DatabaseCatalog::instance().addDependency(select_query.select_table_id, getStorageID()); +} + void StorageMaterializedView::shutdown() { auto metadata_snapshot = getInMemoryMetadataPtr(); diff --git a/src/Storages/StorageMaterializedView.h b/src/Storages/StorageMaterializedView.h index 16817c930b2..001bf39f10f 100644 --- a/src/Storages/StorageMaterializedView.h +++ b/src/Storages/StorageMaterializedView.h @@ -69,6 +69,7 @@ public: void renameInMemory(const StorageID & new_table_id) override; + void startup() override; void shutdown() override; QueryProcessingStage::Enum diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 6107c1a5117..914aef0e54b 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -676,9 +676,9 @@ std::shared_ptr StorageS3::createFileIterator( } } -bool StorageS3::isColumnOriented() const +bool StorageS3::supportsSubsetOfColumns() const { - return FormatFactory::instance().checkIfFormatIsColumnOriented(format_name); + return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name); } Pipe StorageS3::read( @@ -707,7 +707,7 @@ Pipe StorageS3::read( ColumnsDescription columns_description; Block block_for_format; - if (isColumnOriented()) + if (supportsSubsetOfColumns()) { auto fetch_columns = column_names; const auto & virtuals = getVirtuals(); @@ -719,8 +719,7 @@ Pipe StorageS3::read( if (fetch_columns.empty()) fetch_columns.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical())); - columns_description = ColumnsDescription{ - storage_snapshot->getSampleBlockForColumns(fetch_columns).getNamesAndTypesList()}; + columns_description = storage_snapshot->getDescriptionForColumns(fetch_columns); block_for_format = storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); } else diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index cac5b3c270f..102f74b83cd 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -234,7 +234,7 @@ private: ContextPtr ctx, std::vector * read_keys_in_distributed_processing = nullptr); - bool isColumnOriented() const override; + bool supportsSubsetOfColumns() const override; }; } diff --git a/src/Storages/StorageSnapshot.cpp 
b/src/Storages/StorageSnapshot.cpp index a4b64c798f3..d935d73d03d 100644 --- a/src/Storages/StorageSnapshot.cpp +++ b/src/Storages/StorageSnapshot.cpp @@ -92,13 +92,11 @@ NameAndTypePair StorageSnapshot::getColumn(const GetColumnsOptions & options, co Block StorageSnapshot::getSampleBlockForColumns(const Names & column_names) const { Block res; - const auto & columns = getMetadataForQuery()->getColumns(); for (const auto & name : column_names) { auto column = columns.tryGetColumnOrSubcolumn(GetColumnsOptions::All, name); auto object_column = object_columns.tryGetColumnOrSubcolumn(GetColumnsOptions::All, name); - if (column && !object_column) { res.insert({column->type->createColumn(), column->type, column->name}); @@ -120,6 +118,38 @@ Block StorageSnapshot::getSampleBlockForColumns(const Names & column_names) cons "Column {} not found in table {}", backQuote(name), storage.getStorageID().getNameForLogs()); } } + return res; +} + +ColumnsDescription StorageSnapshot::getDescriptionForColumns(const Names & column_names) const +{ + ColumnsDescription res; + const auto & columns = getMetadataForQuery()->getColumns(); + for (const auto & name : column_names) + { + auto column = columns.tryGetColumnOrSubcolumnDescription(GetColumnsOptions::All, name); + auto object_column = object_columns.tryGetColumnOrSubcolumnDescription(GetColumnsOptions::All, name); + if (column && !object_column) + { + res.add(*column, "", false, false); + } + else if (object_column) + { + res.add(*object_column, "", false, false); + } + else if (auto it = virtual_columns.find(name); it != virtual_columns.end()) + { + /// Virtual columns must be appended after ordinary, because user can + /// override them. + const auto & type = it->second; + res.add({name, type}); + } + else + { + throw Exception(ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK, + "Column {} not found in table {}", backQuote(name), storage.getStorageID().getNameForLogs()); + } + } return res; } diff --git a/src/Storages/StorageSnapshot.h b/src/Storages/StorageSnapshot.h index 909f4fd5cab..5b76a4b37e5 100644 --- a/src/Storages/StorageSnapshot.h +++ b/src/Storages/StorageSnapshot.h @@ -68,6 +68,8 @@ struct StorageSnapshot /// Block with ordinary + materialized + aliases + virtuals + subcolumns. Block getSampleBlockForColumns(const Names & column_names) const; + ColumnsDescription getDescriptionForColumns(const Names & column_names) const; + /// Verify that all the requested names are in the table and are set correctly: /// list of names is not empty and the names do not repeat. 
void check(const Names & column_names) const; diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 03bd1d5e7d9..14931a6b230 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -582,9 +582,9 @@ ColumnsDescription IStorageURLBase::getTableStructureFromData( return readSchemaFromFormat(format, format_settings, read_buffer_iterator, urls_to_check.size() > 1, context); } -bool IStorageURLBase::isColumnOriented() const +bool IStorageURLBase::supportsSubsetOfColumns() const { - return FormatFactory::instance().checkIfFormatIsColumnOriented(format_name); + return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name); } Pipe IStorageURLBase::read( @@ -600,10 +600,9 @@ Pipe IStorageURLBase::read( ColumnsDescription columns_description; Block block_for_format; - if (isColumnOriented()) + if (supportsSubsetOfColumns()) { - columns_description = ColumnsDescription{ - storage_snapshot->getSampleBlockForColumns(column_names).getNamesAndTypesList()}; + columns_description = storage_snapshot->getDescriptionForColumns(column_names); block_for_format = storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); } else @@ -688,10 +687,9 @@ Pipe StorageURLWithFailover::read( { ColumnsDescription columns_description; Block block_for_format; - if (isColumnOriented()) + if (supportsSubsetOfColumns()) { - columns_description = ColumnsDescription{ - storage_snapshot->getSampleBlockForColumns(column_names).getNamesAndTypesList()}; + columns_description = storage_snapshot->getDescriptionForColumns(column_names); block_for_format = storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); } else diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index 25b88a827b6..85c77b00550 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -93,7 +93,7 @@ protected: QueryProcessingStage::Enum & processed_stage, size_t max_block_size) const; - bool isColumnOriented() const override; + bool supportsSubsetOfColumns() const override; private: virtual Block getHeaderBlock(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const = 0; diff --git a/src/Storages/StorageXDBC.cpp b/src/Storages/StorageXDBC.cpp index 2c2f1ec3034..f44daf2557e 100644 --- a/src/Storages/StorageXDBC.cpp +++ b/src/Storages/StorageXDBC.cpp @@ -140,7 +140,7 @@ SinkToStoragePtr StorageXDBC::write(const ASTPtr & /* query */, const StorageMet chooseCompressionMethod(uri, compression_method)); } -bool StorageXDBC::isColumnOriented() const +bool StorageXDBC::supportsSubsetOfColumns() const { return true; } diff --git a/src/Storages/StorageXDBC.h b/src/Storages/StorageXDBC.h index 910ba162f86..442db5277e0 100644 --- a/src/Storages/StorageXDBC.h +++ b/src/Storages/StorageXDBC.h @@ -67,7 +67,7 @@ private: Block getHeaderBlock(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const override; - bool isColumnOriented() const override; + bool supportsSubsetOfColumns() const override; }; } diff --git a/src/Storages/System/StorageSystemParts.cpp b/src/Storages/System/StorageSystemParts.cpp index 6674de06c07..a8edb8dd78b 100644 --- a/src/Storages/System/StorageSystemParts.cpp +++ b/src/Storages/System/StorageSystemParts.cpp @@ -85,6 +85,7 @@ StorageSystemParts::StorageSystemParts(const StorageID & table_id_) {"visible", std::make_shared<DataTypeUInt8>()}, {"creation_tid", getTransactionIDDataType()}, + {"removal_tid_lock", std::make_shared<DataTypeUInt64>()}, {"removal_tid", getTransactionIDDataType()},
{"creation_csn", std::make_shared<DataTypeUInt64>()}, {"removal_csn", std::make_shared<DataTypeUInt64>()}, @@ -295,6 +296,8 @@ void StorageSystemParts::processNextStorage( if (columns_mask[src_index++]) columns[res_index++]->insert(get_tid_as_field(part->version.creation_tid)); + if (columns_mask[src_index++]) + columns[res_index++]->insert(part->version.removal_tid_lock.load(std::memory_order_relaxed)); if (columns_mask[src_index++]) columns[res_index++]->insert(get_tid_as_field(part->version.getRemovalTID())); if (columns_mask[src_index++]) diff --git a/src/Storages/System/StorageSystemTransactions.cpp b/src/Storages/System/StorageSystemTransactions.cpp index 396fc875f74..21fa72ea12a 100644 --- a/src/Storages/System/StorageSystemTransactions.cpp +++ b/src/Storages/System/StorageSystemTransactions.cpp @@ -15,6 +15,7 @@ static DataTypePtr getStateEnumType() DataTypeEnum8::Values { {"RUNNING", static_cast<Int8>(MergeTreeTransaction::State::RUNNING)}, + {"COMMITTING", static_cast<Int8>(MergeTreeTransaction::State::COMMITTING)}, {"COMMITTED", static_cast<Int8>(MergeTreeTransaction::State::COMMITTED)}, {"ROLLED_BACK", static_cast<Int8>(MergeTreeTransaction::State::ROLLED_BACK)}, }); diff --git a/src/Storages/WindowView/StorageWindowView.cpp b/src/Storages/WindowView/StorageWindowView.cpp index 119f23b65f2..a44b8954e3c 100644 --- a/src/Storages/WindowView/StorageWindowView.cpp +++ b/src/Storages/WindowView/StorageWindowView.cpp @@ -272,40 +272,13 @@ namespace } }; - IntervalKind strToIntervalKind(const String& interval_str) - { - if (interval_str == "Nanosecond") - return IntervalKind::Nanosecond; - else if (interval_str == "Microsecond") - return IntervalKind::Microsecond; - else if (interval_str == "Millisecond") - return IntervalKind::Millisecond; - else if (interval_str == "Second") - return IntervalKind::Second; - else if (interval_str == "Minute") - return IntervalKind::Minute; - else if (interval_str == "Hour") - return IntervalKind::Hour; - else if (interval_str == "Day") - return IntervalKind::Day; - else if (interval_str == "Week") - return IntervalKind::Week; - else if (interval_str == "Month") - return IntervalKind::Month; - else if (interval_str == "Quarter") - return IntervalKind::Quarter; - else if (interval_str == "Year") - return IntervalKind::Year; - __builtin_unreachable(); - } - void extractWindowArgument(const ASTPtr & ast, IntervalKind::Kind & kind, Int64 & num_units, String err_msg) { const auto * arg = ast->as<ASTFunction>(); - if (!arg || !startsWith(arg->name, "toInterval")) + if (!arg || !startsWith(arg->name, "toInterval") + || !IntervalKind::tryParseString(Poco::toLower(arg->name.substr(10)), kind)) throw Exception(err_msg, ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - kind = strToIntervalKind(arg->name.substr(10)); const auto * interval_unit = arg->children.front()->children.front()->as<ASTLiteral>(); if (!interval_unit || (interval_unit->value.getType() != Field::Types::String @@ -1061,7 +1034,7 @@ void StorageWindowView::threadFuncCleanup() } if (!shutdown_called) - clean_cache_task->scheduleAfter(1000); + clean_cache_task->scheduleAfter(clean_interval_ms); } void StorageWindowView::threadFuncFireProc() @@ -1102,7 +1075,7 @@ void StorageWindowView::threadFuncFireEvent() std::unique_lock lock(fire_signal_mutex); while (!shutdown_called) { - bool signaled = std::cv_status::no_timeout == fire_signal_condition.wait_for(lock, std::chrono::seconds(5)); + bool signaled = std::cv_status::no_timeout == fire_signal_condition.wait_for(lock, std::chrono::seconds(fire_signal_timeout_s)); if (!signaled) continue; @@ -1229,6 +1202,7 @@
StorageWindowView::StorageWindowView( : IStorage(table_id_) , WithContext(context_->getGlobalContext()) , log(&Poco::Logger::get(fmt::format("StorageWindowView({}.{})", table_id_.database_name, table_id_.table_name))) + , fire_signal_timeout_s(context_->getSettingsRef().wait_for_window_view_fire_signal_timeout.totalSeconds()) , clean_interval_ms(context_->getSettingsRef().window_view_clean_interval.totalMilliseconds()) { if (!query.select) @@ -1497,14 +1471,10 @@ void StorageWindowView::writeIntoWindowView( if (lateness_bound > 0) /// Add filter, which leaves rows with timestamp >= lateness_bound { - ASTPtr args = std::make_shared<ASTExpressionList>(); - args->children.push_back(std::make_shared<ASTIdentifier>(window_view.timestamp_column_name)); - args->children.push_back(std::make_shared<ASTLiteral>(lateness_bound)); - - auto filter_function = std::make_shared<ASTFunction>(); - filter_function->name = "greaterOrEquals"; - filter_function->arguments = args; - filter_function->children.push_back(filter_function->arguments); + auto filter_function = makeASTFunction( + "greaterOrEquals", + std::make_shared<ASTIdentifier>(window_view.timestamp_column_name), + std::make_shared<ASTLiteral>(lateness_bound)); ASTPtr query = filter_function; NamesAndTypesList columns; diff --git a/src/Storages/WindowView/StorageWindowView.h b/src/Storages/WindowView/StorageWindowView.h index 709ffd9c58e..d9343aa03ac 100644 --- a/src/Storages/WindowView/StorageWindowView.h +++ b/src/Storages/WindowView/StorageWindowView.h @@ -18,8 +18,9 @@ using ASTPtr = std::shared_ptr<IAST>; * StorageWindowView. * * CREATE WINDOW VIEW [IF NOT EXISTS] [db.]name [TO [db.]name] - * [ENGINE [db.]name] + * [INNER ENGINE engine] [ENGINE engine] * [WATERMARK strategy] [ALLOWED_LATENESS interval_function] + * [POPULATE] * AS SELECT ... * GROUP BY [tumble/hop(...)] * @@ -203,6 +204,7 @@ private: bool inner_target_table{false}; mutable Block input_header; mutable Block output_header; + UInt64 fire_signal_timeout_s; UInt64 clean_interval_ms; const DateLUTImpl * time_zone = nullptr; UInt32 max_timestamp = 0; diff --git a/tests/ci/docker_images_check.py b/tests/ci/docker_images_check.py index 57227ef307e..c1d1c1df1f1 100644 --- a/tests/ci/docker_images_check.py +++ b/tests/ci/docker_images_check.py @@ -404,7 +404,11 @@ def main(): elif args.image_path: pr_info.changed_files = set(i for i in args.image_path) else: - pr_info.fetch_changed_files() + try: + pr_info.fetch_changed_files() + except TypeError: + # If the event does not contain diff, nothing will be built + pass changed_images = get_changed_docker_images(pr_info, images_dict) if changed_images: diff --git a/tests/ci/docs_release.py b/tests/ci/docs_release.py index b6d47326f9b..806db28c1b1 100644 --- a/tests/ci/docs_release.py +++ b/tests/ci/docs_release.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +import argparse import logging import subprocess import os @@ -15,17 +16,31 @@ from upload_result_helper import upload_results from docker_pull_helper import get_image_with_version from commit_status_helper import get_commit from rerun_helper import RerunHelper +from tee_popen import TeePopen NAME = "Docs Release (actions)" + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="ClickHouse building script using prebuilt Docker image", + ) + parser.add_argument( + "--as-root", action="store_true", help="if the container should run as root" + ) + return parser.parse_args() + + if __name__ == "__main__": logging.basicConfig(level=logging.INFO) + args = parse_args() temp_path = TEMP_PATH
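Stepping back to the CREATE WINDOW VIEW grammar extended in StorageWindowView.h above: here is a sketch of a statement exercising the two newly documented clauses, INNER ENGINE (the engine of the inner table that keeps intermediate aggregation state) and POPULATE (backfill from data already in the source table). All table and column names are illustrative, and the exact storage clauses accepted for the inner table may vary:

```
CREATE WINDOW VIEW IF NOT EXISTS wv
    INNER ENGINE AggregatingMergeTree()  -- inner table keeps partial aggregation state
    ENGINE Memory                        -- target table the fired windows are written to
    WATERMARK=ASCENDING
    POPULATE
AS SELECT count() AS cnt, tumbleStart(w_id) AS w_start
FROM events
GROUP BY tumble(ts, INTERVAL '10' SECOND) AS w_id;
```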
repo_path = REPO_COPY gh = Github(get_best_robot_token()) - pr_info = PRInfo(need_changed_files=True) + pr_info = PRInfo() rerun_helper = RerunHelper(gh, pr_info, NAME) if rerun_helper.is_already_finished_by_status(): logging.info("Check is already finished according to github status, exiting") @@ -40,18 +55,23 @@ if __name__ == "__main__": if not os.path.exists(test_output): os.makedirs(test_output) - token = CLOUDFLARE_TOKEN - cmd = ( - "docker run --cap-add=SYS_PTRACE --volume=$SSH_AUTH_SOCK:/ssh-agent -e SSH_AUTH_SOCK=/ssh-agent " - f"-e CLOUDFLARE_TOKEN={token} --volume={repo_path}:/repo_path --volume={test_output}:/output_path {docker_image}" - ) + if args.as_root: + user = "0:0" + else: + user = f"{os.geteuid()}:{os.getegid()}" run_log_path = os.path.join(test_output, "runlog.log") - with open(run_log_path, "w", encoding="utf-8") as log, SSHKey( - "ROBOT_CLICKHOUSE_SSH_KEY" - ): - with subprocess.Popen(cmd, shell=True, stderr=log, stdout=log) as process: + with SSHKey("ROBOT_CLICKHOUSE_SSH_KEY"): + cmd = ( + f"docker run --cap-add=SYS_PTRACE --user={user} " + f"--volume='{os.getenv('SSH_AUTH_SOCK', '')}:/ssh-agent' " + f"--volume={repo_path}:/repo_path --volume={test_output}:/output_path " + f"-e SSH_AUTH_SOCK=/ssh-agent -e EXTRA_BUILD_ARGS='--verbose' " + f"-e CLOUDFLARE_TOKEN={CLOUDFLARE_TOKEN} {docker_image}" + ) + logging.info("Running command: %s", cmd) + with TeePopen(cmd, run_log_path) as process: retcode = process.wait() if retcode == 0: logging.info("Run successfully") @@ -96,3 +116,6 @@ if __name__ == "__main__": commit.create_status( context=NAME, description=description, state=status, target_url=report_url ) + + if status == "failure": + sys.exit(1) diff --git a/tests/ci/pr_info.py b/tests/ci/pr_info.py index 9d287d2a07e..d17e0f1f379 100644 --- a/tests/ci/pr_info.py +++ b/tests/ci/pr_info.py @@ -186,6 +186,7 @@ class PRInfo: else: self.diff_url = pull_request["diff_url"] else: + print("event.json does not match pull_request or push:") print(json.dumps(github_event, sort_keys=True, indent=4)) self.sha = os.getenv("GITHUB_SHA") self.number = 0 @@ -204,8 +205,8 @@ class PRInfo: self.fetch_changed_files() def fetch_changed_files(self): - if not self.diff_url: - raise Exception("Diff URL cannot be find for event") + if not getattr(self, "diff_url", False): + raise TypeError("The event does not have diff URL") response = get_with_retries( self.diff_url, diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index bd70134760a..87139c5bb8a 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -253,16 +253,7 @@ if __name__ == "__main__": ) sys.exit(1) else: - if "pr-documentation" in pr_info.labels or "pr-doc-fix" in pr_info.labels: - commit.create_status( - context=NAME, - description="Skipping checks for documentation", - state="success", - target_url=url, - ) - print("::notice ::Can run, but it's documentation PR, skipping") - else: - print("::notice ::Can run") - commit.create_status( - context=NAME, description=description, state="pending", target_url=url - ) + print("::notice ::Can run") + commit.create_status( + context=NAME, description=description, state="pending", target_url=url + ) diff --git a/tests/config/config.d/transactions.xml b/tests/config/config.d/transactions.xml index 19810986ea1..9948b1f1865 100644 --- a/tests/config/config.d/transactions.xml +++ b/tests/config/config.d/transactions.xml @@ -10,4 +10,12 @@ 7500 + + /test/clickhouse/txn + + 0.0 + + + 0.01 + diff --git a/tests/integration/test_keeper_force_recovery_single_node/__init__.py 
b/tests/integration/test_keeper_force_recovery_single_node/__init__.py new file mode 100644 index 00000000000..e5a0d9b4834 --- /dev/null +++ b/tests/integration/test_keeper_force_recovery_single_node/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 diff --git a/tests/integration/test_keeper_force_recovery_single_node/configs/enable_keeper1.xml b/tests/integration/test_keeper_force_recovery_single_node/configs/enable_keeper1.xml new file mode 100644 index 00000000000..441c1bc185d --- /dev/null +++ b/tests/integration/test_keeper_force_recovery_single_node/configs/enable_keeper1.xml @@ -0,0 +1,33 @@ + + + 9181 + 1 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + + + 3 + node3 + 9234 + + + + diff --git a/tests/integration/test_keeper_force_recovery_single_node/configs/enable_keeper1_solo.xml b/tests/integration/test_keeper_force_recovery_single_node/configs/enable_keeper1_solo.xml new file mode 100644 index 00000000000..f0cb887b062 --- /dev/null +++ b/tests/integration/test_keeper_force_recovery_single_node/configs/enable_keeper1_solo.xml @@ -0,0 +1,24 @@ + + + 1 + 9181 + 1 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 9234 + + + + diff --git a/tests/integration/test_keeper_force_recovery_single_node/configs/enable_keeper2.xml b/tests/integration/test_keeper_force_recovery_single_node/configs/enable_keeper2.xml new file mode 100644 index 00000000000..e2e2c1fd7db --- /dev/null +++ b/tests/integration/test_keeper_force_recovery_single_node/configs/enable_keeper2.xml @@ -0,0 +1,33 @@ + + + 9181 + 2 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + + + 3 + node3 + 9234 + + + + diff --git a/tests/integration/test_keeper_force_recovery_single_node/configs/enable_keeper3.xml b/tests/integration/test_keeper_force_recovery_single_node/configs/enable_keeper3.xml new file mode 100644 index 00000000000..e2ac0400d88 --- /dev/null +++ b/tests/integration/test_keeper_force_recovery_single_node/configs/enable_keeper3.xml @@ -0,0 +1,33 @@ + + + 9181 + 3 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + + + 3 + node3 + 9234 + + + + diff --git a/tests/integration/test_keeper_force_recovery_single_node/configs/use_keeper.xml b/tests/integration/test_keeper_force_recovery_single_node/configs/use_keeper.xml new file mode 100644 index 00000000000..384e984f210 --- /dev/null +++ b/tests/integration/test_keeper_force_recovery_single_node/configs/use_keeper.xml @@ -0,0 +1,16 @@ + + + + node1 + 9181 + + + node2 + 9181 + + + node3 + 9181 + + + diff --git a/tests/integration/test_keeper_force_recovery_single_node/test.py b/tests/integration/test_keeper_force_recovery_single_node/test.py new file mode 100644 index 00000000000..1e58a25221e --- /dev/null +++ b/tests/integration/test_keeper_force_recovery_single_node/test.py @@ -0,0 +1,157 @@ +import os +import pytest +import socket +from helpers.cluster import ClickHouseCluster +import time + + +from kazoo.client import KazooClient + +CLUSTER_SIZE = 3 + +cluster = ClickHouseCluster(__file__) +CONFIG_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "configs") + + +def get_nodes(): + nodes = [] + for i in 
range(CLUSTER_SIZE): + nodes.append( + cluster.add_instance( + f"node{i+1}", + main_configs=[ + f"configs/enable_keeper{i+1}.xml", + f"configs/use_keeper.xml", + ], + stay_alive=True, + ) + ) + + return nodes + + +nodes = get_nodes() + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + yield cluster + finally: + cluster.shutdown() + + +def get_fake_zk(nodename, timeout=30.0): + _fake_zk_instance = KazooClient( + hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout + ) + _fake_zk_instance.start() + return _fake_zk_instance + + +def get_keeper_socket(node_name): + hosts = cluster.get_instance_ip(node_name) + client = socket.socket() + client.settimeout(10) + client.connect((hosts, 9181)) + return client + + +def send_4lw_cmd(node_name, cmd="ruok"): + client = None + try: + client = get_keeper_socket(node_name) + client.send(cmd.encode()) + data = client.recv(100_000) + data = data.decode() + return data + finally: + if client is not None: + client.close() + + +def wait_until_connected(node_name): + while send_4lw_cmd(node_name, "mntr") == NOT_SERVING_REQUESTS_ERROR_MSG: + time.sleep(0.1) + + +def wait_nodes(nodes): + for node in nodes: + wait_until_connected(node.name) + + +def wait_and_assert_data(zk, path, data): + while zk.exists(path) is None: + time.sleep(0.1) + assert zk.get(path)[0] == data.encode() + + +def close_zk(zk): + zk.stop() + zk.close() + + +NOT_SERVING_REQUESTS_ERROR_MSG = "This instance is not currently serving requests" + + +def test_cluster_recovery(started_cluster): + node_zks = [] + try: + wait_nodes(nodes) + + node_zks = [get_fake_zk(node.name) for node in nodes] + + data_in_cluster = [] + + def add_data(zk, path, data): + zk.create(path, data.encode()) + data_in_cluster.append((path, data)) + + def assert_all_data(zk): + for path, data in data_in_cluster: + wait_and_assert_data(zk, path, data) + + for i, zk in enumerate(node_zks): + add_data(zk, f"/test_force_recovery_node{i+1}", f"somedata{i+1}") + + for zk in node_zks: + assert_all_data(zk) + + nodes[0].stop_clickhouse() + + add_data(node_zks[1], "/test_force_recovery_extra", "somedataextra") + + for node_zk in node_zks[2:CLUSTER_SIZE]: + wait_and_assert_data(node_zk, "/test_force_recovery_extra", "somedataextra") + + nodes[0].start_clickhouse() + wait_until_connected(nodes[0].name) + + node_zks[0] = get_fake_zk(nodes[0].name) + wait_and_assert_data(node_zks[0], "/test_force_recovery_extra", "somedataextra") + + # stop all nodes + for node_zk in node_zks: + close_zk(node_zk) + node_zks = [] + + for node in nodes: + node.stop_clickhouse() + + nodes[0].copy_file_to_container( + os.path.join(CONFIG_DIR, "enable_keeper1_solo.xml"), + "/etc/clickhouse-server/config.d/enable_keeper1.xml", + ) + + nodes[0].start_clickhouse() + wait_until_connected(nodes[0].name) + + assert_all_data(get_fake_zk(nodes[0].name)) + finally: + try: + for zk_conn in node_zks: + close_zk(zk_conn) + except: + pass diff --git a/tests/performance/formats_columns_sampling.xml b/tests/performance/formats_columns_sampling.xml new file mode 100644 index 00000000000..25f9dc000a3 --- /dev/null +++ b/tests/performance/formats_columns_sampling.xml @@ -0,0 +1,32 @@ + + + 1 + + + + + format + + TabSeparatedWithNames + CustomSeparatedWithNames + CSVWithNames + JSONEachRow + JSONCompactEachRowWithNames + TSKV + Avro + ORC + Parquet + Arrow + Native + + + + + CREATE TABLE IF NOT EXISTS table_{format} ENGINE = File({format}) AS test.hits + + INSERT INTO table_{format} SELECT * FROM test.hits LIMIT 100000 + + 
SELECT WatchID FROM table_{format} FORMAT Null + + DROP TABLE IF EXISTS table_{format} + diff --git a/tests/performance/norm_distance.xml b/tests/performance/norm_distance.xml new file mode 100644 index 00000000000..b6a7f9724c2 --- /dev/null +++ b/tests/performance/norm_distance.xml @@ -0,0 +1,100 @@ + + + + + element_type + + UInt8 + Int16 + Int32 + Int64 + Float32 + Float64 + + + + + + CREATE TABLE vecs_{element_type} ( + v Array({element_type}) + ) ENGINE=Memory; + + + + + + INSERT INTO vecs_{element_type} + SELECT v FROM ( + SELECT + number AS n, + [ + rand(n*10), + rand(n*10+1), + rand(n*10+2), + rand(n*10+3), + rand(n*10+4), + rand(n*10+5), + rand(n*10+6), + rand(n*10+7), + rand(n*10+8), + rand(n*10+9) + ] AS v + FROM system.numbers + LIMIT 10000000 + ); + + + + + + CREATE TABLE tuples_{element_type} ( + t Tuple( + {element_type}, + {element_type}, + {element_type}, + {element_type}, + {element_type}, + {element_type}, + {element_type}, + {element_type}, + {element_type}, + {element_type} + ) + ) ENGINE=Memory; + + + + INSERT INTO tuples_{element_type} + SELECT (v[1], v[2], v[3], v[4], v[5], v[6], v[7], v[8], v[9], v[10]) FROM vecs_{element_type}; + + + + 1 + + + + + + norm + + L1 + L2 + Linf + + + + + + SELECT sum(dist) FROM (SELECT {norm}Norm(t) AS dist FROM tuples_{element_type}) + WITH (SELECT t FROM tuples_{element_type} limit 1) AS a SELECT sum(dist) FROM (SELECT {norm}Distance(a, t) AS dist FROM tuples_{element_type}) + WITH (SELECT t FROM tuples_{element_type} limit 1) AS a SELECT sum(dist) FROM (SELECT cosineDistance(a, t) AS dist FROM tuples_{element_type}) + + + SELECT sum(dist) FROM (SELECT array{norm}Norm(v) AS dist FROM vecs_{element_type}) + WITH (SELECT v FROM vecs_{element_type} limit 1) AS a SELECT sum(dist) FROM (SELECT array{norm}Distance(a, v) AS dist FROM vecs_{element_type}) + WITH (SELECT v FROM vecs_{element_type} limit 1) AS a SELECT sum(dist) FROM (SELECT arrayCosineDistance(a, v) AS dist FROM vecs_{element_type}) + + DROP TABLE vecs_{element_type} + DROP TABLE tuples_{element_type} + + diff --git a/tests/performance/unary_arithmetic_functions.xml b/tests/performance/unary_arithmetic_functions.xml index 62e11457ac4..93dd5244c9b 100644 --- a/tests/performance/unary_arithmetic_functions.xml +++ b/tests/performance/unary_arithmetic_functions.xml @@ -1,6 +1,4 @@ - - func diff --git a/tests/queries/0_stateless/00285_not_all_data_in_totals.reference b/tests/queries/0_stateless/00285_not_all_data_in_totals.reference index 961d8a34c09..065c39f5909 100644 --- a/tests/queries/0_stateless/00285_not_all_data_in_totals.reference +++ b/tests/queries/0_stateless/00285_not_all_data_in_totals.reference @@ -25,7 +25,7 @@ [0, "2"] ], - "totals": [0,"2000"], + "totals": [0, "2000"], "rows": 10, @@ -58,7 +58,7 @@ [0, "2"] ], - "totals": [0,"2000"], + "totals": [0, "2000"], "rows": 10, diff --git a/tests/queries/0_stateless/00313_const_totals_extremes.reference b/tests/queries/0_stateless/00313_const_totals_extremes.reference index f9084065989..fcb39b8080c 100644 --- a/tests/queries/0_stateless/00313_const_totals_extremes.reference +++ b/tests/queries/0_stateless/00313_const_totals_extremes.reference @@ -65,12 +65,12 @@ [1.23, "1"] ], - "totals": [1.23,"1"], + "totals": [1.23, "1"], "extremes": { - "min": [1.23,"1"], - "max": [1.23,"1"] + "min": [1.23, "1"], + "max": [1.23, "1"] }, "rows": 1 @@ -142,12 +142,12 @@ [1.1, "1"] ], - "totals": [1.1,"1"], + "totals": [1.1, "1"], "extremes": { - "min": [1.1,"1"], - "max": [1.1,"1"] + "min": [1.1, "1"], + "max": [1.1, "1"] }, "rows": 1 
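For context on what the new norm_distance perf test measures: the tuple-based functions are spelled L1Norm/L2Norm/LinfNorm and {norm}Distance, while the array forms carry an array prefix, exactly as in the substituted queries above. A few sketch queries with the expected results in comments (float rendering may differ):

```
SELECT L2Norm((3.0, 4.0));                       -- sqrt(3^2 + 4^2) = 5
SELECT L1Distance((0.0, 0.0), (1.0, 2.0));       -- |1 - 0| + |2 - 0| = 3
SELECT arrayL2Distance([0.0, 0.0], [3.0, 4.0]);  -- 5, the array counterpart
SELECT cosineDistance((1.0, 0.0), (0.0, 1.0));   -- 1 for orthogonal vectors
```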
diff --git a/tests/queries/0_stateless/00378_json_quote_64bit_integers.reference b/tests/queries/0_stateless/00378_json_quote_64bit_integers.reference index 49c937e09df..5174c13a9e0 100644 --- a/tests/queries/0_stateless/00378_json_quote_64bit_integers.reference +++ b/tests/queries/0_stateless/00378_json_quote_64bit_integers.reference @@ -119,12 +119,12 @@ ["0", "0", "9223372036854775807", "-9223372036854775808", "18446744073709551615", ["0"], ["0","0"]] ], - "totals": ["0","0","9223372036854775807","-9223372036854775808","18446744073709551615",["0"],["0","0"]], + "totals": ["0", "0", "9223372036854775807", "-9223372036854775808", "18446744073709551615", ["0"], ["0","0"]], "extremes": { - "min": ["0","0","9223372036854775807","-9223372036854775808","18446744073709551615",["0"],["0","0"]], - "max": ["0","0","9223372036854775807","-9223372036854775808","18446744073709551615",["0"],["0","0"]] + "min": ["0", "0", "9223372036854775807", "-9223372036854775808", "18446744073709551615", ["0"], ["0","0"]], + "max": ["0", "0", "9223372036854775807", "-9223372036854775808", "18446744073709551615", ["0"], ["0","0"]] }, "rows": 1 @@ -251,12 +251,12 @@ [0, 0, 9223372036854775807, -9223372036854775808, 18446744073709551615, [0], [0,0]] ], - "totals": [0,0,9223372036854775807,-9223372036854775808,18446744073709551615,[0],[0,0]], + "totals": [0, 0, 9223372036854775807, -9223372036854775808, 18446744073709551615, [0], [0,0]], "extremes": { - "min": [0,0,9223372036854775807,-9223372036854775808,18446744073709551615,[0],[0,0]], - "max": [0,0,9223372036854775807,-9223372036854775808,18446744073709551615,[0],[0,0]] + "min": [0, 0, 9223372036854775807, -9223372036854775808, 18446744073709551615, [0], [0,0]], + "max": [0, 0, 9223372036854775807, -9223372036854775808, 18446744073709551615, [0], [0,0]] }, "rows": 1 diff --git a/tests/queries/0_stateless/01133_begin_commit_race.reference b/tests/queries/0_stateless/01133_begin_commit_race.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01133_begin_commit_race.sh b/tests/queries/0_stateless/01133_begin_commit_race.sh new file mode 100755 index 00000000000..f64570950c7 --- /dev/null +++ b/tests/queries/0_stateless/01133_begin_commit_race.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +# Tags: long + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +set -e + +$CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS mt"; +$CLICKHOUSE_CLIENT --query "CREATE TABLE mt (n Int64) ENGINE=MergeTree ORDER BY n SETTINGS old_parts_lifetime=0"; + + +function begin_commit_readonly() +{ + $CLICKHOUSE_CLIENT --multiquery --query " + SET wait_changes_become_visible_after_commit_mode='wait'; + BEGIN TRANSACTION; + COMMIT;" 2>&1| grep -Fa "Exception: " | grep -Fv UNKNOWN_STATUS_OF_TRANSACTION +} + +function begin_rollback_readonly() +{ + $CLICKHOUSE_CLIENT --wait_changes_become_visible_after_commit_mode=wait_unknown --multiquery --query " + BEGIN TRANSACTION; + SET TRANSACTION SNAPSHOT 42; + ROLLBACK;" +} + +function begin_insert_commit() +{ + $CLICKHOUSE_CLIENT --wait_changes_become_visible_after_commit_mode=async --multiquery --query " + BEGIN TRANSACTION; + INSERT INTO mt VALUES ($RANDOM); + COMMIT;" 2>&1| grep -Fa "Exception: " | grep -Fv UNKNOWN_STATUS_OF_TRANSACTION +} + +function introspection() +{ + $CLICKHOUSE_CLIENT -q "SELECT * FROM system.transactions FORMAT Null" + $CLICKHOUSE_CLIENT -q "SELECT transactionLatestSnapshot(), transactionOldestSnapshot() FORMAT Null" +} + +export -f begin_commit_readonly +export -f begin_rollback_readonly +export -f begin_insert_commit +export -f introspection + +TIMEOUT=20 + +clickhouse_client_loop_timeout $TIMEOUT begin_commit_readonly & +clickhouse_client_loop_timeout $TIMEOUT begin_rollback_readonly & +clickhouse_client_loop_timeout $TIMEOUT begin_insert_commit & +clickhouse_client_loop_timeout $TIMEOUT introspection & + +wait + +$CLICKHOUSE_CLIENT --query "DROP TABLE mt"; diff --git a/tests/queries/0_stateless/01172_transaction_counters.sql b/tests/queries/0_stateless/01172_transaction_counters.sql index 5431673fd62..b84a7b25c47 100644 --- a/tests/queries/0_stateless/01172_transaction_counters.sql +++ b/tests/queries/0_stateless/01172_transaction_counters.sql @@ -42,7 +42,13 @@ rollback; system flush logs; select indexOf((select arraySort(groupUniqArray(tid)) from system.transactions_info_log where database=currentDatabase() and table='txn_counters'), tid), - (toDecimal64(now64(6), 6) - toDecimal64(event_time, 6)) < 100, type, thread_id!=0, length(query_id)=length(queryID()), tid_hash!=0, csn=0, part + (toDecimal64(now64(6), 6) - toDecimal64(event_time, 6)) < 100, + type, + thread_id!=0, + length(query_id)=length(queryID()) or type='Commit' and query_id='', -- ignore fault injection after commit + tid_hash!=0, + csn=0, + part from system.transactions_info_log where tid in (select tid from system.transactions_info_log where database=currentDatabase() and table='txn_counters' and not (tid.1=1 and tid.2=1)) or (database=currentDatabase() and table='txn_counters') order by event_time; diff --git a/tests/queries/0_stateless/01269_create_with_null.reference b/tests/queries/0_stateless/01269_create_with_null.reference index 73f834da75a..4e52c4a42d6 100644 --- a/tests/queries/0_stateless/01269_create_with_null.reference +++ b/tests/queries/0_stateless/01269_create_with_null.reference @@ -1,7 +1,7 @@ Nullable(Int32) Int32 Nullable(Int32) Int32 CREATE TABLE default.data_null\n(\n `a` Nullable(Int32),\n `b` Int32,\n `c` Nullable(Int32),\n `d` Int32\n)\nENGINE = Memory -Nullable(Int32) Int32 Nullable(Int32) Nullable(Int32) -CREATE TABLE default.set_null\n(\n `a` Nullable(Int32),\n `b` Int32,\n `c` Nullable(Int32),\n `d` Nullable(Int32)\n)\nENGINE = Memory -CREATE TABLE default.set_null\n(\n `a` Nullable(Int32),\n `b` Int32,\n `c` Nullable(Int32),\n `d` Nullable(Int32)\n)\nENGINE = Memory 
+Nullable(Int32) Int32 Nullable(Int32) Nullable(Int32) Nullable(UInt8) +CREATE TABLE default.set_null\n(\n `a` Nullable(Int32),\n `b` Int32,\n `c` Nullable(Int32),\n `d` Nullable(Int32),\n `f` Nullable(UInt8) DEFAULT 1\n)\nENGINE = Memory +CREATE TABLE default.set_null\n(\n `a` Nullable(Int32),\n `b` Int32,\n `c` Nullable(Int32),\n `d` Nullable(Int32),\n `f` Nullable(UInt8) DEFAULT 1\n)\nENGINE = Memory CREATE TABLE default.cannot_be_nullable\n(\n `n` Nullable(Int8),\n `a` Array(UInt8)\n)\nENGINE = Memory CREATE TABLE default.cannot_be_nullable\n(\n `n` Nullable(Int8),\n `a` Array(UInt8)\n)\nENGINE = Memory diff --git a/tests/queries/0_stateless/01269_create_with_null.sql b/tests/queries/0_stateless/01269_create_with_null.sql index 7548070ce4b..ac57f613dfd 100644 --- a/tests/queries/0_stateless/01269_create_with_null.sql +++ b/tests/queries/0_stateless/01269_create_with_null.sql @@ -39,13 +39,14 @@ CREATE TABLE set_null ( a INT NULL, b INT NOT NULL, c Nullable(INT), - d INT + d INT, + f DEFAULT 1 ) engine=Memory(); -INSERT INTO set_null VALUES (NULL, 2, NULL, NULL); +INSERT INTO set_null VALUES (NULL, 2, NULL, NULL, NULL); -SELECT toTypeName(a), toTypeName(b), toTypeName(c), toTypeName(d) FROM set_null; +SELECT toTypeName(a), toTypeName(b), toTypeName(c), toTypeName(d), toTypeName(f) FROM set_null; SHOW CREATE TABLE set_null; DETACH TABLE set_null; diff --git a/tests/queries/0_stateless/01710_projection_aggregation_in_order.sql b/tests/queries/0_stateless/01710_projection_aggregation_in_order.sql index 557bd297436..add38dbd3f8 100644 --- a/tests/queries/0_stateless/01710_projection_aggregation_in_order.sql +++ b/tests/queries/0_stateless/01710_projection_aggregation_in_order.sql @@ -55,5 +55,5 @@ FROM numbers(100000); SET allow_experimental_projection_optimization=1, optimize_aggregation_in_order=1, force_optimize_projection = 1; -WITH toStartOfHour(ts) AS a SELECT sum(value) v FROM normal WHERE ts > '2021-12-06 22:00:00' GROUP BY a ORDER BY v LIMIT 5; -WITH toStartOfHour(ts) AS a SELECT sum(value) v FROM normal WHERE ts > '2021-12-06 22:00:00' GROUP BY toStartOfHour(ts), a ORDER BY v LIMIT 5; +WITH toStartOfHour(ts) AS a SELECT sum(value) v FROM agg WHERE ts > '2021-12-06 22:00:00' GROUP BY a ORDER BY v LIMIT 5; +WITH toStartOfHour(ts) AS a SELECT sum(value) v FROM agg WHERE ts > '2021-12-06 22:00:00' GROUP BY toStartOfHour(ts), a ORDER BY v LIMIT 5; diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference b/tests/queries/0_stateless/02117_show_create_table_system.reference index ad18e38adcc..d4ada9ba5c8 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.reference +++ b/tests/queries/0_stateless/02117_show_create_table_system.reference @@ -485,6 +485,7 @@ CREATE TABLE system.parts `projections` Array(String), `visible` UInt8, `creation_tid` Tuple(UInt64, UInt64, UUID), + `removal_tid_lock` UInt64, `removal_tid` Tuple(UInt64, UInt64, UUID), `creation_csn` UInt64, `removal_csn` UInt64, diff --git a/tests/queries/0_stateless/02187_async_inserts_all_formats.reference b/tests/queries/0_stateless/02187_async_inserts_all_formats.reference index 92ce8dfd2c7..4f7f50bca13 100644 --- a/tests/queries/0_stateless/02187_async_inserts_all_formats.reference +++ b/tests/queries/0_stateless/02187_async_inserts_all_formats.reference @@ -7,6 +7,8 @@ CSVWithNamesAndTypes CustomSeparated CustomSeparatedWithNames CustomSeparatedWithNamesAndTypes +JSONColumns +JSONCompactColumns JSONCompactEachRow JSONCompactEachRowWithNames JSONCompactEachRowWithNamesAndTypes diff --git 
a/tests/queries/0_stateless/02242_negative_datetime64.reference b/tests/queries/0_stateless/02242_negative_datetime64.reference index 7f14679ac56..fbbebb520ae 100644 --- a/tests/queries/0_stateless/02242_negative_datetime64.reference +++ b/tests/queries/0_stateless/02242_negative_datetime64.reference @@ -1,2 +1,3 @@ -127914467.877 187618332.123 +1969-12-31 23:59:59.123 diff --git a/tests/queries/0_stateless/02242_negative_datetime64.sql b/tests/queries/0_stateless/02242_negative_datetime64.sql index 32086188608..40679841943 100644 --- a/tests/queries/0_stateless/02242_negative_datetime64.sql +++ b/tests/queries/0_stateless/02242_negative_datetime64.sql @@ -1,2 +1,3 @@ SELECT cast(toDateTime64('1965-12-12 12:12:12.123', 3, 'UTC') as Decimal64(3)); SELECT cast(toDateTime64('1975-12-12 12:12:12.123', 3, 'UTC') as Decimal64(3)); +SELECT toDateTime64('1969-12-31 23:59:59.123', 3, 'UTC'); diff --git a/tests/queries/0_stateless/02282_array_distance.reference b/tests/queries/0_stateless/02282_array_distance.reference index 158df656403..2fd6c66c817 100644 --- a/tests/queries/0_stateless/02282_array_distance.reference +++ b/tests/queries/0_stateless/02282_array_distance.reference @@ -1,7 +1,7 @@ 6 3.7416575 3 -0.0025851727 +0.002585097 \N nan 12 @@ -13,14 +13,14 @@ nan 2 5 4 -0.16847819 +0.16847816 0.35846698 -0.07417989 +0.0741799 6 8 9 0.020204102886728692 -0.11808289631180302 +0.11808289631180313 0 1 1 218.74642854227358 1 2 1348.2117786164013 diff --git a/tests/queries/0_stateless/02293_formats_json_columns.reference b/tests/queries/0_stateless/02293_formats_json_columns.reference new file mode 100644 index 00000000000..da8d080ac05 --- /dev/null +++ b/tests/queries/0_stateless/02293_formats_json_columns.reference @@ -0,0 +1,103 @@ +JSONColumns +{ + "a": [0, 1, 2, 3, 4], + "b": ["String", "String", "String", "String", "String"], + "c": [[[[],"String"],[[],"gnirtS"]], [[[0],"String"],[[0],"gnirtS"]], [[[0,1],"String"],[[0,1],"gnirtS"]], [[[],"String"],[[0,1,2],"gnirtS"]], [[[0],"String"],[[],"gnirtS"]]] +} +a Nullable(Float64) +b Nullable(String) +c Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +0 String [([],'String'),([],'gnirtS')] +1 String [([0],'String'),([0],'gnirtS')] +2 String [([0,1],'String'),([0,1],'gnirtS')] +3 String [([],'String'),([0,1,2],'gnirtS')] +4 String [([0],'String'),([],'gnirtS')] +JSONCompactColumns +[ + [0, 1, 2, 3, 4], + ["String", "String", "String", "String", "String"], + [[[[],"String"],[[],"gnirtS"]], [[[0],"String"],[[0],"gnirtS"]], [[[0,1],"String"],[[0,1],"gnirtS"]], [[[],"String"],[[0,1,2],"gnirtS"]], [[[0],"String"],[[],"gnirtS"]]] +] +c1 Nullable(Float64) +c2 Nullable(String) +c3 Array(Tuple(Array(Nullable(Float64)), Nullable(String))) +0 String [([],'String'),([],'gnirtS')] +1 String [([0],'String'),([0],'gnirtS')] +2 String [([0,1],'String'),([0,1],'gnirtS')] +3 String [([],'String'),([0,1,2],'gnirtS')] +4 String [([0],'String'),([],'gnirtS')] +JSONColumnsWithMetadata +{ + "meta": + [ + { + "name": "sum", + "type": "UInt64" + }, + { + "name": "avg", + "type": "Float64" + } + ], + + "data": + { + "sum": ["1", "2", "3", "4"], + "avg": [1, 2, 3, 2] + }, + + "totals": + { + "sum": "10", + "avg": 2 + }, + + "extremes": + { + "min": + { + "sum": "1", + "avg": 1 + }, + "max": + { + "sum": "4", + "avg": 3 + } + }, + + "rows": 4, + + "statistics": + { + "rows_read": 5, + "bytes_read": 20 + } +} +b Nullable(Float64) +a Nullable(Float64) +c Nullable(Float64) +d Nullable(String) +1 3 \N \N +2 2 \N \N +3 1 \N \N +\N \N 1 \N +\N \N 2 \N +\N \N 3 \N +\N \N \N 
String +OK +3 +2 +1 +c1 Nullable(Float64) +c2 Nullable(Float64) +c3 Nullable(String) +1 1 \N +2 2 \N +3 3 \N +1 \N \N +2 \N \N +3 \N \N +1 2 String +OK +OK diff --git a/tests/queries/0_stateless/02293_formats_json_columns.sh b/tests/queries/0_stateless/02293_formats_json_columns.sh new file mode 100755 index 00000000000..74d9a4f5aab --- /dev/null +++ b/tests/queries/0_stateless/02293_formats_json_columns.sh @@ -0,0 +1,93 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, no-parallel + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') + +DATA_FILE=$USER_FILES_PATH/data_02293 + +$CLICKHOUSE_CLIENT -q "drop table if exists test_02293" +$CLICKHOUSE_CLIENT -q "create table test_02293 (a UInt32, b String, c Array(Tuple(Array(UInt32), String))) engine=Memory" +$CLICKHOUSE_CLIENT -q "insert into test_02293 select number, 'String', [(range(number % 3), 'String'), (range(number % 4), 'gnirtS')] from numbers(5) settings max_block_size=2" + +echo "JSONColumns" +$CLICKHOUSE_CLIENT -q "select * from test_02293 order by a format JSONColumns" +$CLICKHOUSE_CLIENT -q "select * from test_02293 order by a format JSONColumns" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file(data_02293, JSONColumns)" +$CLICKHOUSE_CLIENT -q "select * from file(data_02293, JSONColumns)" + +echo "JSONCompactColumns" +$CLICKHOUSE_CLIENT -q "select * from test_02293 order by a format JSONCompactColumns" +$CLICKHOUSE_CLIENT -q "select * from test_02293 order by a format JSONCompactColumns" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file(data_02293, JSONCompactColumns)" +$CLICKHOUSE_CLIENT -q "select * from file(data_02293, JSONCompactColumns)" + +echo "JSONColumnsWithMetadata" +$CLICKHOUSE_CLIENT -q "select sum(a) as sum, avg(a) as avg from test_02293 group by a % 4 with totals order by tuple(sum, avg) format JSONColumnsWithMetadata" --extremes=1 | grep -v "elapsed" + + +echo ' +{ + "b": [1, 2, 3], + "a": [3, 2, 1] +} +{ + "c": [1, 2, 3] +} +{ +} +{ + "a": [], + "d": [] +} +{ + "d": ["String"] +} +' > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file(data_02293, JSONColumns)" +$CLICKHOUSE_CLIENT -q "select * from file(data_02293, JSONColumns)" +$CLICKHOUSE_CLIENT -q "select * from file(data_02293, JSONColumns, 'a UInt32, t String') settings input_format_skip_unknown_fields=0" 2>&1 | grep -F -q 'INCORRECT_DATA' && echo 'OK' || echo 'FAIL' +$CLICKHOUSE_CLIENT -q "select * from file(data_02293, JSONColumns, 'a UInt32, t String') settings input_format_skip_unknown_fields=1" + +echo ' +[ + [1, 2, 3], + [1, 2, 3] +] +[ + [1, 2, 3] +] +[ +] +[ + [], + [] +] +[ + [1], + [2], + ["String"] +] +' > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file(data_02293, JSONCompactColumns)" +$CLICKHOUSE_CLIENT -q "select * from file(data_02293, JSONCompactColumns)" +$CLICKHOUSE_CLIENT -q "select * from file(data_02293, JSONCompactColumns, 'a UInt32, t UInt32')" 2>&1 | grep -F -q 'INCORRECT_DATA' && echo 'OK' || echo 'FAIL' + +echo ' +{ + "a": [null, null, null], + "b": [3, 2, 1] +} +{ + "a": [1, 2, 3] +} +' > $DATA_FILE + +$CLICKHOUSE_CLIENT -q "desc file(data_02293, JSONColumns) settings input_format_max_rows_to_read_for_schema_inference=3" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL' + + diff --git 
a/tests/queries/0_stateless/02293_http_header_full_summary_without_progress.reference b/tests/queries/0_stateless/02293_http_header_full_summary_without_progress.reference new file mode 100644 index 00000000000..538ac795107 --- /dev/null +++ b/tests/queries/0_stateless/02293_http_header_full_summary_without_progress.reference @@ -0,0 +1,2 @@ +Read rows in summary is not zero +< HTTP/1.1 408 Request Time-out diff --git a/tests/queries/0_stateless/02293_http_header_full_summary_without_progress.sh b/tests/queries/0_stateless/02293_http_header_full_summary_without_progress.sh new file mode 100755 index 00000000000..8f08bd6f84b --- /dev/null +++ b/tests/queries/0_stateless/02293_http_header_full_summary_without_progress.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + + +CURL_OUTPUT=$(echo 'SELECT 1 + sleepEachRow(0.00002) FROM numbers(100000)' | \ + ${CLICKHOUSE_CURL_COMMAND} -v "${CLICKHOUSE_URL}&wait_end_of_query=1&send_progress_in_http_headers=0&max_execution_time=1" --data-binary @- 2>&1) + +READ_ROWS=$(echo "${CURL_OUTPUT}" | \ + grep 'X-ClickHouse-Summary' | \ + awk '{print $3}' | \ + sed -E 's/.*"read_rows":"?([^,"]*)"?.*/\1/' + ) + +if [ "$READ_ROWS" -ne 0 ]; +then + echo "Read rows in summary is not zero" +else + echo "Read rows in summary is zero!" +fi + +# Check that the response code is correct too +echo "${CURL_OUTPUT}" | grep "< HTTP/1.1" diff --git a/tests/queries/0_stateless/02293_http_header_summary_contains_exception_code_with_progress.reference b/tests/queries/0_stateless/02293_http_header_summary_contains_exception_code_with_progress.reference new file mode 100644 index 00000000000..487bd5d5bc3 --- /dev/null +++ b/tests/queries/0_stateless/02293_http_header_summary_contains_exception_code_with_progress.reference @@ -0,0 +1 @@ +Expected exception: 159 diff --git a/tests/queries/0_stateless/02293_http_header_summary_contains_exception_code_with_progress.sh b/tests/queries/0_stateless/02293_http_header_summary_contains_exception_code_with_progress.sh new file mode 100755 index 00000000000..fba136e7c38 --- /dev/null +++ b/tests/queries/0_stateless/02293_http_header_summary_contains_exception_code_with_progress.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + +CURL_OUTPUT=$(echo 'SELECT 1 + sleepEachRow(0.00002) FROM numbers(100000)' | \ + ${CLICKHOUSE_CURL_COMMAND} -v "${CLICKHOUSE_URL}&wait_end_of_query=1&send_progress_in_http_headers=1&http_headers_progress_interval_ms=0&max_execution_time=1" --data-binary @- 2>&1) + +EXCEPTION=$(echo "${CURL_OUTPUT}" | grep 'X-ClickHouse-Exception-Code') + +if [[ "$EXCEPTION" =~ .*"159".* ]]; +then + echo "Expected exception: 159" +else + echo "Unexpected exception" + echo "EXCEPTION:" + echo "'${EXCEPTION}'" + echo "DATA:" + echo "$CURL_OUTPUT" +fi + diff --git a/tests/queries/0_stateless/02294_stringsearch_with_nonconst_needle.reference b/tests/queries/0_stateless/02294_stringsearch_with_nonconst_needle.reference new file mode 100644 index 00000000000..7471bcad00c --- /dev/null +++ b/tests/queries/0_stateless/02294_stringsearch_with_nonconst_needle.reference @@ -0,0 +1,190 @@ +LIKE +1 Hello 0 +2 Hello % 1 +3 Hello %% 1 +4 Hello %%% 1 +5 Hello %_% 1 +6 Hello _ 0 +7 Hello _% 1 +8 Hello %_ 1 +9 Hello H%o 1 +10 hello H%0 0 +11 hello h%o 1 +12 Hello h%o 0 +13 OHello %lhell% 0 +14 OHello %hell% 0 +15 hEllo %HEL% 0 +16 abcdef %aBc%def% 0 +17 ABCDDEF %abc%def% 0 +18 Abc\nDef %abc%def% 0 +19 abc\ntdef %abc%def% 1 +20 abct\ndef %abc%dEf% 0 +21 abc\n\ndeF %abc%def% 0 +22 abc\n\ntdef %abc%deF% 0 +23 Abc\nt\ndef %abc%def% 0 +24 abct\n\ndef %abc%def% 1 +25 ab\ndef %Abc%def% 0 +26 aBc\nef %ABC%DEF% 0 +27 ёЁё Ё%Ё 0 +28 ощщЁё Щ%Ё 0 +29 ощЩЁё %Щ%Ё 0 +30 Щущпандер %щп%е% 1 +31 Щущпандер %щП%е% 0 +32 ощщЁё %щ% 1 +33 ощЩЁё %ё% 1 +34 Hello .* 0 +35 Hello .*ell.* 0 +36 Hello o$ 0 +37 Hello hE.*lO 0 +NOT LIKE +1 Hello 1 +2 Hello % 0 +3 Hello %% 0 +4 Hello %%% 0 +5 Hello %_% 0 +6 Hello _ 1 +7 Hello _% 0 +8 Hello %_ 0 +9 Hello H%o 0 +10 hello H%0 1 +11 hello h%o 0 +12 Hello h%o 1 +13 OHello %lhell% 1 +14 OHello %hell% 1 +15 hEllo %HEL% 1 +16 abcdef %aBc%def% 1 +17 ABCDDEF %abc%def% 1 +18 Abc\nDef %abc%def% 1 +19 abc\ntdef %abc%def% 0 +20 abct\ndef %abc%dEf% 1 +21 abc\n\ndeF %abc%def% 1 +22 abc\n\ntdef %abc%deF% 1 +23 Abc\nt\ndef %abc%def% 1 +24 abct\n\ndef %abc%def% 0 +25 ab\ndef %Abc%def% 1 +26 aBc\nef %ABC%DEF% 1 +27 ёЁё Ё%Ё 1 +28 ощщЁё Щ%Ё 1 +29 ощЩЁё %Щ%Ё 1 +30 Щущпандер %щп%е% 0 +31 Щущпандер %щП%е% 1 +32 ощщЁё %щ% 0 +33 ощЩЁё %ё% 0 +34 Hello .* 1 +35 Hello .*ell.* 1 +36 Hello o$ 1 +37 Hello hE.*lO 1 +ILIKE +1 Hello 0 +2 Hello % 1 +3 Hello %% 1 +4 Hello %%% 1 +5 Hello %_% 1 +6 Hello _ 0 +7 Hello _% 1 +8 Hello %_ 1 +9 Hello H%o 1 +10 hello H%0 0 +11 hello h%o 1 +12 Hello h%o 1 +13 OHello %lhell% 0 +14 OHello %hell% 1 +15 hEllo %HEL% 1 +16 abcdef %aBc%def% 1 +17 ABCDDEF %abc%def% 1 +18 Abc\nDef %abc%def% 1 +19 abc\ntdef %abc%def% 1 +20 abct\ndef %abc%dEf% 1 +21 abc\n\ndeF %abc%def% 1 +22 abc\n\ntdef %abc%deF% 1 +23 Abc\nt\ndef %abc%def% 1 +24 abct\n\ndef %abc%def% 1 +25 ab\ndef %Abc%def% 0 +26 aBc\nef %ABC%DEF% 0 +27 ёЁё Ё%Ё 1 +28 ощщЁё Щ%Ё 0 +29 ощЩЁё %Щ%Ё 1 +30 Щущпандер %щп%е% 1 +31 Щущпандер %щП%е% 1 +32 ощщЁё %щ% 1 +33 ощЩЁё %ё% 1 +34 Hello .* 0 +35 Hello .*ell.* 0 +36 Hello o$ 0 +37 Hello hE.*lO 0 +NOT ILIKE +1 Hello 1 +2 Hello % 0 +3 Hello %% 0 +4 Hello %%% 0 +5 Hello %_% 0 +6 Hello _ 1 +7 Hello _% 0 +8 Hello %_ 0 +9 Hello H%o 0 +10 hello H%0 1 +11 hello h%o 0 +12 Hello h%o 0 +13 OHello %lhell% 1 +14 OHello %hell% 0 +15 hEllo %HEL% 0 +16 abcdef %aBc%def% 0 +17 ABCDDEF %abc%def% 0 +18 Abc\nDef %abc%def% 0 +19 abc\ntdef %abc%def% 0 +20 abct\ndef %abc%dEf% 0 +21 abc\n\ndeF %abc%def% 0 +22 abc\n\ntdef %abc%deF% 0 +23 Abc\nt\ndef %abc%def% 0 +24 abct\n\ndef %abc%def% 0 +25 ab\ndef %Abc%def% 1 
+26 aBc\nef %ABC%DEF% 1 +27 ёЁё Ё%Ё 0 +28 ощщЁё Щ%Ё 1 +29 ощЩЁё %Щ%Ё 0 +30 Щущпандер %щп%е% 0 +31 Щущпандер %щП%е% 0 +32 ощщЁё %щ% 0 +33 ощЩЁё %ё% 0 +34 Hello .* 1 +35 Hello .*ell.* 1 +36 Hello o$ 1 +37 Hello hE.*lO 1 +MATCH +1 Hello 1 +2 Hello % 0 +3 Hello %% 0 +4 Hello %%% 0 +5 Hello %_% 0 +6 Hello _ 0 +7 Hello _% 0 +8 Hello %_ 0 +9 Hello H%o 0 +10 hello H%0 0 +11 hello h%o 0 +12 Hello h%o 0 +13 OHello %lhell% 0 +14 OHello %hell% 0 +15 hEllo %HEL% 0 +16 abcdef %aBc%def% 0 +17 ABCDDEF %abc%def% 0 +18 Abc\nDef %abc%def% 0 +19 abc\ntdef %abc%def% 0 +20 abct\ndef %abc%dEf% 0 +21 abc\n\ndeF %abc%def% 0 +22 abc\n\ntdef %abc%deF% 0 +23 Abc\nt\ndef %abc%def% 0 +24 abct\n\ndef %abc%def% 0 +25 ab\ndef %Abc%def% 0 +26 aBc\nef %ABC%DEF% 0 +27 ёЁё Ё%Ё 0 +28 ощщЁё Щ%Ё 0 +29 ощЩЁё %Щ%Ё 0 +30 Щущпандер %щп%е% 0 +31 Щущпандер %щП%е% 0 +32 ощщЁё %щ% 0 +33 ощЩЁё %ё% 0 +34 Hello .* 1 +35 Hello .*ell.* 1 +36 Hello o$ 1 +37 Hello hE.*lO 0 diff --git a/tests/queries/0_stateless/02294_stringsearch_with_nonconst_needle.sql b/tests/queries/0_stateless/02294_stringsearch_with_nonconst_needle.sql new file mode 100644 index 00000000000..3057e342733 --- /dev/null +++ b/tests/queries/0_stateless/02294_stringsearch_with_nonconst_needle.sql @@ -0,0 +1,36 @@ +drop table if exists non_const_needle; + +create table non_const_needle + (id UInt32, haystack String, needle String) + engine = MergeTree() + order by id; + +-- 1 - 33: LIKE-syntax, 34-37: re2-syntax +insert into non_const_needle values (1, 'Hello', '') (2, 'Hello', '%') (3, 'Hello', '%%') (4, 'Hello', '%%%') (5, 'Hello', '%_%') (6, 'Hello', '_') (7, 'Hello', '_%') (8, 'Hello', '%_') (9, 'Hello', 'H%o') (10, 'hello', 'H%0') (11, 'hello', 'h%o') (12, 'Hello', 'h%o') (13, 'OHello', '%lhell%') (14, 'OHello', '%hell%') (15, 'hEllo', '%HEL%') (16, 'abcdef', '%aBc%def%') (17, 'ABCDDEF', '%abc%def%') (18, 'Abc\nDef', '%abc%def%') (19, 'abc\ntdef', '%abc%def%') (20, 'abct\ndef', '%abc%dEf%') (21, 'abc\n\ndeF', '%abc%def%') (22, 'abc\n\ntdef', '%abc%deF%') (23, 'Abc\nt\ndef', '%abc%def%') (24, 'abct\n\ndef', '%abc%def%') (25, 'ab\ndef', '%Abc%def%') (26, 'aBc\nef', '%ABC%DEF%') (27, 'ёЁё', 'Ё%Ё') (28, 'ощщЁё', 'Щ%Ё') (29, 'ощЩЁё', '%Щ%Ё') (30, 'Щущпандер', '%щп%е%') (31, 'Щущпандер', '%щП%е%') (32, 'ощщЁё', '%щ%') (33, 'ощЩЁё', '%ё%') (34, 'Hello', '.*') (35, 'Hello', '.*ell.*') (36, 'Hello', 'o$') (37, 'Hello', 'hE.*lO'); + +select 'LIKE'; +select id, haystack, needle, like(haystack, needle) + from non_const_needle + order by id; + +select 'NOT LIKE'; +select id, haystack, needle, not like(haystack, needle) + from non_const_needle + order by id; + +select 'ILIKE'; +select id, haystack, needle, ilike(haystack, needle) + from non_const_needle + order by id; + +select 'NOT ILIKE'; +select id, haystack, needle, not ilike(haystack, needle) + from non_const_needle + order by id; + +select 'MATCH'; +select id, haystack, needle, match(haystack, needle) + from non_const_needle + order by id; + +drop table if exists non_const_needle; diff --git a/tests/queries/0_stateless/02302_clash_const_aggegate_join.reference b/tests/queries/0_stateless/02302_clash_const_aggegate_join.reference new file mode 100644 index 00000000000..bfa283c3478 --- /dev/null +++ b/tests/queries/0_stateless/02302_clash_const_aggegate_join.reference @@ -0,0 +1,7 @@ +0 +1970-01-01 00:00:00 +0 +2020-01-01 00:00:00 + + +1 diff --git a/tests/queries/0_stateless/02302_clash_const_aggegate_join.sql b/tests/queries/0_stateless/02302_clash_const_aggegate_join.sql new file mode 100644 index 00000000000..32c602e0d36 --- 
/dev/null +++ b/tests/queries/0_stateless/02302_clash_const_aggegate_join.sql @@ -0,0 +1,28 @@ +DROP TABLE IF EXISTS e; +-- https://github.com/ClickHouse/ClickHouse/issues/36891 + +CREATE TABLE e ( a UInt64, t DateTime ) ENGINE = MergeTree PARTITION BY toDate(t) ORDER BY tuple(); +INSERT INTO e SELECT 1, toDateTime('2020-02-01 12:00:01') + INTERVAL number MONTH FROM numbers(10); + +SELECT sumIf( 1, if( 1, toDateTime('2020-01-01 00:00:00', 'UTC'), toDateTime('1970-01-01 00:00:00', 'UTC')) > t ) +FROM e JOIN ( SELECT 1 joinKey) AS da ON joinKey = a +WHERE t >= toDateTime('2021-07-19T13:00:00', 'UTC') AND t <= toDateTime('2021-07-19T13:59:59', 'UTC'); + +SELECT any( toDateTime('2020-01-01T00:00:00', 'UTC')) +FROM e JOIN ( SELECT 1 joinKey) AS da ON joinKey = a +PREWHERE t >= toDateTime('2021-07-19T13:00:00', 'UTC'); + +SELECT sumIf( 1, if( 1, toDateTime('2020-01-01 00:00:00', 'UTC'), toDateTime('1970-01-01 00:00:00', 'UTC')) > t ) +FROM e JOIN ( SELECT 1 joinKey) AS da ON joinKey = a +WHERE t >= toDateTime('2020-01-01 00:00:00', 'UTC') AND t <= toDateTime('2021-07-19T13:59:59', 'UTC'); + +SELECT any(toDateTime('2020-01-01 00:00:00')) +FROM e JOIN ( SELECT 1 joinKey) AS da ON joinKey = a +PREWHERE t >= toDateTime('2020-01-01 00:00:00'); + +SELECT any('2020-01-01 00:00:00') FROM e JOIN ( SELECT 1 joinKey) AS da ON joinKey = a PREWHERE t = '2020-01-01 00:00:00'; + +SELECT any('x') FROM e JOIN ( SELECT 1 joinKey) AS da ON joinKey = a PREWHERE toString(a) = 'x'; + +SELECT any('1') FROM e JOIN ( SELECT 1 joinKey) AS da ON joinKey = a PREWHERE toString(a) = '1'; + diff --git a/tests/queries/0_stateless/02302_column_decl_null_before_defaul_value.reference b/tests/queries/0_stateless/02302_column_decl_null_before_defaul_value.reference new file mode 100644 index 00000000000..2079872ee73 --- /dev/null +++ b/tests/queries/0_stateless/02302_column_decl_null_before_defaul_value.reference @@ -0,0 +1,22 @@ +create table, column +type +NULL +id Nullable(Int32) +create table, column +type +NOT NULL +id Int32 +create table, column +type +NULL +DEFAULT +id Nullable(Int32) DEFAULT 1 +create table, column +type +NOT NULL +DEFAULT +id Int32 DEFAULT 1 +create table, column +type +DEFAULT +NULL +id Nullable(Int32) DEFAULT 1 +create table, column +type +DEFAULT +NOT NULL +id Int32 DEFAULT 1 +create table, column -type +NULL +DEFAULT +id Nullable(UInt8) DEFAULT 1 +create table, column -type +NOT NULL +DEFAULT +id UInt8 DEFAULT 1 +create table, column -type +DEFAULT +NULL +id Nullable(UInt8) DEFAULT 1 +create table, column -type +DEFAULT +NOT NULL +id UInt8 DEFAULT 1 +alter column, NULL modifier is not allowed +modify column, NULL modifier is not allowed diff --git a/tests/queries/0_stateless/02302_column_decl_null_before_defaul_value.sql b/tests/queries/0_stateless/02302_column_decl_null_before_defaul_value.sql new file mode 100644 index 00000000000..3825df1e557 --- /dev/null +++ b/tests/queries/0_stateless/02302_column_decl_null_before_defaul_value.sql @@ -0,0 +1,61 @@ +select 'create table, column +type +NULL'; +DROP TABLE IF EXISTS null_before SYNC; +CREATE TABLE null_before (id INT NULL) ENGINE=MergeTree() ORDER BY tuple(); +DESCRIBE TABLE null_before; + +select 'create table, column +type +NOT NULL'; +DROP TABLE IF EXISTS null_before SYNC; +CREATE TABLE null_before (id INT NOT NULL) ENGINE=MergeTree() ORDER BY tuple(); +DESCRIBE TABLE null_before; + +select 'create table, column +type +NULL +DEFAULT'; +DROP TABLE IF EXISTS null_before SYNC; +CREATE TABLE null_before (id INT NULL DEFAULT 1) ENGINE=MergeTree() 
ORDER BY tuple(); +DESCRIBE TABLE null_before; + +select 'create table, column +type +NOT NULL +DEFAULT'; +DROP TABLE IF EXISTS null_before SYNC; +CREATE TABLE null_before (id INT NOT NULL DEFAULT 1) ENGINE=MergeTree() ORDER BY tuple(); +DESCRIBE TABLE null_before; + +select 'create table, column +type +DEFAULT +NULL'; +DROP TABLE IF EXISTS null_before SYNC; +CREATE TABLE null_before (id INT DEFAULT 1 NULL) ENGINE=MergeTree() ORDER BY tuple(); +DESCRIBE TABLE null_before; + +select 'create table, column +type +DEFAULT +NOT NULL'; +DROP TABLE IF EXISTS null_before SYNC; +CREATE TABLE null_before (id INT DEFAULT 1 NOT NULL) ENGINE=MergeTree() ORDER BY tuple(); +DESCRIBE TABLE null_before; + +select 'create table, column -type +NULL +DEFAULT'; +DROP TABLE IF EXISTS null_before SYNC; +CREATE TABLE null_before (id NULL DEFAULT 1) ENGINE=MergeTree() ORDER BY tuple(); +DESCRIBE TABLE null_before; + +select 'create table, column -type +NOT NULL +DEFAULT'; +DROP TABLE IF EXISTS null_before SYNC; +CREATE TABLE null_before (id NOT NULL DEFAULT 1) ENGINE=MergeTree() ORDER BY tuple(); +DESCRIBE TABLE null_before; + +select 'create table, column -type +DEFAULT +NULL'; +DROP TABLE IF EXISTS null_before SYNC; +CREATE TABLE null_before (id DEFAULT 1 NULL) ENGINE=MergeTree() ORDER BY tuple(); +DESCRIBE TABLE null_before; + +select 'create table, column -type +DEFAULT +NOT NULL'; +DROP TABLE IF EXISTS null_before SYNC; +CREATE TABLE null_before (id DEFAULT 1 NOT NULL) ENGINE=MergeTree() ORDER BY tuple(); +DESCRIBE TABLE null_before; + +select 'alter column, NULL modifier is not allowed'; +DROP TABLE IF EXISTS null_before SYNC; +CREATE TABLE null_before (id INT NOT NULL) ENGINE=MergeTree() ORDER BY tuple(); +ALTER TABLE null_before ALTER COLUMN id TYPE INT NULL; -- { clientError SYNTAX_ERROR } + +select 'modify column, NULL modifier is not allowed'; +DROP TABLE IF EXISTS null_before SYNC; +CREATE TABLE null_before (id INT NOT NULL) ENGINE=MergeTree() ORDER BY tuple(); +ALTER TABLE null_before MODIFY COLUMN id NULL DEFAULT 1; -- { serverError UNKNOWN_TYPE } + +DROP TABLE IF EXISTS null_before SYNC; diff --git a/tests/queries/0_stateless/02302_defaults_in_columnar_formats.reference b/tests/queries/0_stateless/02302_defaults_in_columnar_formats.reference new file mode 100644 index 00000000000..9de3c47b3b2 --- /dev/null +++ b/tests/queries/0_stateless/02302_defaults_in_columnar_formats.reference @@ -0,0 +1,3 @@ +1 42 43 +1 42 43 +1 42 43 diff --git a/tests/queries/0_stateless/02302_defaults_in_columnar_formats.sql b/tests/queries/0_stateless/02302_defaults_in_columnar_formats.sql new file mode 100644 index 00000000000..5946f2d37e5 --- /dev/null +++ b/tests/queries/0_stateless/02302_defaults_in_columnar_formats.sql @@ -0,0 +1,8 @@ +-- Tags: no-fasttest, no-parallel + +insert into function file(data_02302.parquet) select 1 as x settings engine_file_truncate_on_insert=1; +select * from file(data_02302.parquet, auto, 'x UInt8, y default 42, z default x + y') settings input_format_parquet_allow_missing_columns=1; +insert into function file(data_02302.orc) select 1 as x settings engine_file_truncate_on_insert=1; +select * from file(data_02302.orc, auto, 'x UInt8, y default 42, z default x + y') settings input_format_orc_allow_missing_columns=1; +insert into function file(data_02302.arrow) select 1 as x settings engine_file_truncate_on_insert=1; +select * from file(data_02302.arrow, auto, 'x UInt8, y default 42, z default x + y') settings input_format_arrow_allow_missing_columns=1; diff --git 
a/tests/queries/0_stateless/02302_join_auto_lc_nullable_bug.reference b/tests/queries/0_stateless/02302_join_auto_lc_nullable_bug.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/02302_join_auto_lc_nullable_bug.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/02302_join_auto_lc_nullable_bug.sql b/tests/queries/0_stateless/02302_join_auto_lc_nullable_bug.sql new file mode 100644 index 00000000000..469476c82bf --- /dev/null +++ b/tests/queries/0_stateless/02302_join_auto_lc_nullable_bug.sql @@ -0,0 +1,7 @@ +-- Tags: no-backward-compatibility-check + +SET max_bytes_in_join = '100', join_algorithm = 'auto'; + +SELECT 3 == count() FROM (SELECT toLowCardinality(toNullable(number)) AS l FROM system.numbers LIMIT 3) AS s1 +ANY LEFT JOIN (SELECT toLowCardinality(toNullable(number)) AS r FROM system.numbers LIMIT 4) AS s2 ON l = r +; diff --git a/tests/queries/0_stateless/02302_projections_GROUP_BY_ORDERY_BY_optimize_aggregation_in_order.reference b/tests/queries/0_stateless/02302_projections_GROUP_BY_ORDERY_BY_optimize_aggregation_in_order.reference new file mode 100644 index 00000000000..0a83fa24d49 --- /dev/null +++ b/tests/queries/0_stateless/02302_projections_GROUP_BY_ORDERY_BY_optimize_aggregation_in_order.reference @@ -0,0 +1,13 @@ +-- { echoOn } +select x + y, sum(x - y) as s from test_agg_proj_02302 group by x + y order by s desc limit 5 settings allow_experimental_projection_optimization=1, optimize_aggregation_in_order=0, optimize_read_in_order=0; +15 480 +14 450 +13 420 +12 390 +11 360 +select x + y, sum(x - y) as s from test_agg_proj_02302 group by x + y order by s desc limit 5 settings allow_experimental_projection_optimization=1, optimize_aggregation_in_order=1, optimize_read_in_order=1; +15 480 +14 450 +13 420 +12 390 +11 360 diff --git a/tests/queries/0_stateless/02302_projections_GROUP_BY_ORDERY_BY_optimize_aggregation_in_order.sql b/tests/queries/0_stateless/02302_projections_GROUP_BY_ORDERY_BY_optimize_aggregation_in_order.sql new file mode 100644 index 00000000000..be050cc3080 --- /dev/null +++ b/tests/queries/0_stateless/02302_projections_GROUP_BY_ORDERY_BY_optimize_aggregation_in_order.sql @@ -0,0 +1,13 @@ +-- Tags: no-s3-storage + +drop table if exists test_agg_proj_02302; + +create table test_agg_proj_02302 (x Int32, y Int32, PROJECTION x_plus_y (select sum(x - y), argMax(x, y) group by x + y)) ENGINE = MergeTree order by tuple() settings index_granularity = 1; +insert into test_agg_proj_02302 select intDiv(number, 2), -intDiv(number,3) - 1 from numbers(100); + +-- { echoOn } +select x + y, sum(x - y) as s from test_agg_proj_02302 group by x + y order by s desc limit 5 settings allow_experimental_projection_optimization=1, optimize_aggregation_in_order=0, optimize_read_in_order=0; +select x + y, sum(x - y) as s from test_agg_proj_02302 group by x + y order by s desc limit 5 settings allow_experimental_projection_optimization=1, optimize_aggregation_in_order=1, optimize_read_in_order=1; + +-- { echoOff } +drop table test_agg_proj_02302; diff --git a/tests/queries/0_stateless/02304_orc_arrow_parquet_string_as_string.reference b/tests/queries/0_stateless/02304_orc_arrow_parquet_string_as_string.reference new file mode 100644 index 00000000000..f0ab418f0ce --- /dev/null +++ b/tests/queries/0_stateless/02304_orc_arrow_parquet_string_as_string.reference @@ -0,0 +1,3 @@ +s Nullable(String) +s Nullable(String) +s Nullable(String) diff --git 
diff --git a/tests/queries/0_stateless/02302_projections_GROUP_BY_ORDERY_BY_optimize_aggregation_in_order.reference b/tests/queries/0_stateless/02302_projections_GROUP_BY_ORDERY_BY_optimize_aggregation_in_order.reference
new file mode 100644
index 00000000000..0a83fa24d49
--- /dev/null
+++ b/tests/queries/0_stateless/02302_projections_GROUP_BY_ORDERY_BY_optimize_aggregation_in_order.reference
@@ -0,0 +1,13 @@
+-- { echoOn }
+select x + y, sum(x - y) as s from test_agg_proj_02302 group by x + y order by s desc limit 5 settings allow_experimental_projection_optimization=1, optimize_aggregation_in_order=0, optimize_read_in_order=0;
+15 480
+14 450
+13 420
+12 390
+11 360
+select x + y, sum(x - y) as s from test_agg_proj_02302 group by x + y order by s desc limit 5 settings allow_experimental_projection_optimization=1, optimize_aggregation_in_order=1, optimize_read_in_order=1;
+15 480
+14 450
+13 420
+12 390
+11 360
diff --git a/tests/queries/0_stateless/02302_projections_GROUP_BY_ORDERY_BY_optimize_aggregation_in_order.sql b/tests/queries/0_stateless/02302_projections_GROUP_BY_ORDERY_BY_optimize_aggregation_in_order.sql
new file mode 100644
index 00000000000..be050cc3080
--- /dev/null
+++ b/tests/queries/0_stateless/02302_projections_GROUP_BY_ORDERY_BY_optimize_aggregation_in_order.sql
@@ -0,0 +1,13 @@
+-- Tags: no-s3-storage
+
+drop table if exists test_agg_proj_02302;
+
+create table test_agg_proj_02302 (x Int32, y Int32, PROJECTION x_plus_y (select sum(x - y), argMax(x, y) group by x + y)) ENGINE = MergeTree order by tuple() settings index_granularity = 1;
+insert into test_agg_proj_02302 select intDiv(number, 2), -intDiv(number,3) - 1 from numbers(100);
+
+-- { echoOn }
+select x + y, sum(x - y) as s from test_agg_proj_02302 group by x + y order by s desc limit 5 settings allow_experimental_projection_optimization=1, optimize_aggregation_in_order=0, optimize_read_in_order=0;
+select x + y, sum(x - y) as s from test_agg_proj_02302 group by x + y order by s desc limit 5 settings allow_experimental_projection_optimization=1, optimize_aggregation_in_order=1, optimize_read_in_order=1;
+
+-- { echoOff }
+drop table test_agg_proj_02302;
diff --git a/tests/queries/0_stateless/02304_orc_arrow_parquet_string_as_string.reference b/tests/queries/0_stateless/02304_orc_arrow_parquet_string_as_string.reference
new file mode 100644
index 00000000000..f0ab418f0ce
--- /dev/null
+++ b/tests/queries/0_stateless/02304_orc_arrow_parquet_string_as_string.reference
@@ -0,0 +1,3 @@
+s Nullable(String)
+s Nullable(String)
+s Nullable(String)
diff --git a/tests/queries/0_stateless/02304_orc_arrow_parquet_string_as_string.sql b/tests/queries/0_stateless/02304_orc_arrow_parquet_string_as_string.sql
new file mode 100644
index 00000000000..2d971bba9db
--- /dev/null
+++ b/tests/queries/0_stateless/02304_orc_arrow_parquet_string_as_string.sql
@@ -0,0 +1,8 @@
+-- Tags: no-fasttest, no-parallel
+
+insert into function file(data_02304.parquet) select 'hello' as s from numbers(3) settings engine_file_truncate_on_insert=1, output_format_parquet_string_as_string=1;
+desc file(data_02304.parquet);
+insert into function file(data_02304.orc) select 'hello' as s from numbers(3) settings engine_file_truncate_on_insert=1, output_format_orc_string_as_string=1;
+desc file(data_02304.orc);
+insert into function file(data_02304.arrow) select 'hello' as s from numbers(3) settings engine_file_truncate_on_insert=1, output_format_arrow_string_as_string=1;
+desc file(data_02304.arrow);
diff --git a/tests/queries/0_stateless/02310_generate_multi_columns_with_uuid.reference b/tests/queries/0_stateless/02310_generate_multi_columns_with_uuid.reference
new file mode 100644
index 00000000000..bb5ee5c21eb
--- /dev/null
+++ b/tests/queries/0_stateless/02310_generate_multi_columns_with_uuid.reference
@@ -0,0 +1,3 @@
+0
+0
+1
diff --git a/tests/queries/0_stateless/02310_generate_multi_columns_with_uuid.sql b/tests/queries/0_stateless/02310_generate_multi_columns_with_uuid.sql
new file mode 100644
index 00000000000..3ab19446b3e
--- /dev/null
+++ b/tests/queries/0_stateless/02310_generate_multi_columns_with_uuid.sql
@@ -0,0 +1,5 @@
+SELECT generateUUIDv4(1) = generateUUIDv4(2);
+
+SELECT generateUUIDv4() = generateUUIDv4(1);
+
+SELECT generateUUIDv4(1) = generateUUIDv4(1);
diff --git a/tests/queries/0_stateless/02311_normalize_utf8_constant.reference b/tests/queries/0_stateless/02311_normalize_utf8_constant.reference
new file mode 100644
index 00000000000..efd3caf8a45
--- /dev/null
+++ b/tests/queries/0_stateless/02311_normalize_utf8_constant.reference
@@ -0,0 +1 @@
+â â â â â C3A2 C3A2 61CC82 C3A2 61CC82
diff --git a/tests/queries/0_stateless/02311_normalize_utf8_constant.sql b/tests/queries/0_stateless/02311_normalize_utf8_constant.sql
new file mode 100644
index 00000000000..2747aa073ec
--- /dev/null
+++ b/tests/queries/0_stateless/02311_normalize_utf8_constant.sql
@@ -0,0 +1,13 @@
+-- Tags: no-fasttest
+
+SELECT
+    'â' AS s,
+    normalizeUTF8NFC(s) s1,
+    normalizeUTF8NFD(s) s2,
+    normalizeUTF8NFKC(s) s3,
+    normalizeUTF8NFKD(s) s4,
+    hex(s),
+    hex(s1),
+    hex(s2),
+    hex(s3),
+    hex(s4);
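The single reference line above is easier to read with the UTF-8 bytes spelled out: `C3A2` encodes the precomposed character U+00E2 ('â'), while `61CC82` is 'a' (0x61) followed by the combining circumflex U+0302 (0xCC 0x82). NFC and NFKC return the composed form, NFD and NFKD the decomposed one, so all five displayed characters render identically while their hex dumps differ. A minimal spot check:

```
SELECT hex(normalizeUTF8NFC('â')) AS nfc, hex(normalizeUTF8NFD('â')) AS nfd;
-- expected: C3A2  61CC82
```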
diff --git a/tests/queries/0_stateless/02311_range_hashed_dictionary_range_cast.reference b/tests/queries/0_stateless/02311_range_hashed_dictionary_range_cast.reference
new file mode 100644
index 00000000000..95a5cf09f70
--- /dev/null
+++ b/tests/queries/0_stateless/02311_range_hashed_dictionary_range_cast.reference
@@ -0,0 +1,2 @@
+Value
+Value
diff --git a/tests/queries/0_stateless/02311_range_hashed_dictionary_range_cast.sql b/tests/queries/0_stateless/02311_range_hashed_dictionary_range_cast.sql
new file mode 100644
index 00000000000..623b369da38
--- /dev/null
+++ b/tests/queries/0_stateless/02311_range_hashed_dictionary_range_cast.sql
@@ -0,0 +1,30 @@
+DROP TABLE IF EXISTS dictionary_source_table;
+CREATE TABLE dictionary_source_table
+(
+    key UInt64,
+    start UInt64,
+    end UInt64,
+    value String
+) Engine = TinyLog;
+
+INSERT INTO dictionary_source_table values (1, 0, 18446744073709551615, 'Value');
+
+DROP DICTIONARY IF EXISTS range_hashed_dictionary;
+CREATE DICTIONARY range_hashed_dictionary
+(
+    key UInt64,
+    start UInt64,
+    end UInt64,
+    value String
+)
+PRIMARY KEY key
+SOURCE(CLICKHOUSE(TABLE 'dictionary_source_table'))
+LAYOUT(RANGE_HASHED())
+RANGE(MIN start MAX end)
+LIFETIME(0);
+
+SELECT dictGet('range_hashed_dictionary', 'value', toUInt64(1), toUInt64(18446744073709551615));
+SELECT dictGet('range_hashed_dictionary', 'value', toUInt64(1), toUInt64(-1));
+
+DROP DICTIONARY range_hashed_dictionary;
+DROP TABLE dictionary_source_table;
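The two `dictGet` calls above are really one assertion about integer casts: `toUInt64(-1)` wraps to the maximum UInt64 value, 18446744073709551615 (ClickHouse integer conversions reinterpret rather than throw), so both probes land in the dictionary's single `[0, 18446744073709551615]` range and return 'Value'. The cast can be confirmed on its own:

```
SELECT toUInt64(-1) = 18446744073709551615; -- expected: 1
```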
diff --git a/tests/queries/0_stateless/02312_parquet_orc_arrow_names_tuples.reference b/tests/queries/0_stateless/02312_parquet_orc_arrow_names_tuples.reference
new file mode 100644
index 00000000000..4697d53a23b
--- /dev/null
+++ b/tests/queries/0_stateless/02312_parquet_orc_arrow_names_tuples.reference
@@ -0,0 +1,12 @@
+(1,2)
+(2,3)
+(3,4)
+(1,2)
+(2,3)
+(3,4)
+(1,2)
+(2,3)
+(3,4)
+[[(1),(2),(3)]]
+[[(1),(2),(3)]]
+[[(1),(2),(3)]]
diff --git a/tests/queries/0_stateless/02312_parquet_orc_arrow_names_tuples.sql b/tests/queries/0_stateless/02312_parquet_orc_arrow_names_tuples.sql
new file mode 100644
index 00000000000..4c2158e4a0c
--- /dev/null
+++ b/tests/queries/0_stateless/02312_parquet_orc_arrow_names_tuples.sql
@@ -0,0 +1,29 @@
+-- Tags: no-fasttest
+
+drop table if exists test_02312;
+create table test_02312 (x Tuple(a UInt32, b UInt32)) engine=File(Parquet);
+insert into test_02312 values ((1,2)), ((2,3)), ((3,4));
+select * from test_02312;
+drop table test_02312;
+create table test_02312 (x Tuple(a UInt32, b UInt32)) engine=File(Arrow);
+insert into test_02312 values ((1,2)), ((2,3)), ((3,4));
+select * from test_02312;
+drop table test_02312;
+create table test_02312 (x Tuple(a UInt32, b UInt32)) engine=File(ORC);
+insert into test_02312 values ((1,2)), ((2,3)), ((3,4));
+select * from test_02312;
+drop table test_02312;
+
+create table test_02312 (a Nested(b Nested(c UInt32))) engine=File(Parquet);
+insert into test_02312 values ([[(1), (2), (3)]]);
+select * from test_02312;
+drop table test_02312;
+create table test_02312 (a Nested(b Nested(c UInt32))) engine=File(Arrow);
+insert into test_02312 values ([[(1), (2), (3)]]);
+select * from test_02312;
+drop table test_02312;
+create table test_02312 (a Nested(b Nested(c UInt32))) engine=File(ORC);
+insert into test_02312 values ([[(1), (2), (3)]]);
+select * from test_02312;
+drop table test_02312;
+
diff --git a/tests/queries/1_stateful/00023_totals_limit.reference b/tests/queries/1_stateful/00023_totals_limit.reference
index fc4a02662d7..c76452411d7 100644
--- a/tests/queries/1_stateful/00023_totals_limit.reference
+++ b/tests/queries/1_stateful/00023_totals_limit.reference
@@ -16,7 +16,7 @@
 		[1604017, "189"]
 	],
 
-	"totals": [0,"4652"],
+	"totals": [0, "4652"],
 
 	"rows": 1,
 
diff --git a/website/README.md b/website/README.md
index 57cd87cbfe2..f96f1e0075d 100644
--- a/website/README.md
+++ b/website/README.md
@@ -22,19 +22,7 @@ pip3 install -r requirements.txt
 
 virtualenv build
 
-./build.py --skip-multi-page --skip-blog --skip-docs --livereload 8080
+./build.py --livereload 8080
 
 # Open the web browser and go to http://localhost:8080/
 ```
-
-# How to quickly test the blog
-
-```
-./build.py --skip-multi-page --skip-docs --livereload 8080
-```
-
-# How to quickly test the broken links in docs
-
-```
-./build.py --skip-multi-page --skip-blog --lang en --livereload 8080
-```
diff --git a/website/benchmark/hardware/results/aws_c6g_16xlarge.json b/website/benchmark/hardware/results/aws_c6g_16xlarge.json
new file mode 100644
index 00000000000..364b40f657a
--- /dev/null
+++ b/website/benchmark/hardware/results/aws_c6g_16xlarge.json
@@ -0,0 +1,54 @@
+[
+    {
+        "system": "AWS c6g.16xlarge (Graviton 2)",
+        "system_full": "AWS c6g.16xlarge (Graviton 2) 64 vCPU, 128 GiB RAM, EBS",
+        "time": "2022-05-24 00:00:00",
+        "kind": "cloud",
+        "result":
+        [
+[0.002, 0.002, 0.002],
+[0.051, 0.024, 0.026],
+[0.037, 0.021, 0.028],
+[0.102, 0.065, 0.061],
+[0.243, 0.080, 0.080],
+[0.976, 0.138, 0.138],
+[0.003, 0.002, 0.002],
+[0.044, 0.040, 0.039],
+[0.204, 0.145, 0.146],
+[0.799, 0.165, 0.165],
+[0.306, 0.095, 0.095],
+[0.523, 0.101, 0.096],
+[0.973, 0.226, 0.224],
+[1.520, 0.282, 0.277],
+[0.645, 0.239, 0.236],
+[0.260, 0.312, 0.280],
+[1.535, 0.660, 0.629],
+[1.426, 0.470, 0.427],
+[3.456, 1.372, 1.138],
+[0.147, 0.119, 0.079],
+[9.101, 0.406, 0.358],
+[10.117, 0.330, 0.323],
+[19.495, 0.756, 0.748],
+[16.173, 1.500, 1.532],
+[1.832, 0.105, 0.094],
+[0.836, 0.092, 0.090],
+[2.363, 0.108, 0.099],
+[9.269, 0.367, 0.363],
+[7.317, 0.422, 0.414],
+[0.918, 1.020, 1.058],
+[1.347, 0.210, 0.209],
+[4.535, 0.343, 0.335],
+[4.288, 2.411, 2.501],
+[9.310, 1.240, 1.172],
+[9.301, 1.209, 1.205],
+[0.446, 0.428, 0.421],
+[0.245, 0.207, 0.202],
+[0.107, 0.091, 0.098],
+[0.112, 0.095, 0.101],
+[0.546, 0.485, 0.444],
+[0.061, 0.049, 0.037],
+[0.041, 0.035, 0.033],
+[0.006, 0.005, 0.005]
+        ]
+    }
+]
diff --git a/website/benchmark/hardware/results/aws_c7g_16xlarge.json b/website/benchmark/hardware/results/aws_c7g_16xlarge.json
new file mode 100644
index 00000000000..91230ecceee
--- /dev/null
+++ b/website/benchmark/hardware/results/aws_c7g_16xlarge.json
@@ -0,0 +1,54 @@
+[
+    {
+        "system": "AWS c7g.16xlarge (Graviton 3)",
+        "system_full": "AWS c7g.16xlarge (Graviton 3) 64 vCPU, 128 GiB RAM, EBS",
+        "time": "2022-05-24 00:00:00",
+        "kind": "cloud",
+        "result":
+        [
+[0.002, 0.002, 0.002],
+[0.031, 0.022, 0.023],
+[0.066, 0.025, 0.025],
+[0.240, 0.061, 0.059],
+[0.328, 0.073, 0.076],
+[0.955, 0.101, 0.098],
+[0.002, 0.002, 0.002],
+[0.035, 0.030, 0.030],
+[0.499, 0.113, 0.115],
+[0.704, 0.127, 0.127],
+[0.452, 0.070, 0.070],
+[0.613, 0.074, 0.072],
+[1.060, 0.147, 0.144],
+[1.749, 0.190, 0.187],
+[0.933, 0.176, 0.175],
+[0.408, 0.206, 0.188],
+[1.714, 0.476, 0.464],
+[1.391, 0.349, 0.307],
+[3.271, 0.876, 0.719],
+[0.375, 0.079, 0.071],
+[9.094, 0.270, 0.293],
+[10.251, 0.236, 0.222],
+[19.763, 0.783, 0.839],
+[16.380, 1.164, 1.192],
+[1.861, 0.112, 0.114],
+[0.863, 0.062, 0.060],
+[2.499, 0.103, 0.113],
+[9.448, 0.257, 0.245],
+[7.546, 0.288, 0.285],
+[0.822, 0.837, 0.837],
+[1.352, 0.151, 0.142],
+[4.743, 0.224, 0.214],
+[3.807, 1.236, 1.366],
+[10.096, 0.805, 0.780],
+[9.191, 0.830, 0.792],
+[0.320, 0.304, 0.294],
+[0.209, 0.143, 0.175],
+[0.099, 0.066, 0.068],
+[0.141, 0.073, 0.064],
+[0.499, 0.386, 0.372],
+[0.061, 0.030, 0.032],
+[0.035, 0.030, 0.028],
+[0.016, 0.016, 0.004]
+        ]
+    }
+]