From 6d32149fa64f882dd8a3218855757146b2571cb3 Mon Sep 17 00:00:00 2001
From: Amos Bird
Date: Tue, 18 Oct 2022 11:18:01 +0800
Subject: [PATCH 1/8] Correct implementation of agg state comparison

---
 src/AggregateFunctions/IAggregateFunction.cpp  | 15 +++++++--------
 .../02456_aggregate_state_conversion.reference |  1 +
 .../02456_aggregate_state_conversion.sql       |  1 +
 3 files changed, 9 insertions(+), 8 deletions(-)
 create mode 100644 tests/queries/0_stateless/02456_aggregate_state_conversion.reference
 create mode 100644 tests/queries/0_stateless/02456_aggregate_state_conversion.sql

diff --git a/src/AggregateFunctions/IAggregateFunction.cpp b/src/AggregateFunctions/IAggregateFunction.cpp
index 25d2a9a4530..7da341cc5b9 100644
--- a/src/AggregateFunctions/IAggregateFunction.cpp
+++ b/src/AggregateFunctions/IAggregateFunction.cpp
@@ -53,9 +53,12 @@ String IAggregateFunction::getDescription() const

 bool IAggregateFunction::haveEqualArgumentTypes(const IAggregateFunction & rhs) const
 {
-    return std::equal(argument_types.begin(), argument_types.end(),
-                      rhs.argument_types.begin(), rhs.argument_types.end(),
-                      [](const auto & t1, const auto & t2) { return t1->equals(*t2); });
+    return std::equal(
+        argument_types.begin(),
+        argument_types.end(),
+        rhs.argument_types.begin(),
+        rhs.argument_types.end(),
+        [](const auto & t1, const auto & t2) { return t1->equals(*t2); });
 }

 bool IAggregateFunction::haveSameStateRepresentation(const IAggregateFunction & rhs) const
@@ -67,11 +70,7 @@ bool IAggregateFunction::haveSameStateRepresentation(const IAggregateFunction &

 bool IAggregateFunction::haveSameStateRepresentationImpl(const IAggregateFunction & rhs) const
 {
-    bool res = getName() == rhs.getName()
-        && parameters == rhs.parameters
-        && haveEqualArgumentTypes(rhs);
-    assert(res == (getStateType()->getName() == rhs.getStateType()->getName()));
-    return res;
+    return getStateType()->equals(*rhs.getStateType());
 }

 }
diff --git a/tests/queries/0_stateless/02456_aggregate_state_conversion.reference b/tests/queries/0_stateless/02456_aggregate_state_conversion.reference
new file mode 100644
index 00000000000..abf55dde8a7
--- /dev/null
+++ b/tests/queries/0_stateless/02456_aggregate_state_conversion.reference
@@ -0,0 +1 @@
+1027000000000000000000000000000000000000000000000000000000000000
diff --git a/tests/queries/0_stateless/02456_aggregate_state_conversion.sql b/tests/queries/0_stateless/02456_aggregate_state_conversion.sql
new file mode 100644
index 00000000000..3c05c59de59
--- /dev/null
+++ b/tests/queries/0_stateless/02456_aggregate_state_conversion.sql
@@ -0,0 +1 @@
+SELECT hex(CAST(x, 'AggregateFunction(sum, Decimal(50, 10))')) FROM (SELECT arrayReduce('sumState', [toDecimal256('0.000001', 10)]) AS x) GROUP BY x;
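
A standalone illustration of what the patch above changes (toy C++ types, not the real ClickHouse classes): "same function name, parameters and argument types" is only a proxy for "same state representation", and the patch replaces that proxy with a direct comparison of the state types. The descriptor strings below are made up; the Decimal pair mirrors the new test, where Decimal(50, 10) and Decimal(76, 10) are spelled differently but both accumulate a sum in a 256-bit decimal.

    #include <iostream>
    #include <string>

    // Toy model of an aggregate function descriptor.
    struct TypeDesc
    {
        std::string name;
        bool equals(const TypeDesc & rhs) const { return name == rhs.name; }
    };

    struct AggregateDesc
    {
        std::string function_name;
        TypeDesc argument_type; // declared input type
        TypeDesc state_type;    // in-memory accumulator type
    };

    // The criterion after the patch: delegate entirely to the state type.
    static bool haveSameStateRepresentation(const AggregateDesc & a, const AggregateDesc & b)
    {
        return a.state_type.equals(b.state_type);
    }

    int main()
    {
        AggregateDesc lhs{"sum", {"Decimal(50, 10)"}, {"Decimal256 accumulator, scale 10"}};
        AggregateDesc rhs{"sum", {"Decimal(76, 10)"}, {"Decimal256 accumulator, scale 10"}};

        // The old proxy rejects the pair because the declared argument types differ ...
        bool old_criterion = lhs.function_name == rhs.function_name
            && lhs.argument_type.equals(rhs.argument_type);
        // ... while the state-type comparison accepts it, allowing the CAST in the test.
        std::cout << "old: " << old_criterion
                  << ", new: " << haveSameStateRepresentation(lhs, rhs) << '\n'; // old: 0, new: 1
    }
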
From fabc8f5a1833167c6cbc5f6566157304b3b51a15 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 19 Oct 2022 23:27:12 +0200
Subject: [PATCH 2/8] Remove support for {database} macro from the client's prompt

---
 programs/client/clickhouse-client.xml | 1 -
 src/Client/ClientBase.cpp             | 7 ++-----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/programs/client/clickhouse-client.xml b/programs/client/clickhouse-client.xml
index 66e7afd8f8c..00f5b26eddf 100644
--- a/programs/client/clickhouse-client.xml
+++ b/programs/client/clickhouse-client.xml
@@ -19,7 +19,6 @@
             {host}
             {port}
             {user}
-            {database}
             {display_name}
         Terminal colors: https://misc.flogisoft.com/bash/tip_colors_and_formatting
         See also: https://wiki.hackzine.org/development/misc/readline-color-prompt.html
diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp
index 0a2fbcf9f46..0db7a9533db 100644
--- a/src/Client/ClientBase.cpp
+++ b/src/Client/ClientBase.cpp
@@ -1,7 +1,6 @@
 #include
 #include
-#include
 #include
 #include
 #include

@@ -9,7 +8,6 @@
 #include "config.h"

 #include
-#include
 #include
 #include
 #include
@@ -32,7 +30,6 @@
 #include
 #include
 #include
-#include
 #include
 #include

@@ -70,10 +67,10 @@
 #include
 #include
 #include
-#include
 #include
 #include
+

 namespace fs = std::filesystem;

 using namespace std::literals;
@@ -1925,7 +1922,7 @@ bool ClientBase::processQueryText(const String & text)

 String ClientBase::prompt() const
 {
-    return boost::replace_all_copy(prompt_by_server_display_name, "{database}", config().getString("database", "default"));
+    return prompt_by_server_display_name;
 }

From 81750a81e768eabaf772e5be259064db3f4fb26b Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Thu, 20 Oct 2022 01:17:11 +0200
Subject: [PATCH 3/8] Add a test for #16827

---
 .../0_stateless/02467_cross_join_three_table_functions.reference | 1 +
 .../0_stateless/02467_cross_join_three_table_functions.sql       | 1 +
 2 files changed, 2 insertions(+)
 create mode 100644 tests/queries/0_stateless/02467_cross_join_three_table_functions.reference
 create mode 100644 tests/queries/0_stateless/02467_cross_join_three_table_functions.sql

diff --git a/tests/queries/0_stateless/02467_cross_join_three_table_functions.reference b/tests/queries/0_stateless/02467_cross_join_three_table_functions.reference
new file mode 100644
index 00000000000..0718dd8e65f
--- /dev/null
+++ b/tests/queries/0_stateless/02467_cross_join_three_table_functions.reference
@@ -0,0 +1 @@
+1320
diff --git a/tests/queries/0_stateless/02467_cross_join_three_table_functions.sql b/tests/queries/0_stateless/02467_cross_join_three_table_functions.sql
new file mode 100644
index 00000000000..5c7da815bbe
--- /dev/null
+++ b/tests/queries/0_stateless/02467_cross_join_three_table_functions.sql
@@ -0,0 +1 @@
+SELECT count(*) FROM numbers(10) AS a, numbers(11) AS b, numbers(12) AS c;

From b4d241b54dd5abe797e618521707df4dacdd35c0 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Thu, 20 Oct 2022 01:39:08 +0200
Subject: [PATCH 4/8] Add a test for #13653

---
 tests/queries/0_stateless/02468_has_any_tuple.reference | 4 ++++
 tests/queries/0_stateless/02468_has_any_tuple.sql       | 4 ++++
 2 files changed, 8 insertions(+)
 create mode 100644 tests/queries/0_stateless/02468_has_any_tuple.reference
 create mode 100644 tests/queries/0_stateless/02468_has_any_tuple.sql

diff --git a/tests/queries/0_stateless/02468_has_any_tuple.reference b/tests/queries/0_stateless/02468_has_any_tuple.reference
new file mode 100644
index 00000000000..252a9293563
--- /dev/null
+++ b/tests/queries/0_stateless/02468_has_any_tuple.reference
@@ -0,0 +1,4 @@
+1
+1
+[(3,3)]
+1
diff --git a/tests/queries/0_stateless/02468_has_any_tuple.sql b/tests/queries/0_stateless/02468_has_any_tuple.sql
new file mode 100644
index 00000000000..12c7222d593
--- /dev/null
+++ b/tests/queries/0_stateless/02468_has_any_tuple.sql
@@ -0,0 +1,4 @@
+select [(toUInt8(3), toUInt8(3))] = [(toInt16(3), toInt16(3))];
+select hasAny([(toInt16(3), toInt16(3))],[(toInt16(3), toInt16(3))]);
+select arrayFilter(x -> x = (toInt16(3), toInt16(3)), arrayZip([toUInt8(3)], [toUInt8(3)]));
+select hasAny([(toUInt8(3), toUInt8(3))],[(toInt16(3), toInt16(3))]);
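
Patch 4 above pins down value-based comparison of tuples over different integer widths. A standalone C++ analogue (toy code, not ClickHouse internals) of the behaviour the test expects: std::tuple compares element-wise with the usual arithmetic conversions, so a (UInt8, UInt8) tuple equals an (Int16, Int16) tuple holding the same values, which is what hasAny and arrayFilter should report after the fix referenced by #13653.

    #include <cstdint>
    #include <iostream>
    #include <tuple>

    int main()
    {
        std::tuple<std::uint8_t, std::uint8_t> a{3, 3};
        std::tuple<std::int16_t, std::int16_t> b{3, 3};

        // Element-wise comparison with integer promotion: (uint8_t)3 == (int16_t)3.
        std::cout << (a == b) << '\n';   // prints: 1
    }
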
From 00f9ae99249c636320141607ee51dac170ae6938 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Thu, 20 Oct 2022 04:42:35 +0200
Subject: [PATCH 5/8] Correct documentation for settings

---
 src/Core/Settings.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index 07618ee731d..0b8d24b1abc 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -331,8 +331,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
     M(UInt64, max_bytes_before_remerge_sort, 1000000000, "In case of ORDER BY with LIMIT, when memory usage is higher than specified threshold, perform additional steps of merging blocks before final merge to keep just top LIMIT rows.", 0) \
     M(Float, remerge_sort_lowered_memory_bytes_ratio, 2., "If memory usage after remerge does not reduced by this ratio, remerge will be disabled.", 0) \
     \
-    M(UInt64, max_result_rows, 0, "Limit on result size in rows. Also checked for intermediate data sent from remote servers.", 0) \
-    M(UInt64, max_result_bytes, 0, "Limit on result size in bytes (uncompressed). Also checked for intermediate data sent from remote servers.", 0) \
+    M(UInt64, max_result_rows, 0, "Limit on result size in rows. The query will stop after processing a block of data if the threshold is met, but it will not cut the last block of the result, therefore the result size can be larger than the threshold.", 0) \
+    M(UInt64, max_result_bytes, 0, "Limit on result size in bytes (uncompressed). The query will stop after processing a block of data if the threshold is met, but it will not cut the last block of the result, therefore the result size can be larger than the threshold. Caveats: the result size in memory is taken into account for this threshold. Even if the result size is small, it can reference larger data structures in memory, representing dictionaries of LowCardinality columns, and Arenas of AggregateFunction columns, so the threshold can be exceeded despite the small result size. The setting is fairly low level and should be used with caution.", 0) \
     M(OverflowMode, result_overflow_mode, OverflowMode::THROW, "What to do when the limit is exceeded.", 0) \
     \
     /* TODO: Check also when merging and finalizing aggregate functions. */ \
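
A toy sketch of the behaviour the new setting descriptions spell out (not the real pipeline code; the block sizes and limit below are made up for illustration): the limit is only checked between blocks, and the block that crosses the threshold is delivered whole, so the result can exceed max_result_rows.

    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main()
    {
        const std::size_t max_result_rows = 10;
        const std::vector<std::size_t> block_sizes{4, 4, 4, 4};   // rows per block

        std::size_t delivered = 0;
        for (std::size_t rows : block_sizes)
        {
            if (delivered >= max_result_rows)
                break;              // the check happens before the next block
            delivered += rows;      // the current block is never cut
        }
        std::cout << "delivered " << delivered << " rows with a limit of "
                  << max_result_rows << '\n';   // delivered 12 rows with a limit of 10
    }
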
From e2417eb518540b631f83b5c34adb00a0f136442c Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Thu, 20 Oct 2022 09:33:48 +0200
Subject: [PATCH 6/8] tests: fix 00705_drop_create_merge_tree flakiness

CI caught a timeout for this test [1]:

    2022.10.19 16:43:46.238264 [ 24048 ] {aea0ff2a-f8de-498a-bd9f-0b8069a49f48} executeQuery: Code: 60. DB::Exception: Table test_orfkwn0y.table doesn't exist. (UNKNOWN_TABLE) (version 22.10.1.1) (from [::1]:60028) (comment: 00705_drop_create_merge_tree.sh) (in query: DROP TABLE table), Stack trace (when copying this message, always include the lines below):
    ...
    2022.10.19 16:53:34.484777 [ 24042 ] {aec5a80a-4492-429b-87fb-7dbf5ffb5d67} executeQuery: (from [::1]:57944) (comment: 00705_drop_create_merge_tree.sh) DROP DATABASE test_orfkwn0y (stage: Complete)

But as you can see, there is a huge delay between the last query from the test and the final DROP DATABASE.

  [1]: https://s3.amazonaws.com/clickhouse-test-reports/42457/65cd040d1565bb7b2a9ba515041c3a139d31a4f9/stateless_tests__tsan__[1/3]/runlog.log

Apparently it is the same issue in bash [1].

  [1]: https://gist.github.com/azat/affbda3f8c6b5c38648d4ab105777d88

Anyway, it is easier to simply invoke clickhouse-client only two times, since each invocation is very slow (~1-2 sec) in debug build.

Signed-off-by: Azat Khuzhin
---
 .../00705_drop_create_merge_tree.reference    |  1 -
 .../00705_drop_create_merge_tree.sh           | 33 +++------------------------------
 2 files changed, 3 insertions(+), 31 deletions(-)

diff --git a/tests/queries/0_stateless/00705_drop_create_merge_tree.reference b/tests/queries/0_stateless/00705_drop_create_merge_tree.reference
index 8b137891791..e69de29bb2d 100644
--- a/tests/queries/0_stateless/00705_drop_create_merge_tree.reference
+++ b/tests/queries/0_stateless/00705_drop_create_merge_tree.reference
@@ -1 +0,0 @@
-
diff --git a/tests/queries/0_stateless/00705_drop_create_merge_tree.sh b/tests/queries/0_stateless/00705_drop_create_merge_tree.sh
index 146d6e54c0b..d7754091290 100755
--- a/tests/queries/0_stateless/00705_drop_create_merge_tree.sh
+++ b/tests/queries/0_stateless/00705_drop_create_merge_tree.sh
@@ -1,39 +1,12 @@
 #!/usr/bin/env bash
 # Tags: no-fasttest

-set -e
-
 CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 # shellcheck source=../shell_config.sh
 . "$CURDIR"/../shell_config.sh

-function stress()
-{
-    # We set up a signal handler to make sure to wait for all queries to be finished before ending
-    CONTINUE=true
-    handle_interruption()
-    {
-        CONTINUE=false
-    }
-    trap handle_interruption INT
-
-    while $CONTINUE; do
-        ${CLICKHOUSE_CLIENT} --query "CREATE TABLE IF NOT EXISTS table (x UInt8) ENGINE = MergeTree ORDER BY tuple()" 2>/dev/null
-        ${CLICKHOUSE_CLIENT} --query "DROP TABLE table" 2>/dev/null
-    done
-
-    trap - INT
-}
-
-# https://stackoverflow.com/questions/9954794/execute-a-shell-function-with-timeout
-export -f stress
-
-for _ in {1..5}; do
-    # Ten seconds are just barely enough to reproduce the issue in most of runs.
-    timeout -s INT 10 bash -c stress &
-done
-
+yes 'CREATE TABLE IF NOT EXISTS table (x UInt8) ENGINE = MergeTree ORDER BY tuple();' | head -n 1000 | $CLICKHOUSE_CLIENT --ignore-error -nm 2>/dev/null &
+yes 'DROP TABLE table;' | head -n 1000 | $CLICKHOUSE_CLIENT --ignore-error -nm 2>/dev/null &
 wait

-echo
-${CLICKHOUSE_CLIENT} --query "DROP TABLE IF EXISTS table";
+${CLICKHOUSE_CLIENT} --query "DROP TABLE IF EXISTS table"
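
A back-of-the-envelope check of why the patch above batches the statements into two long-lived clients. The per-invocation figure is an assumption taken from the ~1-2 seconds quoted in the commit message; the statement count matches the head -n 1000 in the new script.

    #include <iostream>

    int main()
    {
        const double startup_seconds = 1.5;   // assumed mid-point of the quoted 1-2 s
        const int statements = 1000;

        // One clickhouse-client process per statement pays the start-up cost every time.
        std::cout << "one client per statement: " << startup_seconds * statements
                  << " s of overhead\n";                 // 1500 s, roughly 25 minutes
        // Piping all statements into one client pays it once.
        std::cout << "one client per batch:     " << startup_seconds
                  << " s of overhead\n";                 // 1.5 s
    }
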
From d09a5e8fd7194cbdd23919570d05fa73b87f953c Mon Sep 17 00:00:00 2001
From: Nikolai Kochetov
Date: Thu, 20 Oct 2022 21:46:26 +0200
Subject: [PATCH 7/8] Revert "Attempt to fix abort from parallel parsing (#42496)"

This reverts commit 4d703b792c122bb32a4291694b7382c103c6073d.
---
 src/Common/ThreadPool.h | 20 +-------------------
 1 file changed, 1 insertion(+), 19 deletions(-)

diff --git a/src/Common/ThreadPool.h b/src/Common/ThreadPool.h
index b3ab20ae592..76ada9e0d75 100644
--- a/src/Common/ThreadPool.h
+++ b/src/Common/ThreadPool.h
@@ -178,11 +178,7 @@ public:
             func = std::forward<Function>(func),
             args = std::make_tuple(std::forward<Args>(args)...)]() mutable /// mutable is needed to destroy capture
         {
-            SCOPE_EXIT(
-            {
-                state->finished = true;
-                state->event.set();
-            });
+            SCOPE_EXIT(state->event.set());

             state->thread_id = std::this_thread::get_id();

@@ -217,17 +213,6 @@ public:

     ~ThreadFromGlobalPoolImpl()
     {
-        /// The problem is that the our ThreadFromGlobalPool can be actually finished
-        /// before we try to join the thread or check whether it is joinable or not.
-        /// In some places we have code like:
-        /// if (thread->joinable())
-        ///     thread->join();
-        /// Where join() won't be executed in case when we call it
-        /// from the same std::thread and it will end to std::abort().
-        /// So we just do nothing in this case
-        if (state->finished)
-            return;
-
         if (initialized())
             abort();
     }
@@ -267,9 +252,6 @@ protected:

     /// The state used in this object and inside the thread job.
     Poco::Event event;
-
-    /// To allow joining to the same std::thread after finishing
-    std::atomic<bool> finished{false};
 };

 std::shared_ptr<State> state;
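
A minimal standalone demonstration (not ClickHouse code) of the failure mode described in the comment that this revert removes: the C++ standard requires join() to throw std::system_error with resource_deadlock_would_occur when a std::thread is joined from inside itself, rather than deadlocking silently.

    #include <chrono>
    #include <iostream>
    #include <system_error>
    #include <thread>

    int main()
    {
        std::thread t;
        t = std::thread([&t]
        {
            // Crude synchronization for the sketch: give the move-assignment in
            // main() time to complete before the lambda inspects `t`.
            std::this_thread::sleep_for(std::chrono::milliseconds(100));
            try
            {
                t.join();   // self-join: detected and reported by the runtime
            }
            catch (const std::system_error & e)
            {
                std::cout << "self-join failed: " << e.code().message() << '\n';
            }
        });
        t.join();   // joining from a different thread succeeds
    }
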
From b720030ac645aaef4c4850471d0449b34b82b842 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Fri, 21 Oct 2022 00:13:30 +0200
Subject: [PATCH 8/8] Remove outdated documentation

---
 docs/en/development/architecture.md | 20 +++-----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

diff --git a/docs/en/development/architecture.md b/docs/en/development/architecture.md
index c13b2519b84..fe644c43889 100644
--- a/docs/en/development/architecture.md
+++ b/docs/en/development/architecture.md
@@ -49,27 +49,13 @@ When we calculate some function over columns in a block, we add another column w

 Blocks are created for every processed chunk of data. Note that for the same type of calculation, the column names and types remain the same for different blocks, and only column data changes. It is better to split block data from the block header because small block sizes have a high overhead of temporary strings for copying shared_ptrs and column names.

-## Block Streams {#block-streams}
+## Processors

-Block streams are for processing data. We use streams of blocks to read data from somewhere, perform data transformations, or write data to somewhere. `IBlockInputStream` has the `read` method to fetch the next block while available. `IBlockOutputStream` has the `write` method to push the block somewhere.
-
-Streams are responsible for:
-
-1. Reading or writing to a table. The table just returns a stream for reading or writing blocks.
-2. Implementing data formats. For example, if you want to output data to a terminal in `Pretty` format, you create a block output stream where you push blocks, and it formats them.
-3. Performing data transformations. Let’s say you have `IBlockInputStream` and want to create a filtered stream. You create `FilterBlockInputStream` and initialize it with your stream. Then when you pull a block from `FilterBlockInputStream`, it pulls a block from your stream, filters it, and returns the filtered block to you. Query execution pipelines are represented this way.
-
-There are more sophisticated transformations. For example, when you pull from `AggregatingBlockInputStream`, it reads all data from its source, aggregates it, and then returns a stream of aggregated data for you. Another example: `UnionBlockInputStream` accepts many input sources in the constructor and also a number of threads. It launches multiple threads and reads from multiple sources in parallel.
-
-> Block streams use the “pull” approach to control flow: when you pull a block from the first stream, it consequently pulls the required blocks from nested streams, and the entire execution pipeline will work. Neither “pull” nor “push” is the best solution, because control flow is implicit, and that limits the implementation of various features like simultaneous execution of multiple queries (merging many pipelines together). This limitation could be overcome with coroutines or just running extra threads that wait for each other. We may have more possibilities if we make control flow explicit: if we locate the logic for passing data from one calculation unit to another outside of those calculation units. Read this [article](http://journal.stuffwithstuff.com/2013/01/13/iteration-inside-and-out/) for more thoughts.
-
-We should note that the query execution pipeline creates temporary data at each step. We try to keep block size small enough so that temporary data fits in the CPU cache. With that assumption, writing and reading temporary data is almost free in comparison with other calculations. We could consider an alternative, which is to fuse many operations in the pipeline together. It could make the pipeline as short as possible and remove much of the temporary data, which could be an advantage, but it also has drawbacks. For example, a split pipeline makes it easy to implement caching intermediate data, stealing intermediate data from similar queries running at the same time, and merging pipelines for similar queries.
+See the description at [https://github.com/ClickHouse/ClickHouse/blob/master/src/Processors/IProcessor.h](https://github.com/ClickHouse/ClickHouse/blob/master/src/Processors/IProcessor.h).

 ## Formats {#formats}

-Data formats are implemented with block streams. There are “presentational” formats only suitable for the output of data to the client, such as `Pretty` format, which provides only `IBlockOutputStream`. And there are input/output formats, such as `TabSeparated` or `JSONEachRow`.
-
-There are also row streams: `IRowInputStream` and `IRowOutputStream`. They allow you to pull/push data by individual rows, not by blocks. And they are only needed to simplify the implementation of row-oriented formats. The wrappers `BlockInputStreamFromRowInputStream` and `BlockOutputStreamFromRowOutputStream` allow you to convert row-oriented streams to regular block-oriented streams.
+Data formats are implemented with processors.

 ## I/O {#io}
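
For reference, a compact sketch of the “pull” model that the removed documentation described (toy types, not the old IBlockInputStream API and not the Processors framework that replaced it): each stream implements read(), and one pull on the outermost stream transitively drives the nested ones. The new text points to src/Processors/IProcessor.h for the explicit-control-flow design that superseded this.

    #include <iostream>
    #include <vector>

    // An empty block signals end-of-stream in this toy.
    struct Block { std::vector<int> values; };

    struct SourceStream
    {
        int next = 0;
        Block read() { return next < 9 ? Block{{next++, next++, next++}} : Block{}; }
    };

    struct FilterStream
    {
        SourceStream & source;
        Block read()
        {
            Block in = source.read();   // pull from the nested stream on demand
            Block out;
            for (int v : in.values)
                if (v % 2 == 0)
                    out.values.push_back(v);   // keep even values only
            return out;
        }
    };

    int main()
    {
        SourceStream source;
        FilterStream filter{source};
        while (true)
        {
            Block b = filter.read();    // one pull drives the whole chain
            if (b.values.empty())
                break;
            for (int v : b.values)
                std::cout << v << ' ';
        }
        std::cout << '\n';              // prints: 0 2 4 6 8
    }
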