From 50002653fe66851a4c4f2af349db29d06c65fe87 Mon Sep 17 00:00:00 2001 From: "ducle.canh" Date: Wed, 21 Jun 2023 12:11:13 +0800 Subject: [PATCH 1/5] add a test to limit client max opening fd --- .../0_stateless/02790_client_max_opening_fd.reference | 1 + tests/queries/0_stateless/02790_client_max_opening_fd.sh | 9 +++++++++ 2 files changed, 10 insertions(+) create mode 100644 tests/queries/0_stateless/02790_client_max_opening_fd.reference create mode 100755 tests/queries/0_stateless/02790_client_max_opening_fd.sh diff --git a/tests/queries/0_stateless/02790_client_max_opening_fd.reference b/tests/queries/0_stateless/02790_client_max_opening_fd.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/02790_client_max_opening_fd.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/02790_client_max_opening_fd.sh b/tests/queries/0_stateless/02790_client_max_opening_fd.sh new file mode 100755 index 00000000000..289486ce389 --- /dev/null +++ b/tests/queries/0_stateless/02790_client_max_opening_fd.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +ulimit -n 1024 + +${CLICKHOUSE_CLIENT} --query "SELECT 1" From cc01b81a5ff607ea2c18f68cfc74bb07d58d8bfa Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 22 Jun 2023 01:26:04 +0300 Subject: [PATCH 2/5] Update 02790_client_max_opening_fd.sh --- tests/queries/0_stateless/02790_client_max_opening_fd.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02790_client_max_opening_fd.sh b/tests/queries/0_stateless/02790_client_max_opening_fd.sh index 289486ce389..ecc05d32050 100755 --- a/tests/queries/0_stateless/02790_client_max_opening_fd.sh +++ b/tests/queries/0_stateless/02790_client_max_opening_fd.sh @@ -4,6 +4,6 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh +# Ensure that clickhouse-client does not open a large number of files. ulimit -n 1024 - ${CLICKHOUSE_CLIENT} --query "SELECT 1" From 10662b64256dc85469b7f03dc73e8c889b2ae07c Mon Sep 17 00:00:00 2001 From: daviddhc20120601 Date: Tue, 1 Aug 2023 15:35:25 +0800 Subject: [PATCH 3/5] Update annindexes.md explain more about l2distance and cosine distance Update annindexes.md explain more about l2distance and cosine distance --- .../table-engines/mergetree-family/annindexes.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index 4a4ebb47bdc..5944048f6c3 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -193,6 +193,19 @@ index creation, `L2Distance` is used as default. Parameter `NumTrees` is the num specified: 100). Higher values of `NumTree` mean more accurate search results but slower index creation / query times (approximately linearly) as well as larger index sizes. 
+`L2Distance` is also called Euclidean distance. The Euclidean distance between two points in Euclidean space is the length of the line segment between the two points. +For example, for two points P(p1,p2) and Q(q1,q2), their distance is `d(p,q) = sqrt((p1 - q1)^2 + (p2 - q2)^2)`. +![L2Distance](https://en.wikipedia.org/wiki/Euclidean_distance#/media/File:Euclidean_distance_2d.svg) + +`cosineDistance` is derived from cosine similarity (it equals one minus the cosine similarity), which is a measure of similarity between two non-zero vectors defined in an inner product space. Cosine similarity is the cosine of the angle between the vectors; that is, it is the dot product of the vectors divided by the product of their lengths. +![cosineDistance](https://www.tyrrell4innovation.ca/wp-content/uploads/2021/06/rsz_jenny_du_miword.png) + +The Euclidean distance corresponds to the L2-norm of the difference between two vectors. The cosine similarity is proportional to the dot product of two vectors and inversely proportional to the product of their magnitudes. +![compare](https://www.researchgate.net/publication/320914786/figure/fig2/AS:558221849841664@1510101868614/The-difference-between-Euclidean-distance-and-cosine-similarity.png) +In one sentence: cosine similarity cares only about the angle between the vectors, not about the "distance" between them as we normally think of it. +![L2 distance](https://www.baeldung.com/wp-content/uploads/sites/4/2020/06/4-1.png) +![cosineDistance](https://www.baeldung.com/wp-content/uploads/sites/4/2020/06/5.png) + :::note Indexes over columns of type `Array` will generally work faster than indexes on `Tuple` columns. All arrays **must** have same length. Use [CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints) to avoid errors. 
For example, `CONSTRAINT constraint_name_1 From 9caef8b4a56e94d684f18318af0928c437ae2b83 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 1 Aug 2023 11:18:53 +0000 Subject: [PATCH 4/5] Try to fix a rare fail in 00612_http_max_query_size --- tests/queries/0_stateless/00612_http_max_query_size.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/00612_http_max_query_size.sh b/tests/queries/0_stateless/00612_http_max_query_size.sh index cfcae330b85..6289470c21e 100755 --- a/tests/queries/0_stateless/00612_http_max_query_size.sh +++ b/tests/queries/0_stateless/00612_http_max_query_size.sh @@ -36,7 +36,7 @@ def gen_data(q): pattern = ''' or toString(number) = '{}'\n''' - for i in range(1, 4 * 1024): + for i in range(0, 1024 * 2): yield pattern.format(str(i).zfill(1024 - len(pattern) + 2)).encode() s = requests.Session() From cd74da4c032a6134d484fb7cbab863992e3a999c Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 2 Aug 2023 21:21:40 +0200 Subject: [PATCH 5/5] Remove unused code in StorageSystemStackTrace This columns mask is actually useless, since the main work is done without it, in particular: - detecting whether the signal should be sent to a thread - detecting whether the thread name should be read for a thread And this cannot be done with a columns mask, because multiple columns depend on signals and thread names. 
Signed-off-by: Azat Khuzhin --- src/Storages/System/StorageSystemStackTrace.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/Storages/System/StorageSystemStackTrace.cpp b/src/Storages/System/StorageSystemStackTrace.cpp index 887d7f2a5d4..8d703632c68 100644 --- a/src/Storages/System/StorageSystemStackTrace.cpp +++ b/src/Storages/System/StorageSystemStackTrace.cpp @@ -275,15 +275,6 @@ Pipe StorageSystemStackTrace::read( Block sample_block = storage_snapshot->metadata->getSampleBlock(); - std::vector columns_mask(sample_block.columns()); - for (size_t i = 0, size = columns_mask.size(); i < size; ++i) - { - if (names_set.contains(sample_block.getByPosition(i).name)) - { - columns_mask[i] = 1; - } - } - bool send_signal = names_set.contains("trace") || names_set.contains("query_id"); bool read_thread_names = names_set.contains("thread_name");