From d7129855757f38ceec3e4ecc6dafacdabe9b178f Mon Sep 17 00:00:00 2001
From: vdimir
Date: Fri, 6 May 2022 15:17:46 +0000
Subject: [PATCH 01/11] Minor style changes for ConcurrentHashJoin

---
 src/Interpreters/ConcurrentHashJoin.cpp    | 89 +++++++++++--------
 src/Interpreters/ConcurrentHashJoin.h      |  7 +-
 src/Interpreters/ExpressionAnalyzer.cpp    |  2 +-
 src/QueryPipeline/QueryPipelineBuilder.cpp |  2 +-
 .../1_stateful/00172_parallel_join.sql     |  3 +-
 5 files changed, 59 insertions(+), 44 deletions(-)

diff --git a/src/Interpreters/ConcurrentHashJoin.cpp b/src/Interpreters/ConcurrentHashJoin.cpp
index f4e49d3230d..7dfe6a5549e 100644
--- a/src/Interpreters/ConcurrentHashJoin.cpp
+++ b/src/Interpreters/ConcurrentHashJoin.cpp
@@ -16,25 +16,28 @@
 #include
 #include
 #include
+#include
 #include
+#include
+
 namespace DB
 {
+
 namespace ErrorCodes
 {
     extern const int LOGICAL_ERROR;
     extern const int SET_SIZE_LIMIT_EXCEEDED;
     extern const int BAD_ARGUMENTS;
 }
 
-namespace JoinStuff
-{
+
 ConcurrentHashJoin::ConcurrentHashJoin(ContextPtr context_, std::shared_ptr<TableJoin> table_join_, size_t slots_, const Block & right_sample_block, bool any_take_last_row_)
     : context(context_)
     , table_join(table_join_)
     , slots(slots_)
 {
-    if (!slots_ || slots_ >= 256)
+    if (slots < 1 || 256 < slots)
     {
-        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid argument slot : {}", slots_);
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Number of slots should be [1, 255], got {}", slots);
     }
 
     for (size_t i = 0; i < slots; ++i)
@@ -43,36 +46,45 @@ ConcurrentHashJoin::ConcurrentHashJoin(ContextPtr context_, std::shared_ptr<Tabl
     {
         auto inner_hash_join = std::make_shared<InternalHashJoin>();
         inner_hash_join->data = std::make_unique<HashJoin>(table_join_, right_sample_block, any_take_last_row_);
         hash_joins.emplace_back(std::move(inner_hash_join));
     }
-
 }
 
-bool ConcurrentHashJoin::addJoinedBlock(const Block & block, bool check_limits)
+bool ConcurrentHashJoin::addJoinedBlock(const Block & right_block, bool check_limits)
 {
-    Blocks dispatched_blocks = dispatchBlock(table_join->getOnlyClause().key_names_right, block);
+    Blocks dispatched_blocks = dispatchBlock(table_join->getOnlyClause().key_names_right, right_block);
 
-    std::list<size_t> pending_blocks;
-    for (size_t i = 0; i < dispatched_blocks.size(); ++i)
-        pending_blocks.emplace_back(i);
-    while (!pending_blocks.empty())
+    std::atomic<size_t> blocks_left = 0;
+    for (const auto & block : dispatched_blocks)
     {
-        for (auto iter = pending_blocks.begin(); iter != pending_blocks.end();)
+        if (block)
+        {
+            ++blocks_left;
+        }
+    }
+
+    while (blocks_left > 0)
+    {
+        /// insert blocks into corresponding HashJoin instances
+        for (size_t i = 0; i < dispatched_blocks.size(); ++i)
         {
-            auto & i = *iter;
             auto & hash_join = hash_joins[i];
             auto & dispatched_block = dispatched_blocks[i];
-            if (hash_join->mutex.try_lock())
             {
-                if (!hash_join->data->addJoinedBlock(dispatched_block, check_limits))
-                {
-                    hash_join->mutex.unlock();
-                    return false;
-                }
+                /// if current hash_join is already processed by another thread, skip it and try later
+                std::unique_lock lock(hash_join->mutex, std::try_to_lock);
+                if (!lock.owns_lock())
+                    continue;
 
-                hash_join->mutex.unlock();
-                iter = pending_blocks.erase(iter);
+                if (!dispatched_block)
+                    continue;
+
+                bool limit_exceeded = !hash_join->data->addJoinedBlock(dispatched_block, check_limits);
+
+                dispatched_block = {};
+                blocks_left--;
+
+                if (limit_exceeded)
+                    return false;
             }
-            else
-                iter++;
         }
     }
 
@@ -161,30 +173,32 @@ std::shared_ptr<NotJoinedBlocks> ConcurrentHashJoin::getNonJoinedBlocks(
         throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid join type. join kind: {}, strictness: {}", table_join->kind(), table_join->strictness());
 }
 
+static IColumn::Selector hashToSelector(const WeakHash32 & hash, size_t num_shards)
+{
+    const auto & data = hash.getData();
+    size_t num_rows = data.size();
+
+    IColumn::Selector selector(num_rows);
+    for (size_t i = 0; i < num_rows; ++i)
+        selector[i] = data[i] % num_shards;
+    return selector;
+}
+
 Blocks ConcurrentHashJoin::dispatchBlock(const Strings & key_columns_names, const Block & from_block)
 {
-    Blocks result;
-
     size_t num_shards = hash_joins.size();
     size_t num_rows = from_block.rows();
     size_t num_cols = from_block.columns();
 
-    ColumnRawPtrs key_cols;
+    WeakHash32 hash(num_rows);
     for (const auto & key_name : key_columns_names)
     {
-        key_cols.push_back(from_block.getByName(key_name).column.get());
-    }
-    IColumn::Selector selector(num_rows);
-    for (size_t i = 0; i < num_rows; ++i)
-    {
-        SipHash hash;
-        for (const auto & key_col : key_cols)
-        {
-            key_col->updateHashWithValue(i, hash);
-        }
-        selector[i] = hash.get64() % num_shards;
+        const auto & key_col = from_block.getByName(key_name).column;
+        key_col->updateWeakHash32(hash);
     }
+    auto selector = hashToSelector(hash, num_shards);
 
+    Blocks result;
     for (size_t i = 0; i < num_shards; ++i)
     {
         result.emplace_back(from_block.cloneEmpty());
@@ -203,4 +217,3 @@ Blocks ConcurrentHashJoin::dispatchBlock(const Strings & key_columns_names, cons
 }
 
-}

diff --git a/src/Interpreters/ConcurrentHashJoin.h b/src/Interpreters/ConcurrentHashJoin.h
index 47fa2b2112f..fb226c39a0c 100644
--- a/src/Interpreters/ConcurrentHashJoin.h
+++ b/src/Interpreters/ConcurrentHashJoin.h
@@ -15,8 +15,7 @@
 namespace DB
 {
 
-namespace JoinStuff
-{
+
 /**
  * Can run addJoinedBlock() parallelly to speedup the join process. On test, it almose linear speedup by
  * the degree of parallelism.
@@ -33,6 +32,7 @@
  */
 class ConcurrentHashJoin : public IJoin
 {
+
 public:
     explicit ConcurrentHashJoin(ContextPtr context_, std::shared_ptr<TableJoin> table_join_, size_t slots_, const Block & right_sample_block, bool any_take_last_row_ = false);
     ~ConcurrentHashJoin() override = default;
@@ -49,6 +49,7 @@ public:
     bool supportParallelJoin() const override { return true; }
     std::shared_ptr<NotJoinedBlocks> getNonJoinedBlocks(const Block & left_sample_block, const Block & result_sample_block, UInt64 max_block_size) const override;
+
 private:
     struct InternalHashJoin
     {
@@ -71,5 +72,5 @@ private:
     Blocks dispatchBlock(const Strings & key_columns_names, const Block & from_block);
 };
 
-}
+
 }

diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp
index 0a156ba0b3e..ee69e39782d 100644
--- a/src/Interpreters/ExpressionAnalyzer.cpp
+++ b/src/Interpreters/ExpressionAnalyzer.cpp
@@ -939,7 +939,7 @@ static std::shared_ptr<IJoin> chooseJoinAlgorithm(std::shared_ptr<TableJoin> ana
 {
     if (analyzed_join->allowParallelHashJoin())
     {
-        return std::make_shared<JoinStuff::ConcurrentHashJoin>(context, analyzed_join, context->getSettings().max_threads, sample_block);
+        return std::make_shared<ConcurrentHashJoin>(context, analyzed_join, context->getSettings().max_threads, sample_block);
     }
     return std::make_shared<HashJoin>(analyzed_join, sample_block);
 }

diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp
index 5e074861110..012a825a9d5 100644
--- a/src/QueryPipeline/QueryPipelineBuilder.cpp
+++ b/src/QueryPipeline/QueryPipelineBuilder.cpp
@@ -347,7 +347,7 @@ std::unique_ptr<QueryPipelineBuilder> QueryPipelineBuilder::joinPipelines(
     ///                        ╞> FillingJoin ─> Resize ╣        ╞> Joining ─> (totals)
     ///                        (totals) ─────────┘      ╙─────┘
 
-    auto num_streams = left->getNumStreams();
+    size_t num_streams = left->getNumStreams();
 
     if (join->supportParallelJoin() && !right->hasTotals())
     {

diff --git a/tests/queries/1_stateful/00172_parallel_join.sql b/tests/queries/1_stateful/00172_parallel_join.sql
index fce41d7a761..36b12a43b88 100644
--- a/tests/queries/1_stateful/00172_parallel_join.sql
+++ b/tests/queries/1_stateful/00172_parallel_join.sql
@@ -1,4 +1,5 @@
-set join_algorithm='parallel_hash';
+SET join_algorithm='parallel_hash';
+
 SELECT
     EventDate,
     hits,

From 43935f71f7ca0e5ff72db3d6222cfd00805c498b Mon Sep 17 00:00:00 2001
From: Vladimir C
Date: Tue, 10 May 2022 14:09:49 +0200
Subject: [PATCH 02/11] Update ConcurrentHashJoin.cpp

---
 src/Interpreters/ConcurrentHashJoin.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/Interpreters/ConcurrentHashJoin.cpp b/src/Interpreters/ConcurrentHashJoin.cpp
index 7dfe6a5549e..f4ab8555d5f 100644
--- a/src/Interpreters/ConcurrentHashJoin.cpp
+++ b/src/Interpreters/ConcurrentHashJoin.cpp
@@ -18,7 +18,6 @@
 #include
 #include
 #include
-#include
 
 namespace DB
 {
@@ -52,7 +51,7 @@ bool ConcurrentHashJoin::addJoinedBlock(const Block & right_block, bool check_li
 {
     Blocks dispatched_blocks = dispatchBlock(table_join->getOnlyClause().key_names_right, right_block);
 
-    std::atomic<size_t> blocks_left = 0;
+    size_t blocks_left = 0;
     for (const auto & block : dispatched_blocks)
     {
         if (block)
@@ -68,15 +67,14 @@ bool ConcurrentHashJoin::addJoinedBlock(const Block & right_block, bool check_li
         {
             auto & hash_join = hash_joins[i];
             auto & dispatched_block = dispatched_blocks[i];
+
+            if (dispatched_block)
             {
                 /// if current hash_join is already processed by another thread, skip it and try later
                 std::unique_lock lock(hash_join->mutex, std::try_to_lock);
                 if (!lock.owns_lock())
                     continue;
 
-                if (!dispatched_block)
-                    continue;
-
                 bool limit_exceeded = !hash_join->data->addJoinedBlock(dispatched_block, check_limits);
 
                 dispatched_block = {};

From 4065b83ba99c6b394e5b4d9ec7fa905d420c795a Mon Sep 17 00:00:00 2001
From: Vladimir C
Date: Tue, 10 May 2022 14:14:58 +0200
Subject: [PATCH 03/11] Update ConcurrentHashJoin.cpp

---
 src/Interpreters/ConcurrentHashJoin.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Interpreters/ConcurrentHashJoin.cpp b/src/Interpreters/ConcurrentHashJoin.cpp
index f4ab8555d5f..f6ba9f95bbc 100644
--- a/src/Interpreters/ConcurrentHashJoin.cpp
+++ b/src/Interpreters/ConcurrentHashJoin.cpp
@@ -34,7 +34,7 @@ ConcurrentHashJoin::ConcurrentHashJoin(ContextPtr context_, std::shared_ptr<Tabl
     , slots(slots_)
 {
-    if (slots < 1 || 256 < slots)
+    if (slots < 1 || 255 < slots)
     {
         throw Exception(ErrorCodes::BAD_ARGUMENTS, "Number of slots should be [1, 255], got {}", slots);
     }

From: Vxider
Date: Wed, 11 May 2022 09:00:49 +0000
Subject: [PATCH 04/11] support read for windowview

---
 src/Storages/WindowView/StorageWindowView.cpp | 72 +++++++++++++++++++
 src/Storages/WindowView/StorageWindowView.h   | 19 +++++
 2 files changed, 91 insertions(+)

diff --git a/src/Storages/WindowView/StorageWindowView.cpp b/src/Storages/WindowView/StorageWindowView.cpp
index 9462e115087..0f8d27772d4 100644
--- a/src/Storages/WindowView/StorageWindowView.cpp
+++ b/src/Storages/WindowView/StorageWindowView.cpp
@@ -37,6 +37,11 @@
 #include
 #include
 #include
+#include
+#include
+#include
+#include
+#include
 #include
 #include
 #include
@@ -965,6 +970,73 @@ void StorageWindowView::threadFuncFireEvent()
     }
 }
 
+Pipe StorageWindowView::read(
+    const Names & column_names,
+    const StorageSnapshotPtr & storage_snapshot,
+    SelectQueryInfo & query_info,
+    ContextPtr local_context,
+    QueryProcessingStage::Enum processed_stage,
+    const size_t max_block_size,
+    const unsigned num_streams)
+{
+    QueryPlan plan;
+    read(plan, column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams);
+    return plan.convertToPipe(
+        QueryPlanOptimizationSettings::fromContext(local_context), BuildQueryPipelineSettings::fromContext(local_context));
+}
+
+void StorageWindowView::read(
+    QueryPlan & query_plan,
+    const Names & column_names,
+    const StorageSnapshotPtr & storage_snapshot,
+    SelectQueryInfo & query_info,
+    ContextPtr local_context,
+    QueryProcessingStage::Enum processed_stage,
+    const size_t max_block_size,
+    const unsigned num_streams)
+{
+    auto storage = getTargetStorage();
+    auto lock = storage->lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout);
+    auto target_metadata_snapshot = storage->getInMemoryMetadataPtr();
+    auto target_storage_snapshot = storage->getStorageSnapshot(target_metadata_snapshot, local_context);
+
+    if (query_info.order_optimizer)
+        query_info.input_order_info = query_info.order_optimizer->getInputOrder(target_metadata_snapshot, local_context);
+
+    storage->read(query_plan, column_names, target_storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams);
+
+    if (query_plan.isInitialized())
+    {
+        auto wv_header = getHeaderForProcessingStage(column_names, storage_snapshot, query_info, local_context, processed_stage);
+        auto target_header = query_plan.getCurrentDataStream().header;
+
+        if (!blocksHaveEqualStructure(wv_header, target_header))
+        {
+            auto converting_actions = ActionsDAG::makeConvertingActions(
+                target_header.getColumnsWithTypeAndName(), wv_header.getColumnsWithTypeAndName(), ActionsDAG::MatchColumnsMode::Name);
+            auto converting_step = std::make_unique<ExpressionStep>(query_plan.getCurrentDataStream(), converting_actions);
+            converting_step->setStepDescription("Convert Target table structure to WindowView structure");
+            query_plan.addStep(std::move(converting_step));
+        }
+
+        StreamLocalLimits limits;
+        SizeLimits leaf_limits;
+
+        /// Add table lock for target table.
+        auto adding_limits_and_quota = std::make_unique<SettingQuotaAndLimitsStep>(
+            query_plan.getCurrentDataStream(),
+            storage,
+            std::move(lock),
+            limits,
+            leaf_limits,
+            nullptr,
+            nullptr);
+
+        adding_limits_and_quota->setStepDescription("Lock target table for WindowView");
+        query_plan.addStep(std::move(adding_limits_and_quota));
+    }
+}
+
 Pipe StorageWindowView::watch(
     const Names & /*column_names*/,
     const SelectQueryInfo & query_info,

diff --git a/src/Storages/WindowView/StorageWindowView.h b/src/Storages/WindowView/StorageWindowView.h
index 397e4d4946c..c09923f2859 100644
--- a/src/Storages/WindowView/StorageWindowView.h
+++ b/src/Storages/WindowView/StorageWindowView.h
@@ -137,6 +137,25 @@ public:
     void startup() override;
     void shutdown() override;
 
+    Pipe read(
+        const Names & column_names,
+        const StorageSnapshotPtr & storage_snapshot,
+        SelectQueryInfo & query_info,
+        ContextPtr context,
+        QueryProcessingStage::Enum processed_stage,
+        size_t max_block_size,
+        unsigned num_streams) override;
+
+    void read(
+        QueryPlan & query_plan,
+        const Names & column_names,
+        const StorageSnapshotPtr & storage_snapshot,
+        SelectQueryInfo & query_info,
+        ContextPtr context,
+        QueryProcessingStage::Enum processed_stage,
+        size_t max_block_size,
+        unsigned num_streams) override;
+
     Pipe watch(
         const Names & column_names,
         const SelectQueryInfo & query_info,

From 88930ae42f3010f89a732c9131b59875c7a06d6e Mon Sep 17 00:00:00 2001
From: Vxider
Date: Wed, 11 May 2022 09:01:12 +0000
Subject: [PATCH 05/11] add test

---
 .../01076_window_view_select.reference       | 14 ++++++++
 .../0_stateless/01076_window_view_select.sh  | 35 +++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 tests/queries/0_stateless/01076_window_view_select.reference
 create mode 100755 tests/queries/0_stateless/01076_window_view_select.sh

diff --git a/tests/queries/0_stateless/01076_window_view_select.reference b/tests/queries/0_stateless/01076_window_view_select.reference
new file mode 100644
index 00000000000..febbcc49a76
--- /dev/null
+++ b/tests/queries/0_stateless/01076_window_view_select.reference
@@ -0,0 +1,14 @@
+1 1 1990-01-01 12:00:05
+1 2 1990-01-01 12:00:05
+1 3 1990-01-01 12:00:05
+1 4 1990-01-01 12:00:10
+1 5 1990-01-01 12:00:10
+1 6 1990-01-01 12:00:15
+1 7 1990-01-01 12:00:15
+1
+2
+3
+4
+5
+6
+7

diff --git a/tests/queries/0_stateless/01076_window_view_select.sh b/tests/queries/0_stateless/01076_window_view_select.sh
new file mode 100755
index 00000000000..cbcc472c127
--- /dev/null
+++ b/tests/queries/0_stateless/01076_window_view_select.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+$CLICKHOUSE_CLIENT --multiquery <

From: Vxider
Date: Wed, 11 May 2022 09:16:26 +0000
Subject: [PATCH 06/11] disable read when windowview has no target table

---
 src/Storages/WindowView/StorageWindowView.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/Storages/WindowView/StorageWindowView.cpp b/src/Storages/WindowView/StorageWindowView.cpp
index 0f8d27772d4..7ab3fae2675 100644
--- a/src/Storages/WindowView/StorageWindowView.cpp
+++ b/src/Storages/WindowView/StorageWindowView.cpp
@@ -995,6 +995,9 @@ void StorageWindowView::read(
     const size_t max_block_size,
     const unsigned num_streams)
 {
+    if (target_table_id.empty())
+        return;
+
     auto storage = getTargetStorage();
     auto lock = storage->lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout);
     auto target_metadata_snapshot = storage->getInMemoryMetadataPtr();

From 17608b3d93b1ee3ff0565b597d5f89ff81cf0b2d Mon Sep 17 00:00:00 2001
From: Dmitry Novik
Date: Wed, 11 May 2022 16:18:41 +0000
Subject: [PATCH 07/11] Update documentation and defaults for memory overcommit

---
 .../settings.md                              | 10 +++++
 .../operations/settings/memory-overcommit.md | 37 +++++++++++++++++++
 docs/en/operations/settings/settings.md      | 26 +++++++++++++
 src/Core/Settings.h                          |  8 ++--
 src/Interpreters/ProcessList.cpp             |  4 +-
 src/Storages/MergeTree/MergeList.cpp         |  2 +-
 .../test_global_overcommit_tracker/test.py   |  4 +-
 .../0_stateless/02104_overcommit_memory.sh   |  4 +-
 8 files changed, 84 insertions(+), 11 deletions(-)
 create mode 100644 docs/en/operations/settings/memory-overcommit.md

diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md
index f235fba84f7..fd5c2a187b5 100644
--- a/docs/en/operations/server-configuration-parameters/settings.md
+++ b/docs/en/operations/server-configuration-parameters/settings.md
@@ -1745,3 +1745,13 @@ Possible values:
 -   Positive integer.
 
 Default value: `10000`.
+
+## global_memory_usage_overcommit_max_wait_microseconds {#global_memory_usage_overcommit_max_wait_microseconds}
+
+Sets maximum waiting time for global overcommit tracker.
+
+Possible values:
+
+-   Positive integer.
+
+Default value: `200`.

diff --git a/docs/en/operations/settings/memory-overcommit.md b/docs/en/operations/settings/memory-overcommit.md
new file mode 100644
index 00000000000..b0c2e3bd35e
--- /dev/null
+++ b/docs/en/operations/settings/memory-overcommit.md
@@ -0,0 +1,37 @@
+# Memory overcommit
+
+Memory overcommit is an experimental technique intended to allow to set more flexible memory limits for queries.
+
+The idea of this technique is to introduce settings which can represent guaranteed amount of memory a query can use.
+When memory overcommit is enabled and the memory limit is reached ClickHouse will select the most overcommitted query and try to free memory by killing this query.
+
+When memory limit is reached any query will wait some time during atempt to allocate new memory.
+If timeout is passed and memory is freed, the query continues execution.
+Otherwise an exception will be thrown and the query is killed.
+
+Selection of query to stop or kill is performed by either global or user overcommit trackers depending on what memory limit is reached.
+If overcommit tracker can't choose query to stop, MEMORY_LIMIT_EXCEEDED exception is thrown.
+
+## User overcommit tracker
+
+User overcommit tracker finds a query with the biggest overcommit ratio in the user's query list.
+Overcommit ratio for a query is computed as number of allocated bytes divided by value of `memory_overcommit_ratio_denominator` setting.
+
+If `memory_overcommit_ratio_denominator` for the query is equals to zero, overcommit tracker won't choose this query.
+
+Waiting timeout is set by `memory_usage_overcommit_max_wait_microseconds` setting.
+
+**Example**
+
+```sql
+SELECT number FROM numbers(1000) GROUP BY number SETTINGS max_guaranteed_memory_usage=4000, memory_usage_overcommit_max_wait_microseconds=500
+```
+
+## Global overcommit tracker
+
+Global overcommit tracker finds a query with the biggest overcommit ratio in the list of all queries.
+In this case overcommit ratio is computed as number of allocated bytes divided by value of `memory_overcommit_ratio_denominator_for_user` setting.
+
+If `memory_overcommit_ratio_denominator_for_user` for the query is equals to zero, overcommit tracker won't choose this query.
+
+Waiting timeout is set by `global_memory_usage_overcommit_max_wait_microseconds` parameter in the configuration file.

diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md
index 8f2b9bc86fc..76fbc5f239d 100644
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@@ -4263,3 +4263,29 @@ Possible values:
 -   1 — Enabled.
 
 Default value: 1.
+
+## memory_overcommit_ratio_denominator
+
+It represents soft memory limit in case when hard limit is reached on user level.
+This value is used to compute overcommit ratio for the query.
+Zero means skip the query.
+Read more about [memory overcommit](memory-overcommit.md).
+
+Default value: `1GiB`.
+
+## memory_usage_overcommit_max_wait_microseconds
+
+Maximum time thread will wait for memory to be freed in the case of memory overcommit on a user level.
+If the timeout is reached and memory is not freed, an exception is thrown.
+Read more about [memory overcommit](memory-overcommit.md).
+
+Default value: `200`.
+
+## memory_overcommit_ratio_denominator_for_user
+
+It represents soft memory limit in case when hard limit is reached on global level.
+This value is used to compute overcommit ratio for the query.
+Zero means skip the query.
+Read more about [memory overcommit](memory-overcommit.md).
+
+Default value: `1GiB`.

diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index 76d2dca117b..993e7b759b0 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -22,7 +22,7 @@ namespace DB
 
 class IColumn;
 
-static constexpr UInt64 operator""_Gb(unsigned long long value)
+static constexpr UInt64 operator""_GiB(unsigned long long value)
 {
     return value * 1024 * 1024 * 1024;
 }
@@ -360,14 +360,14 @@ static constexpr UInt64 operator""_Gb(unsigned long long value)
     M(OverflowMode, distinct_overflow_mode, OverflowMode::THROW, "What to do when the limit is exceeded.", 0) \
     \
     M(UInt64, max_memory_usage, 0, "Maximum memory usage for processing of single query. Zero means unlimited.", 0) \
-    M(UInt64, max_guaranteed_memory_usage, 10_Gb, "Maximum guaranteed memory usage for processing of single query. It represents soft limit. Zero means unlimited.", 0) \
+    M(UInt64, memory_overcommit_ratio_denominator, 1_GiB, "It represents soft memory limit on the user level. This value is used to compute query overcommit ratio.", 0) \
     M(UInt64, max_memory_usage_for_user, 0, "Maximum memory usage for processing all concurrently running queries for the user. Zero means unlimited.", 0) \
-    M(UInt64, max_guaranteed_memory_usage_for_user, 10_Gb, "Maximum guaranteed memory usage for processing all concurrently running queries for the user. It represents soft limit. Zero means unlimited.", 0) \
+    M(UInt64, memory_overcommit_ratio_denominator_for_user, 1_GiB, "It represents soft memory limit on the global level. This value is used to compute query overcommit ratio.", 0) \
     M(UInt64, max_untracked_memory, (4 * 1024 * 1024), "Small allocations and deallocations are grouped in thread local variable and tracked or profiled only when amount (in absolute value) becomes larger than specified value. If the value is higher than 'memory_profiler_step' it will be effectively lowered to 'memory_profiler_step'.", 0) \
     M(UInt64, memory_profiler_step, (4 * 1024 * 1024), "Whenever query memory usage becomes larger than every next step in number of bytes the memory profiler will collect the allocating stack trace. Zero means disabled memory profiler. Values lower than a few megabytes will slow down query processing.", 0) \
     M(Float, memory_profiler_sample_probability, 0., "Collect random allocations and deallocations and write them into system.trace_log with 'MemorySample' trace_type. The probability is for every alloc/free regardless to the size of the allocation. Note that sampling happens only when the amount of untracked memory exceeds 'max_untracked_memory'. You may want to set 'max_untracked_memory' to 0 for extra fine grained sampling.", 0) \
     \
-    M(UInt64, memory_usage_overcommit_max_wait_microseconds, 0, "Maximum time thread will wait for memory to be freed in the case of memory overcommit. If timeout is reached and memory is not freed, exception is thrown", 0) \
+    M(UInt64, memory_usage_overcommit_max_wait_microseconds, 200, "Maximum time thread will wait for memory to be freed in the case of memory overcommit on user level. If timeout is reached and memory is not freed, exception is thrown.", 0) \
     \
     M(UInt64, max_network_bandwidth, 0, "The maximum speed of data exchange over the network in bytes per second for a query. Zero means unlimited.", 0) \
     M(UInt64, max_network_bytes, 0, "The maximum number of bytes (compressed) to receive or transmit over the network for execution of the query.", 0) \

diff --git a/src/Interpreters/ProcessList.cpp b/src/Interpreters/ProcessList.cpp
index ac59d2c7235..6c101143234 100644
--- a/src/Interpreters/ProcessList.cpp
+++ b/src/Interpreters/ProcessList.cpp
@@ -212,7 +212,7 @@ ProcessList::EntryPtr ProcessList::insert(const String & query_, const IAST * as
 
         /// Set query-level memory trackers
         thread_group->memory_tracker.setOrRaiseHardLimit(settings.max_memory_usage);
-        thread_group->memory_tracker.setSoftLimit(settings.max_guaranteed_memory_usage);
+        thread_group->memory_tracker.setSoftLimit(settings.memory_overcommit_ratio_denominator);
 
         if (query_context->hasTraceCollector())
         {
@@ -242,7 +242,7 @@ ProcessList::EntryPtr ProcessList::insert(const String & query_, const IAST * as
 
         /// Track memory usage for all simultaneously running queries from single user.
         user_process_list.user_memory_tracker.setOrRaiseHardLimit(settings.max_memory_usage_for_user);
-        user_process_list.user_memory_tracker.setSoftLimit(settings.max_guaranteed_memory_usage_for_user);
+        user_process_list.user_memory_tracker.setSoftLimit(settings.memory_overcommit_ratio_denominator_for_user);
         user_process_list.user_memory_tracker.setDescription("(for user)");
         user_process_list.user_overcommit_tracker.setMaxWaitTime(settings.memory_usage_overcommit_max_wait_microseconds);

diff --git a/src/Storages/MergeTree/MergeList.cpp b/src/Storages/MergeTree/MergeList.cpp
index 11d0bc8c565..8722ddc5a82 100644
--- a/src/Storages/MergeTree/MergeList.cpp
+++ b/src/Storages/MergeTree/MergeList.cpp
@@ -87,7 +87,7 @@ MergeListElement::MergeListElement(
     /// thread_group::memory_tracker, but MemoryTrackerThreadSwitcher will reset parent).
     memory_tracker.setProfilerStep(settings.memory_profiler_step);
     memory_tracker.setSampleProbability(settings.memory_profiler_sample_probability);
-    memory_tracker.setSoftLimit(settings.max_guaranteed_memory_usage);
+    memory_tracker.setSoftLimit(settings.memory_overcommit_ratio_denominator);
     if (settings.memory_tracker_fault_probability)
         memory_tracker.setFaultProbability(settings.memory_tracker_fault_probability);
 
diff --git a/tests/integration/test_global_overcommit_tracker/test.py b/tests/integration/test_global_overcommit_tracker/test.py
index cacc447be1a..d3d56e82f38 100644
--- a/tests/integration/test_global_overcommit_tracker/test.py
+++ b/tests/integration/test_global_overcommit_tracker/test.py
@@ -18,8 +18,8 @@ def start_cluster():
         cluster.shutdown()
 
 
-TEST_QUERY_A = "SELECT number FROM numbers(1000) GROUP BY number SETTINGS max_guaranteed_memory_usage_for_user=1"
-TEST_QUERY_B = "SELECT number FROM numbers(1000) GROUP BY number SETTINGS max_guaranteed_memory_usage_for_user=2"
+TEST_QUERY_A = "SELECT number FROM numbers(1000) GROUP BY number SETTINGS memory_overcommit_ratio_denominator_for_user=1"
+TEST_QUERY_B = "SELECT number FROM numbers(1000) GROUP BY number SETTINGS memory_overcommit_ratio_denominator_for_user=2"
 
 
 def test_overcommited_is_killed():

diff --git a/tests/queries/0_stateless/02104_overcommit_memory.sh b/tests/queries/0_stateless/02104_overcommit_memory.sh
index 7fdf74a30bf..f2016dbc0c1 100755
--- a/tests/queries/0_stateless/02104_overcommit_memory.sh
+++ b/tests/queries/0_stateless/02104_overcommit_memory.sh
@@ -11,13 +11,13 @@ $CLICKHOUSE_CLIENT -q 'GRANT ALL ON *.* TO u02104'
 
 function overcommited()
 {
-    $CLICKHOUSE_CLIENT -u u02104 -q 'SELECT number FROM numbers(130000) GROUP BY number SETTINGS max_guaranteed_memory_usage=1,memory_usage_overcommit_max_wait_microseconds=500' 2>&1 \
+    $CLICKHOUSE_CLIENT -u u02104 -q 'SELECT number FROM numbers(130000) GROUP BY number SETTINGS memory_overcommit_ratio_denominator=1,memory_usage_overcommit_max_wait_microseconds=500' 2>&1 \
         | grep -F -q "MEMORY_LIMIT_EXCEEDED" && echo "OVERCOMMITED WITH USER LIMIT IS KILLED"
 }
 
 function expect_execution()
 {
-    $CLICKHOUSE_CLIENT -u u02104 -q 'SELECT number FROM numbers(130000) GROUP BY number SETTINGS max_memory_usage_for_user=5000000,max_guaranteed_memory_usage=2,memory_usage_overcommit_max_wait_microseconds=500' >/dev/null 2>/dev/null
+    $CLICKHOUSE_CLIENT -u u02104 -q 'SELECT number FROM numbers(130000) GROUP BY number SETTINGS max_memory_usage_for_user=5000000,memory_overcommit_ratio_denominator=2,memory_usage_overcommit_max_wait_microseconds=500' >/dev/null 2>/dev/null
 }
 
 export -f overcommited

From f9e635763e77b9b69723bb61fc70914f39f46d47 Mon Sep 17 00:00:00 2001
From: Dmitry Novik
Date: Wed, 11 May 2022 18:54:20 +0200
Subject: [PATCH 08/11] Update docs/en/operations/settings/memory-overcommit.md

Co-authored-by: Nikita Taranov
---
 docs/en/operations/settings/memory-overcommit.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/operations/settings/memory-overcommit.md b/docs/en/operations/settings/memory-overcommit.md
index b0c2e3bd35e..5da6cd8eb74 100644
--- a/docs/en/operations/settings/memory-overcommit.md
+++ b/docs/en/operations/settings/memory-overcommit.md
@@ -5,7 +5,7 @@ Memory overcommit is an experimental technique intended to allow to set more fle
 The idea of this technique is to introduce settings which can represent guaranteed amount of memory a query can use.
 When memory overcommit is enabled and the memory limit is reached ClickHouse will select the most overcommitted query and try to free memory by killing this query.
 
-When memory limit is reached any query will wait some time during atempt to allocate new memory.
+When memory limit is reached any query will wait some time during attempt to allocate new memory.
 If timeout is passed and memory is freed, the query continues execution.
 Otherwise an exception will be thrown and the query is killed.
 

From 3b733ec8eb58e9151faab35af2aebc7e0d3fd393 Mon Sep 17 00:00:00 2001
From: Marcelo Rodriguez
Date: Wed, 11 May 2022 10:56:12 -0600
Subject: [PATCH 09/11] Update Exception Message for allowed auth types

update error message per this commit:
https://github.com/ClickHouse/ClickHouse/pull/31484/commits/cb66a63aa4a5b4b3a9d55b0d041b755330dc7a17
the xml tag changed from `<certificates>` to `<ssl_certificates>`

will also submit a correction to the following doc page:
https://clickhouse.com/docs/en/operations/external-authenticators/ssl-x509
---
 src/Access/UsersConfigAccessStorage.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Access/UsersConfigAccessStorage.cpp b/src/Access/UsersConfigAccessStorage.cpp
index 240130bbf74..712e5393ce7 100644
--- a/src/Access/UsersConfigAccessStorage.cpp
+++ b/src/Access/UsersConfigAccessStorage.cpp
@@ -67,11 +67,11 @@ namespace
         size_t num_password_fields = has_no_password + has_password_plaintext + has_password_sha256_hex + has_password_double_sha1_hex + has_ldap + has_kerberos + has_certificates;
 
         if (num_password_fields > 1)
-            throw Exception("More than one field of 'password', 'password_sha256_hex', 'password_double_sha1_hex', 'no_password', 'ldap', 'kerberos', 'certificates' are used to specify authentication info for user " + user_name + ". Must be only one of them.",
+            throw Exception("More than one field of 'password', 'password_sha256_hex', 'password_double_sha1_hex', 'no_password', 'ldap', 'kerberos', 'ssl_certificates' are used to specify authentication info for user " + user_name + ". Must be only one of them.",
                             ErrorCodes::BAD_ARGUMENTS);
 
         if (num_password_fields < 1)
-            throw Exception("Either 'password' or 'password_sha256_hex' or 'password_double_sha1_hex' or 'no_password' or 'ldap' or 'kerberos' or 'certificates' must be specified for user " + user_name + ".", ErrorCodes::BAD_ARGUMENTS);
+            throw Exception("Either 'password' or 'password_sha256_hex' or 'password_double_sha1_hex' or 'no_password' or 'ldap' or 'kerberos' or 'ssl_certificates' must be specified for user " + user_name + ".", ErrorCodes::BAD_ARGUMENTS);
 
         if (has_password_plaintext)
         {

From d756a34d8958b4d0ed2e0c8670a8d7af2c726dbd Mon Sep 17 00:00:00 2001
From: Dmitry Novik
Date: Wed, 11 May 2022 19:01:39 +0200
Subject: [PATCH 10/11] Update docs/en/operations/settings/memory-overcommit.md

---
 docs/en/operations/settings/memory-overcommit.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/operations/settings/memory-overcommit.md b/docs/en/operations/settings/memory-overcommit.md
index 5da6cd8eb74..74cbc4dbd03 100644
--- a/docs/en/operations/settings/memory-overcommit.md
+++ b/docs/en/operations/settings/memory-overcommit.md
@@ -24,7 +24,7 @@ Waiting timeout is set by `memory_usage_overcommit_max_wait_microseconds` settin
 
 **Example**
 
 ```sql
-SELECT number FROM numbers(1000) GROUP BY number SETTINGS max_guaranteed_memory_usage=4000, memory_usage_overcommit_max_wait_microseconds=500
+SELECT number FROM numbers(1000) GROUP BY number SETTINGS memory_overcommit_ratio_denominator=4000, memory_usage_overcommit_max_wait_microseconds=500
 ```
 
 ## Global overcommit tracker

From 2d0020e9f69b791901f565ea02db3c2fd5b16a61 Mon Sep 17 00:00:00 2001
From: Marcelo Rodriguez
Date: Wed, 11 May 2022 11:36:53 -0600
Subject: [PATCH 11/11] updated config file and xml tag

Updated with correct config file.
updated xml tag per this commit:
https://github.com/ClickHouse/ClickHouse/commit/cb66a63aa4a5b4b3a9d55b0d041b755330dc7a17
---
 docs/en/operations/external-authenticators/ssl-x509.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/en/operations/external-authenticators/ssl-x509.md b/docs/en/operations/external-authenticators/ssl-x509.md
index dd4f35257bb..15b5990d00e 100644
--- a/docs/en/operations/external-authenticators/ssl-x509.md
+++ b/docs/en/operations/external-authenticators/ssl-x509.md
@@ -2,7 +2,7 @@
 
 [SSL 'strict' option](../server-configuration-parameters/settings.md#server_configuration_parameters-openssl) enables mandatory certificate validation for the incoming connections. In this case, only connections with trusted certificates can be established. Connections with untrusted certificates will be rejected. Thus, certificate validation allows to uniquely authenticate an incoming connection. `Common Name` field of the certificate is used to identify connected user. This allows to associate multiple certificates with the same user. Additionally, reissuing and revoking of the certificates does not affect the ClickHouse configuration.
 
-To enable SSL certificate authentication, a list of `Common Name`'s for each ClickHouse user must be sspecified in the settings file `config.xml `:
+To enable SSL certificate authentication, a list of `Common Name`'s for each ClickHouse user must be specified in the settings file `users.xml `:
 
 **Example**
 ```xml
@@ -10,11 +10,11 @@ To enable SSL certificate authentication, a list of `Common Name`'s for each Cli
 <clickhouse>
     <users>
         <user_name>
-            <certificates>
+            <ssl_certificates>
                 <common_name>host.domain.com:example_user</common_name>
                 <common_name>host.domain.com:example_user_dev</common_name>
-            </certificates>
+            </ssl_certificates>
         </user_name>
     </users>
 </clickhouse>
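A quick way to exercise the settings renamed in PATCH 07/11 is a query like the one below — a minimal sketch, assuming a server built with this series applied; the numeric values are illustrative (taken from the memory-overcommit.md example above), not recommended defaults:

```sql
-- memory_overcommit_ratio_denominator: soft per-query limit used as the
-- denominator when computing the query's overcommit ratio.
-- memory_usage_overcommit_max_wait_microseconds: how long an allocation waits
-- for memory to be freed before the query fails with MEMORY_LIMIT_EXCEEDED.
SELECT number
FROM numbers(1000)
GROUP BY number
SETTINGS
    memory_overcommit_ratio_denominator = 4000,
    memory_usage_overcommit_max_wait_microseconds = 500;
```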