From eef7d8c0bc6ed04597881ccfa83b012025c0341b Mon Sep 17 00:00:00 2001
From: Pavel Kruglov
Date: Wed, 3 Mar 2021 16:10:15 +0300
Subject: [PATCH 001/260] Fix blocking mode and timeouts in SecureStreamSocket

---
 src/IO/ReadBufferFromPocoSocket.cpp  | 23 ++++++++++++-----------
 src/IO/WriteBufferFromPocoSocket.cpp |  8 ++++++++
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/src/IO/ReadBufferFromPocoSocket.cpp b/src/IO/ReadBufferFromPocoSocket.cpp
index 37896a387bb..df8739904ec 100644
--- a/src/IO/ReadBufferFromPocoSocket.cpp
+++ b/src/IO/ReadBufferFromPocoSocket.cpp
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include <Client/TimeoutSetter.h>

 namespace ProfileEvents
@@ -27,23 +28,23 @@ bool ReadBufferFromPocoSocket::nextImpl()
     ssize_t bytes_read = 0;
     Stopwatch watch;

-    int flags = 0;
-    if (async_callback)
-        flags |= MSG_DONTWAIT;
-
     /// Add more details to exceptions.
     try
     {
-        bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags);
-
-        /// If async_callback is specified, and read is blocking, run async_callback and try again later.
+        /// If async_callback is specified, and read will block, run async_callback and try again later.
         /// It is expected that file descriptor may be polled externally.
         /// Note that receive timeout is not checked here. External code should check it while polling.
-        while (bytes_read < 0 && async_callback && errno == EAGAIN)
-        {
+        while (async_callback && !socket.poll(0, Poco::Net::Socket::SELECT_READ))
             async_callback(socket.impl()->sockfd(), socket.getReceiveTimeout(), socket_description);
-            bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags);
-        }
+
+        /// receiveBytes in SecureStreamSocket throws TimeoutException after max(receive_timeout, send_timeout),
+        /// but we want to get this exception exactly after receive_timeout. So, set send_timeout = receive_timeout
+        /// before receiveBytes.
+        std::unique_ptr<TimeoutSetter> timeout_setter = nullptr;
+        if (socket.secure())
+            timeout_setter = std::make_unique<TimeoutSetter>(dynamic_cast<Poco::Net::StreamSocket &>(socket), socket.getReceiveTimeout(), socket.getReceiveTimeout());
+
+        bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size());
     }
     catch (const Poco::Net::NetException & e)
     {
diff --git a/src/IO/WriteBufferFromPocoSocket.cpp b/src/IO/WriteBufferFromPocoSocket.cpp
index 284fa5dbd97..c666586770e 100644
--- a/src/IO/WriteBufferFromPocoSocket.cpp
+++ b/src/IO/WriteBufferFromPocoSocket.cpp
@@ -6,6 +6,7 @@
 #include
 #include
 #include
+#include <Client/TimeoutSetter.h>

 namespace ProfileEvents
@@ -40,6 +41,13 @@ void WriteBufferFromPocoSocket::nextImpl()
     /// Add more details to exceptions.
     try
     {
+        /// sendBytes in SecureStreamSocket throws TimeoutException after max(receive_timeout, send_timeout),
+        /// but we want to get this exception exactly after send_timeout. So, set receive_timeout = send_timeout
+        /// before sendBytes.
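
For context at this step: TimeoutSetter is used here as an RAII guard — it remembers the socket's current send/receive timeouts, applies the requested values, and puts the old values back when it goes out of scope, so the two timeouts are equalized only for the duration of the blocking receiveBytes/sendBytes call. A minimal sketch of that idea, assuming only Poco's public StreamSocket timeout API (the struct name and layout below are illustrative, not the actual ClickHouse class):

#include <Poco/Net/StreamSocket.h>
#include <Poco/Timespan.h>

/// Temporarily equalize both socket timeouts for one blocking call,
/// restoring the previous values on scope exit (even if the call throws).
struct ScopedSocketTimeouts
{
    Poco::Net::StreamSocket & socket;
    Poco::Timespan old_send;
    Poco::Timespan old_receive;

    ScopedSocketTimeouts(Poco::Net::StreamSocket & socket_, Poco::Timespan send, Poco::Timespan receive)
        : socket(socket_)
        , old_send(socket_.getSendTimeout())
        , old_receive(socket_.getReceiveTimeout())
    {
        socket.setSendTimeout(send);
        socket.setReceiveTimeout(receive);
    }

    ~ScopedSocketTimeouts()
    {
        socket.setSendTimeout(old_send);
        socket.setReceiveTimeout(old_receive);
    }
};

With such a guard in scope, a single call like sendBytes() observes identical send and receive timeouts, which is why the patch constructs the setter only for secure sockets immediately before the call.
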
+        std::unique_ptr<TimeoutSetter> timeout_setter = nullptr;
+        if (socket.secure())
+            timeout_setter = std::make_unique<TimeoutSetter>(dynamic_cast<Poco::Net::StreamSocket &>(socket), socket.getSendTimeout(), socket.getSendTimeout());
+
         res = socket.impl()->sendBytes(working_buffer.begin() + bytes_written, offset() - bytes_written);
     }
     catch (const Poco::Net::NetException & e)

From 1c7f16e0ff48a5de3cc302880b4888262fe1876c Mon Sep 17 00:00:00 2001
From: Pavel Kruglov
Date: Wed, 3 Mar 2021 20:47:27 +0300
Subject: [PATCH 002/260] Add test and move TimeoutSetter in IO/

---
 src/Client/Connection.cpp                     |  2 +-
 src/Client/ya.make                            |  1 -
 src/IO/ReadBufferFromPocoSocket.cpp           |  2 +-
 src/{Client => IO}/TimeoutSetter.cpp          |  2 +-
 src/{Client => IO}/TimeoutSetter.h            |  0
 src/IO/WriteBufferFromPocoSocket.cpp          |  2 +-
 src/IO/ya.make                                |  1 +
 src/Server/TCPHandler.h                       |  2 +-
 .../config.d/remote_servers.xml               | 14 ++++
 .../configs_secure/config.d/ssl_conf.xml      | 18 ++++
 .../configs_secure/dhparam.pem                |  8 ++
 .../configs_secure/server.crt                 | 19 +++++
 .../configs_secure/server.key                 | 28 +++++++
 .../configs_secure/users.d/users.xml          |  6 ++
 tests/integration/test_secure_socket/test.py  | 83 +++++++++++++++++++
 15 files changed, 182 insertions(+), 6 deletions(-)
 create mode 100644 tests/integration/test_secure_socket/configs_secure/config.d/remote_servers.xml
 create mode 100644 tests/integration/test_secure_socket/configs_secure/config.d/ssl_conf.xml
 create mode 100644 tests/integration/test_secure_socket/configs_secure/dhparam.pem
 create mode 100644 tests/integration/test_secure_socket/configs_secure/server.crt
 create mode 100644 tests/integration/test_secure_socket/configs_secure/server.key
 create mode 100644 tests/integration/test_secure_socket/configs_secure/users.d/users.xml
 create mode 100644 tests/integration/test_secure_socket/test.py

diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp
index 80d44a336a5..939a48d949f 100644
--- a/src/Client/Connection.cpp
+++ b/src/Client/Connection.cpp
@@ -8,10 +8,10 @@
 #include
 #include
 #include
+#include <IO/TimeoutSetter.h>
 #include
 #include
 #include
-#include <Client/TimeoutSetter.h>
 #include
 #include
 #include
diff --git a/src/Client/ya.make b/src/Client/ya.make
index af1dd05f1d4..4201203a8e9 100644
--- a/src/Client/ya.make
+++ b/src/Client/ya.make
@@ -16,7 +16,6 @@ SRCS(
     HedgedConnections.cpp
     HedgedConnectionsFactory.cpp
     MultiplexedConnections.cpp
-    TimeoutSetter.cpp

 )
diff --git a/src/IO/ReadBufferFromPocoSocket.cpp b/src/IO/ReadBufferFromPocoSocket.cpp
index df8739904ec..c70993c5c3a 100644
--- a/src/IO/ReadBufferFromPocoSocket.cpp
+++ b/src/IO/ReadBufferFromPocoSocket.cpp
@@ -1,10 +1,10 @@
 #include
 #include
+#include <IO/TimeoutSetter.h>
 #include
 #include
 #include
-#include <Client/TimeoutSetter.h>

 namespace ProfileEvents
diff --git a/src/Client/TimeoutSetter.cpp b/src/IO/TimeoutSetter.cpp
similarity index 97%
rename from src/Client/TimeoutSetter.cpp
rename to src/IO/TimeoutSetter.cpp
index 87368f93ba3..f06cafecff8 100644
--- a/src/Client/TimeoutSetter.cpp
+++ b/src/IO/TimeoutSetter.cpp
@@ -1,4 +1,4 @@
-#include "TimeoutSetter.h"
+#include <IO/TimeoutSetter.h>

 #include
diff --git a/src/Client/TimeoutSetter.h b/src/IO/TimeoutSetter.h
similarity index 100%
rename from src/Client/TimeoutSetter.h
rename to src/IO/TimeoutSetter.h
diff --git a/src/IO/WriteBufferFromPocoSocket.cpp b/src/IO/WriteBufferFromPocoSocket.cpp
index c666586770e..4edfc8a2795 100644
--- a/src/IO/WriteBufferFromPocoSocket.cpp
+++ b/src/IO/WriteBufferFromPocoSocket.cpp
@@ -1,12 +1,12 @@
 #include
 #include
+#include <IO/TimeoutSetter.h>
 #include
 #include
 #include
 #include
-#include <Client/TimeoutSetter.h>

 namespace
ProfileEvents diff --git a/src/IO/ya.make b/src/IO/ya.make index 6605cf64277..58df027c561 100644 --- a/src/IO/ya.make +++ b/src/IO/ya.make @@ -50,6 +50,7 @@ SRCS( ReadBufferFromPocoSocket.cpp ReadHelpers.cpp SeekAvoidingReadBuffer.cpp + TimeoutSetter.cpp UseSSL.cpp WriteBufferFromFile.cpp WriteBufferFromFileBase.cpp diff --git a/src/Server/TCPHandler.h b/src/Server/TCPHandler.h index ee2f7c96b5a..c3dd8346c8e 100644 --- a/src/Server/TCPHandler.h +++ b/src/Server/TCPHandler.h @@ -8,10 +8,10 @@ #include #include #include +#include #include #include #include -#include #include "IServer.h" diff --git a/tests/integration/test_secure_socket/configs_secure/config.d/remote_servers.xml b/tests/integration/test_secure_socket/configs_secure/config.d/remote_servers.xml new file mode 100644 index 00000000000..0c109d6d768 --- /dev/null +++ b/tests/integration/test_secure_socket/configs_secure/config.d/remote_servers.xml @@ -0,0 +1,14 @@ + + 9440 + + + + + node2 + 9440 + 1 + + + + + diff --git a/tests/integration/test_secure_socket/configs_secure/config.d/ssl_conf.xml b/tests/integration/test_secure_socket/configs_secure/config.d/ssl_conf.xml new file mode 100644 index 00000000000..fe39e3712b8 --- /dev/null +++ b/tests/integration/test_secure_socket/configs_secure/config.d/ssl_conf.xml @@ -0,0 +1,18 @@ + + + + /etc/clickhouse-server/config.d/server.crt + /etc/clickhouse-server/config.d/server.key + /etc/clickhouse-server/config.d/dhparam.pem + none + true + + + true + none + + AcceptCertificateHandler + + + + diff --git a/tests/integration/test_secure_socket/configs_secure/dhparam.pem b/tests/integration/test_secure_socket/configs_secure/dhparam.pem new file mode 100644 index 00000000000..2e6cee0798d --- /dev/null +++ b/tests/integration/test_secure_socket/configs_secure/dhparam.pem @@ -0,0 +1,8 @@ +-----BEGIN DH PARAMETERS----- +MIIBCAKCAQEAua92DDli13gJ+//ZXyGaggjIuidqB0crXfhUlsrBk9BV1hH3i7fR +XGP9rUdk2ubnB3k2ejBStL5oBrkHm9SzUFSQHqfDjLZjKoUpOEmuDc4cHvX1XTR5 +Pr1vf5cd0yEncJWG5W4zyUB8k++SUdL2qaeslSs+f491HBLDYn/h8zCgRbBvxhxb +9qeho1xcbnWeqkN6Kc9bgGozA16P9NLuuLttNnOblkH+lMBf42BSne/TWt3AlGZf +slKmmZcySUhF8aKfJnLKbkBCFqOtFRh8zBA9a7g+BT/lSANATCDPaAk1YVih2EKb +dpc3briTDbRsiqg2JKMI7+VdULY9bh3EawIBAg== +-----END DH PARAMETERS----- diff --git a/tests/integration/test_secure_socket/configs_secure/server.crt b/tests/integration/test_secure_socket/configs_secure/server.crt new file mode 100644 index 00000000000..7ade2d96273 --- /dev/null +++ b/tests/integration/test_secure_socket/configs_secure/server.crt @@ -0,0 +1,19 @@ +-----BEGIN CERTIFICATE----- +MIIC/TCCAeWgAwIBAgIJANjx1QSR77HBMA0GCSqGSIb3DQEBCwUAMBQxEjAQBgNV +BAMMCWxvY2FsaG9zdDAgFw0xODA3MzAxODE2MDhaGA8yMjkyMDUxNDE4MTYwOFow +FDESMBAGA1UEAwwJbG9jYWxob3N0MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIB +CgKCAQEAs9uSo6lJG8o8pw0fbVGVu0tPOljSWcVSXH9uiJBwlZLQnhN4SFSFohfI +4K8U1tBDTnxPLUo/V1K9yzoLiRDGMkwVj6+4+hE2udS2ePTQv5oaMeJ9wrs+5c9T +4pOtlq3pLAdm04ZMB1nbrEysceVudHRkQbGHzHp6VG29Fw7Ga6YpqyHQihRmEkTU +7UCYNA+Vk7aDPdMS/khweyTpXYZimaK9f0ECU3/VOeG3fH6Sp2X6FN4tUj/aFXEj +sRmU5G2TlYiSIUMF2JPdhSihfk1hJVALrHPTU38SOL+GyyBRWdNcrIwVwbpvsvPg +pryMSNxnpr0AK0dFhjwnupIv5hJIOQIDAQABo1AwTjAdBgNVHQ4EFgQUjPLb3uYC +kcamyZHK4/EV8jAP0wQwHwYDVR0jBBgwFoAUjPLb3uYCkcamyZHK4/EV8jAP0wQw +DAYDVR0TBAUwAwEB/zANBgkqhkiG9w0BAQsFAAOCAQEAM/ocuDvfPus/KpMVD51j +4IdlU8R0vmnYLQ+ygzOAo7+hUWP5j0yvq4ILWNmQX6HNvUggCgFv9bjwDFhb/5Vr +85ieWfTd9+LTjrOzTw4avdGwpX9G+6jJJSSq15tw5ElOIFb/qNA9O4dBiu8vn03C +L/zRSXrARhSqTW5w/tZkUcSTT+M5h28+Lgn9ysx4Ff5vi44LJ1NnrbJbEAIYsAAD 
++UA+4MBFKx1r6hHINULev8+lCfkpwIaeS8RL+op4fr6kQPxnULw8wT8gkuc8I4+L +P9gg/xDHB44T3ADGZ5Ib6O0DJaNiToO6rnoaaxs0KkotbvDWvRoxEytSbXKoYjYp +0g== +-----END CERTIFICATE----- diff --git a/tests/integration/test_secure_socket/configs_secure/server.key b/tests/integration/test_secure_socket/configs_secure/server.key new file mode 100644 index 00000000000..f0fb61ac443 --- /dev/null +++ b/tests/integration/test_secure_socket/configs_secure/server.key @@ -0,0 +1,28 @@ +-----BEGIN PRIVATE KEY----- +MIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQCz25KjqUkbyjyn +DR9tUZW7S086WNJZxVJcf26IkHCVktCeE3hIVIWiF8jgrxTW0ENOfE8tSj9XUr3L +OguJEMYyTBWPr7j6ETa51LZ49NC/mhox4n3Cuz7lz1Pik62WreksB2bThkwHWdus +TKxx5W50dGRBsYfMenpUbb0XDsZrpimrIdCKFGYSRNTtQJg0D5WTtoM90xL+SHB7 +JOldhmKZor1/QQJTf9U54bd8fpKnZfoU3i1SP9oVcSOxGZTkbZOViJIhQwXYk92F +KKF+TWElUAusc9NTfxI4v4bLIFFZ01ysjBXBum+y8+CmvIxI3GemvQArR0WGPCe6 +ki/mEkg5AgMBAAECggEATrbIBIxwDJOD2/BoUqWkDCY3dGevF8697vFuZKIiQ7PP +TX9j4vPq0DfsmDjHvAPFkTHiTQXzlroFik3LAp+uvhCCVzImmHq0IrwvZ9xtB43f +7Pkc5P6h1l3Ybo8HJ6zRIY3TuLtLxuPSuiOMTQSGRL0zq3SQ5DKuGwkz+kVjHXUN +MR2TECFwMHKQ5VLrC+7PMpsJYyOMlDAWhRfUalxC55xOXTpaN8TxNnwQ8K2ISVY5 +212Jz/a4hn4LdwxSz3Tiu95PN072K87HLWx3EdT6vW4Ge5P/A3y+smIuNAlanMnu +plHBRtpATLiTxZt/n6npyrfQVbYjSH7KWhB8hBHtaQKBgQDh9Cq1c/KtqDtE0Ccr +/r9tZNTUwBE6VP+3OJeKdEdtsfuxjOCkS1oAjgBJiSDOiWPh1DdoDeVZjPKq6pIu +Mq12OE3Doa8znfCXGbkSzEKOb2unKZMJxzrz99kXt40W5DtrqKPNb24CNqTiY8Aa +CjtcX+3weat82VRXvph6U8ltMwKBgQDLxjiQQzNoY7qvg7CwJCjf9qq8jmLK766g +1FHXopqS+dTxDLM8eJSRrpmxGWJvNeNc1uPhsKsKgotqAMdBUQTf7rSTbt4MyoH5 +bUcRLtr+0QTK9hDWMOOvleqNXha68vATkohWYfCueNsC60qD44o8RZAS6UNy3ENq +cM1cxqe84wKBgQDKkHutWnooJtajlTxY27O/nZKT/HA1bDgniMuKaz4R4Gr1PIez +on3YW3V0d0P7BP6PWRIm7bY79vkiMtLEKdiKUGWeyZdo3eHvhDb/3DCawtau8L2K +GZsHVp2//mS1Lfz7Qh8/L/NedqCQ+L4iWiPnZ3THjjwn3CoZ05ucpvrAMwKBgB54 +nay039MUVq44Owub3KDg+dcIU62U+cAC/9oG7qZbxYPmKkc4oL7IJSNecGHA5SbU +2268RFdl/gLz6tfRjbEOuOHzCjFPdvAdbysanpTMHLNc6FefJ+zxtgk9sJh0C4Jh +vxFrw9nTKKzfEl12gQ1SOaEaUIO0fEBGbe8ZpauRAoGAMAlGV+2/K4ebvAJKOVTa +dKAzQ+TD2SJmeR1HZmKDYddNqwtZlzg3v4ZhCk4eaUmGeC1Bdh8MDuB3QQvXz4Dr +vOIP4UVaOr+uM+7TgAgVnP4/K6IeJGzUDhX93pmpWhODfdu/oojEKVcpCojmEmS1 +KCBtmIrQLqzMpnBpLNuSY+Q= +-----END PRIVATE KEY----- diff --git a/tests/integration/test_secure_socket/configs_secure/users.d/users.xml b/tests/integration/test_secure_socket/configs_secure/users.d/users.xml new file mode 100644 index 00000000000..479017f6370 --- /dev/null +++ b/tests/integration/test_secure_socket/configs_secure/users.d/users.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/tests/integration/test_secure_socket/test.py b/tests/integration/test_secure_socket/test.py new file mode 100644 index 00000000000..6932c4a5bc9 --- /dev/null +++ b/tests/integration/test_secure_socket/test.py @@ -0,0 +1,83 @@ +import os.path +import time + +import pytest +from helpers.cluster import ClickHouseCluster +from helpers.test_tools import TSV + +cluster = ClickHouseCluster(__file__) + +NODES = {'node' + str(i): None for i in (1, 2)} + +config = ''' + + + {sleep_in_send_data} + + +''' + + +@pytest.fixture(scope="module") +def started_cluster(): + cluster.__with_ssl_config = True + main_configs = [ + "configs_secure/config.d/remote_servers.xml", + "configs_secure/server.crt", + "configs_secure/server.key", + "configs_secure/dhparam.pem", + "configs_secure/config.d/ssl_conf.xml", + ] + + NODES['node1'] = cluster.add_instance('node1', main_configs=main_configs) + NODES['node2'] = cluster.add_instance('node2', main_configs=main_configs, user_configs=["configs_secure/users.d/users.xml"]) + try: + 
cluster.start() + NODES['node2'].query("CREATE TABLE base_table (x UInt64) ENGINE = MergeTree ORDER BY x;") + NODES['node2'].query("INSERT INTO base_table VALUES (5);") + NODES['node1'].query("CREATE TABLE distributed_table (x UInt64) ENGINE = Distributed(test_cluster, default, base_table);") + + yield cluster + + finally: + cluster.shutdown() + + +def test(started_cluster): + NODES['node2'].replace_config('/etc/clickhouse-server/users.d/users.xml', config.format(sleep_in_send_data=1000)) + + attempts = 0 + while attempts < 1000: + setting = NODES['node2'].http_query("SELECT value FROM system.settings WHERE name='sleep_in_send_data'") + if int(setting) == 1000: + break + time.sleep(0.1) + attempts += 1 + + assert attempts < 1000 + + + start = time.time() + NODES['node1'].query_and_get_error('SELECT * FROM distributed_table settings receive_timeout=5, use_hedged_requests=0, async_socket_for_remote=0;') + end = time.time() + assert end - start < 6 + + start = time.time() + error = NODES['node1'].query_and_get_error('SELECT * FROM distributed_table settings receive_timeout=5, use_hedged_requests=0;') + end = time.time() + + assert end - start < 6 + + # Check that exception about timeout wasn't thrown from DB::ReadBufferFromPocoSocket::nextImpl(). + assert error.find('DB::ReadBufferFromPocoSocket::nextImpl()') == -1 + + start = time.time() + error = NODES['node1'].query_and_get_error('SELECT * FROM distributed_table settings receive_timeout=5;') + end = time.time() + + assert end - start < 6 + + # Check that exception about timeout wasn't thrown from DB::ReadBufferFromPocoSocket::nextImpl(). + assert error.find('DB::ReadBufferFromPocoSocket::nextImpl()') == -1 + + From 271398be61e88e4f20f327210ba53595273715d8 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Thu, 4 Mar 2021 23:15:33 +0300 Subject: [PATCH 003/260] add __init__.py --- tests/integration/test_secure_socket/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/integration/test_secure_socket/__init__.py diff --git a/tests/integration/test_secure_socket/__init__.py b/tests/integration/test_secure_socket/__init__.py new file mode 100644 index 00000000000..e69de29bb2d From ed6363b88bd1b436c706b639c0a9697037c3a5b7 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Fri, 5 Mar 2021 16:18:12 +0300 Subject: [PATCH 004/260] Increase allowed query time --- tests/integration/test_secure_socket/test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_secure_socket/test.py b/tests/integration/test_secure_socket/test.py index 6932c4a5bc9..8c94b4c35ad 100644 --- a/tests/integration/test_secure_socket/test.py +++ b/tests/integration/test_secure_socket/test.py @@ -60,13 +60,13 @@ def test(started_cluster): start = time.time() NODES['node1'].query_and_get_error('SELECT * FROM distributed_table settings receive_timeout=5, use_hedged_requests=0, async_socket_for_remote=0;') end = time.time() - assert end - start < 6 + assert end - start < 10 start = time.time() error = NODES['node1'].query_and_get_error('SELECT * FROM distributed_table settings receive_timeout=5, use_hedged_requests=0;') end = time.time() - assert end - start < 6 + assert end - start < 10 # Check that exception about timeout wasn't thrown from DB::ReadBufferFromPocoSocket::nextImpl(). 
assert error.find('DB::ReadBufferFromPocoSocket::nextImpl()') == -1 @@ -75,7 +75,7 @@ def test(started_cluster): error = NODES['node1'].query_and_get_error('SELECT * FROM distributed_table settings receive_timeout=5;') end = time.time() - assert end - start < 6 + assert end - start < 10 # Check that exception about timeout wasn't thrown from DB::ReadBufferFromPocoSocket::nextImpl(). assert error.find('DB::ReadBufferFromPocoSocket::nextImpl()') == -1 From 19af94bad97a7f0f6316249f29b69b6a6f64ea01 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Fri, 5 Mar 2021 19:08:49 +0300 Subject: [PATCH 005/260] restart tests --- tests/integration/test_secure_socket/test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_secure_socket/test.py b/tests/integration/test_secure_socket/test.py index 8c94b4c35ad..337b6b05bd7 100644 --- a/tests/integration/test_secure_socket/test.py +++ b/tests/integration/test_secure_socket/test.py @@ -31,6 +31,7 @@ def started_cluster(): NODES['node1'] = cluster.add_instance('node1', main_configs=main_configs) NODES['node2'] = cluster.add_instance('node2', main_configs=main_configs, user_configs=["configs_secure/users.d/users.xml"]) + try: cluster.start() NODES['node2'].query("CREATE TABLE base_table (x UInt64) ENGINE = MergeTree ORDER BY x;") From 9c35e4987899c4c52c6d4c33dce602ac9a8629f1 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 6 Mar 2021 10:34:39 +0300 Subject: [PATCH 006/260] Fix heap-buffer-overflow in highlighting multi-line comments Not closed multi-line comment returns the whole query, so it should not be processed further with the lexer. ASan report: :) /*================================================================= ==14889==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x60400006ebc0 at pc 0x00000a8148ea bp 0x7fffffff8610 sp 0x7fffffff7dd8 WRITE of size 16 at 0x60400006ebc0 thread T0 0 0xa8148e9 in __asan_memcpy (/src/ch/tmp/upstream/clickhouse-asan+0xa8148e9) 1 0xaa8a3a4 in DB::Client::highlight(std::__1::basic_string, std::__1::allocator > const&, std::__1::vector >&) obj-x86_64-linux-gnu/../programs/client/Client.cpp:464:52 2 0x25f7b6d8 in std::__1::__function::__policy_func, std::__1::allocator > const&, std::__1::vector >&)>::operator()(std::__1::basic_string, std::__1::allocator > const&, std::__1::vector >&) const obj-x86_64-linux-gnu/../contrib/libcxx/include/functional:2221:16 3 0x25f7b6d8 in std::__1::function, std::__1::allocator > const&, std::__1::vector >&)>::operator()(std::__1::basic_string, std::__1::allocator > const&, std::__1::vector >&) const obj-x86_64-linux-gnu/../contrib/libcxx/include/functional:2560:12 4 0x25f7b6d8 in replxx::Replxx::ReplxxImpl::render(replxx::Replxx::ReplxxImpl::HINT_ACTION) obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:546:3 5 0x25f74059 in replxx::Replxx::ReplxxImpl::refresh_line(replxx::Replxx::ReplxxImpl::HINT_ACTION) obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:729:2 6 0x25f6bc8f in replxx::Replxx::ReplxxImpl::insert_character(char32_t) obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:1197:3 7 0x25f79347 in replxx::Replxx::ReplxxImpl::action(unsigned long long, replxx::Replxx::ACTION_RESULT (replxx::Replxx::ReplxxImpl::* const&)(char32_t), char32_t) obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:1130:29 8 0x25f79347 in replxx::Replxx::ReplxxImpl::get_input_line() obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:1123:11 9 0x25f7844c in replxx::Replxx::ReplxxImpl::input(std::__1::basic_string, 
std::__1::allocator > const&) obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:455:8 10 0x25af5693 in ReplxxLineReader::readOneLine(std::__1::basic_string, std::__1::allocator > const&) obj-x86_64-linux-gnu/../base/common/ReplxxLineReader.cpp:108:29 11 0x25aed149 in LineReader::readLine(std::__1::basic_string, std::__1::allocator > const&, std::__1::basic_string, std::__1::allocator > const&) obj-x86_64-linux-gnu/../base/common/LineReader.cpp:81:26 12 0xaa80ba2 in DB::Client::mainImpl() obj-x86_64-linux-gnu/../programs/client/Client.cpp:654:33 13 0xaa756f5 in DB::Client::main(std::__1::vector, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&) obj-x86_64-linux-gnu/../programs/client/Client.cpp:280:20 14 0x25c0c8b5 in Poco::Util::Application::run() obj-x86_64-linux-gnu/../contrib/poco/Util/src/Application.cpp:334:8 15 0xaa4d050 in mainEntryClickHouseClient(int, char**) obj-x86_64-linux-gnu/../programs/client/Client.cpp:2724:23 16 0xa848c3a in main obj-x86_64-linux-gnu/../programs/main.cpp:368:12 17 0x7ffff7dcab24 in __libc_start_main (/usr/lib/libc.so.6+0x27b24) 18 0xa79b36d in _start (/src/ch/tmp/upstream/clickhouse-asan+0xa79b36d) 0x60400006ebc0 is located 0 bytes to the right of 48-byte region [0x60400006eb90,0x60400006ebc0) allocated by thread T0 here: 0 0xa84509d in operator new(unsigned long) (/src/ch/tmp/upstream/clickhouse-asan+0xa84509d) 1 0x25f7af76 in void* std::__1::__libcpp_operator_new(unsigned long) obj-x86_64-linux-gnu/../contrib/libcxx/include/new:235:10 2 0x25f7af76 in std::__1::__libcpp_allocate(unsigned long, unsigned long) obj-x86_64-linux-gnu/../contrib/libcxx/include/new:261:10 3 0x25f7af76 in std::__1::allocator::allocate(unsigned long) obj-x86_64-linux-gnu/../contrib/libcxx/include/memory:840:38 4 0x25f7af76 in std::__1::allocator_traits >::allocate(std::__1::allocator&, unsigned long) obj-x86_64-linux-gnu/../contrib/libcxx/include/__memory/allocator_traits.h:468:21 5 0x25f7af76 in std::__1::vector >::__vallocate(unsigned long) obj-x86_64-linux-gnu/../contrib/libcxx/include/vector:993:37 6 0x25f7af76 in std::__1::vector >::vector(unsigned long, replxx::Replxx::Color const&) obj-x86_64-linux-gnu/../contrib/libcxx/include/vector:1155:9 7 0x25f7af76 in replxx::Replxx::ReplxxImpl::render(replxx::Replxx::ReplxxImpl::HINT_ACTION) obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:543:19 8 0x25f74059 in replxx::Replxx::ReplxxImpl::refresh_line(replxx::Replxx::ReplxxImpl::HINT_ACTION) obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:729:2 9 0x25f6bc8f in replxx::Replxx::ReplxxImpl::insert_character(char32_t) obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:1197:3 10 0x25f79347 in replxx::Replxx::ReplxxImpl::action(unsigned long long, replxx::Replxx::ACTION_RESULT (replxx::Replxx::ReplxxImpl::* const&)(char32_t), char32_t) obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:1130:29 11 0x25f79347 in replxx::Replxx::ReplxxImpl::get_input_line() obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:1123:11 12 0x25f7844c in replxx::Replxx::ReplxxImpl::input(std::__1::basic_string, std::__1::allocator > const&) obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:455:8 13 0x25af5693 in ReplxxLineReader::readOneLine(std::__1::basic_string, std::__1::allocator > const&) obj-x86_64-linux-gnu/../base/common/ReplxxLineReader.cpp:108:29 14 0x25aed149 in LineReader::readLine(std::__1::basic_string, std::__1::allocator > const&, std::__1::basic_string, std::__1::allocator > const&) 
obj-x86_64-linux-gnu/../base/common/LineReader.cpp:81:26 15 0xaa80ba2 in DB::Client::mainImpl() obj-x86_64-linux-gnu/../programs/client/Client.cpp:654:33 16 0xaa756f5 in DB::Client::main(std::__1::vector, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&) obj-x86_64-linux-gnu/../programs/client/Client.cpp:280:20 17 0x25c0c8b5 in Poco::Util::Application::run() obj-x86_64-linux-gnu/../contrib/poco/Util/src/Application.cpp:334:8 18 0xaa4d050 in mainEntryClickHouseClient(int, char**) obj-x86_64-linux-gnu/../programs/client/Client.cpp:2724:23 19 0xa848c3a in main obj-x86_64-linux-gnu/../programs/main.cpp:368:12 20 0x7ffff7dcab24 in __libc_start_main (/usr/lib/libc.so.6+0x27b24) SUMMARY: AddressSanitizer: heap-buffer-overflow (/src/ch/tmp/upstream/clickhouse-asan+0xa8148e9) in __asan_memcpy v2: fix lexer instead of client quirk --- src/Parsers/Lexer.cpp | 3 ++- ...light_multi_line_comment_regression.expect | 25 +++++++++++++++++++ ...ht_multi_line_comment_regression.reference | 0 .../queries/0_stateless/arcadia_skip_list.txt | 1 + tests/queries/skip_list.json | 3 ++- 5 files changed, 30 insertions(+), 2 deletions(-) create mode 100755 tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect create mode 100644 tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.reference diff --git a/src/Parsers/Lexer.cpp b/src/Parsers/Lexer.cpp index ffa8250a3f3..1fa4d396113 100644 --- a/src/Parsers/Lexer.cpp +++ b/src/Parsers/Lexer.cpp @@ -275,7 +275,8 @@ Token Lexer::nextTokenImpl() else ++pos; } - return Token(TokenType::ErrorMultilineCommentIsNotClosed, token_begin, end); + pos = end; + return Token(TokenType::ErrorMultilineCommentIsNotClosed, token_begin, pos); } } return Token(TokenType::Slash, token_begin, pos); diff --git a/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect b/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect new file mode 100755 index 00000000000..65b9bde235b --- /dev/null +++ b/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect @@ -0,0 +1,25 @@ +#!/usr/bin/expect -f + +log_user 0 +set timeout 5 +match_max 100000 +# A default timeout action is to do nothing, change it to fail +expect_after { + timeout { + exit 2 + } +} + +set basedir [file dirname $argv0] +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT" +expect ":) " + +# regression for heap-buffer-overflow issue (under ASAN) +send -- "/**" +expect "/**" +# just in case few more bytes +send -- "foobar" +expect "/**foobar" + +send -- "\3\4" +expect eof diff --git a/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.reference b/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/arcadia_skip_list.txt b/tests/queries/0_stateless/arcadia_skip_list.txt index c1e991ff6b2..a9cca053d3c 100644 --- a/tests/queries/0_stateless/arcadia_skip_list.txt +++ b/tests/queries/0_stateless/arcadia_skip_list.txt @@ -212,3 +212,4 @@ 01017_uniqCombined_memory_usage 01747_join_view_filter_dictionary 01748_dictionary_table_dot +01755_client_highlight_multi_line_comment_regression diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index 1200d8f5436..b829423f846 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -95,7 
+95,8 @@ "01370_client_autocomplete_word_break_characters", "01676_clickhouse_client_autocomplete", "01193_metadata_loading", - "01455_time_zones" + "01455_time_zones", + "01755_client_highlight_multi_line_comment_regression" ], "release-build": [ ], From e9f8dd645e8ffc54ff701b8b3dfacf2831454d09 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Tue, 9 Mar 2021 17:45:47 +0800 Subject: [PATCH 007/260] Shard pruning via literals --- src/Interpreters/evaluateConstantExpression.cpp | 10 ++++++++-- src/Storages/StorageDistributed.cpp | 4 ++++ .../01755_shard_pruning_with_literal.reference | 2 ++ .../01755_shard_pruning_with_literal.sql | 15 +++++++++++++++ 4 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/01755_shard_pruning_with_literal.reference create mode 100644 tests/queries/0_stateless/01755_shard_pruning_with_literal.sql diff --git a/src/Interpreters/evaluateConstantExpression.cpp b/src/Interpreters/evaluateConstantExpression.cpp index 42e96bae07b..db19c000cfd 100644 --- a/src/Interpreters/evaluateConstantExpression.cpp +++ b/src/Interpreters/evaluateConstantExpression.cpp @@ -290,8 +290,6 @@ std::optional evaluateExpressionOverConstantCondition(const ASTPtr & nod { Blocks result; - // TODO: `node` may be always-false literal. - if (const auto * fn = node->as()) { const auto dnf = analyzeFunction(fn, target_expr); @@ -350,6 +348,14 @@ std::optional evaluateExpressionOverConstantCondition(const ASTPtr & nod } } } + else if (const auto * literal = node->as()) + { + // Check if it's always true or false. + if (literal->value.getType() == Field::Types::UInt64 && literal->value.get() == 0) + return {result}; + else + return {}; + } return {result}; } diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 039cf63eca2..32fc9fc4cba 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -526,6 +526,10 @@ void StorageDistributed::read( const size_t /*max_block_size*/, const unsigned /*num_streams*/) { + // Return directly if no shard to query. 
+ if (query_info.cluster->getShardsInfo().empty()) + return; + const auto & modified_query_ast = rewriteSelectQuery( query_info.query, remote_database, remote_table, remote_table_function_ptr); diff --git a/tests/queries/0_stateless/01755_shard_pruning_with_literal.reference b/tests/queries/0_stateless/01755_shard_pruning_with_literal.reference new file mode 100644 index 00000000000..6ed281c757a --- /dev/null +++ b/tests/queries/0_stateless/01755_shard_pruning_with_literal.reference @@ -0,0 +1,2 @@ +1 +1 diff --git a/tests/queries/0_stateless/01755_shard_pruning_with_literal.sql b/tests/queries/0_stateless/01755_shard_pruning_with_literal.sql new file mode 100644 index 00000000000..0816ac6e88b --- /dev/null +++ b/tests/queries/0_stateless/01755_shard_pruning_with_literal.sql @@ -0,0 +1,15 @@ +set optimize_skip_unused_shards=1; + +drop table if exists data_01755; + +drop table if exists dist_01755; + +create table data_01755 (i Int) Engine=Memory; + +create table dist_01755 as data_01755 Engine=Distributed(test_cluster_two_shards, currentDatabase(), data_01755, i); + +insert into data_01755 values (1); + +select * from dist_01755 where 0; + +select * from dist_01755 where 1 settings enable_early_constant_folding = 0; From 5881270f90380751198b7d145df8795c1ab0883c Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Fri, 12 Mar 2021 17:10:08 +0800 Subject: [PATCH 008/260] Update src/Storages/StorageDistributed.cpp Co-authored-by: Azat Khuzhin --- src/Storages/StorageDistributed.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 32fc9fc4cba..039cf63eca2 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -526,10 +526,6 @@ void StorageDistributed::read( const size_t /*max_block_size*/, const unsigned /*num_streams*/) { - // Return directly if no shard to query. 
- if (query_info.cluster->getShardsInfo().empty()) - return; - const auto & modified_query_ast = rewriteSelectQuery( query_info.query, remote_database, remote_table, remote_table_function_ptr); From ba9e1e5a8d23e7fbf6bff83e6493d46f1d49ef75 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 10 Mar 2021 19:12:32 +0300 Subject: [PATCH 009/260] Some initial code Add some java magic Allow to connect with old session id More angry nemesis and fixes Angry Fix style Split to files Better wrappers Better structure Add set test and split to separate files (I think something broken now) Better Missed files --- src/Coordination/CoordinationSettings.h | 1 + src/Coordination/NuKeeperServer.cpp | 4 +- src/Server/NuKeeperTCPHandler.cpp | 10 +- tests/jepsen.nukeeper/.gitignore | 13 + tests/jepsen.nukeeper/CHANGELOG.md | 24 ++ tests/jepsen.nukeeper/LICENSE | 280 ++++++++++++++++++ tests/jepsen.nukeeper/README.md | 22 ++ tests/jepsen.nukeeper/doc/intro.md | 3 + tests/jepsen.nukeeper/project.clj | 13 + tests/jepsen.nukeeper/resources/config.xml | 1 + tests/jepsen.nukeeper/resources/listen.xml | 3 + .../resources/test_keeper_config.xml | 33 +++ tests/jepsen.nukeeper/resources/users.xml | 1 + .../src/jepsen/nukeeper/main.clj | 143 +++++++++ .../src/jepsen/nukeeper/register.clj | 64 ++++ .../src/jepsen/nukeeper/set.clj | 43 +++ .../src/jepsen/nukeeper/utils.clj | 56 ++++ .../test/jepsen/nukeeper_test.clj | 28 ++ 18 files changed, 733 insertions(+), 9 deletions(-) create mode 100644 tests/jepsen.nukeeper/.gitignore create mode 100644 tests/jepsen.nukeeper/CHANGELOG.md create mode 100644 tests/jepsen.nukeeper/LICENSE create mode 100644 tests/jepsen.nukeeper/README.md create mode 100644 tests/jepsen.nukeeper/doc/intro.md create mode 100644 tests/jepsen.nukeeper/project.clj create mode 120000 tests/jepsen.nukeeper/resources/config.xml create mode 100644 tests/jepsen.nukeeper/resources/listen.xml create mode 100644 tests/jepsen.nukeeper/resources/test_keeper_config.xml create mode 120000 tests/jepsen.nukeeper/resources/users.xml create mode 100644 tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj create mode 100644 tests/jepsen.nukeeper/src/jepsen/nukeeper/register.clj create mode 100644 tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj create mode 100644 tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj create mode 100644 tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj diff --git a/src/Coordination/CoordinationSettings.h b/src/Coordination/CoordinationSettings.h index dcfb13c359e..c816f8089d5 100644 --- a/src/Coordination/CoordinationSettings.h +++ b/src/Coordination/CoordinationSettings.h @@ -31,6 +31,7 @@ struct Settings; M(UInt64, rotate_log_storage_interval, 10000, "How many records will be stored in one log storage file", 0) \ M(UInt64, snapshots_to_keep, 3, "How many compressed snapshots to keep on disk", 0) \ M(UInt64, stale_log_gap, 10000, "When node became stale and should receive snapshots from leader", 0) \ + M(Bool, quorum_reads, false, "Execute read requests as writes through whole RAFT consesus with similar speed", 0) \ M(Bool, force_sync, true, " Call fsync on each change in RAFT changelog", 0) DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS) diff --git a/src/Coordination/NuKeeperServer.cpp b/src/Coordination/NuKeeperServer.cpp index edda26613dd..2081c969523 100644 --- a/src/Coordination/NuKeeperServer.cpp +++ b/src/Coordination/NuKeeperServer.cpp @@ -30,6 +30,8 @@ NuKeeperServer::NuKeeperServer( , state_manager(nuraft::cs_new(server_id, "test_keeper_server", 
config, coordination_settings)) , responses_queue(responses_queue_) { + if (coordination_settings->quorum_reads) + LOG_WARNING(&Poco::Logger::get("NuKeeperServer"), "Quorum reads enabled, NuKeeper will work slower."); } void NuKeeperServer::startup() @@ -106,7 +108,7 @@ nuraft::ptr getZooKeeperLogEntry(int64_t session_id, const Coord void NuKeeperServer::putRequest(const NuKeeperStorage::RequestForSession & request_for_session) { auto [session_id, request] = request_for_session; - if (isLeaderAlive() && request->isReadRequest()) + if (!coordination_settings->quorum_reads && isLeaderAlive() && request->isReadRequest()) { state_machine->processReadRequest(request_for_session); } diff --git a/src/Server/NuKeeperTCPHandler.cpp b/src/Server/NuKeeperTCPHandler.cpp index b283356d27d..b676331f6c0 100644 --- a/src/Server/NuKeeperTCPHandler.cpp +++ b/src/Server/NuKeeperTCPHandler.cpp @@ -240,16 +240,10 @@ Poco::Timespan NuKeeperTCPHandler::receiveHandshake() throw Exception("Unexpected protocol version: " + toString(protocol_version), ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT); Coordination::read(last_zxid_seen, *in); - - if (last_zxid_seen != 0) - throw Exception("Non zero last_zxid_seen is not supported", ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT); - Coordination::read(timeout_ms, *in); + + /// TODO Stop ignoring this value Coordination::read(previous_session_id, *in); - - if (previous_session_id != 0) - throw Exception("Non zero previous session id is not supported", ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT); - Coordination::read(passwd, *in); int8_t readonly; diff --git a/tests/jepsen.nukeeper/.gitignore b/tests/jepsen.nukeeper/.gitignore new file mode 100644 index 00000000000..d956ab0a125 --- /dev/null +++ b/tests/jepsen.nukeeper/.gitignore @@ -0,0 +1,13 @@ +/target +/classes +/checkouts +profiles.clj +pom.xml +pom.xml.asc +*.jar +*.class +/.lein-* +/.nrepl-port +/.prepl-port +.hgignore +.hg/ diff --git a/tests/jepsen.nukeeper/CHANGELOG.md b/tests/jepsen.nukeeper/CHANGELOG.md new file mode 100644 index 00000000000..6c7cb4f7c8a --- /dev/null +++ b/tests/jepsen.nukeeper/CHANGELOG.md @@ -0,0 +1,24 @@ +# Change Log +All notable changes to this project will be documented in this file. This change log follows the conventions of [keepachangelog.com](http://keepachangelog.com/). + +## [Unreleased] +### Changed +- Add a new arity to `make-widget-async` to provide a different widget shape. + +## [0.1.1] - 2021-03-10 +### Changed +- Documentation on how to make the widgets. + +### Removed +- `make-widget-sync` - we're all async, all the time. + +### Fixed +- Fixed widget maker to keep working when daylight savings switches over. + +## 0.1.0 - 2021-03-10 +### Added +- Files from the new template. +- Widget maker public API - `make-widget-sync`. + +[Unreleased]: https://github.com/your-name/jepsen.nukeeper/compare/0.1.1...HEAD +[0.1.1]: https://github.com/your-name/jepsen.nukeeper/compare/0.1.0...0.1.1 diff --git a/tests/jepsen.nukeeper/LICENSE b/tests/jepsen.nukeeper/LICENSE new file mode 100644 index 00000000000..231512650b9 --- /dev/null +++ b/tests/jepsen.nukeeper/LICENSE @@ -0,0 +1,280 @@ +Eclipse Public License - v 2.0 + + THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE + PUBLIC LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION + OF THE PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. + +1. 
DEFINITIONS + +"Contribution" means: + + a) in the case of the initial Contributor, the initial content + Distributed under this Agreement, and + + b) in the case of each subsequent Contributor: + i) changes to the Program, and + ii) additions to the Program; + where such changes and/or additions to the Program originate from + and are Distributed by that particular Contributor. A Contribution + "originates" from a Contributor if it was added to the Program by + such Contributor itself or anyone acting on such Contributor's behalf. + Contributions do not include changes or additions to the Program that + are not Modified Works. + +"Contributor" means any person or entity that Distributes the Program. + +"Licensed Patents" mean patent claims licensable by a Contributor which +are necessarily infringed by the use or sale of its Contribution alone +or when combined with the Program. + +"Program" means the Contributions Distributed in accordance with this +Agreement. + +"Recipient" means anyone who receives the Program under this Agreement +or any Secondary License (as applicable), including Contributors. + +"Derivative Works" shall mean any work, whether in Source Code or other +form, that is based on (or derived from) the Program and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. + +"Modified Works" shall mean any work in Source Code or other form that +results from an addition to, deletion from, or modification of the +contents of the Program, including, for purposes of clarity any new file +in Source Code form that contains any contents of the Program. Modified +Works shall not include works that contain only declarations, +interfaces, types, classes, structures, or files of the Program solely +in each case in order to link to, bind by name, or subclass the Program +or Modified Works thereof. + +"Distribute" means the acts of a) distributing or b) making available +in any manner that enables the transfer of a copy. + +"Source Code" means the form of a Program preferred for making +modifications, including but not limited to software source code, +documentation source, and configuration files. + +"Secondary License" means either the GNU General Public License, +Version 2.0, or any later versions of that license, including any +exceptions or additional permissions as identified by the initial +Contributor. + +2. GRANT OF RIGHTS + + a) Subject to the terms of this Agreement, each Contributor hereby + grants Recipient a non-exclusive, worldwide, royalty-free copyright + license to reproduce, prepare Derivative Works of, publicly display, + publicly perform, Distribute and sublicense the Contribution of such + Contributor, if any, and such Derivative Works. + + b) Subject to the terms of this Agreement, each Contributor hereby + grants Recipient a non-exclusive, worldwide, royalty-free patent + license under Licensed Patents to make, use, sell, offer to sell, + import and otherwise transfer the Contribution of such Contributor, + if any, in Source Code or other form. This patent license shall + apply to the combination of the Contribution and the Program if, at + the time the Contribution is added by the Contributor, such addition + of the Contribution causes such combination to be covered by the + Licensed Patents. The patent license shall not apply to any other + combinations which include the Contribution. No hardware per se is + licensed hereunder. 
+ + c) Recipient understands that although each Contributor grants the + licenses to its Contributions set forth herein, no assurances are + provided by any Contributor that the Program does not infringe the + patent or other intellectual property rights of any other entity. + Each Contributor disclaims any liability to Recipient for claims + brought by any other entity based on infringement of intellectual + property rights or otherwise. As a condition to exercising the + rights and licenses granted hereunder, each Recipient hereby + assumes sole responsibility to secure any other intellectual + property rights needed, if any. For example, if a third party + patent license is required to allow Recipient to Distribute the + Program, it is Recipient's responsibility to acquire that license + before distributing the Program. + + d) Each Contributor represents that to its knowledge it has + sufficient copyright rights in its Contribution, if any, to grant + the copyright license set forth in this Agreement. + + e) Notwithstanding the terms of any Secondary License, no + Contributor makes additional grants to any Recipient (other than + those set forth in this Agreement) as a result of such Recipient's + receipt of the Program under the terms of a Secondary License + (if permitted under the terms of Section 3). + +3. REQUIREMENTS + +3.1 If a Contributor Distributes the Program in any form, then: + + a) the Program must also be made available as Source Code, in + accordance with section 3.2, and the Contributor must accompany + the Program with a statement that the Source Code for the Program + is available under this Agreement, and informs Recipients how to + obtain it in a reasonable manner on or through a medium customarily + used for software exchange; and + + b) the Contributor may Distribute the Program under a license + different than this Agreement, provided that such license: + i) effectively disclaims on behalf of all other Contributors all + warranties and conditions, express and implied, including + warranties or conditions of title and non-infringement, and + implied warranties or conditions of merchantability and fitness + for a particular purpose; + + ii) effectively excludes on behalf of all other Contributors all + liability for damages, including direct, indirect, special, + incidental and consequential damages, such as lost profits; + + iii) does not attempt to limit or alter the recipients' rights + in the Source Code under section 3.2; and + + iv) requires any subsequent distribution of the Program by any + party to be under a license that satisfies the requirements + of this section 3. + +3.2 When the Program is Distributed as Source Code: + + a) it must be made available under this Agreement, or if the + Program (i) is combined with other material in a separate file or + files made available under a Secondary License, and (ii) the initial + Contributor attached to the Source Code the notice described in + Exhibit A of this Agreement, then the Program may be made available + under the terms of such Secondary Licenses, and + + b) a copy of this Agreement must be included with each copy of + the Program. + +3.3 Contributors may not remove or alter any copyright, patent, +trademark, attribution notices, disclaimers of warranty, or limitations +of liability ("notices") contained within the Program from any copy of +the Program which they Distribute, provided that Contributors may add +their own appropriate notices. + +4. 
COMMERCIAL DISTRIBUTION + +Commercial distributors of software may accept certain responsibilities +with respect to end users, business partners and the like. While this +license is intended to facilitate the commercial use of the Program, +the Contributor who includes the Program in a commercial product +offering should do so in a manner which does not create potential +liability for other Contributors. Therefore, if a Contributor includes +the Program in a commercial product offering, such Contributor +("Commercial Contributor") hereby agrees to defend and indemnify every +other Contributor ("Indemnified Contributor") against any losses, +damages and costs (collectively "Losses") arising from claims, lawsuits +and other legal actions brought by a third party against the Indemnified +Contributor to the extent caused by the acts or omissions of such +Commercial Contributor in connection with its distribution of the Program +in a commercial product offering. The obligations in this section do not +apply to any claims or Losses relating to any actual or alleged +intellectual property infringement. In order to qualify, an Indemnified +Contributor must: a) promptly notify the Commercial Contributor in +writing of such claim, and b) allow the Commercial Contributor to control, +and cooperate with the Commercial Contributor in, the defense and any +related settlement negotiations. The Indemnified Contributor may +participate in any such claim at its own expense. + +For example, a Contributor might include the Program in a commercial +product offering, Product X. That Contributor is then a Commercial +Contributor. If that Commercial Contributor then makes performance +claims, or offers warranties related to Product X, those performance +claims and warranties are such Commercial Contributor's responsibility +alone. Under this section, the Commercial Contributor would have to +defend claims against the other Contributors related to those performance +claims and warranties, and if a court requires any other Contributor to +pay any damages as a result, the Commercial Contributor must pay +those damages. + +5. NO WARRANTY + +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, AND TO THE EXTENT +PERMITTED BY APPLICABLE LAW, THE PROGRAM IS PROVIDED ON AN "AS IS" +BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR +IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF +TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR +PURPOSE. Each Recipient is solely responsible for determining the +appropriateness of using and distributing the Program and assumes all +risks associated with its exercise of rights under this Agreement, +including but not limited to the risks and costs of program errors, +compliance with applicable laws, damage to or loss of data, programs +or equipment, and unavailability or interruption of operations. + +6. DISCLAIMER OF LIABILITY + +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, AND TO THE EXTENT +PERMITTED BY APPLICABLE LAW, NEITHER RECIPIENT NOR ANY CONTRIBUTORS +SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST +PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE +EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + +7. 
GENERAL + +If any provision of this Agreement is invalid or unenforceable under +applicable law, it shall not affect the validity or enforceability of +the remainder of the terms of this Agreement, and without further +action by the parties hereto, such provision shall be reformed to the +minimum extent necessary to make such provision valid and enforceable. + +If Recipient institutes patent litigation against any entity +(including a cross-claim or counterclaim in a lawsuit) alleging that the +Program itself (excluding combinations of the Program with other software +or hardware) infringes such Recipient's patent(s), then such Recipient's +rights granted under Section 2(b) shall terminate as of the date such +litigation is filed. + +All Recipient's rights under this Agreement shall terminate if it +fails to comply with any of the material terms or conditions of this +Agreement and does not cure such failure in a reasonable period of +time after becoming aware of such noncompliance. If all Recipient's +rights under this Agreement terminate, Recipient agrees to cease use +and distribution of the Program as soon as reasonably practicable. +However, Recipient's obligations under this Agreement and any licenses +granted by Recipient relating to the Program shall continue and survive. + +Everyone is permitted to copy and distribute copies of this Agreement, +but in order to avoid inconsistency the Agreement is copyrighted and +may only be modified in the following manner. The Agreement Steward +reserves the right to publish new versions (including revisions) of +this Agreement from time to time. No one other than the Agreement +Steward has the right to modify this Agreement. The Eclipse Foundation +is the initial Agreement Steward. The Eclipse Foundation may assign the +responsibility to serve as the Agreement Steward to a suitable separate +entity. Each new version of the Agreement will be given a distinguishing +version number. The Program (including Contributions) may always be +Distributed subject to the version of the Agreement under which it was +received. In addition, after a new version of the Agreement is published, +Contributor may elect to Distribute the Program (including its +Contributions) under the new version. + +Except as expressly stated in Sections 2(a) and 2(b) above, Recipient +receives no rights or licenses to the intellectual property of any +Contributor under this Agreement, whether expressly, by implication, +estoppel or otherwise. All rights in the Program not expressly granted +under this Agreement are reserved. Nothing in this Agreement is intended +to be enforceable by any entity that is not a Contributor or Recipient. +No third-party beneficiary rights are created under this Agreement. + +Exhibit A - Form of Secondary Licenses Notice + +"This Source Code may also be made available under the following +Secondary Licenses when the conditions for such availability set forth +in the Eclipse Public License, v. 2.0 are satisfied: GNU General Public +License as published by the Free Software Foundation, either version 2 +of the License, or (at your option) any later version, with the GNU +Classpath Exception which is available at +https://www.gnu.org/software/classpath/license.html." + + Simply including a copy of this Agreement, including this Exhibit A + is not sufficient to license the Source Code under Secondary Licenses. 
+ + If it is not possible or desirable to put the notice in a particular + file, then You may include the notice in a location (such as a LICENSE + file in a relevant directory) where a recipient would be likely to + look for such a notice. + + You may add additional accurate notices of copyright ownership. diff --git a/tests/jepsen.nukeeper/README.md b/tests/jepsen.nukeeper/README.md new file mode 100644 index 00000000000..f72409e080f --- /dev/null +++ b/tests/jepsen.nukeeper/README.md @@ -0,0 +1,22 @@ +# jepsen.nukeeper + +A Clojure library designed to ... well, that part is up to you. + +## Usage + +FIXME + +## License + +Copyright © 2021 FIXME + +This program and the accompanying materials are made available under the +terms of the Eclipse Public License 2.0 which is available at +http://www.eclipse.org/legal/epl-2.0. + +This Source Code may also be made available under the following Secondary +Licenses when the conditions for such availability set forth in the Eclipse +Public License, v. 2.0 are satisfied: GNU General Public License as published by +the Free Software Foundation, either version 2 of the License, or (at your +option) any later version, with the GNU Classpath Exception which is available +at https://www.gnu.org/software/classpath/license.html. diff --git a/tests/jepsen.nukeeper/doc/intro.md b/tests/jepsen.nukeeper/doc/intro.md new file mode 100644 index 00000000000..c6e5ccbd04a --- /dev/null +++ b/tests/jepsen.nukeeper/doc/intro.md @@ -0,0 +1,3 @@ +# Introduction to jepsen.nukeeper + +TODO: write [great documentation](http://jacobian.org/writing/what-to-write/) diff --git a/tests/jepsen.nukeeper/project.clj b/tests/jepsen.nukeeper/project.clj new file mode 100644 index 00000000000..e7150c9e5d4 --- /dev/null +++ b/tests/jepsen.nukeeper/project.clj @@ -0,0 +1,13 @@ +(defproject jepsen.nukeeper "0.1.0-SNAPSHOT" + :injections [(.. 
System (setProperty "zookeeper.request.timeout" "10000"))] + :description "A jepsen tests for ClickHouse NuKeeper" + :url "https://clickhouse.tech/" + :license {:name "EPL-2.0" + :url "https://www.eclipse.org/legal/epl-2.0/"} + :main jepsen.nukeeper.main + :plugins [[lein-cljfmt "0.7.0"]] + :dependencies [[org.clojure/clojure "1.10.1"] + [jepsen "0.2.3"] + [zookeeper-clj "0.9.4"] + [org.apache.zookeeper/zookeeper "3.6.1" :exclusions [org.slf4j/slf4j-log4j12]]] + :repl-options {:init-ns jepsen.nukeeper.main}) diff --git a/tests/jepsen.nukeeper/resources/config.xml b/tests/jepsen.nukeeper/resources/config.xml new file mode 120000 index 00000000000..c7596baa075 --- /dev/null +++ b/tests/jepsen.nukeeper/resources/config.xml @@ -0,0 +1 @@ +../../../programs/server/config.xml \ No newline at end of file diff --git a/tests/jepsen.nukeeper/resources/listen.xml b/tests/jepsen.nukeeper/resources/listen.xml new file mode 100644 index 00000000000..de8c737ff75 --- /dev/null +++ b/tests/jepsen.nukeeper/resources/listen.xml @@ -0,0 +1,3 @@ + + :: + diff --git a/tests/jepsen.nukeeper/resources/test_keeper_config.xml b/tests/jepsen.nukeeper/resources/test_keeper_config.xml new file mode 100644 index 00000000000..0e2a688ea0b --- /dev/null +++ b/tests/jepsen.nukeeper/resources/test_keeper_config.xml @@ -0,0 +1,33 @@ + + + 9181 + {id} + + + 10000 + 30000 + false + 60000 + trace + {quorum_reads} + + + + + 1 + {srv1} + 9444 + + + 2 + {srv2} + 9444 + + + 3 + {srv3} + 9444 + + + + diff --git a/tests/jepsen.nukeeper/resources/users.xml b/tests/jepsen.nukeeper/resources/users.xml new file mode 120000 index 00000000000..41b137a130f --- /dev/null +++ b/tests/jepsen.nukeeper/resources/users.xml @@ -0,0 +1 @@ +../../../programs/server/users.xml \ No newline at end of file diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj new file mode 100644 index 00000000000..8aa157bc16e --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -0,0 +1,143 @@ +(ns jepsen.nukeeper.main + (:require [clojure.tools.logging :refer :all] + [jepsen.nukeeper.utils :refer :all] + [jepsen.nukeeper.set :as set] + [jepsen.nukeeper.register :as register] + [clojure.string :as str] + [jepsen + [checker :as checker] + [cli :as cli] + [client :as client] + [control :as c] + [db :as db] + [nemesis :as nemesis] + [generator :as gen] + [independent :as independent] + [tests :as tests]] + [jepsen.control.util :as cu] + [jepsen.os.ubuntu :as ubuntu] + [jepsen.checker.timeline :as timeline] + [clojure.java.io :as io] + [knossos.model :as model] + [zookeeper.data :as data] + [zookeeper :as zk]) + (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) + +(def dir "/var/lib/clickhouse") +(def binary "clickhouse") +(def logdir "/var/log/clickhouse-server") +(def logfile "/var/log/clickhouse-server/stderr.log") +(def serverlog "/var/log/clickhouse-server/clickhouse-server.log") +(def pidfile (str dir "/clickhouse.pid")) +(def binary-path "/tmp") + +(defn cluster-config + [test node config-template] + (let [nodes (:nodes test)] + (clojure.string/replace + (clojure.string/replace + (clojure.string/replace + (clojure.string/replace + (clojure.string/replace config-template #"\{quorum_reads\}" (str (boolean (:quorum test)))) + #"\{srv1\}" (get nodes 0)) + #"\{srv2\}" (get nodes 1)) + #"\{srv3\}" (get nodes 2)) + #"\{id\}" (str (inc (.indexOf nodes node)))))) + +(defn db + [version] + (reify db/DB + (setup! 
[_ test node] + (info node "installing clickhouse" version) + (c/su + (if-not (cu/exists? (str binary-path "/clickhouse")) + (c/exec :sky :get :-d binary-path :-N :Backbone version)) + (c/exec :mkdir :-p logdir) + (c/exec :touch logfile) + (c/exec (str binary-path "/clickhouse") :install) + (c/exec :chown :-R :root dir) + (c/exec :chown :-R :root logdir) + (c/exec :echo (slurp (io/resource "listen.xml")) :> "/etc/clickhouse-server/config.d/listen.xml") + (c/exec :echo (cluster-config test node (slurp (io/resource "test_keeper_config.xml"))) :> "/etc/clickhouse-server/config.d/test_keeper_config.xml") + (cu/start-daemon! + {:pidfile pidfile + :logfile logfile + :chdir dir} + (str binary-path "/clickhouse") + :server + :--config "/etc/clickhouse-server/config.xml") + (Thread/sleep 10000))) + + (teardown! [_ test node] + (info node "tearing down clickhouse") + (cu/stop-daemon! (str binary-path "/clickhouse") pidfile) + (c/su + (c/exec :rm :-f (str binary-path "/clickhouse")) + (c/exec :rm :-rf dir) + (c/exec :rm :-rf logdir) + (c/exec :rm :-rf "/etc/clickhouse-server"))) + + db/LogFiles + (log-files [_ test node] + [logfile serverlog]))) + +(def workloads + "A map of workload names to functions that construct workloads, given opts." + {"set" set/workload + "register" register/workload}) + +(def cli-opts + "Additional command line options." + [["-w" "--workload NAME" "What workload should we run?" + :missing (str "--workload " (cli/one-of workloads)) + :validate [workloads (cli/one-of workloads)]] + ["-q" "--quorum" "Use quorum reads, instead of reading from any primary."] + ["-r" "--rate HZ" "Approximate number of requests per second, per thread." + :default 10 + :parse-fn read-string + :validate [#(and (number? %) (pos? %)) "Must be a positive number"]] + [nil "--ops-per-key NUM" "Maximum number of operations on any given key." + :default 100 + :parse-fn parse-long + :validate [pos? "Must be a positive integer."]]]) + +(defn nukeeper-test + "Given an options map from the command line runner (e.g. :nodes, :ssh, + :concurrency, ...), constructs a test map." + [opts] + (let [quorum (boolean (:quorum opts)) + workload ((get workloads (:workload opts)) opts)] + (merge tests/noop-test + opts + {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts))) + :os ubuntu/os + :db (db "rbtorrent:8831b5baa571abc28340cf66a9279a4ce45fac64") + :pure-generators true + :client (:client workload) + :nemesis (nemesis/partition-random-halves) + :checker (checker/compose + {:perf (checker/perf) + :workload (:checker workload)}) + :generator (gen/phases + (->> (:generator workload) + (gen/stagger (/ (:rate opts))) + (gen/nemesis + (cycle [(gen/sleep 5) + {:type :info, :f :start} + (gen/sleep 5) + {:type :info, :f :stop}])) + (gen/time-limit (:time-limit opts))) + (gen/log "Healing cluster") + (gen/nemesis (gen/once {:type :info, :f :stop})) + (gen/log "Waiting for recovery") + (gen/sleep 10) + (gen/clients (:final-generator workload)))}))) + +(defn -main + "Handles command line arguments. Can either run a test, or a web server for + browsing results." + [& args] + (cli/run! 
(merge (cli/single-test-cmd {:test-fn nukeeper-test + :opt-spec cli-opts}) + (cli/serve-cmd)) + args)) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/register.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/register.clj new file mode 100644 index 00000000000..98322845346 --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/register.clj @@ -0,0 +1,64 @@ +(ns jepsen.nukeeper.register + (:require [jepsen + [checker :as checker] + [client :as client] + [independent :as independent] + [generator :as gen]] + [jepsen.checker.timeline :as timeline] + [knossos.model :as model] + [jepsen.nukeeper.utils :refer :all] + [zookeeper :as zk]) + (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) + +(defn r [_ _] {:type :invoke, :f :read, :value nil}) +(defn w [_ _] {:type :invoke, :f :write, :value (rand-int 5)}) +(defn cas [_ _] {:type :invoke, :f :cas, :value [(rand-int 5) (rand-int 5)]}) + +(defrecord RegisterClient [conn] + client/Client + (open! [this test node] + (assoc this :conn (zk-connect node 9181 30000))) + + (setup! [this test] + (zk-create-range conn 300)) ; 300 nodes to be sure + + (invoke! [_ test op] + (let [[k v] (:value op) + zk-k (zk-path k)] + (case (:f op) + :read (try + (assoc op :type :ok, :value (independent/tuple k (parse-long (:data (zk-get-str conn zk-k))))) + (catch Exception _ (assoc op :type :fail, :error :connect-error))) + :write (try + (do (zk-set conn zk-k v) + (assoc op :type :ok)) + (catch Exception _ (assoc op :type :info, :error :connect-error))) + :cas (try + (let [[old new] v] + (assoc op :type (if (zk-cas conn zk-k old new) + :ok + :fail))) + (catch KeeperException$BadVersionException _ (assoc op :type :fail, :error :bad-version)) + (catch Exception _ (assoc op :type :info, :error :connect-error)))))) + + (teardown! [this test]) + + (close! [_ test] + (zk/close conn))) + +(defn workload + "Tests linearizable reads, writes, and compare-and-set operations on + independent keys." + [opts] + {:client (RegisterClient. nil) + :checker (independent/checker + (checker/compose + {:linear (checker/linearizable {:model (model/cas-register) + :algorithm :linear}) + :timeline (timeline/html)})) + :generator (independent/concurrent-generator + 10 + (range) + (fn [k] + (->> (gen/mix [r w cas]) + (gen/limit (:ops-per-key opts)))))}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj new file mode 100644 index 00000000000..7e196fab4c7 --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -0,0 +1,43 @@ +(ns jepsen.nukeeper.set + (:require [jepsen + [checker :as checker] + [client :as client] + [generator :as gen]] + [jepsen.nukeeper.utils :refer :all] + [zookeeper :as zk]) + (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) + +(defrecord SetClient [k conn] + client/Client + (open! [this test node] + (assoc this :conn (zk-connect node 9181 30000))) + + (setup! [this test] + (zk-create-if-not-exists conn k "#{}")) + + (invoke! [_ test op] + (case (:f op) + :read ;(try + (assoc op + :type :ok + :value (read-string (:data (zk-get-str conn k)))) + ;(catch Exception _ (assoc op :type :fail, :error :connect-error))) + :add (try + (do + (zk-add-to-set conn k (:value op)) + (assoc op :type :ok)) + (catch KeeperException$BadVersionException _ (assoc op :type :fail, :error :bad-version)) + (catch Exception _ (assoc op :type :info, :error :connect-error))))) + + (teardown! [_ test]) + + (close! 
[_ test])) + +(defn workload + "A generator, client, and checker for a set test." + [opts] + {:client (SetClient. "/a-set" nil) + :checker (checker/set) + :generator (->> (range) + (map (fn [x] {:type :invoke, :f :add, :value x}))) + :final-generator (gen/once {:type :invoke, :f :read, :value nil})}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj new file mode 100644 index 00000000000..3caec8e5f62 --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -0,0 +1,56 @@ +(ns jepsen.nukeeper.utils + (:require [clojure.string :as str] + [zookeeper.data :as data] + [zookeeper :as zk])) + +(defn parse-long + "Parses a string to a Long. Passes through `nil` and empty strings." + [s] + (if (and s (> (count s) 0)) + (Long/parseLong s))) + +(defn zk-range + [] + (map (fn [v] (str "/" v)) (range))) + +(defn zk-path + [n] + (str "/" n)) + +(defn zk-connect + [host port timeout] + (zk/connect (str host ":" port) :timeout-msec timeout)) + +(defn zk-create-range + [conn n] + (dorun (map (fn [v] (zk/create-all conn v :persistent? true)) (take n (zk-range))))) + +(defn zk-set + ([conn path value] + (zk/set-data conn path (data/to-bytes (str value)) -1)) + ([conn path value version] + (zk/set-data conn path (data/to-bytes (str value)) version))) + +(defn zk-get-str + [conn path] + (let [zk-result (zk/data conn path)] + {:data (data/to-string (:data zk-result)) + :stat (:stat zk-result)})) + +(defn zk-cas + [conn path old-value new-value] + (let [current-value (zk-get-str conn path)] + (if (= (parse-long (:data current-value)) old-value) + (do (zk-set conn path new-value (:version (:stat current-value))) + true)))) + +(defn zk-add-to-set + [conn path elem] + (let [current-value (zk-get-str conn path) + current-set (read-string (:data current-value)) + new-set (conj current-set elem)] + (zk-set conn path (pr-str new-set) (:version (:stat current-value))))) + +(defn zk-create-if-not-exists + [conn path data] + (zk/create conn path :data (data/to-bytes (str data)))) diff --git a/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj b/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj new file mode 100644 index 00000000000..824aa40d2c8 --- /dev/null +++ b/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj @@ -0,0 +1,28 @@ +(ns jepsen.nukeeper-test + (:require [clojure.test :refer :all] + [jepsen.nukeeper.utils :refer :all] + [zookeeper :as zk] + [zookeeper.data :as data])) + +(defn multicreate + [conn] + (dorun (map (fn [v] (zk/create conn v :persistent? 
true)) (take 10 (zk-range))))) + +(defn multidelete + [conn] + (dorun (map (fn [v] (zk/delete conn v)) (take 10 (zk-range))))) + +(deftest a-test + (testing "nukeeper connection" + (let [conn (zk/connect "localhost:9181" :timeout-msec 5000)] + (println (take 10 (zk-range))) + (multidelete conn) + (multicreate conn) + (zk/create-all conn "/0") + (zk/create conn "/0") + (println (zk/children conn "/")) + (zk/set-data conn "/0" (data/to-bytes "777") -1) + (Thread/sleep 5000) + (println "VALUE" (data/to-string (:data (zk/data conn "/0")))) + (is (= (data/to-string (:data (zk/data conn "/0"))) "777")) + (zk/close conn)))) From ce20eae2a3efd4e649bcb598c71d403f83463deb Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 10 Mar 2021 01:58:19 +0300 Subject: [PATCH 010/260] Added specialized CacheDictionaryStorage --- src/Common/PODArray.h | 24 + src/Common/tests/gtest_pod_array.cpp | 54 ++ src/Dictionaries/CacheDictionaryStorage.h | 474 ++++++++++++++---- src/Dictionaries/SSDCacheDictionaryStorage.h | 3 - .../SerializedCacheDictionaryStorage.h | 412 +++++++++++++++ src/Dictionaries/benchmark | 154 ------ .../registerCacheDictionaries.cpp | 62 ++- 7 files changed, 908 insertions(+), 275 deletions(-) create mode 100644 src/Dictionaries/SerializedCacheDictionaryStorage.h delete mode 100644 src/Dictionaries/benchmark diff --git a/src/Common/PODArray.h b/src/Common/PODArray.h index 163a6503d2e..57ad3d46177 100644 --- a/src/Common/PODArray.h +++ b/src/Common/PODArray.h @@ -692,6 +692,30 @@ public: assign(from.begin(), from.end()); } + void erase(const_iterator first, const_iterator last) + { + iterator first_no_const = const_cast(first); + iterator last_no_const = const_cast(last); + + size_t items_to_move = end() - last; + + while (items_to_move != 0) + { + *first_no_const = *last_no_const; + + ++first_no_const; + ++last_no_const; + + --items_to_move; + } + + this->c_end = reinterpret_cast(first_no_const); + } + + void erase(const_iterator pos) + { + this->erase(pos, pos + 1); + } bool operator== (const PODArray & rhs) const { diff --git a/src/Common/tests/gtest_pod_array.cpp b/src/Common/tests/gtest_pod_array.cpp index 53b3e207a22..63cf7026757 100644 --- a/src/Common/tests/gtest_pod_array.cpp +++ b/src/Common/tests/gtest_pod_array.cpp @@ -92,3 +92,57 @@ TEST(Common, PODInsertElementSizeNotMultipleOfLeftPadding) EXPECT_EQ(arr1_initially_empty.size(), items_to_insert_size); } + +TEST(Common, PODErase) +{ + { + PaddedPODArray items {0,1,2,3,4,5,6,7,8,9}; + PaddedPODArray expected; + expected = {0,1,2,3,4,5,6,7,8,9}; + + items.erase(items.begin(), items.begin()); + EXPECT_EQ(items, expected); + + items.erase(items.end(), items.end()); + EXPECT_EQ(items, expected); + } + { + PaddedPODArray actual {0,1,2,3,4,5,6,7,8,9}; + PaddedPODArray expected; + + expected = {0,1,4,5,6,7,8,9}; + actual.erase(actual.begin() + 2, actual.begin() + 4); + EXPECT_EQ(actual, expected); + + expected = {0,1,4}; + actual.erase(actual.begin() + 3, actual.end()); + EXPECT_EQ(actual, expected); + + expected = {}; + actual.erase(actual.begin(), actual.end()); + EXPECT_EQ(actual, expected); + + for (size_t i = 0; i < 10; ++i) + actual.emplace_back(static_cast(i)); + + expected = {0,1,4,5,6,7,8,9}; + actual.erase(actual.begin() + 2, actual.begin() + 4); + EXPECT_EQ(actual, expected); + + expected = {0,1,4}; + actual.erase(actual.begin() + 3, actual.end()); + EXPECT_EQ(actual, expected); + + expected = {}; + actual.erase(actual.begin(), actual.end()); + EXPECT_EQ(actual, expected); + } + { + PaddedPODArray actual {0,1,2,3,4,5,6,7,8,9}; 
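+        /// Covers the single-element erase(pos) overload, which is implemented as erase(pos, pos + 1).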
+ PaddedPODArray expected; + + expected = {1,2,3,4,5,6,7,8,9}; + actual.erase(actual.begin()); + EXPECT_EQ(actual, expected); + } +} diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index cf0b74e8bd2..2b34b13fa6f 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include @@ -30,16 +31,7 @@ struct CacheDictionaryStorageConfiguration const DictionaryLifetime lifetime; }; -/** Keys are stored in LRUCache and column values are serialized into arena. - - Cell in LRUCache consists of allocated size and place in arena were columns serialized data is stored. - - Columns are serialized by rows. - - When cell is removed from LRUCache data associated with it is also removed from arena. - - In case of complex key we also store key data in arena and it is removed from arena. -*/ +/// TODO: Add documentation template class CacheDictionaryStorage final : public ICacheDictionaryStorage { @@ -47,11 +39,36 @@ public: using KeyType = std::conditional_t; static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionaryStorage"); - explicit CacheDictionaryStorage(CacheDictionaryStorageConfiguration & configuration_) + explicit CacheDictionaryStorage( + const DictionaryStructure & dictionary_structure, + CacheDictionaryStorageConfiguration & configuration_) : configuration(configuration_) , rnd_engine(randomSeed()) - , cache(configuration.max_size_in_cells, false, { arena }) + , cache(configuration.max_size_in_cells, false, { *this }) { + for (const auto & dictionary_attribute : dictionary_structure.attributes) + { + auto attribute_type = dictionary_attribute.underlying_type; + + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + + attributes.emplace_back(); + auto & last_attribute = attributes.back(); + last_attribute.type = attribute_type; + last_attribute.is_complex_type = dictionary_attribute.is_nullable || dictionary_attribute.is_array; + + if (dictionary_attribute.is_nullable) + last_attribute.attribute_container = std::vector(); + else + last_attribute.attribute_container = PaddedPODArray(); + }; + + callOnDictionaryAttributeType(attribute_type, type_call); + } } bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return true; } @@ -144,10 +161,36 @@ public: size_t getMaxSize() const override { return cache.getMaxSize(); } - size_t getBytesAllocated() const override { return arena.size() + cache.getSizeInBytes(); } + size_t getBytesAllocated() const override + { + size_t attributes_size_in_bytes = 0; + size_t attributes_size = attributes.size(); + + for (size_t attribute_index = 0; attribute_index < attributes_size; ++attribute_index) + { + getAttributeContainer(attribute_index, [&](const auto & container) + { + attributes_size_in_bytes += container.capacity() * sizeof(container[0]); + }); + } + + return arena.size() + cache.getSizeInBytes(); + } private: + struct FetchedKey + { + FetchedKey(size_t element_index_, bool is_default_) + : element_index(element_index_) + , is_default(is_default_) + {} + + const size_t element_index; + const bool is_default; + }; + + template ALWAYS_INLINE KeysStorageFetchResult fetchColumnsForKeysImpl( const PaddedPODArray & keys, @@ -161,10 +204,12 @@ private: const auto now = std::chrono::system_clock::now(); size_t 
fetched_columns_index = 0; + size_t keys_size = keys.size(); std::chrono::seconds max_lifetime_seconds(configuration.strict_max_lifetime_seconds); - size_t keys_size = keys.size(); + PaddedPODArray fetched_keys; + fetched_keys.reserve(keys_size); for (size_t key_index = 0; key_index < keys_size; ++key_index) { @@ -195,19 +240,14 @@ private: ++result.found_keys_size; } - ++fetched_columns_index; - - if (cell.isDefault()) + if (cell.is_default) { result.key_index_to_state[key_index].setDefault(); ++result.default_keys_size; - insertDefaultValuesIntoColumns(result.fetched_columns, fetch_request, key_index); - } - else - { - const char * place_for_serialized_columns = cell.place_for_serialized_columns; - deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, place_for_serialized_columns); } + + fetched_keys.emplace_back(cell.element_index, cell.is_default); + ++fetched_columns_index; } else { @@ -216,64 +256,166 @@ private: } } + for (size_t attribute_index = 0; attribute_index < fetch_request.attributesSize(); ++attribute_index) + { + if (!fetch_request.shouldFillResultColumnWithIndex(attribute_index)) + continue; + + size_t fetched_keys_size = fetched_keys.size(); + auto & attribute = attributes[attribute_index]; + const auto & default_value_provider = fetch_request.defaultValueProviderAtIndex(attribute_index); + auto & fetched_column = *result.fetched_columns[attribute_index]; + fetched_column.reserve(fetched_keys_size); + + if (unlikely(attribute.is_complex_type)) + { + auto & container = std::get>(attribute.attribute_container); + + for (size_t fetched_key_index = 0; fetched_key_index < fetched_keys.size(); ++fetched_key_index) + { + auto fetched_key = fetched_keys[fetched_key_index]; + + if (fetched_key.is_default) + fetched_column.insert(default_value_provider.getDefaultValue(fetched_key_index)); + else + fetched_column.insert(container[fetched_key.element_index]); + } + } + else + { + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + using ColumnType = + std::conditional_t, ColumnString, + std::conditional_t, ColumnDecimal, + ColumnVector>>; + + auto & container = std::get>(attribute.attribute_container); + ColumnType & column_typed = static_cast(fetched_column); + + if constexpr (std::is_same_v) + { + for (size_t fetched_key_index = 0; fetched_key_index < fetched_keys.size(); ++fetched_key_index) + { + auto fetched_key = fetched_keys[fetched_key_index]; + + if (fetched_key.is_default) + column_typed.insert(default_value_provider.getDefaultValue(fetched_key_index)); + else + { + auto item = container[fetched_key.element_index]; + column_typed.insertData(item.data, item.size); + } + } + } + else + { + for (size_t fetched_key_index = 0; fetched_key_index < fetched_keys.size(); ++fetched_key_index) + { + auto fetched_key = fetched_keys[fetched_key_index]; + auto & data = column_typed.getData(); + + if (fetched_key.is_default) + column_typed.insert(default_value_provider.getDefaultValue(fetched_key_index)); + else + { + auto item = container[fetched_key.element_index]; + data.push_back(item); + } + } + } + }; + + callOnDictionaryAttributeType(attribute.type, type_call); + } + } + return result; } void insertColumnsForKeysImpl(const PaddedPODArray & keys, Columns columns) { - Arena temporary_values_pool; - - size_t columns_to_serialize_size = columns.size(); - PaddedPODArray temporary_column_data(columns_to_serialize_size); - const auto 
now = std::chrono::system_clock::now(); - size_t keys_size = keys.size(); for (size_t key_index = 0; key_index < keys_size; ++key_index) { - size_t allocated_size_for_columns = 0; - const char * block_start = nullptr; - auto key = keys[key_index]; - auto * it = cache.find(key); + cache.erase(key); - for (size_t column_index = 0; column_index < columns_to_serialize_size; ++column_index) + Cell cell; + + setCellDeadline(cell, now); + cell.element_index = insert_index; + cell.is_default = false; + + ++insert_index; + + insertCellInCache(key, cell); + } + + Field complex_column_value; + + for (size_t column_index = 0; column_index < columns.size(); ++column_index) + { + auto & attribute = attributes[column_index]; + const auto & column = columns[column_index]; + size_t column_size = column->size(); + + if (unlikely(attribute.is_complex_type)) { - auto & column = columns[column_index]; - temporary_column_data[column_index] = column->serializeValueIntoArena(key_index, temporary_values_pool, block_start); - allocated_size_for_columns += temporary_column_data[column_index].size; - } + auto & container = std::get>(attribute.attribute_container); + container.reserve(column_size); - char * place_for_serialized_columns = arena.alloc(allocated_size_for_columns); - memcpy(reinterpret_cast(place_for_serialized_columns), reinterpret_cast(block_start), allocated_size_for_columns); - - if (it) - { - /// Cell exists need to free previous serialized place and update deadline - auto & cell = it->getMapped(); - - if (cell.place_for_serialized_columns) - arena.free(cell.place_for_serialized_columns, cell.allocated_size_for_columns); - - setCellDeadline(cell, now); - cell.allocated_size_for_columns = allocated_size_for_columns; - cell.place_for_serialized_columns = place_for_serialized_columns; + for (size_t item_index = 0; item_index < column_size; ++item_index) + { + column->get(item_index, complex_column_value); + container.emplace_back(complex_column_value); + } } else { - /// No cell exists so create and put in cache - Cell cell; + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + using ColumnType = + std::conditional_t, ColumnString, + std::conditional_t, ColumnDecimal, + ColumnVector>>; - setCellDeadline(cell, now); - cell.allocated_size_for_columns = allocated_size_for_columns; - cell.place_for_serialized_columns = place_for_serialized_columns; + const ColumnType & column_typed = static_cast(*column); - insertCellInCache(key, cell); + auto & container = std::get>(attribute.attribute_container); + container.reserve(column_size); + + if constexpr (std::is_same_v) + { + /// TODO: Serialize while column string in arena then just insert offsets in container + for (size_t item_index = 0; item_index < column_size; ++item_index) + { + StringRef value = column->getDataAt(item_index); + StringRef updated_data = copyStringInArena(value); + + container.emplace_back(updated_data); + } + } + else + { + const auto & data = column_typed.getData(); + container.insert(data.begin(), data.end()); + } + }; + + callOnDictionaryAttributeType(attribute.type, type_call); } - - temporary_values_pool.rollback(allocated_size_for_columns); } + + deleteUnusedKeysIfNecessary(); } void insertDefaultKeysImpl(const PaddedPODArray & keys) @@ -282,31 +424,18 @@ private: for (auto key : keys) { - auto * it = cache.find(key); + cache.erase(key); - if (it) - { - auto & cell = it->getMapped(); + Cell cell; 
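+            /// A default cell stores no attribute values: element_index stays 0 and is_default marks the key as defaulted.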
- setCellDeadline(cell, now); + setCellDeadline(cell, now); + cell.element_index = 0; + cell.is_default = true; - if (cell.place_for_serialized_columns) - arena.free(cell.place_for_serialized_columns, cell.allocated_size_for_columns); - - cell.allocated_size_for_columns = 0; - cell.place_for_serialized_columns = nullptr; - } - else - { - Cell cell; - - setCellDeadline(cell, now); - cell.allocated_size_for_columns = 0; - cell.place_for_serialized_columns = nullptr; - - insertCellInCache(key, cell); - } + insertCellInCache(key, cell); } + + deleteUnusedKeysIfNecessary(); } PaddedPODArray getCachedKeysImpl() const @@ -318,7 +447,7 @@ private: { auto & cell = node.getMapped(); - if (cell.isDefault()) + if (cell.is_default) continue; result.emplace_back(node.getKey()); @@ -327,37 +456,138 @@ private: return result; } + void deleteUnusedKeysIfNecessary() + { + size_t cache_max_size = cache.getMaxSize(); + + if (unlikely(attributes.empty()) || insert_index * 2 < cache_max_size) + return; + + std::unordered_map element_index_to_cache_iterator; + + for (auto begin = cache.begin(); begin != cache.end(); ++begin) + { + auto & node = *begin; + auto & cell = node.getMapped(); + size_t element_index = cell.element_index; + element_index_to_cache_iterator.insert(std::make_pair(element_index, begin)); + } + + size_t last_remove_index = 0; + + getAttributeContainer(0, [&, this](auto & container) + { + size_t container_size = container.size(); + size_t remove_index = 0; + + for (size_t i = 0; i < container_size; ++i) + { + if (indexes_to_delete.contains(i)) + continue; + + std::swap(container[remove_index], container[i]); + + auto it = element_index_to_cache_iterator.find(remove_index); + if (it != element_index_to_cache_iterator.end()) + { + auto & cell = it->second->getMapped(); + cell.element_index = remove_index; + } + + ++remove_index; + } + + container.erase(container.begin() + remove_index, container.end()); + last_remove_index = remove_index; + }); + + insert_index = last_remove_index; + + for (size_t attribute_index = 1; attribute_index < attributes.size(); ++attribute_index) + { + getAttributeContainer(attribute_index, [this](auto & container) + { + size_t container_size = container.size(); + size_t remove_index = 0; + + for (size_t i = 0; i < container_size; ++i) + { + if (indexes_to_delete.contains(i)) + continue; + + std::swap(container[remove_index], container[i]); + ++remove_index; + } + + container.erase(container.begin() + remove_index, container.end()); + }); + } + + indexes_to_delete.clear(); + } + + template + void getAttributeContainer(size_t attribute_index, GetContainerFunc && func) + { + auto & attribute = attributes[attribute_index]; + auto & attribute_type = attribute.type; + + if (unlikely(attribute.is_complex_type)) + { + auto & container = std::get>(attribute.attribute_container); + std::forward(func)(container); + } + else + { + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + + auto & container = std::get>(attribute.attribute_container); + std::forward(func)(container); + }; + + callOnDictionaryAttributeType(attribute_type, type_call); + } + } + + template + void getAttributeContainer(size_t attribute_index, GetContainerFunc && func) const + { + return const_cast *>(this)->template getAttributeContainer(attribute_index, std::forward(func)); + } + + using TimePoint = std::chrono::system_clock::time_point; struct Cell { TimePoint deadline; 
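+        /// Instead of a pointer to serialized column data in the arena, the cell now keeps an index into the per-attribute containers.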
- size_t allocated_size_for_columns; - char * place_for_serialized_columns; - - inline bool isDefault() const { return place_for_serialized_columns == nullptr; } - inline void setDefault() - { - place_for_serialized_columns = nullptr; - allocated_size_for_columns = 0; - } + size_t element_index; + bool is_default; }; void insertCellInCache(KeyType & key, const Cell & cell) { + /// Copy complex key into arena and put in cache if constexpr (dictionary_key_type == DictionaryKeyType::complex) - { - /// Copy complex key into arena and put in cache - size_t key_size = key.size; - char * place_for_key = arena.alloc(key_size); - memcpy(reinterpret_cast(place_for_key), reinterpret_cast(key.data), key_size); - KeyType updated_key{place_for_key, key_size}; - key = updated_key; - } + key = copyStringInArena(key); cache.insert(key, cell); } + StringRef copyStringInArena(StringRef value_to_copy) + { + size_t value_to_copy_size = value_to_copy.size; + char * place_for_key = arena.alloc(value_to_copy_size); + memcpy(reinterpret_cast(place_for_key), reinterpret_cast(value_to_copy.data), value_to_copy_size); + StringRef updated_value{place_for_key, value_to_copy_size}; + + return updated_value; + } + inline static bool cellHasDeadline(const Cell & cell) { return cell.deadline != std::chrono::system_clock::from_time_t(0); @@ -378,34 +608,58 @@ private: cell.deadline = now + std::chrono::seconds(distribution(rnd_engine)); } - template - friend class ArenaCellDisposer; - CacheDictionaryStorageConfiguration configuration; ArenaWithFreeLists arena; pcg64 rnd_engine; - class ArenaCellDisposer + struct Attribute + { + AttributeUnderlyingType type; + bool is_complex_type; + + std::variant< + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + std::vector> attribute_container; + }; + + std::vector attributes; + size_t insert_index = 0; + std::unordered_set> indexes_to_delete; + + class CacheStorageCellDisposer { public: - ArenaWithFreeLists & arena; + CacheDictionaryStorage & storage; template - void operator()(const Key & key, const Value & value) const + void operator()(const Key & key, const Value & cell) const { /// In case of complex key we keep it in arena if constexpr (std::is_same_v) - arena.free(const_cast(key.data), key.size); + storage.arena.free(const_cast(key.data), key.size); - if (value.place_for_serialized_columns) - arena.free(value.place_for_serialized_columns, value.allocated_size_for_columns); + storage.indexes_to_delete.insert(cell.element_index); } }; - using SimpleKeyLRUHashMap = LRUHashMap; - using ComplexKeyLRUHashMap = LRUHashMapWithSavedHash; + using SimpleKeyLRUHashMap = LRUHashMap; + using ComplexKeyLRUHashMap = LRUHashMapWithSavedHash; using CacheLRUHashMap = std::conditional_t< dictionary_key_type == DictionaryKeyType::simple, diff --git a/src/Dictionaries/SSDCacheDictionaryStorage.h b/src/Dictionaries/SSDCacheDictionaryStorage.h index 16a8954de58..e061b783ee4 100644 --- a/src/Dictionaries/SSDCacheDictionaryStorage.h +++ b/src/Dictionaries/SSDCacheDictionaryStorage.h @@ -1316,9 +1316,6 @@ private: cell.deadline = now + std::chrono::seconds{distribution(rnd_engine)}; } - template - friend class ArenaCellKeyDisposer; - SSDCacheDictionaryStorageConfiguration configuration; SSDCacheFileBuffer file_buffer; diff --git a/src/Dictionaries/SerializedCacheDictionaryStorage.h 
b/src/Dictionaries/SerializedCacheDictionaryStorage.h new file mode 100644 index 00000000000..2616e03763c --- /dev/null +++ b/src/Dictionaries/SerializedCacheDictionaryStorage.h @@ -0,0 +1,412 @@ +#pragma once + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +struct SerializedCacheDictionaryStorageConfiguration +{ + /// Max size of storage in cells + const size_t max_size_in_cells; + /// Needed to perform check if cell is expired or not found. Default value is dictionary max lifetime. + const size_t strict_max_lifetime_seconds; + /// Lifetime of dictionary. Cell deadline is random value between lifetime min and max seconds. + const DictionaryLifetime lifetime; +}; + +/** Keys are stored in LRUCache and column values are serialized into arena. + + Cell in LRUCache consists of allocated size and place in arena were columns serialized data is stored. + + Columns are serialized by rows. + + When cell is removed from LRUCache data associated with it is also removed from arena. + + In case of complex key we also store key data in arena and it is removed from arena. +*/ +/// TODO: Remove +template +class SerializedCacheDictionaryStorage final : public ICacheDictionaryStorage +{ +public: + using KeyType = std::conditional_t; + static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionaryStorage"); + + explicit SerializedCacheDictionaryStorage(SerializedCacheDictionaryStorageConfiguration & configuration_) + : configuration(configuration_) + , rnd_engine(randomSeed()) + , cache(configuration.max_size_in_cells, false, { arena }) + { + } + + bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return true; } + + String getName() const override + { + if (dictionary_key_type == DictionaryKeyType::simple) + return "SerializedCache"; + else + return "ComplexKeySerializedCache"; + } + + bool supportsSimpleKeys() const override { return dictionary_key_type == DictionaryKeyType::simple; } + + SimpleKeysStorageFetchResult fetchColumnsForKeys( + const PaddedPODArray & keys, + const DictionaryStorageFetchRequest & fetch_request) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + return fetchColumnsForKeysImpl(keys, fetch_request); + else + throw Exception("Method fetchColumnsForKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED); + } + + void insertColumnsForKeys(const PaddedPODArray & keys, Columns columns) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + insertColumnsForKeysImpl(keys, columns); + else + throw Exception("Method insertColumnsForKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED); + } + + void insertDefaultKeys(const PaddedPODArray & keys) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + insertDefaultKeysImpl(keys); + else + throw Exception("Method insertDefaultKeysImpl is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED); + } + + PaddedPODArray getCachedSimpleKeys() const override + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + return getCachedKeysImpl(); + else + throw Exception("Method getCachedSimpleKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED); + } + + bool supportsComplexKeys() const override { return dictionary_key_type == DictionaryKeyType::complex; } + + 
ComplexKeysStorageFetchResult fetchColumnsForKeys( + const PaddedPODArray & keys, + const DictionaryStorageFetchRequest & column_fetch_requests) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + return fetchColumnsForKeysImpl(keys, column_fetch_requests); + else + throw Exception("Method fetchColumnsForKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); + } + + void insertColumnsForKeys(const PaddedPODArray & keys, Columns columns) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + insertColumnsForKeysImpl(keys, columns); + else + throw Exception("Method insertColumnsForKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); + } + + void insertDefaultKeys(const PaddedPODArray & keys) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + insertDefaultKeysImpl(keys); + else + throw Exception("Method insertDefaultKeysImpl is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); + } + + PaddedPODArray getCachedComplexKeys() const override + { + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + return getCachedKeysImpl(); + else + throw Exception("Method getCachedComplexKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); + } + + size_t getSize() const override { return cache.size(); } + + size_t getMaxSize() const override { return cache.getMaxSize(); } + + size_t getBytesAllocated() const override { return arena.size() + cache.getSizeInBytes(); } + +private: + + template + ALWAYS_INLINE KeysStorageFetchResult fetchColumnsForKeysImpl( + const PaddedPODArray & keys, + const DictionaryStorageFetchRequest & fetch_request) + { + KeysStorageFetchResult result; + + result.fetched_columns = fetch_request.makeAttributesResultColumns(); + result.key_index_to_state.resize_fill(keys.size(), {KeyState::not_found}); + + const auto now = std::chrono::system_clock::now(); + + size_t fetched_columns_index = 0; + + std::chrono::seconds max_lifetime_seconds(configuration.strict_max_lifetime_seconds); + + size_t keys_size = keys.size(); + + for (size_t key_index = 0; key_index < keys_size; ++key_index) + { + auto key = keys[key_index]; + auto * it = cache.find(key); + + if (it) + { + /// Columns values for key are serialized in cache now deserialize them + const auto & cell = it->getMapped(); + + bool has_deadline = cellHasDeadline(cell); + + if (has_deadline && now > cell.deadline + max_lifetime_seconds) + { + result.key_index_to_state[key_index] = {KeyState::not_found}; + ++result.not_found_keys_size; + continue; + } + else if (has_deadline && now > cell.deadline) + { + result.key_index_to_state[key_index] = {KeyState::expired, fetched_columns_index}; + ++result.expired_keys_size; + } + else + { + result.key_index_to_state[key_index] = {KeyState::found, fetched_columns_index}; + ++result.found_keys_size; + } + + ++fetched_columns_index; + + if (cell.isDefault()) + { + result.key_index_to_state[key_index].setDefault(); + ++result.default_keys_size; + insertDefaultValuesIntoColumns(result.fetched_columns, fetch_request, key_index); + } + else + { + const char * place_for_serialized_columns = cell.place_for_serialized_columns; + deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, place_for_serialized_columns); + } + } + else + { + result.key_index_to_state[key_index] = {KeyState::not_found}; + ++result.not_found_keys_size; + } + } + + return result; + } + + void insertColumnsForKeysImpl(const 
PaddedPODArray & keys, Columns columns) + { + Arena temporary_values_pool; + + size_t columns_to_serialize_size = columns.size(); + PaddedPODArray temporary_column_data(columns_to_serialize_size); + + const auto now = std::chrono::system_clock::now(); + + size_t keys_size = keys.size(); + + for (size_t key_index = 0; key_index < keys_size; ++key_index) + { + size_t allocated_size_for_columns = 0; + const char * block_start = nullptr; + + auto key = keys[key_index]; + auto * it = cache.find(key); + + for (size_t column_index = 0; column_index < columns_to_serialize_size; ++column_index) + { + auto & column = columns[column_index]; + temporary_column_data[column_index] = column->serializeValueIntoArena(key_index, temporary_values_pool, block_start); + allocated_size_for_columns += temporary_column_data[column_index].size; + } + + char * place_for_serialized_columns = arena.alloc(allocated_size_for_columns); + memcpy(reinterpret_cast(place_for_serialized_columns), reinterpret_cast(block_start), allocated_size_for_columns); + + if (it) + { + /// Cell exists need to free previous serialized place and update deadline + auto & cell = it->getMapped(); + + if (cell.place_for_serialized_columns) + arena.free(cell.place_for_serialized_columns, cell.allocated_size_for_columns); + + setCellDeadline(cell, now); + cell.allocated_size_for_columns = allocated_size_for_columns; + cell.place_for_serialized_columns = place_for_serialized_columns; + } + else + { + /// No cell exists so create and put in cache + Cell cell; + + setCellDeadline(cell, now); + cell.allocated_size_for_columns = allocated_size_for_columns; + cell.place_for_serialized_columns = place_for_serialized_columns; + + insertCellInCache(key, cell); + } + + temporary_values_pool.rollback(allocated_size_for_columns); + } + } + + void insertDefaultKeysImpl(const PaddedPODArray & keys) + { + const auto now = std::chrono::system_clock::now(); + + for (auto key : keys) + { + auto * it = cache.find(key); + + if (it) + { + auto & cell = it->getMapped(); + + setCellDeadline(cell, now); + + if (cell.place_for_serialized_columns) + arena.free(cell.place_for_serialized_columns, cell.allocated_size_for_columns); + + cell.allocated_size_for_columns = 0; + cell.place_for_serialized_columns = nullptr; + } + else + { + Cell cell; + + setCellDeadline(cell, now); + cell.allocated_size_for_columns = 0; + cell.place_for_serialized_columns = nullptr; + + insertCellInCache(key, cell); + } + } + } + + PaddedPODArray getCachedKeysImpl() const + { + PaddedPODArray result; + result.reserve(cache.size()); + + for (auto & node : cache) + { + auto & cell = node.getMapped(); + + if (cell.isDefault()) + continue; + + result.emplace_back(node.getKey()); + } + + return result; + } + + using TimePoint = std::chrono::system_clock::time_point; + + struct Cell + { + TimePoint deadline; + size_t allocated_size_for_columns; + char * place_for_serialized_columns; + + inline bool isDefault() const { return place_for_serialized_columns == nullptr; } + inline void setDefault() + { + place_for_serialized_columns = nullptr; + allocated_size_for_columns = 0; + } + }; + + void insertCellInCache(KeyType & key, const Cell & cell) + { + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + { + /// Copy complex key into arena and put in cache + size_t key_size = key.size; + char * place_for_key = arena.alloc(key_size); + memcpy(reinterpret_cast(place_for_key), reinterpret_cast(key.data), key_size); + KeyType updated_key{place_for_key, key_size}; + key = updated_key; + } + + 
cache.insert(key, cell); + } + + inline static bool cellHasDeadline(const Cell & cell) + { + return cell.deadline != std::chrono::system_clock::from_time_t(0); + } + + inline void setCellDeadline(Cell & cell, TimePoint now) + { + if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0) + { + cell.deadline = std::chrono::system_clock::from_time_t(0); + return; + } + + size_t min_sec_lifetime = configuration.lifetime.min_sec; + size_t max_sec_lifetime = configuration.lifetime.max_sec; + + std::uniform_int_distribution distribution{min_sec_lifetime, max_sec_lifetime}; + cell.deadline = now + std::chrono::seconds(distribution(rnd_engine)); + } + + SerializedCacheDictionaryStorageConfiguration configuration; + + ArenaWithFreeLists arena; + + pcg64 rnd_engine; + + class ArenaCellDisposer + { + public: + ArenaWithFreeLists & arena; + + template + void operator()(const Key & key, const Value & value) const + { + /// In case of complex key we keep it in arena + if constexpr (std::is_same_v) + arena.free(const_cast(key.data), key.size); + + if (value.place_for_serialized_columns) + arena.free(value.place_for_serialized_columns, value.allocated_size_for_columns); + } + }; + + using SimpleKeyLRUHashMap = LRUHashMap; + using ComplexKeyLRUHashMap = LRUHashMapWithSavedHash; + + using CacheLRUHashMap = std::conditional_t< + dictionary_key_type == DictionaryKeyType::simple, + SimpleKeyLRUHashMap, + ComplexKeyLRUHashMap>; + + CacheLRUHashMap cache; +}; + +} diff --git a/src/Dictionaries/benchmark b/src/Dictionaries/benchmark deleted file mode 100644 index 37d0d92ac14..00000000000 --- a/src/Dictionaries/benchmark +++ /dev/null @@ -1,154 +0,0 @@ -clickhouse-client --query="DROP TABLE IF EXISTS simple_cache_dictionary_table_source"; -clickhouse-client --query="CREATE TABLE simple_cache_dictionary_table_source (id UInt64, value1 String, value2 UInt64, value3 String, value4 Float64, value5 Decimal64(4)) ENGINE=TinyLog;" -clickhouse-client --query="INSERT INTO simple_cache_dictionary_table_source SELECT number, concat('Value1 ', toString(number)), number, concat('Value3 ', toString(number)), toFloat64(number), cast(number, 'Decimal64(4)') FROM system.numbers LIMIT 1000000;" - -clickhouse-client --multiquery --query="CREATE DICTIONARY clickhouse_simple_cache_dictionary ( - id UInt64, - value1 String, - value2 UInt64, - value3 String, - value4 Float64, - value5 Decimal64(4) -) -PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_cache_dictionary_table_source' PASSWORD '' DB 'default')) -LIFETIME(MIN 300 MAX 300) -LAYOUT(CACHE(SIZE_IN_CELLS 100000));" - -clickhouse-client --multiquery --query="CREATE DICTIONARY clickhouse_ssd_simple_cache_dictionary ( - id UInt64, - value1 String, - value2 UInt64, - value3 String, - value4 Float64, - value5 Decimal64(4) -) -PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_cache_dictionary_table_source' PASSWORD '' DB 'default')) -LIFETIME(MIN 300 MAX 300) -LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576 WRITE_BUFFER_SIZE 327680 MAX_STORED_KEYS 1048576 PATH '/opt/mkita/ClickHouse/build_release/programs/ssd_cache'));" - -clickhouse-client --multiquery --query="CREATE DICTIONARY clickhouse_dummy_simple_cache_dictionary ( - id UInt64, - value1 String, - value2 UInt64, - value3 String, - value4 Float64, - value5 Decimal64(4) -) -PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 
'simple_cache_dictionary_table_source' PASSWORD '' DB 'default')) -LIFETIME(MIN 300 MAX 300) -LAYOUT(DUMMY_SIMPLE());" - -./clickhouse-benchmark --query="SELECT - dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value1', number), - dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value2', number), - dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value3', number), - dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value4', number), - dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value5', number) -FROM system.numbers -LIMIT 10000 -FORMAT Null" - -./clickhouse-benchmark --query="SELECT - dictGet('default.clickhouse_simple_cache_dictionary', ('value1', 'value2', 'value3', 'value4', 'value5'), number) -FROM system.numbers -LIMIT 10000 -FORMAT Null" - -./clickhouse-benchmark --query="SELECT dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value1', number) FROM system.numbers_mt LIMIT 10000 FORMAT Null" - -./clickhouse-benchmark --query="SELECT - dictGet('default.clickhouse_simple_cache_dictionary', 'value1', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value2', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value3', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value4', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value5', number) -FROM system.numbers -LIMIT 10000 -FORMAT Null" - -./clickhouse-benchmark --query="SELECT dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value1', number) FROM system.numbers_mt LIMIT 10000 FORMAT Null" - -SELECT - dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value1', number), - dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value2', number), - dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value3', number), - dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value4', number), - dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value5', number) -FROM system.numbers - LIMIT 10000 -FORMAT Null - -SELECT dictGet('default.clickhouse_simple_cache_dictionary', ('value1', 'value2', 'value3', 'value4', 'value5'), number) FROM system.numbers LIMIT 10000 FORMAT Null - -SELECT dictGet('default.clickhouse_ssd_simple_cache_dictionary', ('value1', 'value2', 'value3', 'value4', 'value5'), number) FROM system.numbers LIMIT 10000 -FORMAT Null - -SELECT - dictGet('default.clickhouse_simple_cache_dictionary', ('value1', 'value2', 'value3', 'value4', 'value5'), number) -FROM system.numbers - LIMIT 10000 -FORMAT - Null - -SELECT - dictGet('default.clickhouse_simple_cache_dictionary', 'value1', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value2', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value3', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value4', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value5', number) -FROM system.numbers - LIMIT 10000 -FORMAT - Null - -SELECT - dictGet('default.clickhouse_simple_cache_dictionary', 'value1', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value2', number) -FROM system.numbers -LIMIT 10000 -FORMAT Null - -SELECT - dictGet('clickhouse_simple_cache_dictionary', 'value1', number) -FROM system.numbers -LIMIT 100000 -FORMAT Null - -SELECT - dictGet('clickhouse_simple_cache_dictionary', 'value2', number) -FROM system.numbers -LIMIT 100000 -FORMAT Null - -SELECT - dictGet('clickhouse_simple_cache_dictionary', 'value3', number) -FROM 
system.numbers -LIMIT 100000 -FORMAT Null - -SELECT - dictGet('clickhouse_simple_cache_dictionary', 'value4', number) -FROM system.numbers -LIMIT 100000 -FORMAT Null - -SELECT - dictGet('clickhouse_simple_cache_dictionary', 'value5', number) -FROM system.numbers -LIMIT 100000 -FORMAT Null - -SELECT - dictGet('clickhouse_simple_cache_dictionary', 'value1', number), - dictGet('clickhouse_simple_cache_dictionary', 'value2', number), - dictGet('clickhouse_simple_cache_dictionary', 'value3', number), - dictGet('clickhouse_simple_cache_dictionary', 'value4', number), - dictGet('clickhouse_simple_cache_dictionary', 'value5', number) -FROM system.numbers -LIMIT 100000 -FORMAT Null - -SELECT * FROM clickhouse_simple_cache_dictionary_table; \ No newline at end of file diff --git a/src/Dictionaries/registerCacheDictionaries.cpp b/src/Dictionaries/registerCacheDictionaries.cpp index 92e6eb97b63..23eea6e7e21 100644 --- a/src/Dictionaries/registerCacheDictionaries.cpp +++ b/src/Dictionaries/registerCacheDictionaries.cpp @@ -1,6 +1,7 @@ #include "CacheDictionary.h" -#include "SSDCacheDictionaryStorage.h" #include "CacheDictionaryStorage.h" +#include "SerializedCacheDictionaryStorage.h" +#include "SSDCacheDictionaryStorage.h" #include namespace DB @@ -18,9 +19,16 @@ CacheDictionaryStorageConfiguration parseCacheStorageConfiguration( const Poco::Util::AbstractConfiguration & config, const String & layout_prefix, const DictionaryLifetime & dict_lifetime, - DictionaryKeyType dictionary_key_type) + DictionaryKeyType dictionary_key_type, + bool serialized_storage) { - String dictionary_type_prefix = dictionary_key_type == DictionaryKeyType::complex ? ".complex_key_cache." : ".cache."; + String dictionary_type_prefix; + + if (!serialized_storage) + dictionary_type_prefix = dictionary_key_type == DictionaryKeyType::complex ? ".complex_key_cache." : ".cache."; + else + dictionary_type_prefix = dictionary_key_type == DictionaryKeyType::complex ? ".serialized_complex_key_cache." 
: ".serialized_cache."; + String dictionary_configuration_prefix = layout_prefix + dictionary_type_prefix; const size_t size = config.getUInt64(dictionary_configuration_prefix + "size_in_cells"); @@ -158,7 +166,8 @@ DictionaryPtr createCacheDictionaryLayout( const DictionaryStructure & dict_struct, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, - DictionarySourcePtr source_ptr) + DictionarySourcePtr source_ptr, + bool serialized_storage) { static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionary"); @@ -193,8 +202,23 @@ DictionaryPtr createCacheDictionaryLayout( const bool allow_read_expired_keys = config.getBool(layout_prefix + ".cache.allow_read_expired_keys", false); - auto storage_configuration = parseCacheStorageConfiguration(full_name, config, layout_prefix, dict_lifetime, dictionary_key_type); - auto storage = std::make_shared>(storage_configuration); + auto storage_configuration = parseCacheStorageConfiguration(full_name, config, layout_prefix, dict_lifetime, dictionary_key_type, serialized_storage); + + std::shared_ptr storage; + + if (serialized_storage) + { + SerializedCacheDictionaryStorageConfiguration serialized_configuration + { + .max_size_in_cells = storage_configuration.max_size_in_cells, + .strict_max_lifetime_seconds = storage_configuration.strict_max_lifetime_seconds, + .lifetime = storage_configuration.lifetime, + }; + + storage = std::make_shared>(serialized_configuration); + } + else + storage = std::make_shared>(dict_struct, storage_configuration); auto update_queue_configuration = parseCacheDictionaryUpdateQueueConfiguration(full_name, config, layout_prefix, dictionary_key_type); @@ -265,7 +289,7 @@ void registerDictionaryCache(DictionaryFactory & factory) const std::string & config_prefix, DictionarySourcePtr source_ptr) -> DictionaryPtr { - return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr)); + return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr), false); }; factory.registerLayout("cache", create_simple_cache_layout, false); @@ -276,11 +300,33 @@ void registerDictionaryCache(DictionaryFactory & factory) const std::string & config_prefix, DictionarySourcePtr source_ptr) -> DictionaryPtr { - return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr)); + return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr), false); }; factory.registerLayout("complex_key_cache", create_complex_key_cache_layout, true); + auto create_simple_serialized_cache_layout = [=](const String & full_name, + const DictionaryStructure & dict_struct, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + DictionarySourcePtr source_ptr) -> DictionaryPtr + { + return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr), true); + }; + + factory.registerLayout("serialized_cache", create_simple_serialized_cache_layout, false); + + auto create_complex_key_serialzied_cache_layout = [=](const std::string & full_name, + const DictionaryStructure & dict_struct, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + DictionarySourcePtr source_ptr) -> DictionaryPtr + { + return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr), true); + }; + + 
factory.registerLayout("complex_key_serialized_cache", create_complex_key_serialzied_cache_layout, true); + #if defined(OS_LINUX) || defined(__FreeBSD__) auto create_simple_ssd_cache_layout = [=](const std::string & full_name, From 0783882fcfd5d372b1631b41c4145f7a2808425c Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Sun, 14 Mar 2021 00:49:45 +0300 Subject: [PATCH 011/260] Updated cache implementation --- src/Dictionaries/CacheDictionaryStorage.h | 195 +++++---- src/Dictionaries/ICacheDictionaryStorage.h | 1 + .../SerializedCacheDictionaryStorage.h | 412 ------------------ .../registerCacheDictionaries.cpp | 59 +-- 4 files changed, 115 insertions(+), 552 deletions(-) delete mode 100644 src/Dictionaries/SerializedCacheDictionaryStorage.h diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index 2b34b13fa6f..bbf1325c8a3 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -4,6 +4,8 @@ #include #include +#include +#include #include #include @@ -31,6 +33,8 @@ struct CacheDictionaryStorageConfiguration const DictionaryLifetime lifetime; }; + + /// TODO: Add documentation template class CacheDictionaryStorage final : public ICacheDictionaryStorage @@ -46,29 +50,7 @@ public: , rnd_engine(randomSeed()) , cache(configuration.max_size_in_cells, false, { *this }) { - for (const auto & dictionary_attribute : dictionary_structure.attributes) - { - auto attribute_type = dictionary_attribute.underlying_type; - - auto type_call = [&](const auto & dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - using ValueType = DictionaryValueType; - - attributes.emplace_back(); - auto & last_attribute = attributes.back(); - last_attribute.type = attribute_type; - last_attribute.is_complex_type = dictionary_attribute.is_nullable || dictionary_attribute.is_array; - - if (dictionary_attribute.is_nullable) - last_attribute.attribute_container = std::vector(); - else - last_attribute.attribute_container = PaddedPODArray(); - }; - - callOnDictionaryAttributeType(attribute_type, type_call); - } + setup(dictionary_structure); } bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return true; } @@ -88,9 +70,7 @@ public: const DictionaryStorageFetchRequest & fetch_request) override { if constexpr (dictionary_key_type == DictionaryKeyType::simple) - { return fetchColumnsForKeysImpl(keys, fetch_request); - } else throw Exception("Method fetchColumnsForKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED); } @@ -126,9 +106,7 @@ public: const DictionaryStorageFetchRequest & column_fetch_requests) override { if constexpr (dictionary_key_type == DictionaryKeyType::complex) - { return fetchColumnsForKeysImpl(keys, column_fetch_requests); - } else throw Exception("Method fetchColumnsForKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); } @@ -174,7 +152,7 @@ public: }); } - return arena.size() + cache.getSizeInBytes(); + return arena.size() + cache.getSizeInBytes() + attributes_size_in_bytes; } private: @@ -192,7 +170,7 @@ private: template - ALWAYS_INLINE KeysStorageFetchResult fetchColumnsForKeysImpl( + KeysStorageFetchResult fetchColumnsForKeysImpl( const PaddedPODArray & keys, const DictionaryStorageFetchRequest & fetch_request) { @@ -216,44 +194,41 @@ private: auto key = keys[key_index]; auto * it = cache.find(key); - if (it) - { - /// Columns values for key are serialized in cache now deserialize them 
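+            /// Keys missing from the cache, or past deadline + strict_max_lifetime_seconds, are reported as not_found and skipped; the rest are classified below as found or expired.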
- const auto & cell = it->getMapped(); - - bool has_deadline = cellHasDeadline(cell); - - if (has_deadline && now > cell.deadline + max_lifetime_seconds) - { - result.key_index_to_state[key_index] = {KeyState::not_found}; - ++result.not_found_keys_size; - continue; - } - else if (has_deadline && now > cell.deadline) - { - result.key_index_to_state[key_index] = {KeyState::expired, fetched_columns_index}; - ++result.expired_keys_size; - } - else - { - result.key_index_to_state[key_index] = {KeyState::found, fetched_columns_index}; - ++result.found_keys_size; - } - - if (cell.is_default) - { - result.key_index_to_state[key_index].setDefault(); - ++result.default_keys_size; - } - - fetched_keys.emplace_back(cell.element_index, cell.is_default); - ++fetched_columns_index; - } - else + if (!it) { result.key_index_to_state[key_index] = {KeyState::not_found}; ++result.not_found_keys_size; + continue; } + + const auto & cell = it->getMapped(); + + if (now > cell.deadline + max_lifetime_seconds) + { + result.key_index_to_state[key_index] = {KeyState::not_found}; + ++result.not_found_keys_size; + continue; + } + + bool cell_is_expired = false; + KeyState::State key_state = KeyState::found; + + if (now > cell.deadline) + { + cell_is_expired = true; + key_state = KeyState::expired; + } + + result.key_index_to_state[key_index] = {key_state, fetched_columns_index}; + ++fetched_columns_index; + + result.expired_keys_size += cell_is_expired; + result.found_keys_size += !cell_is_expired; + + result.key_index_to_state[key_index].setDefaultValue(cell.is_default); + result.default_keys_size += cell.is_default; + + fetched_keys.emplace_back(cell.element_index, cell.is_default); } for (size_t attribute_index = 0; attribute_index < fetch_request.attributesSize(); ++attribute_index) @@ -275,7 +250,7 @@ private: { auto fetched_key = fetched_keys[fetched_key_index]; - if (fetched_key.is_default) + if (unlikely(fetched_key.is_default)) fetched_column.insert(default_value_provider.getDefaultValue(fetched_key_index)); else fetched_column.insert(container[fetched_key.element_index]); @@ -302,7 +277,7 @@ private: { auto fetched_key = fetched_keys[fetched_key_index]; - if (fetched_key.is_default) + if (unlikely(fetched_key.is_default)) column_typed.insert(default_value_provider.getDefaultValue(fetched_key_index)); else { @@ -318,7 +293,7 @@ private: auto fetched_key = fetched_keys[fetched_key_index]; auto & data = column_typed.getData(); - if (fetched_key.is_default) + if (unlikely(fetched_key.is_default)) column_typed.insert(default_value_provider.getDefaultValue(fetched_key_index)); else { @@ -460,10 +435,10 @@ private: { size_t cache_max_size = cache.getMaxSize(); - if (unlikely(attributes.empty()) || insert_index * 2 < cache_max_size) + if (unlikely(attributes.empty()) || insert_index < cache_max_size * 2) return; - std::unordered_map element_index_to_cache_iterator; + absl::flat_hash_map> element_index_to_cache_iterator; for (auto begin = cache.begin(); begin != cache.end(); ++begin) { @@ -483,7 +458,15 @@ private: for (size_t i = 0; i < container_size; ++i) { if (indexes_to_delete.contains(i)) + { + if constexpr (std::is_same_v) + { + StringRef data = container[i]; + arena.free(const_cast(data.data), data.size); + } + continue; + } std::swap(container[remove_index], container[i]); @@ -513,7 +496,15 @@ private: for (size_t i = 0; i < container_size; ++i) { if (indexes_to_delete.contains(i)) + { + if constexpr (std::is_same_v) + { + StringRef data = container[i]; + arena.free(const_cast(data.data), data.size); + } + 
continue; + } std::swap(container[remove_index], container[i]); ++remove_index; @@ -559,6 +550,47 @@ private: return const_cast *>(this)->template getAttributeContainer(attribute_index, std::forward(func)); } + StringRef copyStringInArena(StringRef value_to_copy) + { + size_t value_to_copy_size = value_to_copy.size; + char * place_for_key = arena.alloc(value_to_copy_size); + memcpy(reinterpret_cast(place_for_key), reinterpret_cast(value_to_copy.data), value_to_copy_size); + StringRef updated_value{place_for_key, value_to_copy_size}; + + return updated_value; + } + + void setup(const DictionaryStructure & dictionary_structure) + { + /// For each dictionary attribute create storage attribute + /// For simple attributes create PODArray, for complex vector of Fields + + attributes.reserve(dictionary_structure.attributes.size()); + + for (const auto & dictionary_attribute : dictionary_structure.attributes) + { + auto attribute_type = dictionary_attribute.underlying_type; + + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + + attributes.emplace_back(); + auto & last_attribute = attributes.back(); + last_attribute.type = attribute_type; + last_attribute.is_complex_type = dictionary_attribute.is_nullable || dictionary_attribute.is_array; + + if (dictionary_attribute.is_nullable) + last_attribute.attribute_container = std::vector(); + else + last_attribute.attribute_container = PaddedPODArray(); + }; + + callOnDictionaryAttributeType(attribute_type, type_call); + } + } using TimePoint = std::chrono::system_clock::time_point; @@ -578,26 +610,13 @@ private: cache.insert(key, cell); } - StringRef copyStringInArena(StringRef value_to_copy) - { - size_t value_to_copy_size = value_to_copy.size; - char * place_for_key = arena.alloc(value_to_copy_size); - memcpy(reinterpret_cast(place_for_key), reinterpret_cast(value_to_copy.data), value_to_copy_size); - StringRef updated_value{place_for_key, value_to_copy_size}; - - return updated_value; - } - - inline static bool cellHasDeadline(const Cell & cell) - { - return cell.deadline != std::chrono::system_clock::from_time_t(0); - } - inline void setCellDeadline(Cell & cell, TimePoint now) { if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0) { - cell.deadline = std::chrono::system_clock::from_time_t(0); + /// This maybe not obvious, but when we define is this cell is expired or expired permanently, we add strict_max_lifetime_seconds + /// to the expiration time. And it overflows pretty well. 
+ cell.deadline = std::chrono::time_point::max() - 2 * std::chrono::seconds(configuration.strict_max_lifetime_seconds); return; } @@ -638,10 +657,6 @@ private: std::vector> attribute_container; }; - std::vector attributes; - size_t insert_index = 0; - std::unordered_set> indexes_to_delete; - class CacheStorageCellDisposer { public: @@ -667,6 +682,10 @@ private: ComplexKeyLRUHashMap>; CacheLRUHashMap cache; + + std::vector attributes; + size_t insert_index = 0; + absl::flat_hash_set> indexes_to_delete; }; } diff --git a/src/Dictionaries/ICacheDictionaryStorage.h b/src/Dictionaries/ICacheDictionaryStorage.h index 8db2dab536c..a428cebdfe7 100644 --- a/src/Dictionaries/ICacheDictionaryStorage.h +++ b/src/Dictionaries/ICacheDictionaryStorage.h @@ -31,6 +31,7 @@ struct KeyState inline bool isNotFound() const { return state == State::not_found; } inline bool isDefault() const { return is_default; } inline void setDefault() { is_default = true; } + inline void setDefaultValue(bool is_default_value) { is_default = is_default_value; } /// Valid only if keyState is found or expired inline size_t getFetchedColumnIndex() const { return fetched_column_index; } diff --git a/src/Dictionaries/SerializedCacheDictionaryStorage.h b/src/Dictionaries/SerializedCacheDictionaryStorage.h deleted file mode 100644 index 2616e03763c..00000000000 --- a/src/Dictionaries/SerializedCacheDictionaryStorage.h +++ /dev/null @@ -1,412 +0,0 @@ -#pragma once - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int NOT_IMPLEMENTED; -} - -struct SerializedCacheDictionaryStorageConfiguration -{ - /// Max size of storage in cells - const size_t max_size_in_cells; - /// Needed to perform check if cell is expired or not found. Default value is dictionary max lifetime. - const size_t strict_max_lifetime_seconds; - /// Lifetime of dictionary. Cell deadline is random value between lifetime min and max seconds. - const DictionaryLifetime lifetime; -}; - -/** Keys are stored in LRUCache and column values are serialized into arena. - - Cell in LRUCache consists of allocated size and place in arena were columns serialized data is stored. - - Columns are serialized by rows. - - When cell is removed from LRUCache data associated with it is also removed from arena. - - In case of complex key we also store key data in arena and it is removed from arena. 
-*/ -/// TODO: Remove -template -class SerializedCacheDictionaryStorage final : public ICacheDictionaryStorage -{ -public: - using KeyType = std::conditional_t; - static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionaryStorage"); - - explicit SerializedCacheDictionaryStorage(SerializedCacheDictionaryStorageConfiguration & configuration_) - : configuration(configuration_) - , rnd_engine(randomSeed()) - , cache(configuration.max_size_in_cells, false, { arena }) - { - } - - bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return true; } - - String getName() const override - { - if (dictionary_key_type == DictionaryKeyType::simple) - return "SerializedCache"; - else - return "ComplexKeySerializedCache"; - } - - bool supportsSimpleKeys() const override { return dictionary_key_type == DictionaryKeyType::simple; } - - SimpleKeysStorageFetchResult fetchColumnsForKeys( - const PaddedPODArray & keys, - const DictionaryStorageFetchRequest & fetch_request) override - { - if constexpr (dictionary_key_type == DictionaryKeyType::simple) - return fetchColumnsForKeysImpl(keys, fetch_request); - else - throw Exception("Method fetchColumnsForKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED); - } - - void insertColumnsForKeys(const PaddedPODArray & keys, Columns columns) override - { - if constexpr (dictionary_key_type == DictionaryKeyType::simple) - insertColumnsForKeysImpl(keys, columns); - else - throw Exception("Method insertColumnsForKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED); - } - - void insertDefaultKeys(const PaddedPODArray & keys) override - { - if constexpr (dictionary_key_type == DictionaryKeyType::simple) - insertDefaultKeysImpl(keys); - else - throw Exception("Method insertDefaultKeysImpl is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED); - } - - PaddedPODArray getCachedSimpleKeys() const override - { - if constexpr (dictionary_key_type == DictionaryKeyType::simple) - return getCachedKeysImpl(); - else - throw Exception("Method getCachedSimpleKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED); - } - - bool supportsComplexKeys() const override { return dictionary_key_type == DictionaryKeyType::complex; } - - ComplexKeysStorageFetchResult fetchColumnsForKeys( - const PaddedPODArray & keys, - const DictionaryStorageFetchRequest & column_fetch_requests) override - { - if constexpr (dictionary_key_type == DictionaryKeyType::complex) - return fetchColumnsForKeysImpl(keys, column_fetch_requests); - else - throw Exception("Method fetchColumnsForKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); - } - - void insertColumnsForKeys(const PaddedPODArray & keys, Columns columns) override - { - if constexpr (dictionary_key_type == DictionaryKeyType::complex) - insertColumnsForKeysImpl(keys, columns); - else - throw Exception("Method insertColumnsForKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); - } - - void insertDefaultKeys(const PaddedPODArray & keys) override - { - if constexpr (dictionary_key_type == DictionaryKeyType::complex) - insertDefaultKeysImpl(keys); - else - throw Exception("Method insertDefaultKeysImpl is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); - } - - PaddedPODArray getCachedComplexKeys() const override - { - if constexpr (dictionary_key_type == DictionaryKeyType::complex) - return getCachedKeysImpl(); - else - throw 
Exception("Method getCachedComplexKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); - } - - size_t getSize() const override { return cache.size(); } - - size_t getMaxSize() const override { return cache.getMaxSize(); } - - size_t getBytesAllocated() const override { return arena.size() + cache.getSizeInBytes(); } - -private: - - template - ALWAYS_INLINE KeysStorageFetchResult fetchColumnsForKeysImpl( - const PaddedPODArray & keys, - const DictionaryStorageFetchRequest & fetch_request) - { - KeysStorageFetchResult result; - - result.fetched_columns = fetch_request.makeAttributesResultColumns(); - result.key_index_to_state.resize_fill(keys.size(), {KeyState::not_found}); - - const auto now = std::chrono::system_clock::now(); - - size_t fetched_columns_index = 0; - - std::chrono::seconds max_lifetime_seconds(configuration.strict_max_lifetime_seconds); - - size_t keys_size = keys.size(); - - for (size_t key_index = 0; key_index < keys_size; ++key_index) - { - auto key = keys[key_index]; - auto * it = cache.find(key); - - if (it) - { - /// Columns values for key are serialized in cache now deserialize them - const auto & cell = it->getMapped(); - - bool has_deadline = cellHasDeadline(cell); - - if (has_deadline && now > cell.deadline + max_lifetime_seconds) - { - result.key_index_to_state[key_index] = {KeyState::not_found}; - ++result.not_found_keys_size; - continue; - } - else if (has_deadline && now > cell.deadline) - { - result.key_index_to_state[key_index] = {KeyState::expired, fetched_columns_index}; - ++result.expired_keys_size; - } - else - { - result.key_index_to_state[key_index] = {KeyState::found, fetched_columns_index}; - ++result.found_keys_size; - } - - ++fetched_columns_index; - - if (cell.isDefault()) - { - result.key_index_to_state[key_index].setDefault(); - ++result.default_keys_size; - insertDefaultValuesIntoColumns(result.fetched_columns, fetch_request, key_index); - } - else - { - const char * place_for_serialized_columns = cell.place_for_serialized_columns; - deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, place_for_serialized_columns); - } - } - else - { - result.key_index_to_state[key_index] = {KeyState::not_found}; - ++result.not_found_keys_size; - } - } - - return result; - } - - void insertColumnsForKeysImpl(const PaddedPODArray & keys, Columns columns) - { - Arena temporary_values_pool; - - size_t columns_to_serialize_size = columns.size(); - PaddedPODArray temporary_column_data(columns_to_serialize_size); - - const auto now = std::chrono::system_clock::now(); - - size_t keys_size = keys.size(); - - for (size_t key_index = 0; key_index < keys_size; ++key_index) - { - size_t allocated_size_for_columns = 0; - const char * block_start = nullptr; - - auto key = keys[key_index]; - auto * it = cache.find(key); - - for (size_t column_index = 0; column_index < columns_to_serialize_size; ++column_index) - { - auto & column = columns[column_index]; - temporary_column_data[column_index] = column->serializeValueIntoArena(key_index, temporary_values_pool, block_start); - allocated_size_for_columns += temporary_column_data[column_index].size; - } - - char * place_for_serialized_columns = arena.alloc(allocated_size_for_columns); - memcpy(reinterpret_cast(place_for_serialized_columns), reinterpret_cast(block_start), allocated_size_for_columns); - - if (it) - { - /// Cell exists need to free previous serialized place and update deadline - auto & cell = it->getMapped(); - - if (cell.place_for_serialized_columns) - 
arena.free(cell.place_for_serialized_columns, cell.allocated_size_for_columns); - - setCellDeadline(cell, now); - cell.allocated_size_for_columns = allocated_size_for_columns; - cell.place_for_serialized_columns = place_for_serialized_columns; - } - else - { - /// No cell exists so create and put in cache - Cell cell; - - setCellDeadline(cell, now); - cell.allocated_size_for_columns = allocated_size_for_columns; - cell.place_for_serialized_columns = place_for_serialized_columns; - - insertCellInCache(key, cell); - } - - temporary_values_pool.rollback(allocated_size_for_columns); - } - } - - void insertDefaultKeysImpl(const PaddedPODArray & keys) - { - const auto now = std::chrono::system_clock::now(); - - for (auto key : keys) - { - auto * it = cache.find(key); - - if (it) - { - auto & cell = it->getMapped(); - - setCellDeadline(cell, now); - - if (cell.place_for_serialized_columns) - arena.free(cell.place_for_serialized_columns, cell.allocated_size_for_columns); - - cell.allocated_size_for_columns = 0; - cell.place_for_serialized_columns = nullptr; - } - else - { - Cell cell; - - setCellDeadline(cell, now); - cell.allocated_size_for_columns = 0; - cell.place_for_serialized_columns = nullptr; - - insertCellInCache(key, cell); - } - } - } - - PaddedPODArray getCachedKeysImpl() const - { - PaddedPODArray result; - result.reserve(cache.size()); - - for (auto & node : cache) - { - auto & cell = node.getMapped(); - - if (cell.isDefault()) - continue; - - result.emplace_back(node.getKey()); - } - - return result; - } - - using TimePoint = std::chrono::system_clock::time_point; - - struct Cell - { - TimePoint deadline; - size_t allocated_size_for_columns; - char * place_for_serialized_columns; - - inline bool isDefault() const { return place_for_serialized_columns == nullptr; } - inline void setDefault() - { - place_for_serialized_columns = nullptr; - allocated_size_for_columns = 0; - } - }; - - void insertCellInCache(KeyType & key, const Cell & cell) - { - if constexpr (dictionary_key_type == DictionaryKeyType::complex) - { - /// Copy complex key into arena and put in cache - size_t key_size = key.size; - char * place_for_key = arena.alloc(key_size); - memcpy(reinterpret_cast(place_for_key), reinterpret_cast(key.data), key_size); - KeyType updated_key{place_for_key, key_size}; - key = updated_key; - } - - cache.insert(key, cell); - } - - inline static bool cellHasDeadline(const Cell & cell) - { - return cell.deadline != std::chrono::system_clock::from_time_t(0); - } - - inline void setCellDeadline(Cell & cell, TimePoint now) - { - if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0) - { - cell.deadline = std::chrono::system_clock::from_time_t(0); - return; - } - - size_t min_sec_lifetime = configuration.lifetime.min_sec; - size_t max_sec_lifetime = configuration.lifetime.max_sec; - - std::uniform_int_distribution distribution{min_sec_lifetime, max_sec_lifetime}; - cell.deadline = now + std::chrono::seconds(distribution(rnd_engine)); - } - - SerializedCacheDictionaryStorageConfiguration configuration; - - ArenaWithFreeLists arena; - - pcg64 rnd_engine; - - class ArenaCellDisposer - { - public: - ArenaWithFreeLists & arena; - - template - void operator()(const Key & key, const Value & value) const - { - /// In case of complex key we keep it in arena - if constexpr (std::is_same_v) - arena.free(const_cast(key.data), key.size); - - if (value.place_for_serialized_columns) - arena.free(value.place_for_serialized_columns, value.allocated_size_for_columns); - } - }; - - using 
SimpleKeyLRUHashMap = LRUHashMap; - using ComplexKeyLRUHashMap = LRUHashMapWithSavedHash; - - using CacheLRUHashMap = std::conditional_t< - dictionary_key_type == DictionaryKeyType::simple, - SimpleKeyLRUHashMap, - ComplexKeyLRUHashMap>; - - CacheLRUHashMap cache; -}; - -} diff --git a/src/Dictionaries/registerCacheDictionaries.cpp b/src/Dictionaries/registerCacheDictionaries.cpp index 23eea6e7e21..9f0f214e79b 100644 --- a/src/Dictionaries/registerCacheDictionaries.cpp +++ b/src/Dictionaries/registerCacheDictionaries.cpp @@ -1,6 +1,5 @@ #include "CacheDictionary.h" #include "CacheDictionaryStorage.h" -#include "SerializedCacheDictionaryStorage.h" #include "SSDCacheDictionaryStorage.h" #include @@ -19,16 +18,9 @@ CacheDictionaryStorageConfiguration parseCacheStorageConfiguration( const Poco::Util::AbstractConfiguration & config, const String & layout_prefix, const DictionaryLifetime & dict_lifetime, - DictionaryKeyType dictionary_key_type, - bool serialized_storage) + DictionaryKeyType dictionary_key_type) { - String dictionary_type_prefix; - - if (!serialized_storage) - dictionary_type_prefix = dictionary_key_type == DictionaryKeyType::complex ? ".complex_key_cache." : ".cache."; - else - dictionary_type_prefix = dictionary_key_type == DictionaryKeyType::complex ? ".serialized_complex_key_cache." : ".serialized_cache."; - + String dictionary_type_prefix = (dictionary_key_type == DictionaryKeyType::complex) ? ".complex_key_cache." : ".cache."; String dictionary_configuration_prefix = layout_prefix + dictionary_type_prefix; const size_t size = config.getUInt64(dictionary_configuration_prefix + "size_in_cells"); @@ -166,8 +158,7 @@ DictionaryPtr createCacheDictionaryLayout( const DictionaryStructure & dict_struct, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, - DictionarySourcePtr source_ptr, - bool serialized_storage) + DictionarySourcePtr source_ptr) { static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionary"); @@ -202,23 +193,9 @@ DictionaryPtr createCacheDictionaryLayout( const bool allow_read_expired_keys = config.getBool(layout_prefix + ".cache.allow_read_expired_keys", false); - auto storage_configuration = parseCacheStorageConfiguration(full_name, config, layout_prefix, dict_lifetime, dictionary_key_type, serialized_storage); + auto storage_configuration = parseCacheStorageConfiguration(full_name, config, layout_prefix, dict_lifetime, dictionary_key_type); - std::shared_ptr storage; - - if (serialized_storage) - { - SerializedCacheDictionaryStorageConfiguration serialized_configuration - { - .max_size_in_cells = storage_configuration.max_size_in_cells, - .strict_max_lifetime_seconds = storage_configuration.strict_max_lifetime_seconds, - .lifetime = storage_configuration.lifetime, - }; - - storage = std::make_shared>(serialized_configuration); - } - else - storage = std::make_shared>(dict_struct, storage_configuration); + std::shared_ptr storage = std::make_shared>(dict_struct, storage_configuration); auto update_queue_configuration = parseCacheDictionaryUpdateQueueConfiguration(full_name, config, layout_prefix, dictionary_key_type); @@ -289,7 +266,7 @@ void registerDictionaryCache(DictionaryFactory & factory) const std::string & config_prefix, DictionarySourcePtr source_ptr) -> DictionaryPtr { - return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr), false); + return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, 
std::move(source_ptr)); }; factory.registerLayout("cache", create_simple_cache_layout, false); @@ -300,33 +277,11 @@ void registerDictionaryCache(DictionaryFactory & factory) const std::string & config_prefix, DictionarySourcePtr source_ptr) -> DictionaryPtr { - return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr), false); + return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr)); }; factory.registerLayout("complex_key_cache", create_complex_key_cache_layout, true); - auto create_simple_serialized_cache_layout = [=](const String & full_name, - const DictionaryStructure & dict_struct, - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - DictionarySourcePtr source_ptr) -> DictionaryPtr - { - return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr), true); - }; - - factory.registerLayout("serialized_cache", create_simple_serialized_cache_layout, false); - - auto create_complex_key_serialzied_cache_layout = [=](const std::string & full_name, - const DictionaryStructure & dict_struct, - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - DictionarySourcePtr source_ptr) -> DictionaryPtr - { - return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr), true); - }; - - factory.registerLayout("complex_key_serialized_cache", create_complex_key_serialzied_cache_layout, true); - #if defined(OS_LINUX) || defined(__FreeBSD__) auto create_simple_ssd_cache_layout = [=](const std::string & full_name, From ee898d6d47a01a5daa5baee1d73e63acb6b122e4 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Sun, 14 Mar 2021 15:51:55 +0300 Subject: [PATCH 012/260] Fixed style check --- src/Dictionaries/CacheDictionaryStorage.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index bbf1325c8a3..d27c6512244 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -33,8 +33,6 @@ struct CacheDictionaryStorageConfiguration const DictionaryLifetime lifetime; }; - - /// TODO: Add documentation template class CacheDictionaryStorage final : public ICacheDictionaryStorage @@ -168,7 +166,6 @@ private: const bool is_default; }; - template KeysStorageFetchResult fetchColumnsForKeysImpl( const PaddedPODArray & keys, From 1a356af579dc3ca9dbb8692c44eb8bab0b481be1 Mon Sep 17 00:00:00 2001 From: ip Date: Tue, 9 Mar 2021 22:16:27 +0000 Subject: [PATCH 013/260] StorageMergeTree: allocate temp part blocknumber in dest table --- src/Storages/StorageMergeTree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 070e6eb0483..435b79e1815 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -1366,7 +1366,7 @@ void StorageMergeTree::movePartitionToTable(const StoragePtr & dest_table, const DataPartsLock lock(mutex); for (MutableDataPartPtr & part : dst_parts) - dest_table_storage->renameTempPartAndReplace(part, &increment, &transaction, lock); + dest_table_storage->renameTempPartAndReplace(part, &dest_table_storage->increment, &transaction, lock); removePartsFromWorkingSet(src_parts, true, lock); transaction.commit(&lock); From 3d1c42827b01b08ffb8f60aa2cf4685fb759a1d3 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 
16 Mar 2021 01:59:04 +0300 Subject: [PATCH 014/260] Added FixedDeadlineHashMap --- src/Common/HashTable/FixedDeadlineHashMap.h | 253 +++++++++++++++ src/Common/HashTable/LRUHashMap.h | 6 +- src/Dictionaries/CacheDictionaryStorage.h | 339 ++++++++------------ 3 files changed, 398 insertions(+), 200 deletions(-) create mode 100644 src/Common/HashTable/FixedDeadlineHashMap.h diff --git a/src/Common/HashTable/FixedDeadlineHashMap.h b/src/Common/HashTable/FixedDeadlineHashMap.h new file mode 100644 index 00000000000..0f7819f4020 --- /dev/null +++ b/src/Common/HashTable/FixedDeadlineHashMap.h @@ -0,0 +1,253 @@ +#pragma once + +#include +#include +#include +#include + +using TimePoint = std::chrono::system_clock::time_point; + +template +struct DeadlineCell : + public std::conditional_t, + HashMapCell> +{ + using Key = TKey; + + using Base = std::conditional_t, + HashMapCell>; + + using Mapped = typename Base::Mapped; + using State = typename Base::State; + + using mapped_type = Mapped; + using key_type = Key; + + using Base::Base; + + inline TimePoint getDeadline() const { return deadline; } + + void setDeadline(TimePoint & deadline_value) { deadline = deadline_value; } + +private: + TimePoint deadline; +}; + +template +class FixedDeadlineHashMapImpl : + private HashMapTable< + TKey, + DeadlineCell, + Hash, + HashTableGrower<>, + HashTableAllocator> +{ + /// TODO: Make custom grower + using Base = HashMapTable< + TKey, + DeadlineCell, + Hash, + HashTableGrower<>, + HashTableAllocator>; + + static size_t calculateMaxSize(size_t max_size, size_t max_collision_resolution_chain) + { + return roundUpToPowerOfTwoOrZero(std::max(max_size, max_collision_resolution_chain)); + } +public: + using Cell = DeadlineCell; + using Key = TKey; + using Value = TValue; + using Mapped = typename Cell::Mapped; + + explicit FixedDeadlineHashMapImpl(size_t max_size_, size_t max_collision_resolution_chain_, Disposer disposer_ = Disposer()) + : Base(calculateMaxSize(max_size_, max_collision_resolution_chain_)) + , max_collision_resolution_chain(max_collision_resolution_chain_) + , max_size(max_size_) + , disposer(std::move(disposer_)) + { + assert(max_size > 0); + assert(max_collision_resolution_chain > 0); + } + + ~FixedDeadlineHashMapImpl() + { + clear(); + } + + Cell * get(const Key & key) + { + if (Cell::isZero(key, *this)) + return this->hasZero() ? this->zeroValue() : nullptr; + + /// TODO: Optimize + + size_t hash_value = Base::hash(key); + size_t place_value = Base::grower.place(hash_value); + size_t resolution_chain = max_collision_resolution_chain; + + while (resolution_chain != 0) + { + auto & cell = Base::buf[place_value]; + + if (cell.isZero(*this)) + return nullptr; + + if (cell.keyEquals(key, hash_value, *this)) + return &cell; + + place_value = Base::grower.next(place_value); + --resolution_chain; + } + + return nullptr; + } + + const Cell * get(const Key & key) const + { + return const_cast *>(this)->get(key); + } + + std::pair ALWAYS_INLINE insert(const Key & key, const Value & value) + { + return emplace(key, value); + } + + std::pair ALWAYS_INLINE insert(const Key & key, Value && value) + { + return emplace(key, std::move(value)); + } + + template + std::pair ALWAYS_INLINE emplace(const Key & key, Args && ... 
args) + { + size_t hash_value = Base::hash(key); + std::pair result; + + if (!emplaceIfZero(key, hash_value, result)) + result = emplaceNonZeroImpl(key, hash_value); + + bool was_inserted = result.second; + + if (was_inserted) + new (&result.first->getMapped()) Value(std::forward(args)...); + + return result; + } + + template + void reinsert(Cell * place_to_use, const Key & key, Args && ... args) + { + size_t hash_value = Base::hash(key); + + new (place_to_use) Cell(key, *this); + new (&place_to_use->getMapped()) Value(std::forward(args)...); + place_to_use->setHash(hash_value); + } + + using Base::size; + + using iterator = typename Base::iterator; + using const_iterator = typename Base::const_iterator; + + using Base::begin; + using Base::end; + + size_t getMaxSize() const { return max_size; } + + size_t getSizeInBytes() const { return Base::getBufferSizeInBytes(); } + + void clear() + { + for (auto & cell : *this) + disposer(cell.getKey(), cell.getMapped()); + } + +private: + size_t max_collision_resolution_chain; + size_t max_size; + Disposer disposer; + + bool emplaceIfZero(const Key & key, size_t hash_value, std::pair & result) + { + if (!Cell::isZero(key, *this)) + return false; + + if (this->hasZero()) + { + result = {this->zeroValue(), false}; + return true; + } + + ++Base::m_size; + + this->setHasZero(); + this->zeroValue()->setHash(hash_value); + result = {this->zeroValue(), true}; + + return true; + } + + std::pair emplaceNonZeroImpl(const Key & key, size_t hash_value) + { + TimePoint oldest_time = TimePoint::max(); + size_t place_value = Base::grower.place(hash_value); + size_t resolution_chain = max_collision_resolution_chain; + + bool use_old_value_place = false; + Cell * place_to_insert = nullptr; + + while (resolution_chain != 0) + { + auto & cell = Base::buf[place_value]; + + if (cell.isZero(*this)) + { + use_old_value_place = false; + place_to_insert = &cell; + break; + } + + if (cell.keyEquals(key, hash_value, *this)) + return std::make_pair(&cell, false); + + if (cell.getDeadline() < oldest_time) + { + use_old_value_place = true; + place_to_insert = &cell; + } + + place_value = Base::grower.next(place_value); + --resolution_chain; + } + + if (!place_to_insert) + place_to_insert = &Base::buf[place_value]; + + if (use_old_value_place) + return std::make_pair(place_to_insert, false); + else + { + ++Base::m_size; + + new (place_to_insert) Cell(key, *this); + place_to_insert->setHash(hash_value); + + return std::make_pair(place_to_insert, true); + } + } +}; + +template +struct DefaultFixedHashMapCellDisposer +{ + void operator()(const Key &, const Mapped &) const {} +}; + +template , typename Hash = DefaultHash> +using FixedDeadlineHashMap = FixedDeadlineHashMapImpl; + +template , typename Hash = DefaultHash> +using FixedDeadlineHashMapWithSavedHash = FixedDeadlineHashMapImpl; diff --git a/src/Common/HashTable/LRUHashMap.h b/src/Common/HashTable/LRUHashMap.h index df9766c5ee8..870fb219523 100644 --- a/src/Common/HashTable/LRUHashMap.h +++ b/src/Common/HashTable/LRUHashMap.h @@ -271,13 +271,13 @@ private: }; template -struct DefaultCellDisposer +struct DefaultLRUHashMapCellDisposer { void operator()(const Key &, const Mapped &) const {} }; -template , typename Hash = DefaultHash> +template , typename Hash = DefaultHash> using LRUHashMap = LRUHashMapImpl; -template , typename Hash = DefaultHash> +template , typename Hash = DefaultHash> using LRUHashMapWithSavedHash = LRUHashMapImpl; diff --git a/src/Dictionaries/CacheDictionaryStorage.h 
b/src/Dictionaries/CacheDictionaryStorage.h index d27c6512244..a98f92e5da9 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -46,7 +47,7 @@ public: CacheDictionaryStorageConfiguration & configuration_) : configuration(configuration_) , rnd_engine(randomSeed()) - , cache(configuration.max_size_in_cells, false, { *this }) + , cache(configuration.max_size_in_cells, 10, { *this }) { setup(dictionary_structure); } @@ -162,8 +163,8 @@ private: , is_default(is_default_) {} - const size_t element_index; - const bool is_default; + size_t element_index; + bool is_default; }; template @@ -184,12 +185,12 @@ private: std::chrono::seconds max_lifetime_seconds(configuration.strict_max_lifetime_seconds); PaddedPODArray fetched_keys; - fetched_keys.reserve(keys_size); + fetched_keys.resize_fill(keys_size); for (size_t key_index = 0; key_index < keys_size; ++key_index) { auto key = keys[key_index]; - auto * it = cache.find(key); + auto * it = cache.get(key); if (!it) { @@ -198,9 +199,10 @@ private: continue; } + auto deadline = it->getDeadline(); const auto & cell = it->getMapped(); - if (now > cell.deadline + max_lifetime_seconds) + if (now > deadline + max_lifetime_seconds) { result.key_index_to_state[key_index] = {KeyState::not_found}; ++result.not_found_keys_size; @@ -210,7 +212,7 @@ private: bool cell_is_expired = false; KeyState::State key_state = KeyState::found; - if (now > cell.deadline) + if (now > deadline) { cell_is_expired = true; key_state = KeyState::expired; @@ -225,7 +227,7 @@ private: result.key_index_to_state[key_index].setDefaultValue(cell.is_default); result.default_keys_size += cell.is_default; - fetched_keys.emplace_back(cell.element_index, cell.is_default); + fetched_keys[key_index] = FetchedKey{cell.element_index, cell.is_default}; } for (size_t attribute_index = 0; attribute_index < fetch_request.attributesSize(); ++attribute_index) @@ -311,103 +313,143 @@ private: void insertColumnsForKeysImpl(const PaddedPODArray & keys, Columns columns) { const auto now = std::chrono::system_clock::now(); + size_t keys_size = keys.size(); + size_t columns_size = columns.size(); + Field column_value; + for (size_t key_index = 0; key_index < keys_size; ++key_index) { auto key = keys[key_index]; - cache.erase(key); - Cell cell; + auto [it, was_inserted] = cache.insert(key, {}); - setCellDeadline(cell, now); - cell.element_index = insert_index; - cell.is_default = false; - - ++insert_index; - - insertCellInCache(key, cell); - } - - Field complex_column_value; - - for (size_t column_index = 0; column_index < columns.size(); ++column_index) - { - auto & attribute = attributes[column_index]; - const auto & column = columns[column_index]; - size_t column_size = column->size(); - - if (unlikely(attribute.is_complex_type)) + if (was_inserted) { - auto & container = std::get>(attribute.attribute_container); - container.reserve(column_size); + auto & cell = it->getMapped(); + cell.is_default = false; - for (size_t item_index = 0; item_index < column_size; ++item_index) + for (size_t attribute_index = 0; attribute_index < columns_size; ++attribute_index) { - column->get(item_index, complex_column_value); - container.emplace_back(complex_column_value); + auto & column = columns[attribute_index]; + + getAttributeContainer(attribute_index, [&](auto & container) + { + container.emplace_back(); + cell.element_index = container.size() - 1; + + using ElementType = std::decay_t; + 
+ column->get(key_index, column_value); + + if constexpr (std::is_same_v) + container.back() = column_value; + else if constexpr (std::is_same_v) + { + const String & value = column_value.get(); + StringRef inserted_value = copyStringInArena(StringRef { value.data(), value.size() }); + container.back() = inserted_value; + } + else + container.back() = column_value.get(); + }); } } else { - auto type_call = [&](const auto & dictionary_attribute_type) + auto & cell_key = it->getKey(); + + Cell cell; + + size_t existing_index = it->getMapped().element_index; + + cell.element_index = existing_index; + cell.is_default = false; + + if (cell_key != key) { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - using ValueType = DictionaryValueType; - using ColumnType = - std::conditional_t, ColumnString, - std::conditional_t, ColumnDecimal, - ColumnVector>>; + /// In case of complex key we keep it in arena + if constexpr (std::is_same_v) + arena.free(const_cast(key.data), key.size); + } - const ColumnType & column_typed = static_cast(*column); + cache.reinsert(it, key, cell); - auto & container = std::get>(attribute.attribute_container); - container.reserve(column_size); + /// Put values into index - if constexpr (std::is_same_v) + for (size_t attribute_index = 0; attribute_index < columns_size; ++attribute_index) + { + auto & column = columns[attribute_index]; + + getAttributeContainer(attribute_index, [&](auto & container) { - /// TODO: Serialize while column string in arena then just insert offsets in container - for (size_t item_index = 0; item_index < column_size; ++item_index) + using ElementType = std::decay_t; + + column->get(key_index, column_value); + + if constexpr (std::is_same_v) + container[existing_index] = column_value; + else if constexpr (std::is_same_v) { - StringRef value = column->getDataAt(item_index); - StringRef updated_data = copyStringInArena(value); - - container.emplace_back(updated_data); + const String & value = column_value.get(); + StringRef inserted_value = copyStringInArena(StringRef { value.data(), value.size() }); + container[existing_index] = inserted_value; } - } - else - { - const auto & data = column_typed.getData(); - container.insert(data.begin(), data.end()); - } - }; - - callOnDictionaryAttributeType(attribute.type, type_call); + else + container[existing_index] = column_value.get(); + }); + } } - } - deleteUnusedKeysIfNecessary(); + setCellDeadline(*it, now); + } } void insertDefaultKeysImpl(const PaddedPODArray & keys) { const auto now = std::chrono::system_clock::now(); - for (auto key : keys) + size_t keys_size = keys.size(); + + for (size_t key_index = 0; key_index < keys_size; ++key_index) { - cache.erase(key); + auto key = keys[key_index]; - Cell cell; + Cell value; + value.is_default = true; - setCellDeadline(cell, now); - cell.element_index = 0; - cell.is_default = true; + auto [it, was_inserted] = cache.insert(key, value); - insertCellInCache(key, cell); + if (was_inserted) + { + auto & cell = it->getMapped(); + + for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) + { + getAttributeContainer(attribute_index, [&](auto & container) + { + container.emplace_back(); + cell.element_index = container.size(); + }); + } + } + else + { + value.element_index = it->getMapped().element_index; + + if (it->getKey() != key) + { + /// In case of complex key we keep it in arena + if constexpr (std::is_same_v) + arena.free(const_cast(key.data), key.size); + } + + cache.reinsert(it, key, value); + 
} + + setCellDeadline(*it, now); } - - deleteUnusedKeysIfNecessary(); } PaddedPODArray getCachedKeysImpl() const @@ -428,92 +470,6 @@ private: return result; } - void deleteUnusedKeysIfNecessary() - { - size_t cache_max_size = cache.getMaxSize(); - - if (unlikely(attributes.empty()) || insert_index < cache_max_size * 2) - return; - - absl::flat_hash_map> element_index_to_cache_iterator; - - for (auto begin = cache.begin(); begin != cache.end(); ++begin) - { - auto & node = *begin; - auto & cell = node.getMapped(); - size_t element_index = cell.element_index; - element_index_to_cache_iterator.insert(std::make_pair(element_index, begin)); - } - - size_t last_remove_index = 0; - - getAttributeContainer(0, [&, this](auto & container) - { - size_t container_size = container.size(); - size_t remove_index = 0; - - for (size_t i = 0; i < container_size; ++i) - { - if (indexes_to_delete.contains(i)) - { - if constexpr (std::is_same_v) - { - StringRef data = container[i]; - arena.free(const_cast(data.data), data.size); - } - - continue; - } - - std::swap(container[remove_index], container[i]); - - auto it = element_index_to_cache_iterator.find(remove_index); - if (it != element_index_to_cache_iterator.end()) - { - auto & cell = it->second->getMapped(); - cell.element_index = remove_index; - } - - ++remove_index; - } - - container.erase(container.begin() + remove_index, container.end()); - last_remove_index = remove_index; - }); - - insert_index = last_remove_index; - - for (size_t attribute_index = 1; attribute_index < attributes.size(); ++attribute_index) - { - getAttributeContainer(attribute_index, [this](auto & container) - { - size_t container_size = container.size(); - size_t remove_index = 0; - - for (size_t i = 0; i < container_size; ++i) - { - if (indexes_to_delete.contains(i)) - { - if constexpr (std::is_same_v) - { - StringRef data = container[i]; - arena.free(const_cast(data.data), data.size); - } - - continue; - } - - std::swap(container[remove_index], container[i]); - ++remove_index; - } - - container.erase(container.begin() + remove_index, container.end()); - }); - } - - indexes_to_delete.clear(); - } - template void getAttributeContainer(size_t attribute_index, GetContainerFunc && func) { @@ -589,41 +545,12 @@ private: } } - using TimePoint = std::chrono::system_clock::time_point; - struct Cell { - TimePoint deadline; size_t element_index; bool is_default; }; - void insertCellInCache(KeyType & key, const Cell & cell) - { - /// Copy complex key into arena and put in cache - if constexpr (dictionary_key_type == DictionaryKeyType::complex) - key = copyStringInArena(key); - - cache.insert(key, cell); - } - - inline void setCellDeadline(Cell & cell, TimePoint now) - { - if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0) - { - /// This maybe not obvious, but when we define is this cell is expired or expired permanently, we add strict_max_lifetime_seconds - /// to the expiration time. And it overflows pretty well. 
- cell.deadline = std::chrono::time_point::max() - 2 * std::chrono::seconds(configuration.strict_max_lifetime_seconds); - return; - } - - size_t min_sec_lifetime = configuration.lifetime.min_sec; - size_t max_sec_lifetime = configuration.lifetime.max_sec; - - std::uniform_int_distribution distribution{min_sec_lifetime, max_sec_lifetime}; - cell.deadline = now + std::chrono::seconds(distribution(rnd_engine)); - } - CacheDictionaryStorageConfiguration configuration; ArenaWithFreeLists arena; @@ -660,29 +587,47 @@ private: CacheDictionaryStorage & storage; template - void operator()(const Key & key, const Value & cell) const + void operator()(const Key & key, const Value &) const { /// In case of complex key we keep it in arena if constexpr (std::is_same_v) storage.arena.free(const_cast(key.data), key.size); - - storage.indexes_to_delete.insert(cell.element_index); } }; - using SimpleKeyLRUHashMap = LRUHashMap; - using ComplexKeyLRUHashMap = LRUHashMapWithSavedHash; + using SimpleFixedDeadlineHashMap = FixedDeadlineHashMap; + using ComplexFixedDeadlineHashMap = FixedDeadlineHashMap; - using CacheLRUHashMap = std::conditional_t< + using FixedDeadlineHashMap = std::conditional_t< dictionary_key_type == DictionaryKeyType::simple, - SimpleKeyLRUHashMap, - ComplexKeyLRUHashMap>; + SimpleFixedDeadlineHashMap, + ComplexFixedDeadlineHashMap>; - CacheLRUHashMap cache; + using FixedDeadlineHashMapCell = typename FixedDeadlineHashMap::Cell; + + inline void setCellDeadline(FixedDeadlineHashMapCell & cell, TimePoint now) + { + if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0) + { + /// This maybe not obvious, but when we define is this cell is expired or expired permanently, we add strict_max_lifetime_seconds + /// to the expiration time. And it overflows pretty well. 
+ auto deadline = std::chrono::time_point::max() - 2 * std::chrono::seconds(configuration.strict_max_lifetime_seconds); + cell.setDeadline(deadline); + return; + } + + size_t min_sec_lifetime = configuration.lifetime.min_sec; + size_t max_sec_lifetime = configuration.lifetime.max_sec; + + std::uniform_int_distribution distribution{min_sec_lifetime, max_sec_lifetime}; + + auto deadline = now + std::chrono::seconds(distribution(rnd_engine)); + cell.setDeadline(deadline); + } + + FixedDeadlineHashMap cache; std::vector attributes; - size_t insert_index = 0; - absl::flat_hash_set> indexes_to_delete; }; } From f00e1084107f5fe3e9262a45116fbabd945f508d Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Tue, 16 Mar 2021 14:07:30 +0800 Subject: [PATCH 015/260] Fix scalar subquery index analysis --- src/Storages/MergeTree/KeyCondition.cpp | 11 ++++++++--- .../0_stateless/01649_with_alias_key_condition.sql | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 8f5dec8077d..6833d2e2fd4 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -444,7 +444,8 @@ bool KeyCondition::addCondition(const String & column, const Range & range) */ bool KeyCondition::getConstant(const ASTPtr & expr, Block & block_with_constants, Field & out_value, DataTypePtr & out_type) { - String column_name = expr->getColumnNameWithoutAlias(); + // Constant expr should use alias names if any + String column_name = expr->getColumnName(); if (const auto * lit = expr->as()) { @@ -607,7 +608,8 @@ bool KeyCondition::canConstantBeWrappedByMonotonicFunctions( if (strict) return false; - String expr_name = node->getColumnNameWithoutAlias(); + // Constant expr should use alias names if any + String expr_name = node->getColumnName(); const auto & sample_block = key_expr->getSampleBlock(); if (!sample_block.has(expr_name)) return false; @@ -675,7 +677,8 @@ bool KeyCondition::canConstantBeWrappedByFunctions( if (strict) return false; - String expr_name = ast->getColumnNameWithoutAlias(); + // Constant expr should use alias names if any + String expr_name = ast->getColumnName(); const auto & sample_block = key_expr->getSampleBlock(); if (!sample_block.has(expr_name)) return false; @@ -1011,6 +1014,8 @@ bool KeyCondition::isKeyPossiblyWrappedByMonotonicFunctionsImpl( * Therefore, use the full name of the expression for search. 
*/ const auto & sample_block = key_expr->getSampleBlock(); + + // Key columns should use canonical names for index analysis String name = node->getColumnNameWithoutAlias(); auto it = key_columns.find(name); diff --git a/tests/queries/0_stateless/01649_with_alias_key_condition.sql b/tests/queries/0_stateless/01649_with_alias_key_condition.sql index b813e6ee84f..0a796f8512e 100644 --- a/tests/queries/0_stateless/01649_with_alias_key_condition.sql +++ b/tests/queries/0_stateless/01649_with_alias_key_condition.sql @@ -6,6 +6,6 @@ insert into alias_key_condition values (1, 2), (3, 4); set force_primary_key = 1; -with i as k select * from alias_key_condition where k = 3; +with i as k select * from alias_key_condition where k = (select i from alias_key_condition where i = 3); drop table if exists alias_key_condition; From f49d6404f39807850ef8fca116dd180261cf3be2 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Mar 2021 11:03:47 +0300 Subject: [PATCH 016/260] Trying to add new nemesis --- .../src/jepsen/nukeeper/constants.clj | 9 ++++ .../src/jepsen/nukeeper/main.clj | 18 +++----- .../src/jepsen/nukeeper/nemesis.clj | 13 ++++++ .../src/jepsen/nukeeper/set.clj | 10 +++-- .../src/jepsen/nukeeper/utils.clj | 44 ++++++++++++++++++- 5 files changed, 78 insertions(+), 16 deletions(-) create mode 100644 tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj create mode 100644 tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj new file mode 100644 index 00000000000..0a20adea086 --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj @@ -0,0 +1,9 @@ +(ns jepsen.nukeeper.constants) + +(def dir "/var/lib/clickhouse") +(def binary "clickhouse") +(def logdir "/var/log/clickhouse-server") +(def logfile "/var/log/clickhouse-server/stderr.log") +(def serverlog "/var/log/clickhouse-server/clickhouse-server.log") +(def pidfile (str dir "/clickhouse.pid")) +(def binary-path "/tmp") diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 8aa157bc16e..2b244c924bd 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -2,7 +2,9 @@ (:require [clojure.tools.logging :refer :all] [jepsen.nukeeper.utils :refer :all] [jepsen.nukeeper.set :as set] + [jepsen.nukeeper.nemesis :as custom-nemesis] [jepsen.nukeeper.register :as register] + [jepsen.nukeeper.constants :refer :all] [clojure.string :as str] [jepsen [checker :as checker] @@ -23,14 +25,6 @@ [zookeeper :as zk]) (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) -(def dir "/var/lib/clickhouse") -(def binary "clickhouse") -(def logdir "/var/log/clickhouse-server") -(def logfile "/var/log/clickhouse-server/stderr.log") -(def serverlog "/var/log/clickhouse-server/clickhouse-server.log") -(def pidfile (str dir "/clickhouse.pid")) -(def binary-path "/tmp") - (defn cluster-config [test node config-template] (let [nodes (:nodes test)] @@ -66,13 +60,13 @@ (str binary-path "/clickhouse") :server :--config "/etc/clickhouse-server/config.xml") - (Thread/sleep 10000))) + (wait-clickhouse-alive! node test))) (teardown! [_ test node] (info node "tearing down clickhouse") (cu/stop-daemon! 
(str binary-path "/clickhouse") pidfile) (c/su - (c/exec :rm :-f (str binary-path "/clickhouse")) + ;(c/exec :rm :-f (str binary-path "/clickhouse")) (c/exec :rm :-rf dir) (c/exec :rm :-rf logdir) (c/exec :rm :-rf "/etc/clickhouse-server"))) @@ -111,10 +105,10 @@ opts {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts))) :os ubuntu/os - :db (db "rbtorrent:8831b5baa571abc28340cf66a9279a4ce45fac64") + :db (db "rbtorrent:46832e8fa975b094a5591184b3c854700ed770f4") :pure-generators true :client (:client workload) - :nemesis (nemesis/partition-random-halves) + :nemesis (custom-nemesis/random-single-node-killer-nemesis) :checker (checker/compose {:perf (checker/perf) :workload (:checker workload)}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj new file mode 100644 index 00000000000..2f359bc5cba --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -0,0 +1,13 @@ +(ns jepsen.nukeeper.nemesis + (:require [jepsen + [nemesis :as nemesis]] + [jepsen.nukeeper.utils :refer :all])) + + + +(defn random-single-node-killer-nemesis + [] + (nemesis/node-start-stopper + rand-nth + (fn start [test node] (kill-clickhouse! node test)) + (fn stop [test node] (start-clickhouse! node test)))) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj index 7e196fab4c7..6a33350673d 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -1,5 +1,7 @@ (ns jepsen.nukeeper.set - (:require [jepsen + (:require + [clojure.tools.logging :refer :all] + [jepsen [checker :as checker] [client :as client] [generator :as gen]] @@ -18,9 +20,11 @@ (invoke! [_ test op] (case (:f op) :read ;(try - (assoc op + (do (info "LIST ON NODE" (zk-list conn "/")) + (info "EXISTS NODE" (zk/exists conn "/a-set")) + (assoc op :type :ok - :value (read-string (:data (zk-get-str conn k)))) + :value (read-string (:data (zk-get-str conn k))))) ;(catch Exception _ (assoc op :type :fail, :error :connect-error))) :add (try (do diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index 3caec8e5f62..e398039a329 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -1,7 +1,11 @@ (ns jepsen.nukeeper.utils (:require [clojure.string :as str] [zookeeper.data :as data] - [zookeeper :as zk])) + [zookeeper :as zk] + [jepsen.control.util :as cu] + [jepsen.nukeeper.constants :refer :all] + [jepsen.control :as c] + [clojure.tools.logging :refer :all])) (defn parse-long "Parses a string to a Long. Passes through `nil` and empty strings." @@ -37,6 +41,10 @@ {:data (data/to-string (:data zk-result)) :stat (:stat zk-result)})) +(defn zk-list + [conn path] + (zk/children conn path)) + (defn zk-cas [conn path old-value new-value] (let [current-value (zk-get-str conn path)] @@ -54,3 +62,37 @@ (defn zk-create-if-not-exists [conn path data] (zk/create conn path :data (data/to-bytes (str data)))) + + +(defn clickhouse-alive? + [node test] + (info "Checking server alive on" node) + (try + (c/exec (str binary-path "/clickhouse") :client :--query "SELECT 1") + (catch Exception _ false))) + +(defn wait-clickhouse-alive! + [node test & {:keys [maxtries] :or {maxtries 30}}] + (loop [i 0] + (cond (> i maxtries) false + (clickhouse-alive? 
node test) true + :else (do (Thread/sleep 1000) (recur (inc i)))))) + +(defn kill-clickhouse! + [node test] + (info "Killing server on node" node) + (c/su + (cu/stop-daemon! (str binary-path "/clickhouse") pidfile))) + +(defn start-clickhouse! + [node test] + (info "Starting server on node" node) + (c/su + (cu/start-daemon! + {:pidfile pidfile + :logfile logfile + :chdir dir} + (str binary-path "/clickhouse") + :server + :--config "/etc/clickhouse-server/config.xml")) + (wait-clickhouse-alive! node test)) From 6454479edda94ed7df6b00e25f77388139ae0fb8 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Mar 2021 14:44:43 +0300 Subject: [PATCH 017/260] Add useful util for state dump --- .../src/jepsen/nukeeper/constants.clj | 3 + .../src/jepsen/nukeeper/main.clj | 9 +- .../src/jepsen/nukeeper/nemesis.clj | 1 - .../src/jepsen/nukeeper/set.clj | 13 +-- utils/CMakeLists.txt | 1 + utils/nukeeper-data-dumper/CMakeLists.txt | 2 + utils/nukeeper-data-dumper/main.cpp | 87 +++++++++++++++++++ 7 files changed, 108 insertions(+), 8 deletions(-) create mode 100644 utils/nukeeper-data-dumper/CMakeLists.txt create mode 100644 utils/nukeeper-data-dumper/main.cpp diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj index 0a20adea086..511ff8e3bf3 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj @@ -5,5 +5,8 @@ (def logdir "/var/log/clickhouse-server") (def logfile "/var/log/clickhouse-server/stderr.log") (def serverlog "/var/log/clickhouse-server/clickhouse-server.log") +(def snapshotsdir "/var/lib/clickhouse/coordination/snapshots") +(def coordinationdir "/var/lib/clickhouse/coordination") +(def logsdir "/var/lib/clickhouse/coordination/logs") (def pidfile (str dir "/clickhouse.pid")) (def binary-path "/tmp") diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 2b244c924bd..1153f6f1389 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -15,7 +15,8 @@ [nemesis :as nemesis] [generator :as gen] [independent :as independent] - [tests :as tests]] + [tests :as tests] + [util :as util :refer [meh]]] [jepsen.control.util :as cu] [jepsen.os.ubuntu :as ubuntu] [jepsen.checker.timeline :as timeline] @@ -73,7 +74,11 @@ db/LogFiles (log-files [_ test node] - [logfile serverlog]))) + (c/su + (cu/stop-daemon! (str binary-path "/clickhouse") pidfile) + (c/cd dir + (c/exec :tar :czf "coordination.tar.gz" "coordination"))) + [logfile serverlog (str dir "/coordination.tar.gz")]))) (def workloads "A map of workload names to functions that construct workloads, given opts." 
diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index 2f359bc5cba..84253dd6d42 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -4,7 +4,6 @@ [jepsen.nukeeper.utils :refer :all])) - (defn random-single-node-killer-nemesis [] (nemesis/node-start-stopper diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj index 6a33350673d..fcdfa138c4c 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -9,18 +9,21 @@ [zookeeper :as zk]) (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) -(defrecord SetClient [k conn] +(defrecord SetClient [k conn nodename] client/Client (open! [this test node] - (assoc this :conn (zk-connect node 9181 30000))) + (assoc + (assoc this + :conn (zk-connect node 9181 30000)) + :nodename node)) (setup! [this test] (zk-create-if-not-exists conn k "#{}")) - (invoke! [_ test op] + (invoke! [this test op] (case (:f op) :read ;(try - (do (info "LIST ON NODE" (zk-list conn "/")) + (do (info "LIST ON NODE" nodename (zk-list conn "/")) (info "EXISTS NODE" (zk/exists conn "/a-set")) (assoc op :type :ok @@ -40,7 +43,7 @@ (defn workload "A generator, client, and checker for a set test." [opts] - {:client (SetClient. "/a-set" nil) + {:client (SetClient. "/a-set" nil nil) :checker (checker/set) :generator (->> (range) (map (fn [x] {:type :invoke, :f :add, :value x}))) diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt index d38b34f3419..dc077f0e49a 100644 --- a/utils/CMakeLists.txt +++ b/utils/CMakeLists.txt @@ -21,6 +21,7 @@ if (NOT DEFINED ENABLE_UTILS OR ENABLE_UTILS) add_subdirectory (corrector_utf8) add_subdirectory (zookeeper-cli) add_subdirectory (zookeeper-test) + add_subdirectory (nukeeper-data-dumper) add_subdirectory (zookeeper-dump-tree) add_subdirectory (zookeeper-remove-by-list) add_subdirectory (zookeeper-create-entry-to-download-part) diff --git a/utils/nukeeper-data-dumper/CMakeLists.txt b/utils/nukeeper-data-dumper/CMakeLists.txt new file mode 100644 index 00000000000..bab1137bf4d --- /dev/null +++ b/utils/nukeeper-data-dumper/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(nukeeper-data-dumper main.cpp) +target_link_libraries(nukeeper-data-dumper PRIVATE dbms) diff --git a/utils/nukeeper-data-dumper/main.cpp b/utils/nukeeper-data-dumper/main.cpp new file mode 100644 index 00000000000..20682bdb366 --- /dev/null +++ b/utils/nukeeper-data-dumper/main.cpp @@ -0,0 +1,87 @@ +#include +#include +#include +#include +#include +#include +#include // Y_IGNORE +#include +#include +#include + +using namespace Coordination; +using namespace DB; + +void dumpMachine(std::shared_ptr machine) +{ + auto & storage = machine->getStorage(); + std::queue keys; + keys.push("/"); + + while (!keys.empty()) + { + auto key = keys.front(); + keys.pop(); + auto value = storage.container.getValue(key); + std::cout << key << "\n"; + std::cout << "\tStat: {version: " << value.stat.version << + ", mtime: " << value.stat.mtime << + ", emphemeralOwner: " << value.stat.ephemeralOwner << + ", czxid: " << value.stat.czxid << + ", mzxid: " << value.stat.mzxid << + ", numChildren: " << value.stat.numChildren << + ", dataLength: " << value.stat.dataLength << + "}" << std::endl; + std::cout << "\tData: " << storage.container.getValue(key).data << std::endl; + + for (const auto & child 
: value.children) + { + if (key == "/") + keys.push(key + child); + else + keys.push(key + "/" + child); + } + } + std::cout << std::flush; +} + +int main(int argc, char *argv[]) +{ + if (argc != 3) + { + std::cerr << "usage: " << argv[0] << " snapshotpath logpath" << std::endl; + return 3; + } + else + { + Poco::AutoPtr channel(new Poco::ConsoleChannel(std::cerr)); + Poco::Logger::root().setChannel(channel); + Poco::Logger::root().setLevel("trace"); + } + auto * logger = &Poco::Logger::get("nukeeper-dumper"); + ResponsesQueue queue; + SnapshotsQueue snapshots_queue{1}; + CoordinationSettingsPtr settings = std::make_shared(); + auto state_machine = std::make_shared(queue, snapshots_queue, argv[1], settings); + state_machine->init(); + size_t last_commited_index = state_machine->last_commit_index(); + + LOG_INFO(logger, "Last commited index: {}", last_commited_index); + + DB::NuKeeperLogStore changelog(argv[2], 10000000, true); + changelog.init(last_commited_index, 10000000000UL); /// collect all logs + if (changelog.size() == 0) + LOG_INFO(logger, "Changelog empty"); + else + LOG_INFO(logger, "Last changelog entry {}", changelog.next_slot() - 1); + + for (size_t i = last_commited_index + 1; i < changelog.next_slot(); ++i) + { + if (changelog.entry_at(i)->get_val_type() == nuraft::log_val_type::app_log) + state_machine->commit(i, changelog.entry_at(i)->get_buf()); + } + + dumpMachine(state_machine); + + return 0; +} From 077a2019b6e577b530c7edd116b16dbe35168692 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Mar 2021 15:36:54 +0300 Subject: [PATCH 018/260] Found first real bug with jepsen --- src/Coordination/NuKeeperStorage.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/Coordination/NuKeeperStorage.cpp b/src/Coordination/NuKeeperStorage.cpp index fff44163b71..2440d6f6613 100644 --- a/src/Coordination/NuKeeperStorage.cpp +++ b/src/Coordination/NuKeeperStorage.cpp @@ -641,6 +641,13 @@ NuKeeperStorage::ResponsesForSessions NuKeeperStorage::processRequest(const Coor for (const auto & ephemeral_path : it->second) { container.erase(ephemeral_path); + container.updateValue(parentPath(ephemeral_path), [&ephemeral_path] (NuKeeperStorage::Node & parent) + { + --parent.stat.numChildren; + ++parent.stat.cversion; + parent.children.erase(getBaseName(ephemeral_path)); + }); + auto responses = processWatchesImpl(ephemeral_path, watches, list_watches, Coordination::Event::DELETED); results.insert(results.end(), responses.begin(), responses.end()); } From 8cf8265d474b038c60ffdb5a855451cadb24520c Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Mar 2021 15:37:46 +0300 Subject: [PATCH 019/260] Style --- .../src/jepsen/nukeeper/main.clj | 9 ++++--- .../src/jepsen/nukeeper/nemesis.clj | 26 +++++++++++++++---- .../src/jepsen/nukeeper/set.clj | 18 ++++++------- .../src/jepsen/nukeeper/utils.clj | 21 +++++++-------- utils/nukeeper-data-dumper/main.cpp | 2 +- 5 files changed, 46 insertions(+), 30 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 1153f6f1389..dd40b7e399b 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -75,9 +75,9 @@ db/LogFiles (log-files [_ test node] (c/su - (cu/stop-daemon! (str binary-path "/clickhouse") pidfile) - (c/cd dir - (c/exec :tar :czf "coordination.tar.gz" "coordination"))) + (cu/stop-daemon! 
(str binary-path "/clickhouse") pidfile) + (c/cd dir + (c/exec :tar :czf "coordination.tar.gz" "coordination"))) [logfile serverlog (str dir "/coordination.tar.gz")]))) (def workloads @@ -105,7 +105,8 @@ :concurrency, ...), constructs a test map." [opts] (let [quorum (boolean (:quorum opts)) - workload ((get workloads (:workload opts)) opts)] + workload ((get workloads (:workload opts)) opts) + current-nemesis (get custom-nemesis/custom-nemesises "killer")] (merge tests/noop-test opts {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts))) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index 84253dd6d42..620ad1bd3d3 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -1,12 +1,28 @@ (ns jepsen.nukeeper.nemesis (:require [jepsen - [nemesis :as nemesis]] + [nemesis :as nemesis] + [generator :as gen]] [jepsen.nukeeper.utils :refer :all])) - (defn random-single-node-killer-nemesis [] (nemesis/node-start-stopper - rand-nth - (fn start [test node] (kill-clickhouse! node test)) - (fn stop [test node] (start-clickhouse! node test)))) + rand-nth + (fn start [test node] (kill-clickhouse! node test)) + (fn stop [test node] (start-clickhouse! node test)))) + +(def custom-nemesises + {"killer" {:nemesis (random-single-node-killer-nemesis) + :generator + (gen/nemesis + (cycle [(gen/sleep 5) + {:type :info, :f :start} + (gen/sleep 5) + {:type :info, :f :stop}]))} + "simple-partitioner" {:nemesis (nemesis/partition-random-halves) + :generator + (gen/nemesis + (cycle [(gen/sleep 5) + {:type :info, :f :start} + (gen/sleep 5) + {:type :info, :f :stop}]))}}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj index fcdfa138c4c..f2f614b2d17 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -1,12 +1,12 @@ (ns jepsen.nukeeper.set (:require - [clojure.tools.logging :refer :all] - [jepsen - [checker :as checker] - [client :as client] - [generator :as gen]] - [jepsen.nukeeper.utils :refer :all] - [zookeeper :as zk]) + [clojure.tools.logging :refer :all] + [jepsen + [checker :as checker] + [client :as client] + [generator :as gen]] + [jepsen.nukeeper.utils :refer :all] + [zookeeper :as zk]) (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) (defrecord SetClient [k conn nodename] @@ -26,8 +26,8 @@ (do (info "LIST ON NODE" nodename (zk-list conn "/")) (info "EXISTS NODE" (zk/exists conn "/a-set")) (assoc op - :type :ok - :value (read-string (:data (zk-get-str conn k))))) + :type :ok + :value (read-string (:data (zk-get-str conn k))))) ;(catch Exception _ (assoc op :type :fail, :error :connect-error))) :add (try (do diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index e398039a329..19b4959d742 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -63,13 +63,12 @@ [conn path data] (zk/create conn path :data (data/to-bytes (str data)))) - (defn clickhouse-alive? 
[node test] (info "Checking server alive on" node) (try - (c/exec (str binary-path "/clickhouse") :client :--query "SELECT 1") - (catch Exception _ false))) + (c/exec (str binary-path "/clickhouse") :client :--query "SELECT 1") + (catch Exception _ false))) (defn wait-clickhouse-alive! [node test & {:keys [maxtries] :or {maxtries 30}}] @@ -82,17 +81,17 @@ [node test] (info "Killing server on node" node) (c/su - (cu/stop-daemon! (str binary-path "/clickhouse") pidfile))) + (cu/stop-daemon! (str binary-path "/clickhouse") pidfile))) (defn start-clickhouse! [node test] (info "Starting server on node" node) (c/su - (cu/start-daemon! - {:pidfile pidfile - :logfile logfile - :chdir dir} - (str binary-path "/clickhouse") - :server - :--config "/etc/clickhouse-server/config.xml")) + (cu/start-daemon! + {:pidfile pidfile + :logfile logfile + :chdir dir} + (str binary-path "/clickhouse") + :server + :--config "/etc/clickhouse-server/config.xml")) (wait-clickhouse-alive! node test)) diff --git a/utils/nukeeper-data-dumper/main.cpp b/utils/nukeeper-data-dumper/main.cpp index 20682bdb366..0340c94c5a0 100644 --- a/utils/nukeeper-data-dumper/main.cpp +++ b/utils/nukeeper-data-dumper/main.cpp @@ -22,8 +22,8 @@ void dumpMachine(std::shared_ptr machine) { auto key = keys.front(); keys.pop(); - auto value = storage.container.getValue(key); std::cout << key << "\n"; + auto value = storage.container.getValue(key); std::cout << "\tStat: {version: " << value.stat.version << ", mtime: " << value.stat.mtime << ", emphemeralOwner: " << value.stat.ephemeralOwner << From 63873f46bbb791cc9f5f094af3a7b35a64e4f04a Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Mar 2021 15:40:28 +0300 Subject: [PATCH 020/260] Create persistent nodes in tests --- tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj | 6 ++---- tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj index f2f614b2d17..deb69c3ced4 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -22,13 +22,11 @@ (invoke! [this test op] (case (:f op) - :read ;(try - (do (info "LIST ON NODE" nodename (zk-list conn "/")) - (info "EXISTS NODE" (zk/exists conn "/a-set")) + :read + (do (assoc op :type :ok :value (read-string (:data (zk-get-str conn k))))) - ;(catch Exception _ (assoc op :type :fail, :error :connect-error))) :add (try (do (zk-add-to-set conn k (:value op)) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index 19b4959d742..9912b34cd46 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -61,7 +61,7 @@ (defn zk-create-if-not-exists [conn path data] - (zk/create conn path :data (data/to-bytes (str data)))) + (zk/create conn path :data (data/to-bytes (str data)) :persistent? true)) (defn clickhouse-alive? 
[node test] From 54fbea68a194cccfd286cc76b9224684667ec5f8 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Mar 2021 17:53:49 +0300 Subject: [PATCH 021/260] Add hammer-time nemesis --- tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj | 4 ++-- tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index dd40b7e399b..6e3777d3141 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -111,10 +111,10 @@ opts {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts))) :os ubuntu/os - :db (db "rbtorrent:46832e8fa975b094a5591184b3c854700ed770f4") + :db (db "rbtorrent:a122093aee0bdcb70ca42d5e5fb4ba5544372f5f") :pure-generators true :client (:client workload) - :nemesis (custom-nemesis/random-single-node-killer-nemesis) + :nemesis (custom-nemesis/hammer-time-nemesis) :checker (checker/compose {:perf (checker/perf) :workload (:checker workload)}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index 620ad1bd3d3..f3e01714128 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -11,6 +11,10 @@ (fn start [test node] (kill-clickhouse! node test)) (fn stop [test node] (start-clickhouse! node test)))) +(defn hammer-time-nemesis + [] + (nemesis/hammer-time "clickhouse")) + (def custom-nemesises {"killer" {:nemesis (random-single-node-killer-nemesis) :generator From 82b2c34c4029ab0dd80ba0bf97974d2ffb1285d2 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Mar 2021 23:27:09 +0300 Subject: [PATCH 022/260] Remove strange file --- tests/jepsen.nukeeper/CHANGELOG.md | 24 ------------------------ 1 file changed, 24 deletions(-) delete mode 100644 tests/jepsen.nukeeper/CHANGELOG.md diff --git a/tests/jepsen.nukeeper/CHANGELOG.md b/tests/jepsen.nukeeper/CHANGELOG.md deleted file mode 100644 index 6c7cb4f7c8a..00000000000 --- a/tests/jepsen.nukeeper/CHANGELOG.md +++ /dev/null @@ -1,24 +0,0 @@ -# Change Log -All notable changes to this project will be documented in this file. This change log follows the conventions of [keepachangelog.com](http://keepachangelog.com/). - -## [Unreleased] -### Changed -- Add a new arity to `make-widget-async` to provide a different widget shape. - -## [0.1.1] - 2021-03-10 -### Changed -- Documentation on how to make the widgets. - -### Removed -- `make-widget-sync` - we're all async, all the time. - -### Fixed -- Fixed widget maker to keep working when daylight savings switches over. - -## 0.1.0 - 2021-03-10 -### Added -- Files from the new template. -- Widget maker public API - `make-widget-sync`. 
- -[Unreleased]: https://github.com/your-name/jepsen.nukeeper/compare/0.1.1...HEAD -[0.1.1]: https://github.com/your-name/jepsen.nukeeper/compare/0.1.0...0.1.1 From 46af999f3aea5db1963ed241062ed3048af8f103 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Mar 2021 10:11:55 +0300 Subject: [PATCH 023/260] Trying to add corruption nemesis --- .../src/jepsen/nukeeper/main.clj | 6 +-- .../src/jepsen/nukeeper/nemesis.clj | 44 ++++++++++++++++++- .../src/jepsen/nukeeper/set.clj | 6 +-- 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 6e3777d3141..d62cbabd56f 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -114,7 +114,7 @@ :db (db "rbtorrent:a122093aee0bdcb70ca42d5e5fb4ba5544372f5f") :pure-generators true :client (:client workload) - :nemesis (custom-nemesis/hammer-time-nemesis) + :nemesis (custom-nemesis/logs-corruption-nemesis) :checker (checker/compose {:perf (checker/perf) :workload (:checker workload)}) @@ -123,9 +123,7 @@ (gen/stagger (/ (:rate opts))) (gen/nemesis (cycle [(gen/sleep 5) - {:type :info, :f :start} - (gen/sleep 5) - {:type :info, :f :stop}])) + {:type :info, :f :corrupt}])) (gen/time-limit (:time-limit opts))) (gen/log "Healing cluster") (gen/nemesis (gen/once {:type :info, :f :stop})) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index f3e01714128..6b0497cd0af 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -1,7 +1,11 @@ (ns jepsen.nukeeper.nemesis - (:require [jepsen + (:require + [clojure.tools.logging :refer :all] + [jepsen [nemesis :as nemesis] + [control :as c] [generator :as gen]] + [jepsen.nukeeper.constants :refer :all] [jepsen.nukeeper.utils :refer :all])) (defn random-single-node-killer-nemesis @@ -15,6 +19,44 @@ [] (nemesis/hammer-time "clickhouse")) +(defn select-last-file + [path] + (info "EXECUTE ON PATH" path) + (last (clojure.string/split (c/exec :find path :-type :f :-printf "%T+ $PWD%p\n" :| :sort :| :awk "'{print $2}'")) #"\n")) + +(defn corrupt-file + [fname] + (c/exec :dd "if=/dev/zero" ("str of=" fname) "bs=1" "count=1" "seek=N" "conv=notrunc")) + +(defn corruptor-nemesis + [path corruption-op] + (reify nemesis/Nemesis + (setup! [this test] this) + + (invoke! [this test op] + (let [nodes (list (rand-nth (:nodes test)))] + (info "Corruption on node" nodes) + (c/on-nodes test nodes + (fn [node] + (let [file-to-corrupt (select-last-file path)] + (info "Corrupting file" file-to-corrupt) + (c/su + (corruption-op (select-last-file path)) + (kill-clickhouse! node test) + (start-clickhouse! node test))))) + {:f (:f op) + :value :corrupted})) + + (teardown! 
[this test]))) + +(defn logs-corruption-nemesis + [] + (corruptor-nemesis logsdir corrupt-file)) + +(defn snapshots-corruption-nemesis + [] + (corruptor-nemesis snapshotsdir corrupt-file)) + (def custom-nemesises {"killer" {:nemesis (random-single-node-killer-nemesis) :generator diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj index deb69c3ced4..d50253aa174 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -24,9 +24,9 @@ (case (:f op) :read (do - (assoc op - :type :ok - :value (read-string (:data (zk-get-str conn k))))) + (assoc op + :type :ok + :value (read-string (:data (zk-get-str conn k))))) :add (try (do (zk-add-to-set conn k (:value op)) From d9f835a242743116332604fde39db3c74aa0afc9 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Mar 2021 11:13:52 +0300 Subject: [PATCH 024/260] Finally corrupted logs --- .../src/jepsen/nukeeper/nemesis.clj | 38 ++++++++++++------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index 6b0497cd0af..bf2348f1860 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -22,30 +22,40 @@ (defn select-last-file [path] (info "EXECUTE ON PATH" path) - (last (clojure.string/split (c/exec :find path :-type :f :-printf "%T+ $PWD%p\n" :| :sort :| :awk "'{print $2}'")) #"\n")) + (last (clojure.string/split + (c/exec :find path :-type :f :-printf "%T+ %p\n" :| :sort :| :awk "{print $2}") + #"\n"))) + +(defn random-file-pos + [fname] + (let [fsize (Integer/parseInt (c/exec :du :-b fname :| :cut :-f1))] + (rand-int fsize))) (defn corrupt-file [fname] - (c/exec :dd "if=/dev/zero" ("str of=" fname) "bs=1" "count=1" "seek=N" "conv=notrunc")) + (info "Corrupting" fname) + (c/exec :dd "if=/dev/zero" (str "of=" fname) "bs=1" "count=1" (str "seek=" (random-file-pos fname)) "conv=notrunc")) (defn corruptor-nemesis [path corruption-op] (reify nemesis/Nemesis + (setup! [this test] this) (invoke! [this test op] - (let [nodes (list (rand-nth (:nodes test)))] - (info "Corruption on node" nodes) - (c/on-nodes test nodes - (fn [node] - (let [file-to-corrupt (select-last-file path)] - (info "Corrupting file" file-to-corrupt) - (c/su - (corruption-op (select-last-file path)) - (kill-clickhouse! node test) - (start-clickhouse! node test))))) - {:f (:f op) - :value :corrupted})) + (cond (= (:f op) :corrupt) + (let [nodes (list (rand-nth (:nodes test)))] + (info "Corruption on node" nodes) + (c/on-nodes test nodes + (fn [test node] + (let [file-to-corrupt (select-last-file path)] + (info "Corrupting file" file-to-corrupt) + (c/su + (corruption-op (select-last-file path)) + (kill-clickhouse! node test) + (start-clickhouse! node test))))) + (assoc op :type :info, :value :corrupted)) + :else (assoc op :type :info, :value :not-started))) (teardown! 
[this test]))) From 05a9f8a793f9da3b42c1e6ad4de2d3253f038a94 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 17 Mar 2021 12:34:59 +0300 Subject: [PATCH 025/260] Update 01755_shard_pruning_with_literal.sql --- .../0_stateless/01755_shard_pruning_with_literal.sql | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/01755_shard_pruning_with_literal.sql b/tests/queries/0_stateless/01755_shard_pruning_with_literal.sql index 0816ac6e88b..0e93d76573c 100644 --- a/tests/queries/0_stateless/01755_shard_pruning_with_literal.sql +++ b/tests/queries/0_stateless/01755_shard_pruning_with_literal.sql @@ -1,15 +1,14 @@ set optimize_skip_unused_shards=1; drop table if exists data_01755; - drop table if exists dist_01755; create table data_01755 (i Int) Engine=Memory; - create table dist_01755 as data_01755 Engine=Distributed(test_cluster_two_shards, currentDatabase(), data_01755, i); insert into data_01755 values (1); -select * from dist_01755 where 0; - select * from dist_01755 where 1 settings enable_early_constant_folding = 0; + +drop table if exists data_01755; +drop table if exists dist_01755; From 5e20ea2c33a19cdc053a86c94295dba3671a7ba2 Mon Sep 17 00:00:00 2001 From: fuqi Date: Wed, 17 Mar 2021 18:49:24 +0800 Subject: [PATCH 026/260] optimize select final with prewhere primary key --- src/Interpreters/InterpreterSelectQuery.cpp | 2 +- .../MergeTree/MergeTreeWhereOptimizer.cpp | 23 +++++++++++-------- .../MergeTree/MergeTreeWhereOptimizer.h | 9 +++++--- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index d0c8966cf07..45ded5223e9 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -393,7 +393,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( view = nullptr; } - if (try_move_to_prewhere && storage && query.where() && !query.prewhere() && !query.final()) + if (try_move_to_prewhere && storage && query.where() && !query.prewhere()) { /// PREWHERE optimization: transfer some condition from WHERE to PREWHERE if enabled and viable if (const auto & column_sizes = storage->getColumnSizes(); !column_sizes.empty()) diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index 34cac56d74c..b80c0700602 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -36,7 +36,8 @@ MergeTreeWhereOptimizer::MergeTreeWhereOptimizer( Poco::Logger * log_) : table_columns{ext::map( metadata_snapshot->getColumns().getAllPhysical(), [](const NameAndTypePair & col) { return col.name; })} - , queried_columns{queried_columns_} + , queried_columns{queried_columns_}, + , primary_key_columns{metadata_snapshot->getPrimaryKey().column_names} , block_with_constants{KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)} , log{log_} , column_sizes{std::move(column_sizes_)} @@ -114,7 +115,7 @@ static bool isConditionGood(const ASTPtr & condition) } -void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node) const +void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, bool final) const { if (const auto * func_and = node->as(); func_and && func_and->name == "and") { @@ -133,7 +134,7 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node) cond.viable = /// Condition depend on 
some column. Constant expressions are not moved. !cond.identifiers.empty() - && !cannotBeMoved(node) + && !cannotBeMoved(node, final) /// Do not take into consideration the conditions consisting only of the first primary key column && !hasPrimaryKeyAtoms(node) /// Only table columns are considered. Not array joined columns. NOTE We're assuming that aliases was expanded. @@ -149,10 +150,10 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node) } /// Transform conjunctions chain in WHERE expression to Conditions list. -MergeTreeWhereOptimizer::Conditions MergeTreeWhereOptimizer::analyze(const ASTPtr & expression) const +MergeTreeWhereOptimizer::Conditions MergeTreeWhereOptimizer::analyze(const ASTPtr & expression, bool final) const { Conditions res; - analyzeImpl(res, expression); + analyzeImpl(res, expression, final); return res; } @@ -183,7 +184,7 @@ void MergeTreeWhereOptimizer::optimize(ASTSelectQuery & select) const if (!select.where() || select.prewhere()) return; - Conditions where_conditions = analyze(select.where()); + Conditions where_conditions = analyze(select.where(), select.final()); Conditions prewhere_conditions; UInt64 total_size_of_moved_conditions = 0; @@ -299,6 +300,9 @@ bool MergeTreeWhereOptimizer::isPrimaryKeyAtom(const ASTPtr & ast) const return false; } +bool MergeTreeWhereOptimizer::isPrimaryKey(const String & columnName) const { + return std::find(primary_key_columns.begin(), primary_key_columns.end(), columnName) != primary_key_columns.end(); +} bool MergeTreeWhereOptimizer::isConstant(const ASTPtr & expr) const { @@ -319,7 +323,7 @@ bool MergeTreeWhereOptimizer::isSubsetOfTableColumns(const NameSet & identifiers } -bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr) const +bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr, bool final) const { if (const auto * function_ptr = ptr->as()) { @@ -336,12 +340,13 @@ bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr) const { /// disallow moving result of ARRAY JOIN to PREWHERE if (array_joined_names.count(*opt_name) || - array_joined_names.count(Nested::extractTableName(*opt_name))) + array_joined_names.count(Nested::extractTableName(*opt_name)) || + (final && !isPrimaryKey(*opt_name))) return true; } for (const auto & child : ptr->children) - if (cannotBeMoved(child)) + if (cannotBeMoved(child, final)) return true; return false; diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h index cad77fb9eed..83c45efef74 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h @@ -67,10 +67,10 @@ private: using Conditions = std::list; - void analyzeImpl(Conditions & res, const ASTPtr & node) const; + void analyzeImpl(Conditions & res, const ASTPtr & node, bool final) const; /// Transform conjunctions chain in WHERE expression to Conditions list. - Conditions analyze(const ASTPtr & expression) const; + Conditions analyze(const ASTPtr & expression, bool final) const; /// Transform Conditions list to WHERE or PREWHERE expression. static ASTPtr reconstruct(const Conditions & conditions); @@ -85,6 +85,8 @@ private: bool isPrimaryKeyAtom(const ASTPtr & ast) const; + bool isPrimaryKey(const String & columnName) const; + bool isConstant(const ASTPtr & expr) const; bool isSubsetOfTableColumns(const NameSet & identifiers) const; @@ -95,7 +97,7 @@ private: * * Also, disallow moving expressions with GLOBAL [NOT] IN. 
*/ - bool cannotBeMoved(const ASTPtr & ptr) const; + bool cannotBeMoved(const ASTPtr & ptr, bool final) const; void determineArrayJoinedNames(ASTSelectQuery & select); @@ -104,6 +106,7 @@ private: String first_primary_key_column; const StringSet table_columns; const Names queried_columns; + const Names primary_key_columns; const Block block_with_constants; Poco::Logger * log; std::unordered_map column_sizes; From 9ab713c2e17b3b055ad76224c095636c5d4b5663 Mon Sep 17 00:00:00 2001 From: fuqi Date: Wed, 17 Mar 2021 18:57:58 +0800 Subject: [PATCH 027/260] optimize select final with prewhere primary key --- src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp | 14 +++++++------- src/Storages/MergeTree/MergeTreeWhereOptimizer.h | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index b80c0700602..792884689d5 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -115,7 +115,7 @@ static bool isConditionGood(const ASTPtr & condition) } -void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, bool final) const +void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, bool isFinal) const { if (const auto * func_and = node->as(); func_and && func_and->name == "and") { @@ -134,7 +134,7 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, cond.viable = /// Condition depend on some column. Constant expressions are not moved. !cond.identifiers.empty() - && !cannotBeMoved(node, final) + && !cannotBeMoved(node, isFinal) /// Do not take into consideration the conditions consisting only of the first primary key column && !hasPrimaryKeyAtoms(node) /// Only table columns are considered. Not array joined columns. NOTE We're assuming that aliases was expanded. @@ -150,10 +150,10 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, } /// Transform conjunctions chain in WHERE expression to Conditions list. 
-MergeTreeWhereOptimizer::Conditions MergeTreeWhereOptimizer::analyze(const ASTPtr & expression, bool final) const +MergeTreeWhereOptimizer::Conditions MergeTreeWhereOptimizer::analyze(const ASTPtr & expression, bool isFinal) const { Conditions res; - analyzeImpl(res, expression, final); + analyzeImpl(res, expression, isFinal); return res; } @@ -323,7 +323,7 @@ bool MergeTreeWhereOptimizer::isSubsetOfTableColumns(const NameSet & identifiers } -bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr, bool final) const +bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr, bool isFinal) const { if (const auto * function_ptr = ptr->as()) { @@ -341,12 +341,12 @@ bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr, bool final) cons /// disallow moving result of ARRAY JOIN to PREWHERE if (array_joined_names.count(*opt_name) || array_joined_names.count(Nested::extractTableName(*opt_name)) || - (final && !isPrimaryKey(*opt_name))) + (isFinal && !isPrimaryKey(*opt_name))) return true; } for (const auto & child : ptr->children) - if (cannotBeMoved(child, final)) + if (cannotBeMoved(child, isFinal)) return true; return false; diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h index 83c45efef74..45eb077ed96 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h @@ -67,10 +67,10 @@ private: using Conditions = std::list; - void analyzeImpl(Conditions & res, const ASTPtr & node, bool final) const; + void analyzeImpl(Conditions & res, const ASTPtr & node, bool isFinal) const; /// Transform conjunctions chain in WHERE expression to Conditions list. - Conditions analyze(const ASTPtr & expression, bool final) const; + Conditions analyze(const ASTPtr & expression, bool isFinal) const; /// Transform Conditions list to WHERE or PREWHERE expression. static ASTPtr reconstruct(const Conditions & conditions); @@ -97,7 +97,7 @@ private: * * Also, disallow moving expressions with GLOBAL [NOT] IN. 
*/ - bool cannotBeMoved(const ASTPtr & ptr, bool final) const; + bool cannotBeMoved(const ASTPtr & ptr, bool isFinal) const; void determineArrayJoinedNames(ASTSelectQuery & select); From 341e22341944a405af306e5dd75631f81228c8e6 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Mar 2021 14:35:37 +0300 Subject: [PATCH 028/260] Better corruption nemesises, options --- .../resources/test_keeper_config.xml | 3 + .../src/jepsen/nukeeper/main.clj | 45 +++++--- .../src/jepsen/nukeeper/nemesis.clj | 103 +++++++++++------- .../src/jepsen/nukeeper/utils.clj | 3 +- 4 files changed, 99 insertions(+), 55 deletions(-) diff --git a/tests/jepsen.nukeeper/resources/test_keeper_config.xml b/tests/jepsen.nukeeper/resources/test_keeper_config.xml index 0e2a688ea0b..7ef34d4bea1 100644 --- a/tests/jepsen.nukeeper/resources/test_keeper_config.xml +++ b/tests/jepsen.nukeeper/resources/test_keeper_config.xml @@ -10,6 +10,9 @@ 60000 trace {quorum_reads} + {snapshot_distance} + {stale_log_gap} + {reserved_log_items} diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index d62cbabd56f..a5ceae5d5ae 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -28,16 +28,16 @@ (defn cluster-config [test node config-template] - (let [nodes (:nodes test)] - (clojure.string/replace - (clojure.string/replace - (clojure.string/replace - (clojure.string/replace - (clojure.string/replace config-template #"\{quorum_reads\}" (str (boolean (:quorum test)))) - #"\{srv1\}" (get nodes 0)) - #"\{srv2\}" (get nodes 1)) - #"\{srv3\}" (get nodes 2)) - #"\{id\}" (str (inc (.indexOf nodes node)))))) + (let [nodes (:nodes test) + replacement-map {#"\{srv1\}" (get nodes 0) + #"\{srv2\}" (get nodes 1) + #"\{srv3\}" (get nodes 2) + #"\{id\}" (str (inc (.indexOf nodes node))) + #"\{quorum_reads\}" (str (boolean (:quorum test))) + #"\{snapshot_distance\}" (str (:snapshot-distance test)) + #"\{stale_log_gap\}" (str (:stale-log-gap test)) + #"\{reserved_log_items\}" (str (:reserved-log-items test))}] + (reduce #(clojure.string/replace %1 (get %2 0) (get %2 1)) config-template replacement-map))) (defn db [version] @@ -90,11 +90,26 @@ [["-w" "--workload NAME" "What workload should we run?" :missing (str "--workload " (cli/one-of workloads)) :validate [workloads (cli/one-of workloads)]] + [nil "--nemesis NAME" "Which nemesis will poison our lives?" + :missing (str "--nemesis " (cli/one-of custom-nemesis/custom-nemesises)) + :validate [custom-nemesis/custom-nemesises (cli/one-of custom-nemesis/custom-nemesises)]] ["-q" "--quorum" "Use quorum reads, instead of reading from any primary."] ["-r" "--rate HZ" "Approximate number of requests per second, per thread." :default 10 :parse-fn read-string :validate [#(and (number? %) (pos? %)) "Must be a positive number"]] + ["-s" "--snapshot-distance NUM" "Number of log entries to create snapshot" + :default 10000 + :parse-fn read-string + :validate [#(and (number? %) (pos? %)) "Must be a positive number"]] + [nil "--stale-log-gap NUM" "Number of log entries to send snapshot instead of separate logs" + :default 1000 + :parse-fn read-string + :validate [#(and (number? %) (pos? %)) "Must be a positive number"]] + [nil "--reserved-log-items NUM" "Number of log entries to keep after snapshot" + :default 1000 + :parse-fn read-string + :validate [#(and (number? %) (pos? %)) "Must be a positive number"]] [nil "--ops-per-key NUM" "Maximum number of operations on any given key." 
:default 100 :parse-fn parse-long @@ -106,24 +121,22 @@ [opts] (let [quorum (boolean (:quorum opts)) workload ((get workloads (:workload opts)) opts) - current-nemesis (get custom-nemesis/custom-nemesises "killer")] + current-nemesis (get custom-nemesis/custom-nemesises (:nemesis opts))] (merge tests/noop-test opts - {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts))) + {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts)) (name (:nemesis opts))) :os ubuntu/os :db (db "rbtorrent:a122093aee0bdcb70ca42d5e5fb4ba5544372f5f") :pure-generators true :client (:client workload) - :nemesis (custom-nemesis/logs-corruption-nemesis) + :nemesis (:nemesis current-nemesis) :checker (checker/compose {:perf (checker/perf) :workload (:checker workload)}) :generator (gen/phases (->> (:generator workload) (gen/stagger (/ (:rate opts))) - (gen/nemesis - (cycle [(gen/sleep 5) - {:type :info, :f :corrupt}])) + (gen/nemesis (:generator current-nemesis)) (gen/time-limit (:time-limit opts))) (gen/log "Healing cluster") (gen/nemesis (gen/once {:type :info, :f :stop})) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index bf2348f1860..93026a7d64c 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -1,12 +1,12 @@ (ns jepsen.nukeeper.nemesis (:require - [clojure.tools.logging :refer :all] - [jepsen - [nemesis :as nemesis] - [control :as c] - [generator :as gen]] - [jepsen.nukeeper.constants :refer :all] - [jepsen.nukeeper.utils :refer :all])) + [clojure.tools.logging :refer :all] + [jepsen + [nemesis :as nemesis] + [control :as c] + [generator :as gen]] + [jepsen.nukeeper.constants :refer :all] + [jepsen.nukeeper.utils :refer :all])) (defn random-single-node-killer-nemesis [] @@ -21,9 +21,8 @@ (defn select-last-file [path] - (info "EXECUTE ON PATH" path) (last (clojure.string/split - (c/exec :find path :-type :f :-printf "%T+ %p\n" :| :sort :| :awk "{print $2}") + (c/exec :find path :-type :f :-printf "%T+ %p\n" :| :grep :-v :tmp_ :| :sort :| :awk "{print $2}") #"\n"))) (defn random-file-pos @@ -33,8 +32,11 @@ (defn corrupt-file [fname] - (info "Corrupting" fname) - (c/exec :dd "if=/dev/zero" (str "of=" fname) "bs=1" "count=1" (str "seek=" (random-file-pos fname)) "conv=notrunc")) + (if (not (empty? fname)) + (do + (info "Corrupting" fname) + (c/exec :dd "if=/dev/zero" (str "of=" fname) "bs=1" "count=1" (str "seek=" (random-file-pos fname)) "conv=notrunc")) + (info "Nothing to corrupt"))) (defn corruptor-nemesis [path corruption-op] @@ -44,41 +46,66 @@ (invoke! [this test op] (cond (= (:f op) :corrupt) - (let [nodes (list (rand-nth (:nodes test)))] - (info "Corruption on node" nodes) - (c/on-nodes test nodes - (fn [test node] - (let [file-to-corrupt (select-last-file path)] - (info "Corrupting file" file-to-corrupt) - (c/su - (corruption-op (select-last-file path)) - (kill-clickhouse! node test) - (start-clickhouse! node test))))) - (assoc op :type :info, :value :corrupted)) - :else (assoc op :type :info, :value :not-started))) + (let [nodes (list (rand-nth (:nodes test)))] + (info "Corruption on node" nodes) + (c/on-nodes test nodes + (fn [test node] + (c/su + (kill-clickhouse! node test) + (corruption-op path) + (start-clickhouse! node test)))) + (assoc op :type :info, :value :corrupted)) + :else (do (c/on-nodes test (:nodes test) + (fn [test node] + (c/su + (start-clickhouse! 
node test)))) + (assoc op :type :info, :value :done)))) (teardown! [this test]))) (defn logs-corruption-nemesis [] - (corruptor-nemesis logsdir corrupt-file)) + (corruptor-nemesis logsdir #(corrupt-file (select-last-file %1)))) (defn snapshots-corruption-nemesis [] - (corruptor-nemesis snapshotsdir corrupt-file)) + (corruptor-nemesis snapshotsdir #(corrupt-file (select-last-file %1)))) + +(defn logs-and-snapshots-corruption-nemesis + [] + (corruptor-nemesis coordinationdir (fn [path] + (do + (corrupt-file (select-last-file (str path "/snapshots"))) + (corrupt-file (select-last-file (str path "/logs"))))))) +(defn drop-all-corruption-nemesis + [] + (corruptor-nemesis coordinationdir (fn [path] + (c/exec :rm :-fr path)))) + +(defn start-stop-generator + [] + (->> + (cycle [(gen/sleep 5) + {:type :info, :f :start} + (gen/sleep 5) + {:type :info, :f :stop}]))) + +(defn corruption-generator + [] + (->> + (cycle [(gen/sleep 5) + {:type :info, :f :corrupt}]))) (def custom-nemesises - {"killer" {:nemesis (random-single-node-killer-nemesis) - :generator - (gen/nemesis - (cycle [(gen/sleep 5) - {:type :info, :f :start} - (gen/sleep 5) - {:type :info, :f :stop}]))} + {"single-node-killer" {:nemesis (random-single-node-killer-nemesis) + :generator (start-stop-generator)} "simple-partitioner" {:nemesis (nemesis/partition-random-halves) - :generator - (gen/nemesis - (cycle [(gen/sleep 5) - {:type :info, :f :start} - (gen/sleep 5) - {:type :info, :f :stop}]))}}) + :generator (start-stop-generator)} + "logs-corruptor" {:nemesis (logs-corruption-nemesis) + :generator (corruption-generator)} + "snapshots-corruptor" {:nemesis (snapshots-corruption-nemesis) + :generator (corruption-generator)} + "logs-and-snapshots-corruptor" {:nemesis (logs-and-snapshots-corruption-nemesis) + :generator (corruption-generator)} + "drop-data-corruptor" {:nemesis (drop-all-corruption-nemesis) + :generator (corruption-generator)}}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index 9912b34cd46..e9658e9d6d5 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -81,7 +81,8 @@ [node test] (info "Killing server on node" node) (c/su - (cu/stop-daemon! (str binary-path "/clickhouse") pidfile))) + (cu/stop-daemon! (str binary-path "/clickhouse") pidfile) + (c/exec :rm :-fr (str dir "/status")))) (defn start-clickhouse! 
[node test] From ecd081144c6a1db08b4952b0be19548b54d0f873 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Mar 2021 14:54:26 +0300 Subject: [PATCH 029/260] Add missing hammer-time nemesis --- tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index 93026a7d64c..d1dc0d55e5f 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -101,6 +101,8 @@ :generator (start-stop-generator)} "simple-partitioner" {:nemesis (nemesis/partition-random-halves) :generator (start-stop-generator)} + "hammer-time" {:nemesis (hammer-time-nemesis) + :generator (start-stop-generator)} "logs-corruptor" {:nemesis (logs-corruption-nemesis) :generator (corruption-generator)} "snapshots-corruptor" {:nemesis (snapshots-corruption-nemesis) From 7c4fdd79cfa0461156c6dae6015d64ff5e8d66ca Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Mar 2021 15:58:01 +0300 Subject: [PATCH 030/260] Add unique-ids workload --- .../src/jepsen/nukeeper/main.clj | 4 +- .../src/jepsen/nukeeper/unique.clj | 45 +++++++++++++++++++ .../src/jepsen/nukeeper/utils.clj | 4 ++ 3 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index a5ceae5d5ae..8b7c1a6caac 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -4,6 +4,7 @@ [jepsen.nukeeper.set :as set] [jepsen.nukeeper.nemesis :as custom-nemesis] [jepsen.nukeeper.register :as register] + [jepsen.nukeeper.unique :as unique] [jepsen.nukeeper.constants :refer :all] [clojure.string :as str] [jepsen @@ -83,7 +84,8 @@ (def workloads "A map of workload names to functions that construct workloads, given opts." {"set" set/workload - "register" register/workload}) + "register" register/workload + "unique-ids" unique/workload}) (def cli-opts "Additional command line options." diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj new file mode 100644 index 00000000000..fc8370005aa --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj @@ -0,0 +1,45 @@ +(ns jepsen.nukeeper.unique + (:require + [clojure.tools.logging :refer :all] + [jepsen + [checker :as checker] + [client :as client] + [generator :as gen]] + [jepsen.nukeeper.utils :refer :all] + [zookeeper :as zk]) + (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) + +(defn parse-and-get-counter + [path] + (Integer/parseInt (apply str (take-last 10 (seq (str path)))))) + +(defrecord UniqueClient [conn nodename] + client/Client + (open! [this test node] + (assoc + (assoc this + :conn (zk-connect node 9181 30000)) + :nodename node)) + + (setup! [this test]) + + (invoke! [this test op] + (case + :generate + (try + (let [result-path (zk-create-sequential conn "/seq-" "")] + (assoc op :type :ok :value (parse-and-get-counter result-path))) + (catch Exception _ (assoc op :type :info, :error :connect-error))))) + + (teardown! [_ test]) + + (close! [_ test])) + +(defn workload + "A generator, client, and checker for a set test." + [opts] + {:client (UniqueClient. 
nil nil) + :checker (checker/unique-ids) + :generator (->> + (range) + (map (fn [_] {:type :invoke, :f :generate})))}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index e9658e9d6d5..10851a2adc7 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -63,6 +63,10 @@ [conn path data] (zk/create conn path :data (data/to-bytes (str data)) :persistent? true)) +(defn zk-create-sequential + [conn path-prefix data] + (zk/create conn path-prefix :data (data/to-bytes (str data)) :persistent? true :sequential? true)) + (defn clickhouse-alive? [node test] (info "Checking server alive on" node) From 2ee58ed82fc6fe98d67b5f5cf9469c17e60602d6 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Mar 2021 16:00:08 +0300 Subject: [PATCH 031/260] Fix style --- tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj index fc8370005aa..9c753dfe0ab 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj @@ -25,11 +25,11 @@ (invoke! [this test op] (case - :generate - (try - (let [result-path (zk-create-sequential conn "/seq-" "")] - (assoc op :type :ok :value (parse-and-get-counter result-path))) - (catch Exception _ (assoc op :type :info, :error :connect-error))))) + :generate + (try + (let [result-path (zk-create-sequential conn "/seq-" "")] + (assoc op :type :ok :value (parse-and-get-counter result-path))) + (catch Exception _ (assoc op :type :info, :error :connect-error))))) (teardown! 
[_ test]) From e116e8d5e84c0a3fef9e2c5d55157d8284c5d95a Mon Sep 17 00:00:00 2001 From: fuqi Date: Wed, 17 Mar 2021 22:19:10 +0800 Subject: [PATCH 032/260] fix bug --- src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index 792884689d5..256ea69c4e9 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -36,7 +36,7 @@ MergeTreeWhereOptimizer::MergeTreeWhereOptimizer( Poco::Logger * log_) : table_columns{ext::map( metadata_snapshot->getColumns().getAllPhysical(), [](const NameAndTypePair & col) { return col.name; })} - , queried_columns{queried_columns_}, + , queried_columns{queried_columns_} , primary_key_columns{metadata_snapshot->getPrimaryKey().column_names} , block_with_constants{KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)} , log{log_} @@ -120,7 +120,7 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, if (const auto * func_and = node->as(); func_and && func_and->name == "and") { for (const auto & elem : func_and->arguments->children) - analyzeImpl(res, elem); + analyzeImpl(res, elem, isFinal); } else { From 71b36a702961d54a7f155867dc2683506de06c07 Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Wed, 17 Mar 2021 17:20:55 +0300 Subject: [PATCH 033/260] Better error handling and logging in WriteBufferFromS3 --- src/IO/WriteBufferFromS3.cpp | 59 +++++++++++++++++++++++++----------- src/IO/WriteBufferFromS3.h | 2 ++ 2 files changed, 44 insertions(+), 17 deletions(-) diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 5edf01a940e..e032935b2fc 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -50,9 +50,9 @@ WriteBufferFromS3::WriteBufferFromS3( , client_ptr(std::move(client_ptr_)) , minimum_upload_part_size(minimum_upload_part_size_) , max_single_part_upload_size(max_single_part_upload_size_) - , temporary_buffer(Aws::MakeShared("temporary buffer")) - , last_part_size(0) -{ } +{ + allocateBuffer(); +} void WriteBufferFromS3::nextImpl() { @@ -72,11 +72,17 @@ void WriteBufferFromS3::nextImpl() if (!multipart_upload_id.empty() && last_part_size > minimum_upload_part_size) { writePart(); - last_part_size = 0; - temporary_buffer = Aws::MakeShared("temporary buffer"); + allocateBuffer(); } } +void WriteBufferFromS3::allocateBuffer() +{ + temporary_buffer = Aws::MakeShared("temporary buffer"); + temporary_buffer->exceptions(std::ios::badbit); + last_part_size = 0; +} + void WriteBufferFromS3::finalize() { /// FIXME move final flush into the caller @@ -130,7 +136,7 @@ void WriteBufferFromS3::createMultipartUpload() if (outcome.IsSuccess()) { multipart_upload_id = outcome.GetResult().GetUploadId(); - LOG_DEBUG(log, "Multipart upload has created. Upload id: {}", multipart_upload_id); + LOG_DEBUG(log, "Multipart upload has created. Bucket: {}, Key: {}, Upload id: {}", bucket, key, multipart_upload_id); } else throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR); @@ -139,8 +145,18 @@ void WriteBufferFromS3::createMultipartUpload() void WriteBufferFromS3::writePart() { - if (temporary_buffer->tellp() <= 0) + auto size = temporary_buffer->tellp(); + + LOG_DEBUG(log, "Writing part. 
Bucket: {}, Key: {}, Upload_id: {}, Size: {}", bucket, key, multipart_upload_id, size); + + if (size < 0) + throw Exception("Failed to write part. Buffer in invalid state.", ErrorCodes::S3_ERROR); + + if (size == 0) + { + LOG_DEBUG(log, "Skipping writing part. Buffer is empty."); return; + } if (part_tags.size() == S3_WARN_MAX_PARTS) { @@ -154,18 +170,16 @@ void WriteBufferFromS3::writePart() req.SetKey(key); req.SetPartNumber(part_tags.size() + 1); req.SetUploadId(multipart_upload_id); - req.SetContentLength(temporary_buffer->tellp()); + req.SetContentLength(size); req.SetBody(temporary_buffer); auto outcome = client_ptr->UploadPart(req); - LOG_TRACE(log, "Writing part. Bucket: {}, Key: {}, Upload_id: {}, Data size: {}", bucket, key, multipart_upload_id, temporary_buffer->tellp()); - if (outcome.IsSuccess()) { auto etag = outcome.GetResult().GetETag(); part_tags.push_back(etag); - LOG_DEBUG(log, "Writing part finished. Total parts: {}, Upload_id: {}, Etag: {}", part_tags.size(), multipart_upload_id, etag); + LOG_DEBUG(log, "Writing part finished. Bucket: {}, Key: {}, Upload_id: {}, Etag: {}, Parts: {}", bucket, key, multipart_upload_id, etag, part_tags.size()); } else throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR); @@ -173,7 +187,10 @@ void WriteBufferFromS3::writePart() void WriteBufferFromS3::completeMultipartUpload() { - LOG_DEBUG(log, "Completing multipart upload. Bucket: {}, Key: {}, Upload_id: {}", bucket, key, multipart_upload_id); + LOG_DEBUG(log, "Completing multipart upload. Bucket: {}, Key: {}, Upload_id: {}, Parts: {}", bucket, key, multipart_upload_id, part_tags.size()); + + if (part_tags.empty()) + throw Exception("Failed to complete multipart upload. No parts have uploaded", ErrorCodes::S3_ERROR); Aws::S3::Model::CompleteMultipartUploadRequest req; req.SetBucket(bucket); @@ -192,22 +209,30 @@ void WriteBufferFromS3::completeMultipartUpload() auto outcome = client_ptr->CompleteMultipartUpload(req); if (outcome.IsSuccess()) - LOG_DEBUG(log, "Multipart upload has completed. Upload_id: {}", multipart_upload_id); + LOG_DEBUG(log, "Multipart upload has completed. Bucket: {}, Key: {}, Upload_id: {}, Parts: {}", bucket, key, multipart_upload_id, part_tags.size()); else throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR); } void WriteBufferFromS3::makeSinglepartUpload() { - if (temporary_buffer->tellp() <= 0) - return; + auto size = temporary_buffer->tellp(); - LOG_DEBUG(log, "Making single part upload. Bucket: {}, Key: {}", bucket, key); + LOG_DEBUG(log, "Making single part upload. Bucket: {}, Key: {}, Size: {}", bucket, key, size); + + if (size < 0) + throw Exception("Failed to make single part upload. Buffer in invalid state", ErrorCodes::S3_ERROR); + + if (size == 0) + { + LOG_DEBUG(log, "Skipping single part upload. 
Buffer is empty."); + return; + } Aws::S3::Model::PutObjectRequest req; req.SetBucket(bucket); req.SetKey(key); - req.SetContentLength(temporary_buffer->tellp()); + req.SetContentLength(size); req.SetBody(temporary_buffer); if (object_metadata.has_value()) req.SetMetadata(object_metadata.value()); diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index 9e4b056603a..7a77949072b 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -69,6 +69,8 @@ public: private: bool finalized = false; + void allocateBuffer(); + void createMultipartUpload(); void writePart(); void completeMultipartUpload(); From e6711e936e77b9089fe07123dc102e77bb0dcb9c Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Wed, 17 Mar 2021 17:33:13 +0300 Subject: [PATCH 034/260] Cleanup destination directory during DiskCacheWrapper::moveFile() --- src/Disks/DiskCacheWrapper.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp index 47a0e0655d6..085749f22f8 100644 --- a/src/Disks/DiskCacheWrapper.cpp +++ b/src/Disks/DiskCacheWrapper.cpp @@ -217,6 +217,9 @@ void DiskCacheWrapper::moveFile(const String & from_path, const String & to_path { if (cache_disk->exists(from_path)) { + if (cache_disk->isDirectory(to_path) && !cache_disk->isDirectoryEmpty(to_path)) + cache_disk->clearDirectory(to_path); + auto dir_path = directoryPath(to_path); if (!cache_disk->exists(dir_path)) cache_disk->createDirectories(dir_path); From b8edc12812e0718e065110edc0ac621069f38c4f Mon Sep 17 00:00:00 2001 From: fuqi Date: Wed, 17 Mar 2021 23:56:55 +0800 Subject: [PATCH 035/260] fix code style --- .../MergeTree/MergeTreeWhereOptimizer.cpp | 20 +++++++++---------- .../MergeTree/MergeTreeWhereOptimizer.h | 8 ++++---- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index 256ea69c4e9..2effcbb6c75 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -115,12 +115,12 @@ static bool isConditionGood(const ASTPtr & condition) } -void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, bool isFinal) const +void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, bool is_final) const { if (const auto * func_and = node->as(); func_and && func_and->name == "and") { for (const auto & elem : func_and->arguments->children) - analyzeImpl(res, elem, isFinal); + analyzeImpl(res, elem, is_final); } else { @@ -134,7 +134,7 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, cond.viable = /// Condition depend on some column. Constant expressions are not moved. !cond.identifiers.empty() - && !cannotBeMoved(node, isFinal) + && !cannotBeMoved(node, is_final) /// Do not take into consideration the conditions consisting only of the first primary key column && !hasPrimaryKeyAtoms(node) /// Only table columns are considered. Not array joined columns. NOTE We're assuming that aliases was expanded. @@ -150,10 +150,10 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, } /// Transform conjunctions chain in WHERE expression to Conditions list. 
-MergeTreeWhereOptimizer::Conditions MergeTreeWhereOptimizer::analyze(const ASTPtr & expression, bool isFinal) const +MergeTreeWhereOptimizer::Conditions MergeTreeWhereOptimizer::analyze(const ASTPtr & expression, bool is_final) const { Conditions res; - analyzeImpl(res, expression, isFinal); + analyzeImpl(res, expression, is_final); return res; } @@ -300,8 +300,8 @@ bool MergeTreeWhereOptimizer::isPrimaryKeyAtom(const ASTPtr & ast) const return false; } -bool MergeTreeWhereOptimizer::isPrimaryKey(const String & columnName) const { - return std::find(primary_key_columns.begin(), primary_key_columns.end(), columnName) != primary_key_columns.end(); +bool MergeTreeWhereOptimizer::isPrimaryKey(const String & column_name) const { + return std::find(primary_key_columns.begin(), primary_key_columns.end(), column_name) != primary_key_columns.end(); } bool MergeTreeWhereOptimizer::isConstant(const ASTPtr & expr) const @@ -323,7 +323,7 @@ bool MergeTreeWhereOptimizer::isSubsetOfTableColumns(const NameSet & identifiers } -bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr, bool isFinal) const +bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr, bool is_final) const { if (const auto * function_ptr = ptr->as()) { @@ -341,12 +341,12 @@ bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr, bool isFinal) co /// disallow moving result of ARRAY JOIN to PREWHERE if (array_joined_names.count(*opt_name) || array_joined_names.count(Nested::extractTableName(*opt_name)) || - (isFinal && !isPrimaryKey(*opt_name))) + (is_final && !isPrimaryKey(*opt_name))) return true; } for (const auto & child : ptr->children) - if (cannotBeMoved(child, isFinal)) + if (cannotBeMoved(child, is_final)) return true; return false; diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h index 45eb077ed96..85d1df583fa 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h @@ -67,10 +67,10 @@ private: using Conditions = std::list; - void analyzeImpl(Conditions & res, const ASTPtr & node, bool isFinal) const; + void analyzeImpl(Conditions & res, const ASTPtr & node, bool is_final) const; /// Transform conjunctions chain in WHERE expression to Conditions list. - Conditions analyze(const ASTPtr & expression, bool isFinal) const; + Conditions analyze(const ASTPtr & expression, bool is_final) const; /// Transform Conditions list to WHERE or PREWHERE expression. static ASTPtr reconstruct(const Conditions & conditions); @@ -85,7 +85,7 @@ private: bool isPrimaryKeyAtom(const ASTPtr & ast) const; - bool isPrimaryKey(const String & columnName) const; + bool isPrimaryKey(const String & column_name) const; bool isConstant(const ASTPtr & expr) const; @@ -97,7 +97,7 @@ private: * * Also, disallow moving expressions with GLOBAL [NOT] IN. 
*/ - bool cannotBeMoved(const ASTPtr & ptr, bool isFinal) const; + bool cannotBeMoved(const ASTPtr & ptr, bool is_final) const; void determineArrayJoinedNames(ASTSelectQuery & select); From cfa92f0045436f60265086d4fb8e5434b75b92c4 Mon Sep 17 00:00:00 2001 From: fuqi Date: Thu, 18 Mar 2021 00:25:43 +0800 Subject: [PATCH 036/260] fix code style --- src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index 2effcbb6c75..98e40bf394d 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -300,10 +300,13 @@ bool MergeTreeWhereOptimizer::isPrimaryKeyAtom(const ASTPtr & ast) const return false; } -bool MergeTreeWhereOptimizer::isPrimaryKey(const String & column_name) const { + +bool MergeTreeWhereOptimizer::isPrimaryKey(const String & column_name) const +{ return std::find(primary_key_columns.begin(), primary_key_columns.end(), column_name) != primary_key_columns.end(); } + bool MergeTreeWhereOptimizer::isConstant(const ASTPtr & expr) const { const auto column_name = expr->getColumnName(); From bc22f4f6ebd2f18b5a0bc99b756aea9a3cb6e0b7 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 17 Mar 2021 20:14:36 +0300 Subject: [PATCH 037/260] Updated implementation --- src/Common/HashTable/FixedDeadlineHashMap.h | 253 --------------- src/Dictionaries/CacheDictionary.cpp | 12 +- src/Dictionaries/CacheDictionaryStorage.h | 291 +++++++++++------- src/Dictionaries/ICacheDictionaryStorage.h | 8 +- src/Dictionaries/SSDCacheDictionaryStorage.h | 2 + .../01681_cache_dictionary_simple_key.sql | 4 +- .../01682_cache_dictionary_complex_key.sql | 4 +- .../01684_ssd_cache_dictionary_simple_key.sql | 4 +- ...01685_ssd_cache_dictionary_complex_key.sql | 4 +- 9 files changed, 198 insertions(+), 384 deletions(-) delete mode 100644 src/Common/HashTable/FixedDeadlineHashMap.h diff --git a/src/Common/HashTable/FixedDeadlineHashMap.h b/src/Common/HashTable/FixedDeadlineHashMap.h deleted file mode 100644 index 0f7819f4020..00000000000 --- a/src/Common/HashTable/FixedDeadlineHashMap.h +++ /dev/null @@ -1,253 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -using TimePoint = std::chrono::system_clock::time_point; - -template -struct DeadlineCell : - public std::conditional_t, - HashMapCell> -{ - using Key = TKey; - - using Base = std::conditional_t, - HashMapCell>; - - using Mapped = typename Base::Mapped; - using State = typename Base::State; - - using mapped_type = Mapped; - using key_type = Key; - - using Base::Base; - - inline TimePoint getDeadline() const { return deadline; } - - void setDeadline(TimePoint & deadline_value) { deadline = deadline_value; } - -private: - TimePoint deadline; -}; - -template -class FixedDeadlineHashMapImpl : - private HashMapTable< - TKey, - DeadlineCell, - Hash, - HashTableGrower<>, - HashTableAllocator> -{ - /// TODO: Make custom grower - using Base = HashMapTable< - TKey, - DeadlineCell, - Hash, - HashTableGrower<>, - HashTableAllocator>; - - static size_t calculateMaxSize(size_t max_size, size_t max_collision_resolution_chain) - { - return roundUpToPowerOfTwoOrZero(std::max(max_size, max_collision_resolution_chain)); - } -public: - using Cell = DeadlineCell; - using Key = TKey; - using Value = TValue; - using Mapped = typename Cell::Mapped; - - explicit FixedDeadlineHashMapImpl(size_t max_size_, size_t 
max_collision_resolution_chain_, Disposer disposer_ = Disposer()) - : Base(calculateMaxSize(max_size_, max_collision_resolution_chain_)) - , max_collision_resolution_chain(max_collision_resolution_chain_) - , max_size(max_size_) - , disposer(std::move(disposer_)) - { - assert(max_size > 0); - assert(max_collision_resolution_chain > 0); - } - - ~FixedDeadlineHashMapImpl() - { - clear(); - } - - Cell * get(const Key & key) - { - if (Cell::isZero(key, *this)) - return this->hasZero() ? this->zeroValue() : nullptr; - - /// TODO: Optimize - - size_t hash_value = Base::hash(key); - size_t place_value = Base::grower.place(hash_value); - size_t resolution_chain = max_collision_resolution_chain; - - while (resolution_chain != 0) - { - auto & cell = Base::buf[place_value]; - - if (cell.isZero(*this)) - return nullptr; - - if (cell.keyEquals(key, hash_value, *this)) - return &cell; - - place_value = Base::grower.next(place_value); - --resolution_chain; - } - - return nullptr; - } - - const Cell * get(const Key & key) const - { - return const_cast *>(this)->get(key); - } - - std::pair ALWAYS_INLINE insert(const Key & key, const Value & value) - { - return emplace(key, value); - } - - std::pair ALWAYS_INLINE insert(const Key & key, Value && value) - { - return emplace(key, std::move(value)); - } - - template - std::pair ALWAYS_INLINE emplace(const Key & key, Args && ... args) - { - size_t hash_value = Base::hash(key); - std::pair result; - - if (!emplaceIfZero(key, hash_value, result)) - result = emplaceNonZeroImpl(key, hash_value); - - bool was_inserted = result.second; - - if (was_inserted) - new (&result.first->getMapped()) Value(std::forward(args)...); - - return result; - } - - template - void reinsert(Cell * place_to_use, const Key & key, Args && ... args) - { - size_t hash_value = Base::hash(key); - - new (place_to_use) Cell(key, *this); - new (&place_to_use->getMapped()) Value(std::forward(args)...); - place_to_use->setHash(hash_value); - } - - using Base::size; - - using iterator = typename Base::iterator; - using const_iterator = typename Base::const_iterator; - - using Base::begin; - using Base::end; - - size_t getMaxSize() const { return max_size; } - - size_t getSizeInBytes() const { return Base::getBufferSizeInBytes(); } - - void clear() - { - for (auto & cell : *this) - disposer(cell.getKey(), cell.getMapped()); - } - -private: - size_t max_collision_resolution_chain; - size_t max_size; - Disposer disposer; - - bool emplaceIfZero(const Key & key, size_t hash_value, std::pair & result) - { - if (!Cell::isZero(key, *this)) - return false; - - if (this->hasZero()) - { - result = {this->zeroValue(), false}; - return true; - } - - ++Base::m_size; - - this->setHasZero(); - this->zeroValue()->setHash(hash_value); - result = {this->zeroValue(), true}; - - return true; - } - - std::pair emplaceNonZeroImpl(const Key & key, size_t hash_value) - { - TimePoint oldest_time = TimePoint::max(); - size_t place_value = Base::grower.place(hash_value); - size_t resolution_chain = max_collision_resolution_chain; - - bool use_old_value_place = false; - Cell * place_to_insert = nullptr; - - while (resolution_chain != 0) - { - auto & cell = Base::buf[place_value]; - - if (cell.isZero(*this)) - { - use_old_value_place = false; - place_to_insert = &cell; - break; - } - - if (cell.keyEquals(key, hash_value, *this)) - return std::make_pair(&cell, false); - - if (cell.getDeadline() < oldest_time) - { - use_old_value_place = true; - place_to_insert = &cell; - } - - place_value = Base::grower.next(place_value); - 
--resolution_chain; - } - - if (!place_to_insert) - place_to_insert = &Base::buf[place_value]; - - if (use_old_value_place) - return std::make_pair(place_to_insert, false); - else - { - ++Base::m_size; - - new (place_to_insert) Cell(key, *this); - place_to_insert->setHash(hash_value); - - return std::make_pair(place_to_insert, true); - } - } -}; - -template -struct DefaultFixedHashMapCellDisposer -{ - void operator()(const Key &, const Mapped &) const {} -}; - -template , typename Hash = DefaultHash> -using FixedDeadlineHashMap = FixedDeadlineHashMapImpl; - -template , typename Hash = DefaultHash> -using FixedDeadlineHashMapWithSavedHash = FixedDeadlineHashMapImpl; diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index fe777355ca1..bef391c4222 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -332,10 +332,16 @@ Columns CacheDictionary::getColumnsImpl( FetchResult result_of_fetch_from_storage; - { - /// Write lock on storage - const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; + bool protect_get_with_write_lock = cache_storage_ptr->canPerformFetchByMultipleThreadsWithoutLock(); + if (protect_get_with_write_lock) + { + const ProfilingScopedReadRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; + result_of_fetch_from_storage = cache_storage_ptr->fetchColumnsForKeys(keys, request); + } + else + { + const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; result_of_fetch_from_storage = cache_storage_ptr->fetchColumnsForKeys(keys, request); } diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index a98f92e5da9..6b1200dd474 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -4,14 +4,11 @@ #include #include -#include -#include #include #include #include #include -#include #include #include #include @@ -38,6 +35,9 @@ struct CacheDictionaryStorageConfiguration template class CacheDictionaryStorage final : public ICacheDictionaryStorage { + + static constexpr size_t max_collision_length = 10; + public: using KeyType = std::conditional_t; static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionaryStorage"); @@ -47,13 +47,19 @@ public: CacheDictionaryStorageConfiguration & configuration_) : configuration(configuration_) , rnd_engine(randomSeed()) - , cache(configuration.max_size_in_cells, 10, { *this }) { + size_t cells_size = roundUpToPowerOfTwoOrZero(std::max(configuration.max_size_in_cells, max_collision_length)); + + cells.resize_fill(cells_size); + size_overlap_mask = cells_size - 1; + setup(dictionary_structure); } bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return true; } + bool canPerformFetchByMultipleThreadsWithoutLock() const override { return true; } + String getName() const override { if (dictionary_key_type == DictionaryKeyType::simple) @@ -134,9 +140,9 @@ public: throw Exception("Method getCachedComplexKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); } - size_t getSize() const override { return cache.size(); } + size_t getSize() const override { return size; } - size_t getMaxSize() const override { return cache.getMaxSize(); } + size_t getMaxSize() const override { return configuration.max_size_in_cells; } size_t getBytesAllocated() const override { @@ -151,7 +157,7 @@ public: }); } - return arena.size() + 
cache.getSizeInBytes() + attributes_size_in_bytes; + return arena.size() + sizeof(Cell) * configuration.max_size_in_cells + attributes_size_in_bytes; } private: @@ -175,9 +181,9 @@ private: KeysStorageFetchResult result; result.fetched_columns = fetch_request.makeAttributesResultColumns(); - result.key_index_to_state.resize_fill(keys.size(), {KeyState::not_found}); + result.key_index_to_state.resize_fill(keys.size()); - const auto now = std::chrono::system_clock::now(); + const time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); size_t fetched_columns_index = 0; size_t keys_size = keys.size(); @@ -190,54 +196,39 @@ private: for (size_t key_index = 0; key_index < keys_size; ++key_index) { auto key = keys[key_index]; - auto * it = cache.get(key); + auto [key_state, cell_index] = getKeyStateAndCellIndex(key, now); - if (!it) + if (unlikely(key_state == KeyState::not_found)) { result.key_index_to_state[key_index] = {KeyState::not_found}; ++result.not_found_keys_size; continue; } - auto deadline = it->getDeadline(); - const auto & cell = it->getMapped(); + auto & cell = cells[cell_index]; - if (now > deadline + max_lifetime_seconds) - { - result.key_index_to_state[key_index] = {KeyState::not_found}; - ++result.not_found_keys_size; - continue; - } - - bool cell_is_expired = false; - KeyState::State key_state = KeyState::found; - - if (now > deadline) - { - cell_is_expired = true; - key_state = KeyState::expired; - } + result.expired_keys_size += static_cast(key_state == KeyState::expired); result.key_index_to_state[key_index] = {key_state, fetched_columns_index}; - ++fetched_columns_index; + fetched_keys[fetched_columns_index] = FetchedKey(cell.element_index, cell.is_default); - result.expired_keys_size += cell_is_expired; - result.found_keys_size += !cell_is_expired; + ++fetched_columns_index; result.key_index_to_state[key_index].setDefaultValue(cell.is_default); result.default_keys_size += cell.is_default; - - fetched_keys[key_index] = FetchedKey{cell.element_index, cell.is_default}; } + result.found_keys_size = keys_size - (result.expired_keys_size + result.not_found_keys_size); + for (size_t attribute_index = 0; attribute_index < fetch_request.attributesSize(); ++attribute_index) { if (!fetch_request.shouldFillResultColumnWithIndex(attribute_index)) continue; - size_t fetched_keys_size = fetched_keys.size(); auto & attribute = attributes[attribute_index]; const auto & default_value_provider = fetch_request.defaultValueProviderAtIndex(attribute_index); + + size_t fetched_keys_size = fetched_keys.size(); auto & fetched_column = *result.fetched_columns[attribute_index]; fetched_column.reserve(fetched_keys_size); @@ -245,7 +236,7 @@ private: { auto & container = std::get>(attribute.attribute_container); - for (size_t fetched_key_index = 0; fetched_key_index < fetched_keys.size(); ++fetched_key_index) + for (size_t fetched_key_index = 0; fetched_key_index < fetched_columns_index; ++fetched_key_index) { auto fetched_key = fetched_keys[fetched_key_index]; @@ -272,7 +263,7 @@ private: if constexpr (std::is_same_v) { - for (size_t fetched_key_index = 0; fetched_key_index < fetched_keys.size(); ++fetched_key_index) + for (size_t fetched_key_index = 0; fetched_key_index < fetched_columns_index; ++fetched_key_index) { auto fetched_key = fetched_keys[fetched_key_index]; @@ -287,7 +278,7 @@ private: } else { - for (size_t fetched_key_index = 0; fetched_key_index < fetched_keys.size(); ++fetched_key_index) + for (size_t fetched_key_index = 0; fetched_key_index < 
fetched_columns_index; ++fetched_key_index) { auto fetched_key = fetched_keys[fetched_key_index]; auto & data = column_typed.getData(); @@ -314,23 +305,27 @@ private: { const auto now = std::chrono::system_clock::now(); - size_t keys_size = keys.size(); - - size_t columns_size = columns.size(); Field column_value; - for (size_t key_index = 0; key_index < keys_size; ++key_index) + for (size_t key_index = 0; key_index < keys.size(); ++key_index) { auto key = keys[key_index]; - auto [it, was_inserted] = cache.insert(key, {}); + size_t cell_index = getCellIndexForInsert(key); + auto & cell = cells[cell_index]; + + cell.is_default = false; + + bool was_inserted = cell.deadline == 0; if (was_inserted) { - auto & cell = it->getMapped(); - cell.is_default = false; + if constexpr (std::is_same_v) + cell.key = copyStringInArena(key); + else + cell.key = key; - for (size_t attribute_index = 0; attribute_index < columns_size; ++attribute_index) + for (size_t attribute_index = 0; attribute_index < columns.size(); ++attribute_index) { auto & column = columns[attribute_index]; @@ -347,38 +342,36 @@ private: container.back() = column_value; else if constexpr (std::is_same_v) { - const String & value = column_value.get(); - StringRef inserted_value = copyStringInArena(StringRef { value.data(), value.size() }); + const String & string_value = column_value.get(); + StringRef string_value_ref = StringRef {string_value.data(), string_value.size()}; + StringRef inserted_value = copyStringInArena(string_value_ref); container.back() = inserted_value; } else - container.back() = column_value.get(); + container.back() = column_value.get>(); }); } + + ++size; } else { - auto & cell_key = it->getKey(); - - Cell cell; - - size_t existing_index = it->getMapped().element_index; - - cell.element_index = existing_index; - cell.is_default = false; - - if (cell_key != key) + if (cell.key != key) { - /// In case of complex key we keep it in arena if constexpr (std::is_same_v) - arena.free(const_cast(key.data), key.size); + { + char * data = const_cast(cell.key.data); + arena.free(data, cell.key.size); + cell.key = copyStringInArena(key); + } + else + cell.key = key; } - cache.reinsert(it, key, cell); + /// Put values into existing index + size_t index_to_use = cell.element_index; - /// Put values into index - - for (size_t attribute_index = 0; attribute_index < columns_size; ++attribute_index) + for (size_t attribute_index = 0; attribute_index < columns.size(); ++attribute_index) { auto & column = columns[attribute_index]; @@ -389,20 +382,26 @@ private: column->get(key_index, column_value); if constexpr (std::is_same_v) - container[existing_index] = column_value; + container[index_to_use] = column_value; else if constexpr (std::is_same_v) { - const String & value = column_value.get(); - StringRef inserted_value = copyStringInArena(StringRef { value.data(), value.size() }); - container[existing_index] = inserted_value; + const String & string_value = column_value.get(); + StringRef string_ref_value = StringRef {string_value.data(), string_value.size()}; + StringRef inserted_value = copyStringInArena(string_ref_value); + + StringRef previous_value = container[index_to_use]; + char * data = const_cast(previous_value.data); + arena.free(data, previous_value.size); + + container[index_to_use] = inserted_value; } else - container[existing_index] = column_value.get(); + container[index_to_use] = column_value.get>(); }); } } - setCellDeadline(*it, now); + setCellDeadline(cell, now); } } @@ -416,55 +415,64 @@ private: { auto key = 
keys[key_index]; - Cell value; - value.is_default = true; + size_t cell_index = getCellIndexForInsert(key); + auto & cell = cells[cell_index]; - auto [it, was_inserted] = cache.insert(key, value); + bool was_inserted = cell.deadline == 0; + + cell.is_default = true; if (was_inserted) { - auto & cell = it->getMapped(); + if constexpr (std::is_same_v) + cell.key = copyStringInArena(key); + else + cell.key = key; for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) { getAttributeContainer(attribute_index, [&](auto & container) { container.emplace_back(); - cell.element_index = container.size(); + cell.element_index = container.size() - 1; }); } + + ++size; } else { - value.element_index = it->getMapped().element_index; - - if (it->getKey() != key) + if (cell.key != key) { - /// In case of complex key we keep it in arena if constexpr (std::is_same_v) - arena.free(const_cast(key.data), key.size); + { + char * data = const_cast(cell.key.data); + arena.free(data, cell.key.size); + cell.key = copyStringInArena(key); + } + else + cell.key = key; } - - cache.reinsert(it, key, value); } - setCellDeadline(*it, now); + setCellDeadline(cell, now); } } PaddedPODArray getCachedKeysImpl() const { PaddedPODArray result; - result.reserve(cache.size()); + result.reserve(size); - for (auto & node : cache) + for (auto cell : cells) { - auto & cell = node.getMapped(); + if (cell.deadline == 0) + continue; if (cell.is_default) continue; - result.emplace_back(node.getKey()); + result.emplace_back(cell.key); } return result; @@ -545,18 +553,16 @@ private: } } + using TimePoint = std::chrono::system_clock::time_point; + struct Cell { + KeyType key; size_t element_index; bool is_default; + time_t deadline; }; - CacheDictionaryStorageConfiguration configuration; - - ArenaWithFreeLists arena; - - pcg64 rnd_engine; - struct Attribute { AttributeUnderlyingType type; @@ -581,38 +587,28 @@ private: std::vector> attribute_container; }; - class CacheStorageCellDisposer - { - public: - CacheDictionaryStorage & storage; + CacheDictionaryStorageConfiguration configuration; - template - void operator()(const Key & key, const Value &) const - { - /// In case of complex key we keep it in arena - if constexpr (std::is_same_v) - storage.arena.free(const_cast(key.data), key.size); - } - }; + pcg64 rnd_engine; - using SimpleFixedDeadlineHashMap = FixedDeadlineHashMap; - using ComplexFixedDeadlineHashMap = FixedDeadlineHashMap; + size_t size_overlap_mask = 0; - using FixedDeadlineHashMap = std::conditional_t< - dictionary_key_type == DictionaryKeyType::simple, - SimpleFixedDeadlineHashMap, - ComplexFixedDeadlineHashMap>; + size_t size = 0; - using FixedDeadlineHashMapCell = typename FixedDeadlineHashMap::Cell; + PaddedPODArray cells; - inline void setCellDeadline(FixedDeadlineHashMapCell & cell, TimePoint now) + ArenaWithFreeLists arena; + + std::vector attributes; + + inline void setCellDeadline(Cell & cell, TimePoint now) { if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0) { /// This maybe not obvious, but when we define is this cell is expired or expired permanently, we add strict_max_lifetime_seconds /// to the expiration time. And it overflows pretty well. 
auto deadline = std::chrono::time_point::max() - 2 * std::chrono::seconds(configuration.strict_max_lifetime_seconds); - cell.setDeadline(deadline); + cell.deadline = std::chrono::system_clock::to_time_t(deadline); return; } @@ -622,12 +618,73 @@ private: std::uniform_int_distribution distribution{min_sec_lifetime, max_sec_lifetime}; auto deadline = now + std::chrono::seconds(distribution(rnd_engine)); - cell.setDeadline(deadline); + cell.deadline = std::chrono::system_clock::to_time_t(deadline); } - FixedDeadlineHashMap cache; + inline size_t getCellIndex(const KeyType key) const + { + const size_t hash = DefaultHash()(key); + const size_t index = hash & size_overlap_mask; + return index; + } - std::vector attributes; + using KeyStateAndCellIndex = std::pair; + + inline KeyStateAndCellIndex getKeyStateAndCellIndex(const KeyType key, const time_t now) const + { + size_t place_value = getCellIndex(key); + const size_t place_value_end = place_value + max_collision_length; + + time_t max_lifetime_seconds = static_cast(configuration.strict_max_lifetime_seconds); + + for (; place_value < place_value_end; ++place_value) + { + const auto cell_place_value = place_value & size_overlap_mask; + const auto & cell = cells[cell_place_value]; + + if (cell.key != key) + continue; + + if (unlikely(now > cell.deadline + max_lifetime_seconds)) + return std::make_pair(KeyState::not_found, cell_place_value); + + if (unlikely(now > cell.deadline)) + return std::make_pair(KeyState::expired, cell_place_value); + + return std::make_pair(KeyState::found, cell_place_value); + } + + return std::make_pair(KeyState::not_found, place_value); + } + + inline size_t getCellIndexForInsert(const KeyType & key) const + { + size_t place_value = getCellIndex(key); + const size_t place_value_end = place_value + max_collision_length; + size_t oldest_place_value = place_value; + + time_t oldest_time = std::numeric_limits::max(); + + for (; place_value < place_value_end; ++place_value) + { + const size_t cell_place_value = place_value & size_overlap_mask; + const Cell cell = cells[cell_place_value]; + + if (cell.deadline == 0) + return cell_place_value; + + if (cell.key == key) + return place_value; + + if (cell.deadline < oldest_time) + { + oldest_time = cell.deadline; + oldest_place_value = cell_place_value; + } + } + + return oldest_place_value; + } }; } diff --git a/src/Dictionaries/ICacheDictionaryStorage.h b/src/Dictionaries/ICacheDictionaryStorage.h index a428cebdfe7..8a3202b5590 100644 --- a/src/Dictionaries/ICacheDictionaryStorage.h +++ b/src/Dictionaries/ICacheDictionaryStorage.h @@ -12,9 +12,9 @@ struct KeyState { enum State: uint8_t { - not_found = 2, - expired = 4, - found = 8, + not_found = 0, + expired = 1, + found = 2, }; KeyState(State state_, size_t fetched_column_index_) @@ -72,6 +72,8 @@ public: /// Necessary if all keys are found we can return result to client without additional aggregation virtual bool returnsFetchedColumnsInOrderOfRequestedKeys() const = 0; + virtual bool canPerformFetchByMultipleThreadsWithoutLock() const = 0; + /// Name of storage virtual String getName() const = 0; diff --git a/src/Dictionaries/SSDCacheDictionaryStorage.h b/src/Dictionaries/SSDCacheDictionaryStorage.h index e061b783ee4..32d521db103 100644 --- a/src/Dictionaries/SSDCacheDictionaryStorage.h +++ b/src/Dictionaries/SSDCacheDictionaryStorage.h @@ -815,6 +815,8 @@ public: bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return false; } + bool canPerformFetchByMultipleThreadsWithoutLock() const override { return 
false; } + String getName() const override { if (dictionary_key_type == DictionaryKeyType::simple) diff --git a/tests/queries/0_stateless/01681_cache_dictionary_simple_key.sql b/tests/queries/0_stateless/01681_cache_dictionary_simple_key.sql index ee2cde963d7..f200ead341b 100644 --- a/tests/queries/0_stateless/01681_cache_dictionary_simple_key.sql +++ b/tests/queries/0_stateless/01681_cache_dictionary_simple_key.sql @@ -40,7 +40,7 @@ SELECT dictGetOrDefault('01681_database_for_cache_dictionary.cache_dictionary_si SELECT 'dictHas'; SELECT dictHas('01681_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes', number) FROM system.numbers LIMIT 4; SELECT 'select all values as input stream'; -SELECT * FROM 01681_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes; +SELECT * FROM 01681_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes ORDER BY id; DROP DICTIONARY 01681_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes; DROP TABLE 01681_database_for_cache_dictionary.simple_key_simple_attributes_source_table; @@ -84,7 +84,7 @@ SELECT dictGetOrDefault('01681_database_for_cache_dictionary.cache_dictionary_si SELECT 'dictHas'; SELECT dictHas('01681_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes', number) FROM system.numbers LIMIT 4; SELECT 'select all values as input stream'; -SELECT * FROM 01681_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes; +SELECT * FROM 01681_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes ORDER BY id; DROP DICTIONARY 01681_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes; DROP TABLE 01681_database_for_cache_dictionary.simple_key_complex_attributes_source_table; diff --git a/tests/queries/0_stateless/01682_cache_dictionary_complex_key.sql b/tests/queries/0_stateless/01682_cache_dictionary_complex_key.sql index 65c56090c47..4cc83412457 100644 --- a/tests/queries/0_stateless/01682_cache_dictionary_complex_key.sql +++ b/tests/queries/0_stateless/01682_cache_dictionary_complex_key.sql @@ -42,7 +42,7 @@ SELECT dictGetOrDefault('01682_database_for_cache_dictionary.cache_dictionary_co SELECT 'dictHas'; SELECT dictHas('01682_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes', (number, concat('id_key_', toString(number)))) FROM system.numbers LIMIT 4; SELECT 'select all values as input stream'; -SELECT * FROM 01682_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes; +SELECT * FROM 01682_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes ORDER BY id; DROP DICTIONARY 01682_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes; DROP TABLE 01682_database_for_cache_dictionary.complex_key_simple_attributes_source_table; @@ -89,7 +89,7 @@ SELECT dictGetOrDefault('01682_database_for_cache_dictionary.cache_dictionary_co SELECT 'dictHas'; SELECT dictHas('01682_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes', (number, concat('id_key_', toString(number)))) FROM system.numbers LIMIT 4; SELECT 'select all values as input stream'; -SELECT * FROM 01682_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes; +SELECT * FROM 01682_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes ORDER BY id; DROP DICTIONARY 01682_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes; DROP TABLE 
01682_database_for_cache_dictionary.complex_key_complex_attributes_source_table; diff --git a/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql b/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql index 3b327257fc4..9dbad1289f1 100644 --- a/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql +++ b/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql @@ -40,7 +40,7 @@ SELECT dictGetOrDefault('01684_database_for_cache_dictionary.cache_dictionary_si SELECT 'dictHas'; SELECT dictHas('01684_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes', number) FROM system.numbers LIMIT 4; SELECT 'select all values as input stream'; -SELECT * FROM 01684_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes; +SELECT * FROM 01684_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes ORDER BY id; DROP DICTIONARY 01684_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes; DROP TABLE 01684_database_for_cache_dictionary.simple_key_simple_attributes_source_table; @@ -84,7 +84,7 @@ SELECT dictGetOrDefault('01684_database_for_cache_dictionary.cache_dictionary_si SELECT 'dictHas'; SELECT dictHas('01684_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes', number) FROM system.numbers LIMIT 4; SELECT 'select all values as input stream'; -SELECT * FROM 01684_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes; +SELECT * FROM 01684_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes ORDER BY id; DROP DICTIONARY 01684_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes; DROP TABLE 01684_database_for_cache_dictionary.simple_key_complex_attributes_source_table; diff --git a/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql b/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql index 1757b136d3e..8ec5a4a2c24 100644 --- a/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql +++ b/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql @@ -42,7 +42,7 @@ SELECT dictGetOrDefault('01685_database_for_cache_dictionary.cache_dictionary_co SELECT 'dictHas'; SELECT dictHas('01685_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes', (number, concat('id_key_', toString(number)))) FROM system.numbers LIMIT 4; SELECT 'select all values as input stream'; -SELECT * FROM 01685_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes; +SELECT * FROM 01685_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes ORDER BY id; DROP DICTIONARY 01685_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes; DROP TABLE 01685_database_for_cache_dictionary.complex_key_simple_attributes_source_table; @@ -89,7 +89,7 @@ SELECT dictGetOrDefault('01685_database_for_cache_dictionary.cache_dictionary_co SELECT 'dictHas'; SELECT dictHas('01685_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes', (number, concat('id_key_', toString(number)))) FROM system.numbers LIMIT 4; SELECT 'select all values as input stream'; -SELECT * FROM 01685_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes; +SELECT * FROM 01685_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes ORDER BY id; DROP DICTIONARY 01685_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes; DROP 
TABLE 01685_database_for_cache_dictionary.complex_key_complex_attributes_source_table; From f14020427989af5920b6e96e5f74985f0dc1f174 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 17 Mar 2021 22:01:45 +0300 Subject: [PATCH 038/260] Improved performance of SSDCache dictionary --- src/Common/HashTable/LRUHashMap.h | 10 ++++ src/Dictionaries/SSDCacheDictionaryStorage.h | 57 ++++++++++---------- 2 files changed, 37 insertions(+), 30 deletions(-) diff --git a/src/Common/HashTable/LRUHashMap.h b/src/Common/HashTable/LRUHashMap.h index 870fb219523..bc5fd51d0e2 100644 --- a/src/Common/HashTable/LRUHashMap.h +++ b/src/Common/HashTable/LRUHashMap.h @@ -202,6 +202,16 @@ public: return const_cast *>(this)->find(key); } + LookupResult ALWAYS_INLINE findNoLRU(const Key & key) + { + return Base::find(key); + } + + ConstLookupResult ALWAYS_INLINE findNoLRU(const Key & key) const + { + return const_cast *>(this)->findNoLRU(key); + } + Value & ALWAYS_INLINE get(const Key & key) { auto it = find(key); diff --git a/src/Dictionaries/SSDCacheDictionaryStorage.h b/src/Dictionaries/SSDCacheDictionaryStorage.h index 32d521db103..5396846e383 100644 --- a/src/Dictionaries/SSDCacheDictionaryStorage.h +++ b/src/Dictionaries/SSDCacheDictionaryStorage.h @@ -614,11 +614,13 @@ public: } template - ALWAYS_INLINE void fetchBlocks(char * read_buffer, size_t read_from_file_buffer_blocks_size, const PaddedPODArray & blocks_to_fetch, FetchBlockFunc && func) const + void fetchBlocks(size_t read_from_file_buffer_blocks_size, const PaddedPODArray & blocks_to_fetch, FetchBlockFunc && func) const { if (blocks_to_fetch.empty()) return; + Memory> read_buffer(read_from_file_buffer_blocks_size * block_size, 4096); + size_t blocks_to_fetch_size = blocks_to_fetch.size(); PaddedPODArray requests; @@ -631,7 +633,7 @@ public: { iocb request{}; - char * buffer_place = read_buffer + block_size * (block_to_fetch_index % read_from_file_buffer_blocks_size); + char * buffer_place = read_buffer.data() + block_size * (block_to_fetch_index % read_from_file_buffer_blocks_size); #if defined(__FreeBSD__) request.aio.aio_lio_opcode = LIO_READ; @@ -806,7 +808,6 @@ public: explicit SSDCacheDictionaryStorage(const SSDCacheDictionaryStorageConfiguration & configuration_) : configuration(configuration_) , file_buffer(configuration_.file_path, configuration.block_size, configuration.file_blocks_size) - , read_from_file_buffer(configuration_.block_size * configuration_.read_buffer_blocks_size, 4096) , rnd_engine(randomSeed()) , index(configuration.max_stored_keys, false, { complex_key_arena }) { @@ -815,7 +816,7 @@ public: bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return false; } - bool canPerformFetchByMultipleThreadsWithoutLock() const override { return false; } + bool canPerformFetchByMultipleThreadsWithoutLock() const override { return true; } String getName() const override { @@ -922,8 +923,7 @@ private: default_value }; - TimePoint deadline; - + time_t deadline; SSDCacheIndex index; size_t in_memory_partition_index; CellState state; @@ -954,23 +954,27 @@ private: result.fetched_columns = fetch_request.makeAttributesResultColumns(); result.key_index_to_state.resize_fill(keys.size(), {KeyState::not_found}); - const auto now = std::chrono::system_clock::now(); + const time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); size_t fetched_columns_index = 0; - using BlockIndexToKeysMap = std::unordered_map, DefaultHash>; + using BlockIndexToKeysMap = absl::flat_hash_map, DefaultHash>; 
BlockIndexToKeysMap block_to_keys_map; absl::flat_hash_set> unique_blocks_to_request; PaddedPODArray blocks_to_request; - std::chrono::seconds strict_max_lifetime_seconds(configuration.strict_max_lifetime_seconds); + time_t strict_max_lifetime_seconds = static_cast(configuration.strict_max_lifetime_seconds); size_t keys_size = keys.size(); + for (size_t attribute_size = 0; attribute_size < fetch_request.attributesSize(); ++attribute_size) + if (fetch_request.shouldFillResultColumnWithIndex(attribute_size)) + result.fetched_columns[attribute_size]->reserve(keys_size); + for (size_t key_index = 0; key_index < keys_size; ++key_index) { auto key = keys[key_index]; - const auto * it = index.find(key); + const auto * it = index.findNoLRU(key); if (!it) { @@ -980,9 +984,7 @@ private: const auto & cell = it->getMapped(); - bool has_deadline = cellHasDeadline(cell); - - if (has_deadline && now > cell.deadline + strict_max_lifetime_seconds) + if (now > cell.deadline + strict_max_lifetime_seconds) { ++result.not_found_keys_size; continue; @@ -991,14 +993,13 @@ private: bool cell_is_expired = false; KeyState::State key_state = KeyState::found; - if (has_deadline && now > cell.deadline) + if (now > cell.deadline) { cell_is_expired = true; key_state = KeyState::expired; } result.expired_keys_size += cell_is_expired; - result.found_keys_size += !cell_is_expired; switch (cell.state) { @@ -1014,7 +1015,8 @@ private: } case Cell::on_disk: { - block_to_keys_map[cell.index.block_index].emplace_back(key_index, cell.index.offset_in_block, cell_is_expired); + PaddedPODArray & keys_block = block_to_keys_map[cell.index.block_index]; + keys_block.emplace_back(key_index, cell.index.offset_in_block, cell_is_expired); if (!unique_blocks_to_request.contains(cell.index.block_index)) { @@ -1036,10 +1038,12 @@ private: } } + result.found_keys_size = keys_size - (result.not_found_keys_size + result.expired_keys_size); + /// Sort blocks by offset before start async io requests std::sort(blocks_to_request.begin(), blocks_to_request.end()); - file_buffer.fetchBlocks(read_from_file_buffer.m_data, configuration.read_buffer_blocks_size, blocks_to_request, [&](size_t block_index, char * block_data) + file_buffer.fetchBlocks(configuration.read_buffer_blocks_size, blocks_to_request, [&](size_t block_index, char * block_data) { auto & keys_in_block = block_to_keys_map[block_index]; @@ -1048,10 +1052,8 @@ private: char * key_data = block_data + key_in_block.offset_in_block; deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, key_data); - if (key_in_block.is_expired) - result.key_index_to_state[key_in_block.key_index] = {KeyState::expired, fetched_columns_index}; - else - result.key_index_to_state[key_in_block.key_index] = {KeyState::found, fetched_columns_index}; + KeyState::State state = key_in_block.is_expired ? 
KeyState::expired : KeyState::found; + result.key_index_to_state[key_in_block.key_index] = {state, fetched_columns_index}; ++fetched_columns_index; } @@ -1298,16 +1300,12 @@ private: } } - inline static bool cellHasDeadline(const Cell & cell) - { - return cell.deadline != std::chrono::system_clock::from_time_t(0); - } - inline void setCellDeadline(Cell & cell, TimePoint now) { if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0) { - cell.deadline = std::chrono::system_clock::from_time_t(0); + auto deadline = std::chrono::time_point::max() - 2 * std::chrono::seconds(configuration.strict_max_lifetime_seconds); + cell.deadline = std::chrono::system_clock::to_time_t(deadline); return; } @@ -1315,15 +1313,14 @@ private: size_t max_sec_lifetime = configuration.lifetime.max_sec; std::uniform_int_distribution distribution{min_sec_lifetime, max_sec_lifetime}; - cell.deadline = now + std::chrono::seconds{distribution(rnd_engine)}; + auto deadline = now + std::chrono::seconds{distribution(rnd_engine)}; + cell.deadline = std::chrono::system_clock::to_time_t(deadline); } SSDCacheDictionaryStorageConfiguration configuration; SSDCacheFileBuffer file_buffer; - Memory> read_from_file_buffer; - std::vector> memory_buffer_partitions; pcg64 rnd_engine; From d5a1b50fd22a91ced9c5839f7c573f3aef870bfc Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 17 Mar 2021 22:06:46 +0300 Subject: [PATCH 039/260] Updated naming --- src/Dictionaries/CacheDictionary.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index bef391c4222..6c13f76132b 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -332,9 +332,9 @@ Columns CacheDictionary::getColumnsImpl( FetchResult result_of_fetch_from_storage; - bool protect_get_with_write_lock = cache_storage_ptr->canPerformFetchByMultipleThreadsWithoutLock(); + bool can_perform_fetch_without_write_lock = cache_storage_ptr->canPerformFetchByMultipleThreadsWithoutLock(); - if (protect_get_with_write_lock) + if (can_perform_fetch_without_write_lock) { const ProfilingScopedReadRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; result_of_fetch_from_storage = cache_storage_ptr->fetchColumnsForKeys(keys, request); From 7c0a1b9451ee7cd25e30dc758246ff4388cfe9e1 Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Thu, 18 Mar 2021 10:23:19 +0300 Subject: [PATCH 040/260] Cleanup destination directory during DiskCacheWrapper::moveFile() --- src/Disks/DiskCacheWrapper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp index 085749f22f8..0fd03f951ce 100644 --- a/src/Disks/DiskCacheWrapper.cpp +++ b/src/Disks/DiskCacheWrapper.cpp @@ -217,7 +217,7 @@ void DiskCacheWrapper::moveFile(const String & from_path, const String & to_path { if (cache_disk->exists(from_path)) { - if (cache_disk->isDirectory(to_path) && !cache_disk->isDirectoryEmpty(to_path)) + if (cache_disk->exists(to_path) && cache_disk->isDirectory(to_path)) cache_disk->clearDirectory(to_path); auto dir_path = directoryPath(to_path); From 18ed9c5c67f1aefed757484ebe42cf2120f1be74 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Thu, 18 Mar 2021 12:55:17 +0300 Subject: [PATCH 041/260] SSDCacheDictionary remove max_stored_keys option --- .../external-dicts-dict-layout.md | 6 +- .../external-dicts-dict-layout.md | 6 +- src/Common/HashTable/LRUHashMap.h | 10 -- 
src/Dictionaries/CacheDictionary.cpp | 12 +- src/Dictionaries/CacheDictionaryStorage.h | 8 +- src/Dictionaries/ICacheDictionaryStorage.h | 8 +- src/Dictionaries/SSDCacheDictionaryStorage.h | 148 ++++++++++-------- .../registerCacheDictionaries.cpp | 8 +- tests/integration/helpers/dictionary.py | 4 +- .../ssd_complex_key_cache_string.xml | 1 - .../0_stateless/01053_ssd_dictionary.sql | 4 +- .../01280_ssd_complex_key_dictionary.sql | 2 +- 12 files changed, 99 insertions(+), 118 deletions(-) diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md index efef91b4b09..6af22eb27dc 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md @@ -320,8 +320,6 @@ Similar to `cache`, but stores data on SSD and index in RAM. 1048576 /var/lib/clickhouse/clickhouse_dictionaries/test_dict - - 1048576 ``` @@ -329,8 +327,8 @@ Similar to `cache`, but stores data on SSD and index in RAM. or ``` sql -LAYOUT(CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576 - PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict MAX_STORED_KEYS 1048576)) +LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576 + PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict)) ``` ### complex_key_ssd_cache {#complex-key-ssd-cache} diff --git a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md index 0fd4a85c46f..9b33a801973 100644 --- a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md +++ b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md @@ -318,8 +318,6 @@ LAYOUT(CACHE(SIZE_IN_CELLS 1000000000)) 1048576 /var/lib/clickhouse/clickhouse_dictionaries/test_dict - - 1048576 ``` @@ -327,8 +325,8 @@ LAYOUT(CACHE(SIZE_IN_CELLS 1000000000)) или ``` sql -LAYOUT(CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576 - PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict MAX_STORED_KEYS 1048576)) +LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576 + PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict)) ``` ### complex_key_ssd_cache {#complex-key-ssd-cache} diff --git a/src/Common/HashTable/LRUHashMap.h b/src/Common/HashTable/LRUHashMap.h index bc5fd51d0e2..870fb219523 100644 --- a/src/Common/HashTable/LRUHashMap.h +++ b/src/Common/HashTable/LRUHashMap.h @@ -202,16 +202,6 @@ public: return const_cast *>(this)->find(key); } - LookupResult ALWAYS_INLINE findNoLRU(const Key & key) - { - return Base::find(key); - } - - ConstLookupResult ALWAYS_INLINE findNoLRU(const Key & key) const - { - return const_cast *>(this)->findNoLRU(key); - } - Value & ALWAYS_INLINE get(const Key & key) { auto it = find(key); diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index 6c13f76132b..eedf4dd3d87 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -101,7 +101,7 @@ template double CacheDictionary::getLoadFactor() const { const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - return static_cast(cache_storage_ptr->getSize()) / cache_storage_ptr->getMaxSize(); + return 
cache_storage_ptr->getLoadFactor(); } template @@ -332,16 +332,8 @@ Columns CacheDictionary::getColumnsImpl( FetchResult result_of_fetch_from_storage; - bool can_perform_fetch_without_write_lock = cache_storage_ptr->canPerformFetchByMultipleThreadsWithoutLock(); - - if (can_perform_fetch_without_write_lock) { - const ProfilingScopedReadRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; - result_of_fetch_from_storage = cache_storage_ptr->fetchColumnsForKeys(keys, request); - } - else - { - const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; + const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; result_of_fetch_from_storage = cache_storage_ptr->fetchColumnsForKeys(keys, request); } diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index 6b1200dd474..874796d879b 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -58,8 +58,6 @@ public: bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return true; } - bool canPerformFetchByMultipleThreadsWithoutLock() const override { return true; } - String getName() const override { if (dictionary_key_type == DictionaryKeyType::simple) @@ -142,7 +140,7 @@ public: size_t getSize() const override { return size; } - size_t getMaxSize() const override { return configuration.max_size_in_cells; } + double getLoadFactor() const override { return static_cast(size) / configuration.max_size_in_cells; } size_t getBytesAllocated() const override { @@ -654,7 +652,7 @@ private: return std::make_pair(KeyState::found, cell_place_value); } - return std::make_pair(KeyState::not_found, place_value); + return std::make_pair(KeyState::not_found, place_value & size_overlap_mask); } inline size_t getCellIndexForInsert(const KeyType & key) const @@ -674,7 +672,7 @@ private: return cell_place_value; if (cell.key == key) - return place_value; + return cell_place_value; if (cell.deadline < oldest_time) { diff --git a/src/Dictionaries/ICacheDictionaryStorage.h b/src/Dictionaries/ICacheDictionaryStorage.h index 8a3202b5590..72b3ef76f11 100644 --- a/src/Dictionaries/ICacheDictionaryStorage.h +++ b/src/Dictionaries/ICacheDictionaryStorage.h @@ -34,7 +34,7 @@ struct KeyState inline void setDefaultValue(bool is_default_value) { is_default = is_default_value; } /// Valid only if keyState is found or expired inline size_t getFetchedColumnIndex() const { return fetched_column_index; } - + inline void setFetchedColumnIndex(size_t fetched_column_index_value) { fetched_column_index = fetched_column_index_value; } private: State state = not_found; size_t fetched_column_index = 0; @@ -72,8 +72,6 @@ public: /// Necessary if all keys are found we can return result to client without additional aggregation virtual bool returnsFetchedColumnsInOrderOfRequestedKeys() const = 0; - virtual bool canPerformFetchByMultipleThreadsWithoutLock() const = 0; - /// Name of storage virtual String getName() const = 0; @@ -114,8 +112,8 @@ public: /// Return size of keys in storage virtual size_t getSize() const = 0; - /// Return maximum size of keys in storage - virtual size_t getMaxSize() const = 0; + /// Returns storage load factor + virtual double getLoadFactor() const = 0; /// Return bytes allocated in storage virtual size_t getBytesAllocated() const = 0; diff --git a/src/Dictionaries/SSDCacheDictionaryStorage.h b/src/Dictionaries/SSDCacheDictionaryStorage.h index 5396846e383..f28f9ab37cd 100644 --- 
a/src/Dictionaries/SSDCacheDictionaryStorage.h +++ b/src/Dictionaries/SSDCacheDictionaryStorage.h @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -56,7 +56,6 @@ struct SSDCacheDictionaryStorageConfiguration const std::string file_path; const size_t max_partitions_count; - const size_t max_stored_keys; const size_t block_size; const size_t file_blocks_size; const size_t read_buffer_blocks_size; @@ -127,7 +126,7 @@ public: /// Reset block with new block_data /// block_data must be filled with zeroes if it is new block - ALWAYS_INLINE inline void reset(char * new_block_data) + inline void reset(char * new_block_data) { block_data = new_block_data; current_block_offset = block_header_size; @@ -135,13 +134,13 @@ public: } /// Check if it is enough place to write key in block - ALWAYS_INLINE inline bool enoughtPlaceToWriteKey(const SSDCacheSimpleKey & cache_key) const + inline bool enoughtPlaceToWriteKey(const SSDCacheSimpleKey & cache_key) const { return (current_block_offset + (sizeof(cache_key.key) + sizeof(cache_key.size) + cache_key.size)) <= block_size; } /// Check if it is enough place to write key in block - ALWAYS_INLINE inline bool enoughtPlaceToWriteKey(const SSDCacheComplexKey & cache_key) const + inline bool enoughtPlaceToWriteKey(const SSDCacheComplexKey & cache_key) const { const StringRef & key = cache_key.key; size_t complex_key_size = sizeof(key.size) + key.size; @@ -152,7 +151,7 @@ public: /// Write key and returns offset in ssd cache block where data is written /// It is client responsibility to check if there is enough place in block to write key /// Returns true if key was written and false if there was not enough place to write key - ALWAYS_INLINE inline bool writeKey(const SSDCacheSimpleKey & cache_key, size_t & offset_in_block) + inline bool writeKey(const SSDCacheSimpleKey & cache_key, size_t & offset_in_block) { assert(cache_key.size > 0); @@ -181,7 +180,7 @@ public: return true; } - ALWAYS_INLINE inline bool writeKey(const SSDCacheComplexKey & cache_key, size_t & offset_in_block) + inline bool writeKey(const SSDCacheComplexKey & cache_key, size_t & offset_in_block) { assert(cache_key.size > 0); @@ -216,20 +215,20 @@ public: return true; } - ALWAYS_INLINE inline size_t getKeysSize() const { return keys_size; } + inline size_t getKeysSize() const { return keys_size; } /// Write keys size into block header - ALWAYS_INLINE inline void writeKeysSize() + inline void writeKeysSize() { char * keys_size_offset_data = block_data + block_header_check_sum_size; std::memcpy(keys_size_offset_data, &keys_size, sizeof(size_t)); } /// Get check sum from block header - ALWAYS_INLINE inline size_t getCheckSum() const { return unalignedLoad(block_data); } + inline size_t getCheckSum() const { return unalignedLoad(block_data); } /// Calculate check sum in block - ALWAYS_INLINE inline size_t calculateCheckSum() const + inline size_t calculateCheckSum() const { size_t calculated_check_sum = static_cast(CityHash_v1_0_2::CityHash64(block_data + block_header_check_sum_size, block_size - block_header_check_sum_size)); @@ -237,7 +236,7 @@ public: } /// Check if check sum from block header matched calculated check sum in block - ALWAYS_INLINE inline bool checkCheckSum() const + inline bool checkCheckSum() const { size_t calculated_check_sum = calculateCheckSum(); size_t check_sum = getCheckSum(); @@ -246,16 +245,16 @@ public: } /// Write check sum in block header - ALWAYS_INLINE inline void writeCheckSum() + inline void writeCheckSum() { size_t 
check_sum = static_cast(CityHash_v1_0_2::CityHash64(block_data + block_header_check_sum_size, block_size - block_header_check_sum_size)); std::memcpy(block_data, &check_sum, sizeof(size_t)); } - ALWAYS_INLINE inline size_t getBlockSize() const { return block_size; } + inline size_t getBlockSize() const { return block_size; } /// Returns block data - ALWAYS_INLINE inline char * getBlockData() const { return block_data; } + inline char * getBlockData() const { return block_data; } /// Read keys that were serialized in block /// It is client responsibility to ensure that simple or complex keys were written in block @@ -753,7 +752,7 @@ private: int fd = -1; }; - ALWAYS_INLINE inline static int preallocateDiskSpace(int fd, size_t offset, size_t len) + inline static int preallocateDiskSpace(int fd, size_t offset, size_t len) { #if defined(__FreeBSD__) return posix_fallocate(fd, offset, len); @@ -762,7 +761,7 @@ private: #endif } - ALWAYS_INLINE inline static char * getRequestBuffer(const iocb & request) + inline static char * getRequestBuffer(const iocb & request) { char * result = nullptr; @@ -775,7 +774,7 @@ private: return result; } - ALWAYS_INLINE inline static ssize_t eventResult(io_event & event) + inline static ssize_t eventResult(io_event & event) { ssize_t bytes_written; @@ -809,15 +808,12 @@ public: : configuration(configuration_) , file_buffer(configuration_.file_path, configuration.block_size, configuration.file_blocks_size) , rnd_engine(randomSeed()) - , index(configuration.max_stored_keys, false, { complex_key_arena }) { memory_buffer_partitions.emplace_back(configuration.block_size, configuration.write_buffer_blocks_size); } bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return false; } - bool canPerformFetchByMultipleThreadsWithoutLock() const override { return true; } - String getName() const override { if (dictionary_key_type == DictionaryKeyType::simple) @@ -900,14 +896,31 @@ public: size_t getSize() const override { return index.size(); } - size_t getMaxSize() const override {return index.getMaxSize(); } + double getLoadFactor() const override + { + size_t partitions_size = memory_buffer_partitions.size(); + + if (partitions_size == configuration.max_partitions_count) + return 1.0; + + auto & current_memory_partition = memory_buffer_partitions[current_partition_index]; + + size_t full_partitions = partitions_size - 1; + size_t blocks_in_memory = (full_partitions * configuration.write_buffer_blocks_size) + current_memory_partition.getCurrentBlockIndex(); + size_t blocks_on_disk = file_buffer.getCurrentBlockIndex(); + + size_t max_blocks_size = (configuration.file_blocks_size + configuration.write_buffer_blocks_size) * configuration.max_partitions_count; + + double load_factor = static_cast(blocks_in_memory + blocks_on_disk) / max_blocks_size; + return load_factor; + } size_t getBytesAllocated() const override { size_t memory_partitions_bytes_size = memory_buffer_partitions.size() * configuration.write_buffer_blocks_size * configuration.block_size; size_t file_partitions_bytes_size = memory_buffer_partitions.size() * configuration.file_blocks_size * configuration.block_size; - return index.getSizeInBytes() + memory_partitions_bytes_size + file_partitions_bytes_size; + return index.getBufferSizeInBytes() + memory_partitions_bytes_size + file_partitions_bytes_size; } private: @@ -935,13 +948,12 @@ private: struct KeyToBlockOffset { - KeyToBlockOffset(size_t key_index_, size_t offset_in_block_, bool is_expired_) - : key_index(key_index_), 
offset_in_block(offset_in_block_), is_expired(is_expired_) + KeyToBlockOffset(size_t key_index_, size_t offset_in_block_) + : key_index(key_index_), offset_in_block(offset_in_block_) {} size_t key_index = 0; size_t offset_in_block = 0; - bool is_expired = false; }; template @@ -952,7 +964,7 @@ private: Result result; result.fetched_columns = fetch_request.makeAttributesResultColumns(); - result.key_index_to_state.resize_fill(keys.size(), {KeyState::not_found}); + result.key_index_to_state.resize_fill(keys.size()); const time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); @@ -974,7 +986,7 @@ private: { auto key = keys[key_index]; - const auto * it = index.findNoLRU(key); + const auto * it = index.find(key); if (!it) { @@ -984,7 +996,7 @@ private: const auto & cell = it->getMapped(); - if (now > cell.deadline + strict_max_lifetime_seconds) + if (unlikely(now > cell.deadline + strict_max_lifetime_seconds)) { ++result.not_found_keys_size; continue; @@ -999,7 +1011,8 @@ private: key_state = KeyState::expired; } - result.expired_keys_size += cell_is_expired; + result.expired_keys_size += static_cast(cell_is_expired); + result.found_keys_size += static_cast(!cell_is_expired); switch (cell.state) { @@ -1016,13 +1029,19 @@ private: case Cell::on_disk: { PaddedPODArray & keys_block = block_to_keys_map[cell.index.block_index]; - keys_block.emplace_back(key_index, cell.index.offset_in_block, cell_is_expired); + keys_block.emplace_back(key_index, cell.index.offset_in_block); - if (!unique_blocks_to_request.contains(cell.index.block_index)) - { + KeyState::State state = cell_is_expired ? KeyState::expired : KeyState::found; + + /// Fetched column index will be set later during fetch blocks + result.key_index_to_state[key_index] = {state, 0}; + + auto insert_result = unique_blocks_to_request.insert(cell.index.block_index); + bool was_inserted = insert_result.second; + + if (was_inserted) blocks_to_request.emplace_back(cell.index.block_index); - unique_blocks_to_request.insert(cell.index.block_index); - } + break; } case Cell::default_value: @@ -1038,8 +1057,6 @@ private: } } - result.found_keys_size = keys_size - (result.not_found_keys_size + result.expired_keys_size); - /// Sort blocks by offset before start async io requests std::sort(blocks_to_request.begin(), blocks_to_request.end()); @@ -1052,8 +1069,7 @@ private: char * key_data = block_data + key_in_block.offset_in_block; deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, key_data); - KeyState::State state = key_in_block.is_expired ? 
KeyState::expired : KeyState::found; - result.key_index_to_state[key_in_block.key_index] = {state, fetched_columns_index}; + result.key_index_to_state[key_in_block.key_index].setFetchedColumnIndex(fetched_columns_index); ++fetched_columns_index; } @@ -1091,7 +1107,7 @@ private: throw Exception("Serialized columns size is greater than allowed block size and metadata", ErrorCodes::UNSUPPORTED_METHOD); /// We cannot reuse place that is already allocated in file or memory cache so we erase key from index - index.erase(key); + eraseKeyFromIndex(key); Cell cell; setCellDeadline(cell, now); @@ -1118,8 +1134,7 @@ private: for (auto key : keys) { - /// We cannot reuse place that is already allocated in file or memory cache so we erase key from index - index.erase(key); + eraseKeyFromIndex(key); Cell cell; @@ -1139,7 +1154,7 @@ private: key = updated_key; } - index.insert(key, cell); + index[key] = cell; } } @@ -1192,7 +1207,7 @@ private: cell.index = cache_index; cell.in_memory_partition_index = current_partition_index; - index.insert(ssd_cache_key.key, cell); + index[ssd_cache_key.key] = cell; break; } else @@ -1222,7 +1237,7 @@ private: if (old_key_cell.isOnDisk() && old_key_block >= block_index_in_file_before_write && old_key_block < file_read_end_block_index) - index.erase(old_key); + eraseKeyFromIndex(old_key); } } } @@ -1275,7 +1290,7 @@ private: cell.index = cache_index; cell.in_memory_partition_index = current_partition_index; - index.insert(ssd_cache_key.key, cell); + index[ssd_cache_key.key] = cell; break; } else @@ -1313,10 +1328,23 @@ private: size_t max_sec_lifetime = configuration.lifetime.max_sec; std::uniform_int_distribution distribution{min_sec_lifetime, max_sec_lifetime}; - auto deadline = now + std::chrono::seconds{distribution(rnd_engine)}; + auto deadline = now + std::chrono::seconds(distribution(rnd_engine)); cell.deadline = std::chrono::system_clock::to_time_t(deadline); } + inline void eraseKeyFromIndex(KeyType key) + { + auto it = index.find(key); + + if (it == nullptr) + return; + + index.erase(key); + + if constexpr(std::is_same_v) + complex_key_arena.free(const_cast(key.data), key.size); + } + SSDCacheDictionaryStorageConfiguration configuration; SSDCacheFileBuffer file_buffer; @@ -1325,31 +1353,17 @@ private: pcg64 rnd_engine; - class ArenaCellKeyDisposer - { - public: - ArenaWithFreeLists & arena; + using SimpleKeyHashMap = HashMap; + using ComplexKeyHashMap = HashMapWithSavedHash; - template - void operator()(const Key & key, const Value &) const - { - /// In case of complex key we keep it in arena - if constexpr (std::is_same_v) - arena.free(const_cast(key.data), key.size); - } - }; - - using SimpleKeyLRUHashMap = LRUHashMap; - using ComplexKeyLRUHashMap = LRUHashMapWithSavedHash; - - using CacheLRUHashMap = std::conditional_t< + using CacheMap = std::conditional_t< dictionary_key_type == DictionaryKeyType::simple, - SimpleKeyLRUHashMap, - ComplexKeyLRUHashMap>; + SimpleKeyHashMap, + ComplexKeyHashMap>; ArenaWithFreeLists complex_key_arena; - CacheLRUHashMap index; + CacheMap index; size_t current_partition_index = 0; diff --git a/src/Dictionaries/registerCacheDictionaries.cpp b/src/Dictionaries/registerCacheDictionaries.cpp index 9f0f214e79b..b93a08acb76 100644 --- a/src/Dictionaries/registerCacheDictionaries.cpp +++ b/src/Dictionaries/registerCacheDictionaries.cpp @@ -26,7 +26,7 @@ CacheDictionaryStorageConfiguration parseCacheStorageConfiguration( const size_t size = config.getUInt64(dictionary_configuration_prefix + "size_in_cells"); if (size == 0) throw 
Exception(ErrorCodes::TOO_SMALL_BUFFER_SIZE, - "({}: cache dictionary cannot have 0 cells", + "({}): cache dictionary cannot have 0 cells", full_name); size_t dict_lifetime_seconds = static_cast(dict_lifetime.max_sec); @@ -59,7 +59,6 @@ SSDCacheDictionaryStorageConfiguration parseSSDCacheStorageConfiguration( static constexpr size_t DEFAULT_READ_BUFFER_SIZE_BYTES = 16 * DEFAULT_SSD_BLOCK_SIZE_BYTES; static constexpr size_t DEFAULT_WRITE_BUFFER_SIZE_BYTES = DEFAULT_SSD_BLOCK_SIZE_BYTES; - static constexpr size_t DEFAULT_MAX_STORED_KEYS = 100000; static constexpr size_t DEFAULT_PARTITIONS_COUNT = 16; const size_t max_partitions_count @@ -94,16 +93,11 @@ SSDCacheDictionaryStorageConfiguration parseSSDCacheStorageConfiguration( if (directory_path.at(0) != '/') directory_path = std::filesystem::path{config.getString("path")}.concat(directory_path).string(); - const size_t max_stored_keys_in_partition - = config.getInt64(dictionary_configuration_prefix + "max_stored_keys", DEFAULT_MAX_STORED_KEYS); - const size_t rounded_size = roundUpToPowerOfTwoOrZero(max_stored_keys_in_partition); - SSDCacheDictionaryStorageConfiguration configuration{ strict_max_lifetime_seconds, dict_lifetime, directory_path, max_partitions_count, - rounded_size, block_size, file_size / block_size, read_buffer_size / block_size, diff --git a/tests/integration/helpers/dictionary.py b/tests/integration/helpers/dictionary.py index b3f7a729777..41d87180c8a 100644 --- a/tests/integration/helpers/dictionary.py +++ b/tests/integration/helpers/dictionary.py @@ -7,12 +7,12 @@ class Layout(object): 'flat': '', 'hashed': '', 'cache': '128', - 'ssd_cache': '/etc/clickhouse/dictionaries/all128', + 'ssd_cache': '/etc/clickhouse/dictionaries/all', 'complex_key_hashed': '', 'complex_key_hashed_one_key': '', 'complex_key_hashed_two_keys': '', 'complex_key_cache': '128', - 'complex_key_ssd_cache': '/etc/clickhouse/dictionaries/all128', + 'complex_key_ssd_cache': '/etc/clickhouse/dictionaries/all', 'range_hashed': '', 'direct': '', 'complex_key_direct': '' diff --git a/tests/integration/test_dictionaries_complex_key_cache_string/configs/dictionaries/ssd_complex_key_cache_string.xml b/tests/integration/test_dictionaries_complex_key_cache_string/configs/dictionaries/ssd_complex_key_cache_string.xml index 85f811d2d85..c8fdbcbe0ef 100644 --- a/tests/integration/test_dictionaries_complex_key_cache_string/configs/dictionaries/ssd_complex_key_cache_string.xml +++ b/tests/integration/test_dictionaries_complex_key_cache_string/configs/dictionaries/ssd_complex_key_cache_string.xml @@ -42,7 +42,6 @@ 131072 1048576 /etc/clickhouse/dictionaries/radars - 1048576 1 diff --git a/tests/queries/0_stateless/01053_ssd_dictionary.sql b/tests/queries/0_stateless/01053_ssd_dictionary.sql index a23ae7e5e96..23a369cc8a6 100644 --- a/tests/queries/0_stateless/01053_ssd_dictionary.sql +++ b/tests/queries/0_stateless/01053_ssd_dictionary.sql @@ -76,7 +76,7 @@ CREATE DICTIONARY 01053_db.ssd_dict PRIMARY KEY id SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'table_for_dict' PASSWORD '' DB '01053_db')) LIFETIME(MIN 1000 MAX 2000) -LAYOUT(SSD_CACHE(FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/1d' BLOCK_SIZE 512 WRITE_BUFFER_SIZE 4096 MAX_STORED_KEYS 1000000)); +LAYOUT(SSD_CACHE(FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/1d' BLOCK_SIZE 512 WRITE_BUFFER_SIZE 4096)); SELECT 'UPDATE DICTIONARY'; -- 118 @@ -142,7 +142,7 @@ CREATE DICTIONARY 01053_db.ssd_dict PRIMARY KEY id SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 
'default' TABLE 'table_for_dict' PASSWORD '' DB '01053_db')) LIFETIME(MIN 1000 MAX 2000) -LAYOUT(SSD_CACHE(FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/2d' BLOCK_SIZE 512 WRITE_BUFFER_SIZE 1024 MAX_STORED_KEYS 10)); +LAYOUT(SSD_CACHE(FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/2d' BLOCK_SIZE 512 WRITE_BUFFER_SIZE 1024)); SELECT 'UPDATE DICTIONARY (MT)'; -- 118 diff --git a/tests/queries/0_stateless/01280_ssd_complex_key_dictionary.sql b/tests/queries/0_stateless/01280_ssd_complex_key_dictionary.sql index 50b34c4b18f..cd3e52c9691 100644 --- a/tests/queries/0_stateless/01280_ssd_complex_key_dictionary.sql +++ b/tests/queries/0_stateless/01280_ssd_complex_key_dictionary.sql @@ -98,7 +98,7 @@ CREATE DICTIONARY 01280_db.ssd_dict PRIMARY KEY k1, k2 SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'table_for_dict' PASSWORD '' DB '01280_db')) LIFETIME(MIN 1000 MAX 2000) -LAYOUT(COMPLEX_KEY_SSD_CACHE(FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/1d' BLOCK_SIZE 512 WRITE_BUFFER_SIZE 4096 MAX_STORED_KEYS 1000000)); +LAYOUT(COMPLEX_KEY_SSD_CACHE(FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/1d' BLOCK_SIZE 512 WRITE_BUFFER_SIZE 4096)); SELECT 'UPDATE DICTIONARY'; -- 118 From cf985a86c48e25f86e27591fba1da801f3b3ecf2 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Thu, 18 Mar 2021 14:25:13 +0300 Subject: [PATCH 042/260] test --- .../0_stateless/01684_ssd_cache_dictionary_simple_key.sql | 6 +++--- .../0_stateless/01685_ssd_cache_dictionary_complex_key.sql | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql b/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql index 9dbad1289f1..2fe1e54fe6c 100644 --- a/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql +++ b/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql @@ -22,7 +22,7 @@ CREATE DICTIONARY 01684_database_for_cache_dictionary.cache_dictionary_simple_ke PRIMARY KEY id SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_simple_attributes_source_table')) LIFETIME(MIN 1 MAX 1000) -LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/0d')); +LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/home/yetti/Documents/ClickHouse/build_address/programs/0d')); SELECT 'Dictionary cache_dictionary_simple_key_simple_attributes'; SELECT 'dictGet existing value'; @@ -66,7 +66,7 @@ CREATE DICTIONARY 01684_database_for_cache_dictionary.cache_dictionary_simple_ke PRIMARY KEY id SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_complex_attributes_source_table')) LIFETIME(MIN 1 MAX 1000) -LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/1d')); +LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/home/yetti/Documents/ClickHouse/build_address/programs/1d')); SELECT 'Dictionary cache_dictionary_simple_key_complex_attributes'; SELECT 'dictGet existing value'; @@ -108,7 +108,7 @@ CREATE DICTIONARY 01684_database_for_cache_dictionary.cache_dictionary_simple_ke PRIMARY KEY id SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_hierarchy_table')) LIFETIME(MIN 1 MAX 1000) -LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/2d')); +LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/home/yetti/Documents/ClickHouse/build_address/programs/2d')); SELECT 
'Dictionary cache_dictionary_simple_key_hierarchy'; SELECT 'dictGet'; diff --git a/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql b/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql index 8ec5a4a2c24..f65aa445284 100644 --- a/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql +++ b/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql @@ -24,7 +24,7 @@ CREATE DICTIONARY 01685_database_for_cache_dictionary.cache_dictionary_complex_k PRIMARY KEY id, id_key SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'complex_key_simple_attributes_source_table' DB '01685_database_for_cache_dictionary')) LIFETIME(MIN 1 MAX 1000) -LAYOUT(COMPLEX_KEY_SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/0d')); +LAYOUT(COMPLEX_KEY_SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/home/yetti/Documents/ClickHouse/build_address/programs/0d')); SELECT 'Dictionary cache_dictionary_complex_key_simple_attributes'; SELECT 'dictGet existing value'; @@ -71,7 +71,7 @@ CREATE DICTIONARY 01685_database_for_cache_dictionary.cache_dictionary_complex_k PRIMARY KEY id, id_key SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'complex_key_complex_attributes_source_table' DB '01685_database_for_cache_dictionary')) LIFETIME(MIN 1 MAX 1000) -LAYOUT(COMPLEX_KEY_SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/1d')); +LAYOUT(COMPLEX_KEY_SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/home/yetti/Documents/ClickHouse/build_address/programs/1d')); SELECT 'Dictionary cache_dictionary_complex_key_complex_attributes'; SELECT 'dictGet existing value'; @@ -95,4 +95,4 @@ DROP DICTIONARY 01685_database_for_cache_dictionary.cache_dictionary_complex_key DROP TABLE 01685_database_for_cache_dictionary.complex_key_complex_attributes_source_table; DROP DATABASE 01685_database_for_cache_dictionary; - + From f3ff437a3997e399b013f7634b4f7dd7a5184e96 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 18 Mar 2021 14:32:45 +0300 Subject: [PATCH 043/260] Add all nodes killer/stop, one multitransaction request and counter test --- .../src/jepsen/nukeeper/counter.clj | 52 +++++++++++++++++++ .../src/jepsen/nukeeper/main.clj | 6 ++- .../src/jepsen/nukeeper/nemesis.clj | 35 +++++++++---- .../src/jepsen/nukeeper/set.clj | 6 +-- .../src/jepsen/nukeeper/utils.clj | 16 +++++- .../test/jepsen/nukeeper_test.clj | 2 + 6 files changed, 100 insertions(+), 17 deletions(-) create mode 100644 tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj new file mode 100644 index 00000000000..1bdf3f89186 --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj @@ -0,0 +1,52 @@ +(ns jepsen.nukeeper.counter + (:require + [clojure.tools.logging :refer :all] + [jepsen + [checker :as checker] + [client :as client] + [generator :as gen]] + [jepsen.nukeeper.utils :refer :all] + [zookeeper :as zk]) + (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) + + +(defn r [_ _] {:type :invoke, :f :read}) +(defn add [_ _] {:type :invoke, :f :add, :value (rand-int 5)}) + + +(defrecord CounterClient [conn nodename] + client/Client + (open! [this test node] + (assoc + (assoc this + :conn (zk-connect node 9181 30000)) + :nodename node)) + + (setup! [this test]) + + (invoke! 
[this test op] + (case (:f op) + :read (try + (assoc op + :type :ok + :value (count (zk-list conn "/"))) + (catch Exception _ (assoc op :type :fail, :error :connect-error))) + :add (try + (do + (zk-multi-create-many-seq-nodes conn "/seq-" (:value op)) + (assoc op :type :ok)) + (catch Exception _ (assoc op :type :info, :error :connect-error))))) + + (teardown! [_ test]) + + (close! [_ test])) + +(defn workload + "A generator, client, and checker for a set test." + [opts] + {:client (CounterClient. nil nil) + :checker (checker/counter) + :generator (->> (range) + (map (fn [x] + (->> (gen/mix [r add]))))) + :final-generator (gen/once {:type :invoke, :f :read, :value nil})}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 8b7c1a6caac..0f9619a7653 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -5,6 +5,7 @@ [jepsen.nukeeper.nemesis :as custom-nemesis] [jepsen.nukeeper.register :as register] [jepsen.nukeeper.unique :as unique] + [jepsen.nukeeper.counter :as counter] [jepsen.nukeeper.constants :refer :all] [clojure.string :as str] [jepsen @@ -85,7 +86,8 @@ "A map of workload names to functions that construct workloads, given opts." {"set" set/workload "register" register/workload - "unique-ids" unique/workload}) + "unique-ids" unique/workload + "counter" counter/workload}) (def cli-opts "Additional command line options." @@ -126,7 +128,7 @@ current-nemesis (get custom-nemesis/custom-nemesises (:nemesis opts))] (merge tests/noop-test opts - {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts)) (name (:nemesis opts))) + {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts)) " " (name (:nemesis opts))) :os ubuntu/os :db (db "rbtorrent:a122093aee0bdcb70ca42d5e5fb4ba5544372f5f") :pure-generators true diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index d1dc0d55e5f..bf22f9ad1f6 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -8,17 +8,28 @@ [jepsen.nukeeper.constants :refer :all] [jepsen.nukeeper.utils :refer :all])) -(defn random-single-node-killer-nemesis +(defn random-node-killer-nemesis [] (nemesis/node-start-stopper rand-nth (fn start [test node] (kill-clickhouse! node test)) (fn stop [test node] (start-clickhouse! node test)))) -(defn hammer-time-nemesis +(defn all-nodes-killer-nemesis + [] + (nemesis/node-start-stopper + identity + (fn start [test node] (kill-clickhouse! node test)) + (fn stop [test node] (start-clickhouse! 
node test)))) + +(defn random-node-hammer-time-nemesis [] (nemesis/hammer-time "clickhouse")) +(defn all-nodes-hammer-time-nemesis + [] + (nemesis/hammer-time identity "clickhouse")) + (defn select-last-file [path] (last (clojure.string/split @@ -83,11 +94,11 @@ (c/exec :rm :-fr path)))) (defn start-stop-generator - [] + [time-corrupt time-ok] (->> - (cycle [(gen/sleep 5) + (cycle [(gen/sleep time-ok) {:type :info, :f :start} - (gen/sleep 5) + (gen/sleep time-corrupt) {:type :info, :f :stop}]))) (defn corruption-generator @@ -97,12 +108,16 @@ {:type :info, :f :corrupt}]))) (def custom-nemesises - {"single-node-killer" {:nemesis (random-single-node-killer-nemesis) - :generator (start-stop-generator)} + {"random-node-killer" {:nemesis (random-node-killer-nemesis) + :generator (start-stop-generator 5 5)} + "all-nodes-killer" {:nemesis (all-nodes-killer-nemesis) + :generator (start-stop-generator 1 10)} "simple-partitioner" {:nemesis (nemesis/partition-random-halves) - :generator (start-stop-generator)} - "hammer-time" {:nemesis (hammer-time-nemesis) - :generator (start-stop-generator)} + :generator (start-stop-generator 5 5)} + "random-node-hammer-time" {:nemesis (random-node-hammer-time-nemesis) + :generator (start-stop-generator 5 5)} + "all-nodes-hammer-time" {:nemesis (all-nodes-hammer-time-nemesis) + :generator (start-stop-generator 1 10)} "logs-corruptor" {:nemesis (logs-corruption-nemesis) :generator (corruption-generator)} "snapshots-corruptor" {:nemesis (snapshots-corruption-nemesis) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj index d50253aa174..c30ec9635a1 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -22,11 +22,9 @@ (invoke! [this test op] (case (:f op) - :read - (do - (assoc op + :read (assoc op :type :ok - :value (read-string (:data (zk-get-str conn k))))) + :value (read-string (:data (zk-get-str conn k)))) :add (try (do (zk-add-to-set conn k (:value op)) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index 10851a2adc7..6fd2f3c87f4 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -2,10 +2,13 @@ (:require [clojure.string :as str] [zookeeper.data :as data] [zookeeper :as zk] + [zookeeper.internal :as zi] [jepsen.control.util :as cu] [jepsen.nukeeper.constants :refer :all] [jepsen.control :as c] - [clojure.tools.logging :refer :all])) + [clojure.tools.logging :refer :all]) + (:import (org.apache.zookeeper CreateMode + ZooKeeper))) (defn parse-long "Parses a string to a Long. Passes through `nil` and empty strings." @@ -67,6 +70,17 @@ [conn path-prefix data] (zk/create conn path-prefix :data (data/to-bytes (str data)) :persistent? true :sequential? true)) +(defn zk-multi-create-many-seq-nodes + [conn path-prefix num] + (let [txn (.transaction conn)] + (loop [i 0] + (cond (>= i num) (.commit txn) + :else (do (.create txn path-prefix + (data/to-bytes "") + (zi/acls :open-acl-unsafe) + CreateMode/PERSISTENT_SEQUENTIAL) + (recur (inc i))))))) + (defn clickhouse-alive? 
[node test] (info "Checking server alive on" node) diff --git a/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj b/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj index 824aa40d2c8..1a3e8646574 100644 --- a/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj +++ b/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj @@ -22,6 +22,8 @@ (zk/create conn "/0") (println (zk/children conn "/")) (zk/set-data conn "/0" (data/to-bytes "777") -1) + (zk-multi-create-many-seq-nodes conn "/seq-" 5) + (println (zk/children conn "/")) (Thread/sleep 5000) (println "VALUE" (data/to-string (:data (zk/data conn "/0")))) (is (= (data/to-string (:data (zk/data conn "/0"))) "777")) From c01171c626e0344915abce370a0dc777a7ce93f9 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Thu, 18 Mar 2021 14:58:43 +0300 Subject: [PATCH 044/260] Fixed tests --- src/Dictionaries/SSDCacheDictionaryStorage.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Dictionaries/SSDCacheDictionaryStorage.h b/src/Dictionaries/SSDCacheDictionaryStorage.h index f28f9ab37cd..d0b4a5ca835 100644 --- a/src/Dictionaries/SSDCacheDictionaryStorage.h +++ b/src/Dictionaries/SSDCacheDictionaryStorage.h @@ -1339,10 +1339,13 @@ private: if (it == nullptr) return; + /// In case of complex key in arena key is serialized from hash table + KeyType key_copy = it->getKey(); + index.erase(key); - if constexpr(std::is_same_v) - complex_key_arena.free(const_cast(key.data), key.size); + if constexpr (std::is_same_v) + complex_key_arena.free(const_cast(key_copy.data), key_copy.size); } SSDCacheDictionaryStorageConfiguration configuration; From 5324d75505f49b7fe9fdfb630e0c00399115a4e1 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Thu, 18 Mar 2021 17:12:25 +0300 Subject: [PATCH 045/260] Fixed tests --- .../0_stateless/01684_ssd_cache_dictionary_simple_key.sql | 6 +++--- .../0_stateless/01685_ssd_cache_dictionary_complex_key.sql | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql b/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql index 2fe1e54fe6c..9dbad1289f1 100644 --- a/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql +++ b/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql @@ -22,7 +22,7 @@ CREATE DICTIONARY 01684_database_for_cache_dictionary.cache_dictionary_simple_ke PRIMARY KEY id SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_simple_attributes_source_table')) LIFETIME(MIN 1 MAX 1000) -LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/home/yetti/Documents/ClickHouse/build_address/programs/0d')); +LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/0d')); SELECT 'Dictionary cache_dictionary_simple_key_simple_attributes'; SELECT 'dictGet existing value'; @@ -66,7 +66,7 @@ CREATE DICTIONARY 01684_database_for_cache_dictionary.cache_dictionary_simple_ke PRIMARY KEY id SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_complex_attributes_source_table')) LIFETIME(MIN 1 MAX 1000) -LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/home/yetti/Documents/ClickHouse/build_address/programs/1d')); +LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/1d')); SELECT 'Dictionary cache_dictionary_simple_key_complex_attributes'; SELECT 'dictGet existing value'; @@ -108,7 +108,7 @@ CREATE DICTIONARY 
01684_database_for_cache_dictionary.cache_dictionary_simple_ke PRIMARY KEY id SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_hierarchy_table')) LIFETIME(MIN 1 MAX 1000) -LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/home/yetti/Documents/ClickHouse/build_address/programs/2d')); +LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/2d')); SELECT 'Dictionary cache_dictionary_simple_key_hierarchy'; SELECT 'dictGet'; diff --git a/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql b/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql index f65aa445284..03a7e1d80df 100644 --- a/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql +++ b/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql @@ -24,7 +24,7 @@ CREATE DICTIONARY 01685_database_for_cache_dictionary.cache_dictionary_complex_k PRIMARY KEY id, id_key SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'complex_key_simple_attributes_source_table' DB '01685_database_for_cache_dictionary')) LIFETIME(MIN 1 MAX 1000) -LAYOUT(COMPLEX_KEY_SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/home/yetti/Documents/ClickHouse/build_address/programs/0d')); +LAYOUT(COMPLEX_KEY_SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/0d')); SELECT 'Dictionary cache_dictionary_complex_key_simple_attributes'; SELECT 'dictGet existing value'; @@ -71,7 +71,7 @@ CREATE DICTIONARY 01685_database_for_cache_dictionary.cache_dictionary_complex_k PRIMARY KEY id, id_key SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'complex_key_complex_attributes_source_table' DB '01685_database_for_cache_dictionary')) LIFETIME(MIN 1 MAX 1000) -LAYOUT(COMPLEX_KEY_SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/home/yetti/Documents/ClickHouse/build_address/programs/1d')); +LAYOUT(COMPLEX_KEY_SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/1d')); SELECT 'Dictionary cache_dictionary_complex_key_complex_attributes'; SELECT 'dictGet existing value'; From 0137a6baac723f94c3ee5401bbde98b2e0c51379 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 18 Mar 2021 23:55:11 +0300 Subject: [PATCH 046/260] Add test founding bug --- src/Coordination/NuKeeperSnapshotManager.cpp | 1 + src/Coordination/NuKeeperStorage.cpp | 11 +++ src/Coordination/NuKeeperStorage.h | 4 ++ .../src/jepsen/nukeeper/counter.clj | 7 +- .../src/jepsen/nukeeper/main.clj | 9 +-- .../src/jepsen/nukeeper/nemesis.clj | 6 +- .../src/jepsen/nukeeper/queue.clj | 67 +++++++++++++++++++ .../src/jepsen/nukeeper/set.clj | 7 +- .../src/jepsen/nukeeper/unique.clj | 7 +- .../src/jepsen/nukeeper/utils.clj | 36 +++++++++- .../test/jepsen/nukeeper_test.clj | 27 +++++--- 11 files changed, 150 insertions(+), 32 deletions(-) create mode 100644 tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj diff --git a/src/Coordination/NuKeeperSnapshotManager.cpp b/src/Coordination/NuKeeperSnapshotManager.cpp index f5a97619976..5cc7bc356be 100644 --- a/src/Coordination/NuKeeperSnapshotManager.cpp +++ b/src/Coordination/NuKeeperSnapshotManager.cpp @@ -161,6 +161,7 @@ void NuKeeperStorageSnapshot::serialize(const NuKeeperStorageSnapshot & snapshot SnapshotMetadataPtr NuKeeperStorageSnapshot::deserialize(NuKeeperStorage & storage, ReadBuffer & in) { + storage.clearData(); uint8_t version; readBinary(version, in); if (static_cast(version) > SnapshotVersion::V0) diff --git a/src/Coordination/NuKeeperStorage.cpp 
b/src/Coordination/NuKeeperStorage.cpp index 2440d6f6613..0b773aeaafd 100644 --- a/src/Coordination/NuKeeperStorage.cpp +++ b/src/Coordination/NuKeeperStorage.cpp @@ -752,4 +752,15 @@ void NuKeeperStorage::clearDeadWatches(int64_t session_id) } } +void NuKeeperStorage::clearData() +{ + container.clear(); + ephemerals.clear(); + sessions_and_watchers.clear(); + session_expiry_queue.clear(); + session_and_timeout.clear(); + session_id_counter = 1; + zxid = 0; +} + } diff --git a/src/Coordination/NuKeeperStorage.h b/src/Coordination/NuKeeperStorage.h index c49df88159f..b44a077c277 100644 --- a/src/Coordination/NuKeeperStorage.h +++ b/src/Coordination/NuKeeperStorage.h @@ -82,6 +82,8 @@ public: public: NuKeeperStorage(int64_t tick_time_ms); + void clearData(); + int64_t getSessionID(int64_t session_timeout_ms) { auto result = session_id_counter++; @@ -131,4 +133,6 @@ public: } }; +using NuKeeperStoragePtr = std::unique_ptr; + } diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj index 1bdf3f89186..48b270517a4 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj @@ -1,5 +1,5 @@ (ns jepsen.nukeeper.counter - (:require + (:require [clojure.tools.logging :refer :all] [jepsen [checker :as checker] @@ -9,11 +9,9 @@ [zookeeper :as zk]) (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) - (defn r [_ _] {:type :invoke, :f :read}) (defn add [_ _] {:type :invoke, :f :add, :value (rand-int 5)}) - (defrecord CounterClient [conn nodename] client/Client (open! [this test node] @@ -39,7 +37,8 @@ (teardown! [_ test]) - (close! [_ test])) + (close! [_ test] + (zk/close conn))) (defn workload "A generator, client, and checker for a set test." diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 0f9619a7653..b8854638ed0 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -5,6 +5,7 @@ [jepsen.nukeeper.nemesis :as custom-nemesis] [jepsen.nukeeper.register :as register] [jepsen.nukeeper.unique :as unique] + [jepsen.nukeeper.queue :as queue] [jepsen.nukeeper.counter :as counter] [jepsen.nukeeper.constants :refer :all] [clojure.string :as str] @@ -23,7 +24,6 @@ [jepsen.os.ubuntu :as ubuntu] [jepsen.checker.timeline :as timeline] [clojure.java.io :as io] - [knossos.model :as model] [zookeeper.data :as data] [zookeeper :as zk]) (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) @@ -69,7 +69,7 @@ (info node "tearing down clickhouse") (cu/stop-daemon! (str binary-path "/clickhouse") pidfile) (c/su - ;(c/exec :rm :-f (str binary-path "/clickhouse")) + (c/exec :rm :-f (str binary-path "/clickhouse")) (c/exec :rm :-rf dir) (c/exec :rm :-rf logdir) (c/exec :rm :-rf "/etc/clickhouse-server"))) @@ -87,7 +87,8 @@ {"set" set/workload "register" register/workload "unique-ids" unique/workload - "counter" counter/workload}) + "counter" counter/workload + "queue" queue/workload}) (def cli-opts "Additional command line options." 
@@ -130,7 +131,7 @@ opts {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts)) " " (name (:nemesis opts))) :os ubuntu/os - :db (db "rbtorrent:a122093aee0bdcb70ca42d5e5fb4ba5544372f5f") + :db (db "rbtorrent:711cf0ff9281804eb53875d0c12499df1c2a0adc") :pure-generators true :client (:client workload) :nemesis (:nemesis current-nemesis) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index bf22f9ad1f6..59f3cb52dae 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -111,13 +111,13 @@ {"random-node-killer" {:nemesis (random-node-killer-nemesis) :generator (start-stop-generator 5 5)} "all-nodes-killer" {:nemesis (all-nodes-killer-nemesis) - :generator (start-stop-generator 1 10)} + :generator (start-stop-generator 1 10)} "simple-partitioner" {:nemesis (nemesis/partition-random-halves) :generator (start-stop-generator 5 5)} "random-node-hammer-time" {:nemesis (random-node-hammer-time-nemesis) - :generator (start-stop-generator 5 5)} + :generator (start-stop-generator 5 5)} "all-nodes-hammer-time" {:nemesis (all-nodes-hammer-time-nemesis) - :generator (start-stop-generator 1 10)} + :generator (start-stop-generator 1 10)} "logs-corruptor" {:nemesis (logs-corruption-nemesis) :generator (corruption-generator)} "snapshots-corruptor" {:nemesis (snapshots-corruption-nemesis) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj new file mode 100644 index 00000000000..f6f7abb51b6 --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj @@ -0,0 +1,67 @@ +(ns jepsen.nukeeper.queue + (:require + [clojure.tools.logging :refer :all] + [jepsen + [checker :as checker] + [client :as client] + [generator :as gen]] + [jepsen.nukeeper.utils :refer :all] + [zookeeper :as zk]) + (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) + +(defn enqueue [val _ _] {:type :invoke, :f :enqueue :value val}) +(defn dequeue [_ _] {:type :invoke, :f :dequeue}) + +(defrecord QueueClient [conn nodename] + client/Client + (open! [this test node] + (assoc + (assoc this + :conn (zk-connect node 9181 30000)) + :nodename node)) + + (setup! [this test]) + + (invoke! [this test op] + (case (:f op) + :enqueue (try + (do + (zk-create-if-not-exists conn (str "/" (:value op)) "") + (assoc op :type :ok)) + (catch Exception _ (assoc op :type :info, :error :connect-error))) + :dequeue + (try + (let [result (zk-multi-delete-first-child conn "/")] + (if (not (nil? result)) + (assoc op :type :ok :value result) + (assoc op :type :fail :value result))) + (catch KeeperException$BadVersionException _ (assoc op :type :fail, :error :bad-version)) + (catch Exception _ (assoc op :type :info, :error :connect-error))) + :drain + (try + (loop [result '()] + (let [deleted-child (zk-multi-delete-first-child conn "/")] + (if (not (nil? deleted-child)) + (recur (concat result [deleted-child])) + (assoc op :type :ok :value result)))) + (catch Exception _ (assoc op :type :info, :error :connect-error))))) + + (teardown! [_ test]) + + (close! [_ test] + (zk/close conn))) + +(defn sorted-str-range + [n] + (sort (map (fn [v] (str v)) (take n (range))))) + +(defn workload + "A generator, client, and checker for a set test." + [opts] + {:client (QueueClient. 
nil nil) + :checker (checker/total-queue) + :generator (->> (sorted-str-range 10000) + (map (fn [x] + (rand-nth [{:type :invoke, :f :enqueue :value x} + {:type :invoke, :f :dequeue}])))) + :final-generator (gen/once {:type :invoke, :f :drain, :value nil})}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj index c30ec9635a1..3213042a3cc 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -23,8 +23,8 @@ (invoke! [this test op] (case (:f op) :read (assoc op - :type :ok - :value (read-string (:data (zk-get-str conn k)))) + :type :ok + :value (read-string (:data (zk-get-str conn k)))) :add (try (do (zk-add-to-set conn k (:value op)) @@ -34,7 +34,8 @@ (teardown! [_ test]) - (close! [_ test])) + (close! [_ test] + (zk/close conn))) (defn workload "A generator, client, and checker for a set test." diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj index 9c753dfe0ab..9dfb906bc17 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj @@ -9,10 +9,6 @@ [zookeeper :as zk]) (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) -(defn parse-and-get-counter - [path] - (Integer/parseInt (apply str (take-last 10 (seq (str path)))))) - (defrecord UniqueClient [conn nodename] client/Client (open! [this test node] @@ -33,7 +29,8 @@ (teardown! [_ test]) - (close! [_ test])) + (close! [_ test] + (zk/close conn))) (defn workload "A generator, client, and checker for a set test." diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index 6fd2f3c87f4..fd2b2b5acb3 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -7,8 +7,9 @@ [jepsen.nukeeper.constants :refer :all] [jepsen.control :as c] [clojure.tools.logging :refer :all]) - (:import (org.apache.zookeeper CreateMode - ZooKeeper))) + (:import (org.apache.zookeeper.data Stat) + (org.apache.zookeeper CreateMode + ZooKeeper))) (defn parse-long "Parses a string to a Long. Passes through `nil` and empty strings." @@ -16,6 +17,10 @@ (if (and s (> (count s) 0)) (Long/parseLong s))) +(defn parse-and-get-counter + [path] + (Integer/parseInt (apply str (take-last 10 (seq (str path)))))) + (defn zk-range [] (map (fn [v] (str "/" v)) (range))) @@ -48,6 +53,13 @@ [conn path] (zk/children conn path)) +(defn zk-list-with-stat + [conn path] + (let [stat (new Stat) + children (seq (.getChildren conn path false stat))] + {:children children + :stat (zi/stat-to-map stat)})) + (defn zk-cas [conn path old-value new-value] (let [current-value (zk-get-str conn path)] @@ -81,6 +93,26 @@ CreateMode/PERSISTENT_SEQUENTIAL) (recur (inc i))))))) +(defn zk-parent-path + [path] + (let [rslash_pos (str/last-index-of path "/")] + (if (> rslash_pos 0) + (subs path 0 rslash_pos) + "/"))) + +(defn zk-multi-delete-first-child + [conn path] + (let [{children :children stat :stat} (zk-list-with-stat conn path) + txn (.transaction conn) + first-child (first (sort children))] + (if (not (nil? first-child)) + (do (.check txn path (:version stat)) + (.setData txn path (data/to-bytes "") -1) ; I'm just checking multitransactions + (.delete txn (str path first-child) -1) + (.commit txn) + first-child) + nil))) + (defn clickhouse-alive? 
[node test] (info "Checking server alive on" node) diff --git a/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj b/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj index 1a3e8646574..1981e01ebcb 100644 --- a/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj +++ b/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj @@ -15,16 +15,21 @@ (deftest a-test (testing "nukeeper connection" (let [conn (zk/connect "localhost:9181" :timeout-msec 5000)] - (println (take 10 (zk-range))) - (multidelete conn) - (multicreate conn) - (zk/create-all conn "/0") - (zk/create conn "/0") + ;(println (take 10 (zk-range))) + ;(multidelete conn) + ;(multicreate conn) + ;(zk/create-all conn "/0") + ;(zk/create conn "/0") + ;(println (zk/children conn "/")) + ;(zk/set-data conn "/0" (data/to-bytes "777") -1) + (println (zk-parent-path "/sasds/dasda/das")) + (println (zk-parent-path "/sasds")) + (zk-multi-create-many-seq-nodes conn "/a-" 5) (println (zk/children conn "/")) - (zk/set-data conn "/0" (data/to-bytes "777") -1) - (zk-multi-create-many-seq-nodes conn "/seq-" 5) - (println (zk/children conn "/")) - (Thread/sleep 5000) - (println "VALUE" (data/to-string (:data (zk/data conn "/0")))) - (is (= (data/to-string (:data (zk/data conn "/0"))) "777")) + (println (zk-list-with-stat conn "/")) + (println (zk-multi-delete-first-child conn "/")) + (println (zk-list-with-stat conn "/")) + ;(Thread/sleep 5000) + ;(println "VALUE" (data/to-string (:data (zk/data conn "/0")))) + ;(is (= (data/to-string (:data (zk/data conn "/0"))) "777")) (zk/close conn)))) From 26541471137806f701d7e8c24a9b00c298844cf2 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Mar 2021 00:14:43 +0300 Subject: [PATCH 047/260] Fix on fix --- src/Coordination/NuKeeperStorage.cpp | 2 ++ tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Coordination/NuKeeperStorage.cpp b/src/Coordination/NuKeeperStorage.cpp index 0b773aeaafd..62f998761ea 100644 --- a/src/Coordination/NuKeeperStorage.cpp +++ b/src/Coordination/NuKeeperStorage.cpp @@ -761,6 +761,8 @@ void NuKeeperStorage::clearData() session_and_timeout.clear(); session_id_counter = 1; zxid = 0; + + container.insert("/", Node()); } } diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index b8854638ed0..e852c7c4720 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -131,7 +131,7 @@ opts {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts)) " " (name (:nemesis opts))) :os ubuntu/os - :db (db "rbtorrent:711cf0ff9281804eb53875d0c12499df1c2a0adc") + :db (db "rbtorrent:af3f7a797953f7f359bd3550fe3fd4a68fd27345") :pure-generators true :client (:client workload) :nemesis (:nemesis current-nemesis) From 6aa9039f7dabe289da918f9bbfdbb2950516cabe Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Fri, 19 Mar 2021 02:05:43 +0300 Subject: [PATCH 048/260] float frames and lag/lead_in_frame --- docker/test/performance-comparison/perf.py | 5 +- .../sql-reference/window-functions/index.md | 4 +- src/Core/Field.h | 23 ++ src/Interpreters/WindowDescription.cpp | 49 ++-- src/Interpreters/WindowDescription.h | 5 +- src/Parsers/ASTWindowDefinition.cpp | 6 +- src/Parsers/ExpressionElementParsers.cpp | 54 ++-- src/Processors/Transforms/WindowTransform.cpp | 243 ++++++++++++++++-- src/Processors/Transforms/WindowTransform.h | 9 +- tests/performance/window_functions.xml | 42 +++ 
.../01591_window_functions.reference | 39 +++ .../0_stateless/01591_window_functions.sql | 18 ++ 12 files changed, 412 insertions(+), 85 deletions(-) diff --git a/docker/test/performance-comparison/perf.py b/docker/test/performance-comparison/perf.py index f1c5df146aa..c74da2fe8e3 100755 --- a/docker/test/performance-comparison/perf.py +++ b/docker/test/performance-comparison/perf.py @@ -264,7 +264,7 @@ for query_index in queries_to_run: try: prewarm_id = f'{query_prefix}.prewarm0' # Will also detect too long queries during warmup stage - res = c.execute(q, query_id = prewarm_id, settings = {'max_execution_time': 10}) + res = c.execute(q, query_id = prewarm_id, settings = {'max_execution_time': args.max_query_seconds}) print(f'prewarm\t{query_index}\t{prewarm_id}\t{conn_index}\t{c.last_query.elapsed}') except KeyboardInterrupt: raise @@ -311,7 +311,8 @@ for query_index in queries_to_run: for conn_index, c in enumerate(this_query_connections): try: - res = c.execute(q, query_id = run_id) + res = c.execute(q, query_id = run_id, + settings = {'max_execution_time': args.max_query_seconds}) except Exception as e: # Add query id to the exception to make debugging easier. e.args = (run_id, *e.args) diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md index cbf03a44d46..3d18bc123f9 100644 --- a/docs/en/sql-reference/window-functions/index.md +++ b/docs/en/sql-reference/window-functions/index.md @@ -23,7 +23,9 @@ ClickHouse supports the standard grammar for defining windows and window functio | `GROUPS` frame | not supported | | Calculating aggregate functions over a frame (`sum(value) over (order by time)`) | all aggregate functions are supported | | `rank()`, `dense_rank()`, `row_number()` | supported | -| `lag/lead(value, offset)` | not supported, replace with `any(value) over (.... rows between preceding and preceding)`, or `following` for `lead`| +| `lag/lead(value, offset)` | Not supported. Workarounds: | +| | 1) replace with `any(value) over (.... rows between preceding and preceding)`, or `following` for `lead`| +| | 2) use `lag_in_frame/lead_in_frame`, which are analogous, but respect the window frame. To get behavior identical to `lag/lead`, use `rows between unbounded preceding and unbounded following` | ## References diff --git a/src/Core/Field.h b/src/Core/Field.h index 3a52186167f..30c3938e455 100644 --- a/src/Core/Field.h +++ b/src/Core/Field.h @@ -946,3 +946,26 @@ void writeFieldText(const Field & x, WriteBuffer & buf); String toString(const Field & x); } + +template <> +struct fmt::formatter +{ + constexpr auto parse(format_parse_context & ctx) + { + auto it = ctx.begin(); + auto end = ctx.end(); + + /// Only support {}. + if (it != end && *it != '}') + throw format_error("invalid format"); + + return it; + } + + template + auto format(const DB::Field & x, FormatContext & ctx) + { + return format_to(ctx.out(), "{}", toString(x)); + } +}; + diff --git a/src/Interpreters/WindowDescription.cpp b/src/Interpreters/WindowDescription.cpp index e922f49c896..e81a1d3235c 100644 --- a/src/Interpreters/WindowDescription.cpp +++ b/src/Interpreters/WindowDescription.cpp @@ -1,5 +1,6 @@ #include +#include #include #include @@ -60,7 +61,7 @@ void WindowFrame::toString(WriteBuffer & buf) const } else { - buf << abs(begin_offset); + buf << applyVisitor(FieldVisitorToString(), begin_offset); buf << " " << (begin_preceding ? 
"PRECEDING" : "FOLLOWING"); } @@ -77,7 +78,7 @@ void WindowFrame::toString(WriteBuffer & buf) const } else { - buf << abs(end_offset); + buf << applyVisitor(FieldVisitorToString(), end_offset); buf << " " << (end_preceding ? "PRECEDING" : "FOLLOWING"); } @@ -121,23 +122,37 @@ void WindowFrame::checkValid() const if (end_type == BoundaryType::Offset && begin_type == BoundaryType::Offset) { - // Frame starting with following rows can't have preceding rows. - if (!(end_preceding && !begin_preceding)) + // Frame start offset must be less or equal that the frame end offset. + bool begin_less_equal_end; + if (begin_preceding && end_preceding) { - // Frame start offset must be less or equal that the frame end offset. - const bool begin_before_end - = begin_offset * (begin_preceding ? -1 : 1) - <= end_offset * (end_preceding ? -1 : 1); - - if (!begin_before_end) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Frame start offset {} {} does not precede the frame end offset {} {}", - begin_offset, begin_preceding ? "PRECEDING" : "FOLLOWING", - end_offset, end_preceding ? "PRECEDING" : "FOLLOWING"); - } - return; + begin_less_equal_end = begin_offset >= end_offset; } + else if (begin_preceding && !end_preceding) + { + begin_less_equal_end = true; + } + else if (!begin_preceding && end_preceding) + { + begin_less_equal_end = false; + } + else if (!begin_preceding && !end_preceding) + { + begin_less_equal_end = begin_offset <= end_offset; + } + else + { + assert(false); + } + + if (!begin_less_equal_end) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Frame start offset {} {} does not precede the frame end offset {} {}", + begin_offset, begin_preceding ? "PRECEDING" : "FOLLOWING", + end_offset, end_preceding ? "PRECEDING" : "FOLLOWING"); + } + return; } throw Exception(ErrorCodes::BAD_ARGUMENTS, diff --git a/src/Interpreters/WindowDescription.h b/src/Interpreters/WindowDescription.h index faad4649f91..70a4e0e44e0 100644 --- a/src/Interpreters/WindowDescription.h +++ b/src/Interpreters/WindowDescription.h @@ -44,14 +44,13 @@ struct WindowFrame // Offset might be both preceding and following, controlled by begin_preceding, // but the offset value must be positive. BoundaryType begin_type = BoundaryType::Unbounded; - // This should have been a Field but I'm getting some crazy linker errors. - int64_t begin_offset = 0; + Field begin_offset = 0; bool begin_preceding = true; // Here as well, Unbounded can only be UNBOUNDED FOLLOWING, and end_preceding // must be false. BoundaryType end_type = BoundaryType::Current; - int64_t end_offset = 0; + Field end_offset = 0; bool end_preceding = false; diff --git a/src/Parsers/ASTWindowDefinition.cpp b/src/Parsers/ASTWindowDefinition.cpp index aee951fc1f3..ff08bda65ed 100644 --- a/src/Parsers/ASTWindowDefinition.cpp +++ b/src/Parsers/ASTWindowDefinition.cpp @@ -70,7 +70,8 @@ void ASTWindowDefinition::formatImpl(const FormatSettings & settings, } else { - settings.ostr << abs(frame.begin_offset); + settings.ostr << applyVisitor(FieldVisitorToString(), + frame.begin_offset); settings.ostr << " " << (!frame.begin_preceding ? "FOLLOWING" : "PRECEDING"); } @@ -85,7 +86,8 @@ void ASTWindowDefinition::formatImpl(const FormatSettings & settings, } else { - settings.ostr << abs(frame.end_offset); + settings.ostr << applyVisitor(FieldVisitorToString(), + frame.end_offset); settings.ostr << " " << (!frame.end_preceding ? 
"FOLLOWING" : "PRECEDING"); } diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index a54573432a1..39e3a0af5b7 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -581,30 +581,20 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p else if (parser_literal.parse(pos, ast_literal, expected)) { const Field & value = ast_literal->as().value; - if (!isInt64FieldType(value.getType())) + if ((node->frame.type == WindowFrame::FrameType::Rows + || node->frame.type == WindowFrame::FrameType::Groups) + && !(value.getType() == Field::Types::UInt64 + || (value.getType() == Field::Types::Int64 + && value.get() >= 0))) { throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Only integer frame offsets are supported, '{}' is not supported.", + "Frame offset for '{}' frame must be a nonnegative integer, '{}' of type '{}' given.", + WindowFrame::toString(node->frame.type), + applyVisitor(FieldVisitorToString(), value), Field::Types::toString(value.getType())); } - node->frame.begin_offset = value.get(); + node->frame.begin_offset = value; node->frame.begin_type = WindowFrame::BoundaryType::Offset; - // We can easily get a UINT64_MAX here, which doesn't even fit into - // int64_t. Not sure what checks we are going to need here after we - // support floats and dates. - if (node->frame.begin_offset > INT_MAX || node->frame.begin_offset < INT_MIN) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Frame offset must be between {} and {}, but {} is given", - INT_MAX, INT_MIN, node->frame.begin_offset); - } - - if (node->frame.begin_offset < 0) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Frame start offset must be greater than zero, {} given", - node->frame.begin_offset); - } } else { @@ -652,28 +642,20 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p else if (parser_literal.parse(pos, ast_literal, expected)) { const Field & value = ast_literal->as().value; - if (!isInt64FieldType(value.getType())) + if ((node->frame.type == WindowFrame::FrameType::Rows + || node->frame.type == WindowFrame::FrameType::Groups) + && !(value.getType() == Field::Types::UInt64 + || (value.getType() == Field::Types::Int64 + && value.get() >= 0))) { throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Only integer frame offsets are supported, '{}' is not supported.", + "Frame offset for '{}' frame must be a nonnegative integer, '{}' of type '{}' given.", + WindowFrame::toString(node->frame.type), + applyVisitor(FieldVisitorToString(), value), Field::Types::toString(value.getType())); } - node->frame.end_offset = value.get(); + node->frame.end_offset = value; node->frame.end_type = WindowFrame::BoundaryType::Offset; - - if (node->frame.end_offset > INT_MAX || node->frame.end_offset < INT_MIN) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Frame offset must be between {} and {}, but {} is given", - INT_MAX, INT_MIN, node->frame.end_offset); - } - - if (node->frame.end_offset < 0) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Frame end offset must be greater than zero, {} given", - node->frame.end_offset); - } } else { diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 0013e0061e2..a8e0ed8519b 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -27,7 +28,8 @@ public: virtual 
~IWindowFunction() = default; // Must insert the result for current_row. - virtual void windowInsertResultInto(IColumn & to, const WindowTransform * transform) = 0; + virtual void windowInsertResultInto(const WindowTransform * transform, + size_t function_index) = 0; }; // Compares ORDER BY column values at given rows to find the boundaries of frame: @@ -37,7 +39,7 @@ template static int compareValuesWithOffset(const IColumn * _compared_column, size_t compared_row, const IColumn * _reference_column, size_t reference_row, - uint64_t _offset, + const Field & _offset, bool offset_is_preceding) { // Casting the columns to the known type here makes it faster, probably @@ -46,7 +48,7 @@ static int compareValuesWithOffset(const IColumn * _compared_column, _compared_column); const auto * reference_column = assert_cast( _reference_column); - const auto offset = static_cast(_offset); + const auto offset = _offset.get(); const auto compared_value_data = compared_column->getDataAt(compared_row); assert(compared_value_data.size == sizeof(typename ColumnType::ValueType)); @@ -101,6 +103,54 @@ static int compareValuesWithOffset(const IColumn * _compared_column, } } +// A specialization of compareValuesWithOffset for floats. +template +static int compareValuesWithOffsetFloat(const IColumn * _compared_column, + size_t compared_row, const IColumn * _reference_column, + size_t reference_row, + const Field & _offset, + bool offset_is_preceding) +{ + // Casting the columns to the known type here makes it faster, probably + // because the getData call can be devirtualized. + const auto * compared_column = assert_cast( + _compared_column); + const auto * reference_column = assert_cast( + _reference_column); + // The underlying field type is Float64 for Float32 as well. get() + // would be a reinterpret_cast and yield an incorrect result. + const auto offset = _offset.get(); + + const auto compared_value_data = compared_column->getDataAt(compared_row); + assert(compared_value_data.size == sizeof(typename ColumnType::ValueType)); + auto compared_value = unalignedLoad( + compared_value_data.data); + + const auto reference_value_data = reference_column->getDataAt(reference_row); + assert(reference_value_data.size == sizeof(typename ColumnType::ValueType)); + auto reference_value = unalignedLoad( + reference_value_data.data); + + // Floats overflow to Inf and the comparison will work normally, so we don't + // have to do anything. + if (offset_is_preceding) + { + reference_value -= offset; + } + else + { + reference_value += offset; + } + + const auto result = compared_value < reference_value ? -1 + : compared_value == reference_value ? 0 : 1; + +// fmt::print(stderr, "compared {}, offset {}, reference {}, result {}\n", +// compared_value, offset, reference_value, result); + + return result; +} + // Helper macros to dispatch on type of the ORDER BY column #define APPLY_FOR_ONE_TYPE(FUNCTION, TYPE) \ else if (typeid_cast(column)) \ @@ -114,14 +164,20 @@ if (false) /* NOLINT */ \ { \ /* Do nothing, a starter condition. 
*/ \ } \ -APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ -APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ -APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ -APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ +\ +APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ +APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ +APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ +APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ +APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ +\ +APPLY_FOR_ONE_TYPE(FUNCTION##Float, ColumnVector) \ +APPLY_FOR_ONE_TYPE(FUNCTION##Float, ColumnVector) \ +\ else \ { \ throw Exception(ErrorCodes::NOT_IMPLEMENTED, \ @@ -193,9 +249,28 @@ WindowTransform::WindowTransform(const Block & input_header_, == WindowFrame::BoundaryType::Offset)) { assert(order_by_indices.size() == 1); - const IColumn * column = input_header.getByPosition( - order_by_indices[0]).column.get(); + const auto & entry = input_header.getByPosition(order_by_indices[0]); + const IColumn * column = entry.column.get(); APPLY_FOR_TYPES(compareValuesWithOffset) + + // Check that the offset type matches the window type. + // Convert the offsets to the ORDER BY column type. We can't just check + // that it matches, because e.g. the int literals are always (U)Int64, + // but the column might be Int8 and so on. + if (window_description.frame.begin_type + == WindowFrame::BoundaryType::Offset) + { + window_description.frame.begin_offset = convertFieldToTypeOrThrow( + window_description.frame.begin_offset, + *entry.type); + } + if (window_description.frame.end_type + == WindowFrame::BoundaryType::Offset) + { + window_description.frame.end_offset = convertFieldToTypeOrThrow( + window_description.frame.end_offset, + *entry.type); + } } } @@ -391,7 +466,7 @@ void WindowTransform::advanceFrameStartRowsOffset() { // Just recalculate it each time by walking blocks. const auto [moved_row, offset_left] = moveRowNumber(current_row, - window_description.frame.begin_offset + window_description.frame.begin_offset.get() * (window_description.frame.begin_preceding ? -1 : 1)); frame_start = moved_row; @@ -638,7 +713,7 @@ void WindowTransform::advanceFrameEndRowsOffset() // Walk the specified offset from the current row. The "+1" is needed // because the frame_end is a past-the-end pointer. const auto [moved_row, offset_left] = moveRowNumber(current_row, - window_description.frame.end_offset + window_description.frame.end_offset.get() * (window_description.frame.end_preceding ? -1 : 1) + 1); @@ -852,14 +927,14 @@ void WindowTransform::writeOutCurrentRow() for (size_t wi = 0; wi < workspaces.size(); ++wi) { auto & ws = workspaces[wi]; - IColumn * result_column = block.output_columns[wi].get(); if (ws.window_function_impl) { - ws.window_function_impl->windowInsertResultInto(*result_column, this); + ws.window_function_impl->windowInsertResultInto(this, wi); } else { + IColumn * result_column = block.output_columns[wi].get(); const auto * a = ws.aggregate_function.get(); auto * buf = ws.aggregate_function_state.data(); // FIXME does it also allocate the result on the arena? 
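(Editor's aside, not part of the patch: the WindowTransform.cpp hunks above convert frame offsets to the ORDER BY column type and add float offset comparison, and the hunk below introduces the lag/lead-in-frame window function, so a short usage sketch of what this enables may help. The queries below are illustrative only — the ad-hoc table built with the values() table function, the column names ts/value, and the allow_experimental_window_functions setting are assumptions about how window functions are gated at this point in the series, not excerpts from the changed tests.)

-- Minimal sketch, assuming window functions are still behind the experimental setting:
SET allow_experimental_window_functions = 1;

-- A RANGE frame with a fractional offset over a Float64 ORDER BY column,
-- which is what the float offset comparison added above makes possible:
SELECT
    ts,
    value,
    sum(value) OVER (ORDER BY ts ASC RANGE BETWEEN 1.5 PRECEDING AND 1.5 FOLLOWING) AS sum_within_1_5
FROM values('ts Float64, value Int64', (0.5, 1), (1.0, 2), (2.9, 3));

-- lag_in_frame with the frame recommended in the updated documentation,
-- next to the older any(...) workaround for comparison:
SELECT
    ts,
    value,
    lag_in_frame(value, 1) OVER (ORDER BY ts ASC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS prev_value,
    any(value) OVER (ORDER BY ts ASC ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS prev_value_via_any
FROM values('ts Float64, value Int64', (0.5, 1), (1.0, 2), (2.9, 3));

With the ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING frame, lag_in_frame behaves like the standard lag() described in the updated window-functions documentation; an optional third argument supplies the default value to return when the offset row is outside the frame (per the constructor checks below, it must be convertible to the value type).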
@@ -1275,8 +1350,11 @@ struct WindowFunctionRank final : public WindowFunction DataTypePtr getReturnType() const override { return std::make_shared(); } - void windowInsertResultInto(IColumn & to, const WindowTransform * transform) override + void windowInsertResultInto(const WindowTransform * transform, + size_t function_index) override { + IColumn & to = *transform->blockAt(transform->current_row) + .output_columns[function_index]; assert_cast(to).getData().push_back( transform->peer_group_start_row_number); } @@ -1292,8 +1370,11 @@ struct WindowFunctionDenseRank final : public WindowFunction DataTypePtr getReturnType() const override { return std::make_shared(); } - void windowInsertResultInto(IColumn & to, const WindowTransform * transform) override + void windowInsertResultInto(const WindowTransform * transform, + size_t function_index) override { + IColumn & to = *transform->blockAt(transform->current_row) + .output_columns[function_index]; assert_cast(to).getData().push_back( transform->peer_group_number); } @@ -1309,13 +1390,122 @@ struct WindowFunctionRowNumber final : public WindowFunction DataTypePtr getReturnType() const override { return std::make_shared(); } - void windowInsertResultInto(IColumn & to, const WindowTransform * transform) override + void windowInsertResultInto(const WindowTransform * transform, + size_t function_index) override { + IColumn & to = *transform->blockAt(transform->current_row) + .output_columns[function_index]; assert_cast(to).getData().push_back( transform->current_row_number); } }; +template +struct WindowFunctionLagLeadInFrame final : public WindowFunction +{ + WindowFunctionLagLeadInFrame(const std::string & name_, + const DataTypes & argument_types_, const Array & parameters_) + : WindowFunction(name_, argument_types_, parameters_) + { + if (!parameters.empty()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Function {} cannot be parameterized", name_); + } + + if (argument_types.empty()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Function {} takes at least one argument", name_); + } + + if (argument_types.size() == 1) + { + return; + } + + if (!isInt64FieldType(argument_types[1]->getDefault().getType())) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Offset must be an integer, '{}' given", + argument_types[1]->getName()); + } + + if (argument_types.size() == 2) + { + return; + } + + if (!getLeastSupertype({argument_types[0], argument_types[2]})) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "The default value type '{}' is not convertible to the argument type '{}'", + argument_types[2]->getName(), + argument_types[0]->getName()); + } + + if (argument_types.size() > 3) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Function '{}' accepts at most 3 arguments, {} given", + name, argument_types.size()); + } + } + + DataTypePtr getReturnType() const override + { return argument_types[0]; } + + void windowInsertResultInto(const WindowTransform * transform, + size_t function_index) override + { + auto & current_block = transform->blockAt(transform->current_row); + IColumn & to = *current_block.output_columns[function_index]; + auto & workspace = transform->workspaces[function_index]; + + int offset = 1; + if (argument_types.size() > 1) + { + offset = (*current_block.input_columns[ + workspace.argument_column_indices[1]])[ + transform->current_row.row].get(); + if (offset < 0) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "The offset for function {} must be nonnegative, {} given", + getName(), offset); + } + } + + const 
auto [target_row, offset_left] = transform->moveRowNumber( + transform->current_row, offset * (is_lead ? 1 : -1)); + + if (offset_left != 0 + || target_row < transform->frame_start + || transform->frame_end <= target_row) + { + // Offset is outside the frame. + if (argument_types.size() > 2) + { + // Column with default values is specified. + to.insertFrom(*current_block.input_columns[ + workspace.argument_column_indices[2]], + transform->current_row.row); + } + else + { + to.insertDefault(); + } + } + else + { + // Offset is inside the frame. + to.insertFrom(*transform->blockAt(target_row).input_columns[ + workspace.argument_column_indices[0]], + target_row.row); + } + } +}; + void registerWindowFunctions(AggregateFunctionFactory & factory) { // Why didn't I implement lag/lead yet? Because they are a mess. I imagine @@ -1327,9 +1517,10 @@ void registerWindowFunctions(AggregateFunctionFactory & factory) // the whole partition like Postgres does, because using a linear amount // of additional memory is not an option when we have a lot of data. We must // be able to process at least the lag/lead in streaming fashion. - // Our best bet is probably rewriting, say `lag(value, offset)` to - // `any(value) over (rows between offset preceding and offset preceding)`, - // at the query planning stage. + // A partial solution for constant offsets is rewriting, say `lag(value, offset) + // to `any(value) over (rows between offset preceding and offset preceding)`. + // We also implement non-standard functions `lag/lead_in_frame`, that are + // analogous to `lag/lead`, but respect the frame. // Functions like cume_dist() do require materializing the entire // partition, but it's probably also simpler to implement them by rewriting // to a (rows between unbounded preceding and unbounded following) frame, @@ -1355,6 +1546,20 @@ void registerWindowFunctions(AggregateFunctionFactory & factory) return std::make_shared(name, argument_types, parameters); }); + + factory.registerFunction("lag_in_frame", [](const std::string & name, + const DataTypes & argument_types, const Array & parameters) + { + return std::make_shared>( + name, argument_types, parameters); + }); + + factory.registerFunction("lead_in_frame", [](const std::string & name, + const DataTypes & argument_types, const Array & parameters) + { + return std::make_shared>( + name, argument_types, parameters); + }); } } diff --git a/src/Processors/Transforms/WindowTransform.h b/src/Processors/Transforms/WindowTransform.h index 5001b984e9a..882bf429c0a 100644 --- a/src/Processors/Transforms/WindowTransform.h +++ b/src/Processors/Transforms/WindowTransform.h @@ -110,7 +110,9 @@ public: Status prepare() override; void work() override; -private: + /* + * Implementation details. + */ void advancePartitionEnd(); bool arePeers(const RowNumber & x, const RowNumber & y) const; @@ -321,10 +323,7 @@ public: int (* compare_values_with_offset) ( const IColumn * compared_column, size_t compared_row, const IColumn * reference_column, size_t reference_row, - // We can make it a Field later if we need the Decimals. Now we only - // have ints and datetime, and the underlying Field type for them is - // uint64_t anyway. 
- uint64_t offset, + const Field & offset, bool offset_is_preceding); }; diff --git a/tests/performance/window_functions.xml b/tests/performance/window_functions.xml index 622e349d060..8db168b1a97 100644 --- a/tests/performance/window_functions.xml +++ b/tests/performance/window_functions.xml @@ -110,4 +110,46 @@ format Null + + + select lead_in_frame(number) over w + from + (select number, intDiv(number, 1111) p, mod(number, 111) o + from numbers(10000000)) t + window w as (partition by p order by o + rows between unbounded preceding and unbounded following) + format Null + + + + + select any(number) over w + from + (select number, intDiv(number, 1111) p, mod(number, 111) o + from numbers(10000000)) t + window w as (partition by p order by o + rows between 1 following and 1 following) + format Null + + + + select lead_in_frame(number, number) over w + from + (select number, intDiv(number, 1111) p, mod(number, 111) o + from numbers(10000000)) t + window w as (partition by p order by o + rows between unbounded preceding and unbounded following) + format Null + + + + select lead_in_frame(number, number, number) over w + from + (select number, intDiv(number, 1111) p, mod(number, 111) o + from numbers(10000000)) t + window w as (partition by p order by o + rows between unbounded preceding and unbounded following) + format Null + + diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index d2543f0db75..a1130fc51d7 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -974,6 +974,32 @@ from numbers(5); 1 3 2 4 3 \N +-- variants of lag/lead that respect the frame +select number, p, pp, + lag_in_frame(number, number - pp, number * 11) over w as lag, + lead_in_frame(number, number - pp, number * 11) over w as lead +from (select number, intDiv(number, 5) p, p * 5 pp from numbers(16)) +window w as (partition by p order by number + rows between unbounded preceding and unbounded following) +order by number +settings max_block_size = 3; +; +0 0 0 0 0 +1 0 0 0 2 +2 0 0 0 4 +3 0 0 0 33 +4 0 0 0 44 +5 1 5 5 5 +6 1 5 5 7 +7 1 5 5 9 +8 1 5 5 88 +9 1 5 5 99 +10 2 10 10 10 +11 2 10 10 12 +12 2 10 10 14 +13 2 10 10 143 +14 2 10 10 154 +15 3 15 15 15 -- case-insensitive SQL-standard synonyms for any and anyLast select number, @@ -993,3 +1019,16 @@ order by number 7 6 8 8 7 9 9 8 9 +-- floating point RANGE frame +select + count(*) over (order by (toFloat32(number) as f32) range 5. preceding), + count(*) over (order by (toFloat64(number) as f64) range 5. 
preceding) +from numbers(7) +; +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +6 6 diff --git a/tests/queries/0_stateless/01591_window_functions.sql b/tests/queries/0_stateless/01591_window_functions.sql index 03bd8371e23..9ac009e672d 100644 --- a/tests/queries/0_stateless/01591_window_functions.sql +++ b/tests/queries/0_stateless/01591_window_functions.sql @@ -336,6 +336,17 @@ select over (order by number rows between 1 following and 1 following) from numbers(5); +-- variants of lag/lead that respect the frame +select number, p, pp, + lag_in_frame(number, number - pp, number * 11) over w as lag, + lead_in_frame(number, number - pp, number * 11) over w as lead +from (select number, intDiv(number, 5) p, p * 5 pp from numbers(16)) +window w as (partition by p order by number + rows between unbounded preceding and unbounded following) +order by number +settings max_block_size = 3; +; + -- case-insensitive SQL-standard synonyms for any and anyLast select number, @@ -345,3 +356,10 @@ from numbers(10) window w as (order by number range between 1 preceding and 1 following) order by number ; + +-- floating point RANGE frame +select + count(*) over (order by (toFloat32(number) as f32) range 5. preceding), + count(*) over (order by (toFloat64(number) as f64) range 5. preceding) +from numbers(7) +; From 4bf2e94fa48d49ae734e9598e7b941a9b3066b9e Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Fri, 19 Mar 2021 03:02:35 +0300 Subject: [PATCH 049/260] clang is too smart --- src/Interpreters/WindowDescription.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/Interpreters/WindowDescription.cpp b/src/Interpreters/WindowDescription.cpp index e81a1d3235c..a97ef41204a 100644 --- a/src/Interpreters/WindowDescription.cpp +++ b/src/Interpreters/WindowDescription.cpp @@ -136,14 +136,10 @@ void WindowFrame::checkValid() const { begin_less_equal_end = false; } - else if (!begin_preceding && !end_preceding) + else /* if (!begin_preceding && !end_preceding) */ { begin_less_equal_end = begin_offset <= end_offset; } - else - { - assert(false); - } if (!begin_less_equal_end) { From 81c408cb7f8bd11a121906be33cec4b6e5770553 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Mar 2021 11:08:43 +0300 Subject: [PATCH 050/260] Return meta and storage from snapshot --- src/Coordination/NuKeeperSnapshotManager.cpp | 18 ++++--- src/Coordination/NuKeeperSnapshotManager.h | 10 ++-- src/Coordination/NuKeeperStateMachine.cpp | 26 +++++----- src/Coordination/NuKeeperStateMachine.h | 4 +- src/Coordination/NuKeeperStorage.cpp | 13 ----- src/Coordination/NuKeeperStorage.h | 2 - src/Coordination/tests/gtest_for_build.cpp | 52 +++++++++---------- .../src/jepsen/nukeeper/main.clj | 4 +- 8 files changed, 59 insertions(+), 70 deletions(-) diff --git a/src/Coordination/NuKeeperSnapshotManager.cpp b/src/Coordination/NuKeeperSnapshotManager.cpp index 5cc7bc356be..1caa1ea94b8 100644 --- a/src/Coordination/NuKeeperSnapshotManager.cpp +++ b/src/Coordination/NuKeeperSnapshotManager.cpp @@ -161,7 +161,6 @@ void NuKeeperStorageSnapshot::serialize(const NuKeeperStorageSnapshot & snapshot SnapshotMetadataPtr NuKeeperStorageSnapshot::deserialize(NuKeeperStorage & storage, ReadBuffer & in) { - storage.clearData(); uint8_t version; readBinary(version, in); if (static_cast(version) > SnapshotVersion::V0) @@ -242,9 +241,10 @@ NuKeeperStorageSnapshot::~NuKeeperStorageSnapshot() storage->disableSnapshotMode(); } -NuKeeperSnapshotManager::NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_) 
+NuKeeperSnapshotManager::NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_, size_t storage_tick_time_) : snapshots_path(snapshots_path_) , snapshots_to_keep(snapshots_to_keep_) + , storage_tick_time(storage_tick_time_) { namespace fs = std::filesystem; @@ -326,22 +326,24 @@ nuraft::ptr NuKeeperSnapshotManager::serializeSnapshotToBuffer(c return writer.getBuffer(); } -SnapshotMetadataPtr NuKeeperSnapshotManager::deserializeSnapshotFromBuffer(NuKeeperStorage * storage, nuraft::ptr buffer) +SnapshotMetaAndStorage NuKeeperSnapshotManager::deserializeSnapshotFromBuffer(nuraft::ptr buffer) const { ReadBufferFromNuraftBuffer reader(buffer); CompressedReadBuffer compressed_reader(reader); - return NuKeeperStorageSnapshot::deserialize(*storage, compressed_reader); + auto storage = std::make_unique(storage_tick_time); + auto snapshot_metadata = NuKeeperStorageSnapshot::deserialize(*storage, compressed_reader); + return std::make_pair(snapshot_metadata, std::move(storage)); } -SnapshotMetadataPtr NuKeeperSnapshotManager::restoreFromLatestSnapshot(NuKeeperStorage * storage) +SnapshotMetaAndStorage NuKeeperSnapshotManager::restoreFromLatestSnapshot() { if (existing_snapshots.empty()) - return nullptr; + return {}; auto buffer = deserializeLatestSnapshotBufferFromDisk(); if (!buffer) - return nullptr; - return deserializeSnapshotFromBuffer(storage, buffer); + return {}; + return deserializeSnapshotFromBuffer(buffer); } void NuKeeperSnapshotManager::removeOutdatedSnapshotsIfNeeded() diff --git a/src/Coordination/NuKeeperSnapshotManager.h b/src/Coordination/NuKeeperSnapshotManager.h index 422baf11a65..d844a52eaf4 100644 --- a/src/Coordination/NuKeeperSnapshotManager.h +++ b/src/Coordination/NuKeeperSnapshotManager.h @@ -40,17 +40,20 @@ public: using NuKeeperStorageSnapshotPtr = std::shared_ptr; using CreateSnapshotCallback = std::function; + +using SnapshotMetaAndStorage = std::pair; + class NuKeeperSnapshotManager { public: - NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_); + NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_, size_t storage_tick_time_ = 500); - SnapshotMetadataPtr restoreFromLatestSnapshot(NuKeeperStorage * storage); + SnapshotMetaAndStorage restoreFromLatestSnapshot(); static nuraft::ptr serializeSnapshotToBuffer(const NuKeeperStorageSnapshot & snapshot); std::string serializeSnapshotBufferToDisk(nuraft::buffer & buffer, size_t up_to_log_idx); - static SnapshotMetadataPtr deserializeSnapshotFromBuffer(NuKeeperStorage * storage, nuraft::ptr buffer); + SnapshotMetaAndStorage deserializeSnapshotFromBuffer(nuraft::ptr buffer) const; nuraft::ptr deserializeSnapshotBufferFromDisk(size_t up_to_log_idx) const; nuraft::ptr deserializeLatestSnapshotBufferFromDisk(); @@ -74,6 +77,7 @@ private: const std::string snapshots_path; const size_t snapshots_to_keep; std::map existing_snapshots; + size_t storage_tick_time; }; struct CreateSnapshotTask diff --git a/src/Coordination/NuKeeperStateMachine.cpp b/src/Coordination/NuKeeperStateMachine.cpp index 58a7ca3d5bc..32bb4269f20 100644 --- a/src/Coordination/NuKeeperStateMachine.cpp +++ b/src/Coordination/NuKeeperStateMachine.cpp @@ -37,8 +37,7 @@ NuKeeperStorage::RequestForSession parseRequest(nuraft::buffer & data) NuKeeperStateMachine::NuKeeperStateMachine(ResponsesQueue & responses_queue_, SnapshotsQueue & snapshots_queue_, const std::string & snapshots_path_, const CoordinationSettingsPtr & coordination_settings_) : 
coordination_settings(coordination_settings_) - , storage(coordination_settings->dead_session_check_period_ms.totalMilliseconds()) - , snapshot_manager(snapshots_path_, coordination_settings->snapshots_to_keep) + , snapshot_manager(snapshots_path_, coordination_settings->snapshots_to_keep, coordination_settings->dead_session_check_period_ms.totalMicroseconds()) , responses_queue(responses_queue_) , snapshots_queue(snapshots_queue_) , last_committed_idx(0) @@ -60,7 +59,7 @@ void NuKeeperStateMachine::init() try { latest_snapshot_buf = snapshot_manager.deserializeSnapshotBufferFromDisk(latest_log_index); - latest_snapshot_meta = snapshot_manager.deserializeSnapshotFromBuffer(&storage, latest_snapshot_buf); + std::tie(latest_snapshot_meta, storage) = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_buf); last_committed_idx = latest_snapshot_meta->get_last_log_idx(); loaded = true; break; @@ -83,6 +82,9 @@ void NuKeeperStateMachine::init() { LOG_DEBUG(log, "No existing snapshots, last committed log index {}", last_committed_idx); } + + if (!storage) + storage = std::make_unique(coordination_settings->dead_session_check_period_ms.totalMilliseconds()); } nuraft::ptr NuKeeperStateMachine::commit(const size_t log_idx, nuraft::buffer & data) @@ -96,7 +98,7 @@ nuraft::ptr NuKeeperStateMachine::commit(const size_t log_idx, n nuraft::buffer_serializer bs(response); { std::lock_guard lock(storage_lock); - session_id = storage.getSessionID(session_timeout_ms); + session_id = storage->getSessionID(session_timeout_ms); bs.put_i64(session_id); } LOG_DEBUG(log, "Session ID response {} with timeout {}", session_id, session_timeout_ms); @@ -109,7 +111,7 @@ nuraft::ptr NuKeeperStateMachine::commit(const size_t log_idx, n NuKeeperStorage::ResponsesForSessions responses_for_sessions; { std::lock_guard lock(storage_lock); - responses_for_sessions = storage.processRequest(request_for_session.request, request_for_session.session_id, log_idx); + responses_for_sessions = storage->processRequest(request_for_session.request, request_for_session.session_id, log_idx); for (auto & response_for_session : responses_for_sessions) responses_queue.push(response_for_session); } @@ -133,7 +135,7 @@ bool NuKeeperStateMachine::apply_snapshot(nuraft::snapshot & s) { std::lock_guard lock(storage_lock); - snapshot_manager.deserializeSnapshotFromBuffer(&storage, latest_snapshot_ptr); + std::tie(latest_snapshot_meta, storage) = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_ptr); } last_committed_idx = s.get_last_log_idx(); return true; @@ -157,7 +159,7 @@ void NuKeeperStateMachine::create_snapshot( CreateSnapshotTask snapshot_task; { std::lock_guard lock(storage_lock); - snapshot_task.snapshot = std::make_shared(&storage, snapshot_meta_copy); + snapshot_task.snapshot = std::make_shared(storage.get(), snapshot_meta_copy); } snapshot_task.create_snapshot = [this, when_done] (NuKeeperStorageSnapshotPtr && snapshot) @@ -179,7 +181,7 @@ void NuKeeperStateMachine::create_snapshot( { /// Must do it with lock (clearing elements from list) std::lock_guard lock(storage_lock); - storage.clearGarbageAfterSnapshot(); + storage->clearGarbageAfterSnapshot(); /// Destroy snapshot with lock snapshot.reset(); LOG_TRACE(log, "Cleared garbage after snapshot"); @@ -214,7 +216,7 @@ void NuKeeperStateMachine::save_logical_snp_obj( if (obj_id == 0) { std::lock_guard lock(storage_lock); - NuKeeperStorageSnapshot snapshot(&storage, s.get_last_log_idx()); + NuKeeperStorageSnapshot snapshot(storage.get(), s.get_last_log_idx()); 
cloned_buffer = snapshot_manager.serializeSnapshotToBuffer(snapshot); } else @@ -271,7 +273,7 @@ void NuKeeperStateMachine::processReadRequest(const NuKeeperStorage::RequestForS NuKeeperStorage::ResponsesForSessions responses; { std::lock_guard lock(storage_lock); - responses = storage.processRequest(request_for_session.request, request_for_session.session_id, std::nullopt); + responses = storage->processRequest(request_for_session.request, request_for_session.session_id, std::nullopt); } for (const auto & response : responses) responses_queue.push(response); @@ -280,13 +282,13 @@ void NuKeeperStateMachine::processReadRequest(const NuKeeperStorage::RequestForS std::unordered_set NuKeeperStateMachine::getDeadSessions() { std::lock_guard lock(storage_lock); - return storage.getDeadSessions(); + return storage->getDeadSessions(); } void NuKeeperStateMachine::shutdownStorage() { std::lock_guard lock(storage_lock); - storage.finalize(); + storage->finalize(); } } diff --git a/src/Coordination/NuKeeperStateMachine.h b/src/Coordination/NuKeeperStateMachine.h index 905f3448c1a..af9ad6de4d2 100644 --- a/src/Coordination/NuKeeperStateMachine.h +++ b/src/Coordination/NuKeeperStateMachine.h @@ -52,7 +52,7 @@ public: NuKeeperStorage & getStorage() { - return storage; + return *storage; } void processReadRequest(const NuKeeperStorage::RequestForSession & request_for_session); @@ -68,7 +68,7 @@ private: CoordinationSettingsPtr coordination_settings; - NuKeeperStorage storage; + NuKeeperStoragePtr storage; NuKeeperSnapshotManager snapshot_manager; diff --git a/src/Coordination/NuKeeperStorage.cpp b/src/Coordination/NuKeeperStorage.cpp index 62f998761ea..2440d6f6613 100644 --- a/src/Coordination/NuKeeperStorage.cpp +++ b/src/Coordination/NuKeeperStorage.cpp @@ -752,17 +752,4 @@ void NuKeeperStorage::clearDeadWatches(int64_t session_id) } } -void NuKeeperStorage::clearData() -{ - container.clear(); - ephemerals.clear(); - sessions_and_watchers.clear(); - session_expiry_queue.clear(); - session_and_timeout.clear(); - session_id_counter = 1; - zxid = 0; - - container.insert("/", Node()); -} - } diff --git a/src/Coordination/NuKeeperStorage.h b/src/Coordination/NuKeeperStorage.h index b44a077c277..058eed55cab 100644 --- a/src/Coordination/NuKeeperStorage.h +++ b/src/Coordination/NuKeeperStorage.h @@ -82,8 +82,6 @@ public: public: NuKeeperStorage(int64_t tick_time_ms); - void clearData(); - int64_t getSessionID(int64_t session_timeout_ms) { auto result = session_id_counter++; diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index 01146248f63..d90b711498e 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -897,25 +897,25 @@ TEST(CoordinationTest, TestStorageSnapshotSimple) manager.serializeSnapshotBufferToDisk(*buf, 2); EXPECT_TRUE(fs::exists("./snapshots/snapshot_2.bin")); - DB::NuKeeperStorage restored_storage(500); auto debuf = manager.deserializeSnapshotBufferFromDisk(2); - manager.deserializeSnapshotFromBuffer(&restored_storage, debuf); - EXPECT_EQ(restored_storage.container.size(), 3); - EXPECT_EQ(restored_storage.container.getValue("/").children.size(), 1); - EXPECT_EQ(restored_storage.container.getValue("/hello").children.size(), 1); - EXPECT_EQ(restored_storage.container.getValue("/hello/somepath").children.size(), 0); + auto [snapshot_meta, restored_storage] = manager.deserializeSnapshotFromBuffer(debuf); - EXPECT_EQ(restored_storage.container.getValue("/").data, ""); - 
EXPECT_EQ(restored_storage.container.getValue("/hello").data, "world"); - EXPECT_EQ(restored_storage.container.getValue("/hello/somepath").data, "somedata"); - EXPECT_EQ(restored_storage.session_id_counter, 7); - EXPECT_EQ(restored_storage.zxid, 2); - EXPECT_EQ(restored_storage.ephemerals.size(), 2); - EXPECT_EQ(restored_storage.ephemerals[3].size(), 1); - EXPECT_EQ(restored_storage.ephemerals[1].size(), 1); - EXPECT_EQ(restored_storage.session_and_timeout.size(), 2); + EXPECT_EQ(restored_storage->container.size(), 3); + EXPECT_EQ(restored_storage->container.getValue("/").children.size(), 1); + EXPECT_EQ(restored_storage->container.getValue("/hello").children.size(), 1); + EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").children.size(), 0); + + EXPECT_EQ(restored_storage->container.getValue("/").data, ""); + EXPECT_EQ(restored_storage->container.getValue("/hello").data, "world"); + EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").data, "somedata"); + EXPECT_EQ(restored_storage->session_id_counter, 7); + EXPECT_EQ(restored_storage->zxid, 2); + EXPECT_EQ(restored_storage->ephemerals.size(), 2); + EXPECT_EQ(restored_storage->ephemerals[3].size(), 1); + EXPECT_EQ(restored_storage->ephemerals[1].size(), 1); + EXPECT_EQ(restored_storage->session_and_timeout.size(), 2); } TEST(CoordinationTest, TestStorageSnapshotMoreWrites) @@ -946,15 +946,14 @@ TEST(CoordinationTest, TestStorageSnapshotMoreWrites) manager.serializeSnapshotBufferToDisk(*buf, 50); EXPECT_TRUE(fs::exists("./snapshots/snapshot_50.bin")); - DB::NuKeeperStorage restored_storage(500); auto debuf = manager.deserializeSnapshotBufferFromDisk(50); - manager.deserializeSnapshotFromBuffer(&restored_storage, debuf); + auto [meta, restored_storage] = manager.deserializeSnapshotFromBuffer(debuf); - EXPECT_EQ(restored_storage.container.size(), 51); + EXPECT_EQ(restored_storage->container.size(), 51); for (size_t i = 0; i < 50; ++i) { - EXPECT_EQ(restored_storage.container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); + EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); } } @@ -987,14 +986,13 @@ TEST(CoordinationTest, TestStorageSnapshotManySnapshots) EXPECT_TRUE(fs::exists("./snapshots/snapshot_250.bin")); - DB::NuKeeperStorage restored_storage(500); - manager.restoreFromLatestSnapshot(&restored_storage); + auto [meta, restored_storage] = manager.restoreFromLatestSnapshot(); - EXPECT_EQ(restored_storage.container.size(), 251); + EXPECT_EQ(restored_storage->container.size(), 251); for (size_t i = 0; i < 250; ++i) { - EXPECT_EQ(restored_storage.container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); + EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); } } @@ -1040,12 +1038,11 @@ TEST(CoordinationTest, TestStorageSnapshotMode) EXPECT_FALSE(storage.container.contains("/hello_" + std::to_string(i))); } - DB::NuKeeperStorage restored_storage(500); - manager.restoreFromLatestSnapshot(&restored_storage); + auto [meta, restored_storage] = manager.restoreFromLatestSnapshot(); for (size_t i = 0; i < 50; ++i) { - EXPECT_EQ(restored_storage.container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); + EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); } } @@ -1071,8 +1068,7 @@ TEST(CoordinationTest, TestStorageSnapshotBroken) 
plain_buf.truncate(34); plain_buf.sync(); - DB::NuKeeperStorage restored_storage(500); - EXPECT_THROW(manager.restoreFromLatestSnapshot(&restored_storage), DB::Exception); + EXPECT_THROW(manager.restoreFromLatestSnapshot(), DB::Exception); } nuraft::ptr getBufferFromZKRequest(int64_t session_id, const Coordination::ZooKeeperRequestPtr & request) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index e852c7c4720..9ef3ab4ca2d 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -69,7 +69,7 @@ (info node "tearing down clickhouse") (cu/stop-daemon! (str binary-path "/clickhouse") pidfile) (c/su - (c/exec :rm :-f (str binary-path "/clickhouse")) + ;(c/exec :rm :-f (str binary-path "/clickhouse")) (c/exec :rm :-rf dir) (c/exec :rm :-rf logdir) (c/exec :rm :-rf "/etc/clickhouse-server"))) @@ -131,7 +131,7 @@ opts {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts)) " " (name (:nemesis opts))) :os ubuntu/os - :db (db "rbtorrent:af3f7a797953f7f359bd3550fe3fd4a68fd27345") + :db (db "rbtorrent:71c60699aa56568ded73c4a48cecd2fd5e0956cb") :pure-generators true :client (:client workload) :nemesis (:nemesis current-nemesis) From 58eac8a8b4d15699e2bc8d6784d66076fcb4c2d1 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Mar 2021 12:40:59 +0300 Subject: [PATCH 051/260] Add non-symmetric network partitioners --- .../src/jepsen/nukeeper/nemesis.clj | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index 59f3cb52dae..9e5841ad8e4 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -93,6 +93,33 @@ (corruptor-nemesis coordinationdir (fn [path] (c/exec :rm :-fr path)))) +(defn partition-bridge-nemesis + [] + (nemesis/partitioner nemesis/bridge)) + +(defn blind-node + [nodes] + (let [[[victim] others] (nemesis/split-one nodes)] + {victim (into #{} others)})) + + +(defn blind-node-partition-nemesis + [] + (nemesis/partitioner blind-node)) + +(defn blind-others + [nodes] + (let [[[victim] others] (nemesis/split-one nodes)] + (into {} (map (fn [node] [node #{victim}])) others))) + +(defn blind-others-partition-nemesis + [] + (nemesis/partitioner blind-others)) + +(defn network-non-symmetric-nemesis + [] + (nemesis/partitioner nemesis/bridge)) + (defn start-stop-generator [time-corrupt time-ok] (->> @@ -125,4 +152,10 @@ "logs-and-snapshots-corruptor" {:nemesis (logs-and-snapshots-corruption-nemesis) :generator (corruption-generator)} "drop-data-corruptor" {:nemesis (drop-all-corruption-nemesis) - :generator (corruption-generator)}}) + :generator (corruption-generator)} + "bridge-partitioner" {:nemesis (partition-bridge-nemesis) + :generator (start-stop-generator 5 5)} + "blind-node-partitioner" {:nemesis (blind-node-partition-nemesis) + :generator (start-stop-generator 5 5)} + "blind-others-partitioner" {:nemesis (blind-others-partition-nemesis) + :generator (start-stop-generator 5 5)}}) From 260a978636cc4273a49739b8a786a0665652706b Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Mar 2021 13:46:14 +0300 Subject: [PATCH 052/260] Check linearizeability for queue workload --- .../src/jepsen/nukeeper/main.clj | 3 ++- .../src/jepsen/nukeeper/queue.clj | 20 +++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git 
a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 9ef3ab4ca2d..0d93368595b 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -88,7 +88,8 @@ "register" register/workload "unique-ids" unique/workload "counter" counter/workload - "queue" queue/workload}) + "total-queue" queue/total-workload + "linear-queue" queue/linear-workload}) (def cli-opts "Additional command line options." diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj index f6f7abb51b6..fa6b96944b2 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj @@ -5,6 +5,8 @@ [checker :as checker] [client :as client] [generator :as gen]] + [knossos.model :as model] + [jepsen.checker.timeline :as timeline] [jepsen.nukeeper.utils :refer :all] [zookeeper :as zk]) (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) @@ -55,13 +57,27 @@ [n] (sort (map (fn [v] (str v)) (take n (range))))) -(defn workload +(defn total-workload "A generator, client, and checker for a set test." [opts] {:client (QueueClient. nil nil) - :checker (checker/total-queue) + :checker (checker/compose + {:total-queue (checker/total-queue) + :timeline (timeline/html)}) :generator (->> (sorted-str-range 10000) (map (fn [x] (rand-nth [{:type :invoke, :f :enqueue :value x} {:type :invoke, :f :dequeue}])))) :final-generator (gen/once {:type :invoke, :f :drain, :value nil})}) + +(defn linear-workload + [opts] + {:client (QueueClient. nil nil) + :checker (checker/compose + {:linear (checker/linearizable {:model (model/unordered-queue) + :algorithm :linear}) + :timeline (timeline/html)}) + :generator (->> (sorted-str-range 10000) + (map (fn [x] + (rand-nth [{:type :invoke, :f :enqueue :value x} + {:type :invoke, :f :dequeue}]))))}) From 0cb88fb20e3b8da667b54331d31c899ce679c152 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 19 Mar 2021 15:00:21 +0300 Subject: [PATCH 053/260] Restart tests --- src/IO/WriteBufferFromS3.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index e032935b2fc..1ec96c73dfa 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -142,7 +142,6 @@ void WriteBufferFromS3::createMultipartUpload() throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR); } - void WriteBufferFromS3::writePart() { auto size = temporary_buffer->tellp(); From 2c00b48f858763bc0efef83489b12f3dea8f9841 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Mar 2021 15:10:18 +0300 Subject: [PATCH 054/260] Add an ability to run N random tests --- .../src/jepsen/nukeeper/main.clj | 27 +++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 0d93368595b..86297473180 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -1,6 +1,7 @@ (ns jepsen.nukeeper.main (:require [clojure.tools.logging :refer :all] [jepsen.nukeeper.utils :refer :all] + [clojure.pprint :refer [pprint]] [jepsen.nukeeper.set :as set] [jepsen.nukeeper.nemesis :as custom-nemesis] [jepsen.nukeeper.register :as register] @@ -94,10 +95,10 @@ (def cli-opts "Additional 
command line options." [["-w" "--workload NAME" "What workload should we run?" - :missing (str "--workload " (cli/one-of workloads)) + :default "set" :validate [workloads (cli/one-of workloads)]] [nil "--nemesis NAME" "Which nemesis will poison our lives?" - :missing (str "--nemesis " (cli/one-of custom-nemesis/custom-nemesises)) + :default "random-node-killer" :validate [custom-nemesis/custom-nemesises (cli/one-of custom-nemesis/custom-nemesises)]] ["-q" "--quorum" "Use quorum reads, instead of reading from any primary."] ["-r" "--rate HZ" "Approximate number of requests per second, per thread." @@ -125,6 +126,7 @@ "Given an options map from the command line runner (e.g. :nodes, :ssh, :concurrency, ...), constructs a test map." [opts] + (info "Test opts\n" (with-out-str (pprint opts))) (let [quorum (boolean (:quorum opts)) workload ((get workloads (:workload opts)) opts) current-nemesis (get custom-nemesis/custom-nemesises (:nemesis opts))] @@ -150,11 +152,32 @@ (gen/sleep 10) (gen/clients (:final-generator workload)))}))) +(def all-nemesises (keys custom-nemesis/custom-nemesises)) + +(def all-workloads (keys workloads)) + +(defn all-test-options + "Takes base cli options, a collection of nemeses, workloads, and a test count, + and constructs a sequence of test options." + [cli nemeses workloads] + (take (:test-count cli) (shuffle (for [n nemeses, w workloads] + (assoc cli + :nemesis n + :workload w + :test-count 1))))) + +(defn all-tests + "Turns CLI options into a sequence of tests." + [test-fn cli] + (map test-fn (all-test-options cli all-nemesises all-workloads))) + (defn -main "Handles command line arguments. Can either run a test, or a web server for browsing results." [& args] (cli/run! (merge (cli/single-test-cmd {:test-fn nukeeper-test :opt-spec cli-opts}) + (cli/test-all-cmd {:tests-fn (partial all-tests nukeeper-test) + :opt-spec cli-opts}) (cli/serve-cmd)) args)) From 95cf05b0ad346b18f10b426723b58269e038c226 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Mar 2021 15:25:44 +0300 Subject: [PATCH 055/260] Fix style and add sync --- .../jepsen.nukeeper/src/jepsen/nukeeper/counter.clj | 8 +++++--- tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj | 8 ++++---- .../jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj | 5 ++--- tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj | 12 +++++++----- tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj | 8 +++++--- tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj | 5 +++++ 6 files changed, 28 insertions(+), 18 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj index 48b270517a4..6f0cee113c6 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj @@ -25,9 +25,11 @@ (invoke! [this test op] (case (:f op) :read (try - (assoc op - :type :ok - :value (count (zk-list conn "/"))) + (do + (zk-sync conn) + (assoc op + :type :ok + :value (count (zk-list conn "/")))) (catch Exception _ (assoc op :type :fail, :error :connect-error))) :add (try (do diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 86297473180..feca05d8190 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -161,10 +161,10 @@ and constructs a sequence of test options." 
[cli nemeses workloads] (take (:test-count cli) (shuffle (for [n nemeses, w workloads] - (assoc cli - :nemesis n - :workload w - :test-count 1))))) + (assoc cli + :nemesis n + :workload w + :test-count 1))))) (defn all-tests "Turns CLI options into a sequence of tests." diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index 9e5841ad8e4..ec39c2b3e35 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -102,7 +102,6 @@ (let [[[victim] others] (nemesis/split-one nodes)] {victim (into #{} others)})) - (defn blind-node-partition-nemesis [] (nemesis/partitioner blind-node)) @@ -156,6 +155,6 @@ "bridge-partitioner" {:nemesis (partition-bridge-nemesis) :generator (start-stop-generator 5 5)} "blind-node-partitioner" {:nemesis (blind-node-partition-nemesis) - :generator (start-stop-generator 5 5)} + :generator (start-stop-generator 5 5)} "blind-others-partitioner" {:nemesis (blind-others-partition-nemesis) - :generator (start-stop-generator 5 5)}}) + :generator (start-stop-generator 5 5)}}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj index fa6b96944b2..323d74acd67 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj @@ -41,11 +41,13 @@ (catch Exception _ (assoc op :type :info, :error :connect-error))) :drain (try - (loop [result '()] - (let [deleted-child (zk-multi-delete-first-child conn "/")] - (if (not (nil? deleted-child)) - (recur (concat result [deleted-child])) - (assoc op :type :ok :value result)))) + (do + (zk-sync conn) + (loop [result '()] + (let [deleted-child (zk-multi-delete-first-child conn "/")] + (if (not (nil? deleted-child)) + (recur (concat result [deleted-child])) + (assoc op :type :ok :value result))))) (catch Exception _ (assoc op :type :info, :error :connect-error))))) (teardown! [_ test]) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj index 3213042a3cc..23461591eaf 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -22,9 +22,11 @@ (invoke! 
[this test op] (case (:f op) - :read (assoc op - :type :ok - :value (read-string (:data (zk-get-str conn k)))) + :read (do + (zk-sync conn) + (assoc op + :type :ok + :value (read-string (:data (zk-get-str conn k))))) :add (try (do (zk-add-to-set conn k (:value op)) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index fd2b2b5acb3..c7e46a75d5f 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -93,6 +93,11 @@ CreateMode/PERSISTENT_SEQUENTIAL) (recur (inc i))))))) +; sync call not implemented in zookeeper-clj and don't have sync version in java API +(defn zk-sync + [conn] + (zk-set conn "/" "" -1)) + (defn zk-parent-path [path] (let [rslash_pos (str/last-index-of path "/")] From 0bf897993236d68d979a37d59d2d1fa81c6ef394 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Mar 2021 15:27:49 +0300 Subject: [PATCH 056/260] Remove redundant code from counter --- tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj index 6f0cee113c6..48b270517a4 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj @@ -25,11 +25,9 @@ (invoke! [this test op] (case (:f op) :read (try - (do - (zk-sync conn) - (assoc op - :type :ok - :value (count (zk-list conn "/")))) + (assoc op + :type :ok + :value (count (zk-list conn "/"))) (catch Exception _ (assoc op :type :fail, :error :connect-error))) :add (try (do From 1845df25f316c522717031e3e9866b9368ff6ba1 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 19 Mar 2021 17:02:48 +0300 Subject: [PATCH 057/260] fix possibly dangling reference to Context --- src/Interpreters/InterpreterCreateQuery.cpp | 8 ++++++-- src/Storages/StorageFactory.cpp | 1 + src/Storages/StorageURL.cpp | 9 ++++----- src/Storages/StorageURL.h | 1 - .../integration/test_odbc_interaction/test.py | 18 ++++++++++++++++++ 5 files changed, 29 insertions(+), 8 deletions(-) diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index d1af86e7b11..24cb8608ab3 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -260,7 +260,8 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create) renamed = true; } - database->loadStoredObjects(context, has_force_restore_data_flag, create.attach && force_attach); + /// We use global context here, because storages lifetime is bigger than query context lifetime + database->loadStoredObjects(context.getGlobalContext(), has_force_restore_data_flag, create.attach && force_attach); } catch (...) { @@ -970,7 +971,10 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, if (create.as_table_function) { const auto & factory = TableFunctionFactory::instance(); - res = factory.get(create.as_table_function, context)->execute(create.as_table_function, context, create.table, properties.columns); + /// We should use global context here because there will be no query context on server startup + /// and because storage lifetime is bigger than query context lifetime. 
+ auto table_func = factory.get(create.as_table_function, context.getGlobalContext()); + res = table_func->execute(create.as_table_function, context.getGlobalContext(), create.table, properties.columns); res->renameInMemory({create.database, create.table, create.uuid}); } else diff --git a/src/Storages/StorageFactory.cpp b/src/Storages/StorageFactory.cpp index 85f3bea9e0c..7aaec9b7e76 100644 --- a/src/Storages/StorageFactory.cpp +++ b/src/Storages/StorageFactory.cpp @@ -179,6 +179,7 @@ StoragePtr StorageFactory::get( .attach = query.attach, .has_force_restore_data_flag = has_force_restore_data_flag }; + assert(&arguments.context == &arguments.context.getGlobalContext()); auto res = storages.at(name).creator_fn(arguments); if (!empty_engine_args.empty()) diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index ca984f9ece9..b59f4b4a02a 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -42,12 +42,11 @@ IStorageURLBase::IStorageURLBase( const String & compression_method_) : IStorage(table_id_) , uri(uri_) - , context_global(context_) , compression_method(compression_method_) , format_name(format_name_) , format_settings(format_settings_) { - context_global.getRemoteHostFilter().checkURL(uri); + context_.getRemoteHostFilter().checkURL(uri); StorageInMemoryMetadata storage_metadata; storage_metadata.setColumns(columns_); @@ -237,11 +236,11 @@ Pipe IStorageURLBase::read( chooseCompressionMethod(request_uri.getPath(), compression_method))); } -BlockOutputStreamPtr IStorageURLBase::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, const Context & /*context*/) +BlockOutputStreamPtr IStorageURLBase::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, const Context & context) { return std::make_shared(uri, format_name, - format_settings, metadata_snapshot->getSampleBlock(), context_global, - ConnectionTimeouts::getHTTPTimeouts(context_global), + format_settings, metadata_snapshot->getSampleBlock(), context, + ConnectionTimeouts::getHTTPTimeouts(context), chooseCompressionMethod(uri.toString(), compression_method)); } diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index 21b2e3e27a1..0ea86980b8c 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -45,7 +45,6 @@ protected: const String & compression_method_); Poco::URI uri; - const Context & context_global; String compression_method; String format_name; // For URL engine, we use format settings from server context + `SETTINGS` diff --git a/tests/integration/test_odbc_interaction/test.py b/tests/integration/test_odbc_interaction/test.py index 6bb6a6ee777..6232168f2e6 100644 --- a/tests/integration/test_odbc_interaction/test.py +++ b/tests/integration/test_odbc_interaction/test.py @@ -74,6 +74,9 @@ def started_cluster(): node1.exec_in_container( ["bash", "-c", "echo 'CREATE TABLE t4(X INTEGER PRIMARY KEY ASC, Y, Z);' | sqlite3 {}".format(sqlite_db)], privileged=True, user='root') + node1.exec_in_container( + ["bash", "-c", "echo 'CREATE TABLE tf1(x INTEGER PRIMARY KEY ASC, y, z);' | sqlite3 {}".format(sqlite_db)], + privileged=True, user='root') print("sqlite tables created") mysql_conn = get_mysql_conn() print("mysql connection received") @@ -177,6 +180,21 @@ def test_sqlite_simple_select_function_works(started_cluster): assert node1.query( "select count(), sum(x) from odbc('DSN={}', '{}') group by x".format(sqlite_setup["DSN"], 't1')) == "1\t1\n" +def test_sqlite_table_function(started_cluster): + sqlite_setup = 
node1.odbc_drivers["SQLite3"] + sqlite_db = sqlite_setup["Database"] + + node1.exec_in_container(["bash", "-c", "echo 'INSERT INTO tf1 values(1, 2, 3);' | sqlite3 {}".format(sqlite_db)], + privileged=True, user='root') + node1.query("create table odbc_tf as odbc('DSN={}', '{}')".format(sqlite_setup["DSN"], 'tf1')) + assert node1.query("select * from odbc_tf") == "1\t2\t3\n" + + assert node1.query("select y from odbc_tf") == "2\n" + assert node1.query("select z from odbc_tf") == "3\n" + assert node1.query("select x from odbc_tf") == "1\n" + assert node1.query("select x, y from odbc_tf") == "1\t2\n" + assert node1.query("select z, x, y from odbc_tf") == "3\t1\t2\n" + assert node1.query("select count(), sum(x) from odbc_tf group by x") == "1\t1\n" def test_sqlite_simple_select_storage_works(started_cluster): sqlite_setup = node1.odbc_drivers["SQLite3"] From 3166f0cbfcba2bec42c4e0adaf94fe949f9fc41d Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Fri, 19 Mar 2021 20:18:17 +0300 Subject: [PATCH 058/260] cleanup --- src/Parsers/ExpressionElementParsers.cpp | 4 ++-- src/Processors/Transforms/WindowTransform.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index 39e3a0af5b7..913813d5486 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -588,7 +588,7 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p && value.get() >= 0))) { throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Frame offset for '{}' frame must be a nonnegative integer, '{}' of type '{}' given.", + "Frame offset for '{}' frame must be a nonnegative integer, '{}' of type '{}' given.", WindowFrame::toString(node->frame.type), applyVisitor(FieldVisitorToString(), value), Field::Types::toString(value.getType())); @@ -649,7 +649,7 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p && value.get() >= 0))) { throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Frame offset for '{}' frame must be a nonnegative integer, '{}' of type '{}' given.", + "Frame offset for '{}' frame must be a nonnegative integer, '{}' of type '{}' given.", WindowFrame::toString(node->frame.type), applyVisitor(FieldVisitorToString(), value), Field::Types::toString(value.getType())); diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index c562fb4ec2c..3a97698453a 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -1463,9 +1463,9 @@ struct WindowFunctionLagLeadInFrame final : public WindowFunction void windowInsertResultInto(const WindowTransform * transform, size_t function_index) override { - auto & current_block = transform->blockAt(transform->current_row); + const auto & current_block = transform->blockAt(transform->current_row); IColumn & to = *current_block.output_columns[function_index]; - auto & workspace = transform->workspaces[function_index]; + const auto & workspace = transform->workspaces[function_index]; int offset = 1; if (argument_types.size() > 1) From 3159b9dacf545134b1f85829d059123d5d71474a Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Mar 2021 21:53:09 +0300 Subject: [PATCH 059/260] Disable zookeeper logger Better --- tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj | 6 +++++- tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git 
a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index feca05d8190..b7f2bb0b98b 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -27,7 +27,9 @@ [clojure.java.io :as io] [zookeeper.data :as data] [zookeeper :as zk]) - (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) + (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException) + (ch.qos.logback.classic Level) + (org.slf4j Logger LoggerFactory))) (defn cluster-config [test node config-template] @@ -175,6 +177,8 @@ "Handles command line arguments. Can either run a test, or a web server for browsing results." [& args] + (.setLevel + (LoggerFactory/getLogger "org.apache.zookeeper") Level/OFF) (cli/run! (merge (cli/single-test-cmd {:test-fn nukeeper-test :opt-spec cli-opts}) (cli/test-all-cmd {:tests-fn (partial all-tests nukeeper-test) diff --git a/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj b/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj index 1981e01ebcb..db84ff33ee3 100644 --- a/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj +++ b/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj @@ -2,7 +2,9 @@ (:require [clojure.test :refer :all] [jepsen.nukeeper.utils :refer :all] [zookeeper :as zk] - [zookeeper.data :as data])) + [zookeeper.data :as data]) + (:import (ch.qos.logback.classic Level) + (org.slf4j Logger LoggerFactory))) (defn multicreate [conn] @@ -14,6 +16,8 @@ (deftest a-test (testing "nukeeper connection" + (.setLevel + (LoggerFactory/getLogger "org.apache.zookeeper") Level/OFF) (let [conn (zk/connect "localhost:9181" :timeout-msec 5000)] ;(println (take 10 (zk-range))) ;(multidelete conn) From 957c053f7e3604f6366e0e569fb1bdfdf8fcd8cb Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Fri, 19 Mar 2021 23:29:01 +0300 Subject: [PATCH 060/260] Fix segfault --- src/IO/PeekableReadBuffer.cpp | 29 ++++++++++++++++++++----- src/IO/PeekableReadBuffer.h | 5 +---- src/Interpreters/InterserverIOHandler.h | 2 -- src/Server/HTTP/HTMLForm.cpp | 5 +++++ 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/src/IO/PeekableReadBuffer.cpp b/src/IO/PeekableReadBuffer.cpp index 1d999d586b2..551f87d7363 100644 --- a/src/IO/PeekableReadBuffer.cpp +++ b/src/IO/PeekableReadBuffer.cpp @@ -82,6 +82,7 @@ bool PeekableReadBuffer::peekNext() checkpoint.emplace(memory.data()); checkpoint_in_own_memory = true; } + if (currentlyReadFromOwnMemory()) { /// Update buffer size @@ -99,7 +100,6 @@ bool PeekableReadBuffer::peekNext() pos_offset = 0; } BufferBase::set(memory.data(), peeked_size + bytes_to_copy, pos_offset); - } peeked_size += bytes_to_copy; @@ -113,12 +113,21 @@ void PeekableReadBuffer::rollbackToCheckpoint(bool drop) { checkStateCorrect(); - if (!checkpoint) - throw DB::Exception("There is no checkpoint", ErrorCodes::LOGICAL_ERROR); - else if (checkpointInOwnMemory() == currentlyReadFromOwnMemory()) + assert(checkpoint); + + if (checkpointInOwnMemory() == currentlyReadFromOwnMemory()) + { + /// Both checkpoint and position are in the same buffer. pos = *checkpoint; - else /// Checkpoint is in own memory and pos is not. Switch to reading from own memory + } + else + { + /// Checkpoint is in own memory and position is not. + assert(checkpointInOwnMemory()); + + /// Switch to reading from own memory. 
BufferBase::set(memory.data(), peeked_size, *checkpoint - memory.data()); + } if (drop) dropCheckpoint(); @@ -134,10 +143,11 @@ bool PeekableReadBuffer::nextImpl() checkStateCorrect(); bool res; + bool checkpoint_at_end = checkpoint && *checkpoint == working_buffer.end(); if (checkpoint) { - if (currentlyReadFromOwnMemory()) + if (currentlyReadFromOwnMemory() || checkpoint_at_end) res = sub_buf.hasPendingData() || sub_buf.next(); else res = peekNext(); @@ -163,6 +173,13 @@ bool PeekableReadBuffer::nextImpl() BufferBase::set(sub_working.begin(), sub_working.size(), sub_buf.offset()); nextimpl_working_buffer_offset = sub_buf.offset(); + if (checkpoint_at_end) + { + checkpoint.emplace(working_buffer.begin()); + peeked_size = 0; + checkpoint_in_own_memory = false; + } + checkStateCorrect(); return res; } diff --git a/src/IO/PeekableReadBuffer.h b/src/IO/PeekableReadBuffer.h index 4f6e669b31d..4515c6f8ce5 100644 --- a/src/IO/PeekableReadBuffer.h +++ b/src/IO/PeekableReadBuffer.h @@ -43,10 +43,7 @@ public: /// Forget checkpoint and all data between checkpoint and position ALWAYS_INLINE inline void dropCheckpoint() { -#ifndef NDEBUG - if (!checkpoint) - throw DB::Exception("There is no checkpoint", ErrorCodes::LOGICAL_ERROR); -#endif + assert(checkpoint); if (!currentlyReadFromOwnMemory()) { /// Don't need to store unread data anymore diff --git a/src/Interpreters/InterserverIOHandler.h b/src/Interpreters/InterserverIOHandler.h index b4768c30f32..b0c95ed3835 100644 --- a/src/Interpreters/InterserverIOHandler.h +++ b/src/Interpreters/InterserverIOHandler.h @@ -9,8 +9,6 @@ #include #include -#include - #include #include #include diff --git a/src/Server/HTTP/HTMLForm.cpp b/src/Server/HTTP/HTMLForm.cpp index ca407858c33..a00950c8e27 100644 --- a/src/Server/HTTP/HTMLForm.cpp +++ b/src/Server/HTTP/HTMLForm.cpp @@ -369,6 +369,11 @@ bool HTMLForm::MultipartReadBuffer::nextImpl() else boundary_hit = startsWith(line, boundary); + if (!line.empty()) + /// If we don't make sure that memory is contiguous then situation may happen, when part of the line is inside internal memory + /// and other part is inside sub-buffer, thus we'll be unable to setup our working buffer properly. + in.makeContinuousMemoryFromCheckpointToPos(); + in.rollbackToCheckpoint(true); /// Rolling back to checkpoint may change underlying buffers. 
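[Editor's note: a minimal usage sketch of the checkpoint cycle that the PeekableReadBuffer fix above is about. It is illustrative only and not part of the patch; peekAhead and the sample data are made up. rollbackToCheckpoint and makeContinuousMemoryFromCheckpointToPos are the calls shown in the diff, and the checkpoint-placing call is assumed to be the usual public setCheckpoint().]

#include <string>
#include <IO/PeekableReadBuffer.h>
#include <IO/ReadBufferFromString.h>

void peekAhead()
{
    std::string data = "--boundary\r\npayload";
    DB::ReadBufferFromString source(data);
    DB::PeekableReadBuffer in(source);

    in.setCheckpoint();                        /// Remember the current position.

    char probe[10];
    size_t n = in.read(probe, sizeof(probe));  /// Look ahead at up to 10 bytes.
    (void) n;

    /// If the look-ahead crossed an internal buffer boundary, the bytes between the
    /// checkpoint and the current position may live in two separate buffers; this
    /// makes them contiguous, as the HTMLForm change above does before rolling back.
    in.makeContinuousMemoryFromCheckpointToPos();

    in.rollbackToCheckpoint(/* drop = */ true); /// Return to the checkpoint and forget it.
}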
From c5f918f198d55150f55b81722993a3bbca06fbdc Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Sun, 21 Mar 2021 19:26:06 +0300 Subject: [PATCH 061/260] CacheDictionaryStorage insert into default value fix --- src/Dictionaries/CacheDictionaryStorage.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index 874796d879b..7694176d4aa 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -276,10 +276,11 @@ private: } else { + auto & data = column_typed.getData(); + for (size_t fetched_key_index = 0; fetched_key_index < fetched_columns_index; ++fetched_key_index) { auto fetched_key = fetched_keys[fetched_key_index]; - auto & data = column_typed.getData(); if (unlikely(fetched_key.is_default)) column_typed.insert(default_value_provider.getDefaultValue(fetched_key_index)); @@ -312,6 +313,7 @@ private: size_t cell_index = getCellIndexForInsert(key); auto & cell = cells[cell_index]; + bool cell_was_default = cell.is_default; cell.is_default = false; bool was_inserted = cell.deadline == 0; @@ -387,9 +389,11 @@ private: StringRef string_ref_value = StringRef {string_value.data(), string_value.size()}; StringRef inserted_value = copyStringInArena(string_ref_value); - StringRef previous_value = container[index_to_use]; - char * data = const_cast(previous_value.data); - arena.free(data, previous_value.size); + if (!cell_was_default) + { + StringRef previous_value = container[index_to_use]; + arena.free(const_cast(previous_value.data), previous_value.size); + } container[index_to_use] = inserted_value; } From 5ec7dbbdad1aa37d626bcc57ba5d8b324849650d Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 22 Mar 2021 13:06:09 +0300 Subject: [PATCH 062/260] Add lightweight run and fix queue workload --- .../resources/test_keeper_config.xml | 2 +- .../src/jepsen/nukeeper/main.clj | 36 ++++++++++++++----- .../src/jepsen/nukeeper/queue.clj | 10 ++---- .../src/jepsen/nukeeper/utils.clj | 20 +++++++---- 4 files changed, 46 insertions(+), 22 deletions(-) diff --git a/tests/jepsen.nukeeper/resources/test_keeper_config.xml b/tests/jepsen.nukeeper/resources/test_keeper_config.xml index 7ef34d4bea1..c69fb0f228c 100644 --- a/tests/jepsen.nukeeper/resources/test_keeper_config.xml +++ b/tests/jepsen.nukeeper/resources/test_keeper_config.xml @@ -7,7 +7,7 @@ 10000 30000 false - 60000 + 120000 trace {quorum_reads} {snapshot_distance} diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index b7f2bb0b98b..4e7c16930d4 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -122,7 +122,8 @@ [nil "--ops-per-key NUM" "Maximum number of operations on any given key." :default 100 :parse-fn parse-long - :validate [pos? "Must be a positive integer."]]]) + :validate [pos? "Must be a positive integer."]] + [nil, "--lightweight-run", "Subset of workloads/nemesises which is simple to validate"]]) (defn nukeeper-test "Given an options map from the command line runner (e.g. 
:nodes, :ssh,
@@ -136,7 +137,7 @@ opts
 {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts)) " " (name (:nemesis opts)))
 :os ubuntu/os
- :db (db "rbtorrent:71c60699aa56568ded73c4a48cecd2fd5e0956cb")
+ :db (db "rbtorrent:5fecc75309f38e302c95b4a226b2de60dfbb5681")
 :pure-generators true
 :client (:client workload)
 :nemesis (:nemesis current-nemesis)
@@ -158,20 +159,39 @@
 (def all-workloads (keys workloads))
 
+(def lightweight-workloads ["set" "unique-ids" "counter" "total-queue"])
+
+(def useful-nemesises ["random-node-killer"
+ "simple-partitioner"
+ "logs-and-snapshots-corruptor"
+ "drop-data-corruptor"
+ "bridge-partitioner"
+ "blind-node-partitioner"
+ "blind-others-partitioner"])
+
+(defn cart [colls]
+ (if (empty? colls)
+ '(())
+ (for [more (cart (rest colls))
+ x (first colls)]
+ (cons x more))))
+
 (defn all-test-options
 "Takes base cli options, a collection of nemeses, workloads, and a test count, and constructs a sequence of test options."
- [cli nemeses workloads]
- (take (:test-count cli) (shuffle (for [n nemeses, w workloads]
+ [cli workload-nemesis-collection]
+ (take (:test-count cli)
+ (shuffle (for [[workload nemesis] workload-nemesis-collection]
 (assoc cli
- :nemesis n
- :workload w
+ :nemesis nemesis
+ :workload workload
 :test-count 1)))))
-
 (defn all-tests
 "Turns CLI options into a sequence of tests."
 [test-fn cli]
- (map test-fn (all-test-options cli all-nemesises all-workloads)))
+ (if (boolean (:lightweight-run cli))
+ (map test-fn (all-test-options cli (cart [all-workloads all-nemesises])))
+ (map test-fn (all-test-options cli (cart [lightweight-workloads useful-nemesises])))))
 
 (defn -main
 "Handles command line arguments. Can either run a test, or a web server for
diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj
index 323d74acd67..951c0822ad2 100644
--- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj
+++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj
@@ -37,17 +37,13 @@
 (if (not (nil? result))
 (assoc op :type :ok :value result)
 (assoc op :type :fail :value result)))
- (catch KeeperException$BadVersionException _ (assoc op :type :fail, :error :bad-version))
 (catch Exception _ (assoc op :type :info, :error :connect-error)))
 :drain
+ ; drain via delete is too long, just list all nodes
 (try
 (do (zk-sync conn)
- (loop [result '()]
- (let [deleted-child (zk-multi-delete-first-child conn "/")]
- (if (not (nil? deleted-child))
- (recur (concat result [deleted-child]))
- (assoc op :type :ok :value result)))))
+ (assoc op :type :ok :value (into #{} (map #(str %1) (zk-list conn "/")))))
 (catch Exception _ (assoc op :type :info, :error :connect-error)))))
 
 (teardown!
[_ test])
@@ -66,7 +62,7 @@
 :checker (checker/compose
 {:total-queue (checker/total-queue)
 :timeline (timeline/html)})
- :generator (->> (sorted-str-range 10000)
+ :generator (->> (sorted-str-range 50000)
 (map (fn [x]
 (rand-nth [{:type :invoke, :f :enqueue :value x}
 {:type :invoke, :f :dequeue}]))))
diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj
index c7e46a75d5f..fe415ff9e51 100644
--- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj
+++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj
@@ -9,7 +9,8 @@
 [clojure.tools.logging :refer :all])
 (:import (org.apache.zookeeper.data Stat)
 (org.apache.zookeeper CreateMode
- ZooKeeper)))
+ ZooKeeper)
+ (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException)))
 
 (defn parse-long
 "Parses a string to a Long. Passes through `nil` and empty strings."
@@ -111,11 +112,18 @@
 txn (.transaction conn)
 first-child (first (sort children))]
 (if (not (nil? first-child))
- (do (.check txn path (:version stat))
- (.setData txn path (data/to-bytes "") -1) ; I'm just checking multitransactions
- (.delete txn (str path first-child) -1)
- (.commit txn)
- first-child)
+ (try
+ (do (.check txn path (:version stat))
+ (.setData txn path (data/to-bytes "") -1) ; I'm just checking multitransactions
+ (.delete txn (str path first-child) -1)
+ (.commit txn)
+ first-child)
+ (catch KeeperException$BadVersionException _ nil)
+ ; Even if we got a connection loss, the delete may actually be executed.
+ ; This function is used for the queue model, which strictly requires
+ ; all enqueued elements to be dequeued, but allows duplicates.
+ ; So even in the case when we are not sure about the delete, we return first-child.
+ (catch Exception _ first-child))
 nil)))
 
 (defn clickhouse-alive?
From 043b3cc7b589ec29ff03f3c3e005fd6c2718e05c Mon Sep 17 00:00:00 2001
From: alesapin
Date: Mon, 22 Mar 2021 13:45:22 +0300
Subject: [PATCH 063/260] Fix startup when leadership changed

---
 src/Coordination/CoordinationSettings.h | 1 +
 src/Coordination/NuKeeperServer.cpp | 13 +++++++++++++
 tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj | 2 +-
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/Coordination/CoordinationSettings.h b/src/Coordination/CoordinationSettings.h
index c816f8089d5..45eb1348ac6 100644
--- a/src/Coordination/CoordinationSettings.h
+++ b/src/Coordination/CoordinationSettings.h
@@ -31,6 +31,7 @@ struct Settings;
 M(UInt64, rotate_log_storage_interval, 10000, "How many records will be stored in one log storage file", 0) \
 M(UInt64, snapshots_to_keep, 3, "How many compressed snapshots to keep on disk", 0) \
 M(UInt64, stale_log_gap, 10000, "When node became stale and should receive snapshots from leader", 0) \
+ M(UInt64, fresh_log_gap, 200, "When node became fresh", 0) \
 M(Bool, quorum_reads, false, "Execute read requests as writes through whole RAFT consesus with similar speed", 0) \
 M(Bool, force_sync, true, " Call fsync on each change in RAFT changelog", 0)

diff --git a/src/Coordination/NuKeeperServer.cpp b/src/Coordination/NuKeeperServer.cpp
index 2081c969523..bfff7bf8f69 100644
--- a/src/Coordination/NuKeeperServer.cpp
+++ b/src/Coordination/NuKeeperServer.cpp
@@ -61,6 +61,7 @@ void NuKeeperServer::startup()
 params.reserved_log_items_ = coordination_settings->reserved_log_items;
 params.snapshot_distance_ = coordination_settings->snapshot_distance;
 params.stale_log_gap_ = coordination_settings->stale_log_gap;
+ params.fresh_log_gap_ = coordination_settings->fresh_log_gap;
 params.client_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds();
 params.auto_forwarding_ = coordination_settings->auto_forwarding;
 params.auto_forwarding_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds() * 2;
@@ -202,6 +203,18 @@ nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type t
 set_initialized();
 return nuraft::cb_func::ReturnCode::Ok;
 }
+ case nuraft::cb_func::BecomeFollower:
+ {
+ auto leader_index = raft_instance->get_leader_committed_log_idx();
+ auto our_index = raft_instance->get_committed_log_idx();
+ /// This may happen when we start the RAFT cluster from scratch.
+ /// Node first became leader, and after that some other node became leader.
+ /// BecomeFresh for this node will not be called because it was already fresh
+ /// when it was leader.
+ if (isLeaderAlive() && leader_index < our_index + coordination_settings->fresh_log_gap)
+ set_initialized();
+ return nuraft::cb_func::ReturnCode::Ok;
+ }
 case nuraft::cb_func::BecomeFresh:
 {
 set_initialized(); /// We are fresh follower, ready to serve requests.
diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 4e7c16930d4..5167da96c59 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -137,7 +137,7 @@ opts {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts)) " " (name (:nemesis opts))) :os ubuntu/os - :db (db "rbtorrent:5fecc75309f38e302c95b4a226b2de60dfbb5681") + :db (db "rbtorrent:156b85947eac9c85ef5d0ef15757a9f9e7c9e430") :pure-generators true :client (:client workload) :nemesis (:nemesis current-nemesis) From bf3a4361caaaa60696ed4fcc3a9d9978b4818503 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 22 Mar 2021 13:49:47 +0300 Subject: [PATCH 064/260] Followup fix --- tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 5167da96c59..dfa1cfd913e 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -190,8 +190,8 @@ "Turns CLI options into a sequence of tests." [test-fn cli] (if (boolean (:lightweight-run cli)) - (map test-fn (all-test-options cli (cart [all-workloads all-nemesises]))) - (map test-fn (all-test-options cli (cart [lightweight-workloads useful-nemesises]))))) + (map test-fn (all-test-options cli (cart [lightweight-workloads useful-nemesises]))) + (map test-fn (all-test-options cli (cart [all-workloads all-nemesises]))))) (defn -main "Handles command line arguments. Can either run a test, or a web server for From 56840aba5ae88f7b3e88162ad0daf09afb465c8e Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Mon, 22 Mar 2021 16:02:32 +0300 Subject: [PATCH 065/260] Fixed tests --- src/Dictionaries/CacheDictionaryStorage.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index 7694176d4aa..ffab7f1f9cf 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -421,6 +421,7 @@ private: auto & cell = cells[cell_index]; bool was_inserted = cell.deadline == 0; + bool cell_was_default = cell.is_default; cell.is_default = true; @@ -444,6 +445,23 @@ private: } else { + for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) + { + getAttributeContainer(attribute_index, [&](const auto & container) + { + using ElementType = std::decay_t; + + if constexpr (std::is_same_v) + { + if (!cell_was_default) + { + StringRef previous_value = container[cell.element_index]; + arena.free(const_cast(previous_value.data), previous_value.size); + } + } + }); + } + if (cell.key != key) { if constexpr (std::is_same_v) From fc3e11a06844d95e45c8e8eb514157fd95ff8431 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 22 Mar 2021 17:09:38 +0300 Subject: [PATCH 066/260] fix --- src/Interpreters/InterpreterCreateQuery.cpp | 2 +- src/TableFunctions/ITableFunctionXDBC.cpp | 16 +++++++++++----- src/TableFunctions/ITableFunctionXDBC.h | 4 +++- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 24cb8608ab3..7034e74eaf8 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -971,9 +971,9 @@ bool 
InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, if (create.as_table_function) { const auto & factory = TableFunctionFactory::instance(); + auto table_func = factory.get(create.as_table_function, context); /// We should use global context here because there will be no query context on server startup /// and because storage lifetime is bigger than query context lifetime. - auto table_func = factory.get(create.as_table_function, context.getGlobalContext()); res = table_func->execute(create.as_table_function, context.getGlobalContext(), create.table, properties.columns); res->renameInMemory({create.database, create.table, create.uuid}); } diff --git a/src/TableFunctions/ITableFunctionXDBC.cpp b/src/TableFunctions/ITableFunctionXDBC.cpp index e04a86b5abf..21c78d199db 100644 --- a/src/TableFunctions/ITableFunctionXDBC.cpp +++ b/src/TableFunctions/ITableFunctionXDBC.cpp @@ -55,15 +55,21 @@ void ITableFunctionXDBC::parseArguments(const ASTPtr & ast_function, const Conte connection_string = args[0]->as().value.safeGet(); remote_table_name = args[1]->as().value.safeGet(); } +} - /// Have to const_cast, because bridges store their commands inside context - helper = createBridgeHelper(const_cast(context), context.getSettingsRef().http_receive_timeout.value, connection_string); - helper->startBridgeSync(); +void ITableFunctionXDBC::startBridgeIfNot(const Context & context) const +{ + if (!helper) + { + /// Have to const_cast, because bridges store their commands inside context + helper = createBridgeHelper(const_cast(context), context.getSettingsRef().http_receive_timeout.value, connection_string); + helper->startBridgeSync(); + } } ColumnsDescription ITableFunctionXDBC::getActualTableStructure(const Context & context) const { - assert(helper); + startBridgeIfNot(context); /* Infer external table structure */ Poco::URI columns_info_uri = helper->getColumnsInfoURI(); @@ -87,7 +93,7 @@ ColumnsDescription ITableFunctionXDBC::getActualTableStructure(const Context & c StoragePtr ITableFunctionXDBC::executeImpl(const ASTPtr & /*ast_function*/, const Context & context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const { - assert(helper); + startBridgeIfNot(context); auto columns = getActualTableStructure(context); auto result = std::make_shared(StorageID(getDatabaseName(), table_name), schema_name, remote_table_name, columns, context, helper); result->startup(); diff --git a/src/TableFunctions/ITableFunctionXDBC.h b/src/TableFunctions/ITableFunctionXDBC.h index fb0a0fd1185..f3ff64c2f2d 100644 --- a/src/TableFunctions/ITableFunctionXDBC.h +++ b/src/TableFunctions/ITableFunctionXDBC.h @@ -29,10 +29,12 @@ private: void parseArguments(const ASTPtr & ast_function, const Context & context) override; + void startBridgeIfNot(const Context & context) const; + String connection_string; String schema_name; String remote_table_name; - BridgeHelperPtr helper; + mutable BridgeHelperPtr helper; }; class TableFunctionJDBC : public ITableFunctionXDBC From 6d4d669f96cadf7c2994e51e994ec6d6c2c4a99e Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Mon, 22 Mar 2021 20:20:42 +0300 Subject: [PATCH 067/260] Move checkpoint to sub-buffer only from internal memory --- src/IO/PeekableReadBuffer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/IO/PeekableReadBuffer.cpp b/src/IO/PeekableReadBuffer.cpp index 551f87d7363..15fdd9448ec 100644 --- a/src/IO/PeekableReadBuffer.cpp +++ b/src/IO/PeekableReadBuffer.cpp @@ -143,11 +143,11 @@ bool PeekableReadBuffer::nextImpl() 
checkStateCorrect(); bool res; - bool checkpoint_at_end = checkpoint && *checkpoint == working_buffer.end(); + bool checkpoint_at_end = checkpoint && *checkpoint == working_buffer.end() && currentlyReadFromOwnMemory(); if (checkpoint) { - if (currentlyReadFromOwnMemory() || checkpoint_at_end) + if (currentlyReadFromOwnMemory()) res = sub_buf.hasPendingData() || sub_buf.next(); else res = peekNext(); From b88f67ed124908ccccd900d359657b6b23c80367 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 22 Mar 2021 21:46:22 +0300 Subject: [PATCH 068/260] Add a test for #21991 --- tests/queries/0_stateless/01765_tehran_dst.reference | 1 + tests/queries/0_stateless/01765_tehran_dst.sql | 1 + 2 files changed, 2 insertions(+) create mode 100644 tests/queries/0_stateless/01765_tehran_dst.reference create mode 100644 tests/queries/0_stateless/01765_tehran_dst.sql diff --git a/tests/queries/0_stateless/01765_tehran_dst.reference b/tests/queries/0_stateless/01765_tehran_dst.reference new file mode 100644 index 00000000000..8b0a23c7971 --- /dev/null +++ b/tests/queries/0_stateless/01765_tehran_dst.reference @@ -0,0 +1 @@ +2021-03-22 23:15:11 diff --git a/tests/queries/0_stateless/01765_tehran_dst.sql b/tests/queries/0_stateless/01765_tehran_dst.sql new file mode 100644 index 00000000000..b332ba457a1 --- /dev/null +++ b/tests/queries/0_stateless/01765_tehran_dst.sql @@ -0,0 +1 @@ +SELECT toTimeZone(toDateTime('2021-03-22 18:45:11', 'UTC'), 'Asia/Tehran'); From 5d8b74e02ec72e8d2223432f4746dffab367207d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 22 Mar 2021 21:49:12 +0300 Subject: [PATCH 069/260] Also fix #13845 --- tests/queries/0_stateless/01765_tehran_dst.reference | 1 + tests/queries/0_stateless/01765_tehran_dst.sql | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/01765_tehran_dst.reference b/tests/queries/0_stateless/01765_tehran_dst.reference index 8b0a23c7971..61f5403e5e5 100644 --- a/tests/queries/0_stateless/01765_tehran_dst.reference +++ b/tests/queries/0_stateless/01765_tehran_dst.reference @@ -1 +1,2 @@ 2021-03-22 23:15:11 +2020-03-21 23:00:00 diff --git a/tests/queries/0_stateless/01765_tehran_dst.sql b/tests/queries/0_stateless/01765_tehran_dst.sql index b332ba457a1..41b92ae2360 100644 --- a/tests/queries/0_stateless/01765_tehran_dst.sql +++ b/tests/queries/0_stateless/01765_tehran_dst.sql @@ -1 +1,2 @@ SELECT toTimeZone(toDateTime('2021-03-22 18:45:11', 'UTC'), 'Asia/Tehran'); +SELECT toDateTime('2020-03-21 23:00:00', 'Asia/Tehran'); From a215bf78121c7a60cf1ff868874c4e9f10a30803 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 22 Mar 2021 22:12:42 +0300 Subject: [PATCH 070/260] Restart CI --- src/IO/WriteBufferFromS3.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 1ec96c73dfa..0069cf7583a 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -21,6 +21,7 @@ namespace ProfileEvents extern const Event S3WriteBytes; } + namespace DB { // S3 protocol does not allow to have multipart upload with more than 10000 parts. 
From fb08bc46bf963222efa7aa2e040b6de418f6c9a7 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 22 Mar 2021 22:15:57 +0300 Subject: [PATCH 071/260] Add a test for #11720 --- .../0_stateless/01766_todatetime64_no_timezone_arg.reference | 1 + tests/queries/0_stateless/01766_todatetime64_no_timezone_arg.sql | 1 + 2 files changed, 2 insertions(+) create mode 100644 tests/queries/0_stateless/01766_todatetime64_no_timezone_arg.reference create mode 100644 tests/queries/0_stateless/01766_todatetime64_no_timezone_arg.sql diff --git a/tests/queries/0_stateless/01766_todatetime64_no_timezone_arg.reference b/tests/queries/0_stateless/01766_todatetime64_no_timezone_arg.reference new file mode 100644 index 00000000000..52eea094ae4 --- /dev/null +++ b/tests/queries/0_stateless/01766_todatetime64_no_timezone_arg.reference @@ -0,0 +1 @@ +2021-03-22 00:00:00.000 diff --git a/tests/queries/0_stateless/01766_todatetime64_no_timezone_arg.sql b/tests/queries/0_stateless/01766_todatetime64_no_timezone_arg.sql new file mode 100644 index 00000000000..99141a694c1 --- /dev/null +++ b/tests/queries/0_stateless/01766_todatetime64_no_timezone_arg.sql @@ -0,0 +1 @@ +SELECT toDateTime64('2021-03-22', 3); From dbc5018000cc1587b0dbb5198aa2a920121cd9ed Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Mon, 22 Mar 2021 22:18:06 +0300 Subject: [PATCH 072/260] Update tests for hedged requests --- src/Core/Settings.h | 4 +- src/Server/TCPHandler.cpp | 10 +- .../integration/test_hedged_requests/test.py | 233 +++++------------- .../test_hedged_requests_parallel/test.py | 136 ++++------ 4 files changed, 112 insertions(+), 271 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index cf7bda7d1a1..5b2f603e915 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -220,8 +220,8 @@ class IColumn; M(Milliseconds, stream_poll_timeout_ms, 500, "Timeout for polling data from/to streaming storages.", 0) \ \ /** Settings for testing hedged requests */ \ - M(Int64, sleep_in_send_tables_status, 0, "Time to sleep in sending tables status response in TCPHandler", 0) \ - M(Int64, sleep_in_send_data, 0, "Time to sleep in sending data in TCPHandler", 0) \ + M(Milliseconds, sleep_in_send_tables_status_ms, 0, "Time to sleep in sending tables status response in TCPHandler", 0) \ + M(Milliseconds, sleep_in_send_data_ms, 0, "Time to sleep in sending data in TCPHandler", 0) \ \ M(Bool, insert_allow_materialized_columns, 0, "If setting is enabled, Allow materialized columns in INSERT.", 0) \ M(Seconds, http_connection_timeout, DEFAULT_HTTP_READ_BUFFER_CONNECTION_TIMEOUT, "HTTP connection timeout.", 0) \ diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 5765c3ec43e..e00a8f01c3e 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -722,11 +722,10 @@ void TCPHandler::processTablesStatusRequest() /// For testing hedged requests const Settings & settings = query_context->getSettingsRef(); - if (settings.sleep_in_send_tables_status) + if (settings.sleep_in_send_tables_status_ms.totalMilliseconds()) { out->next(); - std::chrono::seconds sec(settings.sleep_in_send_tables_status); - std::this_thread::sleep_for(sec); + std::this_thread::sleep_for(settings.sleep_in_send_tables_status_ms); } response.write(*out, client_tcp_protocol_version); @@ -1415,11 +1414,10 @@ void TCPHandler::sendData(const Block & block) /// For testing hedged requests const Settings & settings = query_context->getSettingsRef(); - if (block.rows() > 0 && settings.sleep_in_send_data) + if (block.rows() > 0 && 
settings.sleep_in_send_data_ms.totalMilliseconds()) { out->next(); - std::chrono::seconds sec(settings.sleep_in_send_data); - std::this_thread::sleep_for(sec); + std::this_thread::sleep_for(settings.sleep_in_send_data_ms); } state.block_out->write(block); diff --git a/tests/integration/test_hedged_requests/test.py b/tests/integration/test_hedged_requests/test.py index fbb8e20c450..a1693206ecc 100644 --- a/tests/integration/test_hedged_requests/test.py +++ b/tests/integration/test_hedged_requests/test.py @@ -15,28 +15,30 @@ NODES = {'node_' + str(i): None for i in (1, 2, 3)} NODES['node'] = None -sleep_time = 30 +# Sleep time in milliseconds. +sleep_time = 30000 @pytest.fixture(scope="module") def started_cluster(): NODES['node'] = cluster.add_instance( - 'node', with_zookeeper=True, stay_alive=True, main_configs=['configs/remote_servers.xml'], user_configs=['configs/users.xml']) + 'node', stay_alive=True, main_configs=['configs/remote_servers.xml'], user_configs=['configs/users.xml']) for name in NODES: if name != 'node': - NODES[name] = cluster.add_instance(name, with_zookeeper=True, user_configs=['configs/users1.xml']) + NODES[name] = cluster.add_instance(name, user_configs=['configs/users1.xml']) try: cluster.start() for node_id, node in list(NODES.items()): - node.query('''CREATE TABLE replicated (id UInt32, date Date) ENGINE = - ReplicatedMergeTree('/clickhouse/tables/replicated', '{}') ORDER BY id PARTITION BY toYYYYMM(date)'''.format(node_id)) + node.query('''CREATE TABLE test_hedged (id UInt32, date Date) ENGINE = + MergeTree() ORDER BY id PARTITION BY toYYYYMM(date)''') + + node.query("INSERT INTO test_hedged select number, toDate(number) from numbers(100);") + NODES['node'].query('''CREATE TABLE distributed (id UInt32, date Date) ENGINE = - Distributed('test_cluster', 'default', 'replicated')''') - - NODES['node'].query("INSERT INTO distributed select number, toDate(number) from numbers(100);") + Distributed('test_cluster', 'default', 'test_hedged')''') yield cluster @@ -47,8 +49,8 @@ def started_cluster(): config = ''' - {sleep_in_send_tables_status} - {sleep_in_send_data} + {sleep_in_send_tables_status_ms} + {sleep_in_send_data_ms} ''' @@ -70,12 +72,12 @@ def check_query(expected_replica, receive_timeout=300): assert query_time < 10 -def check_settings(node_name, sleep_in_send_tables_status, sleep_in_send_data): +def check_settings(node_name, sleep_in_send_tables_status_ms, sleep_in_send_data_ms): attempts = 0 while attempts < 1000: - setting1 = NODES[node_name].http_query("SELECT value FROM system.settings WHERE name='sleep_in_send_tables_status'") - setting2 = NODES[node_name].http_query("SELECT value FROM system.settings WHERE name='sleep_in_send_data'") - if int(setting1) == sleep_in_send_tables_status and int(setting2) == sleep_in_send_data: + setting1 = NODES[node_name].http_query("SELECT value FROM system.settings WHERE name='sleep_in_send_tables_status_ms'") + setting2 = NODES[node_name].http_query("SELECT value FROM system.settings WHERE name='sleep_in_send_data_ms'") + if int(setting1) == sleep_in_send_tables_status_ms and int(setting2) == sleep_in_send_data_ms: return time.sleep(0.1) attempts += 1 @@ -88,7 +90,29 @@ def check_changing_replica_events(expected_count): assert int(result) == expected_count +def update_configs(node_1_sleep_in_send_tables_status=0, node_1_sleep_in_send_data=0, + node_2_sleep_in_send_tables_status=0, node_2_sleep_in_send_data=0, + node_3_sleep_in_send_tables_status=0, node_3_sleep_in_send_data=0): + NODES['node_1'].replace_config( + 
'/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_in_send_tables_status_ms=node_1_sleep_in_send_tables_status, sleep_in_send_data_ms=node_1_sleep_in_send_data)) + + NODES['node_2'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_in_send_tables_status_ms=node_2_sleep_in_send_tables_status, sleep_in_send_data_ms=node_2_sleep_in_send_data)) + + NODES['node_3'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_in_send_tables_status_ms=node_3_sleep_in_send_tables_status, sleep_in_send_data_ms=node_3_sleep_in_send_data)) + + check_settings('node_1', node_1_sleep_in_send_tables_status, node_1_sleep_in_send_data) + check_settings('node_2', node_2_sleep_in_send_tables_status, node_2_sleep_in_send_data) + check_settings('node_3', node_3_sleep_in_send_tables_status, node_3_sleep_in_send_data) + + def test_stuck_replica(started_cluster): + update_configs() + cluster.pause_container("node_1") check_query(expected_replica="node_2") @@ -111,6 +135,8 @@ def test_stuck_replica(started_cluster): def test_long_query(started_cluster): + update_configs() + # Restart to reset pool states. NODES['node'].restart_clickhouse() @@ -121,169 +147,54 @@ def test_long_query(started_cluster): def test_send_table_status_sleep(started_cluster): - NODES['node_1'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=sleep_time, sleep_in_send_data=0)) - - NODES['node_2'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=0)) - - NODES['node_3'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=0)) - - check_settings('node_1', sleep_time, 0) - check_settings('node_2', 0, 0) - check_settings('node_3', 0, 0) - + update_configs(node_1_sleep_in_send_tables_status=sleep_time) check_query(expected_replica="node_2") check_changing_replica_events(1) def test_send_table_status_sleep2(started_cluster): - NODES['node_1'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=sleep_time, sleep_in_send_data=0)) - - NODES['node_2'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=sleep_time, sleep_in_send_data=0)) - - NODES['node_3'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=0)) - - check_settings('node_1', sleep_time, 0) - check_settings('node_2', sleep_time, 0) - check_settings('node_3', 0, 0) - + update_configs(node_1_sleep_in_send_tables_status=sleep_time, node_2_sleep_in_send_tables_status=sleep_time) check_query(expected_replica="node_3") check_changing_replica_events(2) def test_send_data(started_cluster): - NODES['node_1'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) - - NODES['node_2'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=0)) - - NODES['node_3'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=0)) - - check_settings('node_1', 0, sleep_time) - check_settings('node_2', 0, 0) - check_settings('node_3', 0, 0) - + update_configs(node_1_sleep_in_send_data=sleep_time) 
check_query(expected_replica="node_2") check_changing_replica_events(1) def test_send_data2(started_cluster): - NODES['node_1'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) - - NODES['node_2'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) - - NODES['node_3'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=0)) - - check_settings('node_1', 0, sleep_time) - check_settings('node_2', 0, sleep_time) - check_settings('node_3', 0, 0) - + update_configs(node_1_sleep_in_send_data=sleep_time, node_2_sleep_in_send_data=sleep_time) check_query(expected_replica="node_3") check_changing_replica_events(2) def test_combination1(started_cluster): - NODES['node_1'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=sleep_time, sleep_in_send_data=0)) - - NODES['node_2'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) - - NODES['node_3'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=0)) - - check_settings('node_1', sleep_time, 0) - check_settings('node_2', 0, sleep_time) - check_settings('node_3', 0, 0) - + update_configs(node_1_sleep_in_send_tables_status=sleep_time, node_2_sleep_in_send_data=sleep_time) check_query(expected_replica="node_3") check_changing_replica_events(2) def test_combination2(started_cluster): - NODES['node_1'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) - - NODES['node_2'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=sleep_time, sleep_in_send_data=0)) - - NODES['node_3'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=0)) - - check_settings('node_1', 0, sleep_time) - check_settings('node_2', sleep_time, 0) - check_settings('node_3', 0, 0) - + update_configs(node_1_sleep_in_send_data=sleep_time, node_2_sleep_in_send_tables_status=sleep_time) check_query(expected_replica="node_3") check_changing_replica_events(2) def test_combination3(started_cluster): - NODES['node_1'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) - - NODES['node_2'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=1, sleep_in_send_data=0)) - - NODES['node_3'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) - - check_settings('node_1', 0, sleep_time) - check_settings('node_2', 1, 0) - check_settings('node_3', 0, sleep_time) - + update_configs(node_1_sleep_in_send_data=sleep_time, + node_2_sleep_in_send_tables_status=1000, + node_3_sleep_in_send_data=sleep_time) check_query(expected_replica="node_2") check_changing_replica_events(3) def test_combination4(started_cluster): - NODES['node_1'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=1, sleep_in_send_data=sleep_time)) - - 
NODES['node_2'].replace_config(
- '/etc/clickhouse-server/users.d/users1.xml',
- config.format(sleep_in_send_tables_status=1, sleep_in_send_data=0))
-
- NODES['node_3'].replace_config(
- '/etc/clickhouse-server/users.d/users1.xml',
- config.format(sleep_in_send_tables_status=2, sleep_in_send_data=0))
-
- check_settings('node_1', 1, sleep_time)
- check_settings('node_2', 1, 0)
- check_settings('node_3', 2, 0)
-
+ update_configs(node_1_sleep_in_send_tables_status=1000,
+ node_1_sleep_in_send_data=sleep_time,
+ node_2_sleep_in_send_tables_status=1000,
+ node_3_sleep_in_send_tables_status=1000)
 check_query(expected_replica="node_2")
 check_changing_replica_events(4)
@@ -291,22 +202,9 @@ def test_combination4(started_cluster):
 def test_receive_timeout1(started_cluster):
 # Check the situation when first two replicas get receive timeout
 # in establishing connection, but the third replica is ok.
- NODES['node_1'].replace_config(
- '/etc/clickhouse-server/users.d/users1.xml',
- config.format(sleep_in_send_tables_status=3, sleep_in_send_data=0))
-
- NODES['node_2'].replace_config(
- '/etc/clickhouse-server/users.d/users1.xml',
- config.format(sleep_in_send_tables_status=3, sleep_in_send_data=0))
-
- NODES['node_3'].replace_config(
- '/etc/clickhouse-server/users.d/users1.xml',
- config.format(sleep_in_send_tables_status=0, sleep_in_send_data=1))
-
- check_settings('node_1', 3, 0)
- check_settings('node_2', 3, 0)
- check_settings('node_3', 0, 1)
-
+ update_configs(node_1_sleep_in_send_tables_status=3000,
+ node_2_sleep_in_send_tables_status=3000,
+ node_3_sleep_in_send_data=1000)
 check_query(expected_replica="node_3", receive_timeout=2)
 check_changing_replica_events(2)
@@ -315,22 +213,9 @@ def test_receive_timeout2(started_cluster):
 # Check the situation when first replica get receive timeout
 # in packet receiving but there are replicas in process of
 # connection establishing.
- NODES['node_1'].replace_config(
- '/etc/clickhouse-server/users.d/users1.xml',
- config.format(sleep_in_send_tables_status=0, sleep_in_send_data=4))
-
- NODES['node_2'].replace_config(
- '/etc/clickhouse-server/users.d/users1.xml',
- config.format(sleep_in_send_tables_status=2, sleep_in_send_data=0))
-
- NODES['node_3'].replace_config(
- '/etc/clickhouse-server/users.d/users1.xml',
- config.format(sleep_in_send_tables_status=2, sleep_in_send_data=0))
-
- check_settings('node_1', 0, 4)
- check_settings('node_2', 2, 0)
- check_settings('node_3', 2, 0)
-
+ update_configs(node_1_sleep_in_send_data=4000,
+ node_2_sleep_in_send_tables_status=2000,
+ node_3_sleep_in_send_tables_status=2000)
 check_query(expected_replica="node_2", receive_timeout=3)
 check_changing_replica_events(3)
diff --git a/tests/integration/test_hedged_requests_parallel/test.py b/tests/integration/test_hedged_requests_parallel/test.py
index 543d93f9989..33f70da00ca 100644
--- a/tests/integration/test_hedged_requests_parallel/test.py
+++ b/tests/integration/test_hedged_requests_parallel/test.py
@@ -14,29 +14,30 @@ cluster = ClickHouseCluster(__file__)
 NODES = {'node_' + str(i): None for i in (1, 2, 3, 4)}
 NODES['node'] = None
-sleep_time = 30
+# Sleep time in milliseconds.
+sleep_time = 30000 @pytest.fixture(scope="module") def started_cluster(): cluster = ClickHouseCluster(__file__) NODES['node'] = cluster.add_instance( - 'node', with_zookeeper=True, stay_alive=True, main_configs=['configs/remote_servers.xml'], user_configs=['configs/users.xml']) + 'node', stay_alive=True, main_configs=['configs/remote_servers.xml'], user_configs=['configs/users.xml']) for name in NODES: if name != 'node': - NODES[name] = cluster.add_instance(name, with_zookeeper=True, user_configs=['configs/users1.xml']) + NODES[name] = cluster.add_instance(name, user_configs=['configs/users1.xml']) try: cluster.start() for node_id, node in list(NODES.items()): - node.query('''CREATE TABLE replicated (id UInt32, date Date) ENGINE = - ReplicatedMergeTree('/clickhouse/tables/replicated', '{}') ORDER BY id PARTITION BY toYYYYMM(date)'''.format(node_id)) + node.query('''CREATE TABLE test_hedged (id UInt32, date Date) ENGINE = + MergeTree() ORDER BY id PARTITION BY toYYYYMM(date)''') + + node.query("INSERT INTO test_hedged SELECT number, toDateTime(number) FROM numbers(100)") NODES['node'].query('''CREATE TABLE distributed (id UInt32, date Date) ENGINE = - Distributed('test_cluster', 'default', 'replicated')''') - - NODES['node'].query("INSERT INTO distributed SELECT number, toDateTime(number) FROM numbers(100)") + Distributed('test_cluster', 'default', 'test_hedged')''') yield cluster @@ -47,8 +48,8 @@ def started_cluster(): config = ''' - {sleep_in_send_tables_status} - {sleep_in_send_data} + {sleep_in_send_tables_status_ms} + {sleep_in_send_data_ms} ''' @@ -72,12 +73,12 @@ def check_query(query=QUERY_1): assert query_time < 5 -def check_settings(node_name, sleep_in_send_tables_status, sleep_in_send_data): +def check_settings(node_name, sleep_in_send_tables_status_ms, sleep_in_send_data_ms): attempts = 0 while attempts < 1000: - setting1 = NODES[node_name].http_query("SELECT value FROM system.settings WHERE name='sleep_in_send_tables_status'") - setting2 = NODES[node_name].http_query("SELECT value FROM system.settings WHERE name='sleep_in_send_data'") - if int(setting1) == sleep_in_send_tables_status and int(setting2) == sleep_in_send_data: + setting1 = NODES[node_name].http_query("SELECT value FROM system.settings WHERE name='sleep_in_send_tables_status_ms'") + setting2 = NODES[node_name].http_query("SELECT value FROM system.settings WHERE name='sleep_in_send_data_ms'") + if int(setting1) == sleep_in_send_tables_status_ms and int(setting2) == sleep_in_send_data_ms: return time.sleep(0.1) attempts += 1 @@ -90,107 +91,64 @@ def check_changing_replica_events(expected_count): assert int(result) == expected_count -def test_send_table_status_sleep(started_cluster): +def update_configs(node_1_sleep_in_send_tables_status=0, node_1_sleep_in_send_data=0, + node_2_sleep_in_send_tables_status=0, node_2_sleep_in_send_data=0, + node_3_sleep_in_send_tables_status=0, node_3_sleep_in_send_data=0, + node_4_sleep_in_send_tables_status=0, node_4_sleep_in_send_data=0): NODES['node_1'].replace_config( '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=sleep_time, sleep_in_send_data=0)) + config.format(sleep_in_send_tables_status_ms=node_1_sleep_in_send_tables_status, sleep_in_send_data_ms=node_1_sleep_in_send_data)) NODES['node_2'].replace_config( '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=sleep_time, sleep_in_send_data=0)) - - check_settings('node_1', sleep_time, 0) - check_settings('node_2', sleep_time, 0) + 
config.format(sleep_in_send_tables_status_ms=node_2_sleep_in_send_tables_status, sleep_in_send_data_ms=node_2_sleep_in_send_data)) + NODES['node_3'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_in_send_tables_status_ms=node_3_sleep_in_send_tables_status, sleep_in_send_data_ms=node_3_sleep_in_send_data)) + + NODES['node_4'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_in_send_tables_status_ms=node_4_sleep_in_send_tables_status, sleep_in_send_data_ms=node_4_sleep_in_send_data)) + + check_settings('node_1', node_1_sleep_in_send_tables_status, node_1_sleep_in_send_data) + check_settings('node_2', node_2_sleep_in_send_tables_status, node_2_sleep_in_send_data) + check_settings('node_3', node_3_sleep_in_send_tables_status, node_3_sleep_in_send_data) + check_settings('node_4', node_4_sleep_in_send_tables_status, node_4_sleep_in_send_data) + + +def test_send_table_status_sleep(started_cluster): + update_configs(node_1_sleep_in_send_tables_status=sleep_time, node_2_sleep_in_send_tables_status=sleep_time) check_query() check_changing_replica_events(2) def test_send_data(started_cluster): - - NODES['node_1'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) - - NODES['node_2'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) - - check_settings('node_1', 0, sleep_time) - check_settings('node_2', 0, sleep_time) - + update_configs(node_1_sleep_in_send_data=sleep_time, node_2_sleep_in_send_data=sleep_time) check_query() check_changing_replica_events(2) def test_combination1(started_cluster): - NODES['node_1'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=1, sleep_in_send_data=0)) - - NODES['node_2'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=1, sleep_in_send_data=0)) - - NODES['node_3'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) - - check_settings('node_1', 1, 0) - check_settings('node_2', 1, 0) - check_settings('node_3', 0, sleep_time) - + update_configs(node_1_sleep_in_send_tables_status=1000, + node_2_sleep_in_send_tables_status=1000, + node_3_sleep_in_send_data=sleep_time) check_query() check_changing_replica_events(3) def test_combination2(started_cluster): - NODES['node_1'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) - - NODES['node_2'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=1, sleep_in_send_data=0)) - - NODES['node_3'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) - - NODES['node_4'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=1, sleep_in_send_data=0)) - - - check_settings('node_1', 0, sleep_time) - check_settings('node_2', 1, 0) - check_settings('node_3', 0, sleep_time) - check_settings('node_4', 1, 0) - + update_configs(node_1_sleep_in_send_data=sleep_time, + node_2_sleep_in_send_tables_status=1000, + node_3_sleep_in_send_data=sleep_time, + node_4_sleep_in_send_tables_status=1000) check_query() 
check_changing_replica_events(4) def test_query_with_no_data_to_sample(started_cluster): - NODES['node_1'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) - - NODES['node_2'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) - - NODES['node_3'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=0)) - - NODES['node_4'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_in_send_tables_status=0, sleep_in_send_data=0)) - check_settings('node_1', 0, sleep_time) - check_settings('node_2', 0, sleep_time) - check_settings('node_3', 0, 0) - check_settings('node_4', 0, 0) + update_configs(node_1_sleep_in_send_data=sleep_time, + node_2_sleep_in_send_data=sleep_time) # When there is no way to sample data, the whole query will be performed by # the first replica and the second replica will just send EndOfStream, From b824df2d3f35f9a4d9cd6f2bfb0ae465654b1c20 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 22 Mar 2021 22:43:26 +0300 Subject: [PATCH 073/260] Add tzdata to Docker --- docker/client/Dockerfile | 1 + docker/server/Dockerfile | 1 + 2 files changed, 2 insertions(+) diff --git a/docker/client/Dockerfile b/docker/client/Dockerfile index 8443eae691b..d9cd68254b7 100644 --- a/docker/client/Dockerfile +++ b/docker/client/Dockerfile @@ -18,6 +18,7 @@ RUN apt-get update \ clickhouse-client=$version \ clickhouse-common-static=$version \ locales \ + tzdata \ && rm -rf /var/lib/apt/lists/* /var/cache/debconf \ && apt-get clean diff --git a/docker/server/Dockerfile b/docker/server/Dockerfile index 295784a6184..414eb23d044 100644 --- a/docker/server/Dockerfile +++ b/docker/server/Dockerfile @@ -32,6 +32,7 @@ RUN groupadd -r clickhouse --gid=101 \ clickhouse-server=$version \ locales \ wget \ + tzdata \ && rm -rf \ /var/lib/apt/lists/* \ /var/cache/debconf \ From 84be58453b5129f3b516eb3011f53806b6bb6f21 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 22 Mar 2021 22:50:04 +0300 Subject: [PATCH 074/260] Add function timezoneOf and fix inconsistencies --- .../registerFunctionsMiscellaneous.cpp | 6 +- src/Functions/timezone.cpp | 9 +- src/Functions/timezoneOf.cpp | 111 ++++++++++++++++++ .../{toTimeZone.cpp => toTimezone.cpp} | 9 +- .../0_stateless/01767_timezoneOf.reference | 1 + tests/queries/0_stateless/01767_timezoneOf.sh | 7 ++ 6 files changed, 133 insertions(+), 10 deletions(-) create mode 100644 src/Functions/timezoneOf.cpp rename src/Functions/{toTimeZone.cpp => toTimezone.cpp} (90%) create mode 100644 tests/queries/0_stateless/01767_timezoneOf.reference create mode 100755 tests/queries/0_stateless/01767_timezoneOf.sh diff --git a/src/Functions/registerFunctionsMiscellaneous.cpp b/src/Functions/registerFunctionsMiscellaneous.cpp index 592f0d6774d..ca9bc32486e 100644 --- a/src/Functions/registerFunctionsMiscellaneous.cpp +++ b/src/Functions/registerFunctionsMiscellaneous.cpp @@ -41,7 +41,8 @@ void registerFunctionThrowIf(FunctionFactory &); void registerFunctionVersion(FunctionFactory &); void registerFunctionBuildId(FunctionFactory &); void registerFunctionUptime(FunctionFactory &); -void registerFunctionTimeZone(FunctionFactory &); +void registerFunctionTimezone(FunctionFactory &); +void registerFunctionTimezoneOf(FunctionFactory &); void registerFunctionRunningAccumulate(FunctionFactory 
&); void registerFunctionRunningDifference(FunctionFactory &); void registerFunctionRunningDifferenceStartingWithFirstValue(FunctionFactory &); @@ -111,7 +112,8 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory) registerFunctionVersion(factory); registerFunctionBuildId(factory); registerFunctionUptime(factory); - registerFunctionTimeZone(factory); + registerFunctionTimezone(factory); + registerFunctionTimezoneOf(factory); registerFunctionRunningAccumulate(factory); registerFunctionRunningDifference(factory); registerFunctionRunningDifferenceStartingWithFirstValue(factory); diff --git a/src/Functions/timezone.cpp b/src/Functions/timezone.cpp index 4522f21c8b2..2cd0c28612b 100644 --- a/src/Functions/timezone.cpp +++ b/src/Functions/timezone.cpp @@ -12,13 +12,13 @@ namespace /** Returns the server time zone. */ -class FunctionTimeZone : public IFunction +class FunctionTimezone : public IFunction { public: static constexpr auto name = "timezone"; static FunctionPtr create(const Context &) { - return std::make_shared(); + return std::make_shared(); } String getName() const override @@ -45,9 +45,10 @@ public: } -void registerFunctionTimeZone(FunctionFactory & factory) +void registerFunctionTimezone(FunctionFactory & factory) { - factory.registerFunction(); + factory.registerFunction(); + factory.registerAlias("timeZone", "timezone"); } } diff --git a/src/Functions/timezoneOf.cpp b/src/Functions/timezoneOf.cpp new file mode 100644 index 00000000000..cdf686e276b --- /dev/null +++ b/src/Functions/timezoneOf.cpp @@ -0,0 +1,111 @@ +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ +namespace +{ + + +/** timezoneOf(x) - get the name of the timezone of DateTime data type. + * Example: Europe/Moscow. + */ +class ExecutableFunctionTimezoneOf : public IExecutableFunctionImpl +{ +public: + static constexpr auto name = "timezoneOf"; + String getName() const override { return name; } + + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + + /// Execute the function on the columns. 
+ ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + DataTypePtr type_no_nullable = removeNullable(arguments[0].type); + + return DataTypeString().createColumnConst(input_rows_count, + dynamic_cast(*type_no_nullable).getTimeZone().getTimeZone()); + } +}; + + +class BaseFunctionTimezoneOf : public IFunctionBaseImpl +{ +public: + BaseFunctionTimezoneOf(DataTypes argument_types_, DataTypePtr return_type_) + : argument_types(std::move(argument_types_)), return_type(std::move(return_type_)) {} + + static constexpr auto name = "timezoneOf"; + String getName() const override { return name; } + + bool isDeterministic() const override { return true; } + bool isDeterministicInScopeOfQuery() const override { return true; } + + const DataTypes & getArgumentTypes() const override { return argument_types; } + const DataTypePtr & getResultType() const override { return return_type; } + + ExecutableFunctionImplPtr prepare(const ColumnsWithTypeAndName &) const override + { + return std::make_unique(); + } + + ColumnPtr getResultIfAlwaysReturnsConstantAndHasArguments(const ColumnsWithTypeAndName & arguments) const override + { + DataTypePtr type_no_nullable = removeNullable(arguments[0].type); + + return DataTypeString().createColumnConst(1, + dynamic_cast(*type_no_nullable).getTimeZone().getTimeZone()); + } + +private: + DataTypes argument_types; + DataTypePtr return_type; +}; + + +class FunctionTimezoneOfBuilder : public IFunctionOverloadResolverImpl +{ +public: + static constexpr auto name = "timezoneOf"; + String getName() const override { return name; } + static FunctionOverloadResolverImplPtr create(const Context &) { return std::make_unique(); } + + size_t getNumberOfArguments() const override { return 1; } + + DataTypePtr getReturnType(const DataTypes & types) const override + { + DataTypePtr type_no_nullable = removeNullable(types[0]); + + if (isDateTime(type_no_nullable) || isDateTime64(type_no_nullable)) + return std::make_shared(); + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bad argument for function {}, should be DateTime or DateTime64", name); + } + + FunctionBaseImplPtr build(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override + { + return std::make_unique(DataTypes{arguments[0].type}, return_type); + } + + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + ColumnNumbers getArgumentsThatDontImplyNullableReturnType(size_t /*number_of_arguments*/) const override { return {0}; } +}; + +} + +void registerFunctionTimezoneOf(FunctionFactory & factory) +{ + factory.registerFunction(); + factory.registerAlias("timeZoneOf", "timezoneOf"); +} + +} + diff --git a/src/Functions/toTimeZone.cpp b/src/Functions/toTimezone.cpp similarity index 90% rename from src/Functions/toTimeZone.cpp rename to src/Functions/toTimezone.cpp index fbf3a0778a6..d12f926b284 100644 --- a/src/Functions/toTimeZone.cpp +++ b/src/Functions/toTimezone.cpp @@ -21,11 +21,11 @@ namespace { /// Just changes time zone information for data type. The calculation is free. 
-class FunctionToTimeZone : public IFunction +class FunctionToTimezone : public IFunction { public: - static constexpr auto name = "toTimeZone"; - static FunctionPtr create(const Context &) { return std::make_shared(); } + static constexpr auto name = "toTimezone"; + static FunctionPtr create(const Context &) { return std::make_shared(); } String getName() const override { @@ -64,7 +64,8 @@ public: void registerFunctionToTimeZone(FunctionFactory & factory) { - factory.registerFunction(); + factory.registerFunction(); + factory.registerAlias("toTimeZone", "toTimezone"); } } diff --git a/tests/queries/0_stateless/01767_timezoneOf.reference b/tests/queries/0_stateless/01767_timezoneOf.reference new file mode 100644 index 00000000000..63c027eecfd --- /dev/null +++ b/tests/queries/0_stateless/01767_timezoneOf.reference @@ -0,0 +1 @@ +Asia/Tehran Asia/Tehran Asia/Tehran Africa/Accra diff --git a/tests/queries/0_stateless/01767_timezoneOf.sh b/tests/queries/0_stateless/01767_timezoneOf.sh new file mode 100755 index 00000000000..428db2ee737 --- /dev/null +++ b/tests/queries/0_stateless/01767_timezoneOf.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +TZ=Asia/Tehran $CLICKHOUSE_LOCAL --query "SELECT timezone(), timezoneOf(now()), timeZone(), timeZoneOf(toTimezone(toNullable(now()), 'Africa/Accra'))" From a9c25579ce392bd9e5ed0324524cddfd8850d6fb Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 22 Mar 2021 22:53:39 +0300 Subject: [PATCH 075/260] More tests --- tests/queries/0_stateless/01767_timezoneOf.reference | 2 +- tests/queries/0_stateless/01767_timezoneOf.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/01767_timezoneOf.reference b/tests/queries/0_stateless/01767_timezoneOf.reference index 63c027eecfd..0a8a8c32d4e 100644 --- a/tests/queries/0_stateless/01767_timezoneOf.reference +++ b/tests/queries/0_stateless/01767_timezoneOf.reference @@ -1 +1 @@ -Asia/Tehran Asia/Tehran Asia/Tehran Africa/Accra +Asia/Tehran Asia/Tehran Asia/Tehran Africa/Accra Pacific/Pitcairn diff --git a/tests/queries/0_stateless/01767_timezoneOf.sh b/tests/queries/0_stateless/01767_timezoneOf.sh index 428db2ee737..9dee051ee3f 100755 --- a/tests/queries/0_stateless/01767_timezoneOf.sh +++ b/tests/queries/0_stateless/01767_timezoneOf.sh @@ -4,4 +4,4 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . 
"$CUR_DIR"/../shell_config.sh -TZ=Asia/Tehran $CLICKHOUSE_LOCAL --query "SELECT timezone(), timezoneOf(now()), timeZone(), timeZoneOf(toTimezone(toNullable(now()), 'Africa/Accra'))" +TZ=Asia/Tehran $CLICKHOUSE_LOCAL --query "SELECT timezone(), timezoneOf(now()), timeZone(), timeZoneOf(toTimezone(toNullable(now()), 'Africa/Accra')), timeZoneOf(toTimeZone(now64(3), 'Pacific/Pitcairn'))" From 9845ff6694cea2786be7b88f9c90db307d399e7e Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 22 Mar 2021 23:03:51 +0300 Subject: [PATCH 076/260] Move db to separate file --- .../src/jepsen/nukeeper/constants.clj | 26 +++-- .../src/jepsen/nukeeper/db.clj | 99 +++++++++++++++++++ .../src/jepsen/nukeeper/main.clj | 57 +---------- .../src/jepsen/nukeeper/nemesis.clj | 8 +- .../src/jepsen/nukeeper/utils.clj | 26 +++-- 5 files changed, 138 insertions(+), 78 deletions(-) create mode 100644 tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj index 511ff8e3bf3..95b142e43f9 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj @@ -1,12 +1,18 @@ (ns jepsen.nukeeper.constants) -(def dir "/var/lib/clickhouse") -(def binary "clickhouse") -(def logdir "/var/log/clickhouse-server") -(def logfile "/var/log/clickhouse-server/stderr.log") -(def serverlog "/var/log/clickhouse-server/clickhouse-server.log") -(def snapshotsdir "/var/lib/clickhouse/coordination/snapshots") -(def coordinationdir "/var/lib/clickhouse/coordination") -(def logsdir "/var/lib/clickhouse/coordination/logs") -(def pidfile (str dir "/clickhouse.pid")) -(def binary-path "/tmp") +(def common-prefix "/tmp/clickhouse") + +(def binary-name "clickhouse") + +(def binary-path (str common-prefix "/" binary-name)) +(def pid-file-path (str common-prefix "/clickhouse.pid")) + +(def data-dir (str common-prefix "/db")) +(def logs-dir (str common-prefix "/logs")) +(def configs-dir (str common-prefix "/config")) +(def sub-configs-dir (str configs-dir "/config.d")) +(def coordination-data-dir (str data-dir "/coordination")) +(def coordination-snapshots-dir (str coordination-data-dir "/snapshots")) +(def coordination-logs-dir (str coordination-data-dir "/logs")) + +(def stderr-file (str logs-dir "/stderr.log")) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj new file mode 100644 index 00000000000..b4bcd363740 --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj @@ -0,0 +1,99 @@ +(ns jepsen.nukeeper.db + (:require [clojure.tools.logging :refer :all] + [jepsen + [control :as c] + [db :as db] + [util :as util :refer [meh]]] + [jepsen.nukeeper.constants :refer :all] + [jepsen.nukeeper.utils :refer :all] + [clojure.java.io :as io] + [jepsen.control.util :as cu] + [jepsen.os.ubuntu :as ubuntu])) + +(defn get-clickhouse-sky + [version] + (c/exec :sky :get :-d common-prefix :-N :Backbone version)) + +(defn get-clickhouse-url + [url] + (let [download-result (cu/wget! url)] + (do (c/exec :mv download-result common-prefix) + (str common-prefix "/" download-result)))) + +(defn unpack-deb + [path] + (do + (c/exec :dpkg :-x path :.) + (c/exec :mv "usr/bin/clickhouse" common-prefix))) + +(defn unpack-tgz + [path] + (do + (c/exec :tar :-zxvf path :.) 
+ (c/exec :mv "usr/bin/clickhouse" common-prefix))) + +(defn prepare-dirs + [] + (do + (c/exec :rm :-rf common-prefix) + (c/exec :mkdir :-p common-prefix) + (c/exec :mkdir :-p data-dir) + (c/exec :mkdir :-p logs-dir) + (c/exec :mkdir :-p configs-dir) + (c/exec :mkdir :-p sub-configs-dir) + (c/exec :touch stderr-file) + (c/exec :chown :-R :root common-prefix))) + +(defn cluster-config + [test node config-template] + (let [nodes (:nodes test) + replacement-map {#"\{srv1\}" (get nodes 0) + #"\{srv2\}" (get nodes 1) + #"\{srv3\}" (get nodes 2) + #"\{id\}" (str (inc (.indexOf nodes node))) + #"\{quorum_reads\}" (str (boolean (:quorum test))) + #"\{snapshot_distance\}" (str (:snapshot-distance test)) + #"\{stale_log_gap\}" (str (:stale-log-gap test)) + #"\{reserved_log_items\}" (str (:reserved-log-items test))}] + (reduce #(clojure.string/replace %1 (get %2 0) (get %2 1)) config-template replacement-map))) + +(defn install-configs + [test node] + (c/exec :echo (slurp (io/resource "config.xml")) :> (str configs-dir "/config.xml")) + (c/exec :echo (slurp (io/resource "users.xml")) :> (str configs-dir "/users.xml")) + (c/exec :echo (slurp (io/resource "listen.xml")) :> (str sub-configs-dir "/listen.xml")) + (c/exec :echo (cluster-config test node (slurp (io/resource "test_keeper_config.xml"))) :> (str sub-configs-dir "/test_keeper_config.xml"))) + +(defn db + [version] + (reify db/DB + (setup! [_ test node] + (c/su + (do + (info "Preparing directories") + (prepare-dirs) + (info "Downloading clickhouse") + (get-clickhouse-sky version) + (info "Installing configs") + (install-configs test node) + (info "Starting server") + (start-clickhouse! node test) + (info "ClickHouse started")))) + + + (teardown! [_ test node] + (info node "Tearing down clickhouse") + (kill-clickhouse! node test) + (c/su + ;(c/exec :rm :-f binary-path) + (c/exec :rm :-rf data-dir) + (c/exec :rm :-rf logs-dir) + (c/exec :rm :-rf configs-dir))) + + db/LogFiles + (log-files [_ test node] + (c/su + (kill-clickhouse! node test) + (c/cd data-dir + (c/exec :tar :czf "coordination.tar.gz" "coordination"))) + [stderr-file (str logs-dir "/clickhouse-server.log") (str data-dir "/coordination.tar.gz")]))) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index dfa1cfd913e..e027b956937 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -3,6 +3,7 @@ [jepsen.nukeeper.utils :refer :all] [clojure.pprint :refer [pprint]] [jepsen.nukeeper.set :as set] + [jepsen.nukeeper.db :refer :all] [jepsen.nukeeper.nemesis :as custom-nemesis] [jepsen.nukeeper.register :as register] [jepsen.nukeeper.unique :as unique] @@ -31,60 +32,6 @@ (ch.qos.logback.classic Level) (org.slf4j Logger LoggerFactory))) -(defn cluster-config - [test node config-template] - (let [nodes (:nodes test) - replacement-map {#"\{srv1\}" (get nodes 0) - #"\{srv2\}" (get nodes 1) - #"\{srv3\}" (get nodes 2) - #"\{id\}" (str (inc (.indexOf nodes node))) - #"\{quorum_reads\}" (str (boolean (:quorum test))) - #"\{snapshot_distance\}" (str (:snapshot-distance test)) - #"\{stale_log_gap\}" (str (:stale-log-gap test)) - #"\{reserved_log_items\}" (str (:reserved-log-items test))}] - (reduce #(clojure.string/replace %1 (get %2 0) (get %2 1)) config-template replacement-map))) - -(defn db - [version] - (reify db/DB - (setup! [_ test node] - (info node "installing clickhouse" version) - (c/su - (if-not (cu/exists? 
(str binary-path "/clickhouse")) - (c/exec :sky :get :-d binary-path :-N :Backbone version)) - (c/exec :mkdir :-p logdir) - (c/exec :touch logfile) - (c/exec (str binary-path "/clickhouse") :install) - (c/exec :chown :-R :root dir) - (c/exec :chown :-R :root logdir) - (c/exec :echo (slurp (io/resource "listen.xml")) :> "/etc/clickhouse-server/config.d/listen.xml") - (c/exec :echo (cluster-config test node (slurp (io/resource "test_keeper_config.xml"))) :> "/etc/clickhouse-server/config.d/test_keeper_config.xml") - (cu/start-daemon! - {:pidfile pidfile - :logfile logfile - :chdir dir} - (str binary-path "/clickhouse") - :server - :--config "/etc/clickhouse-server/config.xml") - (wait-clickhouse-alive! node test))) - - (teardown! [_ test node] - (info node "tearing down clickhouse") - (cu/stop-daemon! (str binary-path "/clickhouse") pidfile) - (c/su - ;(c/exec :rm :-f (str binary-path "/clickhouse")) - (c/exec :rm :-rf dir) - (c/exec :rm :-rf logdir) - (c/exec :rm :-rf "/etc/clickhouse-server"))) - - db/LogFiles - (log-files [_ test node] - (c/su - (cu/stop-daemon! (str binary-path "/clickhouse") pidfile) - (c/cd dir - (c/exec :tar :czf "coordination.tar.gz" "coordination"))) - [logfile serverlog (str dir "/coordination.tar.gz")]))) - (def workloads "A map of workload names to functions that construct workloads, given opts." {"set" set/workload @@ -137,7 +84,7 @@ opts {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts)) " " (name (:nemesis opts))) :os ubuntu/os - :db (db "rbtorrent:156b85947eac9c85ef5d0ef15757a9f9e7c9e430") + :db (db "rbtorrent:a284492c715974b69f73add62b4ff590110369af") :pure-generators true :client (:client workload) :nemesis (:nemesis current-nemesis) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index ec39c2b3e35..8314d29f575 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -76,21 +76,21 @@ (defn logs-corruption-nemesis [] - (corruptor-nemesis logsdir #(corrupt-file (select-last-file %1)))) + (corruptor-nemesis coordination-logs-dir #(corrupt-file (select-last-file %1)))) (defn snapshots-corruption-nemesis [] - (corruptor-nemesis snapshotsdir #(corrupt-file (select-last-file %1)))) + (corruptor-nemesis coordination-snapshots-dir #(corrupt-file (select-last-file %1)))) (defn logs-and-snapshots-corruption-nemesis [] - (corruptor-nemesis coordinationdir (fn [path] + (corruptor-nemesis coordination-data-dir (fn [path] (do (corrupt-file (select-last-file (str path "/snapshots"))) (corrupt-file (select-last-file (str path "/logs"))))))) (defn drop-all-corruption-nemesis [] - (corruptor-nemesis coordinationdir (fn [path] + (corruptor-nemesis coordination-data-dir (fn [path] (c/exec :rm :-fr path)))) (defn partition-bridge-nemesis diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index fe415ff9e51..30774c24dae 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -130,7 +130,7 @@ [node test] (info "Checking server alive on" node) (try - (c/exec (str binary-path "/clickhouse") :client :--query "SELECT 1") + (c/exec binary-path :client :--query "SELECT 1") (catch Exception _ false))) (defn wait-clickhouse-alive! @@ -144,18 +144,26 @@ [node test] (info "Killing server on node" node) (c/su - (cu/stop-daemon! 
(str binary-path "/clickhouse") pidfile) - (c/exec :rm :-fr (str dir "/status")))) + (cu/stop-daemon! binary-path pid-file-path) + (c/exec :rm :-fr (str data-dir "/status")))) (defn start-clickhouse! [node test] (info "Starting server on node" node) (c/su (cu/start-daemon! - {:pidfile pidfile - :logfile logfile - :chdir dir} - (str binary-path "/clickhouse") + {:pidfile pid-file-path + :logfile stderr-file + :chdir data-dir} + binary-path :server - :--config "/etc/clickhouse-server/config.xml")) - (wait-clickhouse-alive! node test)) + :--config (str configs-dir "/config.xml") + :-- + :--path data-dir + :--user_files_path (str data-dir "/user_files") + :--top_level_domains_path (str data-dir "/top_level_domains") + :--logger.log (str logs-dir "/clickhouse-server.log") + :--logger.errorlog (str logs-dir "/clickhouse-server.err.log") + :--test_keeper_server.snapshot_storage_path coordination-snapshots-dir + :--test_keeper_server.logs_storage_path coordination-logs-dir) + (wait-clickhouse-alive! node test))) From 43c36462f0f046cab084baa213e918e8b04ab386 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 22 Mar 2021 23:04:07 +0300 Subject: [PATCH 077/260] Add a test for #15784 --- tests/queries/0_stateless/01768_extended_range.reference | 3 +++ tests/queries/0_stateless/01768_extended_range.sql | 4 ++++ 2 files changed, 7 insertions(+) create mode 100644 tests/queries/0_stateless/01768_extended_range.reference create mode 100644 tests/queries/0_stateless/01768_extended_range.sql diff --git a/tests/queries/0_stateless/01768_extended_range.reference b/tests/queries/0_stateless/01768_extended_range.reference new file mode 100644 index 00000000000..1436eeae43a --- /dev/null +++ b/tests/queries/0_stateless/01768_extended_range.reference @@ -0,0 +1,3 @@ +1968 +-473 +1990-01-01 diff --git a/tests/queries/0_stateless/01768_extended_range.sql b/tests/queries/0_stateless/01768_extended_range.sql new file mode 100644 index 00000000000..4acaccd1399 --- /dev/null +++ b/tests/queries/0_stateless/01768_extended_range.sql @@ -0,0 +1,4 @@ +SELECT toYear(toDateTime64('1968-12-12 11:22:33', 0, 'UTC')); +SELECT toInt16(toRelativeWeekNum(toDateTime64('1960-11-30 18:00:11.999', 3, 'UTC'))); +SELECT toStartOfQuarter(toDateTime64('1990-01-04 12:14:12', 0, 'UTC')); +SELECT toUnixTimestamp(toDateTime64('1900-12-12 11:22:33', 0, 'UTC')); -- { serverError 407 } From 8f87914bae2dcfcfe987945d4180452f894046c3 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 22 Mar 2021 23:07:15 +0300 Subject: [PATCH 078/260] Add a test for #16222 --- tests/queries/0_stateless/01769_extended_range_2.reference | 3 +++ tests/queries/0_stateless/01769_extended_range_2.sql | 3 +++ 2 files changed, 6 insertions(+) create mode 100644 tests/queries/0_stateless/01769_extended_range_2.reference create mode 100644 tests/queries/0_stateless/01769_extended_range_2.sql diff --git a/tests/queries/0_stateless/01769_extended_range_2.reference b/tests/queries/0_stateless/01769_extended_range_2.reference new file mode 100644 index 00000000000..e9c4e1d8604 --- /dev/null +++ b/tests/queries/0_stateless/01769_extended_range_2.reference @@ -0,0 +1,3 @@ +1969-12-31 18:00:12 +1969-12-30 18:00:12 +1969-12-31 18:00:12 diff --git a/tests/queries/0_stateless/01769_extended_range_2.sql b/tests/queries/0_stateless/01769_extended_range_2.sql new file mode 100644 index 00000000000..a2570c9397b --- /dev/null +++ b/tests/queries/0_stateless/01769_extended_range_2.sql @@ -0,0 +1,3 @@ +SELECT toDateTime64('1969-12-31 18:00:12', 0, 'America/Phoenix'); +SELECT 
toDateTime64('1969-12-30 18:00:12', 0, 'America/Phoenix'); +SELECT toDateTime64('1969-12-31 18:00:12', 0, 'Europe/Moscow'); From 855d36be45af5b15829dcee2926d8825d9f01384 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 22 Mar 2021 23:12:01 +0300 Subject: [PATCH 079/260] Add a test for #16924 --- tests/queries/0_stateless/01770_extended_range_3.reference | 2 ++ tests/queries/0_stateless/01770_extended_range_3.sql | 2 ++ 2 files changed, 4 insertions(+) create mode 100644 tests/queries/0_stateless/01770_extended_range_3.reference create mode 100644 tests/queries/0_stateless/01770_extended_range_3.sql diff --git a/tests/queries/0_stateless/01770_extended_range_3.reference b/tests/queries/0_stateless/01770_extended_range_3.reference new file mode 100644 index 00000000000..1a35ee6cc1e --- /dev/null +++ b/tests/queries/0_stateless/01770_extended_range_3.reference @@ -0,0 +1,2 @@ +1984-04-01 08:00:00 +1985-03-31 09:00:00 diff --git a/tests/queries/0_stateless/01770_extended_range_3.sql b/tests/queries/0_stateless/01770_extended_range_3.sql new file mode 100644 index 00000000000..68e0782d3d5 --- /dev/null +++ b/tests/queries/0_stateless/01770_extended_range_3.sql @@ -0,0 +1,2 @@ +SELECT addHours(toDateTime64('1984-03-31 23:00:00', 0, 'Asia/Novosibirsk'), 8); +SELECT addHours(toDateTime64('1985-03-31 00:00:00', 0, 'Asia/Novosibirsk'), 8); From 81e91734f6ee74d55dd38df56eae40dd0fd46c4d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 22 Mar 2021 23:14:05 +0300 Subject: [PATCH 080/260] Add a test for #17080 --- .../queries/0_stateless/01771_datetime64_no_time_part.reference | 1 + tests/queries/0_stateless/01771_datetime64_no_time_part.sql | 1 + 2 files changed, 2 insertions(+) create mode 100644 tests/queries/0_stateless/01771_datetime64_no_time_part.reference create mode 100644 tests/queries/0_stateless/01771_datetime64_no_time_part.sql diff --git a/tests/queries/0_stateless/01771_datetime64_no_time_part.reference b/tests/queries/0_stateless/01771_datetime64_no_time_part.reference new file mode 100644 index 00000000000..c13116eeefe --- /dev/null +++ b/tests/queries/0_stateless/01771_datetime64_no_time_part.reference @@ -0,0 +1 @@ +1985-03-31 00:00:00 diff --git a/tests/queries/0_stateless/01771_datetime64_no_time_part.sql b/tests/queries/0_stateless/01771_datetime64_no_time_part.sql new file mode 100644 index 00000000000..debf4783eb8 --- /dev/null +++ b/tests/queries/0_stateless/01771_datetime64_no_time_part.sql @@ -0,0 +1 @@ +SELECT toDateTime64('1985-03-31', 0, 'Europe/Helsinki'); From 3952a8e976cef6a912b93d90b95b0d60a752d262 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 22 Mar 2021 23:42:09 +0300 Subject: [PATCH 081/260] Fix UBSan report in addMonths --- base/common/DateLUTImpl.h | 2 +- tests/queries/0_stateless/01770_add_months_ubsan.reference | 1 + tests/queries/0_stateless/01770_add_months_ubsan.sql | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/01770_add_months_ubsan.reference create mode 100644 tests/queries/0_stateless/01770_add_months_ubsan.sql diff --git a/base/common/DateLUTImpl.h b/base/common/DateLUTImpl.h index 43fc1b8befd..1a44c670650 100644 --- a/base/common/DateLUTImpl.h +++ b/base/common/DateLUTImpl.h @@ -1073,7 +1073,7 @@ public: { const Values & values = lut[toLUTIndex(v)]; - Int64 month = static_cast(values.month) + delta; + Int64 month = values.month + static_cast(delta); /// Cast is to avoid UB in signed integer overflow. 
if (month > 0) { diff --git a/tests/queries/0_stateless/01770_add_months_ubsan.reference b/tests/queries/0_stateless/01770_add_months_ubsan.reference new file mode 100644 index 00000000000..573541ac970 --- /dev/null +++ b/tests/queries/0_stateless/01770_add_months_ubsan.reference @@ -0,0 +1 @@ +0 diff --git a/tests/queries/0_stateless/01770_add_months_ubsan.sql b/tests/queries/0_stateless/01770_add_months_ubsan.sql new file mode 100644 index 00000000000..039434ff9bc --- /dev/null +++ b/tests/queries/0_stateless/01770_add_months_ubsan.sql @@ -0,0 +1,2 @@ +-- Result does not make sense but UBSan report should not be triggered. +SELECT ignore(now() + INTERVAL 9223372036854775807 MONTH); From e6c85df04b717bac9528f61a338a6aea1479165d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 22 Mar 2021 23:50:04 +0300 Subject: [PATCH 082/260] Add a test for #7963 --- .../0_stateless/01771_bloom_filter_not_has.reference | 3 +++ tests/queries/0_stateless/01771_bloom_filter_not_has.sql | 7 +++++++ 2 files changed, 10 insertions(+) create mode 100644 tests/queries/0_stateless/01771_bloom_filter_not_has.reference create mode 100644 tests/queries/0_stateless/01771_bloom_filter_not_has.sql diff --git a/tests/queries/0_stateless/01771_bloom_filter_not_has.reference b/tests/queries/0_stateless/01771_bloom_filter_not_has.reference new file mode 100644 index 00000000000..fc08c4c0d15 --- /dev/null +++ b/tests/queries/0_stateless/01771_bloom_filter_not_has.reference @@ -0,0 +1,3 @@ +10000000 +1 +9999999 diff --git a/tests/queries/0_stateless/01771_bloom_filter_not_has.sql b/tests/queries/0_stateless/01771_bloom_filter_not_has.sql new file mode 100644 index 00000000000..ab0e3d308f9 --- /dev/null +++ b/tests/queries/0_stateless/01771_bloom_filter_not_has.sql @@ -0,0 +1,7 @@ +DROP TABLE IF EXISTS bloom_filter_null_array; +CREATE TABLE bloom_filter_null_array (v Array(Int32), INDEX idx v TYPE bloom_filter GRANULARITY 3) ENGINE = MergeTree() ORDER BY v; +INSERT INTO bloom_filter_null_array SELECT [number] FROM numbers(10000000); +SELECT COUNT() FROM bloom_filter_null_array; +SELECT COUNT() FROM bloom_filter_null_array WHERE has(v, 0); +SELECT COUNT() FROM bloom_filter_null_array WHERE not has(v, 0); +DROP TABLE bloom_filter_null_array; From 6278a7eface92e2f74f4bd83f327d8851afab84d Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Tue, 23 Mar 2021 00:21:52 +0300 Subject: [PATCH 083/260] Fix build --- src/Server/TCPHandler.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index e00a8f01c3e..ae9358c6159 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -725,7 +725,8 @@ void TCPHandler::processTablesStatusRequest() if (settings.sleep_in_send_tables_status_ms.totalMilliseconds()) { out->next(); - std::this_thread::sleep_for(settings.sleep_in_send_tables_status_ms); + std::chrono::milliseconds ms(settings.sleep_in_send_tables_status_ms.totalMilliseconds()); + std::this_thread::sleep_for(ms); } response.write(*out, client_tcp_protocol_version); @@ -1417,7 +1418,8 @@ void TCPHandler::sendData(const Block & block) if (block.rows() > 0 && settings.sleep_in_send_data_ms.totalMilliseconds()) { out->next(); - std::this_thread::sleep_for(settings.sleep_in_send_data_ms); + std::chrono::milliseconds ms(settings.sleep_in_send_data_ms.totalMilliseconds()); + std::this_thread::sleep_for(ms); } state.block_out->write(block); From c94841bca5ada1984091647cd1e1cd1647e9f32c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 
23 Mar 2021 00:38:25 +0300 Subject: [PATCH 084/260] Fix UBSan report in intDiv #21769 --- src/Functions/intDiv.cpp | 3 +-- .../0_stateless/01772_intdiv_minus_one_ubsan.reference | 10 ++++++++++ .../0_stateless/01772_intdiv_minus_one_ubsan.sql | 1 + 3 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/01772_intdiv_minus_one_ubsan.reference create mode 100644 tests/queries/0_stateless/01772_intdiv_minus_one_ubsan.sql diff --git a/src/Functions/intDiv.cpp b/src/Functions/intDiv.cpp index 55396b1d1c7..804696f2776 100644 --- a/src/Functions/intDiv.cpp +++ b/src/Functions/intDiv.cpp @@ -49,11 +49,10 @@ struct DivideIntegralByConstantImpl #pragma GCC diagnostic ignored "-Wsign-compare" /// Division by -1. By the way, we avoid FPE by division of the largest negative number by -1. - /// And signed integer overflow is well defined in C++20. if (unlikely(is_signed_v && b == -1)) { for (size_t i = 0; i < size; ++i) - c_pos[i] = -a_pos[i]; + c_pos[i] = -make_unsigned_t(a_pos[i]); /// Avoid UBSan report in signed integer overflow. return; } diff --git a/tests/queries/0_stateless/01772_intdiv_minus_one_ubsan.reference b/tests/queries/0_stateless/01772_intdiv_minus_one_ubsan.reference new file mode 100644 index 00000000000..6b764d18a4d --- /dev/null +++ b/tests/queries/0_stateless/01772_intdiv_minus_one_ubsan.reference @@ -0,0 +1,10 @@ +-9223372036854775807 +-9223372036854775808 +9223372036854775807 +9223372036854775806 +9223372036854775805 +9223372036854775804 +9223372036854775803 +9223372036854775802 +9223372036854775801 +9223372036854775800 diff --git a/tests/queries/0_stateless/01772_intdiv_minus_one_ubsan.sql b/tests/queries/0_stateless/01772_intdiv_minus_one_ubsan.sql new file mode 100644 index 00000000000..20b4f585182 --- /dev/null +++ b/tests/queries/0_stateless/01772_intdiv_minus_one_ubsan.sql @@ -0,0 +1 @@ +SELECT intDiv(toInt64(number), -1) FROM numbers(9223372036854775807, 10); From d8088190f9d25bf9b48ae42abdfcac5ce42c8369 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 23 Mar 2021 00:44:25 +0300 Subject: [PATCH 085/260] Fix documentation for CurrentMetrics --- src/Common/CurrentMetrics.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 4fb2709c8e4..2bc6258aa18 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -52,7 +52,7 @@ M(RWLockActiveWriters, "Number of threads holding write lock in a table RWLock.") \ M(GlobalThread, "Number of threads in global thread pool.") \ M(GlobalThreadActive, "Number of threads in global thread pool running a task.") \ - M(LocalThread, "Number of threads in local thread pools. Should be similar to GlobalThreadActive.") \ + M(LocalThread, "Number of threads in local thread pools. The threads in local thread pools are taken from the global thread pool.") \ M(LocalThreadActive, "Number of threads in local thread pools running a task.") \ M(DistributedFilesToInsert, "Number of pending files to process for asynchronous insertion into Distributed tables. 
Number of files for every shard is summed.") \ M(TablesToDropQueueSize, "Number of dropped tables, that are waiting for background data removal.") \ From 8d0210b510dc723cc1737f6ac6aade02dfb7cc11 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 23 Mar 2021 01:16:41 +0300 Subject: [PATCH 086/260] Expose DateTime64 minmax part index in system.parts and system.parts_columns #18244 --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 55 +++++++++++-------- src/Storages/MergeTree/IMergeTreeDataPart.h | 10 ++-- src/Storages/MergeTree/MergeTreeData.cpp | 22 +++++--- src/Storages/System/StorageSystemParts.cpp | 11 ++-- .../System/StorageSystemPartsColumns.cpp | 17 ++++-- ...max_time_system_parts_datetime64.reference | 2 + ...3_min_max_time_system_parts_datetime64.sql | 9 +++ 7 files changed, 81 insertions(+), 45 deletions(-) create mode 100644 tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.reference create mode 100644 tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.sql diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 1f18c894465..50a3169de0e 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -333,40 +333,49 @@ IMergeTreeDataPart::State IMergeTreeDataPart::getState() const } -DayNum IMergeTreeDataPart::getMinDate() const +std::pair IMergeTreeDataPart::getMinMaxDate() const { if (storage.minmax_idx_date_column_pos != -1 && minmax_idx.initialized) - return DayNum(minmax_idx.hyperrectangle[storage.minmax_idx_date_column_pos].left.get()); + { + const auto & hyperrectangle = minmax_idx.hyperrectangle[storage.minmax_idx_date_column_pos]; + return {DayNum(hyperrectangle.left.get()), DayNum(hyperrectangle.right.get())}; + } else - return DayNum(); + return {}; } - -DayNum IMergeTreeDataPart::getMaxDate() const -{ - if (storage.minmax_idx_date_column_pos != -1 && minmax_idx.initialized) - return DayNum(minmax_idx.hyperrectangle[storage.minmax_idx_date_column_pos].right.get()); - else - return DayNum(); -} - -time_t IMergeTreeDataPart::getMinTime() const +std::pair IMergeTreeDataPart::getMinMaxTime() const { if (storage.minmax_idx_time_column_pos != -1 && minmax_idx.initialized) - return minmax_idx.hyperrectangle[storage.minmax_idx_time_column_pos].left.get(); + { + const auto & hyperrectangle = minmax_idx.hyperrectangle[storage.minmax_idx_time_column_pos]; + + /// The case of DateTime + if (hyperrectangle.left.getType() == Field::Types::UInt64) + { + assert(hyperrectangle.right.getType() == Field::Types::UInt64); + return {hyperrectangle.left.get(), hyperrectangle.right.get()}; + } + /// The case of DateTime64 + else if (hyperrectangle.left.getType() == Field::Types::Decimal64) + { + assert(hyperrectangle.right.getType() == Field::Types::UInt64); + + auto left = hyperrectangle.left.get>(); + auto right = hyperrectangle.right.get>(); + + assert(left.getScale() == right.getScale()); + + return { left.getValue() / left.getScaleMultiplier(), right.getValue() / right.getScaleMultiplier() }; + } + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "Part minmax index by time is neither DateTime or DateTime64"); + } else - return 0; + return {}; } -time_t IMergeTreeDataPart::getMaxTime() const -{ - if (storage.minmax_idx_time_column_pos != -1 && minmax_idx.initialized) - return minmax_idx.hyperrectangle[storage.minmax_idx_time_column_pos].right.get(); - else - return 0; -} - void IMergeTreeDataPart::setColumns(const NamesAndTypesList & 
new_columns) { columns = new_columns; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 83f8c672001..92b05e5cbd2 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -155,13 +155,11 @@ public: bool contains(const IMergeTreeDataPart & other) const { return info.contains(other.info); } - /// If the partition key includes date column (a common case), these functions will return min and max values for this column. - DayNum getMinDate() const; - DayNum getMaxDate() const; + /// If the partition key includes date column (a common case), this function will return min and max values for that column. + std::pair getMinMaxDate() const; - /// otherwise, if the partition key includes dateTime column (also a common case), these functions will return min and max values for this column. - time_t getMinTime() const; - time_t getMaxTime() const; + /// otherwise, if the partition key includes dateTime column (also a common case), this function will return min and max values for that column. + std::pair getMinMaxTime() const; bool isEmpty() const { return rows_count == 0; } diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index cf69de44a27..d02f9df4ad1 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -469,15 +469,19 @@ void MergeTreeData::checkPartitionKeyAndInitMinMax(const KeyDescription & new_pa DataTypes minmax_idx_columns_types = getMinMaxColumnsTypes(new_partition_key); /// Try to find the date column in columns used by the partition key (a common case). - bool encountered_date_column = false; + /// If there are no - DateTime or DateTime64 would also suffice. 
+ + bool has_date_column = false; + bool has_datetime_column = false; + for (size_t i = 0; i < minmax_idx_columns_types.size(); ++i) { - if (typeid_cast(minmax_idx_columns_types[i].get())) + if (isDate(minmax_idx_columns_types[i])) { - if (!encountered_date_column) + if (!has_date_column) { minmax_idx_date_column_pos = i; - encountered_date_column = true; + has_date_column = true; } else { @@ -486,16 +490,18 @@ void MergeTreeData::checkPartitionKeyAndInitMinMax(const KeyDescription & new_pa } } } - if (!encountered_date_column) + if (!has_date_column) { for (size_t i = 0; i < minmax_idx_columns_types.size(); ++i) { - if (typeid_cast(minmax_idx_columns_types[i].get())) + if (isDateTime(minmax_idx_columns_types[i]) + || isDateTime64(minmax_idx_columns_types[i]) + ) { - if (!encountered_date_column) + if (!has_datetime_column) { minmax_idx_time_column_pos = i; - encountered_date_column = true; + has_datetime_column = true; } else { diff --git a/src/Storages/System/StorageSystemParts.cpp b/src/Storages/System/StorageSystemParts.cpp index eece092206d..6a643dbe1b9 100644 --- a/src/Storages/System/StorageSystemParts.cpp +++ b/src/Storages/System/StorageSystemParts.cpp @@ -137,14 +137,17 @@ void StorageSystemParts::processNextStorage( if (columns_mask[src_index++]) columns[res_index++]->insert(static_cast(part.use_count() - 1)); + auto min_max_date = part->getMinMaxDate(); + auto min_max_time = part->getMinMaxTime(); + if (columns_mask[src_index++]) - columns[res_index++]->insert(part->getMinDate()); + columns[res_index++]->insert(min_max_date.first); if (columns_mask[src_index++]) - columns[res_index++]->insert(part->getMaxDate()); + columns[res_index++]->insert(min_max_date.second); if (columns_mask[src_index++]) - columns[res_index++]->insert(static_cast(part->getMinTime())); + columns[res_index++]->insert(static_cast(min_max_time.first)); if (columns_mask[src_index++]) - columns[res_index++]->insert(static_cast(part->getMaxTime())); + columns[res_index++]->insert(static_cast(min_max_time.second)); if (columns_mask[src_index++]) columns[res_index++]->insert(part->info.partition_id); if (columns_mask[src_index++]) diff --git a/src/Storages/System/StorageSystemPartsColumns.cpp b/src/Storages/System/StorageSystemPartsColumns.cpp index 8754e424281..703de70d17f 100644 --- a/src/Storages/System/StorageSystemPartsColumns.cpp +++ b/src/Storages/System/StorageSystemPartsColumns.cpp @@ -32,6 +32,8 @@ StorageSystemPartsColumns::StorageSystemPartsColumns(const StorageID & table_id_ {"refcount", std::make_shared()}, {"min_date", std::make_shared()}, {"max_date", std::make_shared()}, + {"min_time", std::make_shared()}, + {"max_time", std::make_shared()}, {"partition_id", std::make_shared()}, {"min_block_number", std::make_shared()}, {"max_block_number", std::make_shared()}, @@ -95,8 +97,10 @@ void StorageSystemPartsColumns::processNextStorage( /// For convenience, in returned refcount, don't add references that was due to local variables in this method: all_parts, active_parts. 
auto use_count = part.use_count() - 1; - auto min_date = part->getMinDate(); - auto max_date = part->getMaxDate(); + + auto min_max_date = part->getMinMaxDate(); + auto min_max_time = part->getMinMaxTime(); + auto index_size_in_bytes = part->getIndexSizeInBytes(); auto index_size_in_allocated_bytes = part->getIndexSizeInAllocatedBytes(); @@ -141,9 +145,14 @@ void StorageSystemPartsColumns::processNextStorage( columns[res_index++]->insert(UInt64(use_count)); if (columns_mask[src_index++]) - columns[res_index++]->insert(min_date); + columns[res_index++]->insert(min_max_date.first); if (columns_mask[src_index++]) - columns[res_index++]->insert(max_date); + columns[res_index++]->insert(min_max_date.second); + if (columns_mask[src_index++]) + columns[res_index++]->insert(static_cast(min_max_time.first)); + if (columns_mask[src_index++]) + columns[res_index++]->insert(static_cast(min_max_time.second)); + if (columns_mask[src_index++]) columns[res_index++]->insert(part->info.partition_id); if (columns_mask[src_index++]) diff --git a/tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.reference b/tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.reference new file mode 100644 index 00000000000..1cea52ec1c2 --- /dev/null +++ b/tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.reference @@ -0,0 +1,2 @@ +2000-01-02 03:04:05 2001-02-03 04:05:06 +2000-01-02 03:04:05 2001-02-03 04:05:06 diff --git a/tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.sql b/tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.sql new file mode 100644 index 00000000000..5a1f809b03b --- /dev/null +++ b/tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.sql @@ -0,0 +1,9 @@ +DROP TABLE IF EXISTS test; +CREATE TABLE test (time DateTime64(3)) ENGINE = MergeTree ORDER BY tuple() PARTITION BY toStartOfInterval(time, INTERVAL 2 YEAR); + +INSERT INTO test VALUES ('2000-01-02 03:04:05.123'), ('2001-02-03 04:05:06.789'); + +SELECT min_time, max_time FROM system.parts WHERE table = 'test' AND database = currentDatabase(); +SELECT min_time, max_time FROM system.parts_columns WHERE table = 'test' AND database = currentDatabase(); + +DROP TABLE test; From c18018ae06f3708ee73d7dba16556be1e18819c7 Mon Sep 17 00:00:00 2001 From: Yatsishin Ilya <2159081+qoega@users.noreply.github.com> Date: Tue, 23 Mar 2021 07:26:30 +0300 Subject: [PATCH 087/260] Longer SQLancer run --- docker/test/sqlancer/process_sqlancer_result.py | 1 + docker/test/sqlancer/run.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/test/sqlancer/process_sqlancer_result.py b/docker/test/sqlancer/process_sqlancer_result.py index 411c1e18e19..ede3cabc1c5 100755 --- a/docker/test/sqlancer/process_sqlancer_result.py +++ b/docker/test/sqlancer/process_sqlancer_result.py @@ -26,6 +26,7 @@ def process_result(result_folder): with open(err_path, 'r') as f: if 'AssertionError' in f.read(): summary.append((test, "FAIL")) + status = 'failure' else: summary.append((test, "OK")) diff --git a/docker/test/sqlancer/run.sh b/docker/test/sqlancer/run.sh index 20e82603567..e465ba1c993 100755 --- a/docker/test/sqlancer/run.sh +++ b/docker/test/sqlancer/run.sh @@ -11,7 +11,7 @@ service clickhouse-server start && sleep 5 cd /sqlancer/sqlancer-master -export TIMEOUT=60 +export TIMEOUT=300 export NUM_QUERIES=1000 ( java -jar target/sqlancer-*.jar --num-threads 10 --timeout-seconds $TIMEOUT --num-queries $NUM_QUERIES --username default --password "" clickhouse --oracle 
TLPWhere | tee /test_output/TLPWhere.out ) 3>&1 1>&2 2>&3 | tee /test_output/TLPWhere.err From cce2e0acaffaaf5c26b452d26455851b21acaab2 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 23 Mar 2021 10:28:14 +0300 Subject: [PATCH 088/260] Fix typo --- utils/nukeeper-data-dumper/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/nukeeper-data-dumper/main.cpp b/utils/nukeeper-data-dumper/main.cpp index 0340c94c5a0..c80aeb473e2 100644 --- a/utils/nukeeper-data-dumper/main.cpp +++ b/utils/nukeeper-data-dumper/main.cpp @@ -66,7 +66,7 @@ int main(int argc, char *argv[]) state_machine->init(); size_t last_commited_index = state_machine->last_commit_index(); - LOG_INFO(logger, "Last commited index: {}", last_commited_index); + LOG_INFO(logger, "Last committed index: {}", last_commited_index); DB::NuKeeperLogStore changelog(argv[2], 10000000, true); changelog.init(last_commited_index, 10000000000UL); /// collect all logs From 77935931120d33e58446be32765fe24566294760 Mon Sep 17 00:00:00 2001 From: filimonov <1549571+filimonov@users.noreply.github.com> Date: Tue, 23 Mar 2021 10:21:51 +0100 Subject: [PATCH 089/260] Update Dockerfile.alpine --- docker/server/Dockerfile.alpine | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index 0f9de1996ab..ea64c839cb0 100644 --- a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -21,7 +21,7 @@ RUN addgroup -S -g 101 clickhouse \ && chown clickhouse:clickhouse /var/lib/clickhouse \ && chown root:clickhouse /var/log/clickhouse-server \ && chmod +x /entrypoint.sh \ - && apk add --no-cache su-exec bash \ + && apk add --no-cache su-exec bash tzdata \ && chmod ugo+Xrw -R /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client # we need to allow "others" access to clickhouse folder, because docker container From a92cf30b6738cf0f47f7e2a5e9d448b40ead748f Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Tue, 23 Mar 2021 13:33:07 +0300 Subject: [PATCH 090/260] Code review fixes. --- src/Disks/DiskCacheWrapper.cpp | 9 ++++++--- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp index 0fd03f951ce..7586b4a28f0 100644 --- a/src/Disks/DiskCacheWrapper.cpp +++ b/src/Disks/DiskCacheWrapper.cpp @@ -209,7 +209,13 @@ void DiskCacheWrapper::clearDirectory(const String & path) void DiskCacheWrapper::moveDirectory(const String & from_path, const String & to_path) { if (cache_disk->exists(from_path)) + { + /// Destination directory may not be empty if previous directory move attempt was failed. 
+ if (cache_disk->exists(to_path) && cache_disk->isDirectory(to_path)) + cache_disk->clearDirectory(to_path); + cache_disk->moveDirectory(from_path, to_path); + } DiskDecorator::moveDirectory(from_path, to_path); } @@ -217,9 +223,6 @@ void DiskCacheWrapper::moveFile(const String & from_path, const String & to_path { if (cache_disk->exists(from_path)) { - if (cache_disk->exists(to_path) && cache_disk->isDirectory(to_path)) - cache_disk->clearDirectory(to_path); - auto dir_path = directoryPath(to_path); if (!cache_disk->exists(dir_path)) cache_disk->createDirectories(dir_path); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 1f18c894465..7f2a9cdb1f6 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1013,7 +1013,7 @@ void IMergeTreeDataPart::renameTo(const String & new_relative_path, bool remove_ } volume->getDisk()->setLastModified(from, Poco::Timestamp::fromEpochTime(time(nullptr))); - volume->getDisk()->moveFile(from, to); + volume->getDisk()->moveDirectory(from, to); relative_path = new_relative_path; SyncGuardPtr sync_guard; @@ -1065,7 +1065,7 @@ void IMergeTreeDataPart::remove(bool keep_s3) const try { - volume->getDisk()->moveFile(from, to); + volume->getDisk()->moveDirectory(from, to); } catch (const Poco::FileNotFoundException &) { From 4cbf741e527275ac73e8c475b0a2125ab8a1ca39 Mon Sep 17 00:00:00 2001 From: filimonov <1549571+filimonov@users.noreply.github.com> Date: Tue, 23 Mar 2021 11:48:28 +0100 Subject: [PATCH 091/260] Update Dockerfile.alpine --- docker/server/Dockerfile.alpine | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index ea64c839cb0..cd192c0c9da 100644 --- a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -22,6 +22,8 @@ RUN addgroup -S -g 101 clickhouse \ && chown root:clickhouse /var/log/clickhouse-server \ && chmod +x /entrypoint.sh \ && apk add --no-cache su-exec bash tzdata \ + && cp /usr/share/zoneinfo/UTC /etc/localtime \ + && echo "UTC" > /etc/timezone \ && chmod ugo+Xrw -R /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client # we need to allow "others" access to clickhouse folder, because docker container From c938f4f2fe5fc5d07ce3e1fc0616979a80b32cc7 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 23 Mar 2021 14:29:29 +0300 Subject: [PATCH 092/260] fix --- src/Interpreters/InterpreterCreateQuery.cpp | 4 +--- src/Storages/StorageURL.cpp | 18 +++++++++++++++--- src/Storages/StorageURL.h | 6 +----- src/TableFunctions/ITableFunction.cpp | 14 +++++++++++--- 4 files changed, 28 insertions(+), 14 deletions(-) diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 7034e74eaf8..f8bcbf02ab4 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -972,9 +972,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, { const auto & factory = TableFunctionFactory::instance(); auto table_func = factory.get(create.as_table_function, context); - /// We should use global context here because there will be no query context on server startup - /// and because storage lifetime is bigger than query context lifetime. 
- res = table_func->execute(create.as_table_function, context.getGlobalContext(), create.table, properties.columns); + res = table_func->execute(create.as_table_function, context, create.table, properties.columns); res->renameInMemory({create.database, create.table, create.uuid}); } else diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index b59f4b4a02a..2d3879340dc 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -33,7 +33,7 @@ namespace ErrorCodes IStorageURLBase::IStorageURLBase( const Poco::URI & uri_, - const Context & context_, + const Context & /*context_*/, const StorageID & table_id_, const String & format_name_, const std::optional & format_settings_, @@ -46,8 +46,6 @@ IStorageURLBase::IStorageURLBase( , format_name(format_name_) , format_settings(format_settings_) { - context_.getRemoteHostFilter().checkURL(uri); - StorageInMemoryMetadata storage_metadata; storage_metadata.setColumns(columns_); storage_metadata.setConstraints(constraints_); @@ -244,6 +242,20 @@ BlockOutputStreamPtr IStorageURLBase::write(const ASTPtr & /*query*/, const Stor chooseCompressionMethod(uri.toString(), compression_method)); } +StorageURL::StorageURL(const Poco::URI & uri_, + const StorageID & table_id_, + const String & format_name_, + const std::optional & format_settings_, + const ColumnsDescription & columns_, + const ConstraintsDescription & constraints_, + Context & context_, + const String & compression_method_) + : IStorageURLBase(uri_, context_, table_id_, format_name_, + format_settings_, columns_, constraints_, compression_method_) +{ + context_.getRemoteHostFilter().checkURL(uri); +} + void registerStorageURL(StorageFactory & factory) { factory.registerStorage("URL", [](const StorageFactory::Arguments & args) diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index 0ea86980b8c..2b2384b1043 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -113,11 +113,7 @@ public: const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, Context & context_, - const String & compression_method_) - : IStorageURLBase(uri_, context_, table_id_, format_name_, - format_settings_, columns_, constraints_, compression_method_) - { - } + const String & compression_method_); String getName() const override { diff --git a/src/TableFunctions/ITableFunction.cpp b/src/TableFunctions/ITableFunction.cpp index 804a5b232ec..b637838c6da 100644 --- a/src/TableFunctions/ITableFunction.cpp +++ b/src/TableFunctions/ITableFunction.cpp @@ -20,12 +20,20 @@ StoragePtr ITableFunction::execute(const ASTPtr & ast_function, const Context & ProfileEvents::increment(ProfileEvents::TableFunctionExecute); context.checkAccess(AccessType::CREATE_TEMPORARY_TABLE | StorageFactory::instance().getSourceAccessType(getStorageTypeName())); - if (cached_columns.empty() || (hasStaticStructure() && cached_columns == getActualTableStructure(context))) + if (cached_columns.empty()) return executeImpl(ast_function, context, table_name, std::move(cached_columns)); - auto get_storage = [=, tf = shared_from_this()]() -> StoragePtr + /// We have table structure, so it's CREATE AS table_function(). + /// We should use global context here because there will be no query context on server startup + /// and because storage lifetime is bigger than query context lifetime. 
+ const Context & global_context = context.getGlobalContext(); + if (hasStaticStructure() && cached_columns == getActualTableStructure(context)) + return executeImpl(ast_function, global_context, table_name, std::move(cached_columns)); + + auto this_table_function = shared_from_this(); + auto get_storage = [=, &global_context]() -> StoragePtr { - return tf->executeImpl(ast_function, context, table_name, cached_columns); + return this_table_function->executeImpl(ast_function, global_context, table_name, cached_columns); }; /// It will request actual table structure and create underlying storage lazily From 75f5679bf05a6dd63164a578b7ad1d7f5b203acb Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 23 Mar 2021 14:58:00 +0300 Subject: [PATCH 093/260] Cast to enum nullable fix --- src/Functions/FunctionsConversion.h | 4 ++++ tests/queries/0_stateless/01761_cast_to_enum_nullable.sql | 1 + 2 files changed, 5 insertions(+) create mode 100644 tests/queries/0_stateless/01761_cast_to_enum_nullable.sql diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 4889132eeb2..ba5f0fcedc9 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -2774,12 +2774,16 @@ private: auto & out_data = static_cast(*res).getData(); out_data.resize(size); + auto default_enum_value = result_type.getValues().front().second; + if (nullable_col) { for (const auto i : ext::range(0, size)) { if (!nullable_col->isNullAt(i)) out_data[i] = result_type.getValue(col->getDataAt(i)); + else + out_data[i] = default_enum_value; } } else diff --git a/tests/queries/0_stateless/01761_cast_to_enum_nullable.sql b/tests/queries/0_stateless/01761_cast_to_enum_nullable.sql new file mode 100644 index 00000000000..42a51d2f7b9 --- /dev/null +++ b/tests/queries/0_stateless/01761_cast_to_enum_nullable.sql @@ -0,0 +1 @@ +SELECT toUInt8(assumeNotNull(cast(cast(NULL, 'Nullable(String)'), 'Nullable(Enum8(\'Hello\' = 1))'))); From be76defcfa7e8ba640f9bdcc46d2cd64323c4154 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Tue, 23 Mar 2021 14:59:36 +0300 Subject: [PATCH 094/260] Remove check for absent checkpoint on rollback --- src/IO/tests/gtest_peekable_read_buffer.cpp | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/IO/tests/gtest_peekable_read_buffer.cpp b/src/IO/tests/gtest_peekable_read_buffer.cpp index 8c491338bd3..ddb947d8b2f 100644 --- a/src/IO/tests/gtest_peekable_read_buffer.cpp +++ b/src/IO/tests/gtest_peekable_read_buffer.cpp @@ -48,20 +48,6 @@ try readAndAssert(peekable, "01234"); } -#ifndef ABORT_ON_LOGICAL_ERROR - bool exception = false; - try - { - peekable.rollbackToCheckpoint(); - } - catch (DB::Exception & e) - { - if (e.code() != DB::ErrorCodes::LOGICAL_ERROR) - throw; - exception = true; - } - ASSERT_TRUE(exception); -#endif assertAvailable(peekable, "56789"); readAndAssert(peekable, "56"); From 82660d008d97a50115192f3e4fa8b7ef9cc9818c Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 23 Mar 2021 15:00:38 +0300 Subject: [PATCH 095/260] Added test reference --- tests/queries/0_stateless/01761_cast_to_enum_nullable.reference | 1 + 1 file changed, 1 insertion(+) create mode 100644 tests/queries/0_stateless/01761_cast_to_enum_nullable.reference diff --git a/tests/queries/0_stateless/01761_cast_to_enum_nullable.reference b/tests/queries/0_stateless/01761_cast_to_enum_nullable.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/01761_cast_to_enum_nullable.reference @@ -0,0 +1 @@ +1 
From 0c525b4ec4fcd3f6225333f7c9a9f7ecea5abdbb Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 23 Mar 2021 15:07:21 +0300 Subject: [PATCH 096/260] Add an ability to run from .deb and .tgz package --- src/Coordination/NuKeeperServer.cpp | 4 +- src/Coordination/NuKeeperServer.h | 1 + .../src/jepsen/nukeeper/constants.clj | 2 +- .../src/jepsen/nukeeper/db.clj | 47 +++++++++++++++---- .../src/jepsen/nukeeper/main.clj | 8 ++-- .../src/jepsen/nukeeper/utils.clj | 2 +- 6 files changed, 49 insertions(+), 15 deletions(-) diff --git a/src/Coordination/NuKeeperServer.cpp b/src/Coordination/NuKeeperServer.cpp index bfff7bf8f69..62af9656fb9 100644 --- a/src/Coordination/NuKeeperServer.cpp +++ b/src/Coordination/NuKeeperServer.cpp @@ -199,7 +199,8 @@ nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type t { case nuraft::cb_func::BecomeLeader: { - if (commited_store) /// We become leader and store is empty, ready to serve requests + /// We become leader and store is empty or we already committed it + if (commited_store || initial_batch_committed) set_initialized(); return nuraft::cb_func::ReturnCode::Ok; } @@ -224,6 +225,7 @@ nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type t { if (isLeader()) /// We have committed our log store and we are leader, ready to serve requests. set_initialized(); + initial_batch_committed = true; return nuraft::cb_func::ReturnCode::Ok; } default: /// ignore other events diff --git a/src/Coordination/NuKeeperServer.h b/src/Coordination/NuKeeperServer.h index 17099045640..ba25d5c181b 100644 --- a/src/Coordination/NuKeeperServer.h +++ b/src/Coordination/NuKeeperServer.h @@ -33,6 +33,7 @@ private: std::mutex initialized_mutex; bool initialized_flag = false; std::condition_variable initialized_cv; + std::atomic initial_batch_committed = false; nuraft::cb_func::ReturnCode callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * param); diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj index 95b142e43f9..d6245d450f5 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj @@ -1,6 +1,6 @@ (ns jepsen.nukeeper.constants) -(def common-prefix "/tmp/clickhouse") +(def common-prefix "/home/robot-clickhouse") (def binary-name "clickhouse") diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj index b4bcd363740..106af25be17 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj @@ -12,7 +12,8 @@ (defn get-clickhouse-sky [version] - (c/exec :sky :get :-d common-prefix :-N :Backbone version)) + (c/exec :sky :get :-d common-prefix :-N :Backbone version) + (str common-prefix "/clickhouse")) (defn get-clickhouse-url [url] @@ -20,22 +21,47 @@ (do (c/exec :mv download-result common-prefix) (str common-prefix "/" download-result)))) +(defn download-clickhouse + [source] + (info "Downloading clickhouse from" source) + (cond + (clojure.string/starts-with? source "rbtorrent:") (get-clickhouse-sky source) + (clojure.string/starts-with? source "http") (get-clickhouse-url source) + :else (throw (Exception. (str "Don't know how to download clickhouse from" source))))) + (defn unpack-deb [path] (do - (c/exec :dpkg :-x path :.) 
- (c/exec :mv "usr/bin/clickhouse" common-prefix))) + (c/exec :dpkg :-x path common-prefix) + (c/exec :rm :-f path) + (c/exec :mv (str common-prefix "/usr/bin/clickhouse") common-prefix) + (c/exec :rm :-rf (str common-prefix "/usr") (str common-prefix "/etc")))) (defn unpack-tgz [path] (do - (c/exec :tar :-zxvf path :.) - (c/exec :mv "usr/bin/clickhouse" common-prefix))) + (c/exec :mkdir :-p (str common-prefix "/unpacked")) + (c/exec :tar :-zxvf path :-C (str common-prefix "/unpacked")) + (c/exec :rm :-f path) + (let [subdir (c/exec :ls (str common-prefix "/unpacked"))] + (c/exec :mv (str common-prefix "/unpacked/" subdir "/usr/bin/clickhouse") common-prefix) + (c/exec :rm :-fr (str common-prefix "/unpacked"))))) + +(defn chmod-binary + [path] + (c/exec :chmod :+x path)) + +(defn install-downloaded-clickhouse + [path] + (cond + (clojure.string/ends-with? path ".deb") (unpack-deb path) + (clojure.string/ends-with? path ".tgz") (unpack-tgz path) + (clojure.string/ends-with? path "clickhouse") (chmod-binary path) + :else (throw (Exception. (str "Don't know how to install clickhouse from path" path))))) (defn prepare-dirs [] (do - (c/exec :rm :-rf common-prefix) (c/exec :mkdir :-p common-prefix) (c/exec :mkdir :-p data-dir) (c/exec :mkdir :-p logs-dir) @@ -72,8 +98,10 @@ (do (info "Preparing directories") (prepare-dirs) - (info "Downloading clickhouse") - (get-clickhouse-sky version) + (if (not (cu/exists? binary-path)) + (do (info "Downloading clickhouse") + (install-downloaded-clickhouse (download-clickhouse version))) + (info "Binary already exsist on path" binary-path "skipping download")) (info "Installing configs") (install-configs test node) (info "Starting server") @@ -85,7 +113,8 @@ (info node "Tearing down clickhouse") (kill-clickhouse! node test) (c/su - ;(c/exec :rm :-f binary-path) + (c/exec :rm :-rf binary-path) + (c/exec :rm :-rf pid-file-path) (c/exec :rm :-rf data-dir) (c/exec :rm :-rf logs-dir) (c/exec :rm :-rf configs-dir))) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index e027b956937..f3db61c6d53 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -70,7 +70,9 @@ :default 100 :parse-fn parse-long :validate [pos? "Must be a positive integer."]] - [nil, "--lightweight-run", "Subset of workloads/nemesises which is simple to validate"]]) + [nil, "--lightweight-run" "Subset of workloads/nemesises which is simple to validate"] + ["-c" "--clickhouse-source URL" "URL for clickhouse deb or tgz package" + :default "https://clickhouse-builds.s3.yandex.net/21677/ef82333089156907a0979669d9374c2e18daabe5/clickhouse_build_check/clang-11_relwithdebuginfo_none_bundled_unsplitted_disable_False_deb/clickhouse-common-static_21.4.1.6313_amd64.deb"]]) (defn nukeeper-test "Given an options map from the command line runner (e.g. 
:nodes, :ssh, @@ -82,9 +84,9 @@ current-nemesis (get custom-nemesis/custom-nemesises (:nemesis opts))] (merge tests/noop-test opts - {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts)) " " (name (:nemesis opts))) + {:name (str "clickhouse-keeper-quorum=" quorum "-" (name (:workload opts)) "-" (name (:nemesis opts))) :os ubuntu/os - :db (db "rbtorrent:a284492c715974b69f73add62b4ff590110369af") + :db (db (:clickhouse-source opts)) :pure-generators true :client (:client workload) :nemesis (:nemesis current-nemesis) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index 30774c24dae..0e0db2d3a6d 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -159,7 +159,7 @@ :server :--config (str configs-dir "/config.xml") :-- - :--path data-dir + :--path (str data-dir "/") :--user_files_path (str data-dir "/user_files") :--top_level_domains_path (str data-dir "/top_level_domains") :--logger.log (str logs-dir "/clickhouse-server.log") From 890bd6f1e96a499422fa331706752d9c58b96d6b Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 23 Mar 2021 15:14:37 +0300 Subject: [PATCH 097/260] Fixed code review issues --- src/Dictionaries/CacheDictionaryStorage.h | 6 ++++-- src/Dictionaries/SSDCacheDictionaryStorage.h | 14 +++++++++----- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index ffab7f1f9cf..f0028dd8848 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -31,7 +31,9 @@ struct CacheDictionaryStorageConfiguration const DictionaryLifetime lifetime; }; -/// TODO: Add documentation +/** ICacheDictionaryStorage implementation that keeps key in hash table with fixed collision length. + * Value in hash table point to index in attributes arrays. + */ template class CacheDictionaryStorage final : public ICacheDictionaryStorage { @@ -484,7 +486,7 @@ private: PaddedPODArray result; result.reserve(size); - for (auto cell : cells) + for (auto & cell : cells) { if (cell.deadline == 0) continue; diff --git a/src/Dictionaries/SSDCacheDictionaryStorage.h b/src/Dictionaries/SSDCacheDictionaryStorage.h index d0b4a5ca835..baac725e184 100644 --- a/src/Dictionaries/SSDCacheDictionaryStorage.h +++ b/src/Dictionaries/SSDCacheDictionaryStorage.h @@ -336,9 +336,7 @@ inline bool operator==(const SSDCacheIndex & lhs, const SSDCacheIndex & rhs) return lhs.block_index == rhs.block_index && lhs.offset_in_block == rhs.offset_in_block; } -/** SSDCacheMemoryBuffer initialized with block size and memory buffer blocks size. - * Allocate block_size * memory_buffer_blocks_size bytes with page alignment. - * Logically represents multiple memory_buffer_blocks_size blocks and current write block. +/** Logically represents multiple memory_buffer_blocks_size SSDCacheBlocks and current write block. * If key cannot be written into current_write_block, current block keys size and check summ is written * and buffer increase index of current_write_block_index. * If current_write_block_index == memory_buffer_blocks_size write key will always returns true. 
@@ -443,7 +441,7 @@ private: size_t current_block_index = 0; }; -/// TODO: Add documentation +/// Logically represents multiple memory_buffer_blocks_size SSDCacheBlocks on file system template class SSDCacheFileBuffer : private boost::noncopyable { @@ -796,7 +794,13 @@ private: size_t current_blocks_size = 0; }; -/// TODO: Add documentation +/** ICacheDictionaryStorage implementation that keeps column data serialized in memory index and in disk partitions. + * Data is first writen in memory buffer. + * If memory buffer is full then buffer is flushed to disk partition. + * If memory buffer cannot be flushed to associated disk partition, then if partition + * can be allocated (current partition index < max_partitions_size) storage allocates new partition, if not old partitions are reused. + * Index maps key to partition block and offset. + */ template class SSDCacheDictionaryStorage final : public ICacheDictionaryStorage { From 46f4c60839e32a2d46740143dcf59774dfa23d5d Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 23 Mar 2021 15:15:44 +0300 Subject: [PATCH 098/260] Small simplification in ExternalLoader. --- src/Interpreters/ExternalLoader.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/Interpreters/ExternalLoader.cpp b/src/Interpreters/ExternalLoader.cpp index 73257ba5185..853fe296d1c 100644 --- a/src/Interpreters/ExternalLoader.cpp +++ b/src/Interpreters/ExternalLoader.cpp @@ -818,13 +818,10 @@ private: if (!min_id) min_id = getMinIDToFinishLoading(forced_to_reload); - if (info->state_id >= min_id) - return true; /// stop - if (info->loading_id < min_id) startLoading(*info, forced_to_reload, *min_id); - /// Wait for the next event if loading wasn't completed, and stop otherwise. + /// Wait for the next event if loading wasn't completed, or stop otherwise. return (info->state_id >= min_id); }; @@ -850,9 +847,6 @@ private: if (filter && !filter(name)) continue; - if (info.state_id >= min_id) - continue; - if (info.loading_id < min_id) startLoading(info, forced_to_reload, *min_id); From 83255cbd64305910fd2a72b0f1d00fcb63ed5ea6 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 23 Mar 2021 15:19:37 +0300 Subject: [PATCH 099/260] Add option to reuse same binary --- tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj | 7 ++++--- tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj index 106af25be17..7bc2b9c6cea 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj @@ -91,14 +91,14 @@ (c/exec :echo (cluster-config test node (slurp (io/resource "test_keeper_config.xml"))) :> (str sub-configs-dir "/test_keeper_config.xml"))) (defn db - [version] + [version reuse-binary] (reify db/DB (setup! [_ test node] (c/su (do (info "Preparing directories") (prepare-dirs) - (if (not (cu/exists? binary-path)) + (if (or (not (cu/exists? binary-path)) (not reuse-binary)) (do (info "Downloading clickhouse") (install-downloaded-clickhouse (download-clickhouse version))) (info "Binary already exsist on path" binary-path "skipping download")) @@ -113,7 +113,8 @@ (info node "Tearing down clickhouse") (kill-clickhouse! 
node test) (c/su - (c/exec :rm :-rf binary-path) + (if (not reuse-binary) + (c/exec :rm :-rf binary-path)) (c/exec :rm :-rf pid-file-path) (c/exec :rm :-rf data-dir) (c/exec :rm :-rf logs-dir) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index f3db61c6d53..45a1f442d24 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -71,6 +71,7 @@ :parse-fn parse-long :validate [pos? "Must be a positive integer."]] [nil, "--lightweight-run" "Subset of workloads/nemesises which is simple to validate"] + [nil, "--reuse-binary" "Use already downloaded binary if it exists, don't remove it on shutdown"] ["-c" "--clickhouse-source URL" "URL for clickhouse deb or tgz package" :default "https://clickhouse-builds.s3.yandex.net/21677/ef82333089156907a0979669d9374c2e18daabe5/clickhouse_build_check/clang-11_relwithdebuginfo_none_bundled_unsplitted_disable_False_deb/clickhouse-common-static_21.4.1.6313_amd64.deb"]]) @@ -86,7 +87,7 @@ opts {:name (str "clickhouse-keeper-quorum=" quorum "-" (name (:workload opts)) "-" (name (:nemesis opts))) :os ubuntu/os - :db (db (:clickhouse-source opts)) + :db (db (:clickhouse-source opts) (boolean (:reuse-binary opts))) :pure-generators true :client (:client workload) :nemesis (:nemesis current-nemesis) From 9079dd2027bba8fb9cd3c77bd2f075a4d79d9dbb Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 23 Mar 2021 17:05:59 +0300 Subject: [PATCH 100/260] add test --- ...o_table_overlapping_block_number.reference | 4 ++++ ...move_to_table_overlapping_block_number.sql | 20 +++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 tests/queries/0_stateless/01765_move_to_table_overlapping_block_number.reference create mode 100644 tests/queries/0_stateless/01765_move_to_table_overlapping_block_number.sql diff --git a/tests/queries/0_stateless/01765_move_to_table_overlapping_block_number.reference b/tests/queries/0_stateless/01765_move_to_table_overlapping_block_number.reference new file mode 100644 index 00000000000..a07ed155918 --- /dev/null +++ b/tests/queries/0_stateless/01765_move_to_table_overlapping_block_number.reference @@ -0,0 +1,4 @@ +1 1 1_1_1_0 +1 2 1_2_2_0 +1 3 1_3_3_0 +1 4 1_4_4_0 diff --git a/tests/queries/0_stateless/01765_move_to_table_overlapping_block_number.sql b/tests/queries/0_stateless/01765_move_to_table_overlapping_block_number.sql new file mode 100644 index 00000000000..ea00c573c74 --- /dev/null +++ b/tests/queries/0_stateless/01765_move_to_table_overlapping_block_number.sql @@ -0,0 +1,20 @@ +DROP TABLE IF EXISTS t_src; +DROP TABLE IF EXISTS t_dst; + +CREATE TABLE t_src (id UInt32, v UInt32) ENGINE = MergeTree ORDER BY id PARTITION BY id; +CREATE TABLE t_dst (id UInt32, v UInt32) ENGINE = MergeTree ORDER BY id PARTITION BY id; + +SYSTEM STOP MERGES t_src; +SYSTEM STOP MERGES t_dst; + +INSERT INTO t_dst VALUES (1, 1); +INSERT INTO t_dst VALUES (1, 2); +INSERT INTO t_dst VALUES (1, 3); + +INSERT INTO t_src VALUES (1, 4); + +ALTER TABLE t_src MOVE PARTITION 1 TO TABLE t_dst; +SELECT *, _part FROM t_dst ORDER BY v; + +DROP TABLE t_src; +DROP TABLE t_dst; From 0c70fe1a6d3a5f4db78a97f445bbac72824c6cff Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Tue, 23 Mar 2021 17:38:58 +0300 Subject: [PATCH 101/260] fix field get --- src/Processors/Transforms/WindowTransform.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Processors/Transforms/WindowTransform.cpp 
b/src/Processors/Transforms/WindowTransform.cpp index 3a97698453a..121b9c818e1 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -117,9 +117,7 @@ static int compareValuesWithOffsetFloat(const IColumn * _compared_column, _compared_column); const auto * reference_column = assert_cast( _reference_column); - // The underlying field type is Float64 for Float32 as well. get() - // would be a reinterpret_cast and yield an incorrect result. - const auto offset = _offset.get(); + const auto offset = _offset.get(); const auto compared_value_data = compared_column->getDataAt(compared_row); assert(compared_value_data.size == sizeof(typename ColumnType::ValueType)); From bde02c72f3f1b082f9337a5aed88b5a50570979f Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 23 Mar 2021 18:14:22 +0300 Subject: [PATCH 102/260] Fixed typos --- src/Dictionaries/SSDCacheDictionaryStorage.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Dictionaries/SSDCacheDictionaryStorage.h b/src/Dictionaries/SSDCacheDictionaryStorage.h index baac725e184..67f0465a2c7 100644 --- a/src/Dictionaries/SSDCacheDictionaryStorage.h +++ b/src/Dictionaries/SSDCacheDictionaryStorage.h @@ -795,7 +795,7 @@ private: }; /** ICacheDictionaryStorage implementation that keeps column data serialized in memory index and in disk partitions. - * Data is first writen in memory buffer. + * Data is first written in memory buffer. * If memory buffer is full then buffer is flushed to disk partition. * If memory buffer cannot be flushed to associated disk partition, then if partition * can be allocated (current partition index < max_partitions_size) storage allocates new partition, if not old partitions are reused. From 3813515a3ac82faa1cd6fb5e3fefd10cb1362180 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 23 Mar 2021 18:35:13 +0300 Subject: [PATCH 103/260] Darwin cmake disable memcpy benchmark --- utils/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt index d38b34f3419..ee9c7c8eb93 100644 --- a/utils/CMakeLists.txt +++ b/utils/CMakeLists.txt @@ -32,7 +32,10 @@ if (NOT DEFINED ENABLE_UTILS OR ENABLE_UTILS) add_subdirectory (db-generator) add_subdirectory (wal-dump) add_subdirectory (check-mysql-binlog) + + if (NOT OS_DARWIN) add_subdirectory (memcpy-bench) + endif () endif () if (ENABLE_CODE_QUALITY) From 4716791a1abf6522c9628d429497229479fba568 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 23 Mar 2021 19:06:13 +0300 Subject: [PATCH 104/260] Better README.md --- tests/jepsen.nukeeper/README.md | 137 +++++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 2 deletions(-) diff --git a/tests/jepsen.nukeeper/README.md b/tests/jepsen.nukeeper/README.md index f72409e080f..6bcd7a37069 100644 --- a/tests/jepsen.nukeeper/README.md +++ b/tests/jepsen.nukeeper/README.md @@ -1,10 +1,143 @@ # jepsen.nukeeper -A Clojure library designed to ... well, that part is up to you. +A Clojure library designed to test ZooKeeper-like implementation inside ClickHouse. + +## Test scenarios (workloads) + +### CAS register + +CAS Register has three operations: read number, write number, compare-and-swap number. This register is simulated as a single ZooKeeper node. Read transforms to ZooKeeper's `getData` request. Write transforms to the `set` request. Compare-and-swap implemented via `getData` + compare in code + `set` new value with `version` from `getData`. 
+
+In this test, we use a linearizable checker, so Jepsen validates that the history was linearizable. One of the heaviest workloads.
+
+Strictly requires `quorum_reads` to be true.
+
+### Set
+
+Set has two operations: add a number to the set and read all values from the set. This workload is simulated on a single ZooKeeper node with a string value that represents a Clojure set data structure. The add operation is very similar to compare-and-swap. We read the string value from the ZooKeeper node with `getData`, parse it into a Clojure set, add the new value to the set and try to write it back with the received version.
+
+In this test, Jepsen validates that all successfully added values can be read. The generator for this workload performs only add operations until a timeout and after that tries to read the set once.
+
+### Unique IDs
+
+In the Unique IDs workload we have only one operation: generate a new unique number. It's implemented using ZooKeeper's sequential nodes. For each generate request the client just creates a new sequential node in ZooKeeper with a fixed prefix. After that it cuts the prefix off from the returned path and parses the number from the remaining part.
+
+Jepsen checks that all returned IDs were unique.
+
+### Counter
+
+Counter workload has two operations: read the counter value and add some number to the counter. Its implementation is quite weird. We add the number `N` to the counter by creating `N` sequential nodes in a single ZooKeeper transaction. Counter read is implemented as a `getChildren` ZooKeeper request and a count of all returned nodes.
+
+Jepsen checks that the counter value lies in the interval of possible values. Strictly requires `quorum_reads` to be true.
+
+### Total queue
+
+Simulates an unordered queue with three operations: enqueue number, dequeue, and drain. Enqueue operation uses a `create` request with the node name equal to the number. `Dequeue` operation is more interesting. We list (`getChildren`) all nodes and remember the parent node version. After that we choose the smallest one and prepare the transaction: `check` parent node version + set an empty value to the parent node + delete the smallest child node. Drain operation is just `getChildren` on the parent path.
+
+Jepsen checks that all enqueued values were dequeued or drained. Duplicates are allowed because Jepsen doesn't know the value of the unknown-status (`:info`) dequeue operation. So when we try to `dequeue` some element we should return it even if our delete transaction failed with a `Connection loss` error.
+
+### Linear queue
+
+Same as the total queue, but without the drain operation. Checks linearizability between enqueue and dequeue. Sometimes consumes more than 10GB during validation even for very short histories.
+
+
+## Nemesis
+
+We use almost all standard nemeses with small changes for our storage.
+
+### Random node killer (random-node-killer)
+
+Sleeps 5 seconds, kills a random node, sleeps for 5 seconds, and starts it back.
+
+### All nodes killer (all-nodes-killer)
+
+Kills all nodes at once, sleeps for 5 seconds, and starts them back.
+
+### Simple partitioner (simple-partitioner)
+
+Partitions one node from the others using iptables. No one can see the victim and the victim cannot see anybody.
+
+### Random node stop (random-node-hammer-time)
+
+Sends `SIGSTOP` to a random node. Sleeps 5 seconds. Sends `SIGCONT`.
+
+### All nodes stop (all-nodes-hammer-time)
+
+Sends `SIGSTOP` to all nodes. Sleeps 5 seconds. Sends `SIGCONT`.
+
+### Logs corruptor (logs-corruptor)
+
+Corrupts the latest log (changes one random byte) in `clickhouse_path/coordination/logs`. Restarts nodes.
+
+### Snapshots corruptor (snapshots-corruptor)
+
+Corrupts the latest snapshot (changes one random byte) in `clickhouse_path/coordination/snapshots`. Restarts nodes.
+
+### Logs and snapshots corruptor (logs-and-snapshots-corruptor)
+
+Corrupts both the latest log and snapshot. Restarts node.
+
+### Drop data corruptor (drop-data-corruptor)
+
+Drops all data from `clickhouse_path/coordinator`. Restarts node.
+
+### Bridge partitioner (bridge-partitioner)
+
+Two nodes don't see each other but both can see a third node. That node can see both of them.
+
+### Blind node partitioner (blind-node-partitioner)
+
+One of the nodes cannot see the others, but they can see it.
+
+### Blind others partitioner (blind-others-partitioner)
+
+Two nodes don't see one node but it can see both of them.

 ## Usage

-FIXME
+### Dependencies
+
+- leiningen (https://leiningen.org/)
+- clojure (https://clojure.org/)
+- jvm
+
+### Options for `lein run`
+
+- `test` Run a single test.
+- `test-all` Run all available tests from the test set.
+- `-w (--workload)` One of the workloads. Option for a single `test`.
+- `--nemesis` One of the nemeses. Option for a single `test`.
+- `-q (--quorum)` Run the test with quorum reads.
+- `-r (--rate)` How many operations per second Jepsen will generate in a single thread.
+- `-s (--snapshot-distance)` ClickHouse Keeper setting. How often we will create a new snapshot.
+- `--stale-log-gap` ClickHouse Keeper setting. A leader will send a snapshot instead of logs to a node if its committed index is less than the leader's index minus this setting value.
+- `--reserved-log-items` ClickHouse Keeper setting. How many log items to keep after the snapshot.
+- `--ops-per-key` Option for the CAS register workload. Total ops that will be generated for a single register.
+- `--lightweight-run` Run some lightweight tests without linearizability checks. Option for a `test-all` run.
+- `--reuse-binary` Don't download the clickhouse binary if it already exists on the node.
+- `--clickhouse-source` URL to the clickhouse `.deb`, `.tgz` or binary.
+- `--time-limit` (in seconds) How long Jepsen will generate new operations.
+- `--nodes-file` File with nodes for SSH. Newline separated.
+- `--username` SSH username for nodes.
+- `--password` SSH password for nodes.
+- `--concurrency` How many threads Jepsen will use for concurrent requests.
+- `--test-count` How many times to run a single test or how many tests to run from the test set.
+
+
+### Examples:
+
+1. Run the `Set` workload with `logs-and-snapshots-corruptor` ten times:
+
+```sh
+$ lein run test --nodes-file nodes.txt --username root --password '' --time-limit 30 --concurrency 50 -r 50 --workload set --nemesis logs-and-snapshots-corruptor --clickhouse-source 'https://clickhouse-builds.s3.yandex.net/someurl/clickhouse-common-static_21.4.1.6321_amd64.deb' -q --test-count 10 --reuse-binary
+```
+
+2.
Run ten random tests from `lightweight-run` with some custom Keeper settings: + +``` sh +$ lein run test-all --nodes-file nodes.txt --username root --password '' --time-limit 30 --concurrency 50 -r 50 --snapshot-distance 100 --stale-log-gap 100 --reserved-log-items 10 --lightweight-run --clickhouse-source 'someurl' -q --reuse-binary --test-count 10 +``` + ## License From ba6ccbab42fd19da1322f8443de737fc8ef08edc Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 23 Mar 2021 19:07:41 +0300 Subject: [PATCH 105/260] Fix header --- tests/jepsen.nukeeper/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/jepsen.nukeeper/README.md b/tests/jepsen.nukeeper/README.md index 6bcd7a37069..8f3754b8f7b 100644 --- a/tests/jepsen.nukeeper/README.md +++ b/tests/jepsen.nukeeper/README.md @@ -1,4 +1,4 @@ -# jepsen.nukeeper +# Jepsen tests ClickHouse Keeper A Clojure library designed to test ZooKeeper-like implementation inside ClickHouse. From 4c2278c215412b36f62242d5b995e7be8fe54847 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov <36882414+akuzm@users.noreply.github.com> Date: Tue, 23 Mar 2021 19:39:55 +0300 Subject: [PATCH 106/260] Merge pull request #22047 from ClickHouse/aku/llvm-cloudflare-21.3 21.3: quick fix for broken resolution of apt.llvm.org on Yandex infra (cherry picked from commit 12f5753e5f6e1d8be58c815884e70ad576601f65) --- docker/packager/binary/Dockerfile | 23 +++++++++++++++++++---- docker/test/base/Dockerfile | 6 ++++-- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/docker/packager/binary/Dockerfile b/docker/packager/binary/Dockerfile index 74de1a3e9bd..91036d88d8c 100644 --- a/docker/packager/binary/Dockerfile +++ b/docker/packager/binary/Dockerfile @@ -4,14 +4,26 @@ FROM ubuntu:20.04 ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=11 RUN apt-get update \ - && apt-get install ca-certificates lsb-release wget gnupg apt-transport-https \ + && apt-get install \ + apt-transport-https \ + apt-utils \ + ca-certificates \ + dnsutils \ + gnupg \ + iputils-ping \ + lsb-release \ + wget \ --yes --no-install-recommends --verbose-versions \ + && cat /etc/resolv.conf \ + && echo "nameserver 1.1.1.1" >> /etc/resolv.conf \ + && nslookup -debug apt.llvm.org \ + && ping -c1 apt.llvm.org \ + && wget -nv --retry-connrefused --tries=10 -O /tmp/llvm-snapshot.gpg.key https://apt.llvm.org/llvm-snapshot.gpg.key \ && export LLVM_PUBKEY_HASH="bda960a8da687a275a2078d43c111d66b1c6a893a3275271beedf266c1ff4a0cdecb429c7a5cccf9f486ea7aa43fd27f" \ - && wget -nv -O /tmp/llvm-snapshot.gpg.key https://apt.llvm.org/llvm-snapshot.gpg.key \ && echo "${LLVM_PUBKEY_HASH} /tmp/llvm-snapshot.gpg.key" | sha384sum -c \ && apt-key add /tmp/llvm-snapshot.gpg.key \ && export CODENAME="$(lsb_release --codename --short | tr 'A-Z' 'a-z')" \ - && echo "deb [trusted=yes] http://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-${LLVM_VERSION} main" >> \ + && echo "deb [trusted=yes] https://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-${LLVM_VERSION} main" >> \ /etc/apt/sources.list # initial packages @@ -24,7 +36,10 @@ RUN apt-get update \ software-properties-common \ --yes --no-install-recommends -RUN apt-get update \ +RUN cat /etc/resolv.conf \ + && echo "nameserver 1.1.1.1" >> /etc/resolv.conf \ + && nslookup -debug apt.llvm.org \ + && apt-get update \ && apt-get install \ bash \ cmake \ diff --git a/docker/test/base/Dockerfile b/docker/test/base/Dockerfile index e8653c2122e..158d2608f41 100644 --- a/docker/test/base/Dockerfile +++ b/docker/test/base/Dockerfile @@ -4,8 +4,9 @@ 
FROM ubuntu:20.04 ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=11 RUN apt-get update \ - && apt-get install ca-certificates lsb-release wget gnupg apt-transport-https \ + && apt-get install apt-utils ca-certificates lsb-release wget gnupg apt-transport-https \ --yes --no-install-recommends --verbose-versions \ + && echo "nameserver 1.1.1.1" >> /etc/resolv.conf \ && export LLVM_PUBKEY_HASH="bda960a8da687a275a2078d43c111d66b1c6a893a3275271beedf266c1ff4a0cdecb429c7a5cccf9f486ea7aa43fd27f" \ && wget -nv -O /tmp/llvm-snapshot.gpg.key https://apt.llvm.org/llvm-snapshot.gpg.key \ && echo "${LLVM_PUBKEY_HASH} /tmp/llvm-snapshot.gpg.key" | sha384sum -c \ @@ -31,7 +32,8 @@ RUN curl -O https://clickhouse-builds.s3.yandex.net/utils/1/dpkg-deb \ && chmod +x dpkg-deb \ && cp dpkg-deb /usr/bin -RUN apt-get update \ +RUN echo "nameserver 1.1.1.1" >> /etc/resolv.conf \ + && apt-get update \ && apt-get install \ clang-${LLVM_VERSION} \ debhelper \ From d3c8a4ad32210fb83b8e5b51b546b63a6443becb Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Tue, 23 Mar 2021 20:02:16 +0300 Subject: [PATCH 107/260] Update CMakeLists.txt --- utils/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt index ee9c7c8eb93..d534fd8fd4f 100644 --- a/utils/CMakeLists.txt +++ b/utils/CMakeLists.txt @@ -34,7 +34,7 @@ if (NOT DEFINED ENABLE_UTILS OR ENABLE_UTILS) add_subdirectory (check-mysql-binlog) if (NOT OS_DARWIN) - add_subdirectory (memcpy-bench) + add_subdirectory (memcpy-bench) endif () endif () From 1f4df07e08fadddf8f3f2ef205691460bd191622 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 23 Mar 2021 20:58:30 +0300 Subject: [PATCH 108/260] Update used version of simdjson to 0.9.1 --- .gitmodules | 2 +- contrib/simdjson | 2 +- src/Functions/SimdJSONParser.h | 20 ++++++++++---------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.gitmodules b/.gitmodules index 7a2c5600e65..f9bc8a56a5c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -93,7 +93,7 @@ url = https://github.com/ClickHouse-Extras/libunwind.git [submodule "contrib/simdjson"] path = contrib/simdjson - url = https://github.com/ClickHouse-Extras/simdjson.git + url = https://github.com/simdjson/simdjson.git [submodule "contrib/rapidjson"] path = contrib/rapidjson url = https://github.com/ClickHouse-Extras/rapidjson diff --git a/contrib/simdjson b/contrib/simdjson index 3190d66a490..95b4870e20b 160000 --- a/contrib/simdjson +++ b/contrib/simdjson @@ -1 +1 @@ -Subproject commit 3190d66a49059092a1753dc35595923debfc1698 +Subproject commit 95b4870e20be5f97d9dcf63b23b1c6f520c366c1 diff --git a/src/Functions/SimdJSONParser.h b/src/Functions/SimdJSONParser.h index a9adfa27e2c..7ff3c45130d 100644 --- a/src/Functions/SimdJSONParser.h +++ b/src/Functions/SimdJSONParser.h @@ -42,11 +42,11 @@ struct SimdJSONParser ALWAYS_INLINE bool isBool() const { return element.type() == simdjson::dom::element_type::BOOL; } ALWAYS_INLINE bool isNull() const { return element.type() == simdjson::dom::element_type::NULL_VALUE; } - ALWAYS_INLINE Int64 getInt64() const { return element.get_int64().first; } - ALWAYS_INLINE UInt64 getUInt64() const { return element.get_uint64().first; } - ALWAYS_INLINE double getDouble() const { return element.get_double().first; } - ALWAYS_INLINE bool getBool() const { return element.get_bool().first; } - ALWAYS_INLINE std::string_view getString() const { return element.get_string().first; } + ALWAYS_INLINE Int64 getInt64() const { return element.get_int64().value_unsafe(); 
} + ALWAYS_INLINE UInt64 getUInt64() const { return element.get_uint64().value_unsafe(); } + ALWAYS_INLINE double getDouble() const { return element.get_double().value_unsafe(); } + ALWAYS_INLINE bool getBool() const { return element.get_bool().value_unsafe(); } + ALWAYS_INLINE std::string_view getString() const { return element.get_string().value_unsafe(); } ALWAYS_INLINE Array getArray() const; ALWAYS_INLINE Object getObject() const; @@ -75,7 +75,7 @@ struct SimdJSONParser ALWAYS_INLINE Iterator begin() const { return array.begin(); } ALWAYS_INLINE Iterator end() const { return array.end(); } ALWAYS_INLINE size_t size() const { return array.size(); } - ALWAYS_INLINE Element operator[](size_t index) const { assert(index < size()); return array.at(index).first; } + ALWAYS_INLINE Element operator[](size_t index) const { assert(index < size()); return array.at(index).value_unsafe(); } private: simdjson::dom::array array; @@ -111,7 +111,7 @@ struct SimdJSONParser if (x.error()) return false; - result = x.first; + result = x.value_unsafe(); return true; } @@ -137,7 +137,7 @@ struct SimdJSONParser if (document.error()) return false; - result = document.first; + result = document.value_unsafe(); return true; } @@ -155,12 +155,12 @@ private: inline ALWAYS_INLINE SimdJSONParser::Array SimdJSONParser::Element::getArray() const { - return element.get_array().first; + return element.get_array().value_unsafe(); } inline ALWAYS_INLINE SimdJSONParser::Object SimdJSONParser::Element::getObject() const { - return element.get_object().first; + return element.get_object().value_unsafe(); } } From 912144307d974afa5ecbf81e671f6adfba9aa231 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 23 Mar 2021 23:18:02 +0300 Subject: [PATCH 109/260] Fix type of the ErrorCodes --- src/Common/ErrorCodes.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 586c0fbde4d..30714cb82ae 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -560,7 +560,7 @@ namespace DB { namespace ErrorCodes { -#define M(VALUE, NAME) extern const Value NAME = VALUE; +#define M(VALUE, NAME) extern const ErrorCode NAME = VALUE; APPLY_FOR_ERROR_CODES(M) #undef M From 7154b36a2da6cefac708ed2b0ebc67615ff9ead9 Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Wed, 24 Mar 2021 00:08:07 +0300 Subject: [PATCH 110/260] Add ORC output format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Задокументировал вывод данных в ORC формате. --- docs/en/interfaces/formats.md | 51 ++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index ee2235b7861..940fc8cd636 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -50,7 +50,7 @@ The supported formats are: | [Parquet](#data-format-parquet) | ✔ | ✔ | | [Arrow](#data-format-arrow) | ✔ | ✔ | | [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ | -| [ORC](#data-format-orc) | ✔ | ✗ | +| [ORC](#data-format-orc) | ✔ | ✔ | | [RowBinary](#rowbinary) | ✔ | ✔ | | [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ | | [Native](#native) | ✔ | ✔ | @@ -1284,36 +1284,37 @@ To exchange data with Hadoop, you can use [HDFS table engine](../engines/table-e ## ORC {#data-format-orc} -[Apache ORC](https://orc.apache.org/) is a columnar storage format widespread in the Hadoop ecosystem. You can only insert data in this format to ClickHouse. 
+[Apache ORC](https://orc.apache.org/) is a columnar storage format widespread in the [Hadoop](https://hadoop.apache.org/) ecosystem. ### Data Types Matching {#data_types-matching-3} -The table below shows supported data types and how they match ClickHouse [data types](../sql-reference/data-types/index.md) in `INSERT` queries. +The table below shows supported data types and how they match ClickHouse [data types](../sql-reference/data-types/index.md). -| ORC data type (`INSERT`) | ClickHouse data type | -|--------------------------|-----------------------------------------------------| -| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | -| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | -| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | -| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | -| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | -| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | -| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | -| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | -| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | -| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | -| `DATE32` | [Date](../sql-reference/data-types/date.md) | -| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | -| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | -| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | +| ORC data type (`INSERT`) | ClickHouse data type | ORC data type (`SELECT`) | +|--------------------------|-----------------------------------------------------|--------------------------| +| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | `UINT8` | +| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | `INT8` | +| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | `UINT16` | +| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | `INT16` | +| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | `UINT32` | +| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | `INT32` | +| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | `UINT64` | +| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | `INT64` | +| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` | +| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` | +| `DATE32` | [Date](../sql-reference/data-types/date.md) | `DATE32` | +| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `TIMESTAMP` | +| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` | +| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` | +| `-` | [Array](../sql-reference/data-types/array.md) | `LIST` | -ClickHouse supports configurable precision of the `Decimal` type. The `INSERT` query treats the ORC `DECIMAL` type as the ClickHouse `Decimal128` type. +ClickHouse supports configurable precision of the `Decimal` type. The `INSERT` or `SELECT` query treats the ORC `DECIMAL` type as the ClickHouse `Decimal128` type. -Unsupported ORC data types: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. +Unsupported ORC data types: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. The data types of ClickHouse table columns don’t have to match the corresponding ORC data fields. 
When inserting data, ClickHouse interprets data types according to the table above and then [casts](../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) the data to the data type set for the ClickHouse table column. -### Inserting Data {#inserting-data-2} +### Inserting and Selecting Data {#inserting-and-selecting-data-1} You can insert ORC data from a file into ClickHouse table by the following command: @@ -1321,6 +1322,12 @@ You can insert ORC data from a file into ClickHouse table by the following comma $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` +You can select data from a ClickHouse table and save them into some file in the ORC format by the following command: + +``` bash +$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT ORC" > {filename.orc} +``` + To exchange data with Hadoop, you can use [HDFS table engine](../engines/table-engines/integrations/hdfs.md). ## LineAsString {#lineasstring} From 767eba04f99a19cdcd933a73c3f3decee5a1a63c Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Wed, 24 Mar 2021 00:22:38 +0300 Subject: [PATCH 111/260] Update ORC format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Поправил якоря. --- docs/en/interfaces/formats.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 940fc8cd636..0d582fab12b 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -1288,7 +1288,7 @@ To exchange data with Hadoop, you can use [HDFS table engine](../engines/table-e ### Data Types Matching {#data_types-matching-3} -The table below shows supported data types and how they match ClickHouse [data types](../sql-reference/data-types/index.md). +The table below shows supported data types and how they match ClickHouse [data types](../sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries. | ORC data type (`INSERT`) | ClickHouse data type | ORC data type (`SELECT`) | |--------------------------|-----------------------------------------------------|--------------------------| @@ -1314,7 +1314,7 @@ Unsupported ORC data types: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM The data types of ClickHouse table columns don’t have to match the corresponding ORC data fields. When inserting data, ClickHouse interprets data types according to the table above and then [casts](../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) the data to the data type set for the ClickHouse table column. 
-### Inserting and Selecting Data {#inserting-and-selecting-data-1} +### Inserting Data {#inserting-data-2} You can insert ORC data from a file into ClickHouse table by the following command: @@ -1322,6 +1322,8 @@ You can insert ORC data from a file into ClickHouse table by the following comma $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` +### Selecting Data {#selecting-data-2} + You can select data from a ClickHouse table and save them into some file in the ORC format by the following command: ``` bash From f2ef536dfb9e7d6b7fd9b6cd95f75293840d9729 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Wed, 24 Mar 2021 01:40:27 +0300 Subject: [PATCH 112/260] fix formatting --- src/Parsers/ASTSelectQuery.cpp | 2 +- src/Parsers/ASTWindowDefinition.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Parsers/ASTSelectQuery.cpp b/src/Parsers/ASTSelectQuery.cpp index aa5508bf190..89ef08e0289 100644 --- a/src/Parsers/ASTSelectQuery.cpp +++ b/src/Parsers/ASTSelectQuery.cpp @@ -137,7 +137,7 @@ void ASTSelectQuery::formatImpl(const FormatSettings & s, FormatState & state, F if (window()) { s.ostr << (s.hilite ? hilite_keyword : "") << s.nl_or_ws << indent_str << - "WINDOW " << (s.hilite ? hilite_none : ""); + "WINDOW" << (s.hilite ? hilite_none : ""); window()->formatImpl(s, state, frame); } diff --git a/src/Parsers/ASTWindowDefinition.cpp b/src/Parsers/ASTWindowDefinition.cpp index ff08bda65ed..a645960bd0a 100644 --- a/src/Parsers/ASTWindowDefinition.cpp +++ b/src/Parsers/ASTWindowDefinition.cpp @@ -37,7 +37,7 @@ void ASTWindowDefinition::formatImpl(const FormatSettings & settings, { if (partition_by) { - settings.ostr << "PARTITION BY "; + settings.ostr << "PARTITION BY"; partition_by->formatImpl(settings, state, format_frame); } @@ -48,7 +48,7 @@ void ASTWindowDefinition::formatImpl(const FormatSettings & settings, if (order_by) { - settings.ostr << "ORDER BY "; + settings.ostr << "ORDER BY"; order_by->formatImpl(settings, state, format_frame); } From e0d1f6d80fc3b727acc3b3e336da48bfb005fa61 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Wed, 24 Mar 2021 01:52:16 +0300 Subject: [PATCH 113/260] fixes --- src/Processors/Transforms/WindowTransform.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 121b9c818e1..3ab16d0d1b4 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -49,6 +49,7 @@ static int compareValuesWithOffset(const IColumn * _compared_column, const auto * reference_column = assert_cast( _reference_column); const auto offset = _offset.get(); + assert(offset >= 0); const auto compared_value_data = compared_column->getDataAt(compared_row); assert(compared_value_data.size == sizeof(typename ColumnType::ValueType)); @@ -117,7 +118,8 @@ static int compareValuesWithOffsetFloat(const IColumn * _compared_column, _compared_column); const auto * reference_column = assert_cast( _reference_column); - const auto offset = _offset.get(); + const auto offset = _offset.get(); + assert(offset >= 0); const auto compared_value_data = compared_column->getDataAt(compared_row); assert(compared_value_data.size == sizeof(typename ColumnType::ValueType)); @@ -1403,6 +1405,7 @@ struct WindowFunctionRowNumber final : public WindowFunction } }; +// ClickHouse-specific variant of lag/lead that respects the window frame. 
template struct WindowFunctionLagLeadInFrame final : public WindowFunction { From 612d4fb073e37c4c6b89fb63e5ac864945b43959 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Wed, 24 Mar 2021 02:03:14 +0300 Subject: [PATCH 114/260] Update IMergeTreeDataPart.cpp --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 50a3169de0e..453edcdbbcd 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -359,7 +359,7 @@ std::pair IMergeTreeDataPart::getMinMaxTime() const /// The case of DateTime64 else if (hyperrectangle.left.getType() == Field::Types::Decimal64) { - assert(hyperrectangle.right.getType() == Field::Types::UInt64); + assert(hyperrectangle.right.getType() == Field::Types::Decimal64); auto left = hyperrectangle.left.get>(); auto right = hyperrectangle.right.get>(); From 6bac215fac352dce59b29cc34cb2ab376cffdc6d Mon Sep 17 00:00:00 2001 From: filimonov <1549571+filimonov@users.noreply.github.com> Date: Wed, 24 Mar 2021 00:04:54 +0100 Subject: [PATCH 115/260] Update column.md --- docs/en/sql-reference/statements/alter/column.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/alter/column.md b/docs/en/sql-reference/statements/alter/column.md index 030f1dd92dd..e39ca8dcaf2 100644 --- a/docs/en/sql-reference/statements/alter/column.md +++ b/docs/en/sql-reference/statements/alter/column.md @@ -144,7 +144,7 @@ This query changes the `name` column properties: - TTL - For examples of columns TTL modifying, see [Column TTL](../../engines/table_engines/mergetree_family/mergetree.md#mergetree-column-ttl). +For examples of columns TTL modifying, see [Column TTL](../../engines/table_engines/mergetree_family/mergetree.md#mergetree-column-ttl). If the `IF EXISTS` clause is specified, the query won’t return an error if the column doesn’t exist. 
From 4de5afed4eb15e1fc2f57721df0e464557410a43 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 24 Mar 2021 02:07:13 +0300 Subject: [PATCH 116/260] Whitespaces --- .../FunctionDateOrDateTimeAddInterval.h | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/src/Functions/FunctionDateOrDateTimeAddInterval.h b/src/Functions/FunctionDateOrDateTimeAddInterval.h index 2b0082f4334..95b9e25ead1 100644 --- a/src/Functions/FunctionDateOrDateTimeAddInterval.h +++ b/src/Functions/FunctionDateOrDateTimeAddInterval.h @@ -40,7 +40,8 @@ struct AddSecondsImpl { static constexpr auto name = "addSeconds"; - static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl &) + static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents + execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl &) { return {t.whole + delta, t.fractional}; } @@ -60,7 +61,8 @@ struct AddMinutesImpl { static constexpr auto name = "addMinutes"; - static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl &) + static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents + execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl &) { return {t.whole + delta * 60, t.fractional}; } @@ -80,7 +82,8 @@ struct AddHoursImpl { static constexpr auto name = "addHours"; - static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl &) + static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents + execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl &) { return {t.whole + delta * 3600, t.fractional}; } @@ -99,7 +102,8 @@ struct AddDaysImpl { static constexpr auto name = "addDays"; - static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl & time_zone) + static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents + execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl & time_zone) { return {time_zone.addDays(t.whole, delta), t.fractional}; } @@ -119,7 +123,8 @@ struct AddWeeksImpl { static constexpr auto name = "addWeeks"; - static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl & time_zone) + static inline NO_SANITIZE_UNDEFINED DecimalUtils::DecimalComponents + execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl & time_zone) { return {time_zone.addWeeks(t.whole, delta), t.fractional}; } @@ -139,7 +144,8 @@ struct AddMonthsImpl { static constexpr auto name = "addMonths"; - static inline DecimalUtils::DecimalComponents execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl & time_zone) + static inline DecimalUtils::DecimalComponents + execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl & time_zone) { return {time_zone.addMonths(t.whole, delta), t.fractional}; } @@ -159,7 +165,8 @@ struct AddQuartersImpl { static constexpr auto name = "addQuarters"; - static inline DecimalUtils::DecimalComponents execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl & time_zone) + static inline DecimalUtils::DecimalComponents + execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl & time_zone) { return 
{time_zone.addQuarters(t.whole, delta), t.fractional}; } @@ -179,7 +186,8 @@ struct AddYearsImpl { static constexpr auto name = "addYears"; - static inline DecimalUtils::DecimalComponents execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl & time_zone) + static inline DecimalUtils::DecimalComponents + execute(DecimalUtils::DecimalComponents t, Int64 delta, const DateLUTImpl & time_zone) { return {time_zone.addYears(t.whole, delta), t.fractional}; } @@ -265,14 +273,16 @@ struct Adder private: template - NO_INLINE NO_SANITIZE_UNDEFINED void vectorVector(const FromVectorType & vec_from, ToVectorType & vec_to, const DeltaColumnType & delta, const DateLUTImpl & time_zone, size_t size) const + NO_INLINE NO_SANITIZE_UNDEFINED void vectorVector( + const FromVectorType & vec_from, ToVectorType & vec_to, const DeltaColumnType & delta, const DateLUTImpl & time_zone, size_t size) const { for (size_t i = 0; i < size; ++i) vec_to[i] = transform.execute(vec_from[i], delta.getData()[i], time_zone); } template - NO_INLINE NO_SANITIZE_UNDEFINED void constantVector(const FromType & from, ToVectorType & vec_to, const DeltaColumnType & delta, const DateLUTImpl & time_zone, size_t size) const + NO_INLINE NO_SANITIZE_UNDEFINED void constantVector( + const FromType & from, ToVectorType & vec_to, const DeltaColumnType & delta, const DateLUTImpl & time_zone, size_t size) const { for (size_t i = 0; i < size; ++i) vec_to[i] = transform.execute(from, delta.getData()[i], time_zone); From 47b00ad11bc06449d6b59c594b0b439a4323251b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 24 Mar 2021 02:49:17 +0300 Subject: [PATCH 117/260] Fix UBSan report in TransformDateTime64 --- src/Functions/TransformDateTime64.h | 5 ++++- .../queries/0_stateless/01773_datetime64_add_ubsan.reference | 2 ++ tests/queries/0_stateless/01773_datetime64_add_ubsan.sql | 2 ++ 3 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/01773_datetime64_add_ubsan.reference create mode 100644 tests/queries/0_stateless/01773_datetime64_add_ubsan.sql diff --git a/src/Functions/TransformDateTime64.h b/src/Functions/TransformDateTime64.h index e42c3155327..4eab2a491c7 100644 --- a/src/Functions/TransformDateTime64.h +++ b/src/Functions/TransformDateTime64.h @@ -49,8 +49,11 @@ public: {} template - inline auto execute(const DateTime64 & t, Args && ... args) const + inline auto NO_SANITIZE_UNDEFINED execute(const DateTime64 & t, Args && ... args) const { + /// Type conversion from float to integer may be required. + /// We are Ok with implementation specific result for out of range and denormals conversion. + if constexpr (TransformHasExecuteOverload_v) { return wrapped_transform.execute(t, scale_multiplier, std::forward(args)...); diff --git a/tests/queries/0_stateless/01773_datetime64_add_ubsan.reference b/tests/queries/0_stateless/01773_datetime64_add_ubsan.reference new file mode 100644 index 00000000000..aa47d0d46d4 --- /dev/null +++ b/tests/queries/0_stateless/01773_datetime64_add_ubsan.reference @@ -0,0 +1,2 @@ +0 +0 diff --git a/tests/queries/0_stateless/01773_datetime64_add_ubsan.sql b/tests/queries/0_stateless/01773_datetime64_add_ubsan.sql new file mode 100644 index 00000000000..f7267f2b6b4 --- /dev/null +++ b/tests/queries/0_stateless/01773_datetime64_add_ubsan.sql @@ -0,0 +1,2 @@ +-- The result is unspecified but UBSan should not argue. 
+SELECT ignore(addHours(now64(3), inf)) FROM numbers(2); From eae268f2f0ce9935a0be0dd3d05e0d99897aeb00 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 24 Mar 2021 03:15:48 +0300 Subject: [PATCH 118/260] Allow to search tuple of NULLs in a set --- src/Interpreters/convertFieldToType.cpp | 5 +++++ tests/queries/0_stateless/01774_tuple_null_in.reference | 2 ++ tests/queries/0_stateless/01774_tuple_null_in.sql | 2 ++ 3 files changed, 9 insertions(+) create mode 100644 tests/queries/0_stateless/01774_tuple_null_in.reference create mode 100644 tests/queries/0_stateless/01774_tuple_null_in.sql diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index d47f64cb1dc..5d124add0df 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -377,6 +377,11 @@ Field convertFieldToType(const Field & from_value, const IDataType & to_type, co else if (const auto * nullable_type = typeid_cast(&to_type)) { const IDataType & nested_type = *nullable_type->getNestedType(); + + /// NULL remains NULL after any conversion. + if (WhichDataType(nested_type).isNothing()) + return {}; + if (from_type_hint && from_type_hint->equals(nested_type)) return from_value; return convertFieldToTypeImpl(from_value, nested_type, from_type_hint); diff --git a/tests/queries/0_stateless/01774_tuple_null_in.reference b/tests/queries/0_stateless/01774_tuple_null_in.reference new file mode 100644 index 00000000000..aa47d0d46d4 --- /dev/null +++ b/tests/queries/0_stateless/01774_tuple_null_in.reference @@ -0,0 +1,2 @@ +0 +0 diff --git a/tests/queries/0_stateless/01774_tuple_null_in.sql b/tests/queries/0_stateless/01774_tuple_null_in.sql new file mode 100644 index 00000000000..c9dad49e8ed --- /dev/null +++ b/tests/queries/0_stateless/01774_tuple_null_in.sql @@ -0,0 +1,2 @@ +SELECT (NULL, NULL) = (8, 0) OR (NULL, NULL) = (3, 2) OR (NULL, NULL) = (0, 0) OR (NULL, NULL) = (3, 1); +SELECT (NULL, NULL) IN ((NULL, 0), (3, 1), (3, 2), (8, 0)); From 3e74f56261ef0055553f69d014681c7f830400d6 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 24 Mar 2021 03:34:30 +0300 Subject: [PATCH 119/260] Fix missing check in decrypt for AEAD mode --- src/Functions/FunctionsAES.h | 18 ++++++++++++++---- .../01776_decrypt_aead_size_check.reference | 0 .../01776_decrypt_aead_size_check.sql | 1 + 3 files changed, 15 insertions(+), 4 deletions(-) create mode 100644 tests/queries/0_stateless/01776_decrypt_aead_size_check.reference create mode 100644 tests/queries/0_stateless/01776_decrypt_aead_size_check.sql diff --git a/src/Functions/FunctionsAES.h b/src/Functions/FunctionsAES.h index 132e94907f5..8af4a27ecc9 100644 --- a/src/Functions/FunctionsAES.h +++ b/src/Functions/FunctionsAES.h @@ -538,8 +538,9 @@ private: [[maybe_unused]] const auto block_size = static_cast(EVP_CIPHER_block_size(evp_cipher)); [[maybe_unused]] const auto iv_size = static_cast(EVP_CIPHER_iv_length(evp_cipher)); - const auto key_size = static_cast(EVP_CIPHER_key_length(evp_cipher)); - const auto tag_size = 16; // https://tools.ietf.org/html/rfc5116#section-5.1 + + const size_t key_size = static_cast(EVP_CIPHER_key_length(evp_cipher)); + static constexpr size_t tag_size = 16; // https://tools.ietf.org/html/rfc5116#section-5.1 auto decrypted_result_column = ColumnString::create(); auto & decrypted_result_column_data = decrypted_result_column->getChars(); @@ -549,9 +550,17 @@ private: size_t resulting_size = 0; for (size_t r = 0; r < input_rows_count; ++r) { - resulting_size += 
input_column->getDataAt(r).size + 1; + size_t string_size = input_column->getDataAt(r).size; + resulting_size += string_size + 1; /// With terminating zero. + if constexpr (mode == CipherMode::RFC5116_AEAD_AES_GCM) + { + if (string_size < tag_size) + throw Exception("Encrypted data is smaller than the size of additional data for AEAD mode, cannot decrypt.", + ErrorCodes::BAD_ARGUMENTS); + resulting_size -= tag_size; + } } #if defined(MEMORY_SANITIZER) @@ -565,6 +574,7 @@ private: decrypted_result_column_data.resize(resulting_size); #endif } + auto * decrypted = decrypted_result_column_data.data(); KeyHolder key_holder; @@ -631,7 +641,7 @@ private: // 1.a.2: Set AAD if present if (aad_column) { - const auto aad_data = aad_column->getDataAt(r); + StringRef aad_data = aad_column->getDataAt(r); int tmp_len = 0; if (aad_data.size != 0 && EVP_DecryptUpdate(evp_ctx, nullptr, &tmp_len, reinterpret_cast(aad_data.data), aad_data.size) != 1) diff --git a/tests/queries/0_stateless/01776_decrypt_aead_size_check.reference b/tests/queries/0_stateless/01776_decrypt_aead_size_check.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01776_decrypt_aead_size_check.sql b/tests/queries/0_stateless/01776_decrypt_aead_size_check.sql new file mode 100644 index 00000000000..8730ed0eda2 --- /dev/null +++ b/tests/queries/0_stateless/01776_decrypt_aead_size_check.sql @@ -0,0 +1 @@ +SELECT decrypt('aes-128-gcm', 'text', 'key', 'IV'); -- { serverError 36 } From 5dc9223288a5fe3f17a90faabded0c31e74178b7 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 24 Mar 2021 04:11:57 +0300 Subject: [PATCH 120/260] Fix Arcadia --- src/Functions/ya.make | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Functions/ya.make b/src/Functions/ya.make index 3ac64828b9c..aed2bd9b70d 100644 --- a/src/Functions/ya.make +++ b/src/Functions/ya.make @@ -467,6 +467,7 @@ SRCS( timeSlot.cpp timeSlots.cpp timezone.cpp + timezoneOf.cpp timezoneOffset.cpp toColumnTypeName.cpp toCustomWeek.cpp @@ -506,7 +507,7 @@ SRCS( toStartOfTenMinutes.cpp toStartOfYear.cpp toTime.cpp - toTimeZone.cpp + toTimezone.cpp toTypeName.cpp toUnixTimestamp64Micro.cpp toUnixTimestamp64Milli.cpp From 37948ac80a8ddc8f045721409598936757a2d3f2 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 24 Mar 2021 04:12:52 +0300 Subject: [PATCH 121/260] Fix style --- src/Functions/timezoneOf.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/Functions/timezoneOf.cpp b/src/Functions/timezoneOf.cpp index cdf686e276b..1d007a6e10e 100644 --- a/src/Functions/timezoneOf.cpp +++ b/src/Functions/timezoneOf.cpp @@ -9,6 +9,13 @@ namespace DB { + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + + namespace { From 7c07b43597c9bebd4853dea50ff484be3a13ee01 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 23 Mar 2021 23:03:08 +0300 Subject: [PATCH 122/260] Convert system.errors.stack_trace from String into Array(UInt64) This should decrease overhead for the errors collecting. 
--- docs/en/operations/system-tables/errors.md | 11 ++++++- src/Common/ErrorCodes.cpp | 8 ++--- src/Common/ErrorCodes.h | 12 +++++--- src/Common/Exception.cpp | 34 ++++++++++++++++++--- src/Common/Exception.h | 4 +++ src/Storages/System/StorageSystemErrors.cpp | 12 ++++++-- 6 files changed, 65 insertions(+), 16 deletions(-) diff --git a/docs/en/operations/system-tables/errors.md b/docs/en/operations/system-tables/errors.md index 72a537f15b9..583cce88ca4 100644 --- a/docs/en/operations/system-tables/errors.md +++ b/docs/en/operations/system-tables/errors.md @@ -9,7 +9,7 @@ Columns: - `value` ([UInt64](../../sql-reference/data-types/int-uint.md)) — the number of times this error has been happened. - `last_error_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — time when the last error happened. - `last_error_message` ([String](../../sql-reference/data-types/string.md)) — message for the last error. -- `last_error_stacktrace` ([String](../../sql-reference/data-types/string.md)) — stacktrace for the last error. +- `last_error_trace` ([Array(UInt64)](../../sql-reference/data-types/array.md)) — A [stack trace](https://en.wikipedia.org/wiki/Stack_trace) which represents a list of physical addresses where the called methods are stored. - `remote` ([UInt8](../../sql-reference/data-types/int-uint.md)) — remote exception (i.e. received during one of the distributed query). **Example** @@ -25,3 +25,12 @@ LIMIT 1 │ CANNOT_OPEN_FILE │ 76 │ 1 │ └──────────────────┴──────┴───────┘ ``` + +``` sql +WITH arrayMap(x -> demangle(addressToSymbol(x)), last_error_trace) AS all +SELECT name, arrayStringConcat(all, '\n') AS res +FROM system.errors +LIMIT 1 +SETTINGS allow_introspection_functions=1\G +``` + diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 30714cb82ae..918bc301754 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -587,7 +587,7 @@ namespace ErrorCodes ErrorCode end() { return END + 1; } - void increment(ErrorCode error_code, bool remote, const std::string & message, const std::string & stacktrace) + void increment(ErrorCode error_code, bool remote, const std::string & message, const FramePointers & trace) { if (error_code >= end()) { @@ -596,10 +596,10 @@ namespace ErrorCodes error_code = end() - 1; } - values[error_code].increment(remote, message, stacktrace); + values[error_code].increment(remote, message, trace); } - void ErrorPairHolder::increment(bool remote, const std::string & message, const std::string & stacktrace) + void ErrorPairHolder::increment(bool remote, const std::string & message, const FramePointers & trace) { const auto now = std::chrono::system_clock::now(); @@ -609,7 +609,7 @@ namespace ErrorCodes ++error.count; error.message = message; - error.stacktrace = stacktrace; + error.trace = trace; error.error_time_ms = std::chrono::duration_cast(now.time_since_epoch()).count(); } ErrorPair ErrorPairHolder::get() diff --git a/src/Common/ErrorCodes.h b/src/Common/ErrorCodes.h index edb9be9e0c0..ffd0b8b8619 100644 --- a/src/Common/ErrorCodes.h +++ b/src/Common/ErrorCodes.h @@ -1,11 +1,12 @@ #pragma once -#include +#include #include #include #include -#include #include +#include +#include /** Allows to count number of simultaneously happening error codes. * See also Exception.cpp for incrementing part. @@ -19,6 +20,7 @@ namespace ErrorCodes /// ErrorCode identifier (index in array). using ErrorCode = int; using Value = size_t; + using FramePointers = std::vector; /// Get name of error_code by identifier. 
/// Returns statically allocated string. @@ -33,7 +35,7 @@ namespace ErrorCodes /// Message for the last error. std::string message; /// Stacktrace for the last error. - std::string stacktrace; + FramePointers trace; }; struct ErrorPair { @@ -46,7 +48,7 @@ namespace ErrorCodes { public: ErrorPair get(); - void increment(bool remote, const std::string & message, const std::string & stacktrace); + void increment(bool remote, const std::string & message, const FramePointers & trace); private: ErrorPair value; @@ -60,7 +62,7 @@ namespace ErrorCodes ErrorCode end(); /// Add value for specified error_code. - void increment(ErrorCode error_code, bool remote, const std::string & message, const std::string & stacktrace); + void increment(ErrorCode error_code, bool remote, const std::string & message, const FramePointers & trace); } } diff --git a/src/Common/Exception.cpp b/src/Common/Exception.cpp index 08afd0397f5..ff638af22ad 100644 --- a/src/Common/Exception.cpp +++ b/src/Common/Exception.cpp @@ -36,7 +36,7 @@ namespace ErrorCodes /// - Aborts the process if error code is LOGICAL_ERROR. /// - Increments error codes statistics. -void handle_error_code([[maybe_unused]] const std::string & msg, const std::string & stacktrace, int code, bool remote) +void handle_error_code([[maybe_unused]] const std::string & msg, int code, bool remote, const Exception::FramePointers & trace) { // In debug builds and builds with sanitizers, treat LOGICAL_ERROR as an assertion failure. // Log the message before we fail. @@ -47,20 +47,21 @@ void handle_error_code([[maybe_unused]] const std::string & msg, const std::stri abort(); } #endif - ErrorCodes::increment(code, remote, msg, stacktrace); + + ErrorCodes::increment(code, remote, msg, trace); } Exception::Exception(const std::string & msg, int code, bool remote_) : Poco::Exception(msg, code) , remote(remote_) { - handle_error_code(msg, getStackTraceString(), code, remote); + handle_error_code(msg, code, remote, getStackFramePointers()); } Exception::Exception(const std::string & msg, const Exception & nested, int code) : Poco::Exception(msg, nested, code) { - handle_error_code(msg, getStackTraceString(), code, remote); + handle_error_code(msg, code, remote, getStackFramePointers()); } Exception::Exception(CreateFromPocoTag, const Poco::Exception & exc) @@ -101,6 +102,31 @@ std::string Exception::getStackTraceString() const #endif } +Exception::FramePointers Exception::getStackFramePointers() const +{ + FramePointers trace; +#ifdef STD_EXCEPTION_HAS_STACK_TRACE + { + trace.resize(get_stack_trace_size()); + for (size_t i = 0; i < trace.size(); ++i) + { + trace[i] = get_stack_trace_frames()[i]; + } + } +#else + { + size_t stack_trace_size = trace.getSize(); + size_t stack_trace_offset = trace.getOffset(); + trace.resize(stack_trace_size - stack_trace_offset); + for (size_t i = stack_trace_offset; i < stack_trace_size; ++i) + { + trace[i] = trace.getFramePointers()[i]; + } + } +#endif + return trace; +} + void throwFromErrno(const std::string & s, int code, int the_errno) { diff --git a/src/Common/Exception.h b/src/Common/Exception.h index e487badafa5..79b4394948a 100644 --- a/src/Common/Exception.h +++ b/src/Common/Exception.h @@ -24,6 +24,8 @@ namespace DB class Exception : public Poco::Exception { public: + using FramePointers = std::vector; + Exception() = default; Exception(const std::string & msg, int code, bool remote_ = false); Exception(const std::string & msg, const Exception & nested, int code); @@ -66,6 +68,8 @@ public: bool isRemoteException() const { 
return remote; } std::string getStackTraceString() const; + /// Used for system.errors + FramePointers getStackFramePointers() const; private: #ifndef STD_EXCEPTION_HAS_STACK_TRACE diff --git a/src/Storages/System/StorageSystemErrors.cpp b/src/Storages/System/StorageSystemErrors.cpp index 5243cb11aa3..09d0aaddb3d 100644 --- a/src/Storages/System/StorageSystemErrors.cpp +++ b/src/Storages/System/StorageSystemErrors.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -16,7 +17,7 @@ NamesAndTypesList StorageSystemErrors::getNamesAndTypes() { "value", std::make_shared() }, { "last_error_time", std::make_shared() }, { "last_error_message", std::make_shared() }, - { "last_error_stacktrace", std::make_shared() }, + { "last_error_trace", std::make_shared(std::make_shared()) }, { "remote", std::make_shared() }, }; } @@ -34,7 +35,14 @@ void StorageSystemErrors::fillData(MutableColumns & res_columns, const Context & res_columns[col_num++]->insert(error.count); res_columns[col_num++]->insert(error.error_time_ms / 1000); res_columns[col_num++]->insert(error.message); - res_columns[col_num++]->insert(error.stacktrace); + { + Array trace_array; + trace_array.reserve(error.trace.size()); + for (size_t i = 0; i < error.trace.size(); ++i) + trace_array.emplace_back(reinterpret_cast(error.trace[i])); + + res_columns[col_num++]->insert(trace_array); + } res_columns[col_num++]->insert(remote); } }; From f164c2462ff3aae8e51fc85abff09037401ba474 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 24 Mar 2021 09:41:57 +0300 Subject: [PATCH 123/260] Update Exception.cpp Fix build --- src/Common/Exception.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Common/Exception.cpp b/src/Common/Exception.cpp index ff638af22ad..e8a98021588 100644 --- a/src/Common/Exception.cpp +++ b/src/Common/Exception.cpp @@ -104,27 +104,27 @@ std::string Exception::getStackTraceString() const Exception::FramePointers Exception::getStackFramePointers() const { - FramePointers trace; + FramePointers frame_pointers; #ifdef STD_EXCEPTION_HAS_STACK_TRACE { - trace.resize(get_stack_trace_size()); - for (size_t i = 0; i < trace.size(); ++i) + frame_pointers.resize(get_stack_trace_size()); + for (size_t i = 0; i < frame_pointers.size(); ++i) { - trace[i] = get_stack_trace_frames()[i]; + frame_pointers[i] = get_stack_trace_frames()[i]; } } #else { size_t stack_trace_size = trace.getSize(); size_t stack_trace_offset = trace.getOffset(); - trace.resize(stack_trace_size - stack_trace_offset); + frame_pointers.reserve(stack_trace_size - stack_trace_offset); for (size_t i = stack_trace_offset; i < stack_trace_size; ++i) { - trace[i] = trace.getFramePointers()[i]; + frame_pointers.push_back(trace.getFramePointers()[i]); } } #endif - return trace; + return frame_pointers; } From 9d8b21a04dbca5d27fdb504a9ad667de5b540acb Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 24 Mar 2021 11:12:37 +0300 Subject: [PATCH 124/260] Fix ephemeral node removal --- src/Coordination/NuKeeperStorage.cpp | 14 ++++++---- src/Coordination/tests/gtest_for_build.cpp | 31 ++++++++++++++++++++++ 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/src/Coordination/NuKeeperStorage.cpp b/src/Coordination/NuKeeperStorage.cpp index 2440d6f6613..c1a8ebdfb44 100644 --- a/src/Coordination/NuKeeperStorage.cpp +++ b/src/Coordination/NuKeeperStorage.cpp @@ -233,7 +233,7 @@ struct NuKeeperStorageGetRequest final : public NuKeeperStorageRequest struct NuKeeperStorageRemoveRequest final : public 
NuKeeperStorageRequest { using NuKeeperStorageRequest::NuKeeperStorageRequest; - std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & ephemerals, int64_t /*zxid*/, int64_t session_id) const override + std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & ephemerals, int64_t /*zxid*/, int64_t /*session_id*/) const override { Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse(); Coordination::ZooKeeperRemoveResponse & response = dynamic_cast(*response_ptr); @@ -257,7 +257,12 @@ struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest { auto prev_node = it->value; if (prev_node.stat.ephemeralOwner != 0) - ephemerals[session_id].erase(request.path); + { + auto ephemerals_it = ephemerals.find(prev_node.stat.ephemeralOwner); + ephemerals_it->second.erase(request.path); + if (ephemerals_it->second.empty()) + ephemerals.erase(ephemerals_it); + } auto child_basename = getBaseName(it->key); container.updateValue(parentPath(request.path), [&child_basename] (NuKeeperStorage::Node & parent) @@ -271,10 +276,10 @@ struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest container.erase(request.path); - undo = [prev_node, &container, &ephemerals, session_id, path = request.path, child_basename] + undo = [prev_node, &container, &ephemerals, path = request.path, child_basename] { if (prev_node.stat.ephemeralOwner != 0) - ephemerals[session_id].emplace(path); + ephemerals[prev_node.stat.ephemeralOwner].emplace(path); container.insert(path, prev_node); container.updateValue(parentPath(path), [&child_basename] (NuKeeperStorage::Node & parent) @@ -377,7 +382,6 @@ struct NuKeeperStorageSetRequest final : public NuKeeperStorageRequest { return processWatchesImpl(zk_request->getPath(), watches, list_watches, Coordination::Event::CHANGED); } - }; struct NuKeeperStorageListRequest final : public NuKeeperStorageRequest diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index d90b711498e..cc3dcc04e53 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -1232,6 +1232,37 @@ TEST(CoordinationTest, TestStateMachineAndLogStore) } } +TEST(CoordinationTest, TestEphemeralNodeRemove) +{ + using namespace Coordination; + using namespace DB; + + ChangelogDirTest snapshots("./snapshots"); + CoordinationSettingsPtr settings = std::make_shared(); + + ResponsesQueue queue; + SnapshotsQueue snapshots_queue{1}; + auto state_machine = std::make_shared(queue, snapshots_queue, "./snapshots", settings); + state_machine->init(); + + std::shared_ptr request_c = std::make_shared(); + request_c->path = "/hello"; + request_c->is_ephemeral = true; + auto entry_c = getLogEntryFromZKRequest(0, 1, request_c); + state_machine->commit(1, entry_c->get_buf()); + const auto & storage = state_machine->getStorage(); + + EXPECT_EQ(storage.ephemerals.size(), 1); + std::shared_ptr request_d = std::make_shared(); + request_d->path = "/hello"; + /// Delete from other session + auto entry_d = getLogEntryFromZKRequest(0, 2, request_d); + state_machine->commit(2, entry_d->get_buf()); + + EXPECT_EQ(storage.ephemerals.size(), 0); +} + + int main(int argc, char ** argv) { Poco::AutoPtr channel(new Poco::ConsoleChannel(std::cerr)); From 487d6bdcd91d8c1267650d2e8bc35b6077a1b071 Mon Sep 17 00:00:00 2001 From: fuqi Date: Wed, 24 Mar 2021 16:35:20 +0800 Subject: [PATCH 125/260] add test case fix order key check --- 
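Note (placed after the `---`, not part of the commit message): with FINAL, the WHERE-to-PREWHERE optimization is now gated on the sorting key rather than the primary key. A minimal sketch of the intended behaviour, assuming a table like the one in the added test (table and column names are illustrative):

```sql
-- Hypothetical table mirroring the added test case.
CREATE TABLE t (x Int, y Int, z Int) ENGINE = ReplacingMergeTree() ORDER BY (x, y);

-- y is part of the sorting key, so its condition may be moved to PREWHERE even with FINAL:
EXPLAIN SYNTAX SELECT * FROM t FINAL WHERE y > 100;

-- z is not in the sorting key, so with FINAL its condition stays in WHERE:
EXPLAIN SYNTAX SELECT * FROM t FINAL WHERE z > 400;
```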
.../MergeTree/MergeTreeWhereOptimizer.cpp | 9 ++--- .../MergeTree/MergeTreeWhereOptimizer.h | 4 +-- ...der_key_to_prewhere_select_final.reference | 35 +++++++++++++++++++ ...37_move_order_to_prewhere_select_final.sql | 15 ++++++++ 4 files changed, 57 insertions(+), 6 deletions(-) create mode 100644 tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference create mode 100644 tests/queries/0_stateless/01737_move_order_to_prewhere_select_final.sql diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index 98e40bf394d..692d2ac4b94 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -37,7 +37,8 @@ MergeTreeWhereOptimizer::MergeTreeWhereOptimizer( : table_columns{ext::map( metadata_snapshot->getColumns().getAllPhysical(), [](const NameAndTypePair & col) { return col.name; })} , queried_columns{queried_columns_} - , primary_key_columns{metadata_snapshot->getPrimaryKey().column_names} + , sorting_key_names{NameSet( + metadata_snapshot->getSortingKey().column_names.begin(), metadata_snapshot->getSortingKey().column_names.end())} , block_with_constants{KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)} , log{log_} , column_sizes{std::move(column_sizes_)} @@ -301,9 +302,9 @@ bool MergeTreeWhereOptimizer::isPrimaryKeyAtom(const ASTPtr & ast) const } -bool MergeTreeWhereOptimizer::isPrimaryKey(const String & column_name) const +bool MergeTreeWhereOptimizer::isSortingKey(const String & column_name) const { - return std::find(primary_key_columns.begin(), primary_key_columns.end(), column_name) != primary_key_columns.end(); + return sorting_key_names.count(column_name); } @@ -344,7 +345,7 @@ bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr, bool is_final) c /// disallow moving result of ARRAY JOIN to PREWHERE if (array_joined_names.count(*opt_name) || array_joined_names.count(Nested::extractTableName(*opt_name)) || - (is_final && !isPrimaryKey(*opt_name))) + (is_final && !isSortingKey(*opt_name))) return true; } diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h index 85d1df583fa..8fd973e9ba3 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h @@ -85,7 +85,7 @@ private: bool isPrimaryKeyAtom(const ASTPtr & ast) const; - bool isPrimaryKey(const String & column_name) const; + bool isSortingKey(const String & column_name) const; bool isConstant(const ASTPtr & expr) const; @@ -106,7 +106,7 @@ private: String first_primary_key_column; const StringSet table_columns; const Names queried_columns; - const Names primary_key_columns; + const NameSet sorting_key_names; const Block block_with_constants; Poco::Logger * log; std::unordered_map column_sizes; diff --git a/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference b/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference new file mode 100644 index 00000000000..bde1e20ab10 --- /dev/null +++ b/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference @@ -0,0 +1,35 @@ +SELECT + x, + y, + z +FROM prewhere_move_select_final +PREWHERE y > 100 +SELECT + x, + y, + z +FROM prewhere_move_select_final +FINAL +PREWHERE y > 100 +SELECT + x, + y, + z +FROM prewhere_move_select_final +FINAL +WHERE z > 400 +SELECT + x, + y, + z +FROM 
prewhere_move_select_final +FINAL +WHERE z > 400 +SELECT + x, + y, + z +FROM prewhere_move_select_final +FINAL +PREWHERE y > 100 +WHERE (y > 100) AND (z > 400) \ No newline at end of file diff --git a/tests/queries/0_stateless/01737_move_order_to_prewhere_select_final.sql b/tests/queries/0_stateless/01737_move_order_to_prewhere_select_final.sql new file mode 100644 index 00000000000..a3a882c461a --- /dev/null +++ b/tests/queries/0_stateless/01737_move_order_to_prewhere_select_final.sql @@ -0,0 +1,15 @@ +DROP TABLE IF EXISTS prewhere_move_select_final; +CREATE TABLE prewhere_move_select_final (x Int, y Int, z Int) ENGINE = ReplacingMergeTree() ORDER BY (x, y); +INSERT INTO prewhere_move_select_final SELECT number, number * 2, number * 3 FROM numbers(1000); + +-- order key can be pushed down with final +EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final WHERE y > 100; +EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE y > 100; + +-- can not be pushed down +EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE z > 400; + +-- only y can be pushed down +EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE y > 100 and z > 400; + +DROP TABLE prewhere_move_select_final; From fb3af77098bf646e456fce5d1639dfc194623bb3 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 23 Mar 2021 21:01:40 +0300 Subject: [PATCH 126/260] Add test. --- .../0_stateless/00966_invalid_json_must_not_parse.reference | 4 ++++ .../queries/0_stateless/00966_invalid_json_must_not_parse.sql | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/tests/queries/0_stateless/00966_invalid_json_must_not_parse.reference b/tests/queries/0_stateless/00966_invalid_json_must_not_parse.reference index f7eb44d66e0..4521d575ff3 100644 --- a/tests/queries/0_stateless/00966_invalid_json_must_not_parse.reference +++ b/tests/queries/0_stateless/00966_invalid_json_must_not_parse.reference @@ -4,3 +4,7 @@ 0 0 0 +0 +0 +0 +0 diff --git a/tests/queries/0_stateless/00966_invalid_json_must_not_parse.sql b/tests/queries/0_stateless/00966_invalid_json_must_not_parse.sql index afcbc78cfd5..0e7fa55dbae 100644 --- a/tests/queries/0_stateless/00966_invalid_json_must_not_parse.sql +++ b/tests/queries/0_stateless/00966_invalid_json_must_not_parse.sql @@ -3,6 +3,8 @@ SET allow_simdjson=1; SELECT JSONLength('"HX-='); SELECT JSONLength('[9]\0\x42\xD3\x36\xE3'); SELECT JSONLength(unhex('5B30000E06D7AA5D')); +SELECT JSONLength('{"success"test:"123"}'); +SELECT isValidJSON('{"success"test:"123"}'); SET allow_simdjson=0; @@ -10,3 +12,5 @@ SET allow_simdjson=0; SELECT JSONLength('"HX-='); SELECT JSONLength('[9]\0\x42\xD3\x36\xE3'); SELECT JSONLength(unhex('5B30000E06D7AA5D')); +SELECT JSONLength('{"success"test:"123"}'); +SELECT isValidJSON('{"success"test:"123"}'); From c89861a2ac5a464aeb8acc8fbb23c4e44c196704 Mon Sep 17 00:00:00 2001 From: Yuriy Chernyshov Date: Wed, 24 Mar 2021 13:11:52 +0300 Subject: [PATCH 127/260] Better --- src/Common/ya.make | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/ya.make b/src/Common/ya.make index debad6c5de2..d1ff04f8f0a 100644 --- a/src/Common/ya.make +++ b/src/Common/ya.make @@ -14,7 +14,7 @@ PEERDIR( clickhouse/base/common clickhouse/base/pcg-random clickhouse/base/widechar_width - contrib/libs/libcpuid/libcpuid + contrib/libs/libcpuid contrib/libs/openssl contrib/libs/poco/NetSSL_OpenSSL contrib/libs/re2 From e2760101c109e33d9403a0457d77b0de076771f4 Mon Sep 17 00:00:00 2001 From: Yuriy Chernyshov Date: Wed, 24 Mar 2021 13:13:04 +0300 Subject: 
[PATCH 128/260] Update ya.make.in --- src/Common/ya.make.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/ya.make.in b/src/Common/ya.make.in index 210ecab6ef6..420384bb4a9 100644 --- a/src/Common/ya.make.in +++ b/src/Common/ya.make.in @@ -13,7 +13,7 @@ PEERDIR( clickhouse/base/common clickhouse/base/pcg-random clickhouse/base/widechar_width - contrib/libs/libcpuid/libcpuid + contrib/libs/libcpuid contrib/libs/openssl contrib/libs/poco/NetSSL_OpenSSL contrib/libs/re2 From d76edc33d57a6d86d919aa36824a0a7a034cc919 Mon Sep 17 00:00:00 2001 From: fuqi Date: Wed, 24 Mar 2021 18:34:20 +0800 Subject: [PATCH 129/260] rename test case name --- ...inal.sql => 01737_move_order_key_to_prewhere_select_final.sql} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/queries/0_stateless/{01737_move_order_to_prewhere_select_final.sql => 01737_move_order_key_to_prewhere_select_final.sql} (100%) diff --git a/tests/queries/0_stateless/01737_move_order_to_prewhere_select_final.sql b/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.sql similarity index 100% rename from tests/queries/0_stateless/01737_move_order_to_prewhere_select_final.sql rename to tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.sql From 57c6ebc844e53f18a634c0c647e3969b20458edd Mon Sep 17 00:00:00 2001 From: fuqi Date: Wed, 24 Mar 2021 19:37:47 +0800 Subject: [PATCH 130/260] fix test case --- ...737_move_order_key_to_prewhere_select_final.reference | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference b/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference index bde1e20ab10..95479cf37ba 100644 --- a/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference +++ b/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference @@ -24,12 +24,5 @@ SELECT z FROM prewhere_move_select_final FINAL -WHERE z > 400 -SELECT - x, - y, - z -FROM prewhere_move_select_final -FINAL PREWHERE y > 100 -WHERE (y > 100) AND (z > 400) \ No newline at end of file +WHERE (y > 100) AND (z > 400) From 02eee100a0554c69e85efb5d7cdd88253e7cc9f2 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Wed, 24 Mar 2021 15:36:39 +0300 Subject: [PATCH 131/260] formatting fixes --- src/Parsers/ASTSelectQuery.cpp | 2 +- src/Parsers/ASTWindowDefinition.cpp | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Parsers/ASTSelectQuery.cpp b/src/Parsers/ASTSelectQuery.cpp index 89ef08e0289..4715c7f201b 100644 --- a/src/Parsers/ASTSelectQuery.cpp +++ b/src/Parsers/ASTSelectQuery.cpp @@ -138,7 +138,7 @@ void ASTSelectQuery::formatImpl(const FormatSettings & s, FormatState & state, F { s.ostr << (s.hilite ? hilite_keyword : "") << s.nl_or_ws << indent_str << "WINDOW" << (s.hilite ? 
hilite_none : ""); - window()->formatImpl(s, state, frame); + window()->as().formatImplMultiline(s, state, frame); } if (orderBy()) diff --git a/src/Parsers/ASTWindowDefinition.cpp b/src/Parsers/ASTWindowDefinition.cpp index a645960bd0a..35374df6177 100644 --- a/src/Parsers/ASTWindowDefinition.cpp +++ b/src/Parsers/ASTWindowDefinition.cpp @@ -35,9 +35,11 @@ String ASTWindowDefinition::getID(char) const void ASTWindowDefinition::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked format_frame) const { + format_frame.expression_list_prepend_whitespace = false; + if (partition_by) { - settings.ostr << "PARTITION BY"; + settings.ostr << "PARTITION BY "; partition_by->formatImpl(settings, state, format_frame); } @@ -48,7 +50,7 @@ void ASTWindowDefinition::formatImpl(const FormatSettings & settings, if (order_by) { - settings.ostr << "ORDER BY"; + settings.ostr << "ORDER BY "; order_by->formatImpl(settings, state, format_frame); } From 725c4f254473c71d3863cd47652d0c936f875d69 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 24 Mar 2021 22:20:50 +0300 Subject: [PATCH 132/260] Update 01774_tuple_null_in.sql --- tests/queries/0_stateless/01774_tuple_null_in.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01774_tuple_null_in.sql b/tests/queries/0_stateless/01774_tuple_null_in.sql index c9dad49e8ed..a9cc39e8840 100644 --- a/tests/queries/0_stateless/01774_tuple_null_in.sql +++ b/tests/queries/0_stateless/01774_tuple_null_in.sql @@ -1,2 +1,2 @@ SELECT (NULL, NULL) = (8, 0) OR (NULL, NULL) = (3, 2) OR (NULL, NULL) = (0, 0) OR (NULL, NULL) = (3, 1); -SELECT (NULL, NULL) IN ((NULL, 0), (3, 1), (3, 2), (8, 0)); +SELECT (NULL, NULL) IN ((NULL, 0), (3, 1), (3, 2), (8, 0), (NULL, NULL)); From 6341b083fb2c6d4550d36b0217fa223f09b2ece1 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 24 Mar 2021 22:35:22 +0300 Subject: [PATCH 133/260] Add test to skip list --- docker/test/fasttest/run.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index 649f9f812e1..bbd5443ffb6 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -292,6 +292,7 @@ function run_tests 01318_decrypt # Depends on OpenSSL 01663_aes_msan # Depends on OpenSSL 01667_aes_args_check # Depends on OpenSSL + 01776_decrypt_aead_size_check # Depends on OpenSSL 01281_unsucceeded_insert_select_queries_counter 01292_create_user 01294_lazy_database_concurrent From b610afe7715f387260803ab41c57173ed20545f0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 24 Mar 2021 22:40:45 +0300 Subject: [PATCH 134/260] Another fix --- base/common/DateLUTImpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/base/common/DateLUTImpl.h b/base/common/DateLUTImpl.h index 1a44c670650..8d393465b82 100644 --- a/base/common/DateLUTImpl.h +++ b/base/common/DateLUTImpl.h @@ -1069,11 +1069,11 @@ public: } template - inline LUTIndex addMonthsIndex(DateOrTime v, Int64 delta) const + inline LUTIndex NO_SANITIZE_UNDEFINED addMonthsIndex(DateOrTime v, Int64 delta) const { const Values & values = lut[toLUTIndex(v)]; - Int64 month = values.month + static_cast(delta); /// Cast is to avoid UB in signed integer overflow. 
+ Int64 month = values.month + delta; if (month > 0) { From c325ed65e24d390e6d86d3a47d5134128be507c0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 24 Mar 2021 23:10:15 +0300 Subject: [PATCH 135/260] Fix UBSan report in mapPopulateSeries --- src/Functions/array/mapPopulateSeries.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/array/mapPopulateSeries.cpp b/src/Functions/array/mapPopulateSeries.cpp index 2050e0c28ab..c025117af69 100644 --- a/src/Functions/array/mapPopulateSeries.cpp +++ b/src/Functions/array/mapPopulateSeries.cpp @@ -190,7 +190,7 @@ private: } static constexpr size_t MAX_ARRAY_SIZE = 1ULL << 30; - if (static_cast(max_key - min_key) > MAX_ARRAY_SIZE) + if (static_cast(max_key) - static_cast(min_key) > MAX_ARRAY_SIZE) throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size in the result of function {}", getName()); /* fill the result arrays */ From 2d8e82f3d9f43aca0217a7d33e0902e330ac5695 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 24 Mar 2021 23:12:58 +0300 Subject: [PATCH 136/260] Add a test #22094 --- .../0_stateless/01777_map_populate_series_ubsan.reference | 0 tests/queries/0_stateless/01777_map_populate_series_ubsan.sql | 2 ++ 2 files changed, 2 insertions(+) create mode 100644 tests/queries/0_stateless/01777_map_populate_series_ubsan.reference create mode 100644 tests/queries/0_stateless/01777_map_populate_series_ubsan.sql diff --git a/tests/queries/0_stateless/01777_map_populate_series_ubsan.reference b/tests/queries/0_stateless/01777_map_populate_series_ubsan.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01777_map_populate_series_ubsan.sql b/tests/queries/0_stateless/01777_map_populate_series_ubsan.sql new file mode 100644 index 00000000000..5a8c182425a --- /dev/null +++ b/tests/queries/0_stateless/01777_map_populate_series_ubsan.sql @@ -0,0 +1,2 @@ +-- Should correctly throw exception about overflow: +SELECT mapPopulateSeries([-9223372036854775808, toUInt32(2)], [toUInt32(1023), -1]); -- { serverError 128 } From d1f72f81f5be0ea6460b24d28cbba881a6d9de0a Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Thu, 25 Mar 2021 00:21:08 +0300 Subject: [PATCH 137/260] Translate to Russian MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Перевел на русский язык. --- docs/en/interfaces/formats.md | 2 +- docs/ru/interfaces/formats.md | 56 ++++++++++++++++++++--------------- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 0d582fab12b..5987ba0f676 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -1308,7 +1308,7 @@ The table below shows supported data types and how they match ClickHouse [data t | `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` | | `-` | [Array](../sql-reference/data-types/array.md) | `LIST` | -ClickHouse supports configurable precision of the `Decimal` type. The `INSERT` or `SELECT` query treats the ORC `DECIMAL` type as the ClickHouse `Decimal128` type. +ClickHouse supports configurable precision of the `Decimal` type. The `INSERT` query treats the ORC `DECIMAL` type as the ClickHouse `Decimal128` type. Unsupported ORC data types: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. 
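A minimal illustration of the ORC `DECIMAL` handling described above (table and file names are hypothetical, not taken from the patch):

```sql
-- ClickHouse reads ORC DECIMAL values as Decimal128 and then casts them
-- to the type declared for the target column.
CREATE TABLE orc_decimals (id UInt32, price Decimal(18, 4)) ENGINE = MergeTree ORDER BY id;

-- Round trip from the shell, using the commands shown in the formats documentation:
--   clickhouse-client --query="SELECT * FROM orc_decimals FORMAT ORC" > decimals.orc
--   cat decimals.orc | clickhouse-client --query="INSERT INTO orc_decimals FORMAT ORC"
```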
diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 67cc80f5cd8..8ec26ec66f5 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -49,7 +49,7 @@ ClickHouse может принимать (`INSERT`) и отдавать (`SELECT | [Parquet](#data-format-parquet) | ✔ | ✔ | | [Arrow](#data-format-arrow) | ✔ | ✔ | | [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ | -| [ORC](#data-format-orc) | ✔ | ✗ | +| [ORC](#data-format-orc) | ✔ | ✔ | | [RowBinary](#rowbinary) | ✔ | ✔ | | [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ | | [Native](#native) | ✔ | ✔ | @@ -1203,45 +1203,53 @@ $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_ ## ORC {#data-format-orc} -[Apache ORC](https://orc.apache.org/) - это column-oriented формат данных, распространённый в экосистеме Hadoop. Вы можете только вставлять данные этого формата в ClickHouse. +[Apache ORC](https://orc.apache.org/) — это столбцовый формат данных, распространенный в экосистеме [Hadoop](https://hadoop.apache.org/). ### Соответствие типов данных {#sootvetstvie-tipov-dannykh-1} -Таблица показывает поддержанные типы данных и их соответствие [типам данных](../sql-reference/data-types/index.md) ClickHouse для запросов `INSERT`. +Таблица ниже содержит поддерживаемые типы данных и их соответствие [типам данных](../sql-reference/data-types/index.md) ClickHouse для запросов `INSERT` и `SELECT`. -| Тип данных ORC (`INSERT`) | Тип данных ClickHouse | -|---------------------------|-----------------------------------------------------| -| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | -| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | -| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | -| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | -| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | -| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | -| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | -| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | -| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | -| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | -| `DATE32` | [Date](../sql-reference/data-types/date.md) | -| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | -| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | -| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | +| Тип данных ORC (`INSERT`) | Тип данных ClickHouse | Тип данных ORC (`SELECT`) | +|---------------------------|-----------------------------------------------------|---------------------------| +| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | `UINT8` | +| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | `INT8` | +| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | `UINT16` | +| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | `INT16` | +| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | `UINT32` | +| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | `INT32` | +| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | `UINT64` | +| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | `INT64` | +| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` | +| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` | +| `DATE32` | [Date](../sql-reference/data-types/date.md) | `DATE32` | 
+| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `TIMESTAMP` | +| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` | +| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` | +| `-` | [Array](../sql-reference/data-types/array.md) | `LIST` | -ClickHouse поддерживает настраиваемую точность для формата `Decimal`. При обработке запроса `INSERT`, ClickHouse обрабатывает тип данных Parquet `DECIMAL` как `Decimal128`. +ClickHouse поддерживает настраиваемую точность для формата `Decimal`. При обработке запроса `INSERT`, ClickHouse обрабатывает тип данных ORC `DECIMAL` как `Decimal128`. -Неподдержанные типы данных ORC: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. +Неподдерживаемые типы данных ORC: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. -Типы данных столбцов в таблицах ClickHouse могут отличаться от типов данных для соответствующих полей ORC. При вставке данных, ClickHouse интерпретирует типы данных ORC согласно таблице соответствия, а затем [приводит](../sql-reference/functions/type-conversion-functions/#type_conversion_function-cast) данные к типу, установленному для столбца таблицы ClickHouse. +Типы данных столбцов в таблицах ClickHouse могут отличаться от типов данных для соответствующих полей ORC. При вставке данных ClickHouse интерпретирует типы данных ORC согласно таблице соответствия, а затем [приводит](../sql-reference/functions/type-conversion-functions/#type_conversion_function-cast) данные к типу, установленному для столбца таблицы ClickHouse. ### Вставка данных {#vstavka-dannykh-1} -Данные ORC можно вставить в таблицу ClickHouse командой: +Чтобы вставить в ClickHouse данные из файла в формате ORC, вы можете использовать команду следующего вида: ``` bash $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` -Для обмена данных с Hadoop можно использовать [движок таблиц HDFS](../engines/table-engines/integrations/hdfs.md). +### Выборка данных {#vyborka-dannykh-1} +Чтобы получить данные из таблицы ClickHouse и сохранить их в файл формата ORC, вы можете использовать команду следующего вида: + +``` bash +$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT ORC" > {filename.orc} +``` + +Для обмена данных с экосистемой Hadoop вы можете использовать [движок таблиц HDFS](../engines/table-engines/integrations/hdfs.md). 
## LineAsString {#lineasstring} From 8121c52c53b969bd023dc5687ce680efd4a06f82 Mon Sep 17 00:00:00 2001 From: filimonov <1549571+filimonov@users.noreply.github.com> Date: Wed, 24 Mar 2021 22:24:07 +0100 Subject: [PATCH 138/260] Update entrypoint.sh fix for #22100 --- docker/server/entrypoint.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docker/server/entrypoint.sh b/docker/server/entrypoint.sh index 0138a165505..1e665e0019c 100755 --- a/docker/server/entrypoint.sh +++ b/docker/server/entrypoint.sh @@ -46,9 +46,11 @@ DATA_DIR="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" -- TMP_DIR="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=tmp_path || true)" USER_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=user_files_path || true)" LOG_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=logger.log || true)" -LOG_DIR="$(dirname "$LOG_PATH" || true)" +LOG_DIR="" +if [ -n "$LOG_PATH" ]; then LOG_DIR="$(dirname "$LOG_PATH")"; fi ERROR_LOG_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=logger.errorlog || true)" -ERROR_LOG_DIR="$(dirname "$ERROR_LOG_PATH" || true)" +ERROR_LOG_DIR="" +if [ -n "$ERROR_LOG_PATH" ]; then LOG_DIR="$(dirname "$ERROR_LOG_PATH")"; fi FORMAT_SCHEMA_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=format_schema_path || true)" CLICKHOUSE_USER="${CLICKHOUSE_USER:-default}" From 4b6b1311ce630513139a53a05c4a7be1036b686b Mon Sep 17 00:00:00 2001 From: filimonov <1549571+filimonov@users.noreply.github.com> Date: Wed, 24 Mar 2021 22:33:08 +0100 Subject: [PATCH 139/260] Update entrypoint.sh --- docker/server/entrypoint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/server/entrypoint.sh b/docker/server/entrypoint.sh index 1e665e0019c..81e04bd7874 100755 --- a/docker/server/entrypoint.sh +++ b/docker/server/entrypoint.sh @@ -50,7 +50,7 @@ LOG_DIR="" if [ -n "$LOG_PATH" ]; then LOG_DIR="$(dirname "$LOG_PATH")"; fi ERROR_LOG_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=logger.errorlog || true)" ERROR_LOG_DIR="" -if [ -n "$ERROR_LOG_PATH" ]; then LOG_DIR="$(dirname "$ERROR_LOG_PATH")"; fi +if [ -n "$ERROR_LOG_PATH" ]; then ERROR_LOG_DIR="$(dirname "$ERROR_LOG_PATH")"; fi FORMAT_SCHEMA_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=format_schema_path || true)" CLICKHOUSE_USER="${CLICKHOUSE_USER:-default}" From 1bdf12b3f1f5aafbd3bf4f0113105b93b3373e47 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Thu, 25 Mar 2021 10:44:10 +0800 Subject: [PATCH 140/260] bump replxx --- contrib/replxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/replxx b/contrib/replxx index cdb6e3f2ce4..2b24f14594d 160000 --- a/contrib/replxx +++ b/contrib/replxx @@ -1 +1 @@ -Subproject commit cdb6e3f2ce4464225daf9c8beeae7db98d590bdc +Subproject commit 2b24f14594d7606792b92544bb112a6322ba34d7 From 8354cdd0e14aa62e9b47935edf7642f563efd09f Mon Sep 17 00:00:00 2001 From: benbiti Date: Tue, 2 Feb 2021 20:00:41 +0800 Subject: [PATCH 141/260] WIP update-aggregate-funcions-in-zh --- docs/zh/faq/terms_translation_zh.md | 11 ++ .../aggregate-functions/reference.md | 89 ---------- .../aggregate-functions/reference/any.md | 13 ++ .../aggregate-functions/reference/anyheavy.md | 30 ++++ .../aggregate-functions/reference/anylast.md | 9 + .../aggregate-functions/reference/argmax.md | 32 ++++ 
.../aggregate-functions/reference/argmin.md | 31 ++++ .../aggregate-functions/reference/avg.md | 62 +++++++ .../reference/avgweighted.md | 99 +++++++++++ .../reference/categoricalinformationvalue.md | 13 ++ .../aggregate-functions/reference/corr.md | 12 ++ .../aggregate-functions/reference/count.md | 69 ++++++++ .../aggregate-functions/reference/covarpop.md | 12 ++ .../reference/covarsamp.md | 12 ++ .../reference/grouparray.md | 14 ++ .../reference/grouparrayinsertat.md | 91 ++++++++++ .../reference/grouparraymovingavg.md | 78 +++++++++ .../reference/grouparraymovingsum.md | 76 ++++++++ .../reference/grouparraysample.md | 81 +++++++++ .../reference/groupbitand.md | 46 +++++ .../reference/groupbitmap.md | 44 +++++ .../reference/groupbitmapand.md | 46 +++++ .../reference/groupbitmapor.md | 46 +++++ .../reference/groupbitmapxor.md | 46 +++++ .../reference/groupbitor.md | 46 +++++ .../reference/groupbitxor.md | 46 +++++ .../reference/groupuniqarray.md | 12 ++ .../aggregate-functions/reference/index.md | 74 ++++++++ .../reference/initializeAggregation.md | 37 ++++ .../aggregate-functions/reference/kurtpop.md | 25 +++ .../aggregate-functions/reference/kurtsamp.md | 27 +++ .../aggregate-functions/reference/max.md | 7 + .../aggregate-functions/reference/maxmap.md | 28 +++ .../aggregate-functions/reference/median.md | 41 +++++ .../aggregate-functions/reference/min.md | 7 + .../aggregate-functions/reference/minmap.md | 28 +++ .../aggregate-functions/reference/quantile.md | 66 +++++++ .../reference/quantiledeterministic.md | 67 +++++++ .../reference/quantileexact.md | 164 ++++++++++++++++++ .../reference/quantileexactweighted.md | 67 +++++++ .../reference/quantiles.md | 9 + .../reference/quantiletdigest.md | 57 ++++++ .../reference/quantiletdigestweighted.md | 58 +++++++ .../reference/quantiletiming.md | 86 +++++++++ .../reference/quantiletimingweighted.md | 85 +++++++++ .../aggregate-functions/reference/rankCorr.md | 53 ++++++ .../reference/simplelinearregression.md | 42 +++++ .../aggregate-functions/reference/skewpop.md | 25 +++ .../aggregate-functions/reference/skewsamp.md | 27 +++ .../reference/stddevpop.md | 10 ++ .../reference/stddevsamp.md | 10 ++ .../reference/stochasticlinearregression.md | 75 ++++++++ .../reference/stochasticlogisticregression.md | 55 ++++++ .../aggregate-functions/reference/sum.md | 8 + .../aggregate-functions/reference/summap.md | 48 +++++ .../reference/sumwithoverflow.md | 9 + .../aggregate-functions/reference/topk.md | 42 +++++ .../reference/topkweighted.md | 42 +++++ .../aggregate-functions/reference/uniq.md | 40 +++++ .../reference/uniqcombined.md | 51 ++++++ .../reference/uniqcombined64.md | 7 + .../reference/uniqexact.md | 25 +++ .../reference/uniqhll12.md | 39 +++++ .../aggregate-functions/reference/varpop.md | 12 ++ .../aggregate-functions/reference/varsamp.md | 14 ++ 65 files changed, 2644 insertions(+), 89 deletions(-) create mode 100644 docs/zh/faq/terms_translation_zh.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/any.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/anyheavy.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/anylast.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/argmax.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/argmin.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/avg.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/avgweighted.md create mode 100644 
docs/zh/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/corr.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/count.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/covarpop.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/grouparray.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingavg.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingsum.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/grouparraysample.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/groupbitand.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/groupbitmap.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/groupbitmapand.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/groupbitmapor.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/groupbitmapxor.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/groupbitor.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/groupbitxor.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/groupuniqarray.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/index.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/initializeAggregation.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/kurtpop.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/kurtsamp.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/max.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/maxmap.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/median.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/min.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/minmap.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/quantile.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/quantiledeterministic.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/quantileexact.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/quantileexactweighted.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/quantiles.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/quantiletdigest.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/rankCorr.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/simplelinearregression.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/skewpop.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/skewsamp.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/stddevpop.md 
create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/stddevsamp.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/stochasticlinearregression.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/sum.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/summap.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/sumwithoverflow.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/topk.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/topkweighted.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/uniq.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/uniqcombined.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/uniqcombined64.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/uniqexact.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/uniqhll12.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/varpop.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/varsamp.md diff --git a/docs/zh/faq/terms_translation_zh.md b/docs/zh/faq/terms_translation_zh.md new file mode 100644 index 00000000000..c7eece6847a --- /dev/null +++ b/docs/zh/faq/terms_translation_zh.md @@ -0,0 +1,11 @@ +# 术语翻译约定 +本文档用来维护从英文翻译成中文的术语集。 + +## 保持英文,不译 +Parquet + +## 英文 <-> 中文 +Tuple 元组 + + + diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index 3a224886a00..6d19b404af0 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -69,49 +69,6 @@ SELECT count(DISTINCT num) FROM t └────────────────┘ ``` -这个例子表明 `count(DISTINCT num)` 由执行 `uniqExact` 根据功能 `count_distinct_implementation` 设定值。 - -## any(x) {#agg_function-any} - -选择第一个遇到的值。 -查询可以以任何顺序执行,甚至每次都以不同的顺序执行,因此此函数的结果是不确定的。 -要获得确定的结果,您可以使用 ‘min’ 或 ‘max’ 功能,而不是 ‘any’. - -在某些情况下,可以依靠执行的顺序。 这适用于SELECT来自使用ORDER BY的子查询的情况。 - -当一个 `SELECT` 查询具有 `GROUP BY` 子句或至少一个聚合函数,ClickHouse(相对于MySQL)要求在所有表达式 `SELECT`, `HAVING`,和 `ORDER BY` 子句可以从键或聚合函数计算。 换句话说,从表中选择的每个列必须在键或聚合函数内使用。 要获得像MySQL这样的行为,您可以将其他列放在 `any` 聚合函数。 - -## anyHeavy(x) {#anyheavyx} - -使用选择一个频繁出现的值 [重打者](http://www.cs.umd.edu/~samir/498/karp.pdf) 算法。 如果某个值在查询的每个执行线程中出现的情况超过一半,则返回此值。 通常情况下,结果是不确定的。 - -``` sql -anyHeavy(column) -``` - -**参数** - -- `column` – The column name. 
- -**示例** - -就拿 [时间](../../getting-started/example-datasets/ontime.md) 数据集,并选择在任何频繁出现的值 `AirlineID` 列。 - -``` sql -SELECT anyHeavy(AirlineID) AS res -FROM ontime -``` - -``` text -┌───res─┐ -│ 19690 │ -└───────┘ -``` - -## anyLast(x) {#anylastx} - -选择遇到的最后一个值。 -其结果是一样不确定的 `any` 功能。 ## groupBitAnd {#groupbitand} @@ -283,46 +240,6 @@ num 3 ``` -## min(x) {#agg_function-min} - -计算最小值。 - -## max(x) {#agg_function-max} - -计算最大值。 - -## argMin(arg,val) {#agg-function-argmin} - -计算 ‘arg’ 最小值的值 ‘val’ 价值。 如果有几个不同的值 ‘arg’ 对于最小值 ‘val’,遇到的第一个值是输出。 - -**示例:** - -``` text -┌─user─────┬─salary─┐ -│ director │ 5000 │ -│ manager │ 3000 │ -│ worker │ 1000 │ -└──────────┴────────┘ -``` - -``` sql -SELECT argMin(user, salary) FROM salary -``` - -``` text -┌─argMin(user, salary)─┐ -│ worker │ -└──────────────────────┘ -``` - -## argMax(arg,val) {#agg-function-argmax} - -计算 ‘arg’ 最大值 ‘val’ 价值。 如果有几个不同的值 ‘arg’ 对于最大值 ‘val’,遇到的第一个值是输出。 - -## sum(x) {#agg_function-sum} - -计算总和。 -只适用于数字。 ## sumWithOverflow(x) {#sumwithoverflowx} @@ -462,12 +379,6 @@ kurtSamp(expr) SELECT kurtSamp(value) FROM series_with_value_column ``` -## avg(x) {#agg_function-avg} - -计算平均值。 -只适用于数字。 -结果总是Float64。 - ## avgWeighted {#avgweighted} 计算 [加权算术平均值](https://en.wikipedia.org/wiki/Weighted_arithmetic_mean). diff --git a/docs/zh/sql-reference/aggregate-functions/reference/any.md b/docs/zh/sql-reference/aggregate-functions/reference/any.md new file mode 100644 index 00000000000..3df326ac84b --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/any.md @@ -0,0 +1,13 @@ +--- +toc_priority: 6 +--- + +# any(x) {#agg_function-any} + +选择第一个遇到的值。 +查询可以以任何顺序执行,甚至每次都以不同的顺序执行,因此此函数的结果是不确定的。 +要获得确定的结果,您可以使用 ‘min’ 或 ‘max’ 功能,而不是 ‘any’. + +在某些情况下,可以依靠执行的顺序。 这适用于SELECT来自使用ORDER BY的子查询的情况。 + +当一个 `SELECT` 查询具有 `GROUP BY` 子句或至少一个聚合函数,ClickHouse(相对于MySQL)要求在所有表达式 `SELECT`, `HAVING`,和 `ORDER BY` 子句可以从键或聚合函数计算。 换句话说,从表中选择的每个列必须在键或聚合函数内使用。 要获得像MySQL这样的行为,您可以将其他列放在 `any` 聚合函数。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/anyheavy.md b/docs/zh/sql-reference/aggregate-functions/reference/anyheavy.md new file mode 100644 index 00000000000..e01320e85b1 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/anyheavy.md @@ -0,0 +1,30 @@ +--- +toc_priority: 103 +--- + +# anyHeavy {#anyheavyx} + +选择一个频繁出现的值,使用[heavy hitters](http://www.cs.umd.edu/~samir/498/karp.pdf) 算法。 如果某个值在查询的每个执行线程中出现的情况超过一半,则返回此值。 通常情况下,结果是不确定的。 + +``` sql +anyHeavy(column) +``` + +**参数** + +- `column` – The column name. 
+ +**示例** + +使用 [OnTime](../../getting-started/example-datasets/ontime.md) 数据集,并选择在 `AirlineID` 列任何频繁出现的值。 + +``` sql +SELECT anyHeavy(AirlineID) AS res +FROM ontime +``` + +``` text +┌───res─┐ +│ 19690 │ +└───────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/anylast.md b/docs/zh/sql-reference/aggregate-functions/reference/anylast.md new file mode 100644 index 00000000000..e6792e0e449 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/anylast.md @@ -0,0 +1,9 @@ +--- +toc_priority: 104 +--- + +## anyLast {#anylastx} + +选择遇到的最后一个值。 +其结果和[any](../../../sql-reference/aggregate-functions/reference/any.md) 函数一样是不确定的 。 + \ No newline at end of file diff --git a/docs/zh/sql-reference/aggregate-functions/reference/argmax.md b/docs/zh/sql-reference/aggregate-functions/reference/argmax.md new file mode 100644 index 00000000000..1791ef8f88e --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/argmax.md @@ -0,0 +1,32 @@ +--- +toc_priority: 106 +--- + +# argMax {#agg-function-argmax} + +语法: `argMax(arg, val)` 或 `argMax(tuple(arg, val))` + +计算 `val` 最大值对应的 `arg` 值。 如果 `val` 最大值存在几个不同的 `arg` 值,输出遇到的第一个(`arg`)值。 + + +这个函数的Tuple版本将返回`val`最大值对应的tuple。本函数适合和`SimpleAggregateFunction`搭配使用。 + +**示例:** + +``` text +┌─user─────┬─salary─┐ +│ director │ 5000 │ +│ manager │ 3000 │ +│ worker │ 1000 │ +└──────────┴────────┘ +``` + +``` sql +SELECT argMax(user, salary), argMax(tuple(user, salary)) FROM salary +``` + +``` text +┌─argMax(user, salary)─┬─argMax(tuple(user, salary))─┐ +│ director │ ('director',5000) │ +└──────────────────────┴─────────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/argmin.md b/docs/zh/sql-reference/aggregate-functions/reference/argmin.md new file mode 100644 index 00000000000..a174fb16c59 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/argmin.md @@ -0,0 +1,31 @@ +--- +toc_priority: 105 +--- + +# argMin {#agg-function-argmin} + +语法: `argMin(arg, val)` 或 `argMin(tuple(arg, val))` + +计算 `val` 最小值对应的 `arg` 值。 如果 `val` 最小值存在几个不同的 `arg` 值,输出遇到的第一个(`arg`)值。 + +这个函数的Tuple版本将返回 `val` 最小值对应的tuple。本函数适合和`SimpleAggregateFunction`搭配使用。 + +**示例:** + +``` text +┌─user─────┬─salary─┐ +│ director │ 5000 │ +│ manager │ 3000 │ +│ worker │ 1000 │ +└──────────┴────────┘ +``` + +``` sql +SELECT argMin(user, salary), argMin(tuple(user, salary)) FROM salary +``` + +``` text +┌─argMin(user, salary)─┬─argMin(tuple(user, salary))─┐ +│ worker │ ('worker',1000) │ +└──────────────────────┴─────────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/avg.md b/docs/zh/sql-reference/aggregate-functions/reference/avg.md new file mode 100644 index 00000000000..ea4f351b55e --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/avg.md @@ -0,0 +1,62 @@ +--- +toc_priority: 5 +--- + +# avg {#agg_function-avg} + +计算算术平均值。 + +**语法** + +``` sql +avg(x) +``` + +**参数** + +- `x` — 列名 + +`x` 必须是 +[Integer](../../../sql-reference/data-types/int-uint.md), +[floating-point](../../../sql-reference/data-types/float.md), or +[Decimal](../../../sql-reference/data-types/decimal.md). + +**返回值** + +- `NaN`。 参数列为空时返回。 +- 算术平均值。 其他情况。 + +**返回类型** 总是 [Float64](../../../sql-reference/data-types/float.md). 
+ +**示例** + +查询: + +``` sql +SELECT avg(x) FROM values('x Int8', 0, 1, 2, 3, 4, 5) +``` + +结果: + +``` text +┌─avg(x)─┐ +│ 2.5 │ +└────────┘ +``` + +**示例** + +查询: + +``` sql +CREATE table test (t UInt8) ENGINE = Memory; +SELECT avg(t) FROM test +``` + +结果: + +``` text +┌─avg(x)─┐ +│ nan │ +└────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/avgweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/avgweighted.md new file mode 100644 index 00000000000..7b9c0de2755 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/avgweighted.md @@ -0,0 +1,99 @@ +--- +toc_priority: 107 +--- + +# avgWeighted {#avgweighted} + +Calculates the [weighted arithmetic mean](https://en.wikipedia.org/wiki/Weighted_arithmetic_mean). + +**Syntax** + +``` sql +avgWeighted(x, weight) +``` + +**Parameters** + +- `x` — Values. +- `weight` — Weights of the values. + +`x` and `weight` must both be +[Integer](../../../sql-reference/data-types/int-uint.md), +[floating-point](../../../sql-reference/data-types/float.md), or +[Decimal](../../../sql-reference/data-types/decimal.md), +but may have different types. + +**Returned value** + +- `NaN` if all the weights are equal to 0 or the supplied weights parameter is empty. +- Weighted mean otherwise. + +**Return type** is always [Float64](../../../sql-reference/data-types/float.md). + +**Example** + +Query: + +``` sql +SELECT avgWeighted(x, w) +FROM values('x Int8, w Int8', (4, 1), (1, 0), (10, 2)) +``` + +Result: + +``` text +┌─avgWeighted(x, weight)─┐ +│ 8 │ +└────────────────────────┘ +``` + +**Example** + +Query: + +``` sql +SELECT avgWeighted(x, w) +FROM values('x Int8, w Float64', (4, 1), (1, 0), (10, 2)) +``` + +Result: + +``` text +┌─avgWeighted(x, weight)─┐ +│ 8 │ +└────────────────────────┘ +``` + +**Example** + +Query: + +``` sql +SELECT avgWeighted(x, w) +FROM values('x Int8, w Int8', (0, 0), (1, 0), (10, 0)) +``` + +Result: + +``` text +┌─avgWeighted(x, weight)─┐ +│ nan │ +└────────────────────────┘ +``` + +**Example** + +Query: + +``` sql +CREATE table test (t UInt8) ENGINE = Memory; +SELECT avgWeighted(t) FROM test +``` + +Result: + +``` text +┌─avgWeighted(x, weight)─┐ +│ nan │ +└────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md b/docs/zh/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md new file mode 100644 index 00000000000..2e9001dec19 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md @@ -0,0 +1,13 @@ +--- +toc_priority: 250 +--- + +# categoricalInformationValue {#categoricalinformationvalue} + +Calculates the value of `(P(tag = 1) - P(tag = 0))(log(P(tag = 1)) - log(P(tag = 0)))` for each category. + +``` sql +categoricalInformationValue(category1, category2, ..., tag) +``` + +The result indicates how a discrete (categorical) feature `[category1, category2, ...]` contribute to a learning model which predicting the value of `tag`. diff --git a/docs/zh/sql-reference/aggregate-functions/reference/corr.md b/docs/zh/sql-reference/aggregate-functions/reference/corr.md new file mode 100644 index 00000000000..88f9295a8f2 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/corr.md @@ -0,0 +1,12 @@ +--- +toc_priority: 107 +--- + +# corr {#corrx-y} + +Syntax: `corr(x, y)` + +Calculates the Pearson correlation coefficient: `Σ((x - x̅)(y - y̅)) / sqrt(Σ((x - x̅)^2) * Σ((y - y̅)^2))`. + +!!! 
note "Note" + This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `corrStable` function. It works slower but provides a lower computational error. diff --git a/docs/zh/sql-reference/aggregate-functions/reference/count.md b/docs/zh/sql-reference/aggregate-functions/reference/count.md new file mode 100644 index 00000000000..e5d31429e12 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/count.md @@ -0,0 +1,69 @@ +--- +toc_priority: 1 +--- + +# count {#agg_function-count} + +Counts the number of rows or not-NULL values. + +ClickHouse supports the following syntaxes for `count`: +- `count(expr)` or `COUNT(DISTINCT expr)`. +- `count()` or `COUNT(*)`. The `count()` syntax is ClickHouse-specific. + +**Parameters** + +The function can take: + +- Zero parameters. +- One [expression](../../../sql-reference/syntax.md#syntax-expressions). + +**Returned value** + +- If the function is called without parameters it counts the number of rows. +- If the [expression](../../../sql-reference/syntax.md#syntax-expressions) is passed, then the function counts how many times this expression returned not null. If the expression returns a [Nullable](../../../sql-reference/data-types/nullable.md)-type value, then the result of `count` stays not `Nullable`. The function returns 0 if the expression returned `NULL` for all the rows. + +In both cases the type of the returned value is [UInt64](../../../sql-reference/data-types/int-uint.md). + +**Details** + +ClickHouse supports the `COUNT(DISTINCT ...)` syntax. The behavior of this construction depends on the [count_distinct_implementation](../../../operations/settings/settings.md#settings-count_distinct_implementation) setting. It defines which of the [uniq\*](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) functions is used to perform the operation. The default is the [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact) function. + +The `SELECT count() FROM table` query is not optimized, because the number of entries in the table is not stored separately. It chooses a small column from the table and counts the number of values in it. + +**Examples** + +Example 1: + +``` sql +SELECT count() FROM t +``` + +``` text +┌─count()─┐ +│ 5 │ +└─────────┘ +``` + +Example 2: + +``` sql +SELECT name, value FROM system.settings WHERE name = 'count_distinct_implementation' +``` + +``` text +┌─name──────────────────────────┬─value─────┐ +│ count_distinct_implementation │ uniqExact │ +└───────────────────────────────┴───────────┘ +``` + +``` sql +SELECT count(DISTINCT num) FROM t +``` + +``` text +┌─uniqExact(num)─┐ +│ 3 │ +└────────────────┘ +``` + +This example shows that `count(DISTINCT num)` is performed by the `uniqExact` function according to the `count_distinct_implementation` setting value. diff --git a/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md b/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md new file mode 100644 index 00000000000..2a7d805763e --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md @@ -0,0 +1,12 @@ +--- +toc_priority: 36 +--- + +# covarPop {#covarpop} + +Syntax: `covarPop(x, y)` + +Calculates the value of `Σ((x - x̅)(y - y̅)) / n`. + +!!! note "Note" + This function uses a numerically unstable algorithm. 
If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `covarPopStable` function. It works slower but provides a lower computational error. diff --git a/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md new file mode 100644 index 00000000000..4bdb1b02d40 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md @@ -0,0 +1,12 @@ +--- +toc_priority: 37 +--- + +# covarSamp {#covarsamp} + +Calculates the value of `Σ((x - x̅)(y - y̅)) / (n - 1)`. + +Returns Float64. When `n <= 1`, returns +∞. + +!!! note "Note" + This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `covarSampStable` function. It works slower but provides a lower computational error. diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparray.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparray.md new file mode 100644 index 00000000000..86b7b83022b --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparray.md @@ -0,0 +1,14 @@ +--- +toc_priority: 110 +--- + +# groupArray {#agg_function-grouparray} + +Syntax: `groupArray(x)` or `groupArray(max_size)(x)` + +Creates an array of argument values. +Values can be added to the array in any (indeterminate) order. + +The second version (with the `max_size` parameter) limits the size of the resulting array to `max_size` elements. For example, `groupArray(1)(x)` is equivalent to `[any (x)]`. + +In some cases, you can still rely on the order of execution. This applies to cases when `SELECT` comes from a subquery that uses `ORDER BY`. diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md new file mode 100644 index 00000000000..f4b8665a0a4 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md @@ -0,0 +1,91 @@ +--- +toc_priority: 112 +--- + +# groupArrayInsertAt {#grouparrayinsertat} + +Inserts a value into the array at the specified position. + +**Syntax** + +``` sql +groupArrayInsertAt(default_x, size)(x, pos); +``` + +If in one query several values are inserted into the same position, the function behaves in the following ways: + +- If a query is executed in a single thread, the first one of the inserted values is used. +- If a query is executed in multiple threads, the resulting value is an undetermined one of the inserted values. + +**Parameters** + +- `x` — Value to be inserted. [Expression](../../../sql-reference/syntax.md#syntax-expressions) resulting in one of the [supported data types](../../../sql-reference/data-types/index.md). +- `pos` — Position at which the specified element `x` is to be inserted. Index numbering in the array starts from zero. [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges). +- `default_x`— Default value for substituting in empty positions. Optional parameter. [Expression](../../../sql-reference/syntax.md#syntax-expressions) resulting in the data type configured for the `x` parameter. If `default_x` is not defined, the [default values](../../../sql-reference/statements/create/table.md#create-default-values) are used. +- `size`— Length of the resulting array. Optional parameter. When using this parameter, the default value `default_x` must be specified. 
[UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges). + +**Returned value** + +- Array with inserted values. + +Type: [Array](../../../sql-reference/data-types/array.md#data-type-array). + +**Example** + +Query: + +``` sql +SELECT groupArrayInsertAt(toString(number), number * 2) FROM numbers(5); +``` + +Result: + +``` text +┌─groupArrayInsertAt(toString(number), multiply(number, 2))─┐ +│ ['0','','1','','2','','3','','4'] │ +└───────────────────────────────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT groupArrayInsertAt('-')(toString(number), number * 2) FROM numbers(5); +``` + +Result: + +``` text +┌─groupArrayInsertAt('-')(toString(number), multiply(number, 2))─┐ +│ ['0','-','1','-','2','-','3','-','4'] │ +└────────────────────────────────────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT groupArrayInsertAt('-', 5)(toString(number), number * 2) FROM numbers(5); +``` + +Result: + +``` text +┌─groupArrayInsertAt('-', 5)(toString(number), multiply(number, 2))─┐ +│ ['0','-','1','-','2'] │ +└───────────────────────────────────────────────────────────────────┘ +``` + +Multi-threaded insertion of elements into one position. + +Query: + +``` sql +SELECT groupArrayInsertAt(number, 0) FROM numbers_mt(10) SETTINGS max_block_size = 1; +``` + +As a result of this query you get random integer in the `[0,9]` range. For example: + +``` text +┌─groupArrayInsertAt(number, 0)─┐ +│ [7] │ +└───────────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingavg.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingavg.md new file mode 100644 index 00000000000..1cd40c2002f --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingavg.md @@ -0,0 +1,78 @@ +--- +toc_priority: 114 +--- + +# groupArrayMovingAvg {#agg_function-grouparraymovingavg} + +Calculates the moving average of input values. + +``` sql +groupArrayMovingAvg(numbers_for_summing) +groupArrayMovingAvg(window_size)(numbers_for_summing) +``` + +The function can take the window size as a parameter. If left unspecified, the function takes the window size equal to the number of rows in the column. + +**Parameters** + +- `numbers_for_summing` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) resulting in a numeric data type value. +- `window_size` — Size of the calculation window. + +**Returned values** + +- Array of the same size and type as the input data. + +The function uses [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero). It truncates the decimal places insignificant for the resulting data type. 
+
+**Example**
+
+The sample table `t`:
+
+``` sql
+CREATE TABLE t
+(
+    `int` UInt8,
+    `float` Float32,
+    `dec` Decimal32(2)
+)
+ENGINE = TinyLog
+```
+
+``` text
+┌─int─┬─float─┬──dec─┐
+│   1 │   1.1 │ 1.10 │
+│   2 │   2.2 │ 2.20 │
+│   4 │   4.4 │ 4.40 │
+│   7 │  7.77 │ 7.77 │
+└─────┴───────┴──────┘
+```
+
+The queries:
+
+``` sql
+SELECT
+    groupArrayMovingAvg(int) AS I,
+    groupArrayMovingAvg(float) AS F,
+    groupArrayMovingAvg(dec) AS D
+FROM t
+```
+
+``` text
+┌─I─────────┬─F───────────────────────────────────┬─D─────────────────────┐
+│ [0,0,1,3] │ [0.275,0.82500005,1.9250001,3.8675] │ [0.27,0.82,1.92,3.86] │
+└───────────┴─────────────────────────────────────┴───────────────────────┘
+```
+
+``` sql
+SELECT
+    groupArrayMovingAvg(2)(int) AS I,
+    groupArrayMovingAvg(2)(float) AS F,
+    groupArrayMovingAvg(2)(dec) AS D
+FROM t
+```
+
+``` text
+┌─I─────────┬─F────────────────────────────────┬─D─────────────────────┐
+│ [0,1,3,5] │ [0.55,1.6500001,3.3000002,6.085] │ [0.55,1.65,3.30,6.08] │
+└───────────┴──────────────────────────────────┴───────────────────────┘
+```
diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingsum.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingsum.md
new file mode 100644
index 00000000000..ef979cd5f6a
--- /dev/null
+++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingsum.md
@@ -0,0 +1,76 @@
+---
+toc_priority: 113
+---
+
+# groupArrayMovingSum {#agg_function-grouparraymovingsum}
+
+Calculates the moving sum of input values.
+
+``` sql
+groupArrayMovingSum(numbers_for_summing)
+groupArrayMovingSum(window_size)(numbers_for_summing)
+```
+
+The function can take the window size as a parameter. If left unspecified, the function takes the window size equal to the number of rows in the column.
+
+**Parameters**
+
+- `numbers_for_summing` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) resulting in a numeric data type value.
+- `window_size` — Size of the calculation window.
+
+**Returned values**
+
+- Array of the same size and type as the input data.
+ +**Example** + +The sample table: + +``` sql +CREATE TABLE t +( + `int` UInt8, + `float` Float32, + `dec` Decimal32(2) +) +ENGINE = TinyLog +``` + +``` text +┌─int─┬─float─┬──dec─┐ +│ 1 │ 1.1 │ 1.10 │ +│ 2 │ 2.2 │ 2.20 │ +│ 4 │ 4.4 │ 4.40 │ +│ 7 │ 7.77 │ 7.77 │ +└─────┴───────┴──────┘ +``` + +The queries: + +``` sql +SELECT + groupArrayMovingSum(int) AS I, + groupArrayMovingSum(float) AS F, + groupArrayMovingSum(dec) AS D +FROM t +``` + +``` text +┌─I──────────┬─F───────────────────────────────┬─D──────────────────────┐ +│ [1,3,7,14] │ [1.1,3.3000002,7.7000003,15.47] │ [1.10,3.30,7.70,15.47] │ +└────────────┴─────────────────────────────────┴────────────────────────┘ +``` + +``` sql +SELECT + groupArrayMovingSum(2)(int) AS I, + groupArrayMovingSum(2)(float) AS F, + groupArrayMovingSum(2)(dec) AS D +FROM t +``` + +``` text +┌─I──────────┬─F───────────────────────────────┬─D──────────────────────┐ +│ [1,3,6,11] │ [1.1,3.3000002,6.6000004,12.17] │ [1.10,3.30,6.60,12.17] │ +└────────────┴─────────────────────────────────┴────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparraysample.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparraysample.md new file mode 100644 index 00000000000..36fa6a9d661 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparraysample.md @@ -0,0 +1,81 @@ +--- +toc_priority: 114 +--- + +# groupArraySample {#grouparraysample} + +Creates an array of sample argument values. The size of the resulting array is limited to `max_size` elements. Argument values are selected and added to the array randomly. + +**Syntax** + +``` sql +groupArraySample(max_size[, seed])(x) +``` + +**Parameters** + +- `max_size` — Maximum size of the resulting array. [UInt64](../../data-types/int-uint.md). +- `seed` — Seed for the random number generator. Optional. [UInt64](../../data-types/int-uint.md). Default value: `123456`. +- `x` — Argument (column name or expression). + +**Returned values** + +- Array of randomly selected `x` arguments. + +Type: [Array](../../data-types/array.md). + +**Examples** + +Consider table `colors`: + +``` text +┌─id─┬─color──┐ +│ 1 │ red │ +│ 2 │ blue │ +│ 3 │ green │ +│ 4 │ white │ +│ 5 │ orange │ +└────┴────────┘ +``` + +Query with column name as argument: + +``` sql +SELECT groupArraySample(3)(color) as newcolors FROM colors; +``` + +Result: + +```text +┌─newcolors──────────────────┐ +│ ['white','blue','green'] │ +└────────────────────────────┘ +``` + +Query with column name and different seed: + +``` sql +SELECT groupArraySample(3, 987654321)(color) as newcolors FROM colors; +``` + +Result: + +```text +┌─newcolors──────────────────┐ +│ ['red','orange','green'] │ +└────────────────────────────┘ +``` + +Query with expression as argument: + +``` sql +SELECT groupArraySample(3)(concat('light-', color)) as newcolors FROM colors; +``` + +Result: + +```text +┌─newcolors───────────────────────────────────┐ +│ ['light-blue','light-orange','light-green'] │ +└─────────────────────────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitand.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitand.md new file mode 100644 index 00000000000..9be73fd54ec --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitand.md @@ -0,0 +1,46 @@ +--- +toc_priority: 125 +--- + +# groupBitAnd {#groupbitand} + +Applies bitwise `AND` for series of numbers. 
+
+``` sql
+groupBitAnd(expr)
+```
+
+**Parameters**
+
+`expr` – An expression that results in `UInt*` type.
+
+**Return value**
+
+Value of the `UInt*` type.
+
+**Example**
+
+Test data:
+
+``` text
+binary     decimal
+00101100 = 44
+00011100 = 28
+00001101 = 13
+01010101 = 85
+```
+
+Query:
+
+``` sql
+SELECT groupBitAnd(num) FROM t
+```
+
+Where `num` is the column with the test data.
+
+Result:
+
+``` text
+binary     decimal
+00000100 = 4
+```
diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitmap.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmap.md
new file mode 100644
index 00000000000..9367652db38
--- /dev/null
+++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmap.md
@@ -0,0 +1,44 @@
+---
+toc_priority: 128
+---
+
+# groupBitmap {#groupbitmap}
+
+Bitmap or aggregate calculations from an unsigned integer column, returning cardinality of type UInt64; if the `-State` suffix is added, it returns a [bitmap object](../../../sql-reference/functions/bitmap-functions.md).
+
+``` sql
+groupBitmap(expr)
+```
+
+**Parameters**
+
+`expr` – An expression that results in `UInt*` type.
+
+**Return value**
+
+Value of the `UInt64` type.
+
+**Example**
+
+Test data:
+
+``` text
+UserID
+1
+1
+2
+3
+```
+
+Query:
+
+``` sql
+SELECT groupBitmap(UserID) as num FROM t
+```
+
+Result:
+
+``` text
+num
+3
+```
diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapand.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapand.md
new file mode 100644
index 00000000000..7c0c89040bb
--- /dev/null
+++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapand.md
@@ -0,0 +1,46 @@
+---
+toc_priority: 129
+---
+
+# groupBitmapAnd {#groupbitmapand}
+
+Calculates the AND of a bitmap column, returning cardinality of type UInt64; if the `-State` suffix is added, it returns a [bitmap object](../../../sql-reference/functions/bitmap-functions.md).
+
+``` sql
+groupBitmapAnd(expr)
+```
+
+**Parameters**
+
+`expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` type.
+
+**Return value**
+
+Value of the `UInt64` type.
+
+**Example**
+
+``` sql
+DROP TABLE IF EXISTS bitmap_column_expr_test2;
+CREATE TABLE bitmap_column_expr_test2
+(
+    tag_id String,
+    z AggregateFunction(groupBitmap, UInt32)
+)
+ENGINE = MergeTree
+ORDER BY tag_id;
+
+INSERT INTO bitmap_column_expr_test2 VALUES ('tag1', bitmapBuild(cast([1,2,3,4,5,6,7,8,9,10] as Array(UInt32))));
+INSERT INTO bitmap_column_expr_test2 VALUES ('tag2', bitmapBuild(cast([6,7,8,9,10,11,12,13,14,15] as Array(UInt32))));
+INSERT INTO bitmap_column_expr_test2 VALUES ('tag3', bitmapBuild(cast([2,4,6,8,10,12] as Array(UInt32))));
+
+SELECT groupBitmapAnd(z) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%');
+┌─groupBitmapAnd(z)─┐
+│                 3 │
+└───────────────────┘
+
+SELECT arraySort(bitmapToArray(groupBitmapAndState(z))) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%');
+┌─arraySort(bitmapToArray(groupBitmapAndState(z)))─┐
+│ [6,8,10]                                         │
+└──────────────────────────────────────────────────┘
+```
diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapor.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapor.md
new file mode 100644
index 00000000000..894c6c90aab
--- /dev/null
+++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapor.md
@@ -0,0 +1,46 @@
+---
+toc_priority: 130
+---
+
+# groupBitmapOr {#groupbitmapor}
+
+Calculates the OR of a bitmap column, returning cardinality of type UInt64; if the `-State` suffix is added, it returns a [bitmap object](../../../sql-reference/functions/bitmap-functions.md). This is equivalent to `groupBitmapMerge`.
+
+``` sql
+groupBitmapOr(expr)
+```
+
+**Parameters**
+
+`expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` type.
+
+**Return value**
+
+Value of the `UInt64` type.
+
+**Example**
+
+``` sql
+DROP TABLE IF EXISTS bitmap_column_expr_test2;
+CREATE TABLE bitmap_column_expr_test2
+(
+    tag_id String,
+    z AggregateFunction(groupBitmap, UInt32)
+)
+ENGINE = MergeTree
+ORDER BY tag_id;
+
+INSERT INTO bitmap_column_expr_test2 VALUES ('tag1', bitmapBuild(cast([1,2,3,4,5,6,7,8,9,10] as Array(UInt32))));
+INSERT INTO bitmap_column_expr_test2 VALUES ('tag2', bitmapBuild(cast([6,7,8,9,10,11,12,13,14,15] as Array(UInt32))));
+INSERT INTO bitmap_column_expr_test2 VALUES ('tag3', bitmapBuild(cast([2,4,6,8,10,12] as Array(UInt32))));
+
+SELECT groupBitmapOr(z) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%');
+┌─groupBitmapOr(z)─┐
+│               15 │
+└──────────────────┘
+
+SELECT arraySort(bitmapToArray(groupBitmapOrState(z))) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%');
+┌─arraySort(bitmapToArray(groupBitmapOrState(z)))─┐
+│ [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]           │
+└─────────────────────────────────────────────────┘
+```
diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapxor.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapxor.md
new file mode 100644
index 00000000000..5d0ec0fb097
--- /dev/null
+++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapxor.md
@@ -0,0 +1,46 @@
+---
+toc_priority: 131
+---
+
+# groupBitmapXor {#groupbitmapxor}
+
+Calculates the XOR of a bitmap column, returning cardinality of type UInt64; if the `-State` suffix is added, it returns a [bitmap object](../../../sql-reference/functions/bitmap-functions.md).
+
+``` sql
+groupBitmapXor(expr)
+```
+
+**Parameters**
+
+`expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` type.
+
+**Return value**
+
+Value of the `UInt64` type.
+ +**Example** + +``` sql +DROP TABLE IF EXISTS bitmap_column_expr_test2; +CREATE TABLE bitmap_column_expr_test2 +( + tag_id String, + z AggregateFunction(groupBitmap, UInt32) +) +ENGINE = MergeTree +ORDER BY tag_id; + +INSERT INTO bitmap_column_expr_test2 VALUES ('tag1', bitmapBuild(cast([1,2,3,4,5,6,7,8,9,10] as Array(UInt32)))); +INSERT INTO bitmap_column_expr_test2 VALUES ('tag2', bitmapBuild(cast([6,7,8,9,10,11,12,13,14,15] as Array(UInt32)))); +INSERT INTO bitmap_column_expr_test2 VALUES ('tag3', bitmapBuild(cast([2,4,6,8,10,12] as Array(UInt32)))); + +SELECT groupBitmapXor(z) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); +┌─groupBitmapXor(z)─┐ +│ 10 │ +└───────────────────┘ + +SELECT arraySort(bitmapToArray(groupBitmapXorState(z))) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); +┌─arraySort(bitmapToArray(groupBitmapXorState(z)))─┐ +│ [1,3,5,6,8,10,11,13,14,15] │ +└──────────────────────────────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitor.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitor.md new file mode 100644 index 00000000000..7383e620060 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitor.md @@ -0,0 +1,46 @@ +--- +toc_priority: 126 +--- + +# groupBitOr {#groupbitor} + +Applies bitwise `OR` for series of numbers. + +``` sql +groupBitOr(expr) +``` + +**Parameters** + +`expr` – An expression that results in `UInt*` type. + +**Return value** + +Value of the `UInt*` type. + +**Example** + +Test data: + +``` text +binary decimal +00101100 = 44 +00011100 = 28 +00001101 = 13 +01010101 = 85 +``` + +Query: + +``` sql +SELECT groupBitOr(num) FROM t +``` + +Where `num` is the column with the test data. + +Result: + +``` text +binary decimal +01111101 = 125 +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitxor.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitxor.md new file mode 100644 index 00000000000..01026012b91 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitxor.md @@ -0,0 +1,46 @@ +--- +toc_priority: 127 +--- + +# groupBitXor {#groupbitxor} + +Applies bitwise `XOR` for series of numbers. + +``` sql +groupBitXor(expr) +``` + +**Parameters** + +`expr` – An expression that results in `UInt*` type. + +**Return value** + +Value of the `UInt*` type. + +**Example** + +Test data: + +``` text +binary decimal +00101100 = 44 +00011100 = 28 +00001101 = 13 +01010101 = 85 +``` + +Query: + +``` sql +SELECT groupBitXor(num) FROM t +``` + +Where `num` is the column with the test data. + +Result: + +``` text +binary decimal +01101000 = 104 +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupuniqarray.md b/docs/zh/sql-reference/aggregate-functions/reference/groupuniqarray.md new file mode 100644 index 00000000000..537212e5b94 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupuniqarray.md @@ -0,0 +1,12 @@ +--- +toc_priority: 111 +--- + +# groupUniqArray {#groupuniqarray} + +Syntax: `groupUniqArray(x)` or `groupUniqArray(max_size)(x)` + +Creates an array from different argument values. Memory consumption is the same as for the [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md) function. + +The second version (with the `max_size` parameter) limits the size of the resulting array to `max_size` elements. +For example, `groupUniqArray(1)(x)` is equivalent to `[any(x)]`. 
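+
+**Example**
+
+As an illustration (assuming the standard `numbers` table function), collecting `number % 3` over ten rows should keep only the distinct remainders `0`, `1` and `2`, in an indeterminate order:
+
+``` sql
+-- the array contains each distinct remainder exactly once
+SELECT groupUniqArray(number % 3) AS unique_remainders FROM numbers(10);
+```
+
+One possible result (element order may differ between runs):
+
+``` text
+┌─unique_remainders─┐
+│ [0,1,2]           │
+└───────────────────┘
+```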
diff --git a/docs/zh/sql-reference/aggregate-functions/reference/index.md b/docs/zh/sql-reference/aggregate-functions/reference/index.md new file mode 100644 index 00000000000..b96fa887279 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/index.md @@ -0,0 +1,74 @@ +--- +toc_folder_title: Reference +toc_priority: 36 +toc_hidden: true +--- + +# List of Aggregate Functions {#aggregate-functions-reference} + +Standard aggregate functions: + +- [count](../../../sql-reference/aggregate-functions/reference/count.md) +- [min](../../../sql-reference/aggregate-functions/reference/min.md) +- [max](../../../sql-reference/aggregate-functions/reference/max.md) +- [sum](../../../sql-reference/aggregate-functions/reference/sum.md) +- [avg](../../../sql-reference/aggregate-functions/reference/avg.md) +- [any](../../../sql-reference/aggregate-functions/reference/any.md) +- [stddevPop](../../../sql-reference/aggregate-functions/reference/stddevpop.md) +- [stddevSamp](../../../sql-reference/aggregate-functions/reference/stddevsamp.md) +- [varPop](../../../sql-reference/aggregate-functions/reference/varpop.md) +- [varSamp](../../../sql-reference/aggregate-functions/reference/varsamp.md) +- [covarPop](../../../sql-reference/aggregate-functions/reference/covarpop.md) +- [covarSamp](../../../sql-reference/aggregate-functions/reference/covarsamp.md) + +ClickHouse-specific aggregate functions: + +- [anyHeavy](../../../sql-reference/aggregate-functions/reference/anyheavy.md) +- [anyLast](../../../sql-reference/aggregate-functions/reference/anylast.md) +- [argMin](../../../sql-reference/aggregate-functions/reference/argmin.md) +- [argMax](../../../sql-reference/aggregate-functions/reference/argmax.md) +- [avgWeighted](../../../sql-reference/aggregate-functions/reference/avgweighted.md) +- [topK](../../../sql-reference/aggregate-functions/reference/topk.md) +- [topKWeighted](../../../sql-reference/aggregate-functions/reference/topkweighted.md) +- [groupArray](../../../sql-reference/aggregate-functions/reference/grouparray.md) +- [groupUniqArray](../../../sql-reference/aggregate-functions/reference/groupuniqarray.md) +- [groupArrayInsertAt](../../../sql-reference/aggregate-functions/reference/grouparrayinsertat.md) +- [groupArrayMovingAvg](../../../sql-reference/aggregate-functions/reference/grouparraymovingavg.md) +- [groupArrayMovingSum](../../../sql-reference/aggregate-functions/reference/grouparraymovingsum.md) +- [groupBitAnd](../../../sql-reference/aggregate-functions/reference/groupbitand.md) +- [groupBitOr](../../../sql-reference/aggregate-functions/reference/groupbitor.md) +- [groupBitXor](../../../sql-reference/aggregate-functions/reference/groupbitxor.md) +- [groupBitmap](../../../sql-reference/aggregate-functions/reference/groupbitmap.md) +- [groupBitmapAnd](../../../sql-reference/aggregate-functions/reference/groupbitmapand.md) +- [groupBitmapOr](../../../sql-reference/aggregate-functions/reference/groupbitmapor.md) +- [groupBitmapXor](../../../sql-reference/aggregate-functions/reference/groupbitmapxor.md) +- [sumWithOverflow](../../../sql-reference/aggregate-functions/reference/sumwithoverflow.md) +- [sumMap](../../../sql-reference/aggregate-functions/reference/summap.md) +- [minMap](../../../sql-reference/aggregate-functions/reference/minmap.md) +- [maxMap](../../../sql-reference/aggregate-functions/reference/maxmap.md) +- [skewSamp](../../../sql-reference/aggregate-functions/reference/skewsamp.md) +- [skewPop](../../../sql-reference/aggregate-functions/reference/skewpop.md) +- 
[kurtSamp](../../../sql-reference/aggregate-functions/reference/kurtsamp.md)
+- [kurtPop](../../../sql-reference/aggregate-functions/reference/kurtpop.md)
+- [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md)
+- [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md)
+- [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md)
+- [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md)
+- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniqhll12.md)
+- [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md)
+- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md)
+- [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md)
+- [quantileExactLow](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexactlow)
+- [quantileExactHigh](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexacthigh)
+- [quantileExactWeighted](../../../sql-reference/aggregate-functions/reference/quantileexactweighted.md)
+- [quantileTiming](../../../sql-reference/aggregate-functions/reference/quantiletiming.md)
+- [quantileTimingWeighted](../../../sql-reference/aggregate-functions/reference/quantiletimingweighted.md)
+- [quantileDeterministic](../../../sql-reference/aggregate-functions/reference/quantiledeterministic.md)
+- [quantileTDigest](../../../sql-reference/aggregate-functions/reference/quantiletdigest.md)
+- [quantileTDigestWeighted](../../../sql-reference/aggregate-functions/reference/quantiletdigestweighted.md)
+- [simpleLinearRegression](../../../sql-reference/aggregate-functions/reference/simplelinearregression.md)
+- [stochasticLinearRegression](../../../sql-reference/aggregate-functions/reference/stochasticlinearregression.md)
+- [stochasticLogisticRegression](../../../sql-reference/aggregate-functions/reference/stochasticlogisticregression.md)
+- [categoricalInformationValue](../../../sql-reference/aggregate-functions/reference/categoricalinformationvalue.md)
+
+[Original article](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/)
diff --git a/docs/zh/sql-reference/aggregate-functions/reference/initializeAggregation.md b/docs/zh/sql-reference/aggregate-functions/reference/initializeAggregation.md
new file mode 100644
index 00000000000..ea44d5f1ddd
--- /dev/null
+++ b/docs/zh/sql-reference/aggregate-functions/reference/initializeAggregation.md
@@ -0,0 +1,37 @@
+---
+toc_priority: 150
+---
+
+# initializeAggregation {#initializeaggregation}
+
+Initializes aggregation for your input rows. It is intended for the functions with the suffix `State`.
+Use it for tests or to process columns of types `AggregateFunction` and `AggregatingMergeTree`.
+
+**Syntax**
+
+``` sql
+initializeAggregation(aggregate_function, column_1, column_2);
+```
+
+**Parameters**
+
+- `aggregate_function` — Name of the aggregation function whose state is to be created. [String](../../../sql-reference/data-types/string.md#string).
+- `column_n` — The column that is passed to the function as its argument. [String](../../../sql-reference/data-types/string.md#string).
+
+**Returned value(s)**
+
+Returns the result of the aggregation for your input rows. The return type is the same as the return type of the function that `initializeAggregation` takes as its first argument.
+For example for functions with the suffix `State` the return type will be `AggregateFunction`. + +**Example** + +Query: + +```sql +SELECT uniqMerge(state) FROM (SELECT initializeAggregation('uniqState', number % 3) AS state FROM system.numbers LIMIT 10000); +``` +Result: + +┌─uniqMerge(state)─┐ +│ 3 │ +└──────────────────┘ diff --git a/docs/zh/sql-reference/aggregate-functions/reference/kurtpop.md b/docs/zh/sql-reference/aggregate-functions/reference/kurtpop.md new file mode 100644 index 00000000000..65e7e31b9b4 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/kurtpop.md @@ -0,0 +1,25 @@ +--- +toc_priority: 153 +--- + +# kurtPop {#kurtpop} + +Computes the [kurtosis](https://en.wikipedia.org/wiki/Kurtosis) of a sequence. + +``` sql +kurtPop(expr) +``` + +**Parameters** + +`expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) returning a number. + +**Returned value** + +The kurtosis of the given distribution. Type — [Float64](../../../sql-reference/data-types/float.md) + +**Example** + +``` sql +SELECT kurtPop(value) FROM series_with_value_column +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/kurtsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/kurtsamp.md new file mode 100644 index 00000000000..224bbbdb9e7 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/kurtsamp.md @@ -0,0 +1,27 @@ +--- +toc_priority: 154 +--- + +# kurtSamp {#kurtsamp} + +Computes the [sample kurtosis](https://en.wikipedia.org/wiki/Kurtosis) of a sequence. + +It represents an unbiased estimate of the kurtosis of a random variable if passed values form its sample. + +``` sql +kurtSamp(expr) +``` + +**Parameters** + +`expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) returning a number. + +**Returned value** + +The kurtosis of the given distribution. Type — [Float64](../../../sql-reference/data-types/float.md). If `n <= 1` (`n` is a size of the sample), then the function returns `nan`. + +**Example** + +``` sql +SELECT kurtSamp(value) FROM series_with_value_column +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/max.md b/docs/zh/sql-reference/aggregate-functions/reference/max.md new file mode 100644 index 00000000000..8372d5c6f85 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/max.md @@ -0,0 +1,7 @@ +--- +toc_priority: 3 +--- + +# max {#agg_function-max} + +计算最大值。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/maxmap.md b/docs/zh/sql-reference/aggregate-functions/reference/maxmap.md new file mode 100644 index 00000000000..c62502cf46e --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/maxmap.md @@ -0,0 +1,28 @@ +--- +toc_priority: 143 +--- + +# maxMap {#agg_functions-maxmap} + +Syntax: `maxMap(key, value)` or `maxMap(Tuple(key, value))` + +Calculates the maximum from `value` array according to the keys specified in the `key` array. + +Passing a tuple of keys and value arrays is identical to passing two arrays of keys and values. + +The number of elements in `key` and `value` must be the same for each row that is totaled. + +Returns a tuple of two arrays: keys and values calculated for the corresponding keys. 
+ +Example: + +``` sql +SELECT maxMap(a, b) +FROM values('a Array(Int32), b Array(Int64)', ([1, 2], [2, 2]), ([2, 3], [1, 1])) +``` + +``` text +┌─maxMap(a, b)──────┐ +│ ([1,2,3],[2,2,1]) │ +└───────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/median.md b/docs/zh/sql-reference/aggregate-functions/reference/median.md new file mode 100644 index 00000000000..b4f38a9b562 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/median.md @@ -0,0 +1,41 @@ +# median {#median} + +The `median*` functions are the aliases for the corresponding `quantile*` functions. They calculate median of a numeric data sample. + +Functions: + +- `median` — Alias for [quantile](#quantile). +- `medianDeterministic` — Alias for [quantileDeterministic](#quantiledeterministic). +- `medianExact` — Alias for [quantileExact](#quantileexact). +- `medianExactWeighted` — Alias for [quantileExactWeighted](#quantileexactweighted). +- `medianTiming` — Alias for [quantileTiming](#quantiletiming). +- `medianTimingWeighted` — Alias for [quantileTimingWeighted](#quantiletimingweighted). +- `medianTDigest` — Alias for [quantileTDigest](#quantiletdigest). +- `medianTDigestWeighted` — Alias for [quantileTDigestWeighted](#quantiletdigestweighted). + +**Example** + +Input table: + +``` text +┌─val─┐ +│ 1 │ +│ 1 │ +│ 2 │ +│ 3 │ +└─────┘ +``` + +Query: + +``` sql +SELECT medianDeterministic(val, 1) FROM t +``` + +Result: + +``` text +┌─medianDeterministic(val, 1)─┐ +│ 1.5 │ +└─────────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/min.md b/docs/zh/sql-reference/aggregate-functions/reference/min.md new file mode 100644 index 00000000000..95a4099a1b7 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/min.md @@ -0,0 +1,7 @@ +--- +toc_priority: 2 +--- + +## min {#agg_function-min} + +计算最小值。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/minmap.md b/docs/zh/sql-reference/aggregate-functions/reference/minmap.md new file mode 100644 index 00000000000..9408d0ddfff --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/minmap.md @@ -0,0 +1,28 @@ +--- +toc_priority: 142 +--- + +# minMap {#agg_functions-minmap} + +Syntax: `minMap(key, value)` or `minMap(Tuple(key, value))` + +Calculates the minimum from `value` array according to the keys specified in the `key` array. + +Passing a tuple of keys and value ​​arrays is identical to passing two arrays of keys and values. + +The number of elements in `key` and `value` must be the same for each row that is totaled. + +Returns a tuple of two arrays: keys in sorted order, and values calculated for the corresponding keys. + +Example: + +``` sql +SELECT minMap(a, b) +FROM values('a Array(Int32), b Array(Int64)', ([1, 2], [2, 2]), ([2, 3], [1, 1])) +``` + +``` text +┌─minMap(a, b)──────┐ +│ ([1,2,3],[2,1,1]) │ +└───────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantile.md b/docs/zh/sql-reference/aggregate-functions/reference/quantile.md new file mode 100644 index 00000000000..77f858a1735 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantile.md @@ -0,0 +1,66 @@ +--- +toc_priority: 200 +--- + +# quantile {#quantile} + +Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. + +This function applies [reservoir sampling](https://en.wikipedia.org/wiki/Reservoir_sampling) with a reservoir size up to 8192 and a random number generator for sampling. 
The result is non-deterministic. To get an exact quantile, use the [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexact) function. + +When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. + +**Syntax** + +``` sql +quantile(level)(expr) +``` + +Alias: `median`. + +**Parameters** + +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). + +**Returned value** + +- Approximate quantile of the specified level. + +Type: + +- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. +- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. +- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. + +**Example** + +Input table: + +``` text +┌─val─┐ +│ 1 │ +│ 1 │ +│ 2 │ +│ 3 │ +└─────┘ +``` + +Query: + +``` sql +SELECT quantile(val) FROM t +``` + +Result: + +``` text +┌─quantile(val)─┐ +│ 1.5 │ +└───────────────┘ +``` + +**See Also** + +- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiledeterministic.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiledeterministic.md new file mode 100644 index 00000000000..6046447dd10 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiledeterministic.md @@ -0,0 +1,67 @@ +--- +toc_priority: 206 +--- + +# quantileDeterministic {#quantiledeterministic} + +Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. + +This function applies [reservoir sampling](https://en.wikipedia.org/wiki/Reservoir_sampling) with a reservoir size up to 8192 and deterministic algorithm of sampling. The result is deterministic. To get an exact quantile, use the [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexact) function. + +When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. + +**Syntax** + +``` sql +quantileDeterministic(level)(expr, determinator) +``` + +Alias: `medianDeterministic`. + +**Parameters** + +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). 
+- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). +- `determinator` — Number whose hash is used instead of a random number generator in the reservoir sampling algorithm to make the result of sampling deterministic. As a determinator you can use any deterministic positive number, for example, a user id or an event id. If the same determinator value occures too often, the function works incorrectly. + +**Returned value** + +- Approximate quantile of the specified level. + +Type: + +- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. +- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. +- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. + +**Example** + +Input table: + +``` text +┌─val─┐ +│ 1 │ +│ 1 │ +│ 2 │ +│ 3 │ +└─────┘ +``` + +Query: + +``` sql +SELECT quantileDeterministic(val, 1) FROM t +``` + +Result: + +``` text +┌─quantileDeterministic(val, 1)─┐ +│ 1.5 │ +└───────────────────────────────┘ +``` + +**See Also** + +- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantileexact.md b/docs/zh/sql-reference/aggregate-functions/reference/quantileexact.md new file mode 100644 index 00000000000..a39f724f368 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantileexact.md @@ -0,0 +1,164 @@ +--- +toc_priority: 202 +--- + +# quantileExact {#quantileexact} + +Exactly computes the [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. + +To get exact value, all the passed values ​​are combined into an array, which is then partially sorted. Therefore, the function consumes `O(n)` memory, where `n` is a number of values that were passed. However, for a small number of values, the function is very effective. + +When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. + +**Syntax** + +``` sql +quantileExact(level)(expr) +``` + +Alias: `medianExact`. + +**Parameters** + +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). + +**Returned value** + +- Quantile of the specified level. + +Type: + +- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. +- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. +- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. 
+ +**Example** + +Query: + +``` sql +SELECT quantileExact(number) FROM numbers(10) +``` + +Result: + +``` text +┌─quantileExact(number)─┐ +│ 5 │ +└───────────────────────┘ +``` + +# quantileExactLow {#quantileexactlow} + +Similar to `quantileExact`, this computes the exact [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. + +To get the exact value, all the passed values are combined into an array, which is then fully sorted. The sorting [algorithm's](https://en.cppreference.com/w/cpp/algorithm/sort) complexity is `O(N·log(N))`, where `N = std::distance(first, last)` comparisons. + +The return value depends on the quantile level and the number of elements in the selection, i.e. if the level is 0.5, then the function returns the lower median value for an even number of elements and the middle median value for an odd number of elements. Median is calculated similarly to the [median_low](https://docs.python.org/3/library/statistics.html#statistics.median_low) implementation which is used in python. + +For all other levels, the element at the index corresponding to the value of `level * size_of_array` is returned. For example: + +``` sql +SELECT quantileExactLow(0.1)(number) FROM numbers(10) + +┌─quantileExactLow(0.1)(number)─┐ +│ 1 │ +└───────────────────────────────┘ +``` + +When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. + +**Syntax** + +``` sql +quantileExact(level)(expr) +``` + +Alias: `medianExactLow`. + +**Parameters** + +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). + +**Returned value** + +- Quantile of the specified level. + +Type: + +- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. +- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. +- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. + +**Example** + +Query: + +``` sql +SELECT quantileExactLow(number) FROM numbers(10) +``` + +Result: + +``` text +┌─quantileExactLow(number)─┐ +│ 4 │ +└──────────────────────────┘ +``` +# quantileExactHigh {#quantileexacthigh} + +Similar to `quantileExact`, this computes the exact [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. + +All the passed values are combined into an array, which is then fully sorted, +to get the exact value. The sorting [algorithm's](https://en.cppreference.com/w/cpp/algorithm/sort) complexity is `O(N·log(N))`, where `N = std::distance(first, last)` comparisons. + +The return value depends on the quantile level and the number of elements in the selection, i.e. if the level is 0.5, then the function returns the higher median value for an even number of elements and the middle median value for an odd number of elements. 
Median is calculated similarly to the [median_high](https://docs.python.org/3/library/statistics.html#statistics.median_high) implementation which is used in python. For all other levels, the element at the index corresponding to the value of `level * size_of_array` is returned. + +This implementation behaves exactly similar to the current `quantileExact` implementation. + +When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. + +**Syntax** + +``` sql +quantileExactHigh(level)(expr) +``` + +Alias: `medianExactHigh`. + +**Parameters** + +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). + +**Returned value** + +- Quantile of the specified level. + +Type: + +- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. +- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. +- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. + +**Example** + +Query: + +``` sql +SELECT quantileExactHigh(number) FROM numbers(10) +``` + +Result: + +``` text +┌─quantileExactHigh(number)─┐ +│ 5 │ +└───────────────────────────┘ +``` +**See Also** + +- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantileexactweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/quantileexactweighted.md new file mode 100644 index 00000000000..3251f8298a6 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantileexactweighted.md @@ -0,0 +1,67 @@ +--- +toc_priority: 203 +--- + +# quantileExactWeighted {#quantileexactweighted} + +Exactly computes the [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence, taking into account the weight of each element. + +To get exact value, all the passed values ​​are combined into an array, which is then partially sorted. Each value is counted with its weight, as if it is present `weight` times. A hash table is used in the algorithm. Because of this, if the passed values ​​are frequently repeated, the function consumes less RAM than [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexact). You can use this function instead of `quantileExact` and specify the weight 1. + +When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. + +**Syntax** + +``` sql +quantileExactWeighted(level)(expr, weight) +``` + +Alias: `medianExactWeighted`. + +**Parameters** + +- `level` — Level of quantile. 
Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). +- `weight` — Column with weights of sequence members. Weight is a number of value occurrences. + +**Returned value** + +- Quantile of the specified level. + +Type: + +- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. +- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. +- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. + +**Example** + +Input table: + +``` text +┌─n─┬─val─┐ +│ 0 │ 3 │ +│ 1 │ 2 │ +│ 2 │ 1 │ +│ 5 │ 4 │ +└───┴─────┘ +``` + +Query: + +``` sql +SELECT quantileExactWeighted(n, val) FROM t +``` + +Result: + +``` text +┌─quantileExactWeighted(n, val)─┐ +│ 1 │ +└───────────────────────────────┘ +``` + +**See Also** + +- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md new file mode 100644 index 00000000000..abce6a9e7f0 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md @@ -0,0 +1,9 @@ +--- +toc_priority: 201 +--- + +# quantiles {#quantiles} + +Syntax: `quantiles(level1, level2, …)(x)` + +All the quantile functions also have corresponding quantiles functions: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`. These functions calculate all the quantiles of the listed levels in one pass, and return an array of the resulting values. diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigest.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigest.md new file mode 100644 index 00000000000..bda98ea338d --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigest.md @@ -0,0 +1,57 @@ +--- +toc_priority: 207 +--- + +# quantileTDigest {#quantiletdigest} + +Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence using the [t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) algorithm. + +The maximum error is 1%. Memory consumption is `log(n)`, where `n` is a number of values. The result depends on the order of running the query, and is nondeterministic. + +The performance of the function is lower than performance of [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile) or [quantileTiming](../../../sql-reference/aggregate-functions/reference/quantiletiming.md#quantiletiming). In terms of the ratio of State size to precision, this function is much better than `quantile`. + +When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). 
In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function.
+
+**Syntax**
+
+``` sql
+quantileTDigest(level)(expr)
+```
+
+Alias: `medianTDigest`.
+
+**Parameters**
+
+- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median).
+- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md).
+
+**Returned value**
+
+- Approximate quantile of the specified level.
+
+Type:
+
+- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input.
+- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type.
+- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type.
+
+**Example**
+
+Query:
+
+``` sql
+SELECT quantileTDigest(number) FROM numbers(10)
+```
+
+Result:
+
+``` text
+┌─quantileTDigest(number)─┐
+│                     4.5 │
+└─────────────────────────┘
+```
+
+**See Also**
+
+- [median](../../../sql-reference/aggregate-functions/reference/median.md#median)
+- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles)
diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md
new file mode 100644
index 00000000000..309cbe95e95
--- /dev/null
+++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md
@@ -0,0 +1,58 @@
+---
+toc_priority: 208
+---
+
+# quantileTDigestWeighted {#quantiletdigestweighted}
+
+Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence using the [t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) algorithm. The function takes into account the weight of each sequence member. The maximum error is 1%. Memory consumption is `log(n)`, where `n` is a number of values.
+
+The performance of the function is lower than performance of [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile) or [quantileTiming](../../../sql-reference/aggregate-functions/reference/quantiletiming.md#quantiletiming). In terms of the ratio of State size to precision, this function is much better than `quantile`.
+
+The result depends on the order of running the query, and is nondeterministic.
+
+When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function.
+
+**Syntax**
+
+``` sql
+quantileTDigestWeighted(level)(expr, weight)
+```
+
+Alias: `medianTDigestWeighted`.
+
+**Parameters**
+
+- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median).
+- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). +- `weight` — Column with weights of sequence elements. Weight is a number of value occurrences. + +**Returned value** + +- Approximate quantile of the specified level. + +Type: + +- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. +- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. +- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. + +**Example** + +Query: + +``` sql +SELECT quantileTDigestWeighted(number, 1) FROM numbers(10) +``` + +Result: + +``` text +┌─quantileTDigestWeighted(number, 1)─┐ +│ 4.5 │ +└────────────────────────────────────┘ +``` + +**See Also** + +- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md new file mode 100644 index 00000000000..867e8b87e74 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md @@ -0,0 +1,86 @@ +--- +toc_priority: 204 +--- + +# quantileTiming {#quantiletiming} + +With the determined precision computes the [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. + +The result is deterministic (it doesn’t depend on the query processing order). The function is optimized for working with sequences which describe distributions like loading web pages times or backend response times. + +When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. + +**Syntax** + +``` sql +quantileTiming(level)(expr) +``` + +Alias: `medianTiming`. + +**Parameters** + +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). + +- `expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) over a column values returning a [Float\*](../../../sql-reference/data-types/float.md)-type number. + + - If negative values are passed to the function, the behavior is undefined. + - If the value is greater than 30,000 (a page loading time of more than 30 seconds), it is assumed to be 30,000. + +**Accuracy** + +The calculation is accurate if: + +- Total number of values doesn’t exceed 5670. +- Total number of values exceeds 5670, but the page loading time is less than 1024ms. + +Otherwise, the result of the calculation is rounded to the nearest multiple of 16 ms. + +!!! note "Note" + For calculating page loading time quantiles, this function is more effective and accurate than [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile). + +**Returned value** + +- Quantile of the specified level. + +Type: `Float32`. + +!!! 
note "Note" + If no values are passed to the function (when using `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf) is returned. The purpose of this is to differentiate these cases from cases that result in zero. See [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) for notes on sorting `NaN` values. + +**Example** + +Input table: + +``` text +┌─response_time─┐ +│ 72 │ +│ 112 │ +│ 126 │ +│ 145 │ +│ 104 │ +│ 242 │ +│ 313 │ +│ 168 │ +│ 108 │ +└───────────────┘ +``` + +Query: + +``` sql +SELECT quantileTiming(response_time) FROM t +``` + +Result: + +``` text +┌─quantileTiming(response_time)─┐ +│ 126 │ +└───────────────────────────────┘ +``` + +**See Also** + +- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md new file mode 100644 index 00000000000..0f8606986c8 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md @@ -0,0 +1,85 @@ +--- +toc_priority: 205 +--- + +# quantileTimingWeighted {#quantiletimingweighted} + +With the determined precision computes the [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence according to the weight of each sequence member. + +The result is deterministic (it doesn’t depend on the query processing order). The function is optimized for working with sequences which describe distributions like loading web pages times or backend response times. + +When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. + +**Syntax** + +``` sql +quantileTimingWeighted(level)(expr, weight) +``` + +Alias: `medianTimingWeighted`. + +**Parameters** + +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). + +- `expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) over a column values returning a [Float\*](../../../sql-reference/data-types/float.md)-type number. + + - If negative values are passed to the function, the behavior is undefined. + - If the value is greater than 30,000 (a page loading time of more than 30 seconds), it is assumed to be 30,000. + +- `weight` — Column with weights of sequence elements. Weight is a number of value occurrences. + +**Accuracy** + +The calculation is accurate if: + +- Total number of values doesn’t exceed 5670. +- Total number of values exceeds 5670, but the page loading time is less than 1024ms. + +Otherwise, the result of the calculation is rounded to the nearest multiple of 16 ms. + +!!! note "Note" + For calculating page loading time quantiles, this function is more effective and accurate than [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile). + +**Returned value** + +- Quantile of the specified level. + +Type: `Float32`. + +!!! 
note "Note" + If no values are passed to the function (when using `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf) is returned. The purpose of this is to differentiate these cases from cases that result in zero. See [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) for notes on sorting `NaN` values. + +**Example** + +Input table: + +``` text +┌─response_time─┬─weight─┐ +│ 68 │ 1 │ +│ 104 │ 2 │ +│ 112 │ 3 │ +│ 126 │ 2 │ +│ 138 │ 1 │ +│ 162 │ 1 │ +└───────────────┴────────┘ +``` + +Query: + +``` sql +SELECT quantileTimingWeighted(response_time, weight) FROM t +``` + +Result: + +``` text +┌─quantileTimingWeighted(response_time, weight)─┐ +│ 112 │ +└───────────────────────────────────────────────┘ +``` + +**See Also** + +- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/rankCorr.md b/docs/zh/sql-reference/aggregate-functions/reference/rankCorr.md new file mode 100644 index 00000000000..dc23029f239 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/rankCorr.md @@ -0,0 +1,53 @@ +## rankCorr {#agg_function-rankcorr} + +Computes a rank correlation coefficient. + +**Syntax** + +``` sql +rankCorr(x, y) +``` + +**Parameters** + +- `x` — Arbitrary value. [Float32](../../../sql-reference/data-types/float.md#float32-float64) or [Float64](../../../sql-reference/data-types/float.md#float32-float64). +- `y` — Arbitrary value. [Float32](../../../sql-reference/data-types/float.md#float32-float64) or [Float64](../../../sql-reference/data-types/float.md#float32-float64). + +**Returned value(s)** + +- Returns a rank correlation coefficient of the ranks of x and y. The value of the correlation coefficient ranges from -1 to +1. If less than two arguments are passed, the function will return an exception. The value close to +1 denotes a high linear relationship, and with an increase of one random variable, the second random variable also increases. The value close to -1 denotes a high linear relationship, and with an increase of one random variable, the second random variable decreases. The value close or equal to 0 denotes no relationship between the two random variables. + +Type: [Float64](../../../sql-reference/data-types/float.md#float32-float64). + +**Example** + +Query: + +``` sql +SELECT rankCorr(number, number) FROM numbers(100); +``` + +Result: + +``` text +┌─rankCorr(number, number)─┐ +│ 1 │ +└──────────────────────────┘ +``` + +Query: + +``` sql +SELECT roundBankers(rankCorr(exp(number), sin(number)), 3) FROM numbers(100); +``` + +Result: + +``` text +┌─roundBankers(rankCorr(exp(number), sin(number)), 3)─┐ +│ -0.037 │ +└─────────────────────────────────────────────────────┘ +``` +**See Also** + +- [Spearman's rank correlation coefficient](https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient) \ No newline at end of file diff --git a/docs/zh/sql-reference/aggregate-functions/reference/simplelinearregression.md b/docs/zh/sql-reference/aggregate-functions/reference/simplelinearregression.md new file mode 100644 index 00000000000..fee71cdeb49 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/simplelinearregression.md @@ -0,0 +1,42 @@ +--- +toc_priority: 220 +--- + +# simpleLinearRegression {#simplelinearregression} + +Performs simple (unidimensional) linear regression. 
+ +``` sql +simpleLinearRegression(x, y) +``` + +Parameters: + +- `x` — Column with dependent variable values. +- `y` — Column with explanatory variable values. + +Returned values: + +Constants `(a, b)` of the resulting line `y = a*x + b`. + +**Examples** + +``` sql +SELECT arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [0, 1, 2, 3]) +``` + +``` text +┌─arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [0, 1, 2, 3])─┐ +│ (1,0) │ +└───────────────────────────────────────────────────────────────────┘ +``` + +``` sql +SELECT arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [3, 4, 5, 6]) +``` + +``` text +┌─arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [3, 4, 5, 6])─┐ +│ (1,3) │ +└───────────────────────────────────────────────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/skewpop.md b/docs/zh/sql-reference/aggregate-functions/reference/skewpop.md new file mode 100644 index 00000000000..d15a5ffdd47 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/skewpop.md @@ -0,0 +1,25 @@ +--- +toc_priority: 150 +--- + +# skewPop {#skewpop} + +Computes the [skewness](https://en.wikipedia.org/wiki/Skewness) of a sequence. + +``` sql +skewPop(expr) +``` + +**Parameters** + +`expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) returning a number. + +**Returned value** + +The skewness of the given distribution. Type — [Float64](../../../sql-reference/data-types/float.md) + +**Example** + +``` sql +SELECT skewPop(value) FROM series_with_value_column +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/skewsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/skewsamp.md new file mode 100644 index 00000000000..cb323f4b142 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/skewsamp.md @@ -0,0 +1,27 @@ +--- +toc_priority: 151 +--- + +# skewSamp {#skewsamp} + +Computes the [sample skewness](https://en.wikipedia.org/wiki/Skewness) of a sequence. + +It represents an unbiased estimate of the skewness of a random variable if passed values form its sample. + +``` sql +skewSamp(expr) +``` + +**Parameters** + +`expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) returning a number. + +**Returned value** + +The skewness of the given distribution. Type — [Float64](../../../sql-reference/data-types/float.md). If `n <= 1` (`n` is the size of the sample), then the function returns `nan`. + +**Example** + +``` sql +SELECT skewSamp(value) FROM series_with_value_column +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/stddevpop.md b/docs/zh/sql-reference/aggregate-functions/reference/stddevpop.md new file mode 100644 index 00000000000..58f8c27cd72 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/stddevpop.md @@ -0,0 +1,10 @@ +--- +toc_priority: 30 +--- + +# stddevPop {#stddevpop} + +The result is equal to the square root of [varPop](../../../sql-reference/aggregate-functions/reference/varpop.md). + +!!! note "Note" + This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `stddevPopStable` function. It works slower but provides a lower computational error. 
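+
+**Example**
+
+A minimal usage sketch; the table `series_with_value_column` with a numeric `value` column is only an assumption here, mirroring the `skewPop` example:
+
+``` sql
+-- `series_with_value_column` is a hypothetical table with a numeric `value` column
+SELECT stddevPop(value) FROM series_with_value_column
+```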
diff --git a/docs/zh/sql-reference/aggregate-functions/reference/stddevsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/stddevsamp.md new file mode 100644 index 00000000000..4ec72881ae5 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/stddevsamp.md @@ -0,0 +1,10 @@ +--- +toc_priority: 31 +--- + +# stddevSamp {#stddevsamp} + +The result is equal to the square root of [varSamp](../../../sql-reference/aggregate-functions/reference/varsamp.md). + +!!! note "Note" + This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `stddevSampStable` function. It works slower but provides a lower computational error. diff --git a/docs/zh/sql-reference/aggregate-functions/reference/stochasticlinearregression.md b/docs/zh/sql-reference/aggregate-functions/reference/stochasticlinearregression.md new file mode 100644 index 00000000000..7a37ed83e17 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/stochasticlinearregression.md @@ -0,0 +1,75 @@ +--- +toc_priority: 221 +--- + +# stochasticLinearRegression {#agg_functions-stochasticlinearregression} + +This function implements stochastic linear regression. It supports custom parameters for learning rate, L2 regularization coefficient, mini-batch size and has few methods for updating weights ([Adam](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam) (used by default), [simple SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent), [Momentum](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum), [Nesterov](https://mipt.ru/upload/medialibrary/d7e/41-91.pdf)). + +### Parameters {#agg_functions-stochasticlinearregression-parameters} + +There are 4 customizable parameters. They are passed to the function sequentially, but there is no need to pass all four - default values will be used, however good model required some parameter tuning. + +``` text +stochasticLinearRegression(1.0, 1.0, 10, 'SGD') +``` + +1. `learning rate` is the coefficient on step length, when gradient descent step is performed. Too big learning rate may cause infinite weights of the model. Default is `0.00001`. +2. `l2 regularization coefficient` which may help to prevent overfitting. Default is `0.1`. +3. `mini-batch size` sets the number of elements, which gradients will be computed and summed to perform one step of gradient descent. Pure stochastic descent uses one element, however having small batches(about 10 elements) make gradient steps more stable. Default is `15`. +4. `method for updating weights`, they are: `Adam` (by default), `SGD`, `Momentum`, `Nesterov`. `Momentum` and `Nesterov` require little bit more computations and memory, however they happen to be useful in terms of speed of convergance and stability of stochastic gradient methods. + +### Usage {#agg_functions-stochasticlinearregression-usage} + +`stochasticLinearRegression` is used in two steps: fitting the model and predicting on new data. In order to fit the model and save its state for later usage we use `-State` combinator, which basically saves the state (model weights, etc). +To predict we use function [evalMLMethod](../../../sql-reference/functions/machine-learning-functions.md#machine_learning_methods-evalmlmethod), which takes a state as an argument as well as features to predict on. + + + +**1.** Fitting + +Such query may be used. 
+ +``` sql +CREATE TABLE IF NOT EXISTS train_data +( + param1 Float64, + param2 Float64, + target Float64 +) ENGINE = Memory; + +CREATE TABLE your_model ENGINE = Memory AS SELECT +stochasticLinearRegressionState(0.1, 0.0, 5, 'SGD')(target, param1, param2) +AS state FROM train_data; +``` + +Here we also need to insert data into `train_data` table. The number of parameters is not fixed, it depends only on number of arguments, passed into `linearRegressionState`. They all must be numeric values. +Note that the column with target value(which we would like to learn to predict) is inserted as the first argument. + +**2.** Predicting + +After saving a state into the table, we may use it multiple times for prediction, or even merge with other states and create new even better models. + +``` sql +WITH (SELECT state FROM your_model) AS model SELECT +evalMLMethod(model, param1, param2) FROM test_data +``` + +The query will return a column of predicted values. Note that first argument of `evalMLMethod` is `AggregateFunctionState` object, next are columns of features. + +`test_data` is a table like `train_data` but may not contain target value. + +### Notes {#agg_functions-stochasticlinearregression-notes} + +1. To merge two models user may create such query: + `sql SELECT state1 + state2 FROM your_models` + where `your_models` table contains both models. This query will return new `AggregateFunctionState` object. + +2. User may fetch weights of the created model for its own purposes without saving the model if no `-State` combinator is used. + `sql SELECT stochasticLinearRegression(0.01)(target, param1, param2) FROM train_data` + Such query will fit the model and return its weights - first are weights, which correspond to the parameters of the model, the last one is bias. So in the example above the query will return a column with 3 values. + +**See Also** + +- [stochasticLogisticRegression](../../../sql-reference/aggregate-functions/reference/stochasticlogisticregression.md#agg_functions-stochasticlogisticregression) +- [Difference between linear and logistic regressions](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md b/docs/zh/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md new file mode 100644 index 00000000000..35d1e3899ac --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md @@ -0,0 +1,55 @@ +--- +toc_priority: 222 +--- + +# stochasticLogisticRegression {#agg_functions-stochasticlogisticregression} + +This function implements stochastic logistic regression. It can be used for binary classification problem, supports the same custom parameters as stochasticLinearRegression and works the same way. + +### Parameters {#agg_functions-stochasticlogisticregression-parameters} + +Parameters are exactly the same as in stochasticLinearRegression: +`learning rate`, `l2 regularization coefficient`, `mini-batch size`, `method for updating weights`. +For more information see [parameters](#agg_functions-stochasticlinearregression-parameters). + +``` text +stochasticLogisticRegression(1.0, 1.0, 10, 'SGD') +``` + +**1.** Fitting + + + + See the `Fitting` section in the [stochasticLinearRegression](#stochasticlinearregression-usage-fitting) description. + + Predicted labels have to be in \[-1, 1\]. 
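+
+    A minimal fitting sketch, by analogy with the fitting example in the `stochasticLinearRegression` description; the `train_data` table and its `param1`, `param2`, `target` columns are assumptions, and `target` must hold labels in \[-1, 1\]:
+
+    ``` sql
+    -- train_data(param1, param2, target) is assumed, as in the stochasticLinearRegression example
+    CREATE TABLE your_model ENGINE = Memory AS SELECT
+    stochasticLogisticRegressionState(0.1, 0.0, 5, 'SGD')(target, param1, param2)
+    AS state FROM train_data;
+    ```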
+ +**2.** Predicting + + + + Using saved state we can predict probability of object having label `1`. + + ``` sql + WITH (SELECT state FROM your_model) AS model SELECT + evalMLMethod(model, param1, param2) FROM test_data + ``` + + The query will return a column of probabilities. Note that first argument of `evalMLMethod` is `AggregateFunctionState` object, next are columns of features. + + We can also set a bound of probability, which assigns elements to different labels. + + ``` sql + SELECT ans < 1.1 AND ans > 0.5 FROM + (WITH (SELECT state FROM your_model) AS model SELECT + evalMLMethod(model, param1, param2) AS ans FROM test_data) + ``` + + Then the result will be labels. + + `test_data` is a table like `train_data` but may not contain target value. + +**See Also** + +- [stochasticLinearRegression](../../../sql-reference/aggregate-functions/reference/stochasticlinearregression.md#agg_functions-stochasticlinearregression) +- [Difference between linear and logistic regressions.](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/sum.md b/docs/zh/sql-reference/aggregate-functions/reference/sum.md new file mode 100644 index 00000000000..049c491d2a5 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/sum.md @@ -0,0 +1,8 @@ +--- +toc_priority: 4 +--- + +# sum {#agg_function-sum} + +计算总和。 +只适用于数字。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/summap.md b/docs/zh/sql-reference/aggregate-functions/reference/summap.md new file mode 100644 index 00000000000..4ccbc22de35 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/summap.md @@ -0,0 +1,48 @@ +--- +toc_priority: 141 +--- + +# sumMap {#agg_functions-summap} + +Syntax: `sumMap(key, value)` or `sumMap(Tuple(key, value))` + +Totals the `value` array according to the keys specified in the `key` array. + +Passing tuple of keys and values arrays is a synonym to passing two arrays of keys and values. + +The number of elements in `key` and `value` must be the same for each row that is totaled. + +Returns a tuple of two arrays: keys in sorted order, and values ​​summed for the corresponding keys. 
+ +Example: + +``` sql +CREATE TABLE sum_map( + date Date, + timeslot DateTime, + statusMap Nested( + status UInt16, + requests UInt64 + ), + statusMapTuple Tuple(Array(Int32), Array(Int32)) +) ENGINE = Log; +INSERT INTO sum_map VALUES + ('2000-01-01', '2000-01-01 00:00:00', [1, 2, 3], [10, 10, 10], ([1, 2, 3], [10, 10, 10])), + ('2000-01-01', '2000-01-01 00:00:00', [3, 4, 5], [10, 10, 10], ([3, 4, 5], [10, 10, 10])), + ('2000-01-01', '2000-01-01 00:01:00', [4, 5, 6], [10, 10, 10], ([4, 5, 6], [10, 10, 10])), + ('2000-01-01', '2000-01-01 00:01:00', [6, 7, 8], [10, 10, 10], ([6, 7, 8], [10, 10, 10])); + +SELECT + timeslot, + sumMap(statusMap.status, statusMap.requests), + sumMap(statusMapTuple) +FROM sum_map +GROUP BY timeslot +``` + +``` text +┌────────────timeslot─┬─sumMap(statusMap.status, statusMap.requests)─┬─sumMap(statusMapTuple)─────────┐ +│ 2000-01-01 00:00:00 │ ([1,2,3,4,5],[10,10,20,10,10]) │ ([1,2,3,4,5],[10,10,20,10,10]) │ +│ 2000-01-01 00:01:00 │ ([4,5,6,7,8],[10,10,20,10,10]) │ ([4,5,6,7,8],[10,10,20,10,10]) │ +└─────────────────────┴──────────────────────────────────────────────┴────────────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/sumwithoverflow.md b/docs/zh/sql-reference/aggregate-functions/reference/sumwithoverflow.md new file mode 100644 index 00000000000..1b39e9d0eb1 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/sumwithoverflow.md @@ -0,0 +1,9 @@ +--- +toc_priority: 140 +--- + +# sumWithOverflow {#sumwithoverflowx} + +Computes the sum of the numbers, using the same data type for the result as for the input parameters. If the sum exceeds the maximum value for this data type, it is calculated with overflow. + +Only works for numbers. diff --git a/docs/zh/sql-reference/aggregate-functions/reference/topk.md b/docs/zh/sql-reference/aggregate-functions/reference/topk.md new file mode 100644 index 00000000000..004a67d33af --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/topk.md @@ -0,0 +1,42 @@ +--- +toc_priority: 108 +--- + +# topK {#topk} + +Returns an array of the approximately most frequent values in the specified column. The resulting array is sorted in descending order of approximate frequency of values (not by the values themselves). + +Implements the [Filtered Space-Saving](http://www.l2f.inesc-id.pt/~fmmb/wiki/uploads/Work/misnis.ref0a.pdf) algorithm for analyzing TopK, based on the reduce-and-combine algorithm from [Parallel Space Saving](https://arxiv.org/pdf/1401.0702.pdf). + +``` sql +topK(N)(column) +``` + +This function doesn’t provide a guaranteed result. In certain situations, errors might occur and it might return frequent values that aren’t the most frequent values. + +We recommend using the `N < 10` value; performance is reduced with large `N` values. Maximum value of `N = 65536`. + +**Parameters** + +- ‘N’ is the number of elements to return. + +If the parameter is omitted, default value 10 is used. + +**Arguments** + +- ’ x ’ – The value to calculate frequency. + +**Example** + +Take the [OnTime](../../../getting-started/example-datasets/ontime.md) data set and select the three most frequently occurring values in the `AirlineID` column. 
+ +``` sql +SELECT topK(3)(AirlineID) AS res +FROM ontime +``` + +``` text +┌─res─────────────────┐ +│ [19393,19790,19805] │ +└─────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/topkweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/topkweighted.md new file mode 100644 index 00000000000..b597317f44e --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/topkweighted.md @@ -0,0 +1,42 @@ +--- +toc_priority: 109 +--- + +# topKWeighted {#topkweighted} + +Similar to `topK` but takes one additional argument of integer type - `weight`. Every value is accounted `weight` times for frequency calculation. + +**Syntax** + +``` sql +topKWeighted(N)(x, weight) +``` + +**Parameters** + +- `N` — The number of elements to return. + +**Arguments** + +- `x` – The value. +- `weight` — The weight. [UInt8](../../../sql-reference/data-types/int-uint.md). + +**Returned value** + +Returns an array of the values with maximum approximate sum of weights. + +**Example** + +Query: + +``` sql +SELECT topKWeighted(10)(number, number) FROM numbers(1000) +``` + +Result: + +``` text +┌─topKWeighted(10)(number, number)──────────┐ +│ [999,998,997,996,995,994,993,992,991,990] │ +└───────────────────────────────────────────┘ +``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/uniq.md b/docs/zh/sql-reference/aggregate-functions/reference/uniq.md new file mode 100644 index 00000000000..81d1ec6761e --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/uniq.md @@ -0,0 +1,40 @@ +--- +toc_priority: 190 +--- + +# uniq {#agg_function-uniq} + +Calculates the approximate number of different values of the argument. + +``` sql +uniq(x[, ...]) +``` + +**Parameters** + +The function takes a variable number of parameters. Parameters can be `Tuple`, `Array`, `Date`, `DateTime`, `String`, or numeric types. + +**Returned value** + +- A [UInt64](../../../sql-reference/data-types/int-uint.md)-type number. + +**Implementation details** + +Function: + +- Calculates a hash for all parameters in the aggregate, then uses it in calculations. + +- Uses an adaptive sampling algorithm. For the calculation state, the function uses a sample of element hash values up to 65536. + + This algorithm is very accurate and very efficient on the CPU. When the query contains several of these functions, using `uniq` is almost as fast as using other aggregate functions. + +- Provides the result deterministically (it doesn’t depend on the query processing order). + +We recommend using this function in almost all scenarios. + +**See Also** + +- [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined) +- [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64) +- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniqhll12.md#agg_function-uniqhll12) +- [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/uniqcombined.md b/docs/zh/sql-reference/aggregate-functions/reference/uniqcombined.md new file mode 100644 index 00000000000..c52486bc38f --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/uniqcombined.md @@ -0,0 +1,51 @@ +--- +toc_priority: 192 +--- + +# uniqCombined {#agg_function-uniqcombined} + +Calculates the approximate number of different argument values. 
+ +``` sql +uniqCombined(HLL_precision)(x[, ...]) +``` + +The `uniqCombined` function is a good choice for calculating the number of different values. + +**Parameters** + +The function takes a variable number of parameters. Parameters can be `Tuple`, `Array`, `Date`, `DateTime`, `String`, or numeric types. + +`HLL_precision` is the base-2 logarithm of the number of cells in [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog). Optional, you can use the function as `uniqCombined(x[, ...])`. The default value for `HLL_precision` is 17, which is effectively 96 KiB of space (2^17 cells, 6 bits each). + +**Returned value** + +- A number [UInt64](../../../sql-reference/data-types/int-uint.md)-type number. + +**Implementation details** + +Function: + +- Calculates a hash (64-bit hash for `String` and 32-bit otherwise) for all parameters in the aggregate, then uses it in calculations. + +- Uses a combination of three algorithms: array, hash table, and HyperLogLog with an error correction table. + + For a small number of distinct elements, an array is used. When the set size is larger, a hash table is used. For a larger number of elements, HyperLogLog is used, which will occupy a fixed amount of memory. + +- Provides the result deterministically (it doesn’t depend on the query processing order). + +!!! note "Note" + Since it uses 32-bit hash for non-`String` type, the result will have very high error for cardinalities significantly larger than `UINT_MAX` (error will raise quickly after a few tens of billions of distinct values), hence in this case you should use [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64) + +Compared to the [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) function, the `uniqCombined`: + +- Consumes several times less memory. +- Calculates with several times higher accuracy. +- Usually has slightly lower performance. In some scenarios, `uniqCombined` can perform better than `uniq`, for example, with distributed queries that transmit a large number of aggregation states over the network. + +**See Also** + +- [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) +- [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64) +- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniqhll12.md#agg_function-uniqhll12) +- [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/uniqcombined64.md b/docs/zh/sql-reference/aggregate-functions/reference/uniqcombined64.md new file mode 100644 index 00000000000..6d060d82779 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/uniqcombined64.md @@ -0,0 +1,7 @@ +--- +toc_priority: 193 +--- + +# uniqCombined64 {#agg_function-uniqcombined64} + +Same as [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined), but uses 64-bit hash for all data types. 
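+
+**Example**
+
+A minimal sketch using the `numbers` table function; the 64-bit hash keeps the estimate accurate for cardinalities far beyond `UINT_MAX`, where the 32-bit hash of plain `uniqCombined` degrades:
+
+``` sql
+-- numbers(100000) produces a single `number` column with values 0..99999
+SELECT uniqCombined64(number) FROM numbers(100000)
+```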
diff --git a/docs/zh/sql-reference/aggregate-functions/reference/uniqexact.md b/docs/zh/sql-reference/aggregate-functions/reference/uniqexact.md new file mode 100644 index 00000000000..9a6224533c8 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/uniqexact.md @@ -0,0 +1,25 @@ +--- +toc_priority: 191 +--- + +# uniqExact {#agg_function-uniqexact} + +Calculates the exact number of different argument values. + +``` sql +uniqExact(x[, ...]) +``` + +Use the `uniqExact` function if you absolutely need an exact result. Otherwise use the [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) function. + +The `uniqExact` function uses more memory than `uniq`, because the size of the state has unbounded growth as the number of different values increases. + +**Parameters** + +The function takes a variable number of parameters. Parameters can be `Tuple`, `Array`, `Date`, `DateTime`, `String`, or numeric types. + +**See Also** + +- [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) +- [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniqcombined) +- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniqhll12) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/uniqhll12.md b/docs/zh/sql-reference/aggregate-functions/reference/uniqhll12.md new file mode 100644 index 00000000000..fcddc22cc46 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/uniqhll12.md @@ -0,0 +1,39 @@ +--- +toc_priority: 194 +--- + +# uniqHLL12 {#agg_function-uniqhll12} + +Calculates the approximate number of different argument values, using the [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) algorithm. + +``` sql +uniqHLL12(x[, ...]) +``` + +**Parameters** + +The function takes a variable number of parameters. Parameters can be `Tuple`, `Array`, `Date`, `DateTime`, `String`, or numeric types. + +**Returned value** + +- A [UInt64](../../../sql-reference/data-types/int-uint.md)-type number. + +**Implementation details** + +Function: + +- Calculates a hash for all parameters in the aggregate, then uses it in calculations. + +- Uses the HyperLogLog algorithm to approximate the number of different argument values. + + 212 5-bit cells are used. The size of the state is slightly more than 2.5 KB. The result is not very accurate (up to ~10% error) for small data sets (<10K elements). However, the result is fairly accurate for high-cardinality data sets (10K-100M), with a maximum error of ~1.6%. Starting from 100M, the estimation error increases, and the function will return very inaccurate results for data sets with extremely high cardinality (1B+ elements). + +- Provides the determinate result (it doesn’t depend on the query processing order). + +We don’t recommend using this function. In most cases, use the [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) or [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined) function. 
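+
+**Example**
+
+A minimal sketch using the `numbers` table function; the returned value only approximates the exact count (100000 here):
+
+``` sql
+-- numbers(100000) produces a single `number` column with values 0..99999
+SELECT uniqHLL12(number) FROM numbers(100000)
+```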
+ +**See Also** + +- [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) +- [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined) +- [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/varpop.md b/docs/zh/sql-reference/aggregate-functions/reference/varpop.md new file mode 100644 index 00000000000..c08dcfd9bfd --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/varpop.md @@ -0,0 +1,12 @@ +--- +toc_priority: 32 +--- + +# varPop(x) {#varpopx} + +Calculates the amount `Σ((x - x̅)^2) / n`, where `n` is the sample size and `x̅`is the average value of `x`. + +In other words, dispersion for a set of values. Returns `Float64`. + +!!! note "Note" + This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `varPopStable` function. It works slower but provides a lower computational error. diff --git a/docs/zh/sql-reference/aggregate-functions/reference/varsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/varsamp.md new file mode 100644 index 00000000000..78bc545a5d0 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/varsamp.md @@ -0,0 +1,14 @@ +--- +toc_priority: 33 +--- + +# varSamp {#varsamp} + +Calculates the amount `Σ((x - x̅)^2) / (n - 1)`, where `n` is the sample size and `x̅`is the average value of `x`. + +It represents an unbiased estimate of the variance of a random variable if passed values form its sample. + +Returns `Float64`. When `n <= 1`, returns `+∞`. + +!!! note "Note" + This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `varSampStable` function. It works slower but provides a lower computational error. From 7d0430c0ec3d9bf38aa8b5a59ab404ca139b4d3a Mon Sep 17 00:00:00 2001 From: benbiti Date: Wed, 3 Feb 2021 23:22:18 +0800 Subject: [PATCH 142/260] WIP update-aggregate-funcions-in-zh --- docs/zh/faq/terms_translation_zh.md | 8 ++- .../aggregate-functions/reference.md | 41 ------------ .../reference/avgweighted.md | 63 +++++++------------ .../reference/categoricalinformationvalue.md | 4 +- .../aggregate-functions/reference/corr.md | 11 ++-- .../aggregate-functions/reference/count.md | 39 ++++++------ .../aggregate-functions/reference/covarpop.md | 11 ++-- .../reference/covarsamp.md | 13 ++-- .../reference/grouparray.md | 18 ++++-- 9 files changed, 88 insertions(+), 120 deletions(-) diff --git a/docs/zh/faq/terms_translation_zh.md b/docs/zh/faq/terms_translation_zh.md index c7eece6847a..db6e19fa259 100644 --- a/docs/zh/faq/terms_translation_zh.md +++ b/docs/zh/faq/terms_translation_zh.md @@ -4,8 +4,12 @@ ## 保持英文,不译 Parquet -## 英文 <-> 中文 -Tuple 元组 +## 英文 <-> 中文 +Integer 整数 +floating-point 浮点数 +Decimal 定点数 +Tuple 元组 +function 函数 diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index 6d19b404af0..b3b81cc1276 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -379,47 +379,6 @@ kurtSamp(expr) SELECT kurtSamp(value) FROM series_with_value_column ``` -## avgWeighted {#avgweighted} - -计算 [加权算术平均值](https://en.wikipedia.org/wiki/Weighted_arithmetic_mean). 
- -**语法** - -``` sql -avgWeighted(x, weight) -``` - -**参数** - -- `x` — 值。 [整数](../data-types/int-uint.md) 或 [浮点](../data-types/float.md). -- `weight` — 值的加权。 [整数](../data-types/int-uint.md) 或 [浮点](../data-types/float.md). - -`x` 和 `weight` 的类型一定是一样的 - -**返回值** - -- 加权平均值。 -- `NaN`. 如果所有的权重都等于0。 - -类型: [Float64](../data-types/float.md). - -**示例** - -查询: - -``` sql -SELECT avgWeighted(x, w) -FROM values('x Int8, w Int8', (4, 1), (1, 0), (10, 2)) -``` - -结果: - -``` text -┌─avgWeighted(x, weight)─┐ -│ 8 │ -└────────────────────────┘ -``` - ## uniq {#agg_function-uniq} 计算参数的不同值的近似数量。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/avgweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/avgweighted.md index 7b9c0de2755..a353c02c13b 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/avgweighted.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/avgweighted.md @@ -4,42 +4,43 @@ toc_priority: 107 # avgWeighted {#avgweighted} -Calculates the [weighted arithmetic mean](https://en.wikipedia.org/wiki/Weighted_arithmetic_mean). -**Syntax** +计算 [加权算术平均值](https://en.wikipedia.org/wiki/Weighted_arithmetic_mean). + +**语法** ``` sql avgWeighted(x, weight) ``` -**Parameters** +**参数** -- `x` — Values. -- `weight` — Weights of the values. +- `x` — 值。 +- `weight` — 值的加权。 -`x` and `weight` must both be -[Integer](../../../sql-reference/data-types/int-uint.md), -[floating-point](../../../sql-reference/data-types/float.md), or -[Decimal](../../../sql-reference/data-types/decimal.md), -but may have different types. +`x` 和 `weight` 的类型必须是 +[整数](../../../sql-reference/data-types/int-uint.md), 或 +[浮点数](../../../sql-reference/data-types/float.md), 或 +[定点数](../../../sql-reference/data-types/decimal.md), +但是可以不一样。 -**Returned value** +**返回值** -- `NaN` if all the weights are equal to 0 or the supplied weights parameter is empty. -- Weighted mean otherwise. +- `NaN`。 如果所有的权重都等于0 或所提供的权重参数是空。 +- 加权平均值。 其他。 -**Return type** is always [Float64](../../../sql-reference/data-types/float.md). +类型: 总是[Float64](../data-types/float.md). -**Example** +**示例** -Query: +查询: ``` sql SELECT avgWeighted(x, w) FROM values('x Int8, w Int8', (4, 1), (1, 0), (10, 2)) ``` -Result: +结果: ``` text ┌─avgWeighted(x, weight)─┐ @@ -47,33 +48,17 @@ Result: └────────────────────────┘ ``` -**Example** -Query: +**示例** -``` sql -SELECT avgWeighted(x, w) -FROM values('x Int8, w Float64', (4, 1), (1, 0), (10, 2)) -``` - -Result: - -``` text -┌─avgWeighted(x, weight)─┐ -│ 8 │ -└────────────────────────┘ -``` - -**Example** - -Query: +查询: ``` sql SELECT avgWeighted(x, w) FROM values('x Int8, w Int8', (0, 0), (1, 0), (10, 0)) ``` -Result: +结果: ``` text ┌─avgWeighted(x, weight)─┐ @@ -81,16 +66,16 @@ Result: └────────────────────────┘ ``` -**Example** +**示例** -Query: +查询: ``` sql CREATE table test (t UInt8) ENGINE = Memory; SELECT avgWeighted(t) FROM test ``` -Result: +结果: ``` text ┌─avgWeighted(x, weight)─┐ diff --git a/docs/zh/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md b/docs/zh/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md index 2e9001dec19..1970e76c2fd 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md @@ -4,10 +4,10 @@ toc_priority: 250 # categoricalInformationValue {#categoricalinformationvalue} -Calculates the value of `(P(tag = 1) - P(tag = 0))(log(P(tag = 1)) - log(P(tag = 0)))` for each category. 
+对于每个类别计算 `(P(tag = 1) - P(tag = 0))(log(P(tag = 1)) - log(P(tag = 0)))` 。 ``` sql categoricalInformationValue(category1, category2, ..., tag) ``` -The result indicates how a discrete (categorical) feature `[category1, category2, ...]` contribute to a learning model which predicting the value of `tag`. +结果指示离散(分类)要素如何使用 `[category1, category2, ...]` 有助于使用学习模型预测`tag`的值。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/corr.md b/docs/zh/sql-reference/aggregate-functions/reference/corr.md index 88f9295a8f2..5ab49f75023 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/corr.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/corr.md @@ -4,9 +4,12 @@ toc_priority: 107 # corr {#corrx-y} -Syntax: `corr(x, y)` +**语法** +``` sql +`corr(x, y)` +``` -Calculates the Pearson correlation coefficient: `Σ((x - x̅)(y - y̅)) / sqrt(Σ((x - x̅)^2) * Σ((y - y̅)^2))`. +计算Pearson相关系数: `Σ((x - x̅)(y - y̅)) / sqrt(Σ((x - x̅)^2) * Σ((y - y̅)^2))`。 -!!! note "Note" - This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `corrStable` function. It works slower but provides a lower computational error. +!!! note "注" +该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `corrStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/count.md b/docs/zh/sql-reference/aggregate-functions/reference/count.md index e5d31429e12..5e464a0cb61 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/count.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/count.md @@ -4,35 +4,36 @@ toc_priority: 1 # count {#agg_function-count} -Counts the number of rows or not-NULL values. -ClickHouse supports the following syntaxes for `count`: -- `count(expr)` or `COUNT(DISTINCT expr)`. -- `count()` or `COUNT(*)`. The `count()` syntax is ClickHouse-specific. +计数行数或非空值。 -**Parameters** +ClickHouse支持以下 `count` 语法: +- `count(expr)` 或 `COUNT(DISTINCT expr)`。 +- `count()` 或 `COUNT(*)`. 该 `count()` 语法是ClickHouse特定的。 -The function can take: +**参数** -- Zero parameters. -- One [expression](../../../sql-reference/syntax.md#syntax-expressions). +该函数可以采取: -**Returned value** +- 零参数。 +- 一个 [表达式](../../../sql-reference/syntax.md#syntax-expressions)。 -- If the function is called without parameters it counts the number of rows. -- If the [expression](../../../sql-reference/syntax.md#syntax-expressions) is passed, then the function counts how many times this expression returned not null. If the expression returns a [Nullable](../../../sql-reference/data-types/nullable.md)-type value, then the result of `count` stays not `Nullable`. The function returns 0 if the expression returned `NULL` for all the rows. +**返回值** -In both cases the type of the returned value is [UInt64](../../../sql-reference/data-types/int-uint.md). +- 如果没有参数调用函数,它会计算行数。 +- 如果 [表达式](../../../syntax.md#syntax-expressions) 被传递,则该函数计数此表达式返回非null的次数。 如果表达式返回 [可为空](../../../sql-reference/data-types/nullable.md)类型的值,`count`的结果仍然不 `Nullable`。 如果表达式对于所有的行都返回 `NULL` ,则该函数返回 0 。 -**Details** +在这两种情况下,返回值的类型为 [UInt64](../../../sql-reference/data-types/int-uint.md)。 -ClickHouse supports the `COUNT(DISTINCT ...)` syntax. The behavior of this construction depends on the [count_distinct_implementation](../../../operations/settings/settings.md#settings-count_distinct_implementation) setting. 
It defines which of the [uniq\*](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) functions is used to perform the operation. The default is the [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact) function. +**详细信息** -The `SELECT count() FROM table` query is not optimized, because the number of entries in the table is not stored separately. It chooses a small column from the table and counts the number of values in it. +ClickHouse支持 `COUNT(DISTINCT ...)` 语法,这种结构的行为取决于 [count_distinct_implementation](../../../operations/settings/settings.md#settings-count_distinct_implementation) 设置。 它定义了用于执行该操作的 [uniq\*](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq)函数。 默认值是 [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact)函数。 -**Examples** +`SELECT count() FROM table` 这个查询未被优化,因为表中的条目数没有单独存储。 它从表中选择一个小列并计算其值的个数。 -Example 1: +**示例** + +示例1: ``` sql SELECT count() FROM t @@ -44,7 +45,7 @@ SELECT count() FROM t └─────────┘ ``` -Example 2: +示例2: ``` sql SELECT name, value FROM system.settings WHERE name = 'count_distinct_implementation' @@ -66,4 +67,4 @@ SELECT count(DISTINCT num) FROM t └────────────────┘ ``` -This example shows that `count(DISTINCT num)` is performed by the `uniqExact` function according to the `count_distinct_implementation` setting value. +这个例子表明 `count(DISTINCT num)` 是通过 `count_distinct_implementation` 的设定值 `uniqExact` 函数来执行的。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md b/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md index 2a7d805763e..cddad69e56a 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md @@ -4,9 +4,12 @@ toc_priority: 36 # covarPop {#covarpop} -Syntax: `covarPop(x, y)` +**语法** +``` sql +`covarPop(x, y)` +``` -Calculates the value of `Σ((x - x̅)(y - y̅)) / n`. +计算 `Σ((x - x̅)(y - y̅)) / n` 的值。 -!!! note "Note" - This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `covarPopStable` function. It works slower but provides a lower computational error. +!!! note "注" +该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `covarPopStable` 函数。 它的工作速度较慢,但提供了较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md index 4bdb1b02d40..5ee18cf3f97 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md @@ -4,9 +4,14 @@ toc_priority: 37 # covarSamp {#covarsamp} -Calculates the value of `Σ((x - x̅)(y - y̅)) / (n - 1)`. +**语法** +``` sql +`covarSamp(x, y)` +``` -Returns Float64. When `n <= 1`, returns +∞. +计算 `Σ((x - x̅)(y - y̅)) / (n - 1)` 的值。 -!!! note "Note" - This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `covarSampStable` function. It works slower but provides a lower computational error. +返回Float64。 当 `n <= 1`, 返回 +∞。 + +!!! 
note "注" +该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `covarSampStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparray.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparray.md index 86b7b83022b..81cd38db8b5 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/grouparray.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparray.md @@ -4,11 +4,19 @@ toc_priority: 110 # groupArray {#agg_function-grouparray} -Syntax: `groupArray(x)` or `groupArray(max_size)(x)` +**语法** +``` sql +`groupArray(x)` -Creates an array of argument values. -Values can be added to the array in any (indeterminate) order. +or -The second version (with the `max_size` parameter) limits the size of the resulting array to `max_size` elements. For example, `groupArray(1)(x)` is equivalent to `[any (x)]`. +`groupArray(max_size)(x)` +``` -In some cases, you can still rely on the order of execution. This applies to cases when `SELECT` comes from a subquery that uses `ORDER BY`. +创建参数值的数组。 +值可以按任何(不确定)顺序添加到数组中。 + +第二个版本(带有 `max_size` 参数)将结果数组的大小限制为 `max_size` 个元素。 +例如, `groupArray (1) (x)` 相当于 `[any (x)]` 。 + +在某些情况下,您仍然可以依赖执行顺序。这适用于SELECT(查询)来自使用了 `ORDER BY` 子查询的情况。 From af0930f62a66a594781f605542b6b9b34cb7e281 Mon Sep 17 00:00:00 2001 From: benbiti Date: Thu, 4 Feb 2021 22:54:31 +0800 Subject: [PATCH 143/260] WIP update-aggregate-funcions-in-zh --- docs/zh/faq/terms_translation_zh.md | 19 +- .../aggregate-functions/reference.md | 205 ------------------ .../reference/grouparrayinsertat.md | 46 ++-- .../reference/grouparraymovingavg.md | 41 ++-- 4 files changed, 65 insertions(+), 246 deletions(-) diff --git a/docs/zh/faq/terms_translation_zh.md b/docs/zh/faq/terms_translation_zh.md index db6e19fa259..ab58b9769d7 100644 --- a/docs/zh/faq/terms_translation_zh.md +++ b/docs/zh/faq/terms_translation_zh.md @@ -1,15 +1,32 @@ # 术语翻译约定 本文档用来维护从英文翻译成中文的术语集。 + + ## 保持英文,不译 Parquet -## 英文 <-> 中文 +## 英文 <-> 中文 Integer 整数 floating-point 浮点数 Decimal 定点数 Tuple 元组 function 函数 +array 数组/阵列 +## +1. 对于array的翻译,保持初始翻译 数组/阵列 不变。 + +2. 对于倒装句。翻译时非直译,会调整语序。 +比如, groupArrayInsertAt 翻译中 + +``` text +- `x` — [Expression] resulting in one of the [supported data types]. +``` + +``` text +`x` — 生成所[支持的数据类型](数据)的[表达式]。 +``` + diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index b3b81cc1276..f22d54955ee 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -526,103 +526,6 @@ uniqExact(x[, ...]) - [uniqCombined](#agg_function-uniqcombined) - [uniqHLL12](#agg_function-uniqhll12) -## groupArray(x), groupArray(max_size)(x) {#agg_function-grouparray} - -创建参数值的数组。 -值可以按任何(不确定)顺序添加到数组中。 - -第二个版本(与 `max_size` 参数)将结果数组的大小限制为 `max_size` 元素。 -例如, `groupArray (1) (x)` 相当于 `[any (x)]`. - -在某些情况下,您仍然可以依靠执行的顺序。 这适用于以下情况 `SELECT` 来自使用 `ORDER BY`. - -## groupArrayInsertAt {#grouparrayinsertat} - -在指定位置向数组中插入一个值。 - -**语法** - -``` sql -groupArrayInsertAt(default_x, size)(x, pos); -``` - -如果在一个查询中将多个值插入到同一位置,则该函数的行为方式如下: - -- 如果在单个线程中执行查询,则使用第一个插入的值。 -- 如果在多个线程中执行查询,则结果值是未确定的插入值之一。 - -**参数** - -- `x` — 被插入的值。[表达式](../syntax.md#syntax-expressions) 导致的一个 [支持的数据类型](../../sql-reference/data-types/index.md). -- `pos` — `x` 将被插入的位置。 数组中的索引编号从零开始。 [UInt32](../../sql-reference/data-types/int-uint.md#uint-ranges). 
-- `default_x`— 如果代入值为空,则使用默认值。可选参数。[表达式](../syntax.md#syntax-expressions) 为 `x` 数据类型的数据。 如果 `default_x` 未定义,则 [默认值](../../sql-reference/statements/create.md#create-default-values) 被使用。 -- `size`— 结果数组的长度。可选参数。如果使用该参数,`default_x` 必须指定。 [UInt32](../../sql-reference/data-types/int-uint.md#uint-ranges). - -**返回值** - -- 具有插入值的数组。 - -类型: [阵列](../../sql-reference/data-types/array.md#data-type-array). - -**示例** - -查询: - -``` sql -SELECT groupArrayInsertAt(toString(number), number * 2) FROM numbers(5); -``` - -结果: - -``` text -┌─groupArrayInsertAt(toString(number), multiply(number, 2))─┐ -│ ['0','','1','','2','','3','','4'] │ -└───────────────────────────────────────────────────────────┘ -``` - -查询: - -``` sql -SELECT groupArrayInsertAt('-')(toString(number), number * 2) FROM numbers(5); -``` - -结果: - -``` text -┌─groupArrayInsertAt('-')(toString(number), multiply(number, 2))─┐ -│ ['0','-','1','-','2','-','3','-','4'] │ -└────────────────────────────────────────────────────────────────┘ -``` - -查询: - -``` sql -SELECT groupArrayInsertAt('-', 5)(toString(number), number * 2) FROM numbers(5); -``` - -结果: - -``` text -┌─groupArrayInsertAt('-', 5)(toString(number), multiply(number, 2))─┐ -│ ['0','-','1','-','2'] │ -└───────────────────────────────────────────────────────────────────┘ -``` - -在一个位置多线程插入数据。 - -查询: - -``` sql -SELECT groupArrayInsertAt(number, 0) FROM numbers_mt(10) SETTINGS max_block_size = 1; -``` - -作为这个查询的结果,你会得到随机整数 `[0,9]` 范围。 例如: - -``` text -┌─groupArrayInsertAt(number, 0)─┐ -│ [7] │ -└───────────────────────────────┘ -``` ## groupArrayMovingSum {#agg_function-grouparraymovingsum} @@ -697,81 +600,6 @@ FROM t └────────────┴─────────────────────────────────┴────────────────────────┘ ``` -## groupArrayMovingAvg {#agg_function-grouparraymovingavg} - -计算输入值的移动平均值。 - -``` sql -groupArrayMovingAvg(numbers_for_summing) -groupArrayMovingAvg(window_size)(numbers_for_summing) -``` - -该函数可以将窗口大小作为参数。 如果未指定,则该函数的窗口大小等于列中的行数。 - -**参数** - -- `numbers_for_summing` — [表达式](../syntax.md#syntax-expressions) 生成数值数据类型值。 -- `window_size` — 窗口大小。 - -**返回值** - -- 与输入数据大小和类型相同的数组。 - -该函数使用 [四舍五入到零](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero). 
它截断无意义的小数位来保证结果的数据类型。 - -**示例** - -样品表 `b`: - -``` sql -CREATE TABLE t -( - `int` UInt8, - `float` Float32, - `dec` Decimal32(2) -) -ENGINE = TinyLog -``` - -``` text -┌─int─┬─float─┬──dec─┐ -│ 1 │ 1.1 │ 1.10 │ -│ 2 │ 2.2 │ 2.20 │ -│ 4 │ 4.4 │ 4.40 │ -│ 7 │ 7.77 │ 7.77 │ -└─────┴───────┴──────┘ -``` - -查询: - -``` sql -SELECT - groupArrayMovingAvg(int) AS I, - groupArrayMovingAvg(float) AS F, - groupArrayMovingAvg(dec) AS D -FROM t -``` - -``` text -┌─I─────────┬─F───────────────────────────────────┬─D─────────────────────┐ -│ [0,0,1,3] │ [0.275,0.82500005,1.9250001,3.8675] │ [0.27,0.82,1.92,3.86] │ -└───────────┴─────────────────────────────────────┴───────────────────────┘ -``` - -``` sql -SELECT - groupArrayMovingAvg(2)(int) AS I, - groupArrayMovingAvg(2)(float) AS F, - groupArrayMovingAvg(2)(dec) AS D -FROM t -``` - -``` text -┌─I─────────┬─F────────────────────────────────┬─D─────────────────────┐ -│ [0,1,3,5] │ [0.55,1.6500001,3.3000002,6.085] │ [0.55,1.65,3.30,6.08] │ -└───────────┴──────────────────────────────────┴───────────────────────┘ -``` - ## groupUniqArray(x), groupUniqArray(max_size)(x) {#groupuniqarrayx-groupuniqarraymax-sizex} 从不同的参数值创建一个数组。 内存消耗是一样的 `uniqExact` 功能。 @@ -1454,39 +1282,6 @@ SELECT topKWeighted(10)(number, number) FROM numbers(1000) └───────────────────────────────────────────┘ ``` -## covarSamp(x,y) {#covarsampx-y} - -计算 `Σ((x - x̅)(y - y̅)) / (n - 1)`。 - -返回Float64。 当 `n <= 1`, returns +∞。 - -!!! note "注" - 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `covarSampStable` 功能。 它的工作速度较慢,但提供较低的计算错误。 - -## covarPop(x,y) {#covarpopx-y} - -计算 `Σ((x - x̅)(y - y̅)) / n`。 - -!!! note "注" - 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `covarPopStable` 功能。 它的工作速度较慢,但提供了较低的计算错误。 - -## corr(x,y) {#corrx-y} - -计算Pearson相关系数: `Σ((x - x̅)(y - y̅)) / sqrt(Σ((x - x̅)^2) * Σ((y - y̅)^2))`。 - -!!! note "注" - 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `corrStable` 功能。 它的工作速度较慢,但提供较低的计算错误。 - -## categoricalInformationValue {#categoricalinformationvalue} - -对于每个类别计算 `(P(tag = 1) - P(tag = 0))(log(P(tag = 1)) - log(P(tag = 0)))` 。 - -``` sql -categoricalInformationValue(category1, category2, ..., tag) -``` - -结果指示离散(分类)要素如何使用 `[category1, category2, ...]` 有助于使用学习模型预测`tag`的值。 - ## simpleLinearRegression {#simplelinearregression} 执行简单(一维)线性回归。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md index f4b8665a0a4..ed1b0806eb4 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md @@ -4,41 +4,41 @@ toc_priority: 112 # groupArrayInsertAt {#grouparrayinsertat} -Inserts a value into the array at the specified position. +在指定位置向数组中插入一个值。 -**Syntax** +**语法** ``` sql groupArrayInsertAt(default_x, size)(x, pos); ``` -If in one query several values are inserted into the same position, the function behaves in the following ways: +如果在一个查询中将多个值插入到同一位置,则该函数的行为方式如下: -- If a query is executed in a single thread, the first one of the inserted values is used. -- If a query is executed in multiple threads, the resulting value is an undetermined one of the inserted values. +- 如果在单个线程中执行查询,则使用第一个插入的值。 +- 如果在多个线程中执行查询,则结果值是未确定的插入值之一。 -**Parameters** +**参数** -- `x` — Value to be inserted. 
[Expression](../../../sql-reference/syntax.md#syntax-expressions) resulting in one of the [supported data types](../../../sql-reference/data-types/index.md). -- `pos` — Position at which the specified element `x` is to be inserted. Index numbering in the array starts from zero. [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges). -- `default_x`— Default value for substituting in empty positions. Optional parameter. [Expression](../../../sql-reference/syntax.md#syntax-expressions) resulting in the data type configured for the `x` parameter. If `default_x` is not defined, the [default values](../../../sql-reference/statements/create/table.md#create-default-values) are used. -- `size`— Length of the resulting array. Optional parameter. When using this parameter, the default value `default_x` must be specified. [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges). +- `x` — 要插入的值。生成所[支持的数据类型](../../../sql-reference/data-types/index.md)(数据)的[表达式](../../../syntax.md#syntax-expressions)。 +- `pos` — 指定元素 `x` 将被插入的位置。 数组中的索引编号从零开始。 [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges). +- `default_x`— 在空位置替换的默认值。可选参数。生成 `x` 数据类型 (数据) 的[表达式](../../../syntax.md#syntax-expressions)。 如果 `default_x` 未定义,则 [默认值](../../../sql-reference/statements/create.md#create-default-values) 被使用。 +- `size`— 结果数组的长度。可选参数。如果使用该参数,必须指定默认值 `default_x` 。 [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges)。 -**Returned value** +**返回值** -- Array with inserted values. +- 具有插入值的数组。 -Type: [Array](../../../sql-reference/data-types/array.md#data-type-array). +类型: [阵列](../../../sql-reference/data-types/array.md#data-type-array)。 -**Example** +**示例** -Query: +查询: ``` sql SELECT groupArrayInsertAt(toString(number), number * 2) FROM numbers(5); ``` -Result: +结果: ``` text ┌─groupArrayInsertAt(toString(number), multiply(number, 2))─┐ @@ -46,13 +46,13 @@ Result: └───────────────────────────────────────────────────────────┘ ``` -Query: +查询: ``` sql SELECT groupArrayInsertAt('-')(toString(number), number * 2) FROM numbers(5); ``` -Result: +结果: ``` text ┌─groupArrayInsertAt('-')(toString(number), multiply(number, 2))─┐ @@ -60,13 +60,13 @@ Result: └────────────────────────────────────────────────────────────────┘ ``` -Query: +查询: ``` sql SELECT groupArrayInsertAt('-', 5)(toString(number), number * 2) FROM numbers(5); ``` -Result: +结果: ``` text ┌─groupArrayInsertAt('-', 5)(toString(number), multiply(number, 2))─┐ @@ -74,15 +74,15 @@ Result: └───────────────────────────────────────────────────────────────────┘ ``` -Multi-threaded insertion of elements into one position. +在一个位置多线程插入数据。 -Query: +查询: ``` sql SELECT groupArrayInsertAt(number, 0) FROM numbers_mt(10) SETTINGS max_block_size = 1; ``` -As a result of this query you get random integer in the `[0,9]` range. For example: +作为这个查询的结果,你会得到 `[0,9]` 范围的随机整数。 例如: ``` text ┌─groupArrayInsertAt(number, 0)─┐ diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingavg.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingavg.md index 1cd40c2002f..8cdfc302b39 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingavg.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingavg.md @@ -4,29 +4,36 @@ toc_priority: 114 # groupArrayMovingAvg {#agg_function-grouparraymovingavg} -Calculates the moving average of input values. 
+计算输入值的移动平均值。 + +**语法** ``` sql groupArrayMovingAvg(numbers_for_summing) groupArrayMovingAvg(window_size)(numbers_for_summing) ``` -The function can take the window size as a parameter. If left unspecified, the function takes the window size equal to the number of rows in the column. +该函数可以将窗口大小作为参数。 如果未指定,则该函数的窗口大小等于列中的行数。 -**Parameters** +**参数** -- `numbers_for_summing` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) resulting in a numeric data type value. -- `window_size` — Size of the calculation window. +- `numbers_for_summing` — [表达式](../../../sql-reference/syntax.md#syntax-expressions) 生成数值数据类型值。 +- `window_size` — 窗口大小。 -**Returned values** +**返回值** -- Array of the same size and type as the input data. +- 与输入数据大小相同的数组。 -The function uses [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero). It truncates the decimal places insignificant for the resulting data type. +对于输入数据类型是[Integer](../../../sql-reference/data-types/int-uint.md), +和[floating-point](../../../sql-reference/data-types/float.md), +对应的返回值类型是 `Float64` 。 +对于输入数据类型是[Decimal](../../../sql-reference/data-types/decimal.md) 返回值类型是 `Decimal128` 。 -**Example** +该函数对于 `Decimal128` 使用 [四舍五入到零](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero). 它截断无意义的小数位来保证结果的数据类型。 -The sample table `b`: +**示例** + +样表 `t`: ``` sql CREATE TABLE t @@ -47,7 +54,7 @@ ENGINE = TinyLog └─────┴───────┴──────┘ ``` -The queries: +查询: ``` sql SELECT @@ -58,9 +65,9 @@ FROM t ``` ``` text -┌─I─────────┬─F───────────────────────────────────┬─D─────────────────────┐ -│ [0,0,1,3] │ [0.275,0.82500005,1.9250001,3.8675] │ [0.27,0.82,1.92,3.86] │ -└───────────┴─────────────────────────────────────┴───────────────────────┘ +┌─I────────────────────┬─F─────────────────────────────────────────────────────────────────────────────┬─D─────────────────────┐ +│ [0.25,0.75,1.75,3.5] │ [0.2750000059604645,0.8250000178813934,1.9250000417232513,3.8499999940395355] │ [0.27,0.82,1.92,3.86] │ +└──────────────────────┴───────────────────────────────────────────────────────────────────────────────┴───────────────────────┘ ``` ``` sql @@ -72,7 +79,7 @@ FROM t ``` ``` text -┌─I─────────┬─F────────────────────────────────┬─D─────────────────────┐ -│ [0,1,3,5] │ [0.55,1.6500001,3.3000002,6.085] │ [0.55,1.65,3.30,6.08] │ -└───────────┴──────────────────────────────────┴───────────────────────┘ +┌─I───────────────┬─F───────────────────────────────────────────────────────────────────────────┬─D─────────────────────┐ +│ [0.5,1.5,3,5.5] │ [0.550000011920929,1.6500000357627869,3.3000000715255737,6.049999952316284] │ [0.55,1.65,3.30,6.08] │ +└─────────────────┴─────────────────────────────────────────────────────────────────────────────┴───────────────────────┘ ``` From a6e9d9200bfb1ac34d13344965f22b2c746ed756 Mon Sep 17 00:00:00 2001 From: benbiti Date: Thu, 4 Feb 2021 23:34:04 +0800 Subject: [PATCH 144/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference.md | 73 ------------------- .../reference/grouparraymovingsum.md | 25 ++++--- .../reference/grouparraysample.md | 35 ++++----- 3 files changed, 33 insertions(+), 100 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index f22d54955ee..0c49dbce8a7 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -527,79 +527,6 @@ uniqExact(x[, ...]) - [uniqHLL12](#agg_function-uniqhll12) -## 
groupArrayMovingSum {#agg_function-grouparraymovingsum} - -计算输入值的移动和。 - -``` sql -groupArrayMovingSum(numbers_for_summing) -groupArrayMovingSum(window_size)(numbers_for_summing) -``` - -该函数可以将窗口大小作为参数。 如果未指定,则该函数的窗口大小等于列中的行数。 - -**参数** - -- `numbers_for_summing` — [表达式](../syntax.md#syntax-expressions) 为数值数据类型值。 -- `window_size` — 窗口大小。 - -**返回值** - -- 与输入数据大小和类型相同的数组。 - -**示例** - -样品表: - -``` sql -CREATE TABLE t -( - `int` UInt8, - `float` Float32, - `dec` Decimal32(2) -) -ENGINE = TinyLog -``` - -``` text -┌─int─┬─float─┬──dec─┐ -│ 1 │ 1.1 │ 1.10 │ -│ 2 │ 2.2 │ 2.20 │ -│ 4 │ 4.4 │ 4.40 │ -│ 7 │ 7.77 │ 7.77 │ -└─────┴───────┴──────┘ -``` - -查询: - -``` sql -SELECT - groupArrayMovingSum(int) AS I, - groupArrayMovingSum(float) AS F, - groupArrayMovingSum(dec) AS D -FROM t -``` - -``` text -┌─I──────────┬─F───────────────────────────────┬─D──────────────────────┐ -│ [1,3,7,14] │ [1.1,3.3000002,7.7000003,15.47] │ [1.10,3.30,7.70,15.47] │ -└────────────┴─────────────────────────────────┴────────────────────────┘ -``` - -``` sql -SELECT - groupArrayMovingSum(2)(int) AS I, - groupArrayMovingSum(2)(float) AS F, - groupArrayMovingSum(2)(dec) AS D -FROM t -``` - -``` text -┌─I──────────┬─F───────────────────────────────┬─D──────────────────────┐ -│ [1,3,6,11] │ [1.1,3.3000002,6.6000004,12.17] │ [1.10,3.30,6.60,12.17] │ -└────────────┴─────────────────────────────────┴────────────────────────┘ -``` - ## groupUniqArray(x), groupUniqArray(max_size)(x) {#groupuniqarrayx-groupuniqarraymax-sizex} 从不同的参数值创建一个数组。 内存消耗是一样的 `uniqExact` 功能。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingsum.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingsum.md index ef979cd5f6a..2fc811a9de1 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingsum.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingsum.md @@ -4,27 +4,32 @@ toc_priority: 113 # groupArrayMovingSum {#agg_function-grouparraymovingsum} -Calculates the moving sum of input values. + +计算输入值的移动和。 + +**语法** ``` sql groupArrayMovingSum(numbers_for_summing) groupArrayMovingSum(window_size)(numbers_for_summing) ``` -The function can take the window size as a parameter. If left unspecified, the function takes the window size equal to the number of rows in the column. +该函数可以将窗口大小作为参数。 如果未指定,则该函数的窗口大小等于列中的行数。 -**Parameters** +**参数** -- `numbers_for_summing` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) resulting in a numeric data type value. -- `window_size` — Size of the calculation window. +- `numbers_for_summing` — [表达式](../../../sql-reference/syntax.md#syntax-expressions) 生成数值数据类型值。。 +- `window_size` — 窗口大小。 -**Returned values** +**返回值** -- Array of the same size and type as the input data. 
+- 与输入数据大小相同的数组。 +对于输入数据类型是[Decimal](../../../sql-reference/data-types/decimal.md) 数组元素类型是 `Decimal128` 。 +对于其他的数值类型, 获取其对应的 `NearestFieldType` 。 -**Example** +**示例** -The sample table: +样表: ``` sql CREATE TABLE t @@ -45,7 +50,7 @@ ENGINE = TinyLog └─────┴───────┴──────┘ ``` -The queries: +查询: ``` sql SELECT diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparraysample.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparraysample.md index 36fa6a9d661..9f6cae2ca32 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/grouparraysample.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparraysample.md @@ -4,29 +4,30 @@ toc_priority: 114 # groupArraySample {#grouparraysample} -Creates an array of sample argument values. The size of the resulting array is limited to `max_size` elements. Argument values are selected and added to the array randomly. +构建一个参数值的采样数组。 +结果数组的大小限制为 `max_size` 个元素。参数值被随机选择并添加到数组中。 -**Syntax** +**语法** ``` sql groupArraySample(max_size[, seed])(x) ``` -**Parameters** +**参数** -- `max_size` — Maximum size of the resulting array. [UInt64](../../data-types/int-uint.md). -- `seed` — Seed for the random number generator. Optional. [UInt64](../../data-types/int-uint.md). Default value: `123456`. -- `x` — Argument (column name or expression). +- `max_size` — 结果数组的最大长度. [UInt64](../../data-types/int-uint.md)。 +- `seed` — 随机数发生器的种子. 可选。 [UInt64](../../data-types/int-uint.md)。 默认值: `123456`。 +- `x` — 参数 (列名 或者 表达式). -**Returned values** +**返回值** -- Array of randomly selected `x` arguments. +- 随机选取参数 `x` (的值)组成的数组。 -Type: [Array](../../data-types/array.md). +类型: [Array](../../../data-types/array.md). -**Examples** +**示例** -Consider table `colors`: +样表 `colors`: ``` text ┌─id─┬─color──┐ @@ -38,13 +39,13 @@ Consider table `colors`: └────┴────────┘ ``` -Query with column name as argument: +使用列名做参数查询: ``` sql SELECT groupArraySample(3)(color) as newcolors FROM colors; ``` -Result: +结果: ```text ┌─newcolors──────────────────┐ @@ -52,13 +53,13 @@ Result: └────────────────────────────┘ ``` -Query with column name and different seed: +使用列名和不同的(随机数)种子查询: ``` sql SELECT groupArraySample(3, 987654321)(color) as newcolors FROM colors; ``` -Result: +结果: ```text ┌─newcolors──────────────────┐ @@ -66,13 +67,13 @@ Result: └────────────────────────────┘ ``` -Query with expression as argument: +使用表达式做参数查询: ``` sql SELECT groupArraySample(3)(concat('light-', color)) as newcolors FROM colors; ``` -Result: +结果: ```text ┌─newcolors───────────────────────────────────┐ From 84c318502f2df99bfc22aaff473b3e5de533ffea Mon Sep 17 00:00:00 2001 From: benbiti Date: Fri, 5 Feb 2021 12:14:07 +0800 Subject: [PATCH 145/260] WIP update-aggregate-funcions-in-zh: fix broken links --- docs/zh/sql-reference/aggregate-functions/reference/any.md | 2 +- .../sql-reference/aggregate-functions/reference/anyheavy.md | 2 +- .../aggregate-functions/reference/avgweighted.md | 2 +- docs/zh/sql-reference/aggregate-functions/reference/count.md | 2 +- .../aggregate-functions/reference/grouparrayinsertat.md | 4 ++-- .../aggregate-functions/reference/grouparraysample.md | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/any.md b/docs/zh/sql-reference/aggregate-functions/reference/any.md index 3df326ac84b..205ff1c1944 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/any.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/any.md @@ -2,7 +2,7 @@ toc_priority: 6 --- -# any(x) 
{#agg_function-any} +# any {#agg_function-any} 选择第一个遇到的值。 查询可以以任何顺序执行,甚至每次都以不同的顺序执行,因此此函数的结果是不确定的。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/anyheavy.md b/docs/zh/sql-reference/aggregate-functions/reference/anyheavy.md index e01320e85b1..f47027bd0c4 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/anyheavy.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/anyheavy.md @@ -16,7 +16,7 @@ anyHeavy(column) **示例** -使用 [OnTime](../../getting-started/example-datasets/ontime.md) 数据集,并选择在 `AirlineID` 列任何频繁出现的值。 +使用 [OnTime](../../../getting-started/example-datasets/ontime.md) 数据集,并选择在 `AirlineID` 列任何频繁出现的值。 ``` sql SELECT anyHeavy(AirlineID) AS res diff --git a/docs/zh/sql-reference/aggregate-functions/reference/avgweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/avgweighted.md index a353c02c13b..ba53678b704 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/avgweighted.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/avgweighted.md @@ -29,7 +29,7 @@ avgWeighted(x, weight) - `NaN`。 如果所有的权重都等于0 或所提供的权重参数是空。 - 加权平均值。 其他。 -类型: 总是[Float64](../data-types/float.md). +类型: 总是[Float64](../../../sql-reference/data-types/float.md). **示例** diff --git a/docs/zh/sql-reference/aggregate-functions/reference/count.md b/docs/zh/sql-reference/aggregate-functions/reference/count.md index 5e464a0cb61..fc528980bfa 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/count.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/count.md @@ -21,7 +21,7 @@ ClickHouse支持以下 `count` 语法: **返回值** - 如果没有参数调用函数,它会计算行数。 -- 如果 [表达式](../../../syntax.md#syntax-expressions) 被传递,则该函数计数此表达式返回非null的次数。 如果表达式返回 [可为空](../../../sql-reference/data-types/nullable.md)类型的值,`count`的结果仍然不 `Nullable`。 如果表达式对于所有的行都返回 `NULL` ,则该函数返回 0 。 +- 如果 [表达式](../../../sql-reference/syntax.md#syntax-expressions) 被传递,则该函数计数此表达式返回非null的次数。 如果表达式返回 [可为空](../../../sql-reference/data-types/nullable.md)类型的值,`count`的结果仍然不 `Nullable`。 如果表达式对于所有的行都返回 `NULL` ,则该函数返回 0 。 在这两种情况下,返回值的类型为 [UInt64](../../../sql-reference/data-types/int-uint.md)。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md index ed1b0806eb4..6d2d15fa584 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md @@ -19,9 +19,9 @@ groupArrayInsertAt(default_x, size)(x, pos); **参数** -- `x` — 要插入的值。生成所[支持的数据类型](../../../sql-reference/data-types/index.md)(数据)的[表达式](../../../syntax.md#syntax-expressions)。 +- `x` — 要插入的值。生成所[支持的数据类型](../../../sql-reference/data-types/index.md)(数据)的[表达式](../../../sql-reference/syntax.md#syntax-expressions)。 - `pos` — 指定元素 `x` 将被插入的位置。 数组中的索引编号从零开始。 [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges). 
-- `default_x`— 在空位置替换的默认值。可选参数。生成 `x` 数据类型 (数据) 的[表达式](../../../syntax.md#syntax-expressions)。 如果 `default_x` 未定义,则 [默认值](../../../sql-reference/statements/create.md#create-default-values) 被使用。 +- `default_x`— 在空位置替换的默认值。可选参数。生成 `x` 数据类型 (数据) 的[表达式](../../../sql-reference/syntax.md#syntax-expressions)。 如果 `default_x` 未定义,则 [默认值](../../../sql-reference/statements/create.md#create-default-values) 被使用。 - `size`— 结果数组的长度。可选参数。如果使用该参数,必须指定默认值 `default_x` 。 [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges)。 **返回值** diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparraysample.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparraysample.md index 9f6cae2ca32..f08199c7907 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/grouparraysample.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparraysample.md @@ -23,7 +23,7 @@ groupArraySample(max_size[, seed])(x) - 随机选取参数 `x` (的值)组成的数组。 -类型: [Array](../../../data-types/array.md). +类型: [Array](../../../sql-reference/data-types/array.md). **示例** From 1ba1cbc6a9ff14a18d86d52db94b9c16fa6140c9 Mon Sep 17 00:00:00 2001 From: benbiti Date: Fri, 5 Feb 2021 23:50:14 +0800 Subject: [PATCH 146/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference.md | 403 ------------------ .../reference/grouparraymovingsum.md | 2 +- .../reference/grouparraysample.md | 6 +- .../reference/groupbitand.md | 22 +- .../reference/groupbitmap.md | 20 +- .../reference/groupbitmapand.md | 14 +- .../reference/groupbitmapor.md | 14 +- .../reference/groupbitmapxor.md | 16 +- .../reference/groupbitor.md | 22 +- .../reference/groupbitxor.md | 22 +- .../aggregate-functions/reference/summap.md | 18 +- 11 files changed, 87 insertions(+), 472 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index 0c49dbce8a7..5f97d43b11d 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -5,287 +5,12 @@ toc_title: 参考手册 # 参考手册 {#aggregate-functions-reference} -## count {#agg_function-count} - -计数行数或非空值。 - -ClickHouse支持以下语法 `count`: -- `count(expr)` 或 `COUNT(DISTINCT expr)`. -- `count()` 或 `COUNT(*)`. 该 `count()` 语法是ClickHouse特定的。 - -**参数** - -该功能可以采取: - -- 零参数。 -- 一 [表达式](../syntax.md#syntax-expressions). - -**返回值** - -- 如果没有参数调用函数,它会计算行数。 -- 如果 [表达式](../syntax.md#syntax-expressions) 被传递,则该函数计数此表达式返回的次数非null。 如果表达式返回 [可为空](../../sql-reference/data-types/nullable.md)-键入值,然后结果 `count` 保持不 `Nullable`. 如果返回表达式,则该函数返回0 `NULL` 对于所有的行。 - -在这两种情况下,返回值的类型为 [UInt64](../../sql-reference/data-types/int-uint.md). 
- -**详细信息** - -ClickHouse支持 `COUNT(DISTINCT ...)` 语法 这种结构的行为取决于 [count_distinct_implementation](../../operations/settings/settings.md#settings-count_distinct_implementation) 设置。 它定义了其中的 [uniq\*](#agg_function-uniq) 函数用于执行操作。 默认值为 [uniqExact](#agg_function-uniqexact) 功能。 - -该 `SELECT count() FROM table` 查询未被优化,因为表中的条目数没有单独存储。 它从表中选择一个小列并计算其中的值数。 - -**例** - -示例1: - -``` sql -SELECT count() FROM t -``` - -``` text -┌─count()─┐ -│ 5 │ -└─────────┘ -``` - -示例2: - -``` sql -SELECT name, value FROM system.settings WHERE name = 'count_distinct_implementation' -``` - -``` text -┌─name──────────────────────────┬─value─────┐ -│ count_distinct_implementation │ uniqExact │ -└───────────────────────────────┴───────────┘ -``` - -``` sql -SELECT count(DISTINCT num) FROM t -``` - -``` text -┌─uniqExact(num)─┐ -│ 3 │ -└────────────────┘ -``` - - -## groupBitAnd {#groupbitand} - -按位应用 `AND` 对于一系列的数字。 - -``` sql -groupBitAnd(expr) -``` - -**参数** - -`expr` – An expression that results in `UInt*` 类型。 - -**返回值** - -的价值 `UInt*` 类型。 - -**示例** - -测试数据: - -``` text -binary decimal -00101100 = 44 -00011100 = 28 -00001101 = 13 -01010101 = 85 -``` - -查询: - -``` sql -SELECT groupBitAnd(num) FROM t -``` - -哪里 `num` 是包含测试数据的列。 - -结果: - -``` text -binary decimal -00000100 = 4 -``` - -## groupBitOr {#groupbitor} - -按位应用 `OR` 对于一系列的数字。 - -``` sql -groupBitOr(expr) -``` - -**参数** - -`expr` – An expression that results in `UInt*` 类型。 - -**返回值** - -的价值 `UInt*` 类型。 - -**示例** - -测试数据: - -``` text -binary decimal -00101100 = 44 -00011100 = 28 -00001101 = 13 -01010101 = 85 -``` - -查询: - -``` sql -SELECT groupBitOr(num) FROM t -``` - -哪里 `num` 是包含测试数据的列。 - -结果: - -``` text -binary decimal -01111101 = 125 -``` - -## groupBitXor {#groupbitxor} - -按位应用 `XOR` 对于一系列的数字。 - -``` sql -groupBitXor(expr) -``` - -**参数** - -`expr` – An expression that results in `UInt*` 类型。 - -**返回值** - -的价值 `UInt*` 类型。 - -**示例** - -测试数据: - -``` text -binary decimal -00101100 = 44 -00011100 = 28 -00001101 = 13 -01010101 = 85 -``` - -查询: - -``` sql -SELECT groupBitXor(num) FROM t -``` - -哪里 `num` 是包含测试数据的列。 - -结果: - -``` text -binary decimal -01101000 = 104 -``` - -## groupBitmap {#groupbitmap} - -从无符号整数列的位图或聚合计算,返回UInt64类型的基数,如果添加后缀状态,则返回 [位图对象](../../sql-reference/functions/bitmap-functions.md). 
- -``` sql -groupBitmap(expr) -``` - -**参数** - -`expr` – An expression that results in `UInt*` 类型。 - -**返回值** - -的价值 `UInt64` 类型。 - -**示例** - -测试数据: - -``` text -UserID -1 -1 -2 -3 -``` - -查询: - -``` sql -SELECT groupBitmap(UserID) as num FROM t -``` - -结果: - -``` text -num -3 -``` - - ## sumWithOverflow(x) {#sumwithoverflowx} 使用与输入参数相同的数据类型计算数字的总和。 如果总和超过此数据类型的最大值,则函数返回错误。 只适用于数字。 -## sumMap(key,value),sumMap(Tuple(key,value)) {#agg_functions-summap} - -总计 ‘value’ 数组根据在指定的键 ‘key’ 阵列。 -传递键和值数组的元组与传递两个键和值数组是同义的。 -元素的数量 ‘key’ 和 ‘value’ 总计的每一行必须相同。 -返回两个数组的一个二元组: key是排好序的,value是对应key的求和。 - -示例: - -``` sql -CREATE TABLE sum_map( - date Date, - timeslot DateTime, - statusMap Nested( - status UInt16, - requests UInt64 - ), - statusMapTuple Tuple(Array(Int32), Array(Int32)) -) ENGINE = Log; -INSERT INTO sum_map VALUES - ('2000-01-01', '2000-01-01 00:00:00', [1, 2, 3], [10, 10, 10], ([1, 2, 3], [10, 10, 10])), - ('2000-01-01', '2000-01-01 00:00:00', [3, 4, 5], [10, 10, 10], ([3, 4, 5], [10, 10, 10])), - ('2000-01-01', '2000-01-01 00:01:00', [4, 5, 6], [10, 10, 10], ([4, 5, 6], [10, 10, 10])), - ('2000-01-01', '2000-01-01 00:01:00', [6, 7, 8], [10, 10, 10], ([6, 7, 8], [10, 10, 10])); - -SELECT - timeslot, - sumMap(statusMap.status, statusMap.requests), - sumMap(statusMapTuple) -FROM sum_map -GROUP BY timeslot -``` - -``` text -┌────────────timeslot─┬─sumMap(statusMap.status, statusMap.requests)─┬─sumMap(statusMapTuple)─────────┐ -│ 2000-01-01 00:00:00 │ ([1,2,3,4,5],[10,10,20,10,10]) │ ([1,2,3,4,5],[10,10,20,10,10]) │ -│ 2000-01-01 00:01:00 │ ([4,5,6,7,8],[10,10,20,10,10]) │ ([4,5,6,7,8],[10,10,20,10,10]) │ -└─────────────────────┴──────────────────────────────────────────────┴────────────────────────────────┘ -``` ## skewPop {#skewpop} @@ -1372,133 +1097,5 @@ stochasticLogisticRegression(1.0, 1.0, 10, 'SGD') - [随机指标线上回归](#agg_functions-stochasticlinearregression) - [线性回归和逻辑回归之间的差异](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) -## groupBitmapAnd {#groupbitmapand} - -计算位图列的AND,返回UInt64类型的基数,如果添加后缀状态,则返回 [位图对象](../../sql-reference/functions/bitmap-functions.md). - -``` sql -groupBitmapAnd(expr) -``` - -**参数** - -`expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` 类型。 - -**返回值** - -的价值 `UInt64` 类型。 - -**示例** - -``` sql -DROP TABLE IF EXISTS bitmap_column_expr_test2; -CREATE TABLE bitmap_column_expr_test2 -( - tag_id String, - z AggregateFunction(groupBitmap, UInt32) -) -ENGINE = MergeTree -ORDER BY tag_id; - -INSERT INTO bitmap_column_expr_test2 VALUES ('tag1', bitmapBuild(cast([1,2,3,4,5,6,7,8,9,10] as Array(UInt32)))); -INSERT INTO bitmap_column_expr_test2 VALUES ('tag2', bitmapBuild(cast([6,7,8,9,10,11,12,13,14,15] as Array(UInt32)))); -INSERT INTO bitmap_column_expr_test2 VALUES ('tag3', bitmapBuild(cast([2,4,6,8,10,12] as Array(UInt32)))); - -SELECT groupBitmapAnd(z) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); -┌─groupBitmapAnd(z)─┐ -│ 3 │ -└───────────────────┘ - -SELECT arraySort(bitmapToArray(groupBitmapAndState(z))) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); -┌─arraySort(bitmapToArray(groupBitmapAndState(z)))─┐ -│ [6,8,10] │ -└──────────────────────────────────────────────────┘ -``` - -## groupBitmapOr {#groupbitmapor} - -计算位图列的OR,返回UInt64类型的基数,如果添加后缀状态,则返回 [位图对象](../../sql-reference/functions/bitmap-functions.md). 这相当于 `groupBitmapMerge`. 
- -``` sql -groupBitmapOr(expr) -``` - -**参数** - -`expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` 类型。 - -**返回值** - -的价值 `UInt64` 类型。 - -**示例** - -``` sql -DROP TABLE IF EXISTS bitmap_column_expr_test2; -CREATE TABLE bitmap_column_expr_test2 -( - tag_id String, - z AggregateFunction(groupBitmap, UInt32) -) -ENGINE = MergeTree -ORDER BY tag_id; - -INSERT INTO bitmap_column_expr_test2 VALUES ('tag1', bitmapBuild(cast([1,2,3,4,5,6,7,8,9,10] as Array(UInt32)))); -INSERT INTO bitmap_column_expr_test2 VALUES ('tag2', bitmapBuild(cast([6,7,8,9,10,11,12,13,14,15] as Array(UInt32)))); -INSERT INTO bitmap_column_expr_test2 VALUES ('tag3', bitmapBuild(cast([2,4,6,8,10,12] as Array(UInt32)))); - -SELECT groupBitmapOr(z) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); -┌─groupBitmapOr(z)─┐ -│ 15 │ -└──────────────────┘ - -SELECT arraySort(bitmapToArray(groupBitmapOrState(z))) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); -┌─arraySort(bitmapToArray(groupBitmapOrState(z)))─┐ -│ [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] │ -└─────────────────────────────────────────────────┘ -``` - -## groupBitmapXor {#groupbitmapxor} - -计算位图列的XOR,返回UInt64类型的基数,如果添加后缀状态,则返回 [位图对象](../../sql-reference/functions/bitmap-functions.md). - -``` sql -groupBitmapOr(expr) -``` - -**参数** - -`expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` 类型。 - -**返回值** - -的价值 `UInt64` 类型。 - -**示例** - -``` sql -DROP TABLE IF EXISTS bitmap_column_expr_test2; -CREATE TABLE bitmap_column_expr_test2 -( - tag_id String, - z AggregateFunction(groupBitmap, UInt32) -) -ENGINE = MergeTree -ORDER BY tag_id; - -INSERT INTO bitmap_column_expr_test2 VALUES ('tag1', bitmapBuild(cast([1,2,3,4,5,6,7,8,9,10] as Array(UInt32)))); -INSERT INTO bitmap_column_expr_test2 VALUES ('tag2', bitmapBuild(cast([6,7,8,9,10,11,12,13,14,15] as Array(UInt32)))); -INSERT INTO bitmap_column_expr_test2 VALUES ('tag3', bitmapBuild(cast([2,4,6,8,10,12] as Array(UInt32)))); - -SELECT groupBitmapXor(z) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); -┌─groupBitmapXor(z)─┐ -│ 10 │ -└───────────────────┘ - -SELECT arraySort(bitmapToArray(groupBitmapXorState(z))) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%'); -┌─arraySort(bitmapToArray(groupBitmapXorState(z)))─┐ -│ [1,3,5,6,8,10,11,13,14,15] │ -└──────────────────────────────────────────────────┘ -``` [原始文章](https://clickhouse.tech/docs/en/query_language/agg_functions/reference/) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingsum.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingsum.md index 2fc811a9de1..d58d848e7ac 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingsum.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparraymovingsum.md @@ -18,7 +18,7 @@ groupArrayMovingSum(window_size)(numbers_for_summing) **参数** -- `numbers_for_summing` — [表达式](../../../sql-reference/syntax.md#syntax-expressions) 生成数值数据类型值。。 +- `numbers_for_summing` — [表达式](../../../sql-reference/syntax.md#syntax-expressions) 生成数值数据类型值。 - `window_size` — 窗口大小。 **返回值** diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparraysample.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparraysample.md index f08199c7907..529b63a2316 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/grouparraysample.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparraysample.md @@ -15,9 +15,9 @@ groupArraySample(max_size[, seed])(x) 
**参数** -- `max_size` — 结果数组的最大长度. [UInt64](../../data-types/int-uint.md)。 -- `seed` — 随机数发生器的种子. 可选。 [UInt64](../../data-types/int-uint.md)。 默认值: `123456`。 -- `x` — 参数 (列名 或者 表达式). +- `max_size` — 结果数组的最大长度。[UInt64](../../data-types/int-uint.md)。 +- `seed` — 随机数发生器的种子。可选。[UInt64](../../data-types/int-uint.md)。默认值: `123456`。 +- `x` — 参数 (列名 或者 表达式)。 **返回值** diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitand.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitand.md index 9be73fd54ec..1a8520b0f08 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/groupbitand.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitand.md @@ -4,23 +4,25 @@ toc_priority: 125 # groupBitAnd {#groupbitand} -Applies bitwise `AND` for series of numbers. +对于数字序列按位应用 `AND` 。 + +**语法** ``` sql groupBitAnd(expr) ``` -**Parameters** +**参数** -`expr` – An expression that results in `UInt*` type. +`expr` – 结果为 `UInt*` 类型的表达式。 -**Return value** +**返回值** -Value of the `UInt*` type. +`UInt*` 类型的值。 -**Example** +**示例** -Test data: +测试数据: ``` text binary decimal @@ -30,15 +32,15 @@ binary decimal 01010101 = 85 ``` -Query: +查询: ``` sql SELECT groupBitAnd(num) FROM t ``` -Where `num` is the column with the test data. +`num` 是包含测试数据的列。 -Result: +结果: ``` text binary decimal diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitmap.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmap.md index 9367652db38..5e14c3a21ea 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/groupbitmap.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmap.md @@ -4,23 +4,25 @@ toc_priority: 128 # groupBitmap {#groupbitmap} -Bitmap or Aggregate calculations from a unsigned integer column, return cardinality of type UInt64, if add suffix -State, then return [bitmap object](../../../sql-reference/functions/bitmap-functions.md). +从无符号整数列进行位图或聚合计算,返回 `UInt64` 类型的基数,如果添加后缀 `State` ,则返回[位图对象](../../../sql-reference/functions/bitmap-functions.md)。 + +**语法** ``` sql groupBitmap(expr) ``` -**Parameters** +**参数** -`expr` – An expression that results in `UInt*` type. +`expr` – 结果为 `UInt*` 类型的表达式。 -**Return value** +**返回值** -Value of the `UInt64` type. +`UInt64` 类型的值。 -**Example** +**示例** -Test data: +测试数据: ``` text UserID @@ -30,13 +32,13 @@ UserID 3 ``` -Query: +查询: ``` sql SELECT groupBitmap(UserID) as num FROM t ``` -Result: +结果: ``` text num diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapand.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapand.md index 7c0c89040bb..bd5aa17c7ff 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapand.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapand.md @@ -4,21 +4,23 @@ toc_priority: 129 # groupBitmapAnd {#groupbitmapand} -Calculations the AND of a bitmap column, return cardinality of type UInt64, if add suffix -State, then return [bitmap object](../../../sql-reference/functions/bitmap-functions.md). +计算位图列的 `AND` ,返回 `UInt64` 类型的基数,如果添加后缀 `State` ,则返回 [位图对象](../../../sql-reference/functions/bitmap-functions.md)。 + +**语法** ``` sql groupBitmapAnd(expr) ``` -**Parameters** +**参数** -`expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` type. +`expr` – 结果为 `AggregateFunction(groupBitmap, UInt*)` 类型的表达式。 -**Return value** +**返回值** -Value of the `UInt64` type. 
+`UInt64` 类型的值。 -**Example** +**示例** ``` sql DROP TABLE IF EXISTS bitmap_column_expr_test2; diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapor.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapor.md index 894c6c90aab..52048083d17 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapor.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapor.md @@ -4,21 +4,23 @@ toc_priority: 130 # groupBitmapOr {#groupbitmapor} -Calculations the OR of a bitmap column, return cardinality of type UInt64, if add suffix -State, then return [bitmap object](../../../sql-reference/functions/bitmap-functions.md). This is equivalent to `groupBitmapMerge`. +计算位图列的 `OR` ,返回 `UInt64` 类型的基数,如果添加后缀 `State` ,则返回 [位图对象](../../../sql-reference/functions/bitmap-functions.md)。 + +**语法** ``` sql groupBitmapOr(expr) ``` -**Parameters** +**参数** -`expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` type. +`expr` – 结果为 `AggregateFunction(groupBitmap, UInt*)` 类型的表达式。 -**Return value** +**返回值** -Value of the `UInt64` type. +`UInt64` 类型的值。 -**Example** +**示例** ``` sql DROP TABLE IF EXISTS bitmap_column_expr_test2; diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapxor.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapxor.md index 5d0ec0fb097..d862e974418 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapxor.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitmapxor.md @@ -4,21 +4,23 @@ toc_priority: 131 # groupBitmapXor {#groupbitmapxor} -Calculations the XOR of a bitmap column, return cardinality of type UInt64, if add suffix -State, then return [bitmap object](../../../sql-reference/functions/bitmap-functions.md). +计算位图列的 `XOR` ,返回 `UInt64` 类型的基数,如果添加后缀 `State` ,则返回 [位图对象](../../../sql-reference/functions/bitmap-functions.md)。 + +**语法** ``` sql -groupBitmapOr(expr) +groupBitmapXor(expr) ``` -**Parameters** +**参数** -`expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` type. +`expr` – 结果为 `AggregateFunction(groupBitmap, UInt*)` 类型的表达式。 -**Return value** +**返回值** -Value of the `UInt64` type. +`UInt64` 类型的值。 -**Example** +**示例** ``` sql DROP TABLE IF EXISTS bitmap_column_expr_test2; diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitor.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitor.md index 7383e620060..175cc8d7286 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/groupbitor.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitor.md @@ -4,23 +4,25 @@ toc_priority: 126 # groupBitOr {#groupbitor} -Applies bitwise `OR` for series of numbers. +对于数字序列按位应用 `OR` 。 + +**语法** ``` sql groupBitOr(expr) ``` -**Parameters** +**参数** -`expr` – An expression that results in `UInt*` type. +`expr` – 结果为 `UInt*` 类型的表达式。 -**Return value** +**返回值** -Value of the `UInt*` type. +`UInt*` 类型的值。 -**Example** +**示例** -Test data: +测试数据:: ``` text binary decimal @@ -30,15 +32,15 @@ binary decimal 01010101 = 85 ``` -Query: +查询: ``` sql SELECT groupBitOr(num) FROM t ``` -Where `num` is the column with the test data. 
+`num` 是包含测试数据的列。 -Result: +结果: ``` text binary decimal diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupbitxor.md b/docs/zh/sql-reference/aggregate-functions/reference/groupbitxor.md index 01026012b91..26409f00032 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/groupbitxor.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupbitxor.md @@ -4,23 +4,25 @@ toc_priority: 127 # groupBitXor {#groupbitxor} -Applies bitwise `XOR` for series of numbers. +对于数字序列按位应用 `XOR` 。 + +**语法** ``` sql groupBitXor(expr) ``` -**Parameters** +**参数** -`expr` – An expression that results in `UInt*` type. +`expr` – 结果为 `UInt*` 类型的表达式。 -**Return value** +**返回值** -Value of the `UInt*` type. +`UInt*` 类型的值。 -**Example** +**示例** -Test data: +测试数据: ``` text binary decimal @@ -30,15 +32,15 @@ binary decimal 01010101 = 85 ``` -Query: +查询: ``` sql SELECT groupBitXor(num) FROM t ``` -Where `num` is the column with the test data. +`num` 是包含测试数据的列。 -Result: +结果: ``` text binary decimal diff --git a/docs/zh/sql-reference/aggregate-functions/reference/summap.md b/docs/zh/sql-reference/aggregate-functions/reference/summap.md index 4ccbc22de35..03e8e447125 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/summap.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/summap.md @@ -4,17 +4,21 @@ toc_priority: 141 # sumMap {#agg_functions-summap} -Syntax: `sumMap(key, value)` or `sumMap(Tuple(key, value))` +**语法** -Totals the `value` array according to the keys specified in the `key` array. +``` sql +`sumMap(key, value)` +或 +`sumMap(Tuple(key, value))` +``` -Passing tuple of keys and values arrays is a synonym to passing two arrays of keys and values. +根据 `key` 数组中指定的键对 `value` 数组进行求和。 -The number of elements in `key` and `value` must be the same for each row that is totaled. +传递键和值数组的元组与传递两个键和值数组是同义的。 +要总计的每一行的 `key` 和 `value` (数组)元素的数量必须相同。 +返回两个数组组成的一个元组: 排好序的 `key` 和对应 `key` 的 `value` 之和。 -Returns a tuple of two arrays: keys in sorted order, and values ​​summed for the corresponding keys. - -Example: +示例: ``` sql CREATE TABLE sum_map( From d9209f6c3255ec6c635c0c56a031d651aa42c11e Mon Sep 17 00:00:00 2001 From: benbiti Date: Sat, 6 Feb 2021 00:07:17 +0800 Subject: [PATCH 147/260] WIP update-aggregate-funcions-in-zh --- .../sql-reference/aggregate-functions/reference.md | 7 ------- .../reference/groupuniqarray.md | 14 ++++++++++---- .../aggregate-functions/reference/index.md | 6 +++--- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index 5f97d43b11d..c1ffec1fa52 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -252,13 +252,6 @@ uniqExact(x[, ...]) - [uniqHLL12](#agg_function-uniqhll12) -## groupUniqArray(x), groupUniqArray(max_size)(x) {#groupuniqarrayx-groupuniqarraymax-sizex} - -从不同的参数值创建一个数组。 内存消耗是一样的 `uniqExact` 功能。 - -第二个版本(`max_size` 参数)将结果数组的大小限制为 `max_size` 元素。 -例如, `groupUniqArray(1)(x)` 相当于 `[any(x)]`. 
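Editor's note: the new groupUniqArray page introduced by this patch ships without a usage example. A minimal hypothetical one is sketched below; the table `t_uniq` and its values are invented here for illustration and are not part of the original documentation.

``` sql
-- Hypothetical data with duplicates: 1, 1, 2, 3.
CREATE TABLE t_uniq (x UInt8) ENGINE = TinyLog;
INSERT INTO t_uniq VALUES (1), (1), (2), (3);

-- Unbounded form: one entry per distinct value (element order is not guaranteed).
SELECT groupUniqArray(x) FROM t_uniq;       -- e.g. [1, 2, 3]

-- Bounded form: at most max_size distinct values are kept.
SELECT groupUniqArray(2)(x) FROM t_uniq;    -- e.g. [1, 2]
```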
- ## quantile {#quantile} 计算数字序列的近似[分位数](https://en.wikipedia.org/wiki/Quantile)。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupuniqarray.md b/docs/zh/sql-reference/aggregate-functions/reference/groupuniqarray.md index 537212e5b94..d89e575cfa4 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/groupuniqarray.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupuniqarray.md @@ -4,9 +4,15 @@ toc_priority: 111 # groupUniqArray {#groupuniqarray} -Syntax: `groupUniqArray(x)` or `groupUniqArray(max_size)(x)` +**语法** -Creates an array from different argument values. Memory consumption is the same as for the [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md) function. +``` sql +`groupUniqArray(x)` +或 +`groupUniqArray(max_size)(x)` +``` -The second version (with the `max_size` parameter) limits the size of the resulting array to `max_size` elements. -For example, `groupUniqArray(1)(x)` is equivalent to `[any(x)]`. +从不同的参数值创建一个数组。 内存消耗和 [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md) 函数是一样的。 + +第二个版本(带有 `max_size` 参数)将结果数组的大小限制为 `max_size` 个元素。 +例如, `groupUniqArray(1)(x)` 相当于 `[any(x)]`. diff --git a/docs/zh/sql-reference/aggregate-functions/reference/index.md b/docs/zh/sql-reference/aggregate-functions/reference/index.md index b96fa887279..3598a3cc536 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/index.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/index.md @@ -4,9 +4,9 @@ toc_priority: 36 toc_hidden: true --- -# List of Aggregate Functions {#aggregate-functions-reference} +# 聚合函数列表 {#aggregate-functions-reference} -Standard aggregate functions: +标准聚合函数: - [count](../../../sql-reference/aggregate-functions/reference/count.md) - [min](../../../sql-reference/aggregate-functions/reference/min.md) @@ -21,7 +21,7 @@ Standard aggregate functions: - [covarPop](../../../sql-reference/aggregate-functions/reference/covarpop.md) - [covarSamp](../../../sql-reference/aggregate-functions/reference/covarsamp.md) -ClickHouse-specific aggregate functions: +ClickHouse 特有的聚合函数: - [anyHeavy](../../../sql-reference/aggregate-functions/reference/anyheavy.md) - [anyLast](../../../sql-reference/aggregate-functions/reference/anylast.md) From c9f824876f35b7a1d1426dd9873dd41a8f45b98c Mon Sep 17 00:00:00 2001 From: benbiti Date: Sat, 6 Feb 2021 23:38:03 +0800 Subject: [PATCH 148/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference.md | 45 ------------------- .../aggregate-functions/reference/kurtpop.md | 15 ++++--- .../aggregate-functions/reference/kurtsamp.md | 15 ++++--- 3 files changed, 16 insertions(+), 59 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index c1ffec1fa52..31f0dfe85c4 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -58,51 +58,6 @@ skewSamp(expr) SELECT skewSamp(value) FROM series_with_value_column ``` -## kurtPop {#kurtpop} - -计算 [峰度](https://en.wikipedia.org/wiki/Kurtosis) 的序列。 - -``` sql -kurtPop(expr) -``` - -**参数** - -`expr` — [表达式](../syntax.md#syntax-expressions) 返回一个数字。 - -**返回值** - -给定序列的峰度。 类型 — [Float64](../../sql-reference/data-types/float.md) - -**示例** - -``` sql -SELECT kurtPop(value) FROM series_with_value_column -``` - -## kurtSamp {#kurtsamp} - -计算 [峰度样本](https://en.wikipedia.org/wiki/Kurtosis) 的序列。 - -它表示随机变量峰度的无偏估计,如果传递的值形成其样本。 - -``` sql 
-kurtSamp(expr) -``` - -**参数** - -`expr` — [表达式](../syntax.md#syntax-expressions) 返回一个数字。 - -**返回值** - -给定序列的峰度。类型 — [Float64](../../sql-reference/data-types/float.md). 如果 `n <= 1` (`n` 是样本的大小),则该函数返回 `nan`. - -**示例** - -``` sql -SELECT kurtSamp(value) FROM series_with_value_column -``` ## uniq {#agg_function-uniq} diff --git a/docs/zh/sql-reference/aggregate-functions/reference/kurtpop.md b/docs/zh/sql-reference/aggregate-functions/reference/kurtpop.md index 65e7e31b9b4..7a954e43e3a 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/kurtpop.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/kurtpop.md @@ -4,22 +4,23 @@ toc_priority: 153 # kurtPop {#kurtpop} -Computes the [kurtosis](https://en.wikipedia.org/wiki/Kurtosis) of a sequence. +计算给定序列的 [峰度](https://en.wikipedia.org/wiki/Kurtosis)。 + +**语法** ``` sql kurtPop(expr) ``` -**Parameters** +**参数** -`expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) returning a number. +`expr` — 结果为数字的 [表达式](../../../sql-reference/syntax.md#syntax-expressions)。 -**Returned value** +**返回值** -The kurtosis of the given distribution. Type — [Float64](../../../sql-reference/data-types/float.md) +给定分布的峰度。 类型 — [Float64](../../../sql-reference/data-types/float.md) -**Example** +**示例** ``` sql SELECT kurtPop(value) FROM series_with_value_column -``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/kurtsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/kurtsamp.md index 224bbbdb9e7..348df805cf3 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/kurtsamp.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/kurtsamp.md @@ -4,23 +4,24 @@ toc_priority: 154 # kurtSamp {#kurtsamp} -Computes the [sample kurtosis](https://en.wikipedia.org/wiki/Kurtosis) of a sequence. +计算给定序列的 [峰度样本](https://en.wikipedia.org/wiki/Kurtosis)。 +它表示随机变量峰度的无偏估计,如果传递的值形成其样本。 -It represents an unbiased estimate of the kurtosis of a random variable if passed values form its sample. +**语法** ``` sql kurtSamp(expr) ``` -**Parameters** +**参数** -`expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) returning a number. +`expr` — 结果为数字的 [表达式](../../../sql-reference/syntax.md#syntax-expressions)。 -**Returned value** +**返回值** -The kurtosis of the given distribution. Type — [Float64](../../../sql-reference/data-types/float.md). If `n <= 1` (`n` is a size of the sample), then the function returns `nan`. 
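Editor's note: both kurtosis pages query a `series_with_value_column` table that is never defined in the docs. A self-contained sketch that builds its own sample in place could look like the following; the literal values are an assumption for illustration only.

``` sql
-- Sketch: compare the population and sample kurtosis estimators on a tiny in-line sample.
SELECT
    kurtPop(value)  AS kurt_pop,    -- population kurtosis
    kurtSamp(value) AS kurt_samp    -- unbiased sample estimate; returns nan when n <= 1
FROM
(
    SELECT arrayJoin([1.0, 2.0, 3.0, 4.0, 10.0]) AS value
);
```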
+给定序列的峰度。类型 — [Float64](../../../sql-reference/data-types/float.md)。 如果 `n <= 1` (`n` 是样本的大小),则该函数返回 `nan`。 -**Example** +**示例** ``` sql SELECT kurtSamp(value) FROM series_with_value_column From 1fe6b995fac82f3e355fc7154fe28a4387d90718 Mon Sep 17 00:00:00 2001 From: benbiti Date: Sun, 7 Feb 2021 22:37:31 +0800 Subject: [PATCH 149/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference.md | 84 ------------------- .../aggregate-functions/reference/varpop.md | 8 +- .../aggregate-functions/reference/varsamp.md | 11 +-- 3 files changed, 10 insertions(+), 93 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index 31f0dfe85c4..d3e5ec673ad 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -5,11 +5,6 @@ toc_title: 参考手册 # 参考手册 {#aggregate-functions-reference} -## sumWithOverflow(x) {#sumwithoverflowx} - -使用与输入参数相同的数据类型计算数字的总和。 如果总和超过此数据类型的最大值,则函数返回错误。 - -只适用于数字。 ## skewPop {#skewpop} @@ -207,68 +202,7 @@ uniqExact(x[, ...]) - [uniqHLL12](#agg_function-uniqhll12) -## quantile {#quantile} -计算数字序列的近似[分位数](https://en.wikipedia.org/wiki/Quantile)。 - -此功能适用 [水塘抽样(](https://en.wikipedia.org/wiki/Reservoir_sampling),使用储存器最大到8192和随机数发生器进行采样。 结果是非确定性的。 要获得精确的分位数,请使用 [quantileExact](#quantileexact) 功能。 - -当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[分位数](#quantiles)功能。 - -**语法** - -``` sql -quantile(level)(expr) -``` - -别名: `median`. - -**参数** - -- `level` — 分位数层次。可选参数。 从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 在 `level=0.5` 该函数计算 [中位数](https://en.wikipedia.org/wiki/Median). -- `expr` — 求职表达式,类型为:数值[数据类型](../../sql-reference/data-types/index.md#data_types),[日期](../../sql-reference/data-types/date.md)数据类型或[时间](../../sql-reference/data-types/datetime.md)数据类型。 - -**返回值** - -- 指定层次的近似分位数。 - -类型: - -- [Float64](../../sql-reference/data-types/float.md) 对于数字数据类型输入。 -- [日期](../../sql-reference/data-types/date.md) 如果输入值具有 `Date` 类型。 -- [日期时间](../../sql-reference/data-types/datetime.md) 如果输入值具有 `DateTime` 类型。 - -**示例** - -输入表: - -``` text -┌─val─┐ -│ 1 │ -│ 1 │ -│ 2 │ -│ 3 │ -└─────┘ -``` - -查询: - -``` sql -SELECT quantile(val) FROM t -``` - -结果: - -``` text -┌─quantile(val)─┐ -│ 1.5 │ -└───────────────┘ -``` - -**另请参阅** - -- [中位数](#median) -- [分位数](#quantiles) ## quantileDeterministic {#quantiledeterministic} @@ -770,25 +704,7 @@ SELECT medianDeterministic(val, 1) FROM t 所有分位数函数也有相应的函数: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`。这些函数一次计算所列层次的所有分位数,并返回结果值的数组。 -## varSamp(x) {#varsampx} -计算 `Σ((x - x̅)^2) / (n - 1)`,这里 `n` 是样本大小, `x̅`是`x`的平均值。 - -它表示随机变量的方差的无偏估计,如果传递的值形成其样本。 - -返回 `Float64`. 当 `n <= 1`,返回 `+∞`. - -!!! note "注" - 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `varSampStable` 功能。 它的工作速度较慢,但提供较低的计算错误。 - -## varPop(x) {#varpopx} - -计算 `Σ((x - x̅)^2) / n`,这里 `n` 是样本大小, `x̅`是`x`的平均值。 - -换句话说,计算一组数据的离差。 返回 `Float64`。 - -!!! 
note "注" - 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `varPopStable` 功能。 它的工作速度较慢,但提供较低的计算错误。 ## stddevSamp(x) {#stddevsampx} diff --git a/docs/zh/sql-reference/aggregate-functions/reference/varpop.md b/docs/zh/sql-reference/aggregate-functions/reference/varpop.md index c08dcfd9bfd..4dca8efde38 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/varpop.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/varpop.md @@ -4,9 +4,9 @@ toc_priority: 32 # varPop(x) {#varpopx} -Calculates the amount `Σ((x - x̅)^2) / n`, where `n` is the sample size and `x̅`is the average value of `x`. +计算 `Σ((x - x̅)^2) / n`,这里 `n` 是样本大小, `x̅` 是 `x` 的平均值。 -In other words, dispersion for a set of values. Returns `Float64`. +换句话说,计算一组数据的离差。 返回 `Float64`。 -!!! note "Note" - This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `varPopStable` function. It works slower but provides a lower computational error. +!!! note "注" +该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `varPopStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/varsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/varsamp.md index 78bc545a5d0..c83ee7e24d2 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/varsamp.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/varsamp.md @@ -4,11 +4,12 @@ toc_priority: 33 # varSamp {#varsamp} -Calculates the amount `Σ((x - x̅)^2) / (n - 1)`, where `n` is the sample size and `x̅`is the average value of `x`. +计算 `Σ((x - x̅)^2) / (n - 1)`,这里 `n` 是样本大小, `x̅`是`x`的平均值。 -It represents an unbiased estimate of the variance of a random variable if passed values form its sample. +它表示随机变量的方差的无偏估计,如果传递的值形成其样本。 -Returns `Float64`. When `n <= 1`, returns `+∞`. +返回 `Float64`。 当 `n <= 1`,返回 `+∞`。 + +!!! note "注" +该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `varSampStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 -!!! note "Note" - This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `varSampStable` function. It works slower but provides a lower computational error. 
From 024b70442f8dd36202af96b82a24703549bbdcec Mon Sep 17 00:00:00 2001 From: benbiti Date: Mon, 8 Feb 2021 15:18:46 +0800 Subject: [PATCH 150/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference.md | 37 ------------------- .../aggregate-functions/reference/uniq.md | 28 +++++++------- 2 files changed, 15 insertions(+), 50 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index d3e5ec673ad..5ef31027fca 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -54,43 +54,6 @@ SELECT skewSamp(value) FROM series_with_value_column ``` -## uniq {#agg_function-uniq} - -计算参数的不同值的近似数量。 - -``` sql -uniq(x[, ...]) -``` - -**参数** - -该函数采用可变数量的参数。 参数可以是 `Tuple`, `Array`, `Date`, `DateTime`, `String`,或数字类型。 - -**返回值** - -- A [UInt64](../../sql-reference/data-types/int-uint.md)-键入号码。 - -**实现细节** - -功能: - -- 计算聚合中所有参数的哈希值,然后在计算中使用它。 - -- 使用自适应采样算法。 对于计算状态,该函数使用最多65536个元素哈希值的样本。 - - 这个算法是非常精确的,并且对于CPU来说非常高效。如果查询包含一些这样的函数,那和其他聚合函数相比 `uniq` 将是几乎一样快。 - -- 确定性地提供结果(它不依赖于查询处理顺序)。 - -我们建议在几乎所有情况下使用此功能。 - -**另请参阅** - -- [uniqCombined](#agg_function-uniqcombined) -- [uniqCombined64](#agg_function-uniqcombined64) -- [uniqHLL12](#agg_function-uniqhll12) -- [uniqExact](#agg_function-uniqexact) - ## uniqCombined {#agg_function-uniqcombined} 计算不同参数值的近似数量。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/uniq.md b/docs/zh/sql-reference/aggregate-functions/reference/uniq.md index 81d1ec6761e..2cf020d052b 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/uniq.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/uniq.md @@ -4,35 +4,37 @@ toc_priority: 190 # uniq {#agg_function-uniq} -Calculates the approximate number of different values of the argument. +计算参数的不同值的近似数量。 + +**语法** ``` sql uniq(x[, ...]) ``` -**Parameters** +**参数** -The function takes a variable number of parameters. Parameters can be `Tuple`, `Array`, `Date`, `DateTime`, `String`, or numeric types. +该函数采用可变数量的参数。 参数可以是 `Tuple`, `Array`, `Date`, `DateTime`, `String`, 或数字类型。 -**Returned value** +**返回值** -- A [UInt64](../../../sql-reference/data-types/int-uint.md)-type number. +- [UInt64](../../../sql-reference/data-types/int-uint.md) 类型数值。 -**Implementation details** +**实现细节** -Function: +功能: -- Calculates a hash for all parameters in the aggregate, then uses it in calculations. +- 计算聚合中所有参数的哈希值,然后在计算中使用它。 -- Uses an adaptive sampling algorithm. For the calculation state, the function uses a sample of element hash values up to 65536. +- 使用自适应采样算法。 对于计算状态,该函数使用最多65536个元素哈希值的样本。 - This algorithm is very accurate and very efficient on the CPU. When the query contains several of these functions, using `uniq` is almost as fast as using other aggregate functions. + 这个算法是非常精确的,并且对于CPU来说非常高效。如果查询包含一些这样的函数,那和其他聚合函数相比 `uniq` 将是几乎一样快。 -- Provides the result deterministically (it doesn’t depend on the query processing order). +- 确定性地提供结果(它不依赖于查询处理顺序)。 -We recommend using this function in almost all scenarios. 
+我们建议在几乎所有情况下使用此功能。 -**See Also** +**参见** - [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined) - [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64) From 6864c477b664c3a45255dc62f676a475861da9e6 Mon Sep 17 00:00:00 2001 From: benbiti Date: Tue, 9 Feb 2021 20:47:52 +0800 Subject: [PATCH 151/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference/maxmap.md | 17 +++++++++++------ .../aggregate-functions/reference/summap.md | 2 +- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/maxmap.md b/docs/zh/sql-reference/aggregate-functions/reference/maxmap.md index c62502cf46e..89e6fcd7ac3 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/maxmap.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/maxmap.md @@ -4,17 +4,22 @@ toc_priority: 143 # maxMap {#agg_functions-maxmap} -Syntax: `maxMap(key, value)` or `maxMap(Tuple(key, value))` +**语法** -Calculates the maximum from `value` array according to the keys specified in the `key` array. +```sql +`maxMap(key, value)` + 或 +`maxMap(Tuple(key, value))` +``` -Passing a tuple of keys and value arrays is identical to passing two arrays of keys and values. -The number of elements in `key` and `value` must be the same for each row that is totaled. +根据 `key` 数组中指定的键对 `value` 数组计算最大值。 -Returns a tuple of two arrays: keys and values calculated for the corresponding keys. +传递 `key` 和 `value` 数组的元组与传递 `key` 和 `value` 的两个数组是同义的。 +要总计的每一行的 `key` 和 `value` (数组)元素的数量必须相同。 +返回两个数组组成的元组: `key` 和对应 `key` 的 `value` 计算值(最大值)。 -Example: +示例: ``` sql SELECT maxMap(a, b) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/summap.md b/docs/zh/sql-reference/aggregate-functions/reference/summap.md index 03e8e447125..2eeb5be65e9 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/summap.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/summap.md @@ -14,7 +14,7 @@ toc_priority: 141 根据 `key` 数组中指定的键对 `value` 数组进行求和。 -传递键和值数组的元组与传递两个键和值数组是同义的。 +传递 `key` 和 `value` 数组的元组与传递 `key` 和 `value` 的两个数组是同义的。 要总计的每一行的 `key` 和 `value` (数组)元素的数量必须相同。 返回两个数组组成的一个元组: 排好序的 `key` 和对应 `key` 的 `value` 之和。 From f71e2691c1ef07c3b74afd9e5221c5a7d3ed3ded Mon Sep 17 00:00:00 2001 From: benbiti Date: Mon, 22 Feb 2021 22:08:23 +0800 Subject: [PATCH 152/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference.md | 42 ------------------- .../aggregate-functions/reference/maxmap.md | 2 +- .../aggregate-functions/reference/median.md | 28 ++++++------- .../aggregate-functions/reference/minmap.md | 18 ++++---- 4 files changed, 26 insertions(+), 64 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index 5ef31027fca..66e8554297a 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -621,48 +621,6 @@ SELECT quantileTDigestWeighted(number, 1) FROM numbers(10) - [中位数](#median) - [分位数](#quantiles) -## median {#median} - -`median*` 函数是 `quantile*` 函数的别名。 它们计算数字数据样本的中位数。 - -函数: - -- `median` — [quantile](#quantile)别名。 -- `medianDeterministic` — [quantileDeterministic](#quantiledeterministic)别名。 -- `medianExact` — [quantileExact](#quantileexact)别名。 -- `medianExactWeighted` — [quantileExactWeighted](#quantileexactweighted)别名。 -- `medianTiming` — 
[quantileTiming](#quantiletiming)别名。 -- `medianTimingWeighted` — [quantileTimingWeighted](#quantiletimingweighted)别名。 -- `medianTDigest` — [quantileTDigest](#quantiletdigest)别名。 -- `medianTDigestWeighted` — [quantileTDigestWeighted](#quantiletdigestweighted)别名。 - -**示例** - -输入表: - -``` text -┌─val─┐ -│ 1 │ -│ 1 │ -│ 2 │ -│ 3 │ -└─────┘ -``` - -查询: - -``` sql -SELECT medianDeterministic(val, 1) FROM t -``` - -结果: - -``` text -┌─medianDeterministic(val, 1)─┐ -│ 1.5 │ -└─────────────────────────────┘ -``` - ## quantiles(level1, level2, …)(x) {#quantiles} 所有分位数函数也有相应的函数: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`。这些函数一次计算所列层次的所有分位数,并返回结果值的数组。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/maxmap.md b/docs/zh/sql-reference/aggregate-functions/reference/maxmap.md index 89e6fcd7ac3..86352792dd7 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/maxmap.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/maxmap.md @@ -17,7 +17,7 @@ toc_priority: 143 传递 `key` 和 `value` 数组的元组与传递 `key` 和 `value` 的两个数组是同义的。 要总计的每一行的 `key` 和 `value` (数组)元素的数量必须相同。 -返回两个数组组成的元组: `key` 和对应 `key` 的 `value` 计算值(最大值)。 +返回两个数组组成的元组: 排好序的`key` 和对应 `key` 的 `value` 计算值(最大值)。 示例: diff --git a/docs/zh/sql-reference/aggregate-functions/reference/median.md b/docs/zh/sql-reference/aggregate-functions/reference/median.md index b4f38a9b562..83879f6cb34 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/median.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/median.md @@ -1,21 +1,21 @@ # median {#median} -The `median*` functions are the aliases for the corresponding `quantile*` functions. They calculate median of a numeric data sample. +`median*` 函数是 `quantile*` 函数的别名。它们计算数字数据样本的中位数。 -Functions: +函数: -- `median` — Alias for [quantile](#quantile). -- `medianDeterministic` — Alias for [quantileDeterministic](#quantiledeterministic). -- `medianExact` — Alias for [quantileExact](#quantileexact). -- `medianExactWeighted` — Alias for [quantileExactWeighted](#quantileexactweighted). -- `medianTiming` — Alias for [quantileTiming](#quantiletiming). -- `medianTimingWeighted` — Alias for [quantileTimingWeighted](#quantiletimingweighted). -- `medianTDigest` — Alias for [quantileTDigest](#quantiletdigest). -- `medianTDigestWeighted` — Alias for [quantileTDigestWeighted](#quantiletdigestweighted). 
+- `median` — [quantile](#quantile)别名。 +- `medianDeterministic` — [quantileDeterministic](#quantiledeterministic)别名。 +- `medianExact` — [quantileExact](#quantileexact)别名。 +- `medianExactWeighted` — [quantileExactWeighted](#quantileexactweighted)别名。 +- `medianTiming` — [quantileTiming](#quantiletiming)别名。 +- `medianTimingWeighted` — [quantileTimingWeighted](#quantiletimingweighted)别名。 +- `medianTDigest` — [quantileTDigest](#quantiletdigest)别名。 +- `medianTDigestWeighted` — [quantileTDigestWeighted](#quantiletdigestweighted)别名。 -**Example** +**示例** -Input table: +输入表: ``` text ┌─val─┐ @@ -26,13 +26,13 @@ Input table: └─────┘ ``` -Query: +查询: ``` sql SELECT medianDeterministic(val, 1) FROM t ``` -Result: +结果: ``` text ┌─medianDeterministic(val, 1)─┐ diff --git a/docs/zh/sql-reference/aggregate-functions/reference/minmap.md b/docs/zh/sql-reference/aggregate-functions/reference/minmap.md index 9408d0ddfff..f5ee0557c16 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/minmap.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/minmap.md @@ -4,17 +4,21 @@ toc_priority: 142 # minMap {#agg_functions-minmap} -Syntax: `minMap(key, value)` or `minMap(Tuple(key, value))` +语法: -Calculates the minimum from `value` array according to the keys specified in the `key` array. +```sql +`minMap(key, value)` +或 +`minMap(Tuple(key, value))` +``` -Passing a tuple of keys and value ​​arrays is identical to passing two arrays of keys and values. +根据 `key` 数组中指定的键对 `value` 数组计算最小值。 -The number of elements in `key` and `value` must be the same for each row that is totaled. +传递 `key` 和 `value` 数组的元组与传递 `key` 和 `value` 的两个数组是同义的。 +要总计的每一行的 `key` 和 `value` (数组)元素的数量必须相同。 +返回两个数组组成的元组: 排好序的 `key` 和对应 `key` 的 `value` 计算值(最小值)。 -Returns a tuple of two arrays: keys in sorted order, and values calculated for the corresponding keys. - -Example: +示例: ``` sql SELECT minMap(a, b) From f718a4abbf9d3c1e159013c503a66f42b3a69f49 Mon Sep 17 00:00:00 2001 From: benbiti Date: Tue, 23 Feb 2021 23:43:43 +0800 Subject: [PATCH 153/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference/quantile.md | 39 +++++++++---------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantile.md b/docs/zh/sql-reference/aggregate-functions/reference/quantile.md index 77f858a1735..57369130f9b 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantile.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantile.md @@ -4,38 +4,37 @@ toc_priority: 200 # quantile {#quantile} -Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. +计算数字序列的近似[分位数](https://en.wikipedia.org/wiki/Quantile)。 +此函数应用[水塘抽样][reservoir sampling] (https://en.wikipedia.org/wiki/Reservoir_sampling),使用高达8192的水塘大小和随机数发生器采样。 +结果是不确定的。要获得精确的分位数,使用 [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexact) 函数。 +当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用 [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) 函数。 -This function applies [reservoir sampling](https://en.wikipedia.org/wiki/Reservoir_sampling) with a reservoir size up to 8192 and a random number generator for sampling. The result is non-deterministic. To get an exact quantile, use the [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexact) function. 
- -When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. - -**Syntax** +**语法** ``` sql quantile(level)(expr) ``` -Alias: `median`. +别名: `median`。 -**Parameters** +**参数** - `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). -- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). +- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md). -**Returned value** +**返回值** -- Approximate quantile of the specified level. +- 指定层次的分位数。 -Type: +类型: -- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. -- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. -- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. +- [Float64](../../../sql-reference/data-types/float.md) 用于数字数据类型输入。 +- [Date](../../../sql-reference/data-types/date.md) 如果输入值是 `Date` 类型。 +- [DateTime](../../../sql-reference/data-types/datetime.md) 如果输入值是 `DateTime` 类型。 -**Example** +**示例** -Input table: +输入表: ``` text ┌─val─┐ @@ -46,13 +45,13 @@ Input table: └─────┘ ``` -Query: +查询: ``` sql SELECT quantile(val) FROM t ``` -Result: +结果: ``` text ┌─quantile(val)─┐ @@ -60,7 +59,7 @@ Result: └───────────────┘ ``` -**See Also** +**参见** - [median](../../../sql-reference/aggregate-functions/reference/median.md#median) - [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) From 7a26c653d41ddd07476446339229570ea0c80760 Mon Sep 17 00:00:00 2001 From: benbiti Date: Wed, 24 Feb 2021 22:11:38 +0800 Subject: [PATCH 154/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference.md | 66 ------------------- .../aggregate-functions/reference/quantile.md | 8 +-- .../reference/quantiledeterministic.md | 45 +++++++------ 3 files changed, 26 insertions(+), 93 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index 66e8554297a..6e250e731fb 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -165,72 +165,6 @@ uniqExact(x[, ...]) - [uniqHLL12](#agg_function-uniqhll12) - - -## quantileDeterministic {#quantiledeterministic} - -计算数字序列的近似[分位数](https://en.wikipedia.org/wiki/Quantile)。 - -此功能适用 [水塘抽样(](https://en.wikipedia.org/wiki/Reservoir_sampling),使用储存器最大到8192和随机数发生器进行采样。 结果是非确定性的。 要获得精确的分位数,请使用 [quantileExact](#quantileexact) 功能。 - -当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[分位数](#quantiles)功能。 - -**语法** - -``` sql -quantileDeterministic(level)(expr, determinator) -``` - -别名: `medianDeterministic`. - -**参数** - -- `level` — 分位数层次。可选参数。 从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 
默认值:0.5。 在 `level=0.5` 该函数计算 [中位数](https://en.wikipedia.org/wiki/Median). -- `expr` — 求职表达式,类型为:数值[数据类型](../../sql-reference/data-types/index.md#data_types),[日期](../../sql-reference/data-types/date.md)数据类型或[时间](../../sql-reference/data-types/datetime.md)数据类型。 -- `determinator` — 一个数字,其hash被用来代替在水塘抽样中随机生成的数字,这样可以保证取样的确定性。你可以使用用户ID或者事件ID等任何正数,但是如果相同的 `determinator` 出现多次,那结果很可能不正确。 - -**返回值** - -- 指定层次的近似分位数。 - -类型: - -- [Float64](../../sql-reference/data-types/float.md) 对于数字数据类型输入。 -- [日期](../../sql-reference/data-types/date.md) 如果输入值具有 `Date` 类型。 -- [日期时间](../../sql-reference/data-types/datetime.md) 如果输入值具有 `DateTime` 类型。 - -**示例** - -输入表: - -``` text -┌─val─┐ -│ 1 │ -│ 1 │ -│ 2 │ -│ 3 │ -└─────┘ -``` - -查询: - -``` sql -SELECT quantileDeterministic(val, 1) FROM t -``` - -结果: - -``` text -┌─quantileDeterministic(val, 1)─┐ -│ 1.5 │ -└───────────────────────────────┘ -``` - -**另请参阅** - -- [中位数](#median) -- [分位数](#quantiles) - ## quantileExact {#quantileexact} 准确计算数字序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantile.md b/docs/zh/sql-reference/aggregate-functions/reference/quantile.md index 57369130f9b..c51386c7533 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantile.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantile.md @@ -19,8 +19,8 @@ quantile(level)(expr) **参数** -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). -- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md). +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 当 `level=0.5`时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `expr` — — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 **返回值** @@ -61,5 +61,5 @@ SELECT quantile(val) FROM t **参见** -- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) -- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) +- [中位数](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [分位数](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiledeterministic.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiledeterministic.md index 6046447dd10..8e327472864 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantiledeterministic.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiledeterministic.md @@ -4,39 +4,38 @@ toc_priority: 206 # quantileDeterministic {#quantiledeterministic} -Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. +计算数字序列的近似[分位数](https://en.wikipedia.org/wiki/Quantile)。 -This function applies [reservoir sampling](https://en.wikipedia.org/wiki/Reservoir_sampling) with a reservoir size up to 8192 and deterministic algorithm of sampling. The result is deterministic. 
To get an exact quantile, use the [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexact) function.
+此函数使用 [水塘抽样](https://en.wikipedia.org/wiki/Reservoir_sampling),储存器大小最大为8192,并采用确定性的采样算法。结果是确定性的。 要获得精确的分位数,请使用 [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexact) 函数。

-When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function.
+当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用 [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) 函数。

-**Syntax**
+**语法**

``` sql
quantileDeterministic(level)(expr, determinator)
```

-Alias: `medianDeterministic`.
+别名: `medianDeterministic`。

-**Parameters**
+**参数**

-- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median).
-- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md).
-- `determinator` — Number whose hash is used instead of a random number generator in the reservoir sampling algorithm to make the result of sampling deterministic. As a determinator you can use any deterministic positive number, for example, a user id or an event id. If the same determinator value occures too often, the function works incorrectly.
+- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 当 `level=0.5`时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。
+- `expr` — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。
+- `determinator` — 一个数字,其hash被用来代替在水塘抽样中随机生成的数字,这样可以保证取样的确定性。你可以使用用户ID或者事件ID等任何正数,但是如果相同的 `determinator` 出现多次,那结果很可能不正确。
+**返回值**

-**Returned value**

-- Approximate quantile of the specified level.
+- 指定层次的近似分位数。

-Type:
+类型:

-- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input.
-- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type.
-- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type.
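Since the `determinator` argument is what sets this function apart from plain `quantile`, a short usage sketch may help; the table and column names below (`visits`, `duration`, `user_id`) are assumptions for illustration only:

``` sql
-- Sketch under assumed names: `visits`, `duration` and `user_id` are hypothetical.
-- The hash of `user_id` replaces the random number generator in reservoir sampling,
-- so repeated runs of the same query return the same approximate 0.9-quantile.
SELECT quantileDeterministic(0.9)(duration, user_id) AS p90_duration
FROM visits
```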
+**示例** -**Example** - -Input table: +输入表: ``` text ┌─val─┐ @@ -47,13 +46,13 @@ Input table: └─────┘ ``` -Query: +查询: ``` sql SELECT quantileDeterministic(val, 1) FROM t ``` -Result: +结果: ``` text ┌─quantileDeterministic(val, 1)─┐ @@ -61,7 +60,7 @@ Result: └───────────────────────────────┘ ``` -**See Also** +**参见** -- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) -- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) +- [中位数](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [分位数](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) From 014b297e2e267e011d935a0b3cf398a252b2d061 Mon Sep 17 00:00:00 2001 From: benbiti Date: Thu, 25 Feb 2021 22:21:42 +0800 Subject: [PATCH 155/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference.md | 39 ------------------- .../reference/simplelinearregression.md | 16 ++++---- 2 files changed, 9 insertions(+), 46 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index 6e250e731fb..294847ea207 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -653,45 +653,6 @@ SELECT topKWeighted(10)(number, number) FROM numbers(1000) └───────────────────────────────────────────┘ ``` -## simpleLinearRegression {#simplelinearregression} - -执行简单(一维)线性回归。 - -``` sql -simpleLinearRegression(x, y) -``` - -参数: - -- `x` — x轴。 -- `y` — y轴。 - -返回值: - -符合`y = a*x + b`的常量 `(a, b)` 。 - -**例** - -``` sql -SELECT arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [0, 1, 2, 3]) -``` - -``` text -┌─arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [0, 1, 2, 3])─┐ -│ (1,0) │ -└───────────────────────────────────────────────────────────────────┘ -``` - -``` sql -SELECT arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [3, 4, 5, 6]) -``` - -``` text -┌─arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [3, 4, 5, 6])─┐ -│ (1,3) │ -└───────────────────────────────────────────────────────────────────┘ -``` - ## stochasticLinearRegression {#agg_functions-stochasticlinearregression} 该函数实现随机线性回归。 它支持自定义参数的学习率、L2正则化系数、微批,并且具有少量更新权重的方法([Adam](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam) (默认), [simple SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent), [Momentum](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum), [Nesterov](https://mipt.ru/upload/medialibrary/d7e/41-91.pdf))。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/simplelinearregression.md b/docs/zh/sql-reference/aggregate-functions/reference/simplelinearregression.md index fee71cdeb49..56cb1539fc9 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/simplelinearregression.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/simplelinearregression.md @@ -4,22 +4,24 @@ toc_priority: 220 # simpleLinearRegression {#simplelinearregression} -Performs simple (unidimensional) linear regression. +执行简单(一维)线性回归。 + +**语法** ``` sql simpleLinearRegression(x, y) ``` -Parameters: +**参数** -- `x` — Column with dependent variable values. -- `y` — Column with explanatory variable values. +- `x` — x轴。 +- `y` — y轴。 -Returned values: +**返回值** -Constants `(a, b)` of the resulting line `y = a*x + b`. 
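Besides the `arrayReduce` examples that follow, the same aggregate can be applied directly to two numeric columns. A minimal sketch, assuming a hypothetical `points` table with `Float64` columns `x` and `y`:

``` sql
-- Sketch under assumed names: `points`, `x` and `y` are hypothetical.
-- The result is a tuple (a, b) describing the fitted line y = a*x + b.
SELECT simpleLinearRegression(x, y) AS line_coefficients
FROM points
```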
+符合`y = a*x + b`的常量 `(a, b)` 。 -**Examples** +**示例** ``` sql SELECT arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [0, 1, 2, 3]) From 111c82cbe91e7c4e1bd2fc42bf7037c4d0f75a35 Mon Sep 17 00:00:00 2001 From: benbiti Date: Fri, 26 Feb 2021 21:01:37 +0800 Subject: [PATCH 156/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference/sumwithoverflow.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/sumwithoverflow.md b/docs/zh/sql-reference/aggregate-functions/reference/sumwithoverflow.md index 1b39e9d0eb1..0fd5af519da 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/sumwithoverflow.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/sumwithoverflow.md @@ -4,6 +4,6 @@ toc_priority: 140 # sumWithOverflow {#sumwithoverflowx} -Computes the sum of the numbers, using the same data type for the result as for the input parameters. If the sum exceeds the maximum value for this data type, it is calculated with overflow. +使用与输入参数相同的数据类型计算结果的数字总和。如果总和超过此数据类型的最大值,则使用溢出进行计算。 -Only works for numbers. +只适用于数字。 From 0f40a99a128981cc1345365b5aded24735d471eb Mon Sep 17 00:00:00 2001 From: benbiti Date: Sat, 27 Feb 2021 20:20:53 +0800 Subject: [PATCH 157/260] WIP update-aggregate-funcions-in-zh --- .../zh/sql-reference/aggregate-functions/reference.md | 11 ----------- .../aggregate-functions/reference/stddevpop.md | 6 +++--- .../aggregate-functions/reference/stddevsamp.md | 6 +++--- 3 files changed, 6 insertions(+), 17 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index 294847ea207..256ad9f34f9 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -561,20 +561,9 @@ SELECT quantileTDigestWeighted(number, 1) FROM numbers(10) -## stddevSamp(x) {#stddevsampx} - -结果等于平方根 `varSamp(x)`。 - -!!! note "注" - 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `stddevSampStable` 功能。 它的工作速度较慢,但提供较低的计算错误。 ## stddevPop(x) {#stddevpopx} -结果等于平方根 `varPop(x)`。 - -!!! note "注" - 该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `stddevPopStable` 功能。 它的工作速度较慢,但提供较低的计算错误。 - ## topK(N)(x) {#topknx} 返回指定列中近似最常见值的数组。 生成的数组按值的近似频率降序排序(而不是值本身)。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/stddevpop.md b/docs/zh/sql-reference/aggregate-functions/reference/stddevpop.md index 58f8c27cd72..378ef4ae7e4 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/stddevpop.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/stddevpop.md @@ -4,7 +4,7 @@ toc_priority: 30 # stddevPop {#stddevpop} -The result is equal to the square root of [varPop](../../../sql-reference/aggregate-functions/reference/varpop.md). +结果等于 [varPop] (../../../sql-reference/aggregate-functions/reference/varpop.md)的平方根。 -!!! note "Note" - This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `stddevPopStable` function. It works slower but provides a lower computational error. +!!! 
note "注" +该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `stddevPopStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/stddevsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/stddevsamp.md index 4ec72881ae5..68a348146a9 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/stddevsamp.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/stddevsamp.md @@ -4,7 +4,7 @@ toc_priority: 31 # stddevSamp {#stddevsamp} -The result is equal to the square root of [varSamp](../../../sql-reference/aggregate-functions/reference/varsamp.md). +结果等于 [varSamp] (../../../sql-reference/aggregate-functions/reference/varsamp.md)的平方根。 -!!! note "Note" - This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `stddevSampStable` function. It works slower but provides a lower computational error. +!!! note "注" +该函数使用数值不稳定的算法。 如果你需要 [数值稳定性](https://en.wikipedia.org/wiki/Numerical_stability) 在计算中,使用 `stddevSampStable` 函数。 它的工作速度较慢,但提供较低的计算错误。 From 5295f19ac840fdce0b08640948be352ae8b61e7d Mon Sep 17 00:00:00 2001 From: benbiti Date: Mon, 1 Mar 2021 23:16:07 +0800 Subject: [PATCH 158/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference.md | 43 ------------------- .../aggregate-functions/reference/topk.md | 25 +++++------ 2 files changed, 13 insertions(+), 55 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index 256ad9f34f9..6c6628a112c 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -560,49 +560,6 @@ SELECT quantileTDigestWeighted(number, 1) FROM numbers(10) 所有分位数函数也有相应的函数: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`。这些函数一次计算所列层次的所有分位数,并返回结果值的数组。 - - -## stddevPop(x) {#stddevpopx} - -## topK(N)(x) {#topknx} - -返回指定列中近似最常见值的数组。 生成的数组按值的近似频率降序排序(而不是值本身)。 - -实现了[过滤节省空间](http://www.l2f.inesc-id.pt/~fmmb/wiki/uploads/Work/misnis.ref0a.pdf)算法, 使用基于reduce-and-combine的算法,借鉴[并行节省空间](https://arxiv.org/pdf/1401.0702.pdf). - -``` sql -topK(N)(column) -``` - -此函数不提供保证的结果。 在某些情况下,可能会发生错误,并且可能会返回不是最高频的值。 - -我们建议使用 `N < 10` 值,`N` 值越大,性能越低。最大值 `N = 65536`。 - -**参数** - -- ‘N’ 是要返回的元素数。 - -如果省略该参数,则使用默认值10。 - -**参数** - -- ' x ' – 计算的频率值。 - -**示例** - -就拿 [OnTime](../../getting-started/example-datasets/ontime.md) 数据集来说,选择`AirlineID` 列中出现最频繁的三个。 - -``` sql -SELECT topK(3)(AirlineID) AS res -FROM ontime -``` - -``` text -┌─res─────────────────┐ -│ [19393,19790,19805] │ -└─────────────────────┘ -``` - ## topKWeighted {#topkweighted} 类似于 `topK` 但需要一个整数类型的附加参数 - `weight`. 每个输入都被记入 `weight` 次频率计算。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/topk.md b/docs/zh/sql-reference/aggregate-functions/reference/topk.md index 004a67d33af..b2595ed9778 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/topk.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/topk.md @@ -4,31 +4,32 @@ toc_priority: 108 # topK {#topk} -Returns an array of the approximately most frequent values in the specified column. The resulting array is sorted in descending order of approximate frequency of values (not by the values themselves). 
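In addition to the OnTime-based example further down, the approximation can be tried on a tiny in-line sample built with `arrayJoin`; a self-contained sketch:

``` sql
-- 1 occurs three times, 2 twice and 3 once,
-- so the two approximately most frequent values are [1, 2].
SELECT topK(2)(x) AS most_frequent
FROM (SELECT arrayJoin([1, 1, 1, 2, 2, 3]) AS x)
```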
+返回指定列中近似最常见值的数组。 生成的数组按值的近似频率降序排序(而不是值本身)。 -Implements the [Filtered Space-Saving](http://www.l2f.inesc-id.pt/~fmmb/wiki/uploads/Work/misnis.ref0a.pdf) algorithm for analyzing TopK, based on the reduce-and-combine algorithm from [Parallel Space Saving](https://arxiv.org/pdf/1401.0702.pdf). +实现了[过滤节省空间](http://www.l2f.inesc-id.pt/~fmmb/wiki/uploads/Work/misnis.ref0a.pdf)算法, 使用基于reduce-and-combine的算法,借鉴[并行节省空间](https://arxiv.org/pdf/1401.0702.pdf)。 + +**语法** ``` sql topK(N)(column) ``` +此函数不提供保证的结果。 在某些情况下,可能会发生错误,并且可能会返回不是最高频的值。 -This function doesn’t provide a guaranteed result. In certain situations, errors might occur and it might return frequent values that aren’t the most frequent values. +我们建议使用 `N < 10` 值,`N` 值越大,性能越低。最大值 `N = 65536`。 -We recommend using the `N < 10` value; performance is reduced with large `N` values. Maximum value of `N = 65536`. +**参数** -**Parameters** +- ‘N’ 是要返回的元素数。 -- ‘N’ is the number of elements to return. +如果省略该参数,则使用默认值10。 -If the parameter is omitted, default value 10 is used. +**参数** -**Arguments** +- ’ x ’ – 计算的频率值。 -- ’ x ’ – The value to calculate frequency. +**示例** -**Example** - -Take the [OnTime](../../../getting-started/example-datasets/ontime.md) data set and select the three most frequently occurring values in the `AirlineID` column. +就拿 [OnTime](../../../getting-started/example-datasets/ontime.md) 数据集来说,选择`AirlineID` 列中出现最频繁的三个。 ``` sql SELECT topK(3)(AirlineID) AS res From 072ba26fc2d49371f764e217d0e9c6191d7fcd3c Mon Sep 17 00:00:00 2001 From: benbiti Date: Tue, 2 Mar 2021 19:33:37 +0800 Subject: [PATCH 159/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference.md | 39 ------------------- .../reference/topkweighted.md | 24 ++++++------ 2 files changed, 12 insertions(+), 51 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index 6c6628a112c..13476fbec35 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -560,45 +560,6 @@ SELECT quantileTDigestWeighted(number, 1) FROM numbers(10) 所有分位数函数也有相应的函数: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`。这些函数一次计算所列层次的所有分位数,并返回结果值的数组。 -## topKWeighted {#topkweighted} - -类似于 `topK` 但需要一个整数类型的附加参数 - `weight`. 
每个输入都被记入 `weight` 次频率计算。 - -**语法** - -``` sql -topKWeighted(N)(x, weight) -``` - -**参数** - -- `N` — 返回值个数。 - -**参数** - -- `x` – 输入值。 -- `weight` — 权重。 [UInt8](../../sql-reference/data-types/int-uint.md)类型。 - -**返回值** - -返回具有最大近似权重总和的值数组。 - -**示例** - -查询: - -``` sql -SELECT topKWeighted(10)(number, number) FROM numbers(1000) -``` - -结果: - -``` text -┌─topKWeighted(10)(number, number)──────────┐ -│ [999,998,997,996,995,994,993,992,991,990] │ -└───────────────────────────────────────────┘ -``` - ## stochasticLinearRegression {#agg_functions-stochasticlinearregression} 该函数实现随机线性回归。 它支持自定义参数的学习率、L2正则化系数、微批,并且具有少量更新权重的方法([Adam](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam) (默认), [simple SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent), [Momentum](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum), [Nesterov](https://mipt.ru/upload/medialibrary/d7e/41-91.pdf))。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/topkweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/topkweighted.md index b597317f44e..edd5614592a 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/topkweighted.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/topkweighted.md @@ -4,36 +4,36 @@ toc_priority: 109 # topKWeighted {#topkweighted} -Similar to `topK` but takes one additional argument of integer type - `weight`. Every value is accounted `weight` times for frequency calculation. +类似于 `topK` 但需要一个整数类型的附加参数 - `weight`。 每个输入都被记入 `weight` 次频率计算。 -**Syntax** +**语法** ``` sql topKWeighted(N)(x, weight) ``` -**Parameters** +**参数** -- `N` — The number of elements to return. +- `N` — 返回值个数。 -**Arguments** +**参数** -- `x` – The value. -- `weight` — The weight. [UInt8](../../../sql-reference/data-types/int-uint.md). +- `x` – 输入值。 +- `weight` — 权重。 [UInt8](../../../sql-reference/data-types/int-uint.md)类型。 -**Returned value** +**返回值** -Returns an array of the values with maximum approximate sum of weights. +返回具有最大近似权重总和的值数组。 -**Example** +**示例** -Query: +查询: ``` sql SELECT topKWeighted(10)(number, number) FROM numbers(1000) ``` -Result: +结果: ``` text ┌─topKWeighted(10)(number, number)──────────┐ From 2d5391d529499eeb10223e4cfe28dceab91e3d71 Mon Sep 17 00:00:00 2001 From: benbiti Date: Wed, 3 Mar 2021 22:21:14 +0800 Subject: [PATCH 160/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference.md | 49 ------------------- .../reference/uniqcombined.md | 41 ++++++++-------- 2 files changed, 21 insertions(+), 69 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index 13476fbec35..290abbf429e 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -53,55 +53,6 @@ skewSamp(expr) SELECT skewSamp(value) FROM series_with_value_column ``` - -## uniqCombined {#agg_function-uniqcombined} - -计算不同参数值的近似数量。 - -``` sql -uniqCombined(HLL_precision)(x[, ...]) -``` - -该 `uniqCombined` 函数是计算不同数值数量的不错选择。 - -**参数** - -该函数采用可变数量的参数。 参数可以是 `Tuple`, `Array`, `Date`, `DateTime`, `String`,或数字类型。 - -`HLL_precision` 是以2为底的单元格数的对数 [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog). 可选,您可以将该函数用作 `uniqCombined(x[, ...])`. 
默认值 `HLL_precision` 是17,这是有效的96KiB的空间(2^17个单元,每个6比特)。 - -**返回值** - -- 一个[UInt64](../../sql-reference/data-types/int-uint.md)类型的数字。 - -**实现细节** - -功能: - -- 计算散列(64位散列 `String` 否则32位)对于聚合中的所有参数,然后在计算中使用它。 - -- 使用三种算法的组合:数组、哈希表和包含错误修正表的HyperLogLog。 - - 少量的不同的值,使用数组。 值再多一些,使用哈希表。对于大量的数据来说,使用HyperLogLog,HyperLogLog占用一个固定的内存空间。 - -- 确定性地提供结果(它不依赖于查询处理顺序)。 - -!!! note "注" - 因为它使用32位散列非-`String` 类型,结果将有非常高的误差基数显着大于 `UINT_MAX` (错误将在几百亿不同值之后迅速提高),因此在这种情况下,您应该使用 [uniqCombined64](#agg_function-uniqcombined64) - -相比于 [uniq](#agg_function-uniq) 功能,该 `uniqCombined`: - -- 消耗少几倍的内存。 -- 计算精度高出几倍。 -- 通常具有略低的性能。 在某些情况下, `uniqCombined` 可以表现得比 `uniq` 好,例如,使用通过网络传输大量聚合状态的分布式查询。 - -**另请参阅** - -- [uniq](#agg_function-uniq) -- [uniqCombined64](#agg_function-uniqcombined64) -- [uniqHLL12](#agg_function-uniqhll12) -- [uniqExact](#agg_function-uniqexact) - ## uniqCombined64 {#agg_function-uniqcombined64} 和 [uniqCombined](#agg_function-uniqcombined),但对所有数据类型使用64位哈希。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/uniqcombined.md b/docs/zh/sql-reference/aggregate-functions/reference/uniqcombined.md index c52486bc38f..26a681ed5af 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/uniqcombined.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/uniqcombined.md @@ -4,46 +4,47 @@ toc_priority: 192 # uniqCombined {#agg_function-uniqcombined} -Calculates the approximate number of different argument values. +计算不同参数值的近似数量。 +**语法** ``` sql uniqCombined(HLL_precision)(x[, ...]) ``` +该 `uniqCombined` 函数是计算不同值数量的不错选择。 -The `uniqCombined` function is a good choice for calculating the number of different values. +**参数** -**Parameters** +该函数采用可变数量的参数。 参数可以是 `Tuple`, `Array`, `Date`, `DateTime`, `String`,或数字类型。 -The function takes a variable number of parameters. Parameters can be `Tuple`, `Array`, `Date`, `DateTime`, `String`, or numeric types. +`HLL_precision` 是以2为底的单元格数的对数 [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog)。可选,您可以将该函数用作 `uniqCombined(x[, ...])`。 `HLL_precision` 的默认值是17,这是有效的96KiB的空间(2^17个单元,每个6比特)。 -`HLL_precision` is the base-2 logarithm of the number of cells in [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog). Optional, you can use the function as `uniqCombined(x[, ...])`. The default value for `HLL_precision` is 17, which is effectively 96 KiB of space (2^17 cells, 6 bits each). +**返回值** -**Returned value** +- 一个[UInt64](../../../sql-reference/data-types/int-uint.md)类型的数字。 -- A number [UInt64](../../../sql-reference/data-types/int-uint.md)-type number. +**实现细节** -**Implementation details** +功能: -Function: +- 为聚合中的所有参数计算哈希(`String`类型用64位哈希,其他32位),然后在计算中使用它。 -- Calculates a hash (64-bit hash for `String` and 32-bit otherwise) for all parameters in the aggregate, then uses it in calculations. +- 使用三种算法的组合:数组、哈希表和包含错误修正表的HyperLogLog。 -- Uses a combination of three algorithms: array, hash table, and HyperLogLog with an error correction table. - For a small number of distinct elements, an array is used. When the set size is larger, a hash table is used. For a larger number of elements, HyperLogLog is used, which will occupy a fixed amount of memory. + 少量的不同的值,使用数组。 值再多一些,使用哈希表。对于大量的数据来说,使用HyperLogLog,HyperLogLog占用一个固定的内存空间。 -- Provides the result deterministically (it doesn’t depend on the query processing order). +- 确定性地提供结果(它不依赖于查询处理顺序)。 -!!! 
note "Note" - Since it uses 32-bit hash for non-`String` type, the result will have very high error for cardinalities significantly larger than `UINT_MAX` (error will raise quickly after a few tens of billions of distinct values), hence in this case you should use [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64) +!!! note "注" + 由于它对非 `String` 类型使用32位哈希,对于基数显著大于`UINT_MAX` ,结果将有非常高的误差(误差将在几百亿不同值之后迅速提高), 因此这种情况,你应该使用 [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64) -Compared to the [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) function, the `uniqCombined`: +相比于 [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) 函数, 该 `uniqCombined`: -- Consumes several times less memory. -- Calculates with several times higher accuracy. -- Usually has slightly lower performance. In some scenarios, `uniqCombined` can perform better than `uniq`, for example, with distributed queries that transmit a large number of aggregation states over the network. +- 消耗内存要少几倍。 +- 计算精度高出几倍。 +- 通常具有略低的性能。 在某些情况下, `uniqCombined` 可以表现得比 `uniq` 好,例如,使用通过网络传输大量聚合状态的分布式查询。 -**See Also** +**参见** - [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) - [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64) From e35957b716fb1d2a3ba283ae702cc26f49f35527 Mon Sep 17 00:00:00 2001 From: benbiti Date: Thu, 4 Mar 2021 19:05:31 +0800 Subject: [PATCH 161/260] WIP update-aggregate-funcions-in-zh --- docs/zh/faq/terms_translation_zh.md | 3 +++ docs/zh/sql-reference/aggregate-functions/reference.md | 3 --- .../aggregate-functions/reference/uniqcombined64.md | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/zh/faq/terms_translation_zh.md b/docs/zh/faq/terms_translation_zh.md index ab58b9769d7..3ef71929a98 100644 --- a/docs/zh/faq/terms_translation_zh.md +++ b/docs/zh/faq/terms_translation_zh.md @@ -13,6 +13,9 @@ Decimal 定点数 Tuple 元组 function 函数 array 数组/阵列 +hash 哈希/散列 +Parameters 参数 +Arguments 参数 ## diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index 290abbf429e..5880d5a97eb 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -53,9 +53,6 @@ skewSamp(expr) SELECT skewSamp(value) FROM series_with_value_column ``` -## uniqCombined64 {#agg_function-uniqcombined64} - -和 [uniqCombined](#agg_function-uniqcombined),但对所有数据类型使用64位哈希。 ## uniqHLL12 {#agg_function-uniqhll12} diff --git a/docs/zh/sql-reference/aggregate-functions/reference/uniqcombined64.md b/docs/zh/sql-reference/aggregate-functions/reference/uniqcombined64.md index 6d060d82779..3c07791450d 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/uniqcombined64.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/uniqcombined64.md @@ -4,4 +4,4 @@ toc_priority: 193 # uniqCombined64 {#agg_function-uniqcombined64} -Same as [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined), but uses 64-bit hash for all data types. 
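One way to see the accuracy trade-off described above is to compare the approximate counters with the exact count on generated data; a self-contained sketch using the `numbers` table function:

``` sql
-- All three should report roughly 100000 distinct values;
-- uniqCombined and uniqHLL12 work in bounded memory, uniqExact does not.
SELECT
    uniqCombined(number % 100000) AS approx_combined,
    uniqHLL12(number % 100000)    AS approx_hll12,
    uniqExact(number % 100000)    AS exact_count
FROM numbers(1000000)
```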
+和 [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined)一样, 但对于所有数据类型使用64位哈希。 From db84b0712e452692d8eaf0bfdb2359eadec6060c Mon Sep 17 00:00:00 2001 From: benbiti Date: Fri, 5 Mar 2021 21:14:08 +0800 Subject: [PATCH 162/260] WIP update-aggregate-funcions-in-zh --- docs/zh/faq/terms_translation_zh.md | 2 ++ .../aggregate-functions/reference.md | 22 ------------------- .../reference/uniqexact.md | 15 +++++++------ 3 files changed, 10 insertions(+), 29 deletions(-) diff --git a/docs/zh/faq/terms_translation_zh.md b/docs/zh/faq/terms_translation_zh.md index 3ef71929a98..3e83af42955 100644 --- a/docs/zh/faq/terms_translation_zh.md +++ b/docs/zh/faq/terms_translation_zh.md @@ -32,4 +32,6 @@ Arguments 参数 `x` — 生成所[支持的数据类型](数据)的[表达式]。 ``` +3. See also 参见 + diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index 5880d5a97eb..bf400dfad6a 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -90,28 +90,6 @@ uniqHLL12(x[, ...]) - [uniqCombined](#agg_function-uniqcombined) - [uniqExact](#agg_function-uniqexact) -## uniqExact {#agg_function-uniqexact} - -计算不同参数值的准确数目。 - -``` sql -uniqExact(x[, ...]) -``` - -如果你绝对需要一个确切的结果,使用 `uniqExact` 功能。 否则使用 [uniq](#agg_function-uniq) 功能。 - -`uniqExact` 比 `uniq` 使用更多的内存,因为状态的大小随着不同值的数量的增加而无界增长。 - -**参数** - -该函数采用可变数量的参数。 参数可以是 `Tuple`, `Array`, `Date`, `DateTime`, `String`,或数字类型。 - -**另请参阅** - -- [uniq](#agg_function-uniq) -- [uniqCombined](#agg_function-uniqcombined) -- [uniqHLL12](#agg_function-uniqhll12) - ## quantileExact {#quantileexact} diff --git a/docs/zh/sql-reference/aggregate-functions/reference/uniqexact.md b/docs/zh/sql-reference/aggregate-functions/reference/uniqexact.md index 9a6224533c8..bdd60ca1d30 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/uniqexact.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/uniqexact.md @@ -4,21 +4,22 @@ toc_priority: 191 # uniqExact {#agg_function-uniqexact} -Calculates the exact number of different argument values. +计算不同参数值的准确数目。 + +**语法** ``` sql uniqExact(x[, ...]) ``` +如果你绝对需要一个确切的结果,使用 `uniqExact` 函数。 否则使用 [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) 函数。 -Use the `uniqExact` function if you absolutely need an exact result. Otherwise use the [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) function. +`uniqExact` 函数比 `uniq` 使用更多的内存,因为状态的大小随着不同值的数量的增加而无界增长。 -The `uniqExact` function uses more memory than `uniq`, because the size of the state has unbounded growth as the number of different values increases. +**参数** -**Parameters** +该函数采用可变数量的参数。 参数可以是 `Tuple`, `Array`, `Date`, `DateTime`, `String`,或数字类型。 -The function takes a variable number of parameters. Parameters can be `Tuple`, `Array`, `Date`, `DateTime`, `String`, or numeric types. 
- -**See Also** +**参见** - [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) - [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniqcombined) From ebdf4ad1e96d126b4670ec785366d6c4275288b6 Mon Sep 17 00:00:00 2001 From: benbiti Date: Sat, 6 Mar 2021 17:19:23 +0800 Subject: [PATCH 163/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference.md | 37 ------------------- .../reference/uniqhll12.md | 30 ++++++++------- 2 files changed, 17 insertions(+), 50 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index bf400dfad6a..319f86681e3 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -54,43 +54,6 @@ SELECT skewSamp(value) FROM series_with_value_column ``` -## uniqHLL12 {#agg_function-uniqhll12} - -计算不同参数值的近似数量,使用 [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) 算法。 - -``` sql -uniqHLL12(x[, ...]) -``` - -**参数** - -该函数采用可变数量的参数。 参数可以是 `Tuple`, `Array`, `Date`, `DateTime`, `String`,或数字类型。 - -**返回值** - -- A [UInt64](../../sql-reference/data-types/int-uint.md)-键入号码。 - -**实现细节** - -功能: - -- 计算聚合中所有参数的哈希值,然后在计算中使用它。 - -- 使用HyperLogLog算法来近似不同参数值的数量。 - - 212 5-bit cells are used. The size of the state is slightly more than 2.5 KB. The result is not very accurate (up to ~10% error) for small data sets (<10K elements). However, the result is fairly accurate for high-cardinality data sets (10K-100M), with a maximum error of ~1.6%. Starting from 100M, the estimation error increases, and the function will return very inaccurate results for data sets with extremely high cardinality (1B+ elements). - -- 提供确定结果(它不依赖于查询处理顺序)。 - -我们不建议使用此功能。 在大多数情况下,使用 [uniq](#agg_function-uniq) 或 [uniqCombined](#agg_function-uniqcombined) 功能。 - -**另请参阅** - -- [uniq](#agg_function-uniq) -- [uniqCombined](#agg_function-uniqcombined) -- [uniqExact](#agg_function-uniqexact) - - ## quantileExact {#quantileexact} 准确计算数字序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/uniqhll12.md b/docs/zh/sql-reference/aggregate-functions/reference/uniqhll12.md index fcddc22cc46..7521065b954 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/uniqhll12.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/uniqhll12.md @@ -4,35 +4,39 @@ toc_priority: 194 # uniqHLL12 {#agg_function-uniqhll12} -Calculates the approximate number of different argument values, using the [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) algorithm. +计算不同参数值的近似数量,使用 [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) 算法。 + +**语法** ``` sql uniqHLL12(x[, ...]) ``` -**Parameters** +**参数** -The function takes a variable number of parameters. Parameters can be `Tuple`, `Array`, `Date`, `DateTime`, `String`, or numeric types. +该函数采用可变数量的参数。 参数可以是 `Tuple`, `Array`, `Date`, `DateTime`, `String`,或数字类型。 -**Returned value** +**返回值** -- A [UInt64](../../../sql-reference/data-types/int-uint.md)-type number. +**返回值** -**Implementation details** +- 一个[UInt64](../../../sql-reference/data-types/int-uint.md)类型的数字。 -Function: +**实现细节** -- Calculates a hash for all parameters in the aggregate, then uses it in calculations. +功能: -- Uses the HyperLogLog algorithm to approximate the number of different argument values. +- 计算聚合中所有参数的哈希值,然后在计算中使用它。 - 212 5-bit cells are used. The size of the state is slightly more than 2.5 KB. 
The result is not very accurate (up to ~10% error) for small data sets (<10K elements). However, the result is fairly accurate for high-cardinality data sets (10K-100M), with a maximum error of ~1.6%. Starting from 100M, the estimation error increases, and the function will return very inaccurate results for data sets with extremely high cardinality (1B+ elements). +- 使用 HyperLogLog 算法来近似不同参数值的数量。 -- Provides the determinate result (it doesn’t depend on the query processing order). + 使用2^12个5比特单元。 状态的大小略大于2.5KB。 对于小数据集(<10K元素),结果不是很准确(误差高达10%)。 但是, 对于高基数数据集(10K-100M),结果相当准确,最大误差约为1.6%。Starting from 100M, the estimation error increases, and the function will return very inaccurate results for data sets with extremely high cardinality (1B+ elements). -We don’t recommend using this function. In most cases, use the [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) or [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined) function. +- 提供确定结果(它不依赖于查询处理顺序)。 -**See Also** +我们不建议使用此函数。 在大多数情况下, 使用 [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) 或 [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined) 函数。 + +**参见** - [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) - [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined) From 20e9784284686a456f038959476898896f1c80f6 Mon Sep 17 00:00:00 2001 From: benbiti Date: Sun, 7 Mar 2021 22:18:15 +0800 Subject: [PATCH 164/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference.md | 48 ------------------- .../aggregate-functions/reference/skewpop.md | 14 +++--- .../aggregate-functions/reference/skewsamp.md | 16 ++++--- .../aggregate-functions/reference/topk.md | 6 +-- .../reference/topkweighted.md | 4 +- 5 files changed, 22 insertions(+), 66 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index 319f86681e3..1ccd1107f02 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -6,54 +6,6 @@ toc_title: 参考手册 # 参考手册 {#aggregate-functions-reference} - -## skewPop {#skewpop} - -计算的序列[偏度](https://en.wikipedia.org/wiki/Skewness)。 - -``` sql -skewPop(expr) -``` - -**参数** - -`expr` — [表达式](../syntax.md#syntax-expressions) 返回一个数字。 - -**返回值** - -给定序列的偏度。类型 — [Float64](../../sql-reference/data-types/float.md) - -**示例** - -``` sql -SELECT skewPop(value) FROM series_with_value_column -``` - -## skewSamp {#skewsamp} - -计算 [样品偏度](https://en.wikipedia.org/wiki/Skewness) 的序列。 - -它表示随机变量的偏度的无偏估计,如果传递的值形成其样本。 - -``` sql -skewSamp(expr) -``` - -**参数** - -`expr` — [表达式](../syntax.md#syntax-expressions) 返回一个数字。 - -**返回值** - -给定序列的偏度。 类型 — [Float64](../../sql-reference/data-types/float.md). 如果 `n <= 1` (`n` 是样本的大小),则该函数返回 `nan`. 
- -**示例** - -``` sql -SELECT skewSamp(value) FROM series_with_value_column -``` - - ## quantileExact {#quantileexact} 准确计算数字序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/skewpop.md b/docs/zh/sql-reference/aggregate-functions/reference/skewpop.md index d15a5ffdd47..e26e5f8c754 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/skewpop.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/skewpop.md @@ -4,21 +4,23 @@ toc_priority: 150 # skewPop {#skewpop} -Computes the [skewness](https://en.wikipedia.org/wiki/Skewness) of a sequence. +计算给定序列的 [偏度] (https://en.wikipedia.org/wiki/Skewness)。 + +**语法** ``` sql skewPop(expr) ``` -**Parameters** +**参数** -`expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) returning a number. +`expr` — [表达式](../../../sql-reference/syntax.md#syntax-expressions) 返回一个数字。 -**Returned value** +**返回值** -The skewness of the given distribution. Type — [Float64](../../../sql-reference/data-types/float.md) +给定分布的偏度。类型 — [Float64](../../../sql-reference/data-types/float.md) -**Example** +**示例** ``` sql SELECT skewPop(value) FROM series_with_value_column diff --git a/docs/zh/sql-reference/aggregate-functions/reference/skewsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/skewsamp.md index cb323f4b142..0ac68fb4e7c 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/skewsamp.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/skewsamp.md @@ -4,23 +4,25 @@ toc_priority: 151 # skewSamp {#skewsamp} -Computes the [sample skewness](https://en.wikipedia.org/wiki/Skewness) of a sequence. +计算给定序列的 [样本偏度] (https://en.wikipedia.org/wiki/Skewness)。 -It represents an unbiased estimate of the skewness of a random variable if passed values form its sample. +如果传递的值形成其样本,它代表了一个随机变量的偏度的无偏估计。 + +**语法** ``` sql skewSamp(expr) ``` -**Parameters** +**参数** -`expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) returning a number. +`expr` — [表达式](../../../sql-reference/syntax.md#syntax-expressions) 返回一个数字。 -**Returned value** +**返回值** -The skewness of the given distribution. Type — [Float64](../../../sql-reference/data-types/float.md). If `n <= 1` (`n` is the size of the sample), then the function returns `nan`. 
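Alongside the `series_with_value_column` example below, the behaviour can be checked on generated data; a self-contained sketch:

``` sql
-- The squares of 0..99 form a right-skewed sample, so the result is positive;
-- with fewer than two rows the function would return nan.
SELECT skewSamp(number * number) AS sample_skewness
FROM numbers(100)
```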
+给定分布的偏度。 类型 — [Float64](../../../sql-reference/data-types/float.md)。 如果 `n <= 1` (`n` 样本的大小), 函数返回 `nan`。 -**Example** +**示例** ``` sql SELECT skewSamp(value) FROM series_with_value_column diff --git a/docs/zh/sql-reference/aggregate-functions/reference/topk.md b/docs/zh/sql-reference/aggregate-functions/reference/topk.md index b2595ed9778..69e006d1a6c 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/topk.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/topk.md @@ -11,7 +11,7 @@ toc_priority: 108 **语法** ``` sql -topK(N)(column) +topK(N)(x) ``` 此函数不提供保证的结果。 在某些情况下,可能会发生错误,并且可能会返回不是最高频的值。 @@ -19,13 +19,13 @@ topK(N)(column) **参数** -- ‘N’ 是要返回的元素数。 +- `N` — 要返回的元素数。 如果省略该参数,则使用默认值10。 **参数** -- ’ x ’ – 计算的频率值。 +- `x` – (要计算频次的)值。 **示例** diff --git a/docs/zh/sql-reference/aggregate-functions/reference/topkweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/topkweighted.md index edd5614592a..66b436f42bb 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/topkweighted.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/topkweighted.md @@ -14,11 +14,11 @@ topKWeighted(N)(x, weight) **参数** -- `N` — 返回值个数。 +- `N` — 要返回的元素数。 **参数** -- `x` – 输入值。 +- `x` – (要计算频次的)值。 - `weight` — 权重。 [UInt8](../../../sql-reference/data-types/int-uint.md)类型。 **返回值** From c1e5dc92a4165423f7341d4476a7f8adaff2aa14 Mon Sep 17 00:00:00 2001 From: benbiti Date: Mon, 8 Mar 2021 18:28:52 +0800 Subject: [PATCH 165/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference/minmap.md | 4 +-- .../reference/quantiles.md | 7 +++-- .../aggregate-functions/reference/rankCorr.md | 28 +++++++++---------- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/minmap.md b/docs/zh/sql-reference/aggregate-functions/reference/minmap.md index f5ee0557c16..efbb3448b60 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/minmap.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/minmap.md @@ -4,7 +4,7 @@ toc_priority: 142 # minMap {#agg_functions-minmap} -语法: +**语法** ```sql `minMap(key, value)` @@ -18,7 +18,7 @@ toc_priority: 142 要总计的每一行的 `key` 和 `value` (数组)元素的数量必须相同。 返回两个数组组成的元组: 排好序的 `key` 和对应 `key` 的 `value` 计算值(最小值)。 -示例: +**示例** ``` sql SELECT minMap(a, b) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md index abce6a9e7f0..111b1c086c1 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md @@ -4,6 +4,9 @@ toc_priority: 201 # quantiles {#quantiles} -Syntax: `quantiles(level1, level2, …)(x)` +**语法** +``` sql +`quantiles(level1, level2, …)(x)` +``` -All the quantile functions also have corresponding quantiles functions: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`. These functions calculate all the quantiles of the listed levels in one pass, and return an array of the resulting values. 
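A self-contained sketch of the single-pass form described above:

``` sql
-- One pass over the data computes all listed levels and returns them as an array.
SELECT quantiles(0.25, 0.5, 0.75)(number) AS q
FROM numbers(1001)
```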
+所有分位数函数(quantile)也有相应的分位数(quantiles)函数: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`。 这些函数一次计算所列的级别的所有分位数, 并返回结果值的数组。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/rankCorr.md b/docs/zh/sql-reference/aggregate-functions/reference/rankCorr.md index dc23029f239..c29a43f6ca9 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/rankCorr.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/rankCorr.md @@ -1,33 +1,33 @@ ## rankCorr {#agg_function-rankcorr} -Computes a rank correlation coefficient. +计算等级相关系数。 -**Syntax** +**语法** ``` sql rankCorr(x, y) ``` -**Parameters** +**参数** -- `x` — Arbitrary value. [Float32](../../../sql-reference/data-types/float.md#float32-float64) or [Float64](../../../sql-reference/data-types/float.md#float32-float64). -- `y` — Arbitrary value. [Float32](../../../sql-reference/data-types/float.md#float32-float64) or [Float64](../../../sql-reference/data-types/float.md#float32-float64). +- `x` — 任意值。[Float32](../../../sql-reference/data-types/float.md#float32-float64) 或 [Float64](../../../sql-reference/data-types/float.md#float32-float64)。 +- `y` — 任意值。[Float32](../../../sql-reference/data-types/float.md#float32-float64) 或 [Float64](../../../sql-reference/data-types/float.md#float32-float64)。 -**Returned value(s)** +**返回值** - Returns a rank correlation coefficient of the ranks of x and y. The value of the correlation coefficient ranges from -1 to +1. If less than two arguments are passed, the function will return an exception. The value close to +1 denotes a high linear relationship, and with an increase of one random variable, the second random variable also increases. The value close to -1 denotes a high linear relationship, and with an increase of one random variable, the second random variable decreases. The value close or equal to 0 denotes no relationship between the two random variables. -Type: [Float64](../../../sql-reference/data-types/float.md#float32-float64). 
+类型: [Float64](../../../sql-reference/data-types/float.md#float32-float64)。 -**Example** +**示例** -Query: +查询: ``` sql SELECT rankCorr(number, number) FROM numbers(100); ``` -Result: +结果: ``` text ┌─rankCorr(number, number)─┐ @@ -35,19 +35,19 @@ Result: └──────────────────────────┘ ``` -Query: +查询: ``` sql SELECT roundBankers(rankCorr(exp(number), sin(number)), 3) FROM numbers(100); ``` -Result: +结果: ``` text ┌─roundBankers(rankCorr(exp(number), sin(number)), 3)─┐ │ -0.037 │ └─────────────────────────────────────────────────────┘ ``` -**See Also** +**参见** -- [Spearman's rank correlation coefficient](https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient) \ No newline at end of file +- 斯皮尔曼等级相关系数[Spearman's rank correlation coefficient](https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient) \ No newline at end of file From c24207037f779519528ce44e133a082a03b0bddd Mon Sep 17 00:00:00 2001 From: benbiti Date: Tue, 9 Mar 2021 19:20:52 +0800 Subject: [PATCH 166/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference.md | 57 ------------------- .../reference/stochasticlogisticregression.md | 35 ++++++------ 2 files changed, 17 insertions(+), 75 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index 1ccd1107f02..f5fd3c8ead6 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -396,11 +396,6 @@ SELECT quantileTDigestWeighted(number, 1) FROM numbers(10) - [中位数](#median) - [分位数](#quantiles) -## quantiles(level1, level2, …)(x) {#quantiles} - -所有分位数函数也有相应的函数: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`。这些函数一次计算所列层次的所有分位数,并返回结果值的数组。 - - ## stochasticLinearRegression {#agg_functions-stochasticlinearregression} 该函数实现随机线性回归。 它支持自定义参数的学习率、L2正则化系数、微批,并且具有少量更新权重的方法([Adam](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam) (默认), [simple SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent), [Momentum](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum), [Nesterov](https://mipt.ru/upload/medialibrary/d7e/41-91.pdf))。 @@ -473,57 +468,5 @@ evalMLMethod(model, param1, param2) FROM test_data - [stochasticLogisticRegression](#agg_functions-stochasticlogisticregression) - [线性回归和逻辑回归之间的区别](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) -## stochasticLogisticRegression {#agg_functions-stochasticlogisticregression} - -该函数实现随机逻辑回归。 它可以用于二进制分类问题,支持与stochasticLinearRegression相同的自定义参数,并以相同的方式工作。 - -### 参数 {#agg_functions-stochasticlogisticregression-parameters} - -参数与stochasticLinearRegression中的参数完全相同: -`learning rate`, `l2 regularization coefficient`, `mini-batch size`, `method for updating weights`. -欲了解更多信息,请参阅 [参数](#agg_functions-stochasticlinearregression-parameters). 
- -``` text -stochasticLogisticRegression(1.0, 1.0, 10, 'SGD') -``` - -**1.** 安装 - - - - 参考stochasticLinearRegression相关文档 - - 预测标签的取值范围为[-1, 1] - -**2.** 预测 - - - - 使用已经保存的state我们可以预测标签为 `1` 的对象的概率。 - - ``` sql - WITH (SELECT state FROM your_model) AS model SELECT - evalMLMethod(model, param1, param2) FROM test_data - ``` - - 查询结果返回一个列的概率。注意 `evalMLMethod` 的第一个参数是 `AggregateFunctionState` 对象,接下来的参数是列的特性。 - - 我们也可以设置概率的范围, 这样需要给元素指定不同的标签。 - - ``` sql - SELECT ans < 1.1 AND ans > 0.5 FROM - (WITH (SELECT state FROM your_model) AS model SELECT - evalMLMethod(model, param1, param2) AS ans FROM test_data) - ``` - - 结果是标签。 - - `test_data` 是一个像 `train_data` 一样的表,但是不包含目标值。 - -**另请参阅** - -- [随机指标线上回归](#agg_functions-stochasticlinearregression) -- [线性回归和逻辑回归之间的差异](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) - [原始文章](https://clickhouse.tech/docs/en/query_language/agg_functions/reference/) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md b/docs/zh/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md index 35d1e3899ac..a2e9ffa89e6 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md @@ -4,40 +4,39 @@ toc_priority: 222 # stochasticLogisticRegression {#agg_functions-stochasticlogisticregression} -This function implements stochastic logistic regression. It can be used for binary classification problem, supports the same custom parameters as stochasticLinearRegression and works the same way. +该函数实现随机逻辑回归。 它可以用于二进制分类问题,支持与stochasticLinearRegression相同的自定义参数,并以相同的方式工作。 -### Parameters {#agg_functions-stochasticlogisticregression-parameters} +### 参数 {#agg_functions-stochasticlogisticregression-parameters} -Parameters are exactly the same as in stochasticLinearRegression: +参数与stochasticLinearRegression中的参数完全相同: `learning rate`, `l2 regularization coefficient`, `mini-batch size`, `method for updating weights`. -For more information see [parameters](#agg_functions-stochasticlinearregression-parameters). +欲了解更多信息,参见 [参数] (#agg_functions-stochasticlinearregression-parameters). -``` text +``` sql stochasticLogisticRegression(1.0, 1.0, 10, 'SGD') ``` -**1.** Fitting +**1.** 拟合 - See the `Fitting` section in the [stochasticLinearRegression](#stochasticlinearregression-usage-fitting) description. + 参考[stochasticLinearRegression](#stochasticlinearregression-usage-fitting) `拟合` 章节文档。 - Predicted labels have to be in \[-1, 1\]. + 预测标签的取值范围为\[-1, 1\] -**2.** Predicting +**2.** 预测 - Using saved state we can predict probability of object having label `1`. - + 使用已经保存的state我们可以预测标签为 `1` 的对象的概率。 ``` sql WITH (SELECT state FROM your_model) AS model SELECT evalMLMethod(model, param1, param2) FROM test_data ``` - The query will return a column of probabilities. Note that first argument of `evalMLMethod` is `AggregateFunctionState` object, next are columns of features. + 查询结果返回一个列的概率。注意 `evalMLMethod` 的第一个参数是 `AggregateFunctionState` 对象,接下来的参数是列的特性。 - We can also set a bound of probability, which assigns elements to different labels. + 我们也可以设置概率的范围, 这样需要给元素指定不同的标签。 ``` sql SELECT ans < 1.1 AND ans > 0.5 FROM @@ -45,11 +44,11 @@ stochasticLogisticRegression(1.0, 1.0, 10, 'SGD') evalMLMethod(model, param1, param2) AS ans FROM test_data) ``` - Then the result will be labels. + 结果是标签。 - `test_data` is a table like `train_data` but may not contain target value. 
+ `test_data` 是一个像 `train_data` 一样的表,但是不包含目标值。 -**See Also** +**参见** -- [stochasticLinearRegression](../../../sql-reference/aggregate-functions/reference/stochasticlinearregression.md#agg_functions-stochasticlinearregression) -- [Difference between linear and logistic regressions.](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) +- [随机指标线性回归](../../../sql-reference/aggregate-functions/reference/stochasticlinearregression.md#agg_functions-stochasticlinearregression) +- [线性回归和逻辑回归之间的差异](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) From 2e999ff484c1e4364973ff4c97385eb7e34f23d1 Mon Sep 17 00:00:00 2001 From: benbiti Date: Wed, 10 Mar 2021 23:38:55 +0800 Subject: [PATCH 167/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference.md | 73 ------------------- .../reference/stochasticlinearregression.md | 56 +++++++------- 2 files changed, 29 insertions(+), 100 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index f5fd3c8ead6..7d763745bbf 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -396,77 +396,4 @@ SELECT quantileTDigestWeighted(number, 1) FROM numbers(10) - [中位数](#median) - [分位数](#quantiles) -## stochasticLinearRegression {#agg_functions-stochasticlinearregression} - -该函数实现随机线性回归。 它支持自定义参数的学习率、L2正则化系数、微批,并且具有少量更新权重的方法([Adam](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam) (默认), [simple SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent), [Momentum](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum), [Nesterov](https://mipt.ru/upload/medialibrary/d7e/41-91.pdf))。 - -### 参数 {#agg_functions-stochasticlinearregression-parameters} - -有4个可自定义的参数。 它们按顺序传递给函数,但是没有必要传递所有四个默认值将被使用,但是好的模型需要一些参数调整。 - -``` text -stochasticLinearRegression(1.0, 1.0, 10, 'SGD') -``` - -1. `learning rate` 当执行梯度下降步骤时,步长上的系数。 过大的学习率可能会导致模型的权重无限大。 默认值为 `0.00001`. -2. `l2 regularization coefficient` 这可能有助于防止过度拟合。 默认值为 `0.1`. -3. `mini-batch size` 设置元素的数量,这些元素将被计算和求和以执行梯度下降的一个步骤。 纯随机下降使用一个元素,但是具有小批量(约10个元素)使梯度步骤更稳定。 默认值为 `15`. -4. `method for updating weights` 他们是: `Adam` (默认情况下), `SGD`, `Momentum`, `Nesterov`. `Momentum` 和 `Nesterov` 需要更多的计算和内存,但是它们恰好在收敛速度和随机梯度方法的稳定性方面是有用的。 - -### 用法 {#agg_functions-stochasticlinearregression-usage} - -`stochasticLinearRegression` 用于两个步骤:拟合模型和预测新数据。 为了拟合模型并保存其状态以供以后使用,我们使用 `-State` combinator,它基本上保存了状态(模型权重等)。 -为了预测我们使用函数 [evalMLMethod](../functions/machine-learning-functions.md#machine_learning_methods-evalmlmethod),这需要一个状态作为参数以及特征来预测。 - - - -**1.** 安装 - -可以使用这种查询。 - -``` sql -CREATE TABLE IF NOT EXISTS train_data -( - param1 Float64, - param2 Float64, - target Float64 -) ENGINE = Memory; - -CREATE TABLE your_model ENGINE = Memory AS SELECT -stochasticLinearRegressionState(0.1, 0.0, 5, 'SGD')(target, param1, param2) -AS state FROM train_data; -``` - -在这里,我们还需要将数据插入到 `train_data` 桌子 参数的数量不是固定的,它只取决于参数的数量,传递到 `linearRegressionState`. 
它们都必须是数值。 -请注意,带有目标值的列(我们想要学习预测)被插入作为第一个参数。 - -**2.** 预测 - -在将状态保存到表中之后,我们可以多次使用它进行预测,甚至与其他状态合并并创建新的更好的模型。 - -``` sql -WITH (SELECT state FROM your_model) AS model SELECT -evalMLMethod(model, param1, param2) FROM test_data -``` - -查询将返回一列预测值。 请注意,第一个参数 `evalMLMethod` 是 `AggregateFunctionState` 对象,接下来是要素列。 - -`test_data` 是一个像表 `train_data` 但可能不包含目标值。 - -### 注 {#agg_functions-stochasticlinearregression-notes} - -1. 要合并两个模型,用户可以创建这样的查询: - `sql SELECT state1 + state2 FROM your_models` - 哪里 `your_models` 表包含这两个模型。 此查询将返回new `AggregateFunctionState` 对象。 - -2. 如果没有,用户可以获取创建的模型的权重用于自己的目的,而不保存模型 `-State` 使用combinator。 - `sql SELECT stochasticLinearRegression(0.01)(target, param1, param2) FROM train_data` - 这种查询将拟合模型并返回其权重-首先是权重,它对应于模型的参数,最后一个是偏差。 所以在上面的例子中,查询将返回一个具有3个值的列。 - -**另请参阅** - -- [stochasticLogisticRegression](#agg_functions-stochasticlogisticregression) -- [线性回归和逻辑回归之间的区别](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) - - [原始文章](https://clickhouse.tech/docs/en/query_language/agg_functions/reference/) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/stochasticlinearregression.md b/docs/zh/sql-reference/aggregate-functions/reference/stochasticlinearregression.md index 7a37ed83e17..43ebd6be575 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/stochasticlinearregression.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/stochasticlinearregression.md @@ -4,31 +4,33 @@ toc_priority: 221 # stochasticLinearRegression {#agg_functions-stochasticlinearregression} -This function implements stochastic linear regression. It supports custom parameters for learning rate, L2 regularization coefficient, mini-batch size and has few methods for updating weights ([Adam](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam) (used by default), [simple SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent), [Momentum](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum), [Nesterov](https://mipt.ru/upload/medialibrary/d7e/41-91.pdf)). +该函数实现随机线性回归。 它支持自定义参数的学习率、L2正则化系数、微批,并且具有少量更新权重的方法([Adam](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam) (默认), [simple SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent), [Momentum](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum), [Nesterov](https://mipt.ru/upload/medialibrary/d7e/41-91.pdf))。 -### Parameters {#agg_functions-stochasticlinearregression-parameters} +### 参数 {#agg_functions-stochasticlinearregression-parameters} -There are 4 customizable parameters. They are passed to the function sequentially, but there is no need to pass all four - default values will be used, however good model required some parameter tuning. +有4个可自定义的参数。它们按顺序传递给函数,但不需要传递所有四个参数——将使用默认值,然而好的模型需要一些参数调整。 -``` text +**语法** + +``` sql stochasticLinearRegression(1.0, 1.0, 10, 'SGD') ``` -1. `learning rate` is the coefficient on step length, when gradient descent step is performed. Too big learning rate may cause infinite weights of the model. Default is `0.00001`. -2. `l2 regularization coefficient` which may help to prevent overfitting. Default is `0.1`. -3. `mini-batch size` sets the number of elements, which gradients will be computed and summed to perform one step of gradient descent. Pure stochastic descent uses one element, however having small batches(about 10 elements) make gradient steps more stable. Default is `15`. -4. 
`method for updating weights`, they are: `Adam` (by default), `SGD`, `Momentum`, `Nesterov`. `Momentum` and `Nesterov` require little bit more computations and memory, however they happen to be useful in terms of speed of convergance and stability of stochastic gradient methods. +1. `learning rate` 当执行梯度下降步骤时,步长的系数。 过大的学习率可能会导致模型的权重无限大。 默认值为 `0.00001`。 +2. `l2 regularization coefficient` 这可能有助于防止过度拟合。 默认值为 `0.1`。 +3. `mini-batch size` 设置元素的数量,这些元素将被计算和求和以执行梯度下降的一个步骤。纯随机下降使用一个元素,但是具有小批量(约10个元素)使梯度步骤更稳定。 默认值为 `15`。 +4. `method for updating weights` 他们是: `Adam` (默认情况下), `SGD`, `Momentum`, `Nesterov`。`Momentum` 和 `Nesterov` 需要更多的计算和内存,但是它们恰好在收敛速度和随机梯度方法的稳定性方面是有用的。 -### Usage {#agg_functions-stochasticlinearregression-usage} +### 使用 {#agg_functions-stochasticlinearregression-usage} -`stochasticLinearRegression` is used in two steps: fitting the model and predicting on new data. In order to fit the model and save its state for later usage we use `-State` combinator, which basically saves the state (model weights, etc). -To predict we use function [evalMLMethod](../../../sql-reference/functions/machine-learning-functions.md#machine_learning_methods-evalmlmethod), which takes a state as an argument as well as features to predict on. +`stochasticLinearRegression` 用于两个步骤:拟合模型和预测新数据。 为了拟合模型并保存其状态以供以后使用,我们使用 `-State` 组合器,它基本上保存了状态(模型权重等)。 +为了预测我们使用函数 [evalMLMethod](../../../sql-reference/functions/machine-learning-functions.md#machine_learning_methods-evalmlmethod), 这需要一个状态作为参数以及特征来预测。 -**1.** Fitting +**1.** 拟合 -Such query may be used. +可以使用这种查询。 ``` sql CREATE TABLE IF NOT EXISTS train_data @@ -43,33 +45,33 @@ stochasticLinearRegressionState(0.1, 0.0, 5, 'SGD')(target, param1, param2) AS state FROM train_data; ``` -Here we also need to insert data into `train_data` table. The number of parameters is not fixed, it depends only on number of arguments, passed into `linearRegressionState`. They all must be numeric values. -Note that the column with target value(which we would like to learn to predict) is inserted as the first argument. +在这里,我们还需要将数据插入到 `train_data` 表。参数的数量不是固定的,它只取决于传入 `linearRegressionState` 的参数数量。它们都必须是数值。 +注意,目标值(我们想学习预测的)列作为第一个参数插入。 -**2.** Predicting +**2.** 预测 -After saving a state into the table, we may use it multiple times for prediction, or even merge with other states and create new even better models. +在将状态保存到表中之后,我们可以多次使用它进行预测,甚至与其他状态合并,创建新的更好的模型。 ``` sql WITH (SELECT state FROM your_model) AS model SELECT evalMLMethod(model, param1, param2) FROM test_data ``` -The query will return a column of predicted values. Note that first argument of `evalMLMethod` is `AggregateFunctionState` object, next are columns of features. +查询将返回一列预测值。注意,`evalMLMethod` 的第一个参数是 `AggregateFunctionState` 对象, 接下来是特征列。 -`test_data` is a table like `train_data` but may not contain target value. +`test_data` 是一个类似 `train_data` 的表 但可能不包含目标值。 -### Notes {#agg_functions-stochasticlinearregression-notes} +### 注 {#agg_functions-stochasticlinearregression-notes} -1. To merge two models user may create such query: +1. 要合并两个模型,用户可以创建这样的查询: `sql SELECT state1 + state2 FROM your_models` - where `your_models` table contains both models. This query will return new `AggregateFunctionState` object. + 其中 `your_models` 表包含这两个模型。此查询将返回新的 `AggregateFunctionState` 对象。 -2. User may fetch weights of the created model for its own purposes without saving the model if no `-State` combinator is used. +2. 
如果没有使用 `-State` 组合器,用户可以为自己的目的获取所创建模型的权重,而不保存模型 。 `sql SELECT stochasticLinearRegression(0.01)(target, param1, param2) FROM train_data` - Such query will fit the model and return its weights - first are weights, which correspond to the parameters of the model, the last one is bias. So in the example above the query will return a column with 3 values. + 这样的查询将拟合模型,并返回其权重——首先是权重,对应模型的参数,最后一个是偏差。 所以在上面的例子中,查询将返回一个具有3个值的列。 -**See Also** +**参见** -- [stochasticLogisticRegression](../../../sql-reference/aggregate-functions/reference/stochasticlogisticregression.md#agg_functions-stochasticlogisticregression) -- [Difference between linear and logistic regressions](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) +- [随机指标逻辑回归](../../../sql-reference/aggregate-functions/reference/stochasticlogisticregression.md#agg_functions-stochasticlogisticregression) +- [线性回归和逻辑回归之间的差异](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) From 5a5134ed3de590c15588b8c4cace44f3f91cf9d7 Mon Sep 17 00:00:00 2001 From: benbiti Date: Thu, 11 Mar 2021 20:39:13 +0800 Subject: [PATCH 168/260] WIP update-aggregate-funcions-in-zh --- docs/zh/faq/terms_translation_zh.md | 1 + .../aggregate-functions/reference/quantile.md | 2 +- .../reference/quantiletdigest.md | 42 +++++++++---------- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/docs/zh/faq/terms_translation_zh.md b/docs/zh/faq/terms_translation_zh.md index 3e83af42955..d252b4e293e 100644 --- a/docs/zh/faq/terms_translation_zh.md +++ b/docs/zh/faq/terms_translation_zh.md @@ -9,6 +9,7 @@ Parquet ## 英文 <-> 中文 Integer 整数 floating-point 浮点数 +Fitting 拟合 Decimal 定点数 Tuple 元组 function 函数 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantile.md b/docs/zh/sql-reference/aggregate-functions/reference/quantile.md index c51386c7533..26882465ff3 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantile.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantile.md @@ -20,7 +20,7 @@ quantile(level)(expr) **参数** - `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 当 `level=0.5`时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 -- `expr` — — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 +- `expr` — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 **返回值** diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigest.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigest.md index bda98ea338d..be8e55da817 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigest.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigest.md @@ -4,46 +4,46 @@ toc_priority: 207 # quantileTDigest {#quantiletdigest} -Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence using the [t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) algorithm. +使用[t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) 算法计算数字序列近似[分位数](https://en.wikipedia.org/wiki/Quantile)。 -The maximum error is 1%. 
Memory consumption is `log(n)`, where `n` is a number of values. The result depends on the order of running the query, and is nondeterministic. +最大误差为1%。 内存消耗为 `log(n)`,这里 `n` 是值的个数。 结果取决于运行查询的顺序,并且是不确定的。 -The performance of the function is lower than performance of [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile) or [quantileTiming](../../../sql-reference/aggregate-functions/reference/quantiletiming.md#quantiletiming). In terms of the ratio of State size to precision, this function is much better than `quantile`. +该函数的性能低于 [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile) 或 [quantileTiming](../../../sql-reference/aggregate-functions/reference/quantiletiming.md#quantiletiming) 的性能。 从状态大小和精度的比值来看,这个函数比 `quantile` 更优秀。 -When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. +当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用 [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) 函数。 -**Syntax** +**语法** ``` sql quantileTDigest(level)(expr) ``` -Alias: `medianTDigest`. +别名: `medianTDigest`。 -**Parameters** +**参数** -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). -- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 当 `level=0.5`时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `expr` — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 -**Returned value** +**返回值** -- Approximate quantile of the specified level. +- 指定层次的分位数。 -Type: +类型: -- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. -- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. -- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. 
+- [Float64](../../../sql-reference/data-types/float.md) 用于数字数据类型输入。 +- [Date](../../../sql-reference/data-types/date.md) 如果输入值是 `Date` 类型。 +- [DateTime](../../../sql-reference/data-types/datetime.md) 如果输入值是 `DateTime` 类型。 -**Example** +**示例** -Query: +查询: ``` sql SELECT quantileTDigest(number) FROM numbers(10) ``` -Result: +结果: ``` text ┌─quantileTDigest(number)─┐ @@ -51,7 +51,7 @@ Result: └─────────────────────────┘ ``` -**See Also** +**参见** -- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) -- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) +- [中位数](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [分位数](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) From 638b62a74e20c19a43bfe75446e754adaeea06b4 Mon Sep 17 00:00:00 2001 From: benbiti Date: Thu, 11 Mar 2021 20:46:42 +0800 Subject: [PATCH 169/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference.md | 54 ------------------- 1 file changed, 54 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index 7d763745bbf..4ee1a01f204 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -287,60 +287,6 @@ SELECT quantileTimingWeighted(response_time, weight) FROM t - [中位数](#median) - [分位数](#quantiles) -## quantileTDigest {#quantiletdigest} - -使用[t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) 算法计算近似[分位数](https://en.wikipedia.org/wiki/Quantile)。 - -最大误差为1%。 内存消耗 `log(n)`,这里 `n` 是值的个数。 结果取决于运行查询的顺序,并且是不确定的。 - -该功能的性能低于性能 [分位数](#quantile) 或 [时间分位](#quantiletiming). 在状态大小与精度的比率方面,这个函数比 `quantile`更优秀。 - -当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[分位数](#quantiles)功能。 - -**语法** - -``` sql -quantileTDigest(level)(expr) -``` - -别名: `medianTDigest`. - -**参数** - -- `level` — 分位数层次。可选参数。 从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 在 `level=0.5` 该函数计算 [中位数](https://en.wikipedia.org/wiki/Median). 
-- `expr` — 求职表达式,类型为:数值[数据类型](../../sql-reference/data-types/index.md#data_types),[日期](../../sql-reference/data-types/date.md)数据类型或[时间](../../sql-reference/data-types/datetime.md)数据类型。 - -**回值** - -- 指定层次的分位数。 - -类型: - -- [Float64](../../sql-reference/data-types/float.md) 对于数字数据类型输入。 -- [日期](../../sql-reference/data-types/date.md) 如果输入值具有 `Date` 类型。 -- [日期时间](../../sql-reference/data-types/datetime.md) 如果输入值具有 `DateTime` 类型。 - -**示例** - -查询: - -``` sql -SELECT quantileTDigest(number) FROM numbers(10) -``` - -结果: - -``` text -┌─quantileTDigest(number)─┐ -│ 4.5 │ -└─────────────────────────┘ -``` - -**另请参阅** - -- [中位数](#median) -- [分位数](#quantiles) - ## quantileTDigestWeighted {#quantiletdigestweighted} 使用[t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) 算法计算近似[分位数](https://en.wikipedia.org/wiki/Quantile)。 该函数考虑了每个序列成员的权重。最大误差为1%。 内存消耗 `log(n)`,这里 `n` 是值的个数。 From c80f601efb024f256719841ecbb762b79a0639d2 Mon Sep 17 00:00:00 2001 From: benbiti Date: Sat, 13 Mar 2021 16:57:55 +0800 Subject: [PATCH 170/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference.md | 55 ------------------- .../reference/quantiletdigestweighted.md | 46 ++++++++-------- 2 files changed, 23 insertions(+), 78 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index 4ee1a01f204..73316bb336f 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -287,59 +287,4 @@ SELECT quantileTimingWeighted(response_time, weight) FROM t - [中位数](#median) - [分位数](#quantiles) -## quantileTDigestWeighted {#quantiletdigestweighted} - -使用[t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) 算法计算近似[分位数](https://en.wikipedia.org/wiki/Quantile)。 该函数考虑了每个序列成员的权重。最大误差为1%。 内存消耗 `log(n)`,这里 `n` 是值的个数。 - -该功能的性能低于性能 [分位数](#quantile) 或 [时间分位](#quantiletiming). 在状态大小与精度的比率方面,这个函数比 `quantile`更优秀。 - -结果取决于运行查询的顺序,并且是不确定的。 - -当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[分位数](#quantiles)功能 - -**语法** - -``` sql -quantileTDigest(level)(expr) -``` - -别名: `medianTDigest`. - -**参数** - -- `level` — 分位数层次。可选参数。 从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 在 `level=0.5` 该函数计算 [中位数](https://en.wikipedia.org/wiki/Median). 
-- `expr` — 求职表达式,类型为:数值[数据类型](../../sql-reference/data-types/index.md#data_types),[日期](../../sql-reference/data-types/date.md)数据类型或[时间](../../sql-reference/data-types/datetime.md)数据类型。 -- `weight` — 权重序列。 权重是一个数据出现的数值。 - -**返回值** - -- 指定层次的分位数。 - -类型: - -- [Float64](../../sql-reference/data-types/float.md) 对于数字数据类型输入。 -- [日期](../../sql-reference/data-types/date.md) 如果输入值具有 `Date` 类型。 -- [日期时间](../../sql-reference/data-types/datetime.md) 如果输入值具有 `DateTime` 类型。 - -**示例** - -查询: - -``` sql -SELECT quantileTDigestWeighted(number, 1) FROM numbers(10) -``` - -结果: - -``` text -┌─quantileTDigestWeighted(number, 1)─┐ -│ 4.5 │ -└────────────────────────────────────┘ -``` - -**另请参阅** - -- [中位数](#median) -- [分位数](#quantiles) - [原始文章](https://clickhouse.tech/docs/en/query_language/agg_functions/reference/) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md index 309cbe95e95..5dfa3dfb429 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md @@ -4,47 +4,47 @@ toc_priority: 208 # quantileTDigestWeighted {#quantiletdigestweighted} -Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence using the [t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) algorithm. The function takes into account the weight of each sequence member. The maximum error is 1%. Memory consumption is `log(n)`, where `n` is a number of values. +使用[t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) 算法计算数字序列近似[分位数](https://en.wikipedia.org/wiki/Quantile)。该函数考虑了每个序列成员的权重。最大误差为1%。 内存消耗为 `log(n)`,这里 `n` 是值的个数。 -The performance of the function is lower than performance of [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile) or [quantileTiming](../../../sql-reference/aggregate-functions/reference/quantiletiming.md#quantiletiming). In terms of the ratio of State size to precision, this function is much better than `quantile`. +该函数的性能低于 [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile) 或 [quantileTiming](../../../sql-reference/aggregate-functions/reference/quantiletiming.md#quantiletiming) 的性能。 从状态大小和精度的比值来看,这个函数比 `quantile` 更优秀。 -The result depends on the order of running the query, and is nondeterministic. +结果取决于运行查询的顺序,并且是不确定的。 -When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. +当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用 [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) 函数。 -**Syntax** +**语法** ``` sql -quantileTDigest(level)(expr) +quantileTDigestWeighted(level)(expr, weight) ``` -Alias: `medianTDigest`. +别名: `medianTDigestWeighted`。 -**Parameters** +**参数** -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). 
-- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). -- `weight` — Column with weights of sequence elements. Weight is a number of value occurrences. +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 当 `level=0.5`时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `expr` — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 +- `weight` — 权重序列。 权重是一个数据出现的数值。 -**Returned value** +**返回值** -- Approximate quantile of the specified level. +- 指定层次的分位数。 -Type: +类型: -- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. -- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. -- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. +- [Float64](../../../sql-reference/data-types/float.md) 用于数字数据类型输入。 +- [Date](../../../sql-reference/data-types/date.md) 如果输入值是 `Date` 类型。 +- [DateTime](../../../sql-reference/data-types/datetime.md) 如果输入值是 `DateTime` 类型。 -**Example** +**示例** -Query: +查询: ``` sql SELECT quantileTDigestWeighted(number, 1) FROM numbers(10) ``` -Result: +结果: ``` text ┌─quantileTDigestWeighted(number, 1)─┐ @@ -52,7 +52,7 @@ Result: └────────────────────────────────────┘ ``` -**See Also** +**参见** -- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) -- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) +- [中位数](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [分位数](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) From e1df95b519b55998731a15bbdb1cb0309a745bb6 Mon Sep 17 00:00:00 2001 From: benbiti Date: Sun, 14 Mar 2021 22:15:25 +0800 Subject: [PATCH 171/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference.md | 63 ------------------- .../reference/quantileexactweighted.md | 46 +++++++------- 2 files changed, 22 insertions(+), 87 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index 73316bb336f..e73e561227c 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -58,69 +58,6 @@ SELECT quantileExact(number) FROM numbers(10) - [中位数](#median) - [分位数](#quantiles) -## quantileExactWeighted {#quantileexactweighted} - -考虑到每个元素的权重,然后准确计算数值序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 - -为了准确计算,所有输入的数据被合并为一个数组,并且部分的排序。每个输入值需要根据 `weight` 计算求和。该算法使用哈希表。正因为如此,在数据重复较多的时候使用的内存是少于[quantileExact](#quantileexact)的。 您可以使用此函数代替 `quantileExact` 并指定重量1。 - -当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[分位数](#quantiles)功能。 - -**语法** - -``` sql -quantileExactWeighted(level)(expr, weight) -``` - -别名: `medianExactWeighted`. - -**参数** - -- `level` — 分位数层次。可选参数。 从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 在 `level=0.5` 该函数计算 [中位数](https://en.wikipedia.org/wiki/Median). 
-- `expr` — 求职表达式,类型为:数值[数据类型](../../sql-reference/data-types/index.md#data_types),[日期](../../sql-reference/data-types/date.md)数据类型或[时间](../../sql-reference/data-types/datetime.md)数据类型。 -- `weight` — 权重序列。 权重是一个数据出现的数值。 - -**返回值** - -- 指定层次的分位数。 - -类型: - -- [Float64](../../sql-reference/data-types/float.md) 对于数字数据类型输入。 -- [日期](../../sql-reference/data-types/date.md) 如果输入值具有 `Date` 类型。 -- [日期时间](../../sql-reference/data-types/datetime.md) 如果输入值具有 `DateTime` 类型。 - -**示例** - -输入表: - -``` text -┌─n─┬─val─┐ -│ 0 │ 3 │ -│ 1 │ 2 │ -│ 2 │ 1 │ -│ 5 │ 4 │ -└───┴─────┘ -``` - -查询: - -``` sql -SELECT quantileExactWeighted(n, val) FROM t -``` - -结果: - -``` text -┌─quantileExactWeighted(n, val)─┐ -│ 1 │ -└───────────────────────────────┘ -``` - -**另请参阅** - -- [中位数](#median) -- [分位数](#quantiles) ## quantileTiming {#quantiletiming} diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantileexactweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/quantileexactweighted.md index 3251f8298a6..738f8ee9ae1 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantileexactweighted.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantileexactweighted.md @@ -4,39 +4,38 @@ toc_priority: 203 # quantileExactWeighted {#quantileexactweighted} -Exactly computes the [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence, taking into account the weight of each element. +考虑到每个元素的权重,然后准确计算数值序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 -To get exact value, all the passed values ​​are combined into an array, which is then partially sorted. Each value is counted with its weight, as if it is present `weight` times. A hash table is used in the algorithm. Because of this, if the passed values ​​are frequently repeated, the function consumes less RAM than [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexact). You can use this function instead of `quantileExact` and specify the weight 1. +为了准确计算,所有输入的数据被合并为一个数组,并且部分的排序。每个输入值需要根据 `weight` 计算求和。该算法使用哈希表。正因为如此,在数据重复较多的时候使用的内存是少于[quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexact)的。 您可以使用此函数代替 `quantileExact` 并指定`weight`为 1 。 -When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. +当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用 [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) 函数。 -**Syntax** +**语法** ``` sql quantileExactWeighted(level)(expr, weight) ``` -Alias: `medianExactWeighted`. +别名: `medianExactWeighted`。 -**Parameters** +**参数** +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 当 `level=0.5`时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `expr` — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 +- `weight` — 权重序列。 权重是一个数据出现的数值。 -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). 
-- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). -- `weight` — Column with weights of sequence members. Weight is a number of value occurrences. +**返回值** -**Returned value** +- 指定层次的分位数。 -- Quantile of the specified level. +类型: -Type: +- [Float64](../../../sql-reference/data-types/float.md) 对于数字数据类型输入。 +- [日期](../../../sql-reference/data-types/date.md) 如果输入值具有 `Date` 类型。 +- [日期时间](../../../sql-reference/data-types/datetime.md) 如果输入值具有 `DateTime` 类型。 -- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. -- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. -- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. +**示例** -**Example** - -Input table: +输入表: ``` text ┌─n─┬─val─┐ @@ -47,13 +46,13 @@ Input table: └───┴─────┘ ``` -Query: +查询: ``` sql SELECT quantileExactWeighted(n, val) FROM t ``` -Result: +结果: ``` text ┌─quantileExactWeighted(n, val)─┐ @@ -61,7 +60,6 @@ Result: └───────────────────────────────┘ ``` -**See Also** - -- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) -- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) +**参见** +- [中位数](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [分位数](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) From 3877f58829f73082bdf6cc46e3ce08fc71549183 Mon Sep 17 00:00:00 2001 From: benbiti Date: Mon, 15 Mar 2021 21:22:26 +0800 Subject: [PATCH 172/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference.md | 84 ------------------- .../reference/quantiletiming.md | 58 ++++++------- 2 files changed, 29 insertions(+), 113 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md index e73e561227c..efb103fc826 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ b/docs/zh/sql-reference/aggregate-functions/reference.md @@ -58,90 +58,6 @@ SELECT quantileExact(number) FROM numbers(10) - [中位数](#median) - [分位数](#quantiles) - -## quantileTiming {#quantiletiming} - -使用确定的精度计算数字数据序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 - -结果是确定性的(它不依赖于查询处理顺序)。 该函数针对描述加载网页时间或后端响应时间等分布的序列进行了优化。 - -当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[分位数](#quantiles)功能。 - -**语法** - -``` sql -quantileTiming(level)(expr) -``` - -别名: `medianTiming`. - -**参数** - -- `level` — 分位数层次。可选参数。 从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 在 `level=0.5` 该函数计算 [中位数](https://en.wikipedia.org/wiki/Median). - -- `expr` — [表达式](../syntax.md#syntax-expressions),返回 [浮动\*](../../sql-reference/data-types/float.md)类型数据。 - - - 如果输入负值,那结果是不可预期的。 - - 如果输入值大于30000(页面加载时间大于30s),那我们假设为30000。 - -**精度** - -计算是准确的,如果: - -- 值的总数不超过5670。 -- 总数值超过5670,但页面加载时间小于1024ms。 - -否则,计算结果将四舍五入到16毫秒的最接近倍数。 - -!!! note "注" - 对于计算页面加载时间分位数,此函数比 [分位数](#quantile)更有效和准确。 - -**返回值** - -- 指定层次的分位数。 - -类型: `Float32`. - -!!! 
note "注" - 如果没有值传递给函数(当使用 `quantileTimingIf`), [NaN](../../sql-reference/data-types/float.md#data_type-float-nan-inf) 被返回。 这样做的目的是将这些案例与导致零的案例区分开来。 看 [ORDER BY clause](../statements/select/order-by.md#select-order-by) 对于 `NaN` 值排序注意事项。 - -**示例** - -输入表: - -``` text -┌─response_time─┐ -│ 72 │ -│ 112 │ -│ 126 │ -│ 145 │ -│ 104 │ -│ 242 │ -│ 313 │ -│ 168 │ -│ 108 │ -└───────────────┘ -``` - -查询: - -``` sql -SELECT quantileTiming(response_time) FROM t -``` - -结果: - -``` text -┌─quantileTiming(response_time)─┐ -│ 126 │ -└───────────────────────────────┘ -``` - -**另请参阅** - -- [中位数](#median) -- [分位数](#quantiles) - ## quantileTimingWeighted {#quantiletimingweighted} 根据每个序列成员的权重,使用确定的精度计算数字序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md index 867e8b87e74..5d903c4bcd2 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md @@ -4,53 +4,53 @@ toc_priority: 204 # quantileTiming {#quantiletiming} -With the determined precision computes the [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. +使用确定的精度计算数字数据序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 -The result is deterministic (it doesn’t depend on the query processing order). The function is optimized for working with sequences which describe distributions like loading web pages times or backend response times. +结果是确定性的(它不依赖于查询处理顺序)。该函数针对描述加载网页时间或后端响应时间等分布的序列进行了优化。 -When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. +当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles)功能。 -**Syntax** +**语法** ``` sql quantileTiming(level)(expr) ``` -Alias: `medianTiming`. +别名: `medianTiming`。 -**Parameters** +**参数** -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 当 `level=0.5`时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `expr` — 求值[表达式](../../../sql-reference/syntax.md#syntax-expressions) 返回 [Float\*](../../../sql-reference/data-types/float.md) 类型数值。 -- `expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) over a column values returning a [Float\*](../../../sql-reference/data-types/float.md)-type number. + - 如果输入负值,那结果是不可预期的。 + - 如果输入值大于30000(页面加载时间大于30s),那我们假设为30000。 - - If negative values are passed to the function, the behavior is undefined. - - If the value is greater than 30,000 (a page loading time of more than 30 seconds), it is assumed to be 30,000. +**精度** -**Accuracy** +计算是准确的,如果: -The calculation is accurate if: -- Total number of values doesn’t exceed 5670. -- Total number of values exceeds 5670, but the page loading time is less than 1024ms. +- 值的总数不超过5670。 +- 总数值超过5670,但页面加载时间小于1024ms。 -Otherwise, the result of the calculation is rounded to the nearest multiple of 16 ms. 
+否则,计算结果将四舍五入到16毫秒的最接近倍数。 -!!! note "Note" - For calculating page loading time quantiles, this function is more effective and accurate than [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile). +!!! note "注" + 对于计算页面加载时间分位数, 此函数比[quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile)更有效和准确。 -**Returned value** +**返回值** -- Quantile of the specified level. +- 指定层次的分位数。 -Type: `Float32`. +类型: `Float32`。 -!!! note "Note" - If no values are passed to the function (when using `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf) is returned. The purpose of this is to differentiate these cases from cases that result in zero. See [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) for notes on sorting `NaN` values. +!!! note "注" +如果没有值传递给函数(当使用 `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf)被返回。 这样做的目的是将这些案例与导致零的案例区分开来。 参见 [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) 对于 `NaN` 值排序注意事项。 -**Example** +**示例** -Input table: +输入表: ``` text ┌─response_time─┐ @@ -66,13 +66,13 @@ Input table: └───────────────┘ ``` -Query: +查询: ``` sql SELECT quantileTiming(response_time) FROM t ``` -Result: +结果: ``` text ┌─quantileTiming(response_time)─┐ @@ -80,7 +80,7 @@ Result: └───────────────────────────────┘ ``` -**See Also** +**参见** -- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) -- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) +- [中位数](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [分位数](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) From d17c25475b0c2a66f938c3be58acb175f5de8c1c Mon Sep 17 00:00:00 2001 From: benbiti Date: Tue, 16 Mar 2021 20:47:48 +0800 Subject: [PATCH 173/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference.md | 143 ------------------ .../reference/quantiletimingweighted.md | 60 ++++---- 2 files changed, 30 insertions(+), 173 deletions(-) delete mode 100644 docs/zh/sql-reference/aggregate-functions/reference.md diff --git a/docs/zh/sql-reference/aggregate-functions/reference.md b/docs/zh/sql-reference/aggregate-functions/reference.md deleted file mode 100644 index efb103fc826..00000000000 --- a/docs/zh/sql-reference/aggregate-functions/reference.md +++ /dev/null @@ -1,143 +0,0 @@ ---- -toc_priority: 36 -toc_title: 参考手册 ---- - -# 参考手册 {#aggregate-functions-reference} - - -## quantileExact {#quantileexact} - -准确计算数字序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 - -为了准确计算,所有输入的数据被合并为一个数组,并且部分的排序。因此该函数需要 `O(n)` 的内存,n为输入数据的个数。但是对于少量数据来说,该函数还是非常有效的。 - -当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[分位数](#quantiles)功能。 - -**语法** - -``` sql -quantileExact(level)(expr) -``` - -别名: `medianExact`. - -**参数** - -- `level` — 分位数层次。可选参数。 从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 在 `level=0.5` 该函数计算 [中位数](https://en.wikipedia.org/wiki/Median). 
-- `expr` — 求职表达式,类型为:数值[数据类型](../../sql-reference/data-types/index.md#data_types),[日期](../../sql-reference/data-types/date.md)数据类型或[时间](../../sql-reference/data-types/datetime.md)数据类型。 - -**返回值** - -- 指定层次的分位数。 - -类型: - -- [Float64](../../sql-reference/data-types/float.md) 对于数字数据类型输入。 -- [日期](../../sql-reference/data-types/date.md) 如果输入值具有 `Date` 类型。 -- [日期时间](../../sql-reference/data-types/datetime.md) 如果输入值具有 `DateTime` 类型。 - -**示例** - -查询: - -``` sql -SELECT quantileExact(number) FROM numbers(10) -``` - -结果: - -``` text -┌─quantileExact(number)─┐ -│ 5 │ -└───────────────────────┘ -``` - -**另请参阅** - -- [中位数](#median) -- [分位数](#quantiles) - -## quantileTimingWeighted {#quantiletimingweighted} - -根据每个序列成员的权重,使用确定的精度计算数字序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 - -结果是确定性的(它不依赖于查询处理顺序)。 该函数针对描述加载网页时间或后端响应时间等分布的序列进行了优化。 - -当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[分位数](#quantiles)功能。 - -**语法** - -``` sql -quantileTimingWeighted(level)(expr, weight) -``` - -别名: `medianTimingWeighted`. - -**参数** - -- `level` — 分位数层次。可选参数。 从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 在 `level=0.5` 该函数计算 [中位数](https://en.wikipedia.org/wiki/Median). - -- `expr` — [表达式](../syntax.md#syntax-expressions),返回 [浮动\*](../../sql-reference/data-types/float.md)类型数据。 - - - 如果输入负值,那结果是不可预期的。 - - 如果输入值大于30000(页面加载时间大于30s),那我们假设为30000。 - -- `weight` — 权重序列。 权重是一个数据出现的数值。 - -**精度** - -计算是准确的,如果: - -- 值的总数不超过5670。 -- 总数值超过5670,但页面加载时间小于1024ms。 - -否则,计算结果将四舍五入到16毫秒的最接近倍数。 - -!!! note "注" - 对于计算页面加载时间分位数,此函数比 [分位数](#quantile)更高效和准确。 - -**返回值** - -- 指定层次的分位数。 - -类型: `Float32`. - -!!! note "注" - 如果没有值传递给函数(当使用 `quantileTimingIf`), [NaN](../../sql-reference/data-types/float.md#data_type-float-nan-inf) 被返回。 这样做的目的是将这些案例与导致零的案例区分开来。看 [ORDER BY clause](../statements/select/order-by.md#select-order-by) 对于 `NaN` 值排序注意事项。 - -**示例** - -输入表: - -``` text -┌─response_time─┬─weight─┐ -│ 68 │ 1 │ -│ 104 │ 2 │ -│ 112 │ 3 │ -│ 126 │ 2 │ -│ 138 │ 1 │ -│ 162 │ 1 │ -└───────────────┴────────┘ -``` - -查询: - -``` sql -SELECT quantileTimingWeighted(response_time, weight) FROM t -``` - -结果: - -``` text -┌─quantileTimingWeighted(response_time, weight)─┐ -│ 112 │ -└───────────────────────────────────────────────┘ -``` - -**另请参阅** - -- [中位数](#median) -- [分位数](#quantiles) - -[原始文章](https://clickhouse.tech/docs/en/query_language/agg_functions/reference/) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md index 0f8606986c8..70a2b49d413 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md @@ -4,55 +4,55 @@ toc_priority: 205 # quantileTimingWeighted {#quantiletimingweighted} -With the determined precision computes the [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence according to the weight of each sequence member. +根据每个序列成员的权重,使用确定的精度计算数字序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 -The result is deterministic (it doesn’t depend on the query processing order). The function is optimized for working with sequences which describe distributions like loading web pages times or backend response times. +结果是确定性的(它不依赖于查询处理顺序)。该函数针对描述加载网页时间或后端响应时间等分布的序列进行了优化。 -When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). 
In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. +当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles)功能。 -**Syntax** +**语法** ``` sql quantileTimingWeighted(level)(expr, weight) ``` -Alias: `medianTimingWeighted`. +别名: `medianTimingWeighted`。 -**Parameters** +**参数** -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`。默认值:0.5。 当 `level=0.5`时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `expr` — 求值[表达式](../../../sql-reference/syntax.md#syntax-expressions) 返回 [Float\*](../../../sql-reference/data-types/float.md) 类型数值。 -- `expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) over a column values returning a [Float\*](../../../sql-reference/data-types/float.md)-type number. + - 如果输入负值,那结果是不可预期的。 + - 如果输入值大于30000(页面加载时间大于30s),那我们假设为30000。 - - If negative values are passed to the function, the behavior is undefined. - - If the value is greater than 30,000 (a page loading time of more than 30 seconds), it is assumed to be 30,000. +- `weight` — 权重序列。 权重是一个数据出现的数值。 -- `weight` — Column with weights of sequence elements. Weight is a number of value occurrences. +**精度** -**Accuracy** +计算是准确的,如果: -The calculation is accurate if: -- Total number of values doesn’t exceed 5670. -- Total number of values exceeds 5670, but the page loading time is less than 1024ms. +- 值的总数不超过5670。 +- 总数值超过5670,但页面加载时间小于1024ms。 -Otherwise, the result of the calculation is rounded to the nearest multiple of 16 ms. +否则,计算结果将四舍五入到16毫秒的最接近倍数。 -!!! note "Note" - For calculating page loading time quantiles, this function is more effective and accurate than [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile). +!!! note "注" + 对于计算页面加载时间分位数, 此函数比[quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile)更有效和准确。 -**Returned value** +**返回值** -- Quantile of the specified level. +- 指定层次的分位数。 -Type: `Float32`. +类型: `Float32`。 -!!! note "Note" - If no values are passed to the function (when using `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf) is returned. The purpose of this is to differentiate these cases from cases that result in zero. See [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) for notes on sorting `NaN` values. +!!! 
note "注" +如果没有值传递给函数(当使用 `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf)被返回。 这样做的目的是将这些案例与导致零的案例区分开来。 参见 [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) 对于 `NaN` 值排序注意事项。 -**Example** +**示例** -Input table: +输入表: ``` text ┌─response_time─┬─weight─┐ @@ -65,13 +65,13 @@ Input table: └───────────────┴────────┘ ``` -Query: +查询: ``` sql SELECT quantileTimingWeighted(response_time, weight) FROM t ``` -Result: +结果: ``` text ┌─quantileTimingWeighted(response_time, weight)─┐ @@ -79,7 +79,7 @@ Result: └───────────────────────────────────────────────┘ ``` -**See Also** +**参见** -- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) -- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) +- [中位数](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [分位数](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) From 53b9afda06891afbf83518580bc7a2e22e0ad113 Mon Sep 17 00:00:00 2001 From: benbiti Date: Wed, 17 Mar 2021 21:06:35 +0800 Subject: [PATCH 174/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference/anyheavy.md | 2 +- .../reference/avgweighted.md | 2 +- .../reference/initializeAggregation.md | 24 ++-- .../aggregate-functions/reference/quantile.md | 2 +- .../reference/quantiledeterministic.md | 4 +- .../reference/quantileexact.md | 128 +++++++++--------- .../reference/quantileexactweighted.md | 2 +- .../reference/quantiletdigest.md | 2 +- .../reference/quantiletdigestweighted.md | 2 +- .../reference/quantiletiming.md | 4 +- .../reference/quantiletimingweighted.md | 2 +- 11 files changed, 90 insertions(+), 84 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/anyheavy.md b/docs/zh/sql-reference/aggregate-functions/reference/anyheavy.md index f47027bd0c4..45a96309ac8 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/anyheavy.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/anyheavy.md @@ -12,7 +12,7 @@ anyHeavy(column) **参数** -- `column` – The column name. +- `column` – The column name。 **示例** diff --git a/docs/zh/sql-reference/aggregate-functions/reference/avgweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/avgweighted.md index ba53678b704..9b732f57b4a 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/avgweighted.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/avgweighted.md @@ -5,7 +5,7 @@ toc_priority: 107 # avgWeighted {#avgweighted} -计算 [加权算术平均值](https://en.wikipedia.org/wiki/Weighted_arithmetic_mean). +计算 [加权算术平均值](https://en.wikipedia.org/wiki/Weighted_arithmetic_mean)。 **语法** diff --git a/docs/zh/sql-reference/aggregate-functions/reference/initializeAggregation.md b/docs/zh/sql-reference/aggregate-functions/reference/initializeAggregation.md index ea44d5f1ddd..ba54d343b44 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/initializeAggregation.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/initializeAggregation.md @@ -4,33 +4,33 @@ toc_priority: 150 ## initializeAggregation {#initializeaggregation} -Initializes aggregation for your input rows. It is intended for the functions with the suffix `State`. -Use it for tests or to process columns of types `AggregateFunction` and `AggregationgMergeTree`. 
+初始化你输入行的聚合。用于后缀是 `State` 的函数。 +用它来测试或处理 `AggregateFunction` 和 `AggregationgMergeTree` 类型的列。 -**Syntax** +**语法** ``` sql initializeAggregation (aggregate_function, column_1, column_2); ``` -**Parameters** +**参数** -- `aggregate_function` — Name of the aggregation function. The state of this function — the creating one. [String](../../../sql-reference/data-types/string.md#string). -- `column_n` — The column to translate it into the function as it's argument. [String](../../../sql-reference/data-types/string.md#string). +- `aggregate_function` — 聚合函数名。 这个函数的状态 — 正创建的。[String](../../../sql-reference/data-types/string.md#string)。 +- `column_n` — 将其转换为函数的参数的列。[String](../../../sql-reference/data-types/string.md#string)。 -**Returned value(s)** +**返回值** -Returns the result of the aggregation for your input rows. The return type will be the same as the return type of function, that `initializeAgregation` takes as first argument. -For example for functions with the suffix `State` the return type will be `AggregateFunction`. +返回输入行的聚合结果。返回类型将与 `initializeAgregation` 用作第一个参数的函数的返回类型相同。 +例如,对于后缀为 `State` 的函数,返回类型将是 `AggregateFunction`。 -**Example** +**示例** -Query: +查询: ```sql SELECT uniqMerge(state) FROM (SELECT initializeAggregation('uniqState', number % 3) AS state FROM system.numbers LIMIT 10000); ``` -Result: +结果: ┌─uniqMerge(state)─┐ │ 3 │ diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantile.md b/docs/zh/sql-reference/aggregate-functions/reference/quantile.md index 26882465ff3..4519688dc7e 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantile.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantile.md @@ -19,7 +19,7 @@ quantile(level)(expr) **参数** -- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 当 `level=0.5`时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。我们推荐 `level` 值的范围为 `[0.01, 0.99]`。默认值:0.5。当 `level=0.5` 时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 - `expr` — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 **返回值** diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiledeterministic.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiledeterministic.md index 8e327472864..c6c6b0a63de 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantiledeterministic.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiledeterministic.md @@ -20,8 +20,8 @@ quantileDeterministic(level)(expr, determinator) **参数** -- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 
默认值:0.5。 当 `level=0.5`时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 -- `expr` — — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`。默认值:0.5。 当 `level=0.5`时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `expr` — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 - `determinator` — 一个数字,其hash被用来代替在水塘抽样中随机生成的数字,这样可以保证取样的确定性。你可以使用用户ID或者事件ID等任何正数,但是如果相同的 `determinator` 出现多次,那结果很可能不正确。 **返回值** diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantileexact.md b/docs/zh/sql-reference/aggregate-functions/reference/quantileexact.md index a39f724f368..a8d39c35700 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantileexact.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantileexact.md @@ -4,44 +4,46 @@ toc_priority: 202 # quantileExact {#quantileexact} -Exactly computes the [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. -To get exact value, all the passed values ​​are combined into an array, which is then partially sorted. Therefore, the function consumes `O(n)` memory, where `n` is a number of values that were passed. However, for a small number of values, the function is very effective. +准确计算数字序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 -When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. +为了准确计算,所有输入的数据被合并为一个数组,并且部分的排序。因此该函数需要 `O(n)` 的内存,n为输入数据的个数。但是对于少量数据来说,该函数还是非常有效的。 -**Syntax** +当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用 [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) 函数。 + +**语法** ``` sql quantileExact(level)(expr) ``` -Alias: `medianExact`. +别名: `medianExact`。 -**Parameters** +**参数** -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). -- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。我们推荐 `level` 值的范围为 `[0.01, 0.99]`。默认值:0.5。当 `level=0.5` 时,该函数计算[中位数](https://en.wikipedia.org/wiki/Median)。 +- `expr` — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 -**Returned value** +**返回值** -- Quantile of the specified level. +- 指定层次的分位数。 -Type: -- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. -- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. -- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. 
+类型: -**Example** +- [Float64](../../../sql-reference/data-types/float.md) 对于数字数据类型输入。 +- [日期](../../../sql-reference/data-types/date.md) 如果输入值具有 `Date` 类型。 +- [日期时间](../../../sql-reference/data-types/datetime.md) 如果输入值具有 `DateTime` 类型。 -Query: +**示例** + +查询: ``` sql SELECT quantileExact(number) FROM numbers(10) ``` -Result: +结果: ``` text ┌─quantileExact(number)─┐ @@ -51,13 +53,15 @@ Result: # quantileExactLow {#quantileexactlow} -Similar to `quantileExact`, this computes the exact [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. +和 `quantileExact` 相似, 准确计算数字序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 -To get the exact value, all the passed values are combined into an array, which is then fully sorted. The sorting [algorithm's](https://en.cppreference.com/w/cpp/algorithm/sort) complexity is `O(N·log(N))`, where `N = std::distance(first, last)` comparisons. +为了准确计算,所有输入的数据被合并为一个数组,并且全排序。这排序[算法](https://en.cppreference.com/w/cpp/algorithm/sort)的复杂度是 `O(N·log(N))`, 其中 `N = std::distance(first, last)` 比较。 -The return value depends on the quantile level and the number of elements in the selection, i.e. if the level is 0.5, then the function returns the lower median value for an even number of elements and the middle median value for an odd number of elements. Median is calculated similarly to the [median_low](https://docs.python.org/3/library/statistics.html#statistics.median_low) implementation which is used in python. +返回值取决于分位数级别和所选取的元素数量,即如果级别是 0.5, 函数返回偶数元素的低位中位数,奇数元素的中位数。中位数计算类似于 python 中使用的[median_low](https://docs.python.org/3/library/statistics.html#statistics.median_low)的实现。 -For all other levels, the element at the index corresponding to the value of `level * size_of_array` is returned. For example: +对于所有其他级别, 返回 `level * size_of_array` 值所对应的索引的元素值。 + +例如: ``` sql SELECT quantileExactLow(0.1)(number) FROM numbers(10) @@ -66,99 +70,101 @@ SELECT quantileExactLow(0.1)(number) FROM numbers(10) │ 1 │ └───────────────────────────────┘ ``` - -When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. -**Syntax** +当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用 [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) 函数。 + +**语法** ``` sql -quantileExact(level)(expr) +quantileExactLow(level)(expr) ``` -Alias: `medianExactLow`. +别名: `medianExactLow`。 -**Parameters** +**参数** -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). -- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). 
+- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。我们推荐 `level` 值的范围为 `[0.01, 0.99]`。默认值:0.5。当 `level=0.5` 时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `expr` — — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 -**Returned value** +**返回值** -- Quantile of the specified level. +- 指定层次的分位数。 -Type: +类型: -- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. -- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. -- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. +- [Float64](../../../sql-reference/data-types/float.md) 用于数字数据类型输入。 +- [Date](../../../sql-reference/data-types/date.md) 如果输入值是 `Date` 类型。 +- [DateTime](../../../sql-reference/data-types/datetime.md) 如果输入值是 `DateTime` 类型。 -**Example** +**示例** -Query: +查询: ``` sql SELECT quantileExactLow(number) FROM numbers(10) ``` -Result: +结果: ``` text ┌─quantileExactLow(number)─┐ │ 4 │ └──────────────────────────┘ ``` + # quantileExactHigh {#quantileexacthigh} -Similar to `quantileExact`, this computes the exact [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. +和 `quantileExact` 相似, 准确计算数字序列的[分位数](https://en.wikipedia.org/wiki/Quantile)。 -All the passed values are combined into an array, which is then fully sorted, -to get the exact value. The sorting [algorithm's](https://en.cppreference.com/w/cpp/algorithm/sort) complexity is `O(N·log(N))`, where `N = std::distance(first, last)` comparisons. +为了准确计算,所有输入的数据被合并为一个数组,并且全排序。这排序[算法](https://en.cppreference.com/w/cpp/algorithm/sort)的复杂度是 `O(N·log(N))`, 其中 `N = std::distance(first, last)` 比较。 -The return value depends on the quantile level and the number of elements in the selection, i.e. if the level is 0.5, then the function returns the higher median value for an even number of elements and the middle median value for an odd number of elements. Median is calculated similarly to the [median_high](https://docs.python.org/3/library/statistics.html#statistics.median_high) implementation which is used in python. For all other levels, the element at the index corresponding to the value of `level * size_of_array` is returned. +返回值取决于分位数级别和所选取的元素数量,即如果级别是 0.5, 函数返回偶数元素的低位中位数,奇数元素的中位数。中位数计算类似于 python 中使用的[median_high](https://docs.python.org/3/library/statistics.html#statistics.median_high)的实现。 -This implementation behaves exactly similar to the current `quantileExact` implementation. +对于所有其他级别, 返回 `level * size_of_array` 值所对应的索引的元素值。 -When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. +这个实现与当前的 `quantileExact` 实现完全相似。 -**Syntax** +当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用 [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) 函数。 + +**语法** ``` sql quantileExactHigh(level)(expr) ``` -Alias: `medianExactHigh`. +别名: `medianExactHigh`。 -**Parameters** +**参数** -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). 
-- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。我们推荐 `level` 值的范围为 `[0.01, 0.99]`。默认值:0.5。当 `level=0.5` 时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `expr` — — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 -**Returned value** +**返回值** -- Quantile of the specified level. +- 指定层次的分位数。 -Type: +类型: -- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. -- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. -- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. +- [Float64](../../../sql-reference/data-types/float.md) 用于数字数据类型输入。 +- [Date](../../../sql-reference/data-types/date.md) 如果输入值是 `Date` 类型。 +- [DateTime](../../../sql-reference/data-types/datetime.md) 如果输入值是 `DateTime` 类型。 -**Example** +**示例** -Query: +查询: ``` sql SELECT quantileExactHigh(number) FROM numbers(10) ``` -Result: +结果: ``` text ┌─quantileExactHigh(number)─┐ │ 5 │ └───────────────────────────┘ ``` -**See Also** +**参见** -- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) -- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) +- [中位数](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [分位数](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantileexactweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/quantileexactweighted.md index 738f8ee9ae1..0b20d780094 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantileexactweighted.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantileexactweighted.md @@ -19,7 +19,7 @@ quantileExactWeighted(level)(expr, weight) 别名: `medianExactWeighted`。 **参数** -- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 当 `level=0.5`时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。当 `level=0.5` 时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 - `expr` — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 - `weight` — 权重序列。 权重是一个数据出现的数值。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigest.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigest.md index be8e55da817..fb186da299e 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigest.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigest.md @@ -22,7 +22,7 @@ quantileTDigest(level)(expr) **参数** -- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 
默认值:0.5。 当 `level=0.5`时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。我们推荐 `level` 值的范围为 `[0.01, 0.99]` 。默认值:0.5。当 `level=0.5` 时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 - `expr` — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 **返回值** diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md index 5dfa3dfb429..cf78c4c03bc 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md @@ -22,7 +22,7 @@ quantileTDigestWeighted(level)(expr, weight) **参数** -- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 默认值:0.5。 当 `level=0.5`时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。我们推荐 `level` 值的范围为 `[0.01, 0.99]` 。默认值:0.5。 当 `level=0.5` 时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 - `expr` — 求值表达式,类型为数值类型[data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) 或 [DateTime](../../../sql-reference/data-types/datetime.md)。 - `weight` — 权重序列。 权重是一个数据出现的数值。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md index 5d903c4bcd2..a193b60338a 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletiming.md @@ -8,7 +8,7 @@ toc_priority: 204 结果是确定性的(它不依赖于查询处理顺序)。该函数针对描述加载网页时间或后端响应时间等分布的序列进行了优化。 -当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles)功能。 +当在一个查询中使用多个不同层次的 `quantile*` 时,内部状态不会被组合(即查询的工作效率低于组合情况)。在这种情况下,使用[quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles)函数。 **语法** @@ -20,7 +20,7 @@ quantileTiming(level)(expr) **参数** -- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`. 
默认值:0.5。 当 `level=0.5`时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。我们推荐 `level` 值的范围为 `[0.01, 0.99]` 。默认值:0.5。当 `level=0.5` 时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 - `expr` — 求值[表达式](../../../sql-reference/syntax.md#syntax-expressions) 返回 [Float\*](../../../sql-reference/data-types/float.md) 类型数值。 - 如果输入负值,那结果是不可预期的。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md index 70a2b49d413..79f3bfc38b5 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md @@ -20,7 +20,7 @@ quantileTimingWeighted(level)(expr, weight) **参数** -- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。 我们推荐 `level` 值的范围为 `[0.01, 0.99]`。默认值:0.5。 当 `level=0.5`时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 +- `level` — 分位数层次。可选参数。从0到1的一个float类型的常量。我们推荐 `level` 值的范围为 `[0.01, 0.99]` 。默认值:0.5。当 `level=0.5` 时,该函数计算 [中位数](https://en.wikipedia.org/wiki/Median)。 - `expr` — 求值[表达式](../../../sql-reference/syntax.md#syntax-expressions) 返回 [Float\*](../../../sql-reference/data-types/float.md) 类型数值。 - 如果输入负值,那结果是不可预期的。 From 41de78e8255ff043a1e6616a4502b2cb2791fda3 Mon Sep 17 00:00:00 2001 From: benbiti Date: Thu, 18 Mar 2021 13:15:17 +0800 Subject: [PATCH 175/260] WIP update-aggregate-funcions-in-zh --- .../reference/quantiletimingweighted.md | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md index 79f3bfc38b5..7b130dbddbd 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiletimingweighted.md @@ -79,6 +79,39 @@ SELECT quantileTimingWeighted(response_time, weight) FROM t └───────────────────────────────────────────────┘ ``` +# quantilesTimingWeighted {#quantilestimingweighted} + +类似于 `quantileTimingWeighted` , 但接受多个分位数层次参数,并返回一个由这些分位数值组成的数组。 + +**示例** + +输入表: + +``` text +┌─response_time─┬─weight─┐ +│ 68 │ 1 │ +│ 104 │ 2 │ +│ 112 │ 3 │ +│ 126 │ 2 │ +│ 138 │ 1 │ +│ 162 │ 1 │ +└───────────────┴────────┘ +``` + +查询: + +``` sql +SELECT quantilesTimingWeighted(0,5, 0.99)(response_time, weight) FROM t +``` + +结果: + +``` text +┌─quantilesTimingWeighted(0.5, 0.99)(response_time, weight)─┐ +│ [112,162] │ +└───────────────────────────────────────────────────────────┘ +``` + **参见** - [中位数](../../../sql-reference/aggregate-functions/reference/median.md#median) From 502a5d62da2f8ff737c82bbcd611d7b5eebc69d4 Mon Sep 17 00:00:00 2001 From: benbiti Date: Fri, 19 Mar 2021 00:02:22 +0800 Subject: [PATCH 176/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference/covarpop.md | 2 +- .../aggregate-functions/reference/covarsamp.md | 2 +- .../aggregate-functions/reference/grouparray.md | 8 +++----- .../aggregate-functions/reference/groupuniqarray.md | 4 ++-- .../sql-reference/aggregate-functions/reference/maxmap.md | 4 ++-- .../sql-reference/aggregate-functions/reference/minmap.md | 4 ++-- .../aggregate-functions/reference/quantiles.md | 2 +- .../reference/stochasticlogisticregression.md | 2 ++ .../sql-reference/aggregate-functions/reference/summap.md | 4 ++-- 9 files changed, 16 insertions(+), 16 deletions(-) diff --git 
a/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md b/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md index cddad69e56a..c6f43c6b9e9 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/covarpop.md @@ -6,7 +6,7 @@ toc_priority: 36 **语法** ``` sql -`covarPop(x, y)` +covarPop(x, y) ``` 计算 `Σ((x - x̅)(y - y̅)) / n` 的值。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md index 5ee18cf3f97..5ef5104504b 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/covarsamp.md @@ -6,7 +6,7 @@ toc_priority: 37 **语法** ``` sql -`covarSamp(x, y)` +covarSamp(x, y) ``` 计算 `Σ((x - x̅)(y - y̅)) / (n - 1)` 的值。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparray.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparray.md index 81cd38db8b5..0a8f1cd326d 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/grouparray.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparray.md @@ -6,11 +6,9 @@ toc_priority: 110 **语法** ``` sql -`groupArray(x)` - -or - -`groupArray(max_size)(x)` +groupArray(x) +或 +groupArray(max_size)(x) ``` 创建参数值的数组。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/groupuniqarray.md b/docs/zh/sql-reference/aggregate-functions/reference/groupuniqarray.md index d89e575cfa4..f371361bbf6 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/groupuniqarray.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/groupuniqarray.md @@ -7,9 +7,9 @@ toc_priority: 111 **语法** ``` sql -`groupUniqArray(x)` +groupUniqArray(x) 或 -`groupUniqArray(max_size)(x)` +groupUniqArray(max_size)(x) ``` 从不同的参数值创建一个数组。 内存消耗和 [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md) 函数是一样的。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/maxmap.md b/docs/zh/sql-reference/aggregate-functions/reference/maxmap.md index 86352792dd7..4d91d1e75fd 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/maxmap.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/maxmap.md @@ -7,9 +7,9 @@ toc_priority: 143 **语法** ```sql -`maxMap(key, value)` +maxMap(key, value) 或 -`maxMap(Tuple(key, value))` +maxMap(Tuple(key, value)) ``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/minmap.md b/docs/zh/sql-reference/aggregate-functions/reference/minmap.md index efbb3448b60..8e0022ac174 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/minmap.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/minmap.md @@ -7,9 +7,9 @@ toc_priority: 142 **语法** ```sql -`minMap(key, value)` +minMap(key, value) 或 -`minMap(Tuple(key, value))` +minMap(Tuple(key, value)) ``` 根据 `key` 数组中指定的键对 `value` 数组计算最小值。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md b/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md index 111b1c086c1..044c4d6d24e 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantiles.md @@ -6,7 +6,7 @@ toc_priority: 201 **语法** ``` sql -`quantiles(level1, level2, …)(x)` +quantiles(level1, level2, …)(x) ``` 所有分位数函数(quantile)也有相应的分位数(quantiles)函数: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, 
`quantilesExactWeighted`, `quantilesTDigest`。 这些函数一次计算所列的级别的所有分位数, 并返回结果值的数组。 diff --git a/docs/zh/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md b/docs/zh/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md index a2e9ffa89e6..5ed2fb74b89 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md @@ -12,6 +12,8 @@ toc_priority: 222 `learning rate`, `l2 regularization coefficient`, `mini-batch size`, `method for updating weights`. 欲了解更多信息,参见 [参数] (#agg_functions-stochasticlinearregression-parameters). +**语法** + ``` sql stochasticLogisticRegression(1.0, 1.0, 10, 'SGD') ``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/summap.md b/docs/zh/sql-reference/aggregate-functions/reference/summap.md index 2eeb5be65e9..4a92a1ea1b0 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/summap.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/summap.md @@ -7,9 +7,9 @@ toc_priority: 141 **语法** ``` sql -`sumMap(key, value)` +sumMap(key, value) 或 -`sumMap(Tuple(key, value))` +sumMap(Tuple(key, value)) ``` 根据 `key` 数组中指定的键对 `value` 数组进行求和。 From 31d6a4369be2478009f5529a61eb314dc136f481 Mon Sep 17 00:00:00 2001 From: benbiti Date: Fri, 19 Mar 2021 15:39:10 +0800 Subject: [PATCH 177/260] WIP update-aggregate-funcions-in-zh --- .../aggregate-functions/reference/deltasum.md | 23 ++++++ .../reference/mannwhitneyutest.md | 72 +++++++++++++++++++ .../reference/studentttest.md | 64 +++++++++++++++++ .../reference/welchttest.md | 62 ++++++++++++++++ 4 files changed, 221 insertions(+) create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/deltasum.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/mannwhitneyutest.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/studentttest.md create mode 100644 docs/zh/sql-reference/aggregate-functions/reference/welchttest.md diff --git a/docs/zh/sql-reference/aggregate-functions/reference/deltasum.md b/docs/zh/sql-reference/aggregate-functions/reference/deltasum.md new file mode 100644 index 00000000000..3f099944cee --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/deltasum.md @@ -0,0 +1,23 @@ +--- +toc_priority: 141 +--- + +# deltaSum {#agg_functions-deltasum} + +**语法** + +``` sql +deltaSum(value) +``` + +计算连续行之间的差值和。如果差值为负,则忽略。 +`value`必须是整型或浮点类型。 + +示例: + +```sql +select deltaSum(arrayJoin([1, 2, 3])); -- => 2 +select deltaSum(arrayJoin([1, 2, 3, 0, 3, 4, 2, 3])); -- => 7 +select deltaSum(arrayJoin([2.25, 3, 4.5])); -- => 2.25 +``` + diff --git a/docs/zh/sql-reference/aggregate-functions/reference/mannwhitneyutest.md b/docs/zh/sql-reference/aggregate-functions/reference/mannwhitneyutest.md new file mode 100644 index 00000000000..016a650b61b --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/mannwhitneyutest.md @@ -0,0 +1,72 @@ +--- +toc_priority: 310 +toc_title: mannWhitneyUTest +--- + +# mannWhitneyUTest {#mannwhitneyutest} + +对两个总体的样本应用 Mann-Whitney 秩检验。 + +**语法** + +``` sql +mannWhitneyUTest[(alternative[, continuity_correction])](sample_data, sample_index) +``` + +两个样本的值都在 `sample_data` 列中。如果 `sample_index` 等于 0,则该行的值属于第一个总体的样本。 反之属于第二个总体的样本。 +零假设是两个总体随机相等。也可以检验单边假设。该检验不假设数据具有正态分布。 + +**参数** + +- `sample_data` — 样本数据。[Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) 或 
[Decimal](../../../sql-reference/data-types/decimal.md)。 +- `sample_index` — 样本索引。[Integer](../../../sql-reference/data-types/int-uint.md). + +**参数** + +- `alternative` — 供选假设。(可选,默认值是: `'two-sided'` 。) [String](../../../sql-reference/data-types/string.md)。 + - `'two-sided'`; + - `'greater'`; + - `'less'`。 +- `continuity_correction` — 如果不为0,那么将对p值进行正态近似的连续性修正。(可选,默认:1。) [UInt64](../../../sql-reference/data-types/int-uint.md)。 + +**返回值** + +[元组](../../../sql-reference/data-types/tuple.md),有两个元素: + +- 计算出U统计量。[Float64](../../../sql-reference/data-types/float.md)。 +- 计算出的p值。[Float64](../../../sql-reference/data-types/float.md)。 + + +**示例** + +输入表: + +``` text +┌─sample_data─┬─sample_index─┐ +│ 10 │ 0 │ +│ 11 │ 0 │ +│ 12 │ 0 │ +│ 1 │ 1 │ +│ 2 │ 1 │ +│ 3 │ 1 │ +└─────────────┴──────────────┘ +``` + +查询: + +``` sql +SELECT mannWhitneyUTest('greater')(sample_data, sample_index) FROM mww_ttest; +``` + +结果: + +``` text +┌─mannWhitneyUTest('greater')(sample_data, sample_index)─┐ +│ (9,0.04042779918503192) │ +└────────────────────────────────────────────────────────┘ +``` + +**参见** + +- [Mann–Whitney U test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test) +- [Stochastic ordering](https://en.wikipedia.org/wiki/Stochastic_ordering) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/studentttest.md b/docs/zh/sql-reference/aggregate-functions/reference/studentttest.md new file mode 100644 index 00000000000..6d84e728330 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/studentttest.md @@ -0,0 +1,64 @@ +--- +toc_priority: 300 +toc_title: studentTTest +--- + +# studentTTest {#studentttest} + +对两个总体的样本应用t检验。 + +**语法** + +``` sql +studentTTest(sample_data, sample_index) +``` + +两个样本的值都在 `sample_data` 列中。如果 `sample_index` 等于 0,则该行的值属于第一个总体的样本。 反之属于第二个总体的样本。 +零假设是总体的均值相等。假设为方差相等的正态分布。 + +**参数** + +- `sample_data` — 样本数据。[Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) 或 [Decimal](../../../sql-reference/data-types/decimal.md)。 +- `sample_index` — 样本索引。[Integer](../../../sql-reference/data-types/int-uint.md)。 + +**返回值** + +[元组](../../../sql-reference/data-types/tuple.md),有两个元素: + +- 计算出的t统计量。 [Float64](../../../sql-reference/data-types/float.md)。 +- 计算出的p值。[Float64](../../../sql-reference/data-types/float.md)。 + + +**示例** + +输入表: + +``` text +┌─sample_data─┬─sample_index─┐ +│ 20.3 │ 0 │ +│ 21.1 │ 0 │ +│ 21.9 │ 1 │ +│ 21.7 │ 0 │ +│ 19.9 │ 1 │ +│ 21.8 │ 1 │ +└─────────────┴──────────────┘ +``` + +查询: + +``` sql +SELECT studentTTest(sample_data, sample_index) FROM student_ttest; +``` + +结果: + +``` text +┌─studentTTest(sample_data, sample_index)───┐ +│ (-0.21739130434783777,0.8385421208415731) │ +└───────────────────────────────────────────┘ +``` + +**参见** + +- [Student's t-test](https://en.wikipedia.org/wiki/Student%27s_t-test) +- [welchTTest function](../../../sql-reference/aggregate-functions/reference/welchttest.md#welchttest) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/welchttest.md b/docs/zh/sql-reference/aggregate-functions/reference/welchttest.md new file mode 100644 index 00000000000..44b8e81d4d9 --- /dev/null +++ b/docs/zh/sql-reference/aggregate-functions/reference/welchttest.md @@ -0,0 +1,62 @@ +--- +toc_priority: 301 +toc_title: welchTTest +--- + +# welchTTest {#welchttest} + +对两个总体的样本应用 Welch t检验。 + +**语法** + +``` sql +welchTTest(sample_data, sample_index) +``` +两个样本的值都在 `sample_data` 列中。如果 `sample_index` 等于 0,则该行的值属于第一个总体的样本。 反之属于第二个总体的样本。 +零假设是群体的均值相等。假设为正态分布。总体可能具有不相等的方差。 + 
+**参数** + +- `sample_data` — 样本数据。[Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) 或 [Decimal](../../../sql-reference/data-types/decimal.md). +- `sample_index` — 样本索引。[Integer](../../../sql-reference/data-types/int-uint.md). + +**返回值** + +[元组](../../../sql-reference/data-types/tuple.md),有两个元素: + +- 计算出的t统计量。 [Float64](../../../sql-reference/data-types/float.md)。 +- 计算出的p值。[Float64](../../../sql-reference/data-types/float.md)。 + +**示例** + +输入表: + +``` text +┌─sample_data─┬─sample_index─┐ +│ 20.3 │ 0 │ +│ 22.1 │ 0 │ +│ 21.9 │ 0 │ +│ 18.9 │ 1 │ +│ 20.3 │ 1 │ +│ 19 │ 1 │ +└─────────────┴──────────────┘ +``` + +查询: + +``` sql +SELECT welchTTest(sample_data, sample_index) FROM welch_ttest; +``` + +结果: + +``` text +┌─welchTTest(sample_data, sample_index)─────┐ +│ (2.7988719532211235,0.051807360348581945) │ +└───────────────────────────────────────────┘ +``` + +**参见** + +- [Welch's t-test](https://en.wikipedia.org/wiki/Welch%27s_t-test) +- [studentTTest function](../../../sql-reference/aggregate-functions/reference/studentttest.md#studentttest) From 3ae419f01ca8a792f1837c875c19668b7d0ddd10 Mon Sep 17 00:00:00 2001 From: benbiti Date: Fri, 19 Mar 2021 23:31:56 +0800 Subject: [PATCH 178/260] WIP update-aggregate-funcions-in-zh:fix links --- docs/zh/guides/apply-catboost-model.md | 2 +- docs/zh/sql-reference/aggregate-functions/combinators.md | 4 ++-- .../sql-reference/aggregate-functions/parametric-functions.md | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/zh/guides/apply-catboost-model.md b/docs/zh/guides/apply-catboost-model.md index 5e374751052..9002e5cf005 100644 --- a/docs/zh/guides/apply-catboost-model.md +++ b/docs/zh/guides/apply-catboost-model.md @@ -238,6 +238,6 @@ FROM ``` !!! 
note "注" - 查看函数说明 [avg()](../sql-reference/aggregate-functions/reference.md#agg_function-avg) 和 [log()](../sql-reference/functions/math-functions.md) 。 + 查看函数说明 [avg()](../sql-reference/aggregate-functions/reference/avg.md#agg_function-avg) 和 [log()](../sql-reference/functions/math-functions.md) 。 [原始文章](https://clickhouse.tech/docs/en/guides/apply_catboost_model/) diff --git a/docs/zh/sql-reference/aggregate-functions/combinators.md b/docs/zh/sql-reference/aggregate-functions/combinators.md index c458097a5fb..6d1cd9c775c 100644 --- a/docs/zh/sql-reference/aggregate-functions/combinators.md +++ b/docs/zh/sql-reference/aggregate-functions/combinators.md @@ -27,7 +27,7 @@ toc_title: 聚合函数组合器 ## -State {#agg-functions-combinator-state} -如果应用此combinator,则聚合函数不会返回结果值(例如唯一值的数量 [uniq](reference.md#agg_function-uniq) 函数),但是返回聚合的中间状态(对于 `uniq`,返回的是计算唯一值的数量的哈希表)。 这是一个 `AggregateFunction(...)` 可用于进一步处理或存储在表中以完成稍后的聚合。 +如果应用此combinator,则聚合函数不会返回结果值(例如唯一值的数量 [uniq](./reference/uniq.md#agg_function-uniq) 函数),但是返回聚合的中间状态(对于 `uniq`,返回的是计算唯一值的数量的哈希表)。 这是一个 `AggregateFunction(...)` 可用于进一步处理或存储在表中以完成稍后的聚合。 要使用这些状态,请使用: @@ -209,7 +209,7 @@ FROM 让我们得到的人的名字,他们的年龄在于的时间间隔 `[30,60)` 和 `[60,75)`。 由于我们使用整数表示的年龄,我们得到的年龄 `[30, 59]` 和 `[60,74]` 间隔。 -要在数组中聚合名称,我们使用 [groupArray](reference.md#agg_function-grouparray) 聚合函数。 这需要一个参数。 在我们的例子中,它是 `name` 列。 `groupArrayResample` 函数应该使用 `age` 按年龄聚合名称, 要定义所需的时间间隔,我们传入 `30, 75, 30` 参数给 `groupArrayResample` 函数。 +要在数组中聚合名称,我们使用 [groupArray](./reference/grouparray.md#agg_function-grouparray) 聚合函数。 这需要一个参数。 在我们的例子中,它是 `name` 列。 `groupArrayResample` 函数应该使用 `age` 按年龄聚合名称, 要定义所需的时间间隔,我们传入 `30, 75, 30` 参数给 `groupArrayResample` 函数。 ``` sql SELECT groupArrayResample(30, 75, 30)(name, age) FROM people diff --git a/docs/zh/sql-reference/aggregate-functions/parametric-functions.md b/docs/zh/sql-reference/aggregate-functions/parametric-functions.md index d151bbc3957..be9166e5737 100644 --- a/docs/zh/sql-reference/aggregate-functions/parametric-functions.md +++ b/docs/zh/sql-reference/aggregate-functions/parametric-functions.md @@ -493,6 +493,6 @@ FROM ## sumMapFiltered(keys_to_keep)(keys, values) {#summapfilteredkeys-to-keepkeys-values} -和 [sumMap](reference.md#agg_functions-summap) 基本一致, 除了一个键数组作为参数传递。这在使用高基数key时尤其有用。 +和 [sumMap](./reference/summap.md#agg_functions-summap) 基本一致, 除了一个键数组作为参数传递。这在使用高基数key时尤其有用。 [原始文章](https://clickhouse.tech/docs/en/query_language/agg_functions/parametric_functions/) From aa6aae4d812841839573e20120671e68a6405d91 Mon Sep 17 00:00:00 2001 From: benbiti Date: Mon, 22 Mar 2021 00:03:15 +0800 Subject: [PATCH 179/260] simpleaggregatefunction update links --- .../data-types/simpleaggregatefunction.md | 45 ++++++++++--------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/docs/zh/sql-reference/data-types/simpleaggregatefunction.md b/docs/zh/sql-reference/data-types/simpleaggregatefunction.md index e827adb817e..38d7699c176 100644 --- a/docs/zh/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/zh/sql-reference/data-types/simpleaggregatefunction.md @@ -1,26 +1,31 @@ ---- -machine_translated: true -machine_translated_rev: 71d72c1f237f4a553fe91ba6c6c633e81a49e35b ---- - # SimpleAggregateFunction {#data-type-simpleaggregatefunction} -`SimpleAggregateFunction(name, types_of_arguments…)` 数据类型存储聚合函数的当前值,而不将其完整状态存储为 [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md) 有 此优化可应用于具有以下属性的函数:应用函数的结果 `f` 到行集 `S1 UNION ALL S2` 可以通过应用来获得 `f` 行的部分单独设置,然后再次应用 `f` 到结果: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`. 
此属性保证部分聚合结果足以计算组合结果,因此我们不必存储和处理任何额外的数据。 +`SimpleAggregateFunction(name, types_of_arguments…)` 数据类型存储聚合函数的当前值, 并不像 [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md) 那样存储其全部状态。这种优化可以应用于具有以下属性函数: 将函数 `f` 应用于行集合 `S1 UNION ALL S2` 的结果,可以通过将 `f` 分别应用于行集合的部分, 然后再将 `f` 应用于结果来获得: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`。 这个属性保证了部分聚合结果足以计算出合并的结果,所以我们不必存储和处理任何额外的数据。 支持以下聚合函数: -- [`any`](../../sql-reference/aggregate-functions/reference.md#agg_function-any) -- [`anyLast`](../../sql-reference/aggregate-functions/reference.md#anylastx) -- [`min`](../../sql-reference/aggregate-functions/reference.md#agg_function-min) -- [`max`](../../sql-reference/aggregate-functions/reference.md#agg_function-max) -- [`sum`](../../sql-reference/aggregate-functions/reference.md#agg_function-sum) -- [`groupBitAnd`](../../sql-reference/aggregate-functions/reference.md#groupbitand) -- [`groupBitOr`](../../sql-reference/aggregate-functions/reference.md#groupbitor) -- [`groupBitXor`](../../sql-reference/aggregate-functions/reference.md#groupbitxor) -- [`groupArrayArray`](../../sql-reference/aggregate-functions/reference.md#agg_function-grouparray) -- [`groupUniqArrayArray`](../../sql-reference/aggregate-functions/reference.md#groupuniqarrayx-groupuniqarraymax-sizex) +- [`any`](../../sql-reference/aggregate-functions/reference/any.md#agg_function-any) +- [`anyLast`](../../sql-reference/aggregate-functions/reference/anylast.md#anylastx) +- [`min`](../../sql-reference/aggregate-functions/reference/min.md#agg_function-min) +- [`max`](../../sql-reference/aggregate-functions/reference/max.md#agg_function-max) +- [`sum`](../../sql-reference/aggregate-functions/reference/sum.md#agg_function-sum) +- [`sumWithOverflow`](../../sql-reference/aggregate-functions/reference/sumwithoverflow.md#sumwithoverflowx) +- [`groupBitAnd`](../../sql-reference/aggregate-functions/reference/groupbitand.md#groupbitand) +- [`groupBitOr`](../../sql-reference/aggregate-functions/reference/groupbitor.md#groupbitor) +- [`groupBitXor`](../../sql-reference/aggregate-functions/reference/groupbitxor.md#groupbitxor) +- [`groupArrayArray`](../../sql-reference/aggregate-functions/reference/grouparray.md#agg_function-grouparray) +- [`groupUniqArrayArray`](../../sql-reference/aggregate-functions/reference/groupuniqarray.md) +- [`sumMap`](../../sql-reference/aggregate-functions/reference/summap.md#agg_functions-summap) +- [`minMap`](../../sql-reference/aggregate-functions/reference/minmap.md#agg_functions-minmap) +- [`maxMap`](../../sql-reference/aggregate-functions/reference/maxmap.md#agg_functions-maxmap) +- [`argMin`](../../sql-reference/aggregate-functions/reference/argmin.md) +- [`argMax`](../../sql-reference/aggregate-functions/reference/argmax.md) -的值 `SimpleAggregateFunction(func, Type)` 看起来和存储方式相同 `Type`,所以你不需要应用函数 `-Merge`/`-State` 后缀。 `SimpleAggregateFunction` 具有比更好的性能 `AggregateFunction` 具有相同的聚合功能。 + +!!! note "注" + `SimpleAggregateFunction(func, Type)` 的值外观和存储方式于 `Type` 相同, 所以你不需要应用带有 `-Merge`/`-State` 后缀的函数。 + + `SimpleAggregateFunction` 的性能优于具有相同聚合函数的 `AggregateFunction` 。 **参数** @@ -30,11 +35,7 @@ machine_translated_rev: 71d72c1f237f4a553fe91ba6c6c633e81a49e35b **示例** ``` sql -CREATE TABLE t -( - column1 SimpleAggregateFunction(sum, UInt64), - column2 SimpleAggregateFunction(any, String) -) ENGINE = ... 
+CREATE TABLE simple (id UInt64, val SimpleAggregateFunction(sum, Double)) ENGINE=AggregatingMergeTree ORDER BY id; ``` [原始文章](https://clickhouse.tech/docs/en/data_types/simpleaggregatefunction/) From 8ac49387c3462734c493a77fe48fe41476edfcb6 Mon Sep 17 00:00:00 2001 From: benbiti Date: Mon, 22 Mar 2021 23:56:44 +0800 Subject: [PATCH 180/260] update-aggregate-funcions-in-zh --- .../aggregate-functions/reference/grouparrayinsertat.md | 2 +- .../aggregate-functions/reference/initializeAggregation.md | 2 +- docs/zh/sql-reference/aggregate-functions/reference/kurtpop.md | 2 +- docs/zh/sql-reference/aggregate-functions/reference/kurtsamp.md | 2 +- docs/zh/sql-reference/aggregate-functions/reference/skewpop.md | 2 +- docs/zh/sql-reference/aggregate-functions/reference/skewsamp.md | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md index 6d2d15fa584..3a50b24fd7f 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md @@ -21,7 +21,7 @@ groupArrayInsertAt(default_x, size)(x, pos); - `x` — 要插入的值。生成所[支持的数据类型](../../../sql-reference/data-types/index.md)(数据)的[表达式](../../../sql-reference/syntax.md#syntax-expressions)。 - `pos` — 指定元素 `x` 将被插入的位置。 数组中的索引编号从零开始。 [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges). -- `default_x`— 在空位置替换的默认值。可选参数。生成 `x` 数据类型 (数据) 的[表达式](../../../sql-reference/syntax.md#syntax-expressions)。 如果 `default_x` 未定义,则 [默认值](../../../sql-reference/statements/create.md#create-default-values) 被使用。 +- `default_x` — 在空位置替换的默认值。可选参数。生成 `x` 数据类型 (数据) 的[表达式](../../../sql-reference/syntax.md#syntax-expressions)。 如果 `default_x` 未定义,则 [默认值](../../../sql-reference/statements/create.md#create-default-values) 被使用。 - `size`— 结果数组的长度。可选参数。如果使用该参数,必须指定默认值 `default_x` 。 [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges)。 **返回值** diff --git a/docs/zh/sql-reference/aggregate-functions/reference/initializeAggregation.md b/docs/zh/sql-reference/aggregate-functions/reference/initializeAggregation.md index ba54d343b44..feecd7afb1f 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/initializeAggregation.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/initializeAggregation.md @@ -10,7 +10,7 @@ toc_priority: 150 **语法** ``` sql -initializeAggregation (aggregate_function, column_1, column_2); +initializeAggregation (aggregate_function, column_1, column_2) ``` **参数** diff --git a/docs/zh/sql-reference/aggregate-functions/reference/kurtpop.md b/docs/zh/sql-reference/aggregate-functions/reference/kurtpop.md index 7a954e43e3a..d5b76e0c1e9 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/kurtpop.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/kurtpop.md @@ -23,4 +23,4 @@ kurtPop(expr) **示例** ``` sql -SELECT kurtPop(value) FROM series_with_value_column +SELECT kurtPop(value) FROM series_with_value_column; diff --git a/docs/zh/sql-reference/aggregate-functions/reference/kurtsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/kurtsamp.md index 348df805cf3..a38e14d0792 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/kurtsamp.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/kurtsamp.md @@ -24,5 +24,5 @@ kurtSamp(expr) **示例** ``` sql -SELECT kurtSamp(value) FROM series_with_value_column +SELECT kurtSamp(value) FROM 
series_with_value_column; ``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/skewpop.md b/docs/zh/sql-reference/aggregate-functions/reference/skewpop.md index e26e5f8c754..0771c18c2f3 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/skewpop.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/skewpop.md @@ -23,5 +23,5 @@ skewPop(expr) **示例** ``` sql -SELECT skewPop(value) FROM series_with_value_column +SELECT skewPop(value) FROM series_with_value_column; ``` diff --git a/docs/zh/sql-reference/aggregate-functions/reference/skewsamp.md b/docs/zh/sql-reference/aggregate-functions/reference/skewsamp.md index 0ac68fb4e7c..902d06da8e7 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/skewsamp.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/skewsamp.md @@ -25,5 +25,5 @@ skewSamp(expr) **示例** ``` sql -SELECT skewSamp(value) FROM series_with_value_column +SELECT skewSamp(value) FROM series_with_value_column; ``` From 67b8f02b9366aff71f12738f7d6ba40074c0375c Mon Sep 17 00:00:00 2001 From: benbiti Date: Tue, 23 Mar 2021 21:10:26 +0800 Subject: [PATCH 181/260] update deltasum.md --- .../aggregate-functions/reference/deltasum.md | 60 ++++++++++++++++--- 1 file changed, 53 insertions(+), 7 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/deltasum.md b/docs/zh/sql-reference/aggregate-functions/reference/deltasum.md index 3f099944cee..eabb13376b1 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/deltasum.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/deltasum.md @@ -4,20 +4,66 @@ toc_priority: 141 # deltaSum {#agg_functions-deltasum} +计算连续行之间的差值和。如果差值为负,则忽略。 + **语法** ``` sql deltaSum(value) ``` -计算连续行之间的差值和。如果差值为负,则忽略。 -`value`必须是整型或浮点类型。 +**参数** -示例: +- `value` — 必须是 [整型](../../data-types/int-uint.md) 或者 [浮点型](../../data-types/float.md) 。 -```sql -select deltaSum(arrayJoin([1, 2, 3])); -- => 2 -select deltaSum(arrayJoin([1, 2, 3, 0, 3, 4, 2, 3])); -- => 7 -select deltaSum(arrayJoin([2.25, 3, 4.5])); -- => 2.25 +**返回值** + +- `Integer` or `Float` 型的算术差值和。 + +**示例** + +查询: + +``` sql +SELECT deltaSum(arrayJoin([1, 2, 3])); ``` +结果: + +``` text +┌─deltaSum(arrayJoin([1, 2, 3]))─┐ +│ 2 │ +└────────────────────────────────┘ +``` + +查询: + +``` sql +SELECT deltaSum(arrayJoin([1, 2, 3, 0, 3, 4, 2, 3])); +``` + +结果: + +``` text +┌─deltaSum(arrayJoin([1, 2, 3, 0, 3, 4, 2, 3]))─┐ +│ 7 │ +└───────────────────────────────────────────────┘ +``` + +查询: + +``` sql +SELECT deltaSum(arrayJoin([2.25, 3, 4.5])); +``` + +结果: + +``` text +┌─deltaSum(arrayJoin([2.25, 3, 4.5]))─┐ +│ 2.25 │ +└─────────────────────────────────────┘ +``` + +## 参见 {#see-also} + +- [runningDifference](../../functions/other-functions.md#other_functions-runningdifference) From dd63983ab193d14d17ddb912b04ca25fc8187870 Mon Sep 17 00:00:00 2001 From: benbiti Date: Tue, 23 Mar 2021 23:01:28 +0800 Subject: [PATCH 182/260] update avg.md --- .../aggregate-functions/reference/avg.md | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/avg.md b/docs/zh/sql-reference/aggregate-functions/reference/avg.md index ea4f351b55e..739654adc1c 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/avg.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/avg.md @@ -14,26 +14,19 @@ avg(x) **参数** -- `x` — 列名 - -`x` 必须是 -[Integer](../../../sql-reference/data-types/int-uint.md), -[floating-point](../../../sql-reference/data-types/float.md), or 
-[Decimal](../../../sql-reference/data-types/decimal.md). +- `x` — 输入值, 必须是 [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md), 或 [Decimal](../../../sql-reference/data-types/decimal.md)。 **返回值** -- `NaN`。 参数列为空时返回。 -- 算术平均值。 其他情况。 - -**返回类型** 总是 [Float64](../../../sql-reference/data-types/float.md). +- 算术平均值,总是 [Float64](../../../sql-reference/data-types/float.md) 类型。 +- 输入参数 `x` 为空时返回 `NaN` 。 **示例** 查询: ``` sql -SELECT avg(x) FROM values('x Int8', 0, 1, 2, 3, 4, 5) +SELECT avg(x) FROM values('x Int8', 0, 1, 2, 3, 4, 5); ``` 结果: @@ -46,11 +39,20 @@ SELECT avg(x) FROM values('x Int8', 0, 1, 2, 3, 4, 5) **示例** +创建一个临时表: + 查询: ``` sql CREATE table test (t UInt8) ENGINE = Memory; -SELECT avg(t) FROM test +``` + +获取算术平均值: + +查询: + +``` +SELECT avg(t) FROM test; ``` 结果: From 5bcda9e101002e5ce4f5396c2161114d31e51803 Mon Sep 17 00:00:00 2001 From: benbiti Date: Tue, 23 Mar 2021 23:33:29 +0800 Subject: [PATCH 183/260] update argmax.md --- .../aggregate-functions/reference/argmax.md | 48 +++++++++++++++---- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/argmax.md b/docs/zh/sql-reference/aggregate-functions/reference/argmax.md index 1791ef8f88e..9d90590b2f1 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/argmax.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/argmax.md @@ -4,14 +4,42 @@ toc_priority: 106 # argMax {#agg-function-argmax} -语法: `argMax(arg, val)` 或 `argMax(tuple(arg, val))` +计算 `val` 最大值对应的 `arg` 值。 如果 `val` 最大值存在几个不同的 `arg` 值,输出遇到的第一个值。 -计算 `val` 最大值对应的 `arg` 值。 如果 `val` 最大值存在几个不同的 `arg` 值,输出遇到的第一个(`arg`)值。 +这个函数的Tuple版本将返回 `val` 最大值对应的元组。本函数适合和 `SimpleAggregateFunction` 搭配使用。 +**语法** -这个函数的Tuple版本将返回`val`最大值对应的tuple。本函数适合和`SimpleAggregateFunction`搭配使用。 +``` sql +argMax(arg, val) +``` -**示例:** +或 + +``` sql +argMax(tuple(arg, val)) +``` + +**参数** + +- `arg` — Argument. +- `val` — Value. 
+ +**返回值** + +- `val` 最大值对应的 `arg` 值。 + +类型: 匹配 `arg` 类型。 + +对于输入中的元组: + +- 元组 `(arg, val)`, 其中 `val` 最大值,`arg` 是对应的值。 + +类型: [元组](../../../sql-reference/data-types/tuple.md)。 + +**示例** + +输入表: ``` text ┌─user─────┬─salary─┐ @@ -21,12 +49,16 @@ toc_priority: 106 └──────────┴────────┘ ``` +查询: + ``` sql -SELECT argMax(user, salary), argMax(tuple(user, salary)) FROM salary +SELECT argMax(user, salary), argMax(tuple(user, salary), salary), argMax(tuple(user, salary)) FROM salary; ``` +结果: + ``` text -┌─argMax(user, salary)─┬─argMax(tuple(user, salary))─┐ -│ director │ ('director',5000) │ -└──────────────────────┴─────────────────────────────┘ +┌─argMax(user, salary)─┬─argMax(tuple(user, salary), salary)─┬─argMax(tuple(user, salary))─┐ +│ director │ ('director',5000) │ ('director',5000) │ +└──────────────────────┴─────────────────────────────────────┴─────────────────────────────┘ ``` From ee939fed4d3688b7d98d52ba3949643ee1cf234a Mon Sep 17 00:00:00 2001 From: benbiti Date: Wed, 24 Mar 2021 23:46:03 +0800 Subject: [PATCH 184/260] update-aggregate-funcion-in-zh --- .../aggregate-functions/reference/anyheavy.md | 6 +++++- .../sql-reference/aggregate-functions/reference/argmin.md | 8 +++++++- .../aggregate-functions/reference/deltasum.md | 2 +- .../sql-reference/aggregate-functions/reference/index.md | 2 -- .../reference/quantileexactweighted.md | 1 + 5 files changed, 14 insertions(+), 5 deletions(-) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/anyheavy.md b/docs/zh/sql-reference/aggregate-functions/reference/anyheavy.md index 45a96309ac8..b67be9e48cf 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/anyheavy.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/anyheavy.md @@ -18,11 +18,15 @@ anyHeavy(column) 使用 [OnTime](../../../getting-started/example-datasets/ontime.md) 数据集,并选择在 `AirlineID` 列任何频繁出现的值。 +查询: + ``` sql SELECT anyHeavy(AirlineID) AS res -FROM ontime +FROM ontime; ``` +结果: + ``` text ┌───res─┐ │ 19690 │ diff --git a/docs/zh/sql-reference/aggregate-functions/reference/argmin.md b/docs/zh/sql-reference/aggregate-functions/reference/argmin.md index a174fb16c59..0dd4625ac0d 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/argmin.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/argmin.md @@ -12,6 +12,8 @@ toc_priority: 105 **示例:** +输入表: + ``` text ┌─user─────┬─salary─┐ │ director │ 5000 │ @@ -20,10 +22,14 @@ toc_priority: 105 └──────────┴────────┘ ``` +查询: + ``` sql -SELECT argMin(user, salary), argMin(tuple(user, salary)) FROM salary +SELECT argMin(user, salary), argMin(tuple(user, salary)) FROM salary; ``` +结果: + ``` text ┌─argMin(user, salary)─┬─argMin(tuple(user, salary))─┐ │ worker │ ('worker',1000) │ diff --git a/docs/zh/sql-reference/aggregate-functions/reference/deltasum.md b/docs/zh/sql-reference/aggregate-functions/reference/deltasum.md index eabb13376b1..e439263bf78 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/deltasum.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/deltasum.md @@ -64,6 +64,6 @@ SELECT deltaSum(arrayJoin([2.25, 3, 4.5])); └─────────────────────────────────────┘ ``` -## 参见 {#see-also} +**参见** - [runningDifference](../../functions/other-functions.md#other_functions-runningdifference) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/index.md b/docs/zh/sql-reference/aggregate-functions/reference/index.md index 3598a3cc536..5070c79775e 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/index.md +++ 
b/docs/zh/sql-reference/aggregate-functions/reference/index.md @@ -70,5 +70,3 @@ ClickHouse 特有的聚合函数: - [stochasticLinearRegression](../../../sql-reference/aggregate-functions/reference/stochasticlinearregression.md) - [stochasticLogisticRegression](../../../sql-reference/aggregate-functions/reference/stochasticlogisticregression.md) - [categoricalInformationValue](../../../sql-reference/aggregate-functions/reference/categoricalinformationvalue.md) - -[Original article](https://clickhouse.tech/docs/en/sql-reference/aggregate-functions/reference/) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/quantileexactweighted.md b/docs/zh/sql-reference/aggregate-functions/reference/quantileexactweighted.md index 0b20d780094..5211ca210f2 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/quantileexactweighted.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/quantileexactweighted.md @@ -61,5 +61,6 @@ SELECT quantileExactWeighted(n, val) FROM t ``` **参见** + - [中位数](../../../sql-reference/aggregate-functions/reference/median.md#median) - [分位数](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) From f9c7fd7b9db7fa4c3386222da018217c7575b6f3 Mon Sep 17 00:00:00 2001 From: benbiti Date: Sat, 20 Mar 2021 22:20:22 +0800 Subject: [PATCH 185/260] update zh docs datatype uuid --- docs/zh/sql-reference/data-types/uuid.md | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/docs/zh/sql-reference/data-types/uuid.md b/docs/zh/sql-reference/data-types/uuid.md index 2ff1e391e81..11e5fc2cd93 100644 --- a/docs/zh/sql-reference/data-types/uuid.md +++ b/docs/zh/sql-reference/data-types/uuid.md @@ -1,21 +1,19 @@ --- -machine_translated: true -machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd toc_priority: 46 toc_title: UUID --- # UUID {#uuid-data-type} -通用唯一标识符(UUID)是用于标识记录的16字节数。 有关UUID的详细信息,请参阅 [维基百科](https://en.wikipedia.org/wiki/Universally_unique_identifier). +通用唯一标识符(UUID)是一个16字节的数字,用于标识记录。有关UUID的详细信息, 参见[维基百科](https://en.wikipedia.org/wiki/Universally_unique_identifier)。 -UUID类型值的示例如下所示: +UUID类型值的示例如下: ``` text 61f0c404-5cb3-11e7-907b-a6006ad3dba0 ``` -如果在插入新记录时未指定UUID列值,则UUID值将用零填充: +如果在插入新记录时未指定UUID列的值,则UUID值将用零填充: ``` text 00000000-0000-0000-0000-000000000000 @@ -23,13 +21,13 @@ UUID类型值的示例如下所示: ## 如何生成 {#how-to-generate} -要生成UUID值,ClickHouse提供了 [generateuidv4](../../sql-reference/functions/uuid-functions.md) 功能。 +要生成UUID值,ClickHouse提供了 [generateuidv4](../../sql-reference/functions/uuid-functions.md) 函数。 ## 用法示例 {#usage-example} **示例1** -此示例演示如何创建具有UUID类型列的表并将值插入到表中。 +这个例子演示了创建一个具有UUID类型列的表,并在表中插入一个值。 ``` sql CREATE TABLE t_uuid (x UUID, y String) ENGINE=TinyLog @@ -51,7 +49,7 @@ SELECT * FROM t_uuid **示例2** -在此示例中,插入新记录时未指定UUID列值。 +在这个示例中,插入新记录时未指定UUID列的值。 ``` sql INSERT INTO t_uuid (y) VALUES ('Example 2') @@ -70,7 +68,7 @@ SELECT * FROM t_uuid ## 限制 {#restrictions} -UUID数据类型仅支持以下功能 [字符串](string.md) 数据类型也支持(例如, [min](../../sql-reference/aggregate-functions/reference.md#agg_function-min), [max](../../sql-reference/aggregate-functions/reference.md#agg_function-max),和 [计数](../../sql-reference/aggregate-functions/reference.md#agg_function-count)). 
+UUID数据类型只支持 [字符串](../../sql-reference/data-types/string.md) 数据类型也支持的函数(比如, [min](../../sql-reference/aggregate-functions/reference/min.md#agg_function-min), [max](../../sql-reference/aggregate-functions/reference/max.md#agg_function-max), 和 [count](../../sql-reference/aggregate-functions/reference/count.md#agg_function-count))。 算术运算不支持UUID数据类型(例如, [abs](../../sql-reference/functions/arithmetic-functions.md#arithm_func-abs))或聚合函数,例如 [sum](../../sql-reference/aggregate-functions/reference.md#agg_function-sum) 和 [avg](../../sql-reference/aggregate-functions/reference.md#agg_function-avg). From 1f3439101870d5d78f4db17f9a88a131d52fa772 Mon Sep 17 00:00:00 2001 From: benbiti Date: Thu, 25 Mar 2021 00:23:56 +0800 Subject: [PATCH 186/260] update links in uuid --- docs/zh/sql-reference/data-types/uuid.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/zh/sql-reference/data-types/uuid.md b/docs/zh/sql-reference/data-types/uuid.md index 11e5fc2cd93..b454484003c 100644 --- a/docs/zh/sql-reference/data-types/uuid.md +++ b/docs/zh/sql-reference/data-types/uuid.md @@ -70,6 +70,5 @@ SELECT * FROM t_uuid UUID数据类型只支持 [字符串](../../sql-reference/data-types/string.md) 数据类型也支持的函数(比如, [min](../../sql-reference/aggregate-functions/reference/min.md#agg_function-min), [max](../../sql-reference/aggregate-functions/reference/max.md#agg_function-max), 和 [count](../../sql-reference/aggregate-functions/reference/count.md#agg_function-count))。 -算术运算不支持UUID数据类型(例如, [abs](../../sql-reference/functions/arithmetic-functions.md#arithm_func-abs))或聚合函数,例如 [sum](../../sql-reference/aggregate-functions/reference.md#agg_function-sum) 和 [avg](../../sql-reference/aggregate-functions/reference.md#agg_function-avg). +算术运算不支持UUID数据类型(例如, [abs](../../sql-reference/functions/arithmetic-functions.md#arithm_func-abs))或聚合函数,例如 [sum](../../sql-reference/aggregate-functions/reference/sum.md#agg_function-sum) 和 [avg](../../sql-reference/aggregate-functions/reference/avg.md#agg_function-avg). 
-[原始文章](https://clickhouse.tech/docs/en/data_types/uuid/) From a861b633d6a59e05e5396966ebbdaadc9828f05b Mon Sep 17 00:00:00 2001 From: benbiti Date: Thu, 25 Mar 2021 17:04:43 +0800 Subject: [PATCH 187/260] fix broken links in settings.md --- docs/zh/operations/settings/settings.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/zh/operations/settings/settings.md b/docs/zh/operations/settings/settings.md index 64625c19c6a..720b822ce29 100644 --- a/docs/zh/operations/settings/settings.md +++ b/docs/zh/operations/settings/settings.md @@ -988,15 +988,15 @@ ClickHouse生成异常 ## count_distinct_implementation {#settings-count_distinct_implementation} -指定其中的 `uniq*` 函数应用于执行 [COUNT(DISTINCT …)](../../sql-reference/aggregate-functions/reference.md#agg_function-count) 建筑。 +指定其中的 `uniq*` 函数应用于执行 [COUNT(DISTINCT …)](../../sql-reference/aggregate-functions/reference/count.md#agg_function-count) 建筑。 可能的值: -- [uniq](../../sql-reference/aggregate-functions/reference.md#agg_function-uniq) -- [uniqCombined](../../sql-reference/aggregate-functions/reference.md#agg_function-uniqcombined) -- [uniqCombined64](../../sql-reference/aggregate-functions/reference.md#agg_function-uniqcombined64) -- [uniqHLL12](../../sql-reference/aggregate-functions/reference.md#agg_function-uniqhll12) -- [uniqExact](../../sql-reference/aggregate-functions/reference.md#agg_function-uniqexact) +- [uniq](../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) +- [uniqCombined](../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined) +- [uniqCombined64](../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64) +- [uniqHLL12](../../sql-reference/aggregate-functions/reference/uniqhll12.md#agg_function-uniqhll12) +- [uniqExact](../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact) 默认值: `uniqExact`. 
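A minimal sketch of how the `count_distinct_implementation` setting documented above is used in practice; the `visits` table and `user_id` column are hypothetical, for illustration only, while the setting name, its allowed values (`uniq`, `uniqCombined`, `uniqCombined64`, `uniqHLL12`, `uniqExact`) and the `uniqExact` default come from the hunk above:

``` sql
-- Make COUNT(DISTINCT ...) use the approximate uniqCombined implementation
SET count_distinct_implementation = 'uniqCombined';
SELECT count(DISTINCT user_id) FROM visits;   -- executed as uniqCombined(user_id)

-- Revert to the default exact behaviour
SET count_distinct_implementation = 'uniqExact';
SELECT count(DISTINCT user_id) FROM visits;   -- executed as uniqExact(user_id)
```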
From d36d3f036dfda47bf0212e7810f799377b14aaff Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 25 Mar 2021 13:04:16 +0300 Subject: [PATCH 188/260] Fix several races in NuRaft --- contrib/NuRaft | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/NuRaft b/contrib/NuRaft index 3d3683e7775..70468326ad5 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit 3d3683e77753cfe015a05fae95ddf418e19f59e1 +Subproject commit 70468326ad5d72e9497944838484c591dae054ea From 640ba7928880d8bc42b4efc494e2dbd21203fbf5 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 25 Mar 2021 13:23:25 +0300 Subject: [PATCH 189/260] Remove data corruption from lightweight run --- tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 45a1f442d24..7380a9d9cbb 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -113,8 +113,10 @@ (def useful-nemesises ["random-node-killer" "simple-partitioner" - "logs-and-snapshots-corruptor" - "drop-data-corruptor" + "all-nodes-hammer-time" + ; can lead to a very rare data loss https://github.com/eBay/NuRaft/issues/185 + ;"logs-and-snapshots-corruptor" + ;"drop-data-corruptor" "bridge-partitioner" "blind-node-partitioner" "blind-others-partitioner"]) From b7622868fc03a76769180f95459137c3ca1c091b Mon Sep 17 00:00:00 2001 From: feng lv Date: Thu, 25 Mar 2021 11:10:41 +0000 Subject: [PATCH 190/260] remove useless code --- src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 747819c77eb..96a3dba12f7 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -551,11 +551,6 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( .checksum_on_read = settings.checksum_on_read, }; - /// PREWHERE - String prewhere_column; - if (select.prewhere()) - prewhere_column = select.prewhere()->getColumnName(); - struct DataSkippingIndexAndCondition { MergeTreeIndexPtr index; From 3d92cb46f7478bc38b4c1f3b6d192cb4cffd824c Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 25 Mar 2021 14:22:19 +0300 Subject: [PATCH 191/260] Trying to fix my favorite test --- .../00992_system_parts_race_condition_zookeeper_long.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper_long.sh b/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper_long.sh index 1e61c8d64f3..fe6246e02f6 100755 --- a/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper_long.sh +++ b/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper_long.sh @@ -74,7 +74,7 @@ timeout $TIMEOUT bash -c thread5 2> /dev/null & wait -$CLICKHOUSE_CLIENT -n -q " - DROP TABLE alter_table; - DROP TABLE alter_table2 -" +$CLICKHOUSE_CLIENT -n -q "DROP TABLE alter_table;" & +$CLICKHOUSE_CLIENT -n -q "DROP TABLE alter_table2;" & + +wait From 55ba4ec15e322bc332b845cd90d43bf6cff916e6 Mon Sep 17 00:00:00 2001 From: feng lv Date: Thu, 25 Mar 2021 10:41:03 +0000 Subject: [PATCH 192/260] Fix bar with invalid float value fix --- src/Functions/bar.cpp | 4 ++++ 
.../0_stateless/01774_bar_with_illegal_value.reference | 0 tests/queries/0_stateless/01774_bar_with_illegal_value.sql | 1 + 3 files changed, 5 insertions(+) create mode 100644 tests/queries/0_stateless/01774_bar_with_illegal_value.reference create mode 100644 tests/queries/0_stateless/01774_bar_with_illegal_value.sql diff --git a/src/Functions/bar.cpp b/src/Functions/bar.cpp index 7364311a1be..6f5298a8c5e 100644 --- a/src/Functions/bar.cpp +++ b/src/Functions/bar.cpp @@ -16,6 +16,7 @@ namespace ErrorCodes extern const int ARGUMENT_OUT_OF_BOUND; extern const int ILLEGAL_COLUMN; extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int BAD_ARGUMENTS; } namespace @@ -110,6 +111,9 @@ public: arguments[2].column->getFloat64(i), max_width); + if (!isFinite(width)) + throw Exception("Value of width must not be NaN and Inf", ErrorCodes::BAD_ARGUMENTS); + size_t next_size = current_offset + UnicodeBar::getWidthInBytes(width) + 1; dst_chars.resize(next_size); UnicodeBar::render(width, reinterpret_cast(&dst_chars[current_offset])); diff --git a/tests/queries/0_stateless/01774_bar_with_illegal_value.reference b/tests/queries/0_stateless/01774_bar_with_illegal_value.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01774_bar_with_illegal_value.sql b/tests/queries/0_stateless/01774_bar_with_illegal_value.sql new file mode 100644 index 00000000000..60c7f303c13 --- /dev/null +++ b/tests/queries/0_stateless/01774_bar_with_illegal_value.sql @@ -0,0 +1 @@ +SELECT greatCircleAngle(1048575, 257, -9223372036854775808, 1048576) - NULL, bar(7, -inf, 1024); -- { serverError 36 } From 2ede490152129d3f6b35a764e6bfa25b6e227e22 Mon Sep 17 00:00:00 2001 From: Peng Jian Date: Thu, 25 Mar 2021 21:57:47 +0800 Subject: [PATCH 193/260] The thread pool name should not longer than 15 chars --- src/Core/BackgroundSchedulePool.cpp | 21 +++++++++++++++++++-- src/Interpreters/Context.cpp | 2 +- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/src/Core/BackgroundSchedulePool.cpp b/src/Core/BackgroundSchedulePool.cpp index 5f2a115562d..74585b859d8 100644 --- a/src/Core/BackgroundSchedulePool.cpp +++ b/src/Core/BackgroundSchedulePool.cpp @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB @@ -243,7 +244,14 @@ void BackgroundSchedulePool::attachToThreadGroup() void BackgroundSchedulePool::threadFunction() { - setThreadName(thread_name.c_str()); + try + { + setThreadName(thread_name.c_str()); + } + catch (const DB::Exception &) + { + throw; + } attachToThreadGroup(); SCOPE_EXIT({ CurrentThread::detachQueryIfNotDetached(); }); @@ -270,7 +278,16 @@ void BackgroundSchedulePool::threadFunction() void BackgroundSchedulePool::delayExecutionThreadFunction() { - setThreadName((thread_name + "/D").c_str()); + + try + { + setThreadName((thread_name + "/D").c_str()); + } + catch (const DB::Exception &) + { + throw; + } + attachToThreadGroup(); SCOPE_EXIT({ CurrentThread::detachQueryIfNotDetached(); }); diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index b2b15fb0d25..cd458399992 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -1538,7 +1538,7 @@ BackgroundSchedulePool & Context::getMessageBrokerSchedulePool() const shared->message_broker_schedule_pool.emplace( settings.background_message_broker_schedule_pool_size, CurrentMetrics::BackgroundDistributedSchedulePoolTask, - "BgMsgBrkSchPool"); + "BgMBSchPool"); return *shared->message_broker_schedule_pool; } From 8f5c8f32dfebc86d50e44cc13a08782137a4e04c Mon Sep 
17 00:00:00 2001 From: vdimir Date: Thu, 25 Mar 2021 17:30:29 +0300 Subject: [PATCH 194/260] Shrink totals block to 1 row for JOIN with TOTALS and arrayJoin --- src/Interpreters/join_common.cpp | 4 +++ .../Transforms/JoiningTransform.cpp | 6 ++++- .../01107_join_right_table_totals.reference | 10 ++++++++ .../01107_join_right_table_totals.sql | 25 +++++++++++++++++++ 4 files changed, 44 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/join_common.cpp b/src/Interpreters/join_common.cpp index 4c124f99e57..9a9253cee75 100644 --- a/src/Interpreters/join_common.cpp +++ b/src/Interpreters/join_common.cpp @@ -268,6 +268,10 @@ void joinTotals(const Block & totals, const Block & columns_to_add, const TableJ { if (table_join.rightBecomeNullable(col.type)) JoinCommon::convertColumnToNullable(col); + + /// In case of arrayJoin it can be not one row + if (col.column->size() != 1) + col.column = col.column->cloneResized(1); } for (size_t i = 0; i < totals_without_keys.columns(); ++i) diff --git a/src/Processors/Transforms/JoiningTransform.cpp b/src/Processors/Transforms/JoiningTransform.cpp index 26630f80b17..2b87695db69 100644 --- a/src/Processors/Transforms/JoiningTransform.cpp +++ b/src/Processors/Transforms/JoiningTransform.cpp @@ -38,7 +38,11 @@ void JoiningTransform::transform(Chunk & chunk) if (on_totals) { /// We have to make chunk empty before return - block = getInputPort().getHeader().cloneWithColumns(chunk.detachColumns()); + /// In case of using `arrayJoin` we can get more or less columns than one + auto cols = chunk.detachColumns(); + for (auto & col : cols) + col = col->cloneResized(1); + block = getInputPort().getHeader().cloneWithColumns(std::move(cols)); /// Drop totals if both out stream and joined stream doesn't have ones. /// See comment in ExpressionTransform.h diff --git a/tests/queries/0_stateless/01107_join_right_table_totals.reference b/tests/queries/0_stateless/01107_join_right_table_totals.reference index 77db8015b0e..f71d3b0d05f 100644 --- a/tests/queries/0_stateless/01107_join_right_table_totals.reference +++ b/tests/queries/0_stateless/01107_join_right_table_totals.reference @@ -8,3 +8,13 @@ 0 0 0 0 + +0 0 + +0 0 + +0 0 + +0 0 + +0 0 diff --git a/tests/queries/0_stateless/01107_join_right_table_totals.sql b/tests/queries/0_stateless/01107_join_right_table_totals.sql index 77e8848c957..a4f284e5e2d 100644 --- a/tests/queries/0_stateless/01107_join_right_table_totals.sql +++ b/tests/queries/0_stateless/01107_join_right_table_totals.sql @@ -35,4 +35,29 @@ FULL JOIN ) rr USING (id); +SELECT id, yago +FROM ( SELECT item_id AS id FROM t GROUP BY id ) AS ll +FULL OUTER JOIN ( SELECT item_id AS id, arrayJoin([111, 222, 333, 444]), SUM(price_sold) AS yago FROM t GROUP BY id WITH TOTALS ) AS rr +USING (id); + +SELECT id, yago +FROM ( SELECT item_id AS id, arrayJoin([111, 222, 333]) FROM t GROUP BY id WITH TOTALS ) AS ll +FULL OUTER JOIN ( SELECT item_id AS id, SUM(price_sold) AS yago FROM t GROUP BY id ) AS rr +USING (id); + +SELECT id, yago +FROM ( SELECT item_id AS id, arrayJoin(emptyArrayInt32()) FROM t GROUP BY id WITH TOTALS ) AS ll +FULL OUTER JOIN ( SELECT item_id AS id, SUM(price_sold) AS yago FROM t GROUP BY id ) AS rr +USING (id); + +SELECT id, yago +FROM ( SELECT item_id AS id FROM t GROUP BY id ) AS ll +FULL OUTER JOIN ( SELECT item_id AS id, arrayJoin(emptyArrayInt32()), SUM(price_sold) AS yago FROM t GROUP BY id WITH TOTALS ) AS rr +USING (id); + +SELECT id, yago +FROM ( SELECT item_id AS id, arrayJoin([111, 222, 333]) FROM t GROUP BY id WITH TOTALS ) AS ll 
+FULL OUTER JOIN ( SELECT item_id AS id, arrayJoin([111, 222, 333, 444]), SUM(price_sold) AS yago FROM t GROUP BY id WITH TOTALS ) AS rr +USING (id); + DROP TABLE t; From fc1cd6f65e672a18fbcb2702a17684141293b74e Mon Sep 17 00:00:00 2001 From: Peng Jian Date: Thu, 25 Mar 2021 22:57:08 +0800 Subject: [PATCH 195/260] remove unnecessary code --- src/Core/BackgroundSchedulePool.cpp | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/src/Core/BackgroundSchedulePool.cpp b/src/Core/BackgroundSchedulePool.cpp index 74585b859d8..5f2a115562d 100644 --- a/src/Core/BackgroundSchedulePool.cpp +++ b/src/Core/BackgroundSchedulePool.cpp @@ -6,7 +6,6 @@ #include #include #include -#include namespace DB @@ -244,14 +243,7 @@ void BackgroundSchedulePool::attachToThreadGroup() void BackgroundSchedulePool::threadFunction() { - try - { - setThreadName(thread_name.c_str()); - } - catch (const DB::Exception &) - { - throw; - } + setThreadName(thread_name.c_str()); attachToThreadGroup(); SCOPE_EXIT({ CurrentThread::detachQueryIfNotDetached(); }); @@ -278,16 +270,7 @@ void BackgroundSchedulePool::threadFunction() void BackgroundSchedulePool::delayExecutionThreadFunction() { - - try - { - setThreadName((thread_name + "/D").c_str()); - } - catch (const DB::Exception &) - { - throw; - } - + setThreadName((thread_name + "/D").c_str()); attachToThreadGroup(); SCOPE_EXIT({ CurrentThread::detachQueryIfNotDetached(); }); From 8ea697b7df4efa6fb01d2b418b895eb58ead71b9 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Thu, 25 Mar 2021 18:49:01 +0300 Subject: [PATCH 196/260] use camelCase --- docs/en/sql-reference/window-functions/index.md | 2 +- src/Processors/Transforms/WindowTransform.cpp | 6 +++--- tests/performance/window_functions.xml | 6 +++--- tests/queries/0_stateless/01591_window_functions.reference | 4 ++-- tests/queries/0_stateless/01591_window_functions.sql | 4 ++-- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md index 3d18bc123f9..a646347ea60 100644 --- a/docs/en/sql-reference/window-functions/index.md +++ b/docs/en/sql-reference/window-functions/index.md @@ -25,7 +25,7 @@ ClickHouse supports the standard grammar for defining windows and window functio | `rank()`, `dense_rank()`, `row_number()` | supported | | `lag/lead(value, offset)` | Not supported. Workarounds: | | | 1) replace with `any(value) over (.... rows between preceding and preceding)`, or `following` for `lead`| -| | 2) use `lag_in_frame/lead_in_frame`, which are analogous, but respect the window frame. To get behavior identical to `lag/lead`, use `rows between unbounded preceding and unbounded following` | +| | 2) use `lagInFrame/leadInFrame`, which are analogous, but respect the window frame. To get behavior identical to `lag/lead`, use `rows between unbounded preceding and unbounded following` | ## References diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 3ab16d0d1b4..4a5282c1e6b 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -1525,7 +1525,7 @@ void registerWindowFunctions(AggregateFunctionFactory & factory) // be able to process at least the lag/lead in streaming fashion. // A partial solution for constant offsets is rewriting, say `lag(value, offset) // to `any(value) over (rows between offset preceding and offset preceding)`. 
- // We also implement non-standard functions `lag/lead_in_frame`, that are + // We also implement non-standard functions `lag/leadInFrame`, that are // analogous to `lag/lead`, but respect the frame. // Functions like cume_dist() do require materializing the entire // partition, but it's probably also simpler to implement them by rewriting @@ -1553,14 +1553,14 @@ void registerWindowFunctions(AggregateFunctionFactory & factory) parameters); }); - factory.registerFunction("lag_in_frame", [](const std::string & name, + factory.registerFunction("lagInFrame", [](const std::string & name, const DataTypes & argument_types, const Array & parameters) { return std::make_shared>( name, argument_types, parameters); }); - factory.registerFunction("lead_in_frame", [](const std::string & name, + factory.registerFunction("leadInFrame", [](const std::string & name, const DataTypes & argument_types, const Array & parameters) { return std::make_shared>( diff --git a/tests/performance/window_functions.xml b/tests/performance/window_functions.xml index 8db168b1a97..6be3d59e2b0 100644 --- a/tests/performance/window_functions.xml +++ b/tests/performance/window_functions.xml @@ -112,7 +112,7 @@ - select lead_in_frame(number) over w + select leadInFrame(number) over w from (select number, intDiv(number, 1111) p, mod(number, 111) o from numbers(10000000)) t @@ -133,7 +133,7 @@ - select lead_in_frame(number, number) over w + select leadInFrame(number, number) over w from (select number, intDiv(number, 1111) p, mod(number, 111) o from numbers(10000000)) t @@ -143,7 +143,7 @@ - select lead_in_frame(number, number, number) over w + select leadInFrame(number, number, number) over w from (select number, intDiv(number, 1111) p, mod(number, 111) o from numbers(10000000)) t diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index 05228e5303b..14e5889a811 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -1004,8 +1004,8 @@ from numbers(5); 3 \N -- variants of lag/lead that respect the frame select number, p, pp, - lag_in_frame(number, number - pp, number * 11) over w as lag, - lead_in_frame(number, number - pp, number * 11) over w as lead + lagInFrame(number, number - pp, number * 11) over w as lag, + leadInFrame(number, number - pp, number * 11) over w as lead from (select number, intDiv(number, 5) p, p * 5 pp from numbers(16)) window w as (partition by p order by number rows between unbounded preceding and unbounded following) diff --git a/tests/queries/0_stateless/01591_window_functions.sql b/tests/queries/0_stateless/01591_window_functions.sql index 2783fc66d78..30847e09246 100644 --- a/tests/queries/0_stateless/01591_window_functions.sql +++ b/tests/queries/0_stateless/01591_window_functions.sql @@ -349,8 +349,8 @@ from numbers(5); -- variants of lag/lead that respect the frame select number, p, pp, - lag_in_frame(number, number - pp, number * 11) over w as lag, - lead_in_frame(number, number - pp, number * 11) over w as lead + lagInFrame(number, number - pp, number * 11) over w as lag, + leadInFrame(number, number - pp, number * 11) over w as lead from (select number, intDiv(number, 5) p, p * 5 pp from numbers(16)) window w as (partition by p order by number rows between unbounded preceding and unbounded following) From b179ae468c7f03bcfd00ee1c444394d0e8a996ec Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Thu, 25 Mar 2021 19:11:30 +0300 Subject: 
[PATCH 197/260] Update formats.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Внес небольшие правки в русскую версию. --- docs/ru/interfaces/formats.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 8ec26ec66f5..2cb09c2aa17 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -1235,7 +1235,7 @@ ClickHouse поддерживает настраиваемую точность ### Вставка данных {#vstavka-dannykh-1} -Чтобы вставить в ClickHouse данные из файла в формате ORC, вы можете использовать команду следующего вида: +Чтобы вставить в ClickHouse данные из файла в формате ORC, используйте команду следующего вида: ``` bash $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" @@ -1243,7 +1243,7 @@ $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT OR ### Выборка данных {#vyborka-dannykh-1} -Чтобы получить данные из таблицы ClickHouse и сохранить их в файл формата ORC, вы можете использовать команду следующего вида: +Чтобы получить данные из таблицы ClickHouse и сохранить их в файл формата ORC, используйте команду следующего вида: ``` bash $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT ORC" > {filename.orc} From 177a017c5c5bbc7e4cd4c8b775c7d14b10aff4f8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 25 Mar 2021 20:51:32 +0300 Subject: [PATCH 198/260] Minor modification #22115 --- src/Storages/HDFS/ReadBufferFromHDFS.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/HDFS/ReadBufferFromHDFS.cpp b/src/Storages/HDFS/ReadBufferFromHDFS.cpp index f3b0e3022f1..affb76314b1 100644 --- a/src/Storages/HDFS/ReadBufferFromHDFS.cpp +++ b/src/Storages/HDFS/ReadBufferFromHDFS.cpp @@ -26,7 +26,7 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl HDFSBuilderWrapper builder; HDFSFSPtr fs; - explicit ReadBufferFromHDFSImpl(const std::string & hdfs_name_, + ReadBufferFromHDFSImpl(const std::string & hdfs_name_, const Poco::Util::AbstractConfiguration & config_) : hdfs_uri(hdfs_name_), builder(createHDFSBuilder(hdfs_uri, config_)) From a4aff546e95f4a0d72b65c7918c3e6739ca84ee7 Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Thu, 25 Mar 2021 22:05:26 +0300 Subject: [PATCH 199/260] Fix the title MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Поправил название заголовка. 
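A note on the ReadBufferFromHDFSImpl change in PATCH 198 above: for a constructor that takes more than one argument, `explicit` only matters for list-initialization. The commit message does not state the motivation, so the standalone sketch below merely illustrates that general C++ rule with made-up names; it is not taken from the ClickHouse sources.

#include <string>

struct Reader
{
    Reader(const std::string & uri, int flags) { (void)uri; (void)flags; }   /// try adding 'explicit' here to compare
};

void open(Reader) {}

int main()
{
    Reader a{"hdfs://host/file", 0};      /// direct-list-initialization: fine with or without 'explicit'
    Reader b = {"hdfs://host/file", 0};   /// copy-list-initialization: ill-formed if the constructor is 'explicit'
    open({"hdfs://host/file", 0});        /// implicit conversion from a braced list: also requires a non-'explicit' constructor
    (void)a; (void)b;
    return 0;
}

Dropping `explicit` makes the last two lines compile; whether that was the intent here or the keyword was simply considered noise on a multi-argument constructor is not stated in the commit.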
--- docs/ru/interfaces/formats.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 2cb09c2aa17..f67997b58d6 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -1241,7 +1241,7 @@ ClickHouse поддерживает настраиваемую точность $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` -### Выборка данных {#vyborka-dannykh-1} +### Вывод данных {#vyvod-dannykh-1} Чтобы получить данные из таблицы ClickHouse и сохранить их в файл формата ORC, используйте команду следующего вида: From a8ce138788de6ac2391d586d25d333c7b069660f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 25 Mar 2021 23:08:47 +0300 Subject: [PATCH 200/260] Speedup codec NONE --- .../CachedCompressedReadBuffer.cpp | 2 +- src/Compression/CompressedReadBuffer.cpp | 6 ++-- src/Compression/CompressedReadBufferBase.cpp | 29 ++++++++++++++++++- src/Compression/CompressedReadBufferBase.h | 8 ++++- .../CompressedReadBufferFromFile.cpp | 6 ++-- 5 files changed, 42 insertions(+), 9 deletions(-) diff --git a/src/Compression/CachedCompressedReadBuffer.cpp b/src/Compression/CachedCompressedReadBuffer.cpp index 4b4d33954a9..0548de07859 100644 --- a/src/Compression/CachedCompressedReadBuffer.cpp +++ b/src/Compression/CachedCompressedReadBuffer.cpp @@ -51,7 +51,7 @@ bool CachedCompressedReadBuffer::nextImpl() { owned_cell->additional_bytes = codec->getAdditionalSizeAtTheEndOfBuffer(); owned_cell->data.resize(size_decompressed + owned_cell->additional_bytes); - decompress(owned_cell->data.data(), size_decompressed, size_compressed_without_checksum); + decompressTo(owned_cell->data.data(), size_decompressed, size_compressed_without_checksum); } diff --git a/src/Compression/CompressedReadBuffer.cpp b/src/Compression/CompressedReadBuffer.cpp index 6a082164231..6393723acfd 100644 --- a/src/Compression/CompressedReadBuffer.cpp +++ b/src/Compression/CompressedReadBuffer.cpp @@ -21,7 +21,7 @@ bool CompressedReadBuffer::nextImpl() memory.resize(size_decompressed + additional_size_at_the_end_of_buffer); working_buffer = Buffer(memory.data(), &memory[size_decompressed]); - decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum); + decompress(working_buffer, size_decompressed, size_compressed_without_checksum); return true; } @@ -48,7 +48,7 @@ size_t CompressedReadBuffer::readBig(char * to, size_t n) /// If the decompressed block fits entirely where it needs to be copied. 
if (size_decompressed + additional_size_at_the_end_of_buffer <= n - bytes_read) { - decompress(to + bytes_read, size_decompressed, size_compressed_without_checksum); + decompressTo(to + bytes_read, size_decompressed, size_compressed_without_checksum); bytes_read += size_decompressed; bytes += size_decompressed; } @@ -63,7 +63,7 @@ size_t CompressedReadBuffer::readBig(char * to, size_t n) working_buffer = Buffer(memory.data(), &memory[size_decompressed]); pos = working_buffer.begin(); - decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum); + decompress(working_buffer, size_decompressed, size_compressed_without_checksum); bytes_read += read(to + bytes_read, n - bytes_read); break; diff --git a/src/Compression/CompressedReadBufferBase.cpp b/src/Compression/CompressedReadBufferBase.cpp index 8f5b779e4bc..65ba9607468 100644 --- a/src/Compression/CompressedReadBufferBase.cpp +++ b/src/Compression/CompressedReadBufferBase.cpp @@ -184,7 +184,7 @@ size_t CompressedReadBufferBase::readCompressedData(size_t & size_decompressed, } -void CompressedReadBufferBase::decompress(char * to, size_t size_decompressed, size_t size_compressed_without_checksum) +static void readHeaderAndGetCodec(const char * compressed_buffer, size_t size_decompressed, CompressionCodecPtr & codec, bool allow_different_codecs) { ProfileEvents::increment(ProfileEvents::CompressedReadBufferBlocks); ProfileEvents::increment(ProfileEvents::CompressedReadBufferBytes, size_decompressed); @@ -210,11 +210,38 @@ void CompressedReadBufferBase::decompress(char * to, size_t size_decompressed, s ErrorCodes::CANNOT_DECOMPRESS); } } +} + +void CompressedReadBufferBase::decompressTo(char * to, size_t size_decompressed, size_t size_compressed_without_checksum) +{ + readHeaderAndGetCodec(compressed_buffer, size_decompressed, codec, allow_different_codecs); codec->decompress(compressed_buffer, size_compressed_without_checksum, to); } +void CompressedReadBufferBase::decompress(BufferBase::Buffer & to, size_t size_decompressed, size_t size_compressed_without_checksum) +{ + readHeaderAndGetCodec(compressed_buffer, size_decompressed, codec, allow_different_codecs); + + if (codec->isNone()) + { + /// Shortcut for NONE codec to avoid extra memcpy. + /// We doing it by changing the buffer `to` to point to existing uncompressed data. + + UInt8 header_size = ICompressionCodec::getHeaderSize(); + if (size_compressed_without_checksum < header_size) + throw Exception(ErrorCodes::CORRUPTED_DATA, + "Can't decompress data: the compressed data size ({}), this should include header size) is less than the header size ({})", + size_compressed_without_checksum, size_t(header_size)); + + to = BufferBase::Buffer(compressed_buffer + header_size, compressed_buffer + size_compressed_without_checksum); + } + + codec->decompress(compressed_buffer, size_compressed_without_checksum, to.begin()); +} + + /// 'compressed_in' could be initialized lazily, but before first call of 'readCompressedData'. 
CompressedReadBufferBase::CompressedReadBufferBase(ReadBuffer * in, bool allow_different_codecs_) : compressed_in(in), own_compressed_buffer(0), allow_different_codecs(allow_different_codecs_) diff --git a/src/Compression/CompressedReadBufferBase.h b/src/Compression/CompressedReadBufferBase.h index 60b8847f639..c1e928039ef 100644 --- a/src/Compression/CompressedReadBufferBase.h +++ b/src/Compression/CompressedReadBufferBase.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB @@ -37,7 +38,12 @@ protected: /// Returns number of compressed bytes read. size_t readCompressedData(size_t & size_decompressed, size_t & size_compressed_without_checksum, bool always_copy); - void decompress(char * to, size_t size_decompressed, size_t size_compressed_without_checksum); + /// Decompress into memory pointed by `to` + void decompressTo(char * to, size_t size_decompressed, size_t size_compressed_without_checksum); + + /// This method can change location of `to` to avoid unnecessary copy if data is uncompressed. + /// It is more efficient for compression codec NONE but not suitable if you want to decompress into specific location. + void decompress(BufferBase::Buffer & to, size_t size_decompressed, size_t size_compressed_without_checksum); public: /// 'compressed_in' could be initialized lazily, but before first call of 'readCompressedData'. diff --git a/src/Compression/CompressedReadBufferFromFile.cpp b/src/Compression/CompressedReadBufferFromFile.cpp index 54f360f417b..ea12ec7e8b7 100644 --- a/src/Compression/CompressedReadBufferFromFile.cpp +++ b/src/Compression/CompressedReadBufferFromFile.cpp @@ -31,7 +31,7 @@ bool CompressedReadBufferFromFile::nextImpl() memory.resize(size_decompressed + additional_size_at_the_end_of_buffer); working_buffer = Buffer(memory.data(), &memory[size_decompressed]); - decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum); + decompress(working_buffer, size_decompressed, size_compressed_without_checksum); return true; } @@ -108,7 +108,7 @@ size_t CompressedReadBufferFromFile::readBig(char * to, size_t n) /// If the decompressed block fits entirely where it needs to be copied. if (size_decompressed + additional_size_at_the_end_of_buffer <= n - bytes_read) { - decompress(to + bytes_read, size_decompressed, size_compressed_without_checksum); + decompressTo(to + bytes_read, size_decompressed, size_compressed_without_checksum); bytes_read += size_decompressed; bytes += size_decompressed; } @@ -124,7 +124,7 @@ size_t CompressedReadBufferFromFile::readBig(char * to, size_t n) working_buffer = Buffer(memory.data(), &memory[size_decompressed]); pos = working_buffer.begin(); - decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum); + decompress(working_buffer, size_decompressed, size_compressed_without_checksum); bytes_read += read(to + bytes_read, n - bytes_read); break; From 8e445c5e5545115cda125344ef606c00bf6fa16c Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 25 Mar 2021 09:34:28 +0300 Subject: [PATCH 201/260] Disable async_socket_for_remote/use_hedged_requests for buggy linux kernels async_socket_for_remote/use_hedged_requests uses nested epolls, which may not be reliable in 5.5+ [1], but it has been fixed in shortly in 5.7+ [2] or 5.6.13+. 
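The kernel range described above could also be detected at run time instead of at build time. For illustration only (this is not what the patch does; the diff below gates on LINUX_VERSION_CODE from the kernel headers at compile time), a hypothetical Linux-only runtime probe of the running kernel might look like this:

#include <sys/utsname.h>
#include <cstdio>
#include <tuple>

/// Illustrative sketch, not part of the patch: check whether the *running* kernel
/// falls into the [5.5.0, 5.6.13) range where nested epoll_wait was known to be unreliable.
static bool runningKernelHasNestedEpollIssue()
{
    struct utsname info{};
    if (uname(&info) != 0)
        return false;   /// cannot tell, assume the kernel is fine

    unsigned major = 0, minor = 0, patch = 0;
    if (std::sscanf(info.release, "%u.%u.%u", &major, &minor, &patch) < 2)
        return false;   /// unexpected release string, assume the kernel is fine

    const auto version = std::make_tuple(major, minor, patch);
    return version >= std::make_tuple(5u, 5u, 0u) && version < std::make_tuple(5u, 6u, 13u);
}

A compile-time check is cheaper and cannot misparse vendor-specific release strings, which is presumably why the patch takes that route; the sketch above only shows the alternative.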
[1]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=339ddb53d373 [2]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0c54a6a44bf3 --- src/Core/SettingsQuirks.cpp | 62 ++++++++++++++++++++++++++++++++++++ src/Core/SettingsQuirks.h | 16 ++++++++++ src/Core/ya.make | 1 + src/Interpreters/Context.cpp | 4 +++ 4 files changed, 83 insertions(+) create mode 100644 src/Core/SettingsQuirks.cpp create mode 100644 src/Core/SettingsQuirks.h diff --git a/src/Core/SettingsQuirks.cpp b/src/Core/SettingsQuirks.cpp new file mode 100644 index 00000000000..3bf9047af3a --- /dev/null +++ b/src/Core/SettingsQuirks.cpp @@ -0,0 +1,62 @@ +#include +#include +#include + +#ifdef __linux__ +#include +#endif + +#ifdef __linux__ +/// Detect does epoll_wait with nested epoll fds works correctly. +/// Polling nested epoll fds from epoll_wait is required for async_socket_for_remote and use_hedged_requests. +/// +/// It may not be reliable in 5.5+ [1], that has been fixed in 5.7+ [2] or 5.6.13+. +/// +/// [1]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=339ddb53d373 +/// [2]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0c54a6a44bf3 +bool nestedEpollWorks(Poco::Logger * log) +{ + bool nested_epoll_works = +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 5, 0)) && (LINUX_VERSION_CODE < KERNEL_VERSION(5, 6, 13)) + /// the check is correct since there will be no more 5.5.x releases. + false +#else + true +#endif + ; + + if (!nested_epoll_works) + { + if (log) + LOG_WARNING(log, "Nested epoll_wait has some issues on kernels [5.5.0, 5.6.13). You should upgrade it to avoid possible issues."); + } + return nested_epoll_works; +} +#else +bool nestedEpollWorks(Poco::Logger *) { return true; } +#endif + +namespace DB +{ + +/// Update some settings defaults to avoid some known issues. +void applySettingsQuirks(Settings & settings, Poco::Logger * log) +{ + if (!nestedEpollWorks(log)) + { + if (!settings.async_socket_for_remote.changed && settings.async_socket_for_remote) + { + settings.async_socket_for_remote = false; + if (log) + LOG_WARNING(log, "async_socket_for_remote has been disabled (you can explicitly enable it still)"); + } + if (!settings.use_hedged_requests.changed && settings.use_hedged_requests) + { + settings.use_hedged_requests = false; + if (log) + LOG_WARNING(log, "use_hedged_requests has been disabled (you can explicitly enable it still)"); + } + } +} + +} diff --git a/src/Core/SettingsQuirks.h b/src/Core/SettingsQuirks.h new file mode 100644 index 00000000000..38def8eebf2 --- /dev/null +++ b/src/Core/SettingsQuirks.h @@ -0,0 +1,16 @@ +#pragma once + +namespace Poco +{ +class Logger; +} + +namespace DB +{ + +struct Settings; + +/// Update some settings defaults to avoid some known issues. 
+void applySettingsQuirks(Settings & settings, Poco::Logger * log = nullptr); + +} diff --git a/src/Core/ya.make b/src/Core/ya.make index 1eae848163b..004653d060e 100644 --- a/src/Core/ya.make +++ b/src/Core/ya.make @@ -36,6 +36,7 @@ SRCS( Settings.cpp SettingsEnums.cpp SettingsFields.cpp + SettingsQuirks.cpp SortDescription.cpp iostream_debug_helpers.cpp diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index b2b15fb0d25..1c757287c83 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -1100,6 +1101,7 @@ void Context::applySettingsChanges(const SettingsChanges & changes) auto lock = getLock(); for (const SettingChange & change : changes) applySettingChange(change); + applySettingsQuirks(settings); } @@ -2273,6 +2275,8 @@ void Context::setDefaultProfiles(const Poco::Util::AbstractConfiguration & confi shared->system_profile_name = config.getString("system_profile", shared->default_profile_name); setProfile(shared->system_profile_name); + applySettingsQuirks(settings, &Poco::Logger::get("SettingsQuirks")); + shared->buffer_profile_name = config.getString("buffer_profile", shared->system_profile_name); buffer_context = std::make_shared(*this); buffer_context->setProfile(shared->buffer_profile_name); From 48fe30e5a28592f326a38d8d8261e782a7b0ba7d Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 25 Mar 2021 23:41:03 +0300 Subject: [PATCH 202/260] Add missing logging for exception in InterserverIOHTTPHandler --- src/Server/InterserverIOHTTPHandler.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Server/InterserverIOHTTPHandler.cpp b/src/Server/InterserverIOHTTPHandler.cpp index 740072e8e9f..7d8dfaaf2c8 100644 --- a/src/Server/InterserverIOHTTPHandler.cpp +++ b/src/Server/InterserverIOHTTPHandler.cpp @@ -107,6 +107,7 @@ void InterserverIOHTTPHandler::handleRequest(HTTPServerRequest & request, HTTPSe } catch (...) { + tryLogCurrentException(log); out.finalize(); } }; From e1de9600253c741e0d8f4b15c5c64dab658544b0 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 25 Mar 2021 23:41:03 +0300 Subject: [PATCH 203/260] Fix uncaught exception in InterserverIOHTTPHandler There was one more uncaught exception case [1]: 2021.03.19 18:11:00.845632 [ 17469 ] {} InterserverIOHTTPHandler: Done processing query ... 2021.03.19 18:11:31.698961 [ 80145 ] {} BaseDaemon: ######################################## 2021.03.19 18:11:31.699903 [ 80145 ] {} BaseDaemon: (version 21.4.1.6293 (official build), build id: 859E400E1C65C4702FE491420741DD8B58190002) (from thread 17469) (no query) Received signal Aborted (6) 2021.03.19 18:11:32.614075 [ 80145 ] {} BaseDaemon: 8. ./obj-x86_64-linux-gnu/../contrib/libcxxabi/src/cxa_handlers.cpp:89: std::terminate() @ 0x21e9b3a2 in /usr/bin/clickhouse 2021.03.19 18:11:43.831215 [ 80145 ] {} BaseDaemon: 10. ./obj-x86_64-linux-gnu/../src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp:201: ? @ 0x1be77038 in /usr/bin/clickhouse 2021.03.19 18:11:44.743193 [ 80145 ] {} BaseDaemon: 11. ./obj-x86_64-linux-gnu/../contrib/libcxx/include/memory:892: std::__1::allocator::destroy(DB::WriteBufferFromHTTPServerResponse*) @ 0x1bddd7c9 in /usr/bin/clickhouse 2021.03.19 18:11:45.283905 [ 80145 ] {} BaseDaemon: 12. 
./obj-x86_64-linux-gnu/../contrib/libcxx/include/__memory/allocator_traits.h:541: void std::__1::allocator_traits >::__destroy(std::__1::integral_constant, std::__1::allocator&, DB::WriteBufferFromHTTPServerResponse*) @ 0x1bddd79d in /usr/bin/clickhouse 2021.03.19 18:11:45.805233 [ 80145 ] {} BaseDaemon: 13. ./obj-x86_64-linux-gnu/../contrib/libcxx/include/__memory/allocator_traits.h:487: void std::__1::allocator_traits >::destroy(std::__1::allocator&, DB::WriteBufferFromHTTPServerResponse*) @ 0x1bddd76d in /usr/bin/clickhouse 2021.03.19 18:11:46.351371 [ 80145 ] {} BaseDaemon: 14. ./obj-x86_64-linux-gnu/../contrib/libcxx/include/memory:2611: std::__1::__shared_ptr_emplace >::__on_zero_shared() @ 0x1bddd525 in /usr/bin/clickhouse 2021.03.19 18:11:46.579263 [ 80145 ] {} BaseDaemon: 15. ./obj-x86_64-linux-gnu/../contrib/libcxx/include/memory:2476: std::__1::__shared_count::__release_shared() @ 0x119490ed in /usr/bin/clickhouse 2021.03.19 18:11:46.790912 [ 80145 ] {} BaseDaemon: 16. ./obj-x86_64-linux-gnu/../contrib/libcxx/include/memory:2517: std::__1::__shared_weak_count::__release_shared() @ 0x1194908f in /usr/bin/clickhouse 2021.03.19 18:11:47.277990 [ 80145 ] {} BaseDaemon: 17. ./obj-x86_64-linux-gnu/../contrib/libcxx/include/memory:3213: std::__1::shared_ptr::~shared_ptr() @ 0x1bdd75fc in /usr/bin/clickhouse 2021.03.19 18:11:47.649213 [ 80145 ] {} BaseDaemon: 18. ./obj-x86_64-linux-gnu/../src/Server/InterserverIOHTTPHandler.h:34: DB::InterserverIOHTTPHandler::Output::~Output() @ 0x1bdf6bd5 in /usr/bin/clickhouse 2021.03.19 18:11:47.921556 [ 80145 ] {} BaseDaemon: 19. ./obj-x86_64-linux-gnu/../src/Server/InterserverIOHTTPHandler.cpp:154: DB::InterserverIOHTTPHandler::handleRequest(DB::HTTPServerRequest&, DB::HTTPServerResponse&) @ 0x1bdf653f in /usr/bin/clickhouse [1]: https://clickhouse-test-reports.s3.yandex.net/0/78c56b891383288cf3a893139e796fc87476412e/stress_test_(debug).html Since in case of no errors during processing we should call finalize, to ensure that it will not be called from dtor. Fixes: #22046 Fixes: #22067 --- src/Server/InterserverIOHTTPHandler.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Server/InterserverIOHTTPHandler.cpp b/src/Server/InterserverIOHTTPHandler.cpp index 7d8dfaaf2c8..426e4ca2138 100644 --- a/src/Server/InterserverIOHTTPHandler.cpp +++ b/src/Server/InterserverIOHTTPHandler.cpp @@ -117,6 +117,7 @@ void InterserverIOHTTPHandler::handleRequest(HTTPServerRequest & request, HTTPSe if (auto [message, success] = checkAuthentication(request); success) { processQuery(request, response, used_output); + used_output.out->finalize(); LOG_DEBUG(log, "Done processing query"); } else From 50003e496a1551a49733b3c67b1b0fb4939af54b Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 25 Mar 2021 23:41:03 +0300 Subject: [PATCH 204/260] Use existing logger for logging from WriteBufferFromS3 dtor --- src/IO/WriteBufferFromS3.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 7373b24991a..93aaf9456b5 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -120,7 +120,7 @@ WriteBufferFromS3::~WriteBufferFromS3() } catch (...) 
{ - tryLogCurrentException(__PRETTY_FUNCTION__); + tryLogCurrentException(log); } } From f1907acbcd96b3b5ce0e6073e7faa44479c3221b Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 25 Mar 2021 23:42:18 +0300 Subject: [PATCH 205/260] Use finalize() over next() for nested writers Refs: https://github.com/ClickHouse/ClickHouse/pull/21325#discussion_r585348309 --- src/IO/BrotliWriteBuffer.cpp | 2 +- src/IO/LZMADeflatingWriteBuffer.cpp | 2 +- src/IO/ZlibDeflatingWriteBuffer.cpp | 2 +- src/IO/ZstdDeflatingWriteBuffer.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/IO/BrotliWriteBuffer.cpp b/src/IO/BrotliWriteBuffer.cpp index e87eeb1a2be..512ed5fc93f 100644 --- a/src/IO/BrotliWriteBuffer.cpp +++ b/src/IO/BrotliWriteBuffer.cpp @@ -106,7 +106,7 @@ void BrotliWriteBuffer::finish() try { finishImpl(); - out->next(); + out->finalize(); finished = true; } catch (...) diff --git a/src/IO/LZMADeflatingWriteBuffer.cpp b/src/IO/LZMADeflatingWriteBuffer.cpp index 96f1d34b01b..7ea4f7945dc 100644 --- a/src/IO/LZMADeflatingWriteBuffer.cpp +++ b/src/IO/LZMADeflatingWriteBuffer.cpp @@ -105,7 +105,7 @@ void LZMADeflatingWriteBuffer::finish() try { finishImpl(); - out->next(); + out->finalize(); finished = true; } catch (...) diff --git a/src/IO/ZlibDeflatingWriteBuffer.cpp b/src/IO/ZlibDeflatingWriteBuffer.cpp index 5da82b52279..7e91820f298 100644 --- a/src/IO/ZlibDeflatingWriteBuffer.cpp +++ b/src/IO/ZlibDeflatingWriteBuffer.cpp @@ -107,7 +107,7 @@ void ZlibDeflatingWriteBuffer::finish() try { finishImpl(); - out->next(); + out->finalize(); finished = true; } catch (...) diff --git a/src/IO/ZstdDeflatingWriteBuffer.cpp b/src/IO/ZstdDeflatingWriteBuffer.cpp index 27694797db6..5b97588b33e 100644 --- a/src/IO/ZstdDeflatingWriteBuffer.cpp +++ b/src/IO/ZstdDeflatingWriteBuffer.cpp @@ -94,7 +94,7 @@ void ZstdDeflatingWriteBuffer::finish() try { finishImpl(); - out->next(); + out->finalize(); finished = true; } catch (...) 
From 30cd1c614514973d88d8475daf88fa5bd6ae5a04 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sun, 21 Mar 2021 10:58:31 +0300 Subject: [PATCH 206/260] Fix typo in FirstSignificantSubdomainCustomLookup name --- src/Functions/URL/FirstSignificantSubdomainCustomImpl.h | 8 ++++---- .../URL/cutToFirstSignificantSubdomainCustom.cpp | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h b/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h index 244b32459c1..d6868834f75 100644 --- a/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h +++ b/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h @@ -17,10 +17,10 @@ namespace ErrorCodes extern const int ILLEGAL_TYPE_OF_ARGUMENT; } -struct FirstSignificantSubdomainCustomtLookup +struct FirstSignificantSubdomainCustomLookup { const TLDList & tld_list; - FirstSignificantSubdomainCustomtLookup(const std::string & tld_list_name) + FirstSignificantSubdomainCustomLookup(const std::string & tld_list_name) : tld_list(TLDListsHolder::getInstance().getTldList(tld_list_name)) { } @@ -63,7 +63,7 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override { const ColumnConst * column_tld_list_name = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); - FirstSignificantSubdomainCustomtLookup tld_lookup(column_tld_list_name->getValue()); + FirstSignificantSubdomainCustomLookup tld_lookup(column_tld_list_name->getValue()); /// FIXME: convertToFullColumnIfConst() is suboptimal auto column = arguments[0].column->convertToFullColumnIfConst(); @@ -79,7 +79,7 @@ public: ErrorCodes::ILLEGAL_COLUMN); } - static void vector(FirstSignificantSubdomainCustomtLookup & tld_lookup, + static void vector(FirstSignificantSubdomainCustomLookup & tld_lookup, const ColumnString::Chars & data, const ColumnString::Offsets & offsets, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { diff --git a/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp b/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp index 11fd27e317b..88d8fc7704e 100644 --- a/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp +++ b/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp @@ -10,7 +10,7 @@ struct CutToFirstSignificantSubdomainCustom { static size_t getReserveLengthForElement() { return 15; } - static void execute(FirstSignificantSubdomainCustomtLookup & tld_lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size) + static void execute(FirstSignificantSubdomainCustomLookup & tld_lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size) { res_data = data; res_size = 0; From b68517f69ea0b7dd54716734bb1bf33f55c5956a Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sun, 21 Mar 2021 10:55:28 +0300 Subject: [PATCH 207/260] Fix cutToFirstSignificantSubdomainCustom()/firstSignificantSubdomainCustom() for 3+level domains Custom TLD lists (added in #17748), may contain domain of the 3-d level, however builtin TLD lists does not have such records, so it is not affected. Note that this will significantly increase hashtable lookups. 
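To make the extra hashtable lookups mentioned above concrete: because a custom list may contain suffixes of any level, every suffix of the host that begins after a dot has to be probed, longest first, until one is found in the list. Below is a minimal standalone sketch of that scan; the function name, the std::unordered_set representation of the list and the toy data in the trailing comment are assumptions made for illustration, while the real executeCustom() in the diff that follows works on raw character ranges against a TLDList.

#include <optional>
#include <string>
#include <string_view>
#include <unordered_set>

/// Simplified sketch: return the longest suffix of `host` that starts right after a dot
/// and is present in the custom TLD set. Scanning from the leftmost dot means longer
/// suffixes are tried before shorter ones.
std::optional<std::string_view> findListedSuffix(
    std::string_view host, const std::unordered_set<std::string> & tld_set)
{
    for (size_t dot = host.find('.'); dot != std::string_view::npos; dot = host.find('.', dot + 1))
    {
        std::string_view suffix = host.substr(dot + 1);
        if (tld_set.count(std::string(suffix)))
            return suffix;
    }
    return std::nullopt;   /// nothing listed: the caller falls back to the 2nd-level domain
}

/// With a toy set {"co.at"}, findListedSuffix("xx.blogspot.co.at", ...) first tries
/// "blogspot.co.at", does not find it, and then returns "co.at"; under that toy list the
/// label just to its left ("blogspot") would be treated as the first significant subdomain.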
Fixes: #17748 --- .../URL/ExtractFirstSignificantSubdomain.h | 65 ++++++++++++++++++- .../cutToFirstSignificantSubdomainCustom.cpp | 2 +- .../0_stateless/01601_custom_tld.reference | 19 +++++- .../queries/0_stateless/01601_custom_tld.sql | 25 +++++-- 4 files changed, 101 insertions(+), 10 deletions(-) diff --git a/src/Functions/URL/ExtractFirstSignificantSubdomain.h b/src/Functions/URL/ExtractFirstSignificantSubdomain.h index c13b5f50156..974574058e9 100644 --- a/src/Functions/URL/ExtractFirstSignificantSubdomain.h +++ b/src/Functions/URL/ExtractFirstSignificantSubdomain.h @@ -90,7 +90,70 @@ struct ExtractFirstSignificantSubdomain res_data += last_3_periods[1] + 1 - begin; res_size = last_3_periods[0] - last_3_periods[1] - 1; } - } + } + + /// The difference with execute() is due to custom TLD list can have records of any level, + /// not only 2-nd level (like non-custom variant), so it requires more lookups. + template + static void executeCustom(const Lookup & lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr) + { + res_data = data; + res_size = 0; + + Pos tmp; + size_t domain_length; + ExtractDomain::execute(data, size, tmp, domain_length); + + if (domain_length == 0) + return; + + if (out_domain_end) + *out_domain_end = tmp + domain_length; + + /// cut useless dot + if (tmp[domain_length - 1] == '.') + --domain_length; + + res_data = tmp; + res_size = domain_length; + + auto begin = tmp; + auto end = begin + domain_length; + const char * last_2_periods[2]{}; + const char * prev = begin - 1; + + auto pos = find_first_symbols<'.'>(begin, end); + while (pos < end) + { + if (lookup(pos + 1, end - pos - 1)) + { + res_data += prev + 1 - begin; + res_size = end - 1 - prev; + return; + } + + last_2_periods[1] = last_2_periods[0]; + last_2_periods[0] = pos; + prev = pos; + pos = find_first_symbols<'.'>(pos + 1, end); + } + + /// if there is domain of the first level (i.e. 
no dots in the hostname) -> return nothing + if (!last_2_periods[0]) + return; + + /// if there is domain of the second level -> always return itself + if (!last_2_periods[1]) + { + res_size = last_2_periods[0] - begin; + return; + } + + /// if there is domain of the 3+ level, and zero records in TLD list -> + /// fallback to domain of the second level + res_data += last_2_periods[1] + 1 - begin; + res_size = last_2_periods[0] - last_2_periods[1] - 1; + } }; } diff --git a/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp b/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp index 88d8fc7704e..7532ddd00f2 100644 --- a/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp +++ b/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp @@ -18,7 +18,7 @@ struct CutToFirstSignificantSubdomainCustom Pos tmp_data; size_t tmp_length; Pos domain_end; - ExtractFirstSignificantSubdomain::execute(tld_lookup, data, size, tmp_data, tmp_length, &domain_end); + ExtractFirstSignificantSubdomain::executeCustom(tld_lookup, data, size, tmp_data, tmp_length, &domain_end); if (tmp_length == 0) return; diff --git a/tests/queries/0_stateless/01601_custom_tld.reference b/tests/queries/0_stateless/01601_custom_tld.reference index 98b99778396..e056505f273 100644 --- a/tests/queries/0_stateless/01601_custom_tld.reference +++ b/tests/queries/0_stateless/01601_custom_tld.reference @@ -1,11 +1,24 @@ -no-tld +-- no-tld + +foo.there-is-no-such-domain +foo.there-is-no-such-domain foo.there-is-no-such-domain foo.there-is-no-such-domain foo -generic +-- generic kernel kernel.biz.ss -difference +-- difference biz.ss kernel.biz.ss +-- 3+level +xx.blogspot.co.at +blogspot +xx.blogspot.co.at +blogspot +-- url +foobar.com +foobar.com +foobar.com +xx.blogspot.co.at diff --git a/tests/queries/0_stateless/01601_custom_tld.sql b/tests/queries/0_stateless/01601_custom_tld.sql index 6d68299c07d..688dd419858 100644 --- a/tests/queries/0_stateless/01601_custom_tld.sql +++ b/tests/queries/0_stateless/01601_custom_tld.sql @@ -1,16 +1,31 @@ -select 'no-tld'; -select cutToFirstSignificantSubdomainCustom('there-is-no-such-domain', 'public_suffix_list'); +select '-- no-tld'; -- even if there is no TLD, 2-nd level by default anyway -- FIXME: make this behavior optional (so that TLD for host never changed, either empty or something real) +select cutToFirstSignificantSubdomain('there-is-no-such-domain'); +select cutToFirstSignificantSubdomain('foo.there-is-no-such-domain'); +select cutToFirstSignificantSubdomain('bar.foo.there-is-no-such-domain'); +select cutToFirstSignificantSubdomainCustom('there-is-no-such-domain', 'public_suffix_list'); select cutToFirstSignificantSubdomainCustom('foo.there-is-no-such-domain', 'public_suffix_list'); select cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list'); select firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list'); -select 'generic'; -select firstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss +select '-- generic'; +select firstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss -select 'difference'; +select '-- difference'; -- biz.ss is not in the default TLD list, hence: select cutToFirstSignificantSubdomain('foo.kernel.biz.ss'); -- biz.ss select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- 
kernel.biz.ss + +select '-- 3+level'; +select cutToFirstSignificantSubdomainCustom('xx.blogspot.co.at', 'public_suffix_list'); -- xx.blogspot.co.at +select firstSignificantSubdomainCustom('xx.blogspot.co.at', 'public_suffix_list'); -- blogspot +select cutToFirstSignificantSubdomainCustom('foo.bar.xx.blogspot.co.at', 'public_suffix_list'); -- xx.blogspot.co.at +select firstSignificantSubdomainCustom('foo.bar.xx.blogspot.co.at', 'public_suffix_list'); -- blogspot + +select '-- url'; +select cutToFirstSignificantSubdomainCustom('http://foobar.com', 'public_suffix_list'); +select cutToFirstSignificantSubdomainCustom('http://foobar.com/foo', 'public_suffix_list'); +select cutToFirstSignificantSubdomainCustom('http://bar.foobar.com/foo', 'public_suffix_list'); +select cutToFirstSignificantSubdomainCustom('http://xx.blogspot.co.at', 'public_suffix_list'); From a12cc5f559682dee90bfe8703d92b84ec5e9b157 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Fri, 26 Mar 2021 00:00:18 +0300 Subject: [PATCH 208/260] adjust perf test thresholds --- tests/performance/arithmetic.xml | 2 +- tests/performance/array_join.xml | 2 +- tests/performance/bounding_ratio.xml | 2 +- tests/performance/codecs_float_insert.xml | 2 +- tests/performance/codecs_int_insert.xml | 2 +- tests/performance/conditional.xml | 2 +- tests/performance/constant_column_search.xml | 2 +- tests/performance/date_time_64.xml | 2 +- tests/performance/date_time_long.xml | 2 +- tests/performance/direct_dictionary.xml | 2 +- tests/performance/float_formatting.xml | 2 +- tests/performance/fuzz_bits.xml | 2 +- tests/performance/general_purpose_hashes.xml | 2 +- tests/performance/generate_table_function.xml | 2 +- tests/performance/group_by_sundy_li.xml | 2 +- tests/performance/if_array_string.xml | 2 +- tests/performance/int_parsing.xml | 2 +- tests/performance/jit_small_requests.xml | 2 +- tests/performance/joins_in_memory.xml | 2 +- tests/performance/joins_in_memory_pmj.xml | 2 +- tests/performance/logical_functions_medium.xml | 2 +- tests/performance/logical_functions_small.xml | 2 +- tests/performance/math.xml | 2 +- tests/performance/optimized_select_final.xml | 2 +- tests/performance/optimized_select_final_one_part.xml | 2 +- tests/performance/or_null_default.xml | 2 +- tests/performance/parse_engine_file.xml | 2 +- tests/performance/random_string.xml | 2 +- tests/performance/sum.xml | 2 +- tests/performance/sum_map.xml | 2 +- tests/performance/url_hits.xml | 2 +- 31 files changed, 31 insertions(+), 31 deletions(-) diff --git a/tests/performance/arithmetic.xml b/tests/performance/arithmetic.xml index 0be61eb5823..bf5e7662e37 100644 --- a/tests/performance/arithmetic.xml +++ b/tests/performance/arithmetic.xml @@ -1,4 +1,4 @@ - + 30000000000 diff --git a/tests/performance/array_join.xml b/tests/performance/array_join.xml index ca280ce28ad..cf92b51f545 100644 --- a/tests/performance/array_join.xml +++ b/tests/performance/array_join.xml @@ -1,4 +1,4 @@ - + diff --git a/tests/performance/bounding_ratio.xml b/tests/performance/bounding_ratio.xml index e3a15f90013..e430136b624 100644 --- a/tests/performance/bounding_ratio.xml +++ b/tests/performance/bounding_ratio.xml @@ -1,4 +1,4 @@ - + SELECT boundingRatio(number, number) FROM numbers(100000000) SELECT (argMax(number, number) - argMin(number, number)) / (max(number) - min(number)) FROM numbers(100000000) diff --git a/tests/performance/codecs_float_insert.xml b/tests/performance/codecs_float_insert.xml index a7cb5152c09..b282bcc268f 100644 --- a/tests/performance/codecs_float_insert.xml +++ 
b/tests/performance/codecs_float_insert.xml @@ -1,5 +1,5 @@ - + 1 diff --git a/tests/performance/codecs_int_insert.xml b/tests/performance/codecs_int_insert.xml index caefaba3725..662df80ae70 100644 --- a/tests/performance/codecs_int_insert.xml +++ b/tests/performance/codecs_int_insert.xml @@ -1,4 +1,4 @@ - + 1 diff --git a/tests/performance/conditional.xml b/tests/performance/conditional.xml index 21623f45b05..91b6cb95ff2 100644 --- a/tests/performance/conditional.xml +++ b/tests/performance/conditional.xml @@ -1,4 +1,4 @@ - + SELECT count() FROM zeros(10000000) WHERE NOT ignore(if(rand() % 2, toDateTime('2019-02-04 01:24:31'), toDate('2019-02-04'))) SELECT count() FROM zeros(10000000) WHERE NOT ignore(multiIf(rand() % 2, toDateTime('2019-02-04 01:24:31'), toDate('2019-02-04'))) SELECT count() FROM zeros(10000000) WHERE NOT ignore(if(rand() % 2, [toDateTime('2019-02-04 01:24:31')], [toDate('2019-02-04')])) diff --git a/tests/performance/constant_column_search.xml b/tests/performance/constant_column_search.xml index cb76fd4cefb..71d8185d818 100644 --- a/tests/performance/constant_column_search.xml +++ b/tests/performance/constant_column_search.xml @@ -1,4 +1,4 @@ - + search diff --git a/tests/performance/date_time_64.xml b/tests/performance/date_time_64.xml index 838aba34d87..fd883416a33 100644 --- a/tests/performance/date_time_64.xml +++ b/tests/performance/date_time_64.xml @@ -1,4 +1,4 @@ - + hits_100m_single diff --git a/tests/performance/date_time_long.xml b/tests/performance/date_time_long.xml index 0c3d85f9659..c2eb42d3318 100644 --- a/tests/performance/date_time_long.xml +++ b/tests/performance/date_time_long.xml @@ -1,4 +1,4 @@ - + datetime_transform diff --git a/tests/performance/direct_dictionary.xml b/tests/performance/direct_dictionary.xml index eb1b4e0da00..cd9aa73a128 100644 --- a/tests/performance/direct_dictionary.xml +++ b/tests/performance/direct_dictionary.xml @@ -1,4 +1,4 @@ - + CREATE TABLE simple_direct_dictionary_test_table ( diff --git a/tests/performance/float_formatting.xml b/tests/performance/float_formatting.xml index d24ccd7664c..71d8aee3f89 100644 --- a/tests/performance/float_formatting.xml +++ b/tests/performance/float_formatting.xml @@ -3,7 +3,7 @@ is 10 times faster than toString(number % 100 + 0.5). The shorter queries are somewhat unstable, so ignore differences less than 10%. 
--> - + expr diff --git a/tests/performance/fuzz_bits.xml b/tests/performance/fuzz_bits.xml index 2679977cb1d..87064e520c2 100644 --- a/tests/performance/fuzz_bits.xml +++ b/tests/performance/fuzz_bits.xml @@ -1,4 +1,4 @@ - + diff --git a/tests/performance/general_purpose_hashes.xml b/tests/performance/general_purpose_hashes.xml index bd2fa9674f6..f34554360cf 100644 --- a/tests/performance/general_purpose_hashes.xml +++ b/tests/performance/general_purpose_hashes.xml @@ -1,4 +1,4 @@ - + gp_hash_func diff --git a/tests/performance/generate_table_function.xml b/tests/performance/generate_table_function.xml index bc49a7de1bd..0339a8c19e8 100644 --- a/tests/performance/generate_table_function.xml +++ b/tests/performance/generate_table_function.xml @@ -1,4 +1,4 @@ - + SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('ui64 UInt64, i64 Int64, ui32 UInt32, i32 Int32, ui16 UInt16, i16 Int16, ui8 UInt8, i8 Int8') LIMIT 1000000000); SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('ui64 UInt64, i64 Int64, ui32 UInt32, i32 Int32, ui16 UInt16, i16 Int16, ui8 UInt8, i8 Int8', 0, 10, 10) LIMIT 1000000000); SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('i Enum8(\'hello\' = 1, \'world\' = 5)', 0, 10, 10) LIMIT 1000000000); diff --git a/tests/performance/group_by_sundy_li.xml b/tests/performance/group_by_sundy_li.xml index c49712a8519..aebc305335c 100644 --- a/tests/performance/group_by_sundy_li.xml +++ b/tests/performance/group_by_sundy_li.xml @@ -1,4 +1,4 @@ - + 8 diff --git a/tests/performance/if_array_string.xml b/tests/performance/if_array_string.xml index 445b3c8c55a..5d33bfda51f 100644 --- a/tests/performance/if_array_string.xml +++ b/tests/performance/if_array_string.xml @@ -1,4 +1,4 @@ - + SELECT count() FROM zeros(10000000) WHERE NOT ignore(rand() % 2 ? ['Hello', 'World'] : ['a', 'b', 'c']) SELECT count() FROM zeros(10000000) WHERE NOT ignore(rand() % 2 ? materialize(['Hello', 'World']) : ['a', 'b', 'c']) SELECT count() FROM zeros(10000000) WHERE NOT ignore(rand() % 2 ? 
['Hello', 'World'] : materialize(['a', 'b', 'c'])) diff --git a/tests/performance/int_parsing.xml b/tests/performance/int_parsing.xml index 3b8620e46c3..32f904331ce 100644 --- a/tests/performance/int_parsing.xml +++ b/tests/performance/int_parsing.xml @@ -1,4 +1,4 @@ - + hits_100m_single hits_10m_single diff --git a/tests/performance/jit_small_requests.xml b/tests/performance/jit_small_requests.xml index c9abec0926b..d8f917fb9af 100644 --- a/tests/performance/jit_small_requests.xml +++ b/tests/performance/jit_small_requests.xml @@ -1,4 +1,4 @@ - + WITH bitXor(number, 0x4CF2D2BAAE6DA887) AS x0, diff --git a/tests/performance/joins_in_memory.xml b/tests/performance/joins_in_memory.xml index bac7679930f..fac6f2659c6 100644 --- a/tests/performance/joins_in_memory.xml +++ b/tests/performance/joins_in_memory.xml @@ -1,4 +1,4 @@ - + CREATE TABLE ints (i64 Int64, i32 Int32, i16 Int16, i8 Int8) ENGINE = Memory INSERT INTO ints SELECT number AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(10000) diff --git a/tests/performance/joins_in_memory_pmj.xml b/tests/performance/joins_in_memory_pmj.xml index 5dd4395513d..87d1c0df14c 100644 --- a/tests/performance/joins_in_memory_pmj.xml +++ b/tests/performance/joins_in_memory_pmj.xml @@ -1,4 +1,4 @@ - + CREATE TABLE ints (i64 Int64, i32 Int32, i16 Int16, i8 Int8) ENGINE = Memory diff --git a/tests/performance/logical_functions_medium.xml b/tests/performance/logical_functions_medium.xml index be474894b54..19572191532 100644 --- a/tests/performance/logical_functions_medium.xml +++ b/tests/performance/logical_functions_medium.xml @@ -1,4 +1,4 @@ - + 1 diff --git a/tests/performance/logical_functions_small.xml b/tests/performance/logical_functions_small.xml index 3d70ef6811d..d5f6a7b99cb 100644 --- a/tests/performance/logical_functions_small.xml +++ b/tests/performance/logical_functions_small.xml @@ -1,4 +1,4 @@ - + 1 diff --git a/tests/performance/math.xml b/tests/performance/math.xml index 006e33548c9..35250351683 100644 --- a/tests/performance/math.xml +++ b/tests/performance/math.xml @@ -1,4 +1,4 @@ - + func_slow diff --git a/tests/performance/optimized_select_final.xml b/tests/performance/optimized_select_final.xml index 2c8254d2b88..d70fccc1330 100644 --- a/tests/performance/optimized_select_final.xml +++ b/tests/performance/optimized_select_final.xml @@ -1,4 +1,4 @@ - + 1 diff --git a/tests/performance/optimized_select_final_one_part.xml b/tests/performance/optimized_select_final_one_part.xml index 92c8eed859a..63541313ac9 100644 --- a/tests/performance/optimized_select_final_one_part.xml +++ b/tests/performance/optimized_select_final_one_part.xml @@ -1,4 +1,4 @@ - + 1 diff --git a/tests/performance/or_null_default.xml b/tests/performance/or_null_default.xml index 6fed0cce4d6..009719f66a5 100644 --- a/tests/performance/or_null_default.xml +++ b/tests/performance/or_null_default.xml @@ -1,4 +1,4 @@ - + SELECT sumOrNull(number) FROM numbers(100000000) SELECT sumOrDefault(toNullable(number)) FROM numbers(100000000) SELECT sumOrNull(number) FROM numbers(10000000) GROUP BY number % 1024 diff --git a/tests/performance/parse_engine_file.xml b/tests/performance/parse_engine_file.xml index 2459ed084cd..d49670b36b5 100644 --- a/tests/performance/parse_engine_file.xml +++ b/tests/performance/parse_engine_file.xml @@ -1,4 +1,4 @@ - + test.hits diff --git a/tests/performance/random_string.xml b/tests/performance/random_string.xml index 1a740ae077a..79f12373f1c 100644 --- a/tests/performance/random_string.xml +++ b/tests/performance/random_string.xml @@ -1,4 +1,4 
@@ - + SELECT count() FROM zeros(100000000) WHERE NOT ignore(randomString(10)) SELECT count() FROM zeros(100000000) WHERE NOT ignore(randomString(100)) SELECT count() FROM zeros(1000000) WHERE NOT ignore(randomString(1000)) diff --git a/tests/performance/sum.xml b/tests/performance/sum.xml index 32c194dab6f..9bee2a580c3 100644 --- a/tests/performance/sum.xml +++ b/tests/performance/sum.xml @@ -1,4 +1,4 @@ - + SELECT sum(number) FROM numbers(100000000) SELECT sum(toUInt32(number)) FROM numbers(100000000) SELECT sum(toUInt16(number)) FROM numbers(100000000) diff --git a/tests/performance/sum_map.xml b/tests/performance/sum_map.xml index bc9f9be2a18..b732c150220 100644 --- a/tests/performance/sum_map.xml +++ b/tests/performance/sum_map.xml @@ -1,4 +1,4 @@ - + 1 diff --git a/tests/performance/url_hits.xml b/tests/performance/url_hits.xml index a699ef6ba97..1813b2a72cb 100644 --- a/tests/performance/url_hits.xml +++ b/tests/performance/url_hits.xml @@ -1,4 +1,4 @@ - + hits_100m_single hits_10m_single From f3ca9db832d7ce6d92c49700fc3a944ed31cd817 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Fri, 26 Mar 2021 01:00:06 +0300 Subject: [PATCH 209/260] forgot abs() --- tests/performance/collations.xml | 2 +- tests/performance/direct_dictionary.xml | 2 +- tests/performance/float_parsing.xml | 2 +- tests/performance/if_array_string.xml | 2 +- tests/performance/synthetic_hardware_benchmark.xml | 2 +- tests/performance/visit_param_extract_raw.xml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/performance/collations.xml b/tests/performance/collations.xml index 17b2d36b7e3..52ccede3798 100644 --- a/tests/performance/collations.xml +++ b/tests/performance/collations.xml @@ -1,4 +1,4 @@ - + diff --git a/tests/performance/direct_dictionary.xml b/tests/performance/direct_dictionary.xml index cd9aa73a128..68b52d917dd 100644 --- a/tests/performance/direct_dictionary.xml +++ b/tests/performance/direct_dictionary.xml @@ -1,4 +1,4 @@ - + CREATE TABLE simple_direct_dictionary_test_table ( diff --git a/tests/performance/float_parsing.xml b/tests/performance/float_parsing.xml index 33ab8ba6f10..eb8577bd127 100644 --- a/tests/performance/float_parsing.xml +++ b/tests/performance/float_parsing.xml @@ -1,4 +1,4 @@ - + expr diff --git a/tests/performance/if_array_string.xml b/tests/performance/if_array_string.xml index 5d33bfda51f..773509e1c4b 100644 --- a/tests/performance/if_array_string.xml +++ b/tests/performance/if_array_string.xml @@ -1,4 +1,4 @@ - + SELECT count() FROM zeros(10000000) WHERE NOT ignore(rand() % 2 ? ['Hello', 'World'] : ['a', 'b', 'c']) SELECT count() FROM zeros(10000000) WHERE NOT ignore(rand() % 2 ? materialize(['Hello', 'World']) : ['a', 'b', 'c']) SELECT count() FROM zeros(10000000) WHERE NOT ignore(rand() % 2 ? 
['Hello', 'World'] : materialize(['a', 'b', 'c'])) diff --git a/tests/performance/synthetic_hardware_benchmark.xml b/tests/performance/synthetic_hardware_benchmark.xml index 4b94f73a21d..ffcf30db5cb 100644 --- a/tests/performance/synthetic_hardware_benchmark.xml +++ b/tests/performance/synthetic_hardware_benchmark.xml @@ -1,4 +1,4 @@ - + 30000000000 diff --git a/tests/performance/visit_param_extract_raw.xml b/tests/performance/visit_param_extract_raw.xml index 67faeb1f743..358dcc9cc0e 100644 --- a/tests/performance/visit_param_extract_raw.xml +++ b/tests/performance/visit_param_extract_raw.xml @@ -1,4 +1,4 @@ - + param From ddbd95be2d2ca8acba467c88b73afaba2577b121 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Fri, 26 Mar 2021 01:55:42 +0300 Subject: [PATCH 210/260] Suggestion from @l1tsolaiki --- docs/en/faq/integration/json-import.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/faq/integration/json-import.md b/docs/en/faq/integration/json-import.md index 7038cc539d2..3fa026c794a 100644 --- a/docs/en/faq/integration/json-import.md +++ b/docs/en/faq/integration/json-import.md @@ -19,7 +19,7 @@ $ echo '{"foo":"bar"}' | curl 'http://localhost:8123/?query=INSERT%20INTO%20test Using [CLI interface](../../interfaces/cli.md): ``` bash -$ echo '{"foo":"bar"}' | clickhouse-client ---query="INSERT INTO test FORMAT JSONEachRow" +$ echo '{"foo":"bar"}' | clickhouse-client --query="INSERT INTO test FORMAT JSONEachRow" ``` Instead of inserting data manually, you might consider to use one of [client libraries](../../interfaces/index.md) instead. From ad66c4a91609b53b46596bd37527fbf1493908c1 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 26 Mar 2021 01:56:24 +0300 Subject: [PATCH 211/260] Fix error --- src/Compression/CompressedReadBuffer.cpp | 2 +- src/Compression/CompressedReadBufferFromFile.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Compression/CompressedReadBuffer.cpp b/src/Compression/CompressedReadBuffer.cpp index 6393723acfd..78241ec1b69 100644 --- a/src/Compression/CompressedReadBuffer.cpp +++ b/src/Compression/CompressedReadBuffer.cpp @@ -61,9 +61,9 @@ size_t CompressedReadBuffer::readBig(char * to, size_t n) memory.resize(size_decompressed + additional_size_at_the_end_of_buffer); working_buffer = Buffer(memory.data(), &memory[size_decompressed]); - pos = working_buffer.begin(); decompress(working_buffer, size_decompressed, size_compressed_without_checksum); + pos = working_buffer.begin(); bytes_read += read(to + bytes_read, n - bytes_read); break; diff --git a/src/Compression/CompressedReadBufferFromFile.cpp b/src/Compression/CompressedReadBufferFromFile.cpp index ea12ec7e8b7..3a75ea14166 100644 --- a/src/Compression/CompressedReadBufferFromFile.cpp +++ b/src/Compression/CompressedReadBufferFromFile.cpp @@ -122,9 +122,9 @@ size_t CompressedReadBufferFromFile::readBig(char * to, size_t n) memory.resize(size_decompressed + additional_size_at_the_end_of_buffer); working_buffer = Buffer(memory.data(), &memory[size_decompressed]); - pos = working_buffer.begin(); decompress(working_buffer, size_decompressed, size_compressed_without_checksum); + pos = working_buffer.begin(); bytes_read += read(to + bytes_read, n - bytes_read); break; From e55f7e63333f5400023210ac194969e17d31d9de Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 26 Mar 2021 02:21:59 +0300 Subject: [PATCH 212/260] Fix error --- src/Compression/CompressedReadBufferBase.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/src/Compression/CompressedReadBufferBase.cpp b/src/Compression/CompressedReadBufferBase.cpp index 65ba9607468..eb4d6ea5986 100644 --- a/src/Compression/CompressedReadBufferBase.cpp +++ b/src/Compression/CompressedReadBufferBase.cpp @@ -237,8 +237,8 @@ void CompressedReadBufferBase::decompress(BufferBase::Buffer & to, size_t size_d to = BufferBase::Buffer(compressed_buffer + header_size, compressed_buffer + size_compressed_without_checksum); } - - codec->decompress(compressed_buffer, size_compressed_without_checksum, to.begin()); + else + codec->decompress(compressed_buffer, size_compressed_without_checksum, to.begin()); } From 67f6efb94dee914db80a97890be9492f9dd8b4df Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Fri, 26 Mar 2021 02:35:38 +0300 Subject: [PATCH 213/260] Update column.md --- docs/en/sql-reference/statements/alter/column.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/alter/column.md b/docs/en/sql-reference/statements/alter/column.md index e39ca8dcaf2..3ece30be5b8 100644 --- a/docs/en/sql-reference/statements/alter/column.md +++ b/docs/en/sql-reference/statements/alter/column.md @@ -144,7 +144,7 @@ This query changes the `name` column properties: - TTL -For examples of columns TTL modifying, see [Column TTL](../../engines/table_engines/mergetree_family/mergetree.md#mergetree-column-ttl). +For examples of columns TTL modifying, see [Column TTL](../../../engines/table-engines/mergetree-family/mergetree.md#mergetree-column-ttl). If the `IF EXISTS` clause is specified, the query won’t return an error if the column doesn’t exist. From 3107920f315ee21c3bc5938d8b0521bd78cfcf88 Mon Sep 17 00:00:00 2001 From: Ilya Yatsishin <2159081+qoega@users.noreply.github.com> Date: Fri, 26 Mar 2021 09:43:33 +0300 Subject: [PATCH 214/260] Update requirements.txt --- docs/tools/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tools/requirements.txt b/docs/tools/requirements.txt index 470bc5e8719..9605525edbf 100644 --- a/docs/tools/requirements.txt +++ b/docs/tools/requirements.txt @@ -26,7 +26,7 @@ numpy==1.19.2 Pygments==2.5.2 pymdown-extensions==8.0 python-slugify==4.0.1 -PyYAML==5.3.1 +PyYAML==5.4.1 repackage==0.7.3 requests==2.24.0 singledispatch==3.4.0.3 From c36f147b1633c7fab5cc23f9124f50ed6592d502 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 26 Mar 2021 10:05:56 +0300 Subject: [PATCH 215/260] Fix sleep_in_send_tables_status_ms/sleep_in_send_data_ms in integration tests --- tests/integration/test_secure_socket/test.py | 8 ++++---- .../configs/users.xml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_secure_socket/test.py b/tests/integration/test_secure_socket/test.py index 337b6b05bd7..c2bad80bca0 100644 --- a/tests/integration/test_secure_socket/test.py +++ b/tests/integration/test_secure_socket/test.py @@ -12,7 +12,7 @@ NODES = {'node' + str(i): None for i in (1, 2)} config = ''' - {sleep_in_send_data} + {sleep_in_send_data_ms} ''' @@ -45,12 +45,12 @@ def started_cluster(): def test(started_cluster): - NODES['node2'].replace_config('/etc/clickhouse-server/users.d/users.xml', config.format(sleep_in_send_data=1000)) + NODES['node2'].replace_config('/etc/clickhouse-server/users.d/users.xml', config.format(sleep_in_send_data_ms=1000000)) attempts = 0 while attempts < 1000: - setting = NODES['node2'].http_query("SELECT value FROM system.settings WHERE name='sleep_in_send_data'") - if int(setting) == 1000: + setting = 
NODES['node2'].http_query("SELECT value FROM system.settings WHERE name='sleep_in_send_data_ms'") + if int(setting) == 1000000: break time.sleep(0.1) attempts += 1 diff --git a/tests/integration/test_system_clusters_actual_information/configs/users.xml b/tests/integration/test_system_clusters_actual_information/configs/users.xml index 156cd3a6b59..3dd68165fac 100644 --- a/tests/integration/test_system_clusters_actual_information/configs/users.xml +++ b/tests/integration/test_system_clusters_actual_information/configs/users.xml @@ -2,7 +2,7 @@ - 5 + 5000 From fa930d49c4c7f238ef0b7bd2d228654825d0fa9b Mon Sep 17 00:00:00 2001 From: tavplubix Date: Fri, 26 Mar 2021 10:29:58 +0300 Subject: [PATCH 216/260] Update gtest_peekable_read_buffer.cpp --- src/IO/tests/gtest_peekable_read_buffer.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/IO/tests/gtest_peekable_read_buffer.cpp b/src/IO/tests/gtest_peekable_read_buffer.cpp index ddb947d8b2f..2e5ca47c0aa 100644 --- a/src/IO/tests/gtest_peekable_read_buffer.cpp +++ b/src/IO/tests/gtest_peekable_read_buffer.cpp @@ -6,11 +6,6 @@ #include #include -namespace DB::ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - static void readAndAssert(DB::ReadBuffer & buf, const char * str) { size_t n = strlen(str); From ba5c15103720b12ab4adeffb55035ee7c438e3e0 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 13:20:07 +0300 Subject: [PATCH 217/260] Fix race condition on snapshots --- src/Coordination/NuKeeperStateMachine.cpp | 25 +++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/src/Coordination/NuKeeperStateMachine.cpp b/src/Coordination/NuKeeperStateMachine.cpp index 32bb4269f20..23485cb8b5b 100644 --- a/src/Coordination/NuKeeperStateMachine.cpp +++ b/src/Coordination/NuKeeperStateMachine.cpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB { @@ -227,7 +228,28 @@ void NuKeeperStateMachine::save_logical_snp_obj( nuraft::ptr snp_buf = s.serialize(); cloned_meta = nuraft::snapshot::deserialize(*snp_buf); - auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*cloned_buffer, s.get_last_log_idx()); + /// Sometimes NuRaft can call save and create snapshots from different threads + /// at onces. To avoid race conditions we serialize snapshots through snapshots_queue + /// TODO: make something better + CreateSnapshotTask snapshot_task; + std::shared_ptr> waiter = std::make_shared>(); + auto future = waiter->get_future(); + snapshot_task.snapshot = nullptr; + snapshot_task.create_snapshot = [this, waiter, cloned_buffer, log_idx = s.get_last_log_idx()] (NuKeeperStorageSnapshotPtr &&) + { + try + { + auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*cloned_buffer, log_idx); + LOG_DEBUG(log, "Saved snapshot {} to path {}", log_idx, result_path); + } + catch (...) 
+ { + tryLogCurrentException(log); + } + waiter->set_value(); + }; + snapshots_queue.push(std::move(snapshot_task)); + future.wait(); { std::lock_guard lock(snapshots_lock); @@ -235,7 +257,6 @@ void NuKeeperStateMachine::save_logical_snp_obj( latest_snapshot_meta = cloned_meta; } - LOG_DEBUG(log, "Created snapshot {} with path {}", s.get_last_log_idx(), result_path); obj_id++; } From 331c5b66365d96541ea1f5a913b0b4beae747416 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 13:55:39 +0300 Subject: [PATCH 218/260] Fix startup one more time --- src/Coordination/NuKeeperServer.cpp | 25 ++++++++++++++++--------- src/Coordination/NuKeeperServer.h | 2 +- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/Coordination/NuKeeperServer.cpp b/src/Coordination/NuKeeperServer.cpp index 62af9656fb9..7e6c10ca125 100644 --- a/src/Coordination/NuKeeperServer.cpp +++ b/src/Coordination/NuKeeperServer.cpp @@ -188,6 +188,9 @@ nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type t if (next_index < last_commited || next_index - last_commited <= 1) commited_store = true; + if (initialized_flag) + return nuraft::cb_func::ReturnCode::Ok; + auto set_initialized = [this] () { std::unique_lock lock(initialized_mutex); @@ -205,15 +208,19 @@ nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type t return nuraft::cb_func::ReturnCode::Ok; } case nuraft::cb_func::BecomeFollower: + case nuraft::cb_func::GotAppendEntryReqFromLeader: { - auto leader_index = raft_instance->get_leader_committed_log_idx(); - auto our_index = raft_instance->get_committed_log_idx(); - /// This may happen when we start RAFT claster from scratch. - /// Node first became leader, and after that some other node became leader. - /// BecameFresh for this node will not be called because it was already fresh - /// when it was leader. - if (isLeaderAlive() && leader_index < our_index + coordination_settings->fresh_log_gap) - set_initialized(); + if (isLeaderAlive()) + { + auto leader_index = raft_instance->get_leader_committed_log_idx(); + auto our_index = raft_instance->get_committed_log_idx(); + /// This may happen when we start RAFT cluster from scratch. + /// Node first became leader, and after that some other node became leader. + /// BecameFresh for this node will not be called because it was already fresh + /// when it was leader. 
+ if (leader_index < our_index + coordination_settings->fresh_log_gap) + set_initialized(); + } return nuraft::cb_func::ReturnCode::Ok; } case nuraft::cb_func::BecomeFresh: @@ -237,7 +244,7 @@ void NuKeeperServer::waitInit() { std::unique_lock lock(initialized_mutex); int64_t timeout = coordination_settings->startup_timeout.totalMilliseconds(); - if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag; })) + if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag.load(); })) throw Exception(ErrorCodes::RAFT_ERROR, "Failed to wait RAFT initialization"); } diff --git a/src/Coordination/NuKeeperServer.h b/src/Coordination/NuKeeperServer.h index ba25d5c181b..b5c13e62212 100644 --- a/src/Coordination/NuKeeperServer.h +++ b/src/Coordination/NuKeeperServer.h @@ -31,7 +31,7 @@ private: ResponsesQueue & responses_queue; std::mutex initialized_mutex; - bool initialized_flag = false; + std::atomic initialized_flag = false; std::condition_variable initialized_cv; std::atomic initial_batch_committed = false; From 2db57f0f1669ded0768a00becf7747249f99d930 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 14:18:31 +0300 Subject: [PATCH 219/260] Followup fix --- src/Coordination/NuKeeperStorageDispatcher.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Coordination/NuKeeperStorageDispatcher.cpp b/src/Coordination/NuKeeperStorageDispatcher.cpp index 3aed0d99568..5b35b9c4829 100644 --- a/src/Coordination/NuKeeperStorageDispatcher.cpp +++ b/src/Coordination/NuKeeperStorageDispatcher.cpp @@ -132,6 +132,10 @@ void NuKeeperStorageDispatcher::initialize(const Poco::Util::AbstractConfigurati coordination_settings->loadFromConfig("test_keeper_server.coordination_settings", config); + request_thread = ThreadFromGlobalPool([this] { requestThread(); }); + responses_thread = ThreadFromGlobalPool([this] { responseThread(); }); + snapshot_thread = ThreadFromGlobalPool([this] { snapshotThread(); }); + server = std::make_unique(myid, coordination_settings, config, responses_queue, snapshots_queue); try { @@ -148,10 +152,8 @@ void NuKeeperStorageDispatcher::initialize(const Poco::Util::AbstractConfigurati throw; } - request_thread = ThreadFromGlobalPool([this] { requestThread(); }); - responses_thread = ThreadFromGlobalPool([this] { responseThread(); }); + session_cleaner_thread = ThreadFromGlobalPool([this] { sessionCleanerTask(); }); - snapshot_thread = ThreadFromGlobalPool([this] { snapshotThread(); }); LOG_DEBUG(log, "Dispatcher initialized"); } From 9bdeb436c2671d92d3462374777bdf88b9e06d12 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 15:06:36 +0300 Subject: [PATCH 220/260] Fix typo --- src/Coordination/NuKeeperStateMachine.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Coordination/NuKeeperStateMachine.cpp b/src/Coordination/NuKeeperStateMachine.cpp index 23485cb8b5b..a7037b8d644 100644 --- a/src/Coordination/NuKeeperStateMachine.cpp +++ b/src/Coordination/NuKeeperStateMachine.cpp @@ -229,7 +229,7 @@ void NuKeeperStateMachine::save_logical_snp_obj( cloned_meta = nuraft::snapshot::deserialize(*snp_buf); /// Sometimes NuRaft can call save and create snapshots from different threads - /// at onces. To avoid race conditions we serialize snapshots through snapshots_queue + /// at once. 
To avoid race conditions we serialize snapshots through snapshots_queue /// TODO: make something better CreateSnapshotTask snapshot_task; std::shared_ptr> waiter = std::make_shared>(); From b0e401088ada141b6d206a4f0b279c87cacbccee Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 15:20:49 +0300 Subject: [PATCH 221/260] Make this test sequential --- tests/queries/skip_list.json | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index 981cf69d676..9c061da7428 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -582,6 +582,7 @@ "00980_zookeeper_merge_tree_alter_settings", "00988_constraints_replication_zookeeper", "00989_parallel_parts_loading", + "00992_system_parts_race_condition_zookeeper_long", "00993_system_parts_race_condition_drop_zookeeper", "01012_show_tables_limit", "01013_sync_replica_timeout_zookeeper", From 35d1443a9c075756b6fc2fbe624c97e59b2fe49b Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 15:54:08 +0300 Subject: [PATCH 222/260] Don't wait when table shutdown called --- src/Storages/StorageReplicatedMergeTree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 9b93d7183fd..0849f65477d 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4932,7 +4932,7 @@ bool StorageReplicatedMergeTree::waitForTableReplicaToProcessLogEntry( const auto & stop_waiting = [&]() { - bool stop_waiting_itself = waiting_itself && is_dropped; + bool stop_waiting_itself = waiting_itself && (partial_shutdown_called || is_dropped); bool stop_waiting_non_active = !wait_for_non_active && !getZooKeeper()->exists(table_zookeeper_path + "/replicas/" + replica + "/is_active"); return stop_waiting_itself || stop_waiting_non_active; }; From 2f07056ef6ff60444ab333d3357431f88fa4f0d5 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 16:39:09 +0300 Subject: [PATCH 223/260] More stable last get --- tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 7380a9d9cbb..f0b4998dad0 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -103,7 +103,7 @@ (gen/nemesis (gen/once {:type :info, :f :stop})) (gen/log "Waiting for recovery") (gen/sleep 10) - (gen/clients (:final-generator workload)))}))) + (gen/clients (gen/until-ok (:final-generator workload))))}))) (def all-nemesises (keys custom-nemesis/custom-nemesises)) From 8b08c0c3a667316926c2380612d7917cf78e370a Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 17:45:52 +0300 Subject: [PATCH 224/260] Fix test_odbc_interaction --- tests/integration/test_odbc_interaction/test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_odbc_interaction/test.py b/tests/integration/test_odbc_interaction/test.py index 6232168f2e6..2ef71927bdf 100644 --- a/tests/integration/test_odbc_interaction/test.py +++ b/tests/integration/test_odbc_interaction/test.py @@ -360,6 +360,7 @@ def test_bridge_dies_with_parent(started_cluster): assert clickhouse_pid is None assert bridge_pid is None + node1.start_clickhouse(20) def test_odbc_postgres_date_data_type(started_cluster): From 0fae73071c5a7fd6deb8068363050b7fa5e89124 Mon Sep 17 
00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 18:14:42 +0300 Subject: [PATCH 225/260] Fix flaky test --- tests/integration/test_dictionaries_update_and_reload/test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_dictionaries_update_and_reload/test.py b/tests/integration/test_dictionaries_update_and_reload/test.py index 5c8abcda38e..533a29dc245 100644 --- a/tests/integration/test_dictionaries_update_and_reload/test.py +++ b/tests/integration/test_dictionaries_update_and_reload/test.py @@ -141,7 +141,8 @@ def test_reload_after_loading(started_cluster): time.sleep(1) # see the comment above replace_in_file_in_container('/etc/clickhouse-server/config.d/executable.xml', '81', '82') replace_in_file_in_container('/etc/clickhouse-server/config.d/file.txt', '101', '102') - query("SYSTEM RELOAD DICTIONARIES") + query("SYSTEM RELOAD DICTIONARY 'file'") + query("SYSTEM RELOAD DICTIONARY 'executable'") assert query("SELECT dictGetInt32('executable', 'a', toUInt64(7))") == "82\n" assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "102\n" From 5c0c6a9aecc7f05b6f0d73f6a84e937feaf42021 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 26 Mar 2021 18:16:15 +0300 Subject: [PATCH 226/260] Delete unused users.xml --- .../configs/users.xml | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 tests/integration/test_system_clusters_actual_information/configs/users.xml diff --git a/tests/integration/test_system_clusters_actual_information/configs/users.xml b/tests/integration/test_system_clusters_actual_information/configs/users.xml deleted file mode 100644 index 3dd68165fac..00000000000 --- a/tests/integration/test_system_clusters_actual_information/configs/users.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - 5000 - - - From 482704c4343389fc2e07d7361a2fba22cfbbf1cd Mon Sep 17 00:00:00 2001 From: Sergey Demurin Date: Fri, 26 Mar 2021 18:29:07 +0300 Subject: [PATCH 227/260] Update other-functions.md fix typo --- docs/ru/sql-reference/functions/other-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/other-functions.md b/docs/ru/sql-reference/functions/other-functions.md index 54537b7735d..f9b3e5c3e68 100644 --- a/docs/ru/sql-reference/functions/other-functions.md +++ b/docs/ru/sql-reference/functions/other-functions.md @@ -672,7 +672,7 @@ neighbor(column, offset[, default_value]) Функция может получить доступ к значению в столбце соседней строки только внутри обрабатываемого в данный момент блока данных. Порядок строк, используемый при вычислении функции `neighbor`, может отличаться от порядка строк, возвращаемых пользователю. -Чтобы этого не случилось, вы можете сделать подзапрос с [ORDER BY](../../sql-reference/statements/select/order-by.md) и вызвать функцию изне подзапроса. +Чтобы этого не случилось, вы можете сделать подзапрос с [ORDER BY](../../sql-reference/statements/select/order-by.md) и вызвать функцию извне подзапроса. 
**Аргументы** From 1b0a9461f0e6500d98be27c36c082ba4fe471d66 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 18:30:35 +0300 Subject: [PATCH 228/260] Fix more tests --- tests/integration/test_dictionaries_postgresql/test.py | 2 +- .../test_distributed_inter_server_secret/test.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_dictionaries_postgresql/test.py b/tests/integration/test_dictionaries_postgresql/test.py index 5ceb6496b90..10d9f4213e1 100644 --- a/tests/integration/test_dictionaries_postgresql/test.py +++ b/tests/integration/test_dictionaries_postgresql/test.py @@ -80,7 +80,7 @@ def test_load_dictionaries(started_cluster): create_dict(table_name) dict_name = 'dict0' - node1.query("SYSTEM RELOAD DICTIONARIES") + node1.query("SYSTEM RELOAD DICTIONARY {}".format(dict_name)) assert node1.query("SELECT count() FROM `test`.`dict_table_{}`".format(table_name)).rstrip() == '10000' assert node1.query("SELECT dictGetUInt32('{}', 'id', toUInt64(0))".format(dict_name)) == '0\n' assert node1.query("SELECT dictGetUInt32('{}', 'value', toUInt64(9999))".format(dict_name)) == '9999\n' diff --git a/tests/integration/test_distributed_inter_server_secret/test.py b/tests/integration/test_distributed_inter_server_secret/test.py index b1daf2271d0..1a0e5a3dd91 100644 --- a/tests/integration/test_distributed_inter_server_secret/test.py +++ b/tests/integration/test_distributed_inter_server_secret/test.py @@ -97,12 +97,14 @@ def test_insecure(): n1.query('SELECT * FROM dist_insecure') def test_insecure_insert_async(): + n1.query("TRUNCATE TABLE data") n1.query('INSERT INTO dist_insecure SELECT * FROM numbers(2)') n1.query('SYSTEM FLUSH DISTRIBUTED ON CLUSTER insecure dist_insecure') assert int(n1.query('SELECT count() FROM dist_insecure')) == 2 n1.query('TRUNCATE TABLE data ON CLUSTER insecure') def test_insecure_insert_sync(): + n1.query("TRUNCATE TABLE data") n1.query('INSERT INTO dist_insecure SELECT * FROM numbers(2)', settings={'insert_distributed_sync': 1}) assert int(n1.query('SELECT count() FROM dist_insecure')) == 2 n1.query('TRUNCATE TABLE data ON CLUSTER secure') @@ -111,12 +113,14 @@ def test_secure(): n1.query('SELECT * FROM dist_secure') def test_secure_insert_async(): + n1.query("TRUNCATE TABLE data") n1.query('INSERT INTO dist_secure SELECT * FROM numbers(2)') n1.query('SYSTEM FLUSH DISTRIBUTED ON CLUSTER secure dist_secure') assert int(n1.query('SELECT count() FROM dist_secure')) == 2 n1.query('TRUNCATE TABLE data ON CLUSTER secure') def test_secure_insert_sync(): + n1.query("TRUNCATE TABLE data") n1.query('INSERT INTO dist_secure SELECT * FROM numbers(2)', settings={'insert_distributed_sync': 1}) assert int(n1.query('SELECT count() FROM dist_secure')) == 2 n1.query('TRUNCATE TABLE data ON CLUSTER secure') @@ -126,6 +130,7 @@ def test_secure_insert_sync(): # Buffer() flush happens with global context, that does not have user # And so Context::user/ClientInfo::current_user/ClientInfo::initial_user will be empty def test_secure_insert_buffer_async(): + n1.query("TRUNCATE TABLE data") n1.query('INSERT INTO dist_secure_buffer SELECT * FROM numbers(2)') n1.query('SYSTEM FLUSH DISTRIBUTED ON CLUSTER secure dist_secure') # no Buffer flush happened @@ -141,6 +146,7 @@ def test_secure_disagree(): n1.query('SELECT * FROM dist_secure_disagree') def test_secure_disagree_insert(): + n1.query("TRUNCATE TABLE data") n1.query('INSERT INTO dist_secure_disagree SELECT * FROM numbers(2)') with pytest.raises(QueryRuntimeException, match='.*Hash 
mismatch.*'): n1.query('SYSTEM FLUSH DISTRIBUTED ON CLUSTER secure_disagree dist_secure_disagree') From 9bb0dc48b28bcb2fd2c12eafbad931331d599103 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 18:37:26 +0300 Subject: [PATCH 229/260] Fix one more test --- .../test_max_http_connections_for_replication/test.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_max_http_connections_for_replication/test.py b/tests/integration/test_max_http_connections_for_replication/test.py index 2dc4e2a8810..634697c8668 100644 --- a/tests/integration/test_max_http_connections_for_replication/test.py +++ b/tests/integration/test_max_http_connections_for_replication/test.py @@ -43,6 +43,8 @@ def start_small_cluster(): def test_single_endpoint_connections_count(start_small_cluster): + node1.query("TRUNCATE TABLE test_table") + node2.query("SYSTEM SYNC REPLICA test_table") def task(count): print(("Inserting ten times from {}".format(count))) for i in range(count, count + 10): @@ -58,9 +60,11 @@ def test_single_endpoint_connections_count(start_small_cluster): def test_keepalive_timeout(start_small_cluster): - current_count = int(node1.query("select count() from test_table").strip()) + node1.query("TRUNCATE TABLE test_table") + node2.query("SYSTEM SYNC REPLICA test_table") + node1.query("insert into test_table values ('2017-06-16', 777, 0)") - assert_eq_with_retry(node2, "select count() from test_table", str(current_count + 1)) + assert_eq_with_retry(node2, "select count() from test_table", str(1)) # Server keepAliveTimeout is 3 seconds, default client session timeout is 8 # lets sleep in that interval time.sleep(4) @@ -69,7 +73,7 @@ def test_keepalive_timeout(start_small_cluster): time.sleep(3) - assert_eq_with_retry(node2, "select count() from test_table", str(current_count + 2)) + assert_eq_with_retry(node2, "select count() from test_table", str(2)) assert not node2.contains_in_log("No message received"), "Found 'No message received' in clickhouse-server.log" From aa2244bad5b66271dd766c41698189c9504b4d47 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 18:59:14 +0300 Subject: [PATCH 230/260] Fix more --- tests/integration/test_ttl_replicated/test.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/integration/test_ttl_replicated/test.py b/tests/integration/test_ttl_replicated/test.py index 389e249790f..67614b88029 100644 --- a/tests/integration/test_ttl_replicated/test.py +++ b/tests/integration/test_ttl_replicated/test.py @@ -396,6 +396,10 @@ def test_ttl_compatibility(started_cluster, node_left, node_right, num_run): node_right.query("OPTIMIZE TABLE test_ttl_group_by FINAL") node_right.query("OPTIMIZE TABLE test_ttl_where FINAL") + node_left.query("SYSTEM SYNC REPLICA test_ttl_delete", timeout=20) + node_left.query("SYSTEM SYNC REPLICA test_ttl_group_by", timeout=20) + node_left.query("SYSTEM SYNC REPLICA test_ttl_where", timeout=20) + assert node_left.query("SELECT id FROM test_ttl_delete ORDER BY id") == "2\n4\n" assert node_right.query("SELECT id FROM test_ttl_delete ORDER BY id") == "2\n4\n" From f32704101b5e415b341e28e653ab7239e7c440b1 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 19:56:08 +0300 Subject: [PATCH 231/260] Add retries to final operations --- tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj | 5 ++--- tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj | 2 +- tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj | 8 +++----- tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj | 10 
+++++----- tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj | 11 +++++++++++ 5 files changed, 22 insertions(+), 14 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj index 48b270517a4..7e2cd00736f 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj @@ -24,11 +24,10 @@ (invoke! [this test op] (case (:f op) - :read (try + :read (exec-with-retries 30 (fn [] (assoc op :type :ok - :value (count (zk-list conn "/"))) - (catch Exception _ (assoc op :type :fail, :error :connect-error))) + :value (count (zk-list conn "/"))))) :add (try (do (zk-multi-create-many-seq-nodes conn "/seq-" (:value op)) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index f0b4998dad0..7380a9d9cbb 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -103,7 +103,7 @@ (gen/nemesis (gen/once {:type :info, :f :stop})) (gen/log "Waiting for recovery") (gen/sleep 10) - (gen/clients (gen/until-ok (:final-generator workload))))}))) + (gen/clients (:final-generator workload)))}))) (def all-nemesises (keys custom-nemesis/custom-nemesises)) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj index 951c0822ad2..494e0357bc1 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj @@ -40,11 +40,9 @@ (catch Exception _ (assoc op :type :info, :error :connect-error))) :drain ; drain via delete is to long, just list all nodes - (try - (do - (zk-sync conn) - (assoc op :type :ok :value (into #{} (map #(str %1) (zk-list conn "/"))))) - (catch Exception _ (assoc op :type :info, :error :connect-error))))) + (exec-with-retries 30 (fn [] + (zk-sync conn) + (assoc op :type :ok :value (into #{} (map #(str %1) (zk-list conn "/")))))))) (teardown! [_ test]) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj index 23461591eaf..01cc10e9a0f 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -22,11 +22,11 @@ (invoke! [this test op] (case (:f op) - :read (do - (zk-sync conn) - (assoc op - :type :ok - :value (read-string (:data (zk-get-str conn k))))) + :read (exec-with-retries 30 (fn [] + (zk-sync conn) + (assoc op + :type :ok + :value (read-string (:data (zk-get-str conn k)))))) :add (try (do (zk-add-to-set conn k (:value op)) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index 0e0db2d3a6d..032a8829514 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -167,3 +167,14 @@ :--test_keeper_server.snapshot_storage_path coordination-snapshots-dir :--test_keeper_server.logs_storage_path coordination-logs-dir) (wait-clickhouse-alive! node test))) + +(defn exec-with-retries + [retries f & args] + (let [res (try {:value (apply f args)} + (catch Exception e + (if (zero? 
retries) + (throw e) + {:exception e})))] + (if (:exception res) + (do (Thread/sleep 1000) (recur (dec retries) f args)) + (:value res)))) From e101fbab53ec7c984bd0e6cc8ec5e6d9fd1a897a Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 19:57:23 +0300 Subject: [PATCH 232/260] Fix style --- .../src/jepsen/nukeeper/counter.clj | 6 +-- .../src/jepsen/nukeeper/db.clj | 51 +++++++++---------- .../src/jepsen/nukeeper/main.clj | 8 +-- .../src/jepsen/nukeeper/nemesis.clj | 8 +-- .../src/jepsen/nukeeper/queue.clj | 4 +- .../src/jepsen/nukeeper/set.clj | 8 +-- .../src/jepsen/nukeeper/utils.clj | 12 ++--- 7 files changed, 48 insertions(+), 49 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj index 7e2cd00736f..b426a8ea90d 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj @@ -25,9 +25,9 @@ (invoke! [this test op] (case (:f op) :read (exec-with-retries 30 (fn [] - (assoc op - :type :ok - :value (count (zk-list conn "/"))))) + (assoc op + :type :ok + :value (count (zk-list conn "/"))))) :add (try (do (zk-multi-create-many-seq-nodes conn "/seq-" (:value op)) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj index 7bc2b9c6cea..d82d628cc95 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj @@ -32,20 +32,20 @@ (defn unpack-deb [path] (do - (c/exec :dpkg :-x path common-prefix) - (c/exec :rm :-f path) - (c/exec :mv (str common-prefix "/usr/bin/clickhouse") common-prefix) - (c/exec :rm :-rf (str common-prefix "/usr") (str common-prefix "/etc")))) + (c/exec :dpkg :-x path common-prefix) + (c/exec :rm :-f path) + (c/exec :mv (str common-prefix "/usr/bin/clickhouse") common-prefix) + (c/exec :rm :-rf (str common-prefix "/usr") (str common-prefix "/etc")))) (defn unpack-tgz [path] (do - (c/exec :mkdir :-p (str common-prefix "/unpacked")) - (c/exec :tar :-zxvf path :-C (str common-prefix "/unpacked")) - (c/exec :rm :-f path) - (let [subdir (c/exec :ls (str common-prefix "/unpacked"))] - (c/exec :mv (str common-prefix "/unpacked/" subdir "/usr/bin/clickhouse") common-prefix) - (c/exec :rm :-fr (str common-prefix "/unpacked"))))) + (c/exec :mkdir :-p (str common-prefix "/unpacked")) + (c/exec :tar :-zxvf path :-C (str common-prefix "/unpacked")) + (c/exec :rm :-f path) + (let [subdir (c/exec :ls (str common-prefix "/unpacked"))] + (c/exec :mv (str common-prefix "/unpacked/" subdir "/usr/bin/clickhouse") common-prefix) + (c/exec :rm :-fr (str common-prefix "/unpacked"))))) (defn chmod-binary [path] @@ -85,10 +85,10 @@ (defn install-configs [test node] - (c/exec :echo (slurp (io/resource "config.xml")) :> (str configs-dir "/config.xml")) - (c/exec :echo (slurp (io/resource "users.xml")) :> (str configs-dir "/users.xml")) - (c/exec :echo (slurp (io/resource "listen.xml")) :> (str sub-configs-dir "/listen.xml")) - (c/exec :echo (cluster-config test node (slurp (io/resource "test_keeper_config.xml"))) :> (str sub-configs-dir "/test_keeper_config.xml"))) + (c/exec :echo (slurp (io/resource "config.xml")) :> (str configs-dir "/config.xml")) + (c/exec :echo (slurp (io/resource "users.xml")) :> (str configs-dir "/users.xml")) + (c/exec :echo (slurp (io/resource "listen.xml")) :> (str sub-configs-dir "/listen.xml")) + (c/exec :echo (cluster-config test node (slurp (io/resource "test_keeper_config.xml"))) :> (str sub-configs-dir 
"/test_keeper_config.xml"))) (defn db [version reuse-binary] @@ -96,25 +96,24 @@ (setup! [_ test node] (c/su (do - (info "Preparing directories") - (prepare-dirs) - (if (or (not (cu/exists? binary-path)) (not reuse-binary)) + (info "Preparing directories") + (prepare-dirs) + (if (or (not (cu/exists? binary-path)) (not reuse-binary)) (do (info "Downloading clickhouse") - (install-downloaded-clickhouse (download-clickhouse version))) + (install-downloaded-clickhouse (download-clickhouse version))) (info "Binary already exsist on path" binary-path "skipping download")) - (info "Installing configs") - (install-configs test node) - (info "Starting server") - (start-clickhouse! node test) - (info "ClickHouse started")))) - + (info "Installing configs") + (install-configs test node) + (info "Starting server") + (start-clickhouse! node test) + (info "ClickHouse started")))) (teardown! [_ test node] (info node "Tearing down clickhouse") (kill-clickhouse! node test) (c/su (if (not reuse-binary) - (c/exec :rm :-rf binary-path)) + (c/exec :rm :-rf binary-path)) (c/exec :rm :-rf pid-file-path) (c/exec :rm :-rf data-dir) (c/exec :rm :-rf logs-dir) @@ -125,5 +124,5 @@ (c/su (kill-clickhouse! node test) (c/cd data-dir - (c/exec :tar :czf "coordination.tar.gz" "coordination"))) + (c/exec :tar :czf "coordination.tar.gz" "coordination"))) [stderr-file (str logs-dir "/clickhouse-server.log") (str data-dir "/coordination.tar.gz")]))) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 7380a9d9cbb..b9439097e85 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -134,10 +134,10 @@ [cli worload-nemeseis-collection] (take (:test-count cli) (shuffle (for [[workload nemesis] worload-nemeseis-collection] - (assoc cli - :nemesis nemesis - :workload workload - :test-count 1))))) + (assoc cli + :nemesis nemesis + :workload workload + :test-count 1))))) (defn all-tests "Turns CLI options into a sequence of tests." [test-fn cli] diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index 8314d29f575..7d4941cdc8e 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -85,13 +85,13 @@ (defn logs-and-snapshots-corruption-nemesis [] (corruptor-nemesis coordination-data-dir (fn [path] - (do - (corrupt-file (select-last-file (str path "/snapshots"))) - (corrupt-file (select-last-file (str path "/logs"))))))) + (do + (corrupt-file (select-last-file (str path "/snapshots"))) + (corrupt-file (select-last-file (str path "/logs"))))))) (defn drop-all-corruption-nemesis [] (corruptor-nemesis coordination-data-dir (fn [path] - (c/exec :rm :-fr path)))) + (c/exec :rm :-fr path)))) (defn partition-bridge-nemesis [] diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj index 494e0357bc1..308778983aa 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj @@ -41,8 +41,8 @@ :drain ; drain via delete is to long, just list all nodes (exec-with-retries 30 (fn [] - (zk-sync conn) - (assoc op :type :ok :value (into #{} (map #(str %1) (zk-list conn "/")))))))) + (zk-sync conn) + (assoc op :type :ok :value (into #{} (map #(str %1) (zk-list conn "/")))))))) (teardown! 
[_ test]) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj index 01cc10e9a0f..f9d21a8dc62 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -23,10 +23,10 @@ (invoke! [this test op] (case (:f op) :read (exec-with-retries 30 (fn [] - (zk-sync conn) - (assoc op - :type :ok - :value (read-string (:data (zk-get-str conn k)))))) + (zk-sync conn) + (assoc op + :type :ok + :value (read-string (:data (zk-get-str conn k)))))) :add (try (do (zk-add-to-set conn k (:value op)) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index 032a8829514..cfe9add238b 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -113,11 +113,11 @@ first-child (first (sort children))] (if (not (nil? first-child)) (try - (do (.check txn path (:version stat)) - (.setData txn path (data/to-bytes "") -1) ; I'm just checking multitransactions - (.delete txn (str path first-child) -1) - (.commit txn) - first-child) + (do (.check txn path (:version stat)) + (.setData txn path (data/to-bytes "") -1) ; I'm just checking multitransactions + (.delete txn (str path first-child) -1) + (.commit txn) + first-child) (catch KeeperException$BadVersionException _ nil) ; Even if we got connection loss, delete may actually be executed. ; This function is used for queue model, which strictly require @@ -166,7 +166,7 @@ :--logger.errorlog (str logs-dir "/clickhouse-server.err.log") :--test_keeper_server.snapshot_storage_path coordination-snapshots-dir :--test_keeper_server.logs_storage_path coordination-logs-dir) - (wait-clickhouse-alive! node test))) + (wait-clickhouse-alive! node test))) (defn exec-with-retries [retries f & args] From 48ba36b682b26b6bd5524df6be19c1938478179d Mon Sep 17 00:00:00 2001 From: Robert Hodges Date: Fri, 26 Mar 2021 10:34:48 -0700 Subject: [PATCH 233/260] Update postgresql.md Corrected typo in PostgreSQL Table Engine page title. --- docs/en/engines/table-engines/integrations/postgresql.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/engines/table-engines/integrations/postgresql.md b/docs/en/engines/table-engines/integrations/postgresql.md index 1a2ccf3e0dc..8326038407f 100644 --- a/docs/en/engines/table-engines/integrations/postgresql.md +++ b/docs/en/engines/table-engines/integrations/postgresql.md @@ -3,7 +3,7 @@ toc_priority: 8 toc_title: PostgreSQL --- -# PosgtreSQL {#postgresql} +# PostgreSQL {#postgresql} The PostgreSQL engine allows you to perform `SELECT` queries on data that is stored on a remote PostgreSQL server. 
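For reference, the PostgreSQL table engine described by this doc fix is used roughly as in the sketch below; the host, database, table, credentials and columns are illustrative placeholders, not values taken from these patches:

    -- Minimal sketch of a table backed by the PostgreSQL engine (placeholder connection details).
    CREATE TABLE default.pg_example
    (
        id UInt32,
        value String
    )
    ENGINE = PostgreSQL('postgres-host:5432', 'pg_db', 'pg_table', 'pg_user', 'pg_password');

    -- SELECT queries are then read through from the remote PostgreSQL table.
    SELECT count() FROM default.pg_example;
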
From e32beab913ba29ac92b1d91b31aa9171c980a6e6 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 26 Mar 2021 22:08:53 +0300 Subject: [PATCH 234/260] Add a test for mmap IO --- .../1_stateful/00162_mmap_compression_none.reference | 1 + tests/queries/1_stateful/00162_mmap_compression_none.sql | 8 ++++++++ 2 files changed, 9 insertions(+) create mode 100644 tests/queries/1_stateful/00162_mmap_compression_none.reference create mode 100644 tests/queries/1_stateful/00162_mmap_compression_none.sql diff --git a/tests/queries/1_stateful/00162_mmap_compression_none.reference b/tests/queries/1_stateful/00162_mmap_compression_none.reference new file mode 100644 index 00000000000..3495cc537c1 --- /dev/null +++ b/tests/queries/1_stateful/00162_mmap_compression_none.reference @@ -0,0 +1 @@ +687074654 diff --git a/tests/queries/1_stateful/00162_mmap_compression_none.sql b/tests/queries/1_stateful/00162_mmap_compression_none.sql new file mode 100644 index 00000000000..2178644214a --- /dev/null +++ b/tests/queries/1_stateful/00162_mmap_compression_none.sql @@ -0,0 +1,8 @@ +DROP TABLE IF EXISTS hits_none; +CREATE TABLE hits_none (Title String CODEC(NONE)) ENGINE = MergeTree ORDER BY tuple(); +INSERT INTO hits_none SELECT Title FROM test.hits; + +SET min_bytes_to_use_mmap_io = 1; +SELECT sum(length(Title)) FROM hits_none; + +DROP TABLE hits_none; From 1e08304fb1ea8b24a9debaf56d3afd40558b993a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 26 Mar 2021 22:12:56 +0300 Subject: [PATCH 235/260] Add performance test --- tests/performance/codec_none.xml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 tests/performance/codec_none.xml diff --git a/tests/performance/codec_none.xml b/tests/performance/codec_none.xml new file mode 100644 index 00000000000..e6eb9773a66 --- /dev/null +++ b/tests/performance/codec_none.xml @@ -0,0 +1,13 @@ + + + hits_10m_single + + + CREATE TABLE hits_none (Title String CODEC(NONE)) ENGINE = MergeTree ORDER BY tuple() + INSERT INTO hits_none SELECT Title FROM test.hits + OPTIMIZE TABLE hits_none FINAL + + + + DROP TABLE hits_none + From d01af5e9f8d17b0e4541cb2dda0df847e28dcaa8 Mon Sep 17 00:00:00 2001 From: Ilya Yatsishin <2159081+qoega@users.noreply.github.com> Date: Fri, 26 Mar 2021 22:44:40 +0300 Subject: [PATCH 236/260] touch to rebuild --- docker/test/sqlancer/Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/docker/test/sqlancer/Dockerfile b/docker/test/sqlancer/Dockerfile index 6bcdc3df5cd..253ca1b729a 100644 --- a/docker/test/sqlancer/Dockerfile +++ b/docker/test/sqlancer/Dockerfile @@ -2,7 +2,6 @@ FROM ubuntu:20.04 RUN apt-get update --yes && env DEBIAN_FRONTEND=noninteractive apt-get install wget unzip git openjdk-14-jdk maven python3 --yes --no-install-recommends - RUN wget https://github.com/sqlancer/sqlancer/archive/master.zip -O /sqlancer.zip RUN mkdir /sqlancer && \ cd /sqlancer && \ From ff891c50022c11c9b4b0c1bc596ab308b2bd3623 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 26 Mar 2021 23:46:04 +0300 Subject: [PATCH 237/260] Preparation --- src/IO/MMapReadBufferFromFile.cpp | 4 +- src/IO/MMapReadBufferFromFileDescriptor.cpp | 68 +++++---------------- src/IO/MMapReadBufferFromFileDescriptor.h | 10 ++- 3 files changed, 22 insertions(+), 60 deletions(-) diff --git a/src/IO/MMapReadBufferFromFile.cpp b/src/IO/MMapReadBufferFromFile.cpp index d881e0d2b30..a507c9670a8 100644 --- a/src/IO/MMapReadBufferFromFile.cpp +++ b/src/IO/MMapReadBufferFromFile.cpp @@ -44,7 +44,7 @@ 
MMapReadBufferFromFile::MMapReadBufferFromFile(const std::string & file_name_, s : file_name(file_name_) { open(); - init(fd, offset, length_); + mapped.set(fd, offset, length_); } @@ -52,7 +52,7 @@ MMapReadBufferFromFile::MMapReadBufferFromFile(const std::string & file_name_, s : file_name(file_name_) { open(); - init(fd, offset); + mapped.set(fd, offset); } diff --git a/src/IO/MMapReadBufferFromFileDescriptor.cpp b/src/IO/MMapReadBufferFromFileDescriptor.cpp index 2034a116e0c..56e9e384cd4 100644 --- a/src/IO/MMapReadBufferFromFileDescriptor.cpp +++ b/src/IO/MMapReadBufferFromFileDescriptor.cpp @@ -16,91 +16,55 @@ namespace DB namespace ErrorCodes { - extern const int CANNOT_ALLOCATE_MEMORY; - extern const int CANNOT_MUNMAP; - extern const int CANNOT_STAT; - extern const int BAD_ARGUMENTS; - extern const int LOGICAL_ERROR; extern const int ARGUMENT_OUT_OF_BOUND; extern const int CANNOT_SEEK_THROUGH_FILE; } -void MMapReadBufferFromFileDescriptor::init(int fd_, size_t offset, size_t length_) +void MMapReadBufferFromFileDescriptor::init() { - fd = fd_; - length = length_; + size_t length = mapped.getLength(); + BufferBase::set(mapped.getData(), length, 0); - if (length) - { - void * buf = mmap(nullptr, length, PROT_READ, MAP_PRIVATE, fd, offset); - if (MAP_FAILED == buf) - throwFromErrno(fmt::format("MMapReadBufferFromFileDescriptor: Cannot mmap {}.", ReadableSize(length)), - ErrorCodes::CANNOT_ALLOCATE_MEMORY); - - BufferBase::set(static_cast(buf), length, 0); - - size_t page_size = static_cast(::getPageSize()); - ReadBuffer::padded = (length % page_size) > 0 && (length % page_size) <= (page_size - 15); - } -} - -void MMapReadBufferFromFileDescriptor::init(int fd_, size_t offset) -{ - fd = fd_; - - struct stat stat_res {}; - if (0 != fstat(fd, &stat_res)) - throwFromErrno("MMapReadBufferFromFileDescriptor: Cannot fstat.", ErrorCodes::CANNOT_STAT); - - off_t file_size = stat_res.st_size; - - if (file_size < 0) - throw Exception("MMapReadBufferFromFileDescriptor: fstat returned negative file size", ErrorCodes::LOGICAL_ERROR); - - if (offset > static_cast(file_size)) - throw Exception("MMapReadBufferFromFileDescriptor: requested offset is greater than file size", ErrorCodes::BAD_ARGUMENTS); - - init(fd, offset, file_size - offset); + size_t page_size = static_cast(::getPageSize()); + ReadBuffer::padded = (length % page_size) > 0 && (length % page_size) <= (page_size - 15); } -MMapReadBufferFromFileDescriptor::MMapReadBufferFromFileDescriptor(int fd_, size_t offset_, size_t length_) +MMapReadBufferFromFileDescriptor::MMapReadBufferFromFileDescriptor(int fd, size_t offset, size_t length) + : mapped(fd, offset, length) { - init(fd_, offset_, length_); + init(); } -MMapReadBufferFromFileDescriptor::MMapReadBufferFromFileDescriptor(int fd_, size_t offset_) +MMapReadBufferFromFileDescriptor::MMapReadBufferFromFileDescriptor(int fd, size_t offset) + : mapped(fd, offset) { - init(fd_, offset_); + init(); } MMapReadBufferFromFileDescriptor::~MMapReadBufferFromFileDescriptor() { - if (length) - finish(); /// Exceptions will lead to std::terminate and that's Ok. 
+ finish(); } void MMapReadBufferFromFileDescriptor::finish() { - if (0 != munmap(internalBuffer().begin(), length)) - throwFromErrno(fmt::format("MMapReadBufferFromFileDescriptor: Cannot munmap {}.", ReadableSize(length)), - ErrorCodes::CANNOT_MUNMAP); - - length = 0; + mapped.finish(); } + std::string MMapReadBufferFromFileDescriptor::getFileName() const { - return "(fd = " + toString(fd) + ")"; + return "(fd = " + toString(mapped.getFD()) + ")"; } int MMapReadBufferFromFileDescriptor::getFD() const { - return fd; + return mapped.getFD(); } off_t MMapReadBufferFromFileDescriptor::getPosition() diff --git a/src/IO/MMapReadBufferFromFileDescriptor.h b/src/IO/MMapReadBufferFromFileDescriptor.h index e409e9d2d0c..21aea3e8409 100644 --- a/src/IO/MMapReadBufferFromFileDescriptor.h +++ b/src/IO/MMapReadBufferFromFileDescriptor.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace DB @@ -18,8 +19,9 @@ public: protected: MMapReadBufferFromFileDescriptor() {} - void init(int fd_, size_t offset, size_t length_); - void init(int fd_, size_t offset); + void init(); + + MappedFile mapped; public: MMapReadBufferFromFileDescriptor(int fd_, size_t offset_, size_t length_); @@ -35,10 +37,6 @@ public: off_t getPosition() override; std::string getFileName() const override; int getFD() const; - -private: - size_t length = 0; - int fd = -1; }; } From 7aac5e1849591fba3fbe7abb7e53f063540e7472 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 26 Mar 2021 23:51:46 +0300 Subject: [PATCH 238/260] Preparation --- src/IO/MappedFile.cpp | 100 ++++++++++++++++++++++++++++++++++++++++++ src/IO/MappedFile.h | 49 +++++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 src/IO/MappedFile.cpp create mode 100644 src/IO/MappedFile.h diff --git a/src/IO/MappedFile.cpp b/src/IO/MappedFile.cpp new file mode 100644 index 00000000000..424bb077956 --- /dev/null +++ b/src/IO/MappedFile.cpp @@ -0,0 +1,100 @@ +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_ALLOCATE_MEMORY; + extern const int CANNOT_MUNMAP; + extern const int CANNOT_STAT; + extern const int BAD_ARGUMENTS; + extern const int LOGICAL_ERROR; +} + + +static size_t getFileSize(int fd) +{ + struct stat stat_res {}; + if (0 != fstat(fd, &stat_res)) + throwFromErrno("MMapReadBufferFromFileDescriptor: Cannot fstat.", ErrorCodes::CANNOT_STAT); + + off_t file_size = stat_res.st_size; + + if (file_size < 0) + throw Exception("MMapReadBufferFromFileDescriptor: fstat returned negative file size", ErrorCodes::LOGICAL_ERROR); + + return file_size; +} + + +MappedFile::MappedFile(int fd_, size_t offset_, size_t length_) +{ + set(fd_, offset_, length_); +} + +MappedFile::MappedFile(int fd_, size_t offset_) + : fd(fd_), offset(offset_) +{ + set(fd_, offset_); +} + +void MappedFile::set(int fd_, size_t offset_, size_t length_) +{ + finish(); + + fd = fd_; + offset = offset_; + length = length_; + + if (length) + { + void * buf = mmap(nullptr, length, PROT_READ, MAP_PRIVATE, fd, offset); + if (MAP_FAILED == buf) + throwFromErrno(fmt::format("MMapReadBufferFromFileDescriptor: Cannot mmap {}.", ReadableSize(length)), + ErrorCodes::CANNOT_ALLOCATE_MEMORY); + + data = static_cast(buf); + } +} + +void MappedFile::set(int fd_, size_t offset_) +{ + size_t file_size = getFileSize(fd_); + + if (offset > static_cast(file_size)) + throw Exception("MMapReadBufferFromFileDescriptor: requested offset is greater than file size", 
ErrorCodes::BAD_ARGUMENTS); + + set(fd_, offset_, file_size - offset); +} + +void MappedFile::finish() +{ + if (!length) + return; + + if (0 != munmap(data, length)) + throwFromErrno(fmt::format("MMapReadBufferFromFileDescriptor: Cannot munmap {}.", ReadableSize(length)), + ErrorCodes::CANNOT_MUNMAP); + + length = 0; +} + +MappedFile::~MappedFile() +{ + finish(); /// Exceptions will lead to std::terminate and that's Ok. +} + +} + diff --git a/src/IO/MappedFile.h b/src/IO/MappedFile.h new file mode 100644 index 00000000000..cd5217d7f61 --- /dev/null +++ b/src/IO/MappedFile.h @@ -0,0 +1,49 @@ +#pragma once + +#include + + +namespace DB +{ + +/// MMaps a region in file descriptor (or a whole file) into memory. Unmaps in destructor. +/// Does not open or close file. +class MappedFile +{ +public: + MappedFile(int fd_, size_t offset_, size_t length_); + MappedFile(int fd_, size_t offset_); + + /// Makes empty object that can be initialized with `set`. + MappedFile() {} + + ~MappedFile(); + + char * getData() { return data; } + const char * getData() const { return data; } + + int getFD() const { return fd; } + size_t getOffset() const { return offset; } + size_t getLength() const { return length; } + + /// Unmap memory before call to destructor + void finish(); + + /// Initialize or reset to another fd. + void set(int fd_, size_t offset_, size_t length_); + void set(int fd_, size_t offset_); + +private: + MappedFile(const MappedFile &) = delete; + MappedFile(MappedFile &&) = delete; + + void init(); + + int fd = -1; + size_t offset = 0; + size_t length = 0; + char * data = nullptr; +}; + + +} From 1d9e23f9253855d0c5ce6ce38759236f91141e7a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 27 Mar 2021 01:28:57 +0300 Subject: [PATCH 239/260] Fix error --- src/IO/MMapReadBufferFromFile.cpp | 2 ++ src/IO/MappedFile.h | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/IO/MMapReadBufferFromFile.cpp b/src/IO/MMapReadBufferFromFile.cpp index a507c9670a8..b3354c42fbb 100644 --- a/src/IO/MMapReadBufferFromFile.cpp +++ b/src/IO/MMapReadBufferFromFile.cpp @@ -45,6 +45,7 @@ MMapReadBufferFromFile::MMapReadBufferFromFile(const std::string & file_name_, s { open(); mapped.set(fd, offset, length_); + init(); } @@ -53,6 +54,7 @@ MMapReadBufferFromFile::MMapReadBufferFromFile(const std::string & file_name_, s { open(); mapped.set(fd, offset); + init(); } diff --git a/src/IO/MappedFile.h b/src/IO/MappedFile.h index cd5217d7f61..3e218280885 100644 --- a/src/IO/MappedFile.h +++ b/src/IO/MappedFile.h @@ -6,7 +6,7 @@ namespace DB { -/// MMaps a region in file descriptor (or a whole file) into memory. Unmaps in destructor. +/// MMaps a region in file (or a whole file) into memory. Unmaps in destructor. /// Does not open or close file. 
class MappedFile { @@ -45,5 +45,4 @@ private: char * data = nullptr; }; - } From bcdf3dce36653addb25d434504b66d082aa91e06 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 27 Mar 2021 02:22:51 +0300 Subject: [PATCH 240/260] Add cache for mmapped files --- src/Common/ProfileEvents.cpp | 2 + src/IO/MMapReadBufferFromFileDescriptor.h | 4 +- src/IO/MMapReadBufferFromFileWithCache.cpp | 84 ++++++++++++++++ src/IO/MMapReadBufferFromFileWithCache.h | 31 ++++++ src/IO/MappedFile.cpp | 106 ++++++++------------- src/IO/MappedFile.h | 44 ++++----- src/IO/MappedFileCache.h | 63 ++++++++++++ src/IO/MappedFileDescriptor.cpp | 101 ++++++++++++++++++++ src/IO/MappedFileDescriptor.h | 49 ++++++++++ src/IO/createReadBufferFromFileBase.cpp | 4 +- src/IO/ya.make | 3 + 11 files changed, 397 insertions(+), 94 deletions(-) create mode 100644 src/IO/MMapReadBufferFromFileWithCache.cpp create mode 100644 src/IO/MMapReadBufferFromFileWithCache.h create mode 100644 src/IO/MappedFileCache.h create mode 100644 src/IO/MappedFileDescriptor.cpp create mode 100644 src/IO/MappedFileDescriptor.h diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 2539f20ed4d..948c5f3c6b8 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -32,6 +32,8 @@ M(UncompressedCacheHits, "") \ M(UncompressedCacheMisses, "") \ M(UncompressedCacheWeightLost, "") \ + M(MappedFileCacheHits, "") \ + M(MappedFileCacheMisses, "") \ M(IOBufferAllocs, "") \ M(IOBufferAllocBytes, "") \ M(ArenaAllocChunks, "") \ diff --git a/src/IO/MMapReadBufferFromFileDescriptor.h b/src/IO/MMapReadBufferFromFileDescriptor.h index 21aea3e8409..eb7fc4ded64 100644 --- a/src/IO/MMapReadBufferFromFileDescriptor.h +++ b/src/IO/MMapReadBufferFromFileDescriptor.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include namespace DB @@ -21,7 +21,7 @@ protected: MMapReadBufferFromFileDescriptor() {} void init(); - MappedFile mapped; + MappedFileDescriptor mapped; public: MMapReadBufferFromFileDescriptor(int fd_, size_t offset_, size_t length_); diff --git a/src/IO/MMapReadBufferFromFileWithCache.cpp b/src/IO/MMapReadBufferFromFileWithCache.cpp new file mode 100644 index 00000000000..8d92e9d835a --- /dev/null +++ b/src/IO/MMapReadBufferFromFileWithCache.cpp @@ -0,0 +1,84 @@ +#include + + +namespace DB +{ + +namespace +{ + /// TODO: Move to a better place, make configurable. 
+ MappedFileCache cache(1000); +} + + +namespace ErrorCodes +{ + extern const int ARGUMENT_OUT_OF_BOUND; + extern const int CANNOT_SEEK_THROUGH_FILE; +} + + +void MMapReadBufferFromFileWithCache::init() +{ + size_t length = mapped->getLength(); + BufferBase::set(mapped->getData(), length, 0); + + size_t page_size = static_cast(::getPageSize()); + ReadBuffer::padded = (length % page_size) > 0 && (length % page_size) <= (page_size - 15); +} + + +MMapReadBufferFromFileWithCache::MMapReadBufferFromFileWithCache( + const std::string & file_name, size_t offset, size_t length) +{ + mapped = cache.getOrSet(cache.hash(file_name, offset, length), [&] + { + return std::make_shared(file_name, offset, length); + }).first; + + init(); +} + +MMapReadBufferFromFileWithCache::MMapReadBufferFromFileWithCache( + const std::string & file_name, size_t offset) +{ + mapped = cache.getOrSet(cache.hash(file_name, offset, -1), [&] + { + return std::make_shared(file_name, offset); + }).first; + + init(); +} + + +std::string MMapReadBufferFromFileWithCache::getFileName() const +{ + return mapped->getFileName(); +} + +off_t MMapReadBufferFromFileWithCache::getPosition() +{ + return count(); +} + +off_t MMapReadBufferFromFileWithCache::seek(off_t offset, int whence) +{ + off_t new_pos; + if (whence == SEEK_SET) + new_pos = offset; + else if (whence == SEEK_CUR) + new_pos = count() + offset; + else + throw Exception("MMapReadBufferFromFileWithCache::seek expects SEEK_SET or SEEK_CUR as whence", ErrorCodes::ARGUMENT_OUT_OF_BOUND); + + working_buffer = internal_buffer; + if (new_pos < 0 || new_pos > off_t(working_buffer.size())) + throw Exception("Cannot seek through file " + getFileName() + + " because seek position (" + toString(new_pos) + ") is out of bounds [0, " + toString(working_buffer.size()) + "]", + ErrorCodes::CANNOT_SEEK_THROUGH_FILE); + + position() = working_buffer.begin() + new_pos; + return new_pos; +} + +} diff --git a/src/IO/MMapReadBufferFromFileWithCache.h b/src/IO/MMapReadBufferFromFileWithCache.h new file mode 100644 index 00000000000..a6ac8d5f678 --- /dev/null +++ b/src/IO/MMapReadBufferFromFileWithCache.h @@ -0,0 +1,31 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ + +class MMapReadBufferFromFileWithCache : public ReadBufferFromFileBase +{ +public: + MMapReadBufferFromFileWithCache(const std::string & file_name, size_t offset, size_t length); + + /// Map till end of file. 
+ MMapReadBufferFromFileWithCache(const std::string & file_name, size_t offset); + + off_t getPosition() override; + std::string getFileName() const override; + off_t seek(off_t offset, int whence) override; + +private: + MappedFileCache::MappedPtr mapped; + + void init(); +}; + +} + + diff --git a/src/IO/MappedFile.cpp b/src/IO/MappedFile.cpp index 424bb077956..8867320d46c 100644 --- a/src/IO/MappedFile.cpp +++ b/src/IO/MappedFile.cpp @@ -1,100 +1,78 @@ -#include -#include -#include #include +#include -#include - +#include #include #include -#include #include +namespace ProfileEvents +{ + extern const Event FileOpen; +} + namespace DB { namespace ErrorCodes { - extern const int CANNOT_ALLOCATE_MEMORY; - extern const int CANNOT_MUNMAP; - extern const int CANNOT_STAT; - extern const int BAD_ARGUMENTS; - extern const int LOGICAL_ERROR; + extern const int FILE_DOESNT_EXIST; + extern const int CANNOT_OPEN_FILE; + extern const int CANNOT_CLOSE_FILE; } -static size_t getFileSize(int fd) +void MappedFile::open() { - struct stat stat_res {}; - if (0 != fstat(fd, &stat_res)) - throwFromErrno("MMapReadBufferFromFileDescriptor: Cannot fstat.", ErrorCodes::CANNOT_STAT); + ProfileEvents::increment(ProfileEvents::FileOpen); - off_t file_size = stat_res.st_size; + fd = ::open(file_name.c_str(), O_RDONLY | O_CLOEXEC); - if (file_size < 0) - throw Exception("MMapReadBufferFromFileDescriptor: fstat returned negative file size", ErrorCodes::LOGICAL_ERROR); - - return file_size; + if (-1 == fd) + throwFromErrnoWithPath("Cannot open file " + file_name, file_name, + errno == ENOENT ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE); } -MappedFile::MappedFile(int fd_, size_t offset_, size_t length_) +std::string MappedFile::getFileName() const { - set(fd_, offset_, length_); + return file_name; } -MappedFile::MappedFile(int fd_, size_t offset_) - : fd(fd_), offset(offset_) + +MappedFile::MappedFile(const std::string & file_name_, size_t offset_, size_t length_) + : file_name(file_name_) { - set(fd_, offset_); + open(); + set(fd, offset_, length_); } -void MappedFile::set(int fd_, size_t offset_, size_t length_) + +MappedFile::MappedFile(const std::string & file_name_, size_t offset_) + : file_name(file_name_) { - finish(); - - fd = fd_; - offset = offset_; - length = length_; - - if (length) - { - void * buf = mmap(nullptr, length, PROT_READ, MAP_PRIVATE, fd, offset); - if (MAP_FAILED == buf) - throwFromErrno(fmt::format("MMapReadBufferFromFileDescriptor: Cannot mmap {}.", ReadableSize(length)), - ErrorCodes::CANNOT_ALLOCATE_MEMORY); - - data = static_cast(buf); - } + open(); + set(fd, offset_); } -void MappedFile::set(int fd_, size_t offset_) -{ - size_t file_size = getFileSize(fd_); - - if (offset > static_cast(file_size)) - throw Exception("MMapReadBufferFromFileDescriptor: requested offset is greater than file size", ErrorCodes::BAD_ARGUMENTS); - - set(fd_, offset_, file_size - offset); -} - -void MappedFile::finish() -{ - if (!length) - return; - - if (0 != munmap(data, length)) - throwFromErrno(fmt::format("MMapReadBufferFromFileDescriptor: Cannot munmap {}.", ReadableSize(length)), - ErrorCodes::CANNOT_MUNMAP); - - length = 0; -} MappedFile::~MappedFile() { - finish(); /// Exceptions will lead to std::terminate and that's Ok. + if (fd != -1) + close(); /// Exceptions will lead to std::terminate and that's Ok. 
+} + + +void MappedFile::close() +{ + finish(); + + if (0 != ::close(fd)) + throw Exception("Cannot close file", ErrorCodes::CANNOT_CLOSE_FILE); + + fd = -1; + metric_increment.destroy(); } } - diff --git a/src/IO/MappedFile.h b/src/IO/MappedFile.h index 3e218280885..2c362bc07bc 100644 --- a/src/IO/MappedFile.h +++ b/src/IO/MappedFile.h @@ -1,48 +1,40 @@ #pragma once +#include +#include #include +namespace CurrentMetrics +{ + extern const Metric OpenFileForRead; +} + + namespace DB { -/// MMaps a region in file (or a whole file) into memory. Unmaps in destructor. -/// Does not open or close file. -class MappedFile +/// Opens a file and mmaps a region in it (or a whole file) into memory. Unmaps and closes in destructor. +class MappedFile : public MappedFileDescriptor { public: - MappedFile(int fd_, size_t offset_, size_t length_); - MappedFile(int fd_, size_t offset_); + MappedFile(const std::string & file_name_, size_t offset_, size_t length_); - /// Makes empty object that can be initialized with `set`. - MappedFile() {} + /// Map till end of file. + MappedFile(const std::string & file_name_, size_t offset_); ~MappedFile(); - char * getData() { return data; } - const char * getData() const { return data; } + void close(); - int getFD() const { return fd; } - size_t getOffset() const { return offset; } - size_t getLength() const { return length; } - - /// Unmap memory before call to destructor - void finish(); - - /// Initialize or reset to another fd. - void set(int fd_, size_t offset_, size_t length_); - void set(int fd_, size_t offset_); + std::string getFileName() const; private: - MappedFile(const MappedFile &) = delete; - MappedFile(MappedFile &&) = delete; + std::string file_name; - void init(); + CurrentMetrics::Increment metric_increment{CurrentMetrics::OpenFileForRead}; - int fd = -1; - size_t offset = 0; - size_t length = 0; - char * data = nullptr; + void open(); }; } diff --git a/src/IO/MappedFileCache.h b/src/IO/MappedFileCache.h new file mode 100644 index 00000000000..d5b35ab3060 --- /dev/null +++ b/src/IO/MappedFileCache.h @@ -0,0 +1,63 @@ +#pragma once + +#include +#include +#include +#include +#include + + +namespace ProfileEvents +{ + extern const Event MappedFileCacheHits; + extern const Event MappedFileCacheMisses; +} + +namespace DB +{ + + +/** Cache of opened and mmapped files for reading. + * mmap/munmap is heavy operation and better to keep mapped file to subsequent use than to map/unmap every time. + */ +class MappedFileCache : public LRUCache +{ +private: + using Base = LRUCache; + +public: + MappedFileCache(size_t max_size_in_bytes) + : Base(max_size_in_bytes) {} + + /// Calculate key from path to file and offset. 
+ static UInt128 hash(const String & path_to_file, size_t offset, ssize_t length = -1) + { + UInt128 key; + + SipHash hash; + hash.update(path_to_file.data(), path_to_file.size() + 1); + hash.update(offset); + hash.update(length); + + hash.get128(key.low, key.high); + + return key; + } + + MappedPtr get(const Key & key) + { + MappedPtr res = Base::get(key); + + if (res) + ProfileEvents::increment(ProfileEvents::MappedFileCacheHits); + else + ProfileEvents::increment(ProfileEvents::MappedFileCacheMisses); + + return res; + } +}; + +using MappedFileCachePtr = std::shared_ptr; + +} + diff --git a/src/IO/MappedFileDescriptor.cpp b/src/IO/MappedFileDescriptor.cpp new file mode 100644 index 00000000000..6260ffa35e8 --- /dev/null +++ b/src/IO/MappedFileDescriptor.cpp @@ -0,0 +1,101 @@ +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_ALLOCATE_MEMORY; + extern const int CANNOT_MUNMAP; + extern const int CANNOT_STAT; + extern const int BAD_ARGUMENTS; + extern const int LOGICAL_ERROR; +} + + +static size_t getFileSize(int fd) +{ + struct stat stat_res {}; + if (0 != fstat(fd, &stat_res)) + throwFromErrno("MappedFileDescriptor: Cannot fstat.", ErrorCodes::CANNOT_STAT); + + off_t file_size = stat_res.st_size; + + if (file_size < 0) + throw Exception("MappedFileDescriptor: fstat returned negative file size", ErrorCodes::LOGICAL_ERROR); + + return file_size; +} + + +MappedFileDescriptor::MappedFileDescriptor(int fd_, size_t offset_, size_t length_) +{ + set(fd_, offset_, length_); +} + +MappedFileDescriptor::MappedFileDescriptor(int fd_, size_t offset_) + : fd(fd_), offset(offset_) +{ + set(fd_, offset_); +} + +void MappedFileDescriptor::set(int fd_, size_t offset_, size_t length_) +{ + finish(); + + fd = fd_; + offset = offset_; + length = length_; + + if (length) + { + void * buf = mmap(nullptr, length, PROT_READ, MAP_PRIVATE, fd, offset); + if (MAP_FAILED == buf) + throwFromErrno(fmt::format("MappedFileDescriptor: Cannot mmap {}.", ReadableSize(length)), + ErrorCodes::CANNOT_ALLOCATE_MEMORY); + + data = static_cast(buf); + } +} + +void MappedFileDescriptor::set(int fd_, size_t offset_) +{ + size_t file_size = getFileSize(fd_); + + if (offset > static_cast(file_size)) + throw Exception("MappedFileDescriptor: requested offset is greater than file size", ErrorCodes::BAD_ARGUMENTS); + + set(fd_, offset_, file_size - offset); +} + +void MappedFileDescriptor::finish() +{ + if (!length) + return; + + if (0 != munmap(data, length)) + throwFromErrno(fmt::format("MappedFileDescriptor: Cannot munmap {}.", ReadableSize(length)), + ErrorCodes::CANNOT_MUNMAP); + + length = 0; +} + +MappedFileDescriptor::~MappedFileDescriptor() +{ + finish(); /// Exceptions will lead to std::terminate and that's Ok. +} + +} + + diff --git a/src/IO/MappedFileDescriptor.h b/src/IO/MappedFileDescriptor.h new file mode 100644 index 00000000000..950b39668f0 --- /dev/null +++ b/src/IO/MappedFileDescriptor.h @@ -0,0 +1,49 @@ +#pragma once + +#include + + +namespace DB +{ + +/// MMaps a region in file (or a whole file) into memory. Unmaps in destructor. +/// Does not open or close file. +class MappedFileDescriptor +{ +public: + MappedFileDescriptor(int fd_, size_t offset_, size_t length_); + MappedFileDescriptor(int fd_, size_t offset_); + + /// Makes empty object that can be initialized with `set`. 
+ MappedFileDescriptor() {} + + ~MappedFileDescriptor(); + + char * getData() { return data; } + const char * getData() const { return data; } + + int getFD() const { return fd; } + size_t getOffset() const { return offset; } + size_t getLength() const { return length; } + + /// Unmap memory before call to destructor + void finish(); + + /// Initialize or reset to another fd. + void set(int fd_, size_t offset_, size_t length_); + void set(int fd_, size_t offset_); + +protected: + MappedFileDescriptor(const MappedFileDescriptor &) = delete; + MappedFileDescriptor(MappedFileDescriptor &&) = delete; + + void init(); + + int fd = -1; + size_t offset = 0; + size_t length = 0; + char * data = nullptr; +}; + +} + diff --git a/src/IO/createReadBufferFromFileBase.cpp b/src/IO/createReadBufferFromFileBase.cpp index c1d4377fdff..2a303775a07 100644 --- a/src/IO/createReadBufferFromFileBase.cpp +++ b/src/IO/createReadBufferFromFileBase.cpp @@ -3,7 +3,7 @@ #if defined(OS_LINUX) || defined(__FreeBSD__) #include #endif -#include +#include #include @@ -49,7 +49,7 @@ std::unique_ptr createReadBufferFromFileBase( { try { - auto res = std::make_unique(filename_, 0); + auto res = std::make_unique(filename_, 0); ProfileEvents::increment(ProfileEvents::CreatedReadBufferMMap); return res; } diff --git a/src/IO/ya.make b/src/IO/ya.make index 58df027c561..2d282818272 100644 --- a/src/IO/ya.make +++ b/src/IO/ya.make @@ -34,6 +34,9 @@ SRCS( LimitReadBuffer.cpp MMapReadBufferFromFile.cpp MMapReadBufferFromFileDescriptor.cpp + MMapReadBufferFromFileWithCache.cpp + MappedFile.cpp + MappedFileDescriptor.cpp MemoryReadWriteBuffer.cpp MySQLBinlogEventReadBuffer.cpp MySQLPacketPayloadReadBuffer.cpp From 0e3571478d754d50d0397341ccb276d80d1ee993 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 27 Mar 2021 02:42:22 +0300 Subject: [PATCH 241/260] Code review changes --- src/Compression/CompressedReadBufferBase.cpp | 4 ++-- src/Compression/ICompressionCodec.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Compression/CompressedReadBufferBase.cpp b/src/Compression/CompressedReadBufferBase.cpp index eb4d6ea5986..79757d6f151 100644 --- a/src/Compression/CompressedReadBufferBase.cpp +++ b/src/Compression/CompressedReadBufferBase.cpp @@ -232,8 +232,8 @@ void CompressedReadBufferBase::decompress(BufferBase::Buffer & to, size_t size_d UInt8 header_size = ICompressionCodec::getHeaderSize(); if (size_compressed_without_checksum < header_size) throw Exception(ErrorCodes::CORRUPTED_DATA, - "Can't decompress data: the compressed data size ({}), this should include header size) is less than the header size ({})", - size_compressed_without_checksum, size_t(header_size)); + "Can't decompress data: the compressed data size ({}, this should include header size) is less than the header size ({})", + size_compressed_without_checksum, static_cast(header_size)); to = BufferBase::Buffer(compressed_buffer + header_size, compressed_buffer + size_compressed_without_checksum); } diff --git a/src/Compression/ICompressionCodec.cpp b/src/Compression/ICompressionCodec.cpp index dec2b633046..46a12e50828 100644 --- a/src/Compression/ICompressionCodec.cpp +++ b/src/Compression/ICompressionCodec.cpp @@ -98,7 +98,7 @@ UInt32 ICompressionCodec::decompress(const char * source, UInt32 source_size, ch UInt8 header_size = getHeaderSize(); if (source_size < header_size) - throw Exception(ErrorCodes::CORRUPTED_DATA, "Can't decompress data: the compressed data size ({}), this should include header size) is less than the header size 
({})", source_size, size_t(header_size)); + throw Exception(ErrorCodes::CORRUPTED_DATA, "Can't decompress data: the compressed data size ({}, this should include header size) is less than the header size ({})", source_size, static_cast(header_size)); uint8_t our_method = getMethodByte(); uint8_t method = source[0]; From e0ab461636ea85046f93c6ce1be8113c9e3e0a1e Mon Sep 17 00:00:00 2001 From: Denis Zhuravlev Date: Sat, 27 Mar 2021 19:43:26 -0300 Subject: [PATCH 242/260] test for #10489 --- ...st_LowCardinality_FixedString_pk.reference | 3 +++ ...778_test_LowCardinality_FixedString_pk.sql | 21 +++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 tests/queries/0_stateless/01778_test_LowCardinality_FixedString_pk.reference create mode 100644 tests/queries/0_stateless/01778_test_LowCardinality_FixedString_pk.sql diff --git a/tests/queries/0_stateless/01778_test_LowCardinality_FixedString_pk.reference b/tests/queries/0_stateless/01778_test_LowCardinality_FixedString_pk.reference new file mode 100644 index 00000000000..a134ce52c11 --- /dev/null +++ b/tests/queries/0_stateless/01778_test_LowCardinality_FixedString_pk.reference @@ -0,0 +1,3 @@ +100 +100 +100 diff --git a/tests/queries/0_stateless/01778_test_LowCardinality_FixedString_pk.sql b/tests/queries/0_stateless/01778_test_LowCardinality_FixedString_pk.sql new file mode 100644 index 00000000000..1a0a1d35f76 --- /dev/null +++ b/tests/queries/0_stateless/01778_test_LowCardinality_FixedString_pk.sql @@ -0,0 +1,21 @@ +DROP TABLE IF EXISTS test_01778; + +CREATE TABLE test_01778 +( + `key` LowCardinality(FixedString(3)), + `d` date +) +ENGINE = MergeTree(d, key, 8192); + + +INSERT INTO test_01778 SELECT toString(intDiv(number,8000)), today() FROM numbers(100000); +INSERT INTO test_01778 SELECT toString('xxx'), today() FROM numbers(100); + +SELECT count() FROM test_01778 WHERE key = 'xxx'; + +SELECT count() FROM test_01778 WHERE key = toFixedString('xxx', 3); + +SELECT count() FROM test_01778 WHERE toString(key) = 'xxx'; + +DROP TABLE test_01778; + From cbf5913109ffa053b5fed64779db92fb8154041b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Mar 2021 01:57:59 +0300 Subject: [PATCH 243/260] Review fixes --- src/IO/MMapReadBufferFromFileDescriptor.cpp | 6 ------ src/IO/MMapReadBufferFromFileDescriptor.h | 2 -- src/IO/MappedFile.h | 2 +- src/IO/MappedFileDescriptor.h | 2 +- 4 files changed, 2 insertions(+), 10 deletions(-) diff --git a/src/IO/MMapReadBufferFromFileDescriptor.cpp b/src/IO/MMapReadBufferFromFileDescriptor.cpp index 56e9e384cd4..eb5957dd70b 100644 --- a/src/IO/MMapReadBufferFromFileDescriptor.cpp +++ b/src/IO/MMapReadBufferFromFileDescriptor.cpp @@ -45,12 +45,6 @@ MMapReadBufferFromFileDescriptor::MMapReadBufferFromFileDescriptor(int fd, size_ } -MMapReadBufferFromFileDescriptor::~MMapReadBufferFromFileDescriptor() -{ - finish(); -} - - void MMapReadBufferFromFileDescriptor::finish() { mapped.finish(); diff --git a/src/IO/MMapReadBufferFromFileDescriptor.h b/src/IO/MMapReadBufferFromFileDescriptor.h index eb7fc4ded64..0068b550899 100644 --- a/src/IO/MMapReadBufferFromFileDescriptor.h +++ b/src/IO/MMapReadBufferFromFileDescriptor.h @@ -29,8 +29,6 @@ public: /// Map till end of file. 
MMapReadBufferFromFileDescriptor(int fd_, size_t offset_); - ~MMapReadBufferFromFileDescriptor() override; - /// unmap memory before call to destructor void finish(); diff --git a/src/IO/MappedFile.h b/src/IO/MappedFile.h index 2c362bc07bc..c023526fae5 100644 --- a/src/IO/MappedFile.h +++ b/src/IO/MappedFile.h @@ -23,7 +23,7 @@ public: /// Map till end of file. MappedFile(const std::string & file_name_, size_t offset_); - ~MappedFile(); + ~MappedFile() override; void close(); diff --git a/src/IO/MappedFileDescriptor.h b/src/IO/MappedFileDescriptor.h index 950b39668f0..c119b9eb2ea 100644 --- a/src/IO/MappedFileDescriptor.h +++ b/src/IO/MappedFileDescriptor.h @@ -17,7 +17,7 @@ public: /// Makes empty object that can be initialized with `set`. MappedFileDescriptor() {} - ~MappedFileDescriptor(); + virtual ~MappedFileDescriptor(); char * getData() { return data; } const char * getData() const { return data; } From bdb69b8f08cf807d81be6f6af7aa2a3e8eef7e99 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Mar 2021 02:11:46 +0300 Subject: [PATCH 244/260] Slightly better --- src/IO/MMapReadBufferFromFileDescriptor.cpp | 8 ++++---- src/IO/MMapReadBufferFromFileWithCache.cpp | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/IO/MMapReadBufferFromFileDescriptor.cpp b/src/IO/MMapReadBufferFromFileDescriptor.cpp index eb5957dd70b..c5adfc017ae 100644 --- a/src/IO/MMapReadBufferFromFileDescriptor.cpp +++ b/src/IO/MMapReadBufferFromFileDescriptor.cpp @@ -74,13 +74,13 @@ off_t MMapReadBufferFromFileDescriptor::seek(off_t offset, int whence) else if (whence == SEEK_CUR) new_pos = count() + offset; else - throw Exception("MMapReadBufferFromFileDescriptor::seek expects SEEK_SET or SEEK_CUR as whence", ErrorCodes::ARGUMENT_OUT_OF_BOUND); + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "MMapReadBufferFromFileDescriptor::seek expects SEEK_SET or SEEK_CUR as whence"); working_buffer = internal_buffer; if (new_pos < 0 || new_pos > off_t(working_buffer.size())) - throw Exception("Cannot seek through file " + getFileName() - + " because seek position (" + toString(new_pos) + ") is out of bounds [0, " + toString(working_buffer.size()) + "]", - ErrorCodes::CANNOT_SEEK_THROUGH_FILE); + throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, + "Cannot seek through file {} because seek position ({}) is out of bounds [0, {}]", + getFileName(), new_pos, working_buffer.size()); position() = working_buffer.begin() + new_pos; return new_pos; diff --git a/src/IO/MMapReadBufferFromFileWithCache.cpp b/src/IO/MMapReadBufferFromFileWithCache.cpp index 8d92e9d835a..9692e73f672 100644 --- a/src/IO/MMapReadBufferFromFileWithCache.cpp +++ b/src/IO/MMapReadBufferFromFileWithCache.cpp @@ -69,13 +69,13 @@ off_t MMapReadBufferFromFileWithCache::seek(off_t offset, int whence) else if (whence == SEEK_CUR) new_pos = count() + offset; else - throw Exception("MMapReadBufferFromFileWithCache::seek expects SEEK_SET or SEEK_CUR as whence", ErrorCodes::ARGUMENT_OUT_OF_BOUND); + throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "MMapReadBufferFromFileWithCache::seek expects SEEK_SET or SEEK_CUR as whence"); working_buffer = internal_buffer; if (new_pos < 0 || new_pos > off_t(working_buffer.size())) - throw Exception("Cannot seek through file " + getFileName() - + " because seek position (" + toString(new_pos) + ") is out of bounds [0, " + toString(working_buffer.size()) + "]", - ErrorCodes::CANNOT_SEEK_THROUGH_FILE); + throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, + "Cannot seek through file {} 
because seek position ({}) is out of bounds [0, {}]", + getFileName(), new_pos, working_buffer.size()); position() = working_buffer.begin() + new_pos; return new_pos; From 50f712e198638e26aaf1834a8c883c70ec083d13 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Mar 2021 04:10:30 +0300 Subject: [PATCH 245/260] Integrate mmap cache to the infrastructure --- programs/local/LocalServer.cpp | 5 +++ programs/server/Server.cpp | 5 +++ programs/server/config.xml | 17 ++++++++++ src/Access/AccessType.h | 1 + src/Common/CurrentMetrics.cpp | 2 ++ .../CompressedReadBufferFromFile.cpp | 10 ++++-- .../CompressedReadBufferFromFile.h | 4 ++- .../tests/cached_compressed_read_buffer.cpp | 4 +-- src/Disks/DiskCacheWrapper.cpp | 20 +++++++----- src/Disks/DiskCacheWrapper.h | 15 ++++++--- src/Disks/DiskDecorator.cpp | 5 +-- src/Disks/DiskDecorator.h | 18 ++++++++--- src/Disks/DiskLocal.cpp | 5 +-- src/Disks/DiskLocal.h | 3 +- src/Disks/DiskMemory.cpp | 2 +- src/Disks/DiskMemory.h | 3 +- src/Disks/IDisk.h | 4 ++- src/Disks/S3/DiskS3.cpp | 2 +- src/Disks/S3/DiskS3.h | 3 +- src/IO/MMapReadBufferFromFileWithCache.cpp | 11 ++----- src/IO/MMapReadBufferFromFileWithCache.h | 6 ++-- src/IO/MappedFileDescriptor.cpp | 22 ++++++++----- src/IO/MappedFileDescriptor.h | 10 ++++++ src/IO/createReadBufferFromFileBase.cpp | 6 ++-- src/IO/createReadBufferFromFileBase.h | 4 +++ src/Interpreters/AsynchronousMetrics.cpp | 8 +++++ src/Interpreters/Context.cpp | 31 +++++++++++++++++-- src/Interpreters/Context.h | 6 ++++ src/Interpreters/InterpreterSystemQuery.cpp | 5 +++ src/Parsers/ASTSystemQuery.cpp | 2 ++ src/Parsers/ASTSystemQuery.h | 1 + src/Storages/MergeTree/DataPartsExchange.cpp | 2 +- src/Storages/MergeTree/MergeTreeData.cpp | 4 +++ .../MergeTree/MergeTreeDataPartWriterWide.cpp | 2 +- .../MergeTree/MergeTreeDataSelectExecutor.cpp | 10 +++--- src/Storages/MergeTree/MergeTreeIOSettings.h | 6 ++++ .../MergeTree/MergeTreeReaderCompact.cpp | 10 ++++-- .../MergeTree/MergeTreeReaderStream.cpp | 12 +++++-- 38 files changed, 218 insertions(+), 68 deletions(-) diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 7c6b60fbf8e..31f9ca5b8c4 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -260,6 +260,11 @@ try if (mark_cache_size) global_context->setMarkCache(mark_cache_size); + /// A cache for mmapped files. + size_t mmap_cache_size = config().getUInt64("mmap_cache_size", 1000); /// The choice of default is arbitrary. + if (mmap_cache_size) + global_context->setMappedFileCache(mmap_cache_size); + /// Load global settings from default_profile and system_profile. global_context->setDefaultProfiles(config()); diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index c37609a6e10..9728fe402d2 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -829,6 +829,11 @@ int Server::main(const std::vector & /*args*/) } global_context->setMarkCache(mark_cache_size); + /// A cache for mmapped files. + size_t mmap_cache_size = config().getUInt64("mmap_cache_size", 1000); /// The choice of default is arbitrary. 
+ if (mmap_cache_size) + global_context->setMappedFileCache(mmap_cache_size); + #if USE_EMBEDDED_COMPILER size_t compiled_expression_cache_size = config().getUInt64("compiled_expression_cache_size", 500); CompiledExpressionCacheFactory::instance().init(compiled_expression_cache_size); diff --git a/programs/server/config.xml b/programs/server/config.xml index 4220ecbcacd..543f7c54ca6 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -297,6 +297,23 @@ --> 5368709120 + + 1000 + /var/lib/clickhouse/ diff --git a/src/Access/AccessType.h b/src/Access/AccessType.h index db3798fbf63..40740b3164e 100644 --- a/src/Access/AccessType.h +++ b/src/Access/AccessType.h @@ -124,6 +124,7 @@ enum class AccessType M(SYSTEM_DROP_DNS_CACHE, "SYSTEM DROP DNS, DROP DNS CACHE, DROP DNS", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_MARK_CACHE, "SYSTEM DROP MARK, DROP MARK CACHE, DROP MARKS", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_UNCOMPRESSED_CACHE, "SYSTEM DROP UNCOMPRESSED, DROP UNCOMPRESSED CACHE, DROP UNCOMPRESSED", GLOBAL, SYSTEM_DROP_CACHE) \ + M(SYSTEM_DROP_MMAP_CACHE, "SYSTEM DROP MMAP, DROP MMAP CACHE, DROP MMAP", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_COMPILED_EXPRESSION_CACHE, "SYSTEM DROP COMPILED EXPRESSION, DROP COMPILED EXPRESSION CACHE, DROP COMPILED EXPRESSIONS", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_CACHE, "DROP CACHE", GROUP, SYSTEM) \ M(SYSTEM_RELOAD_CONFIG, "RELOAD CONFIG", GLOBAL, SYSTEM_RELOAD) \ diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 2bc6258aa18..e45339efe9f 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -66,6 +66,8 @@ M(PartsWide, "Wide parts.") \ M(PartsCompact, "Compact parts.") \ M(PartsInMemory, "In-memory parts.") \ + M(MappedFiles, "Total number of mmapped files.") \ + M(MappedFileBytes, "Sum size of mmapped file regions.") \ namespace CurrentMetrics { diff --git a/src/Compression/CompressedReadBufferFromFile.cpp b/src/Compression/CompressedReadBufferFromFile.cpp index 3a75ea14166..e3cec15d504 100644 --- a/src/Compression/CompressedReadBufferFromFile.cpp +++ b/src/Compression/CompressedReadBufferFromFile.cpp @@ -45,9 +45,15 @@ CompressedReadBufferFromFile::CompressedReadBufferFromFile(std::unique_ptr(0) - , p_file_in(createReadBufferFromFileBase(path, estimated_size, aio_threshold, mmap_threshold, buf_size)) + , p_file_in(createReadBufferFromFileBase(path, estimated_size, aio_threshold, mmap_threshold, mmap_cache, buf_size)) , file_in(*p_file_in) { compressed_in = &file_in; diff --git a/src/Compression/CompressedReadBufferFromFile.h b/src/Compression/CompressedReadBufferFromFile.h index 166b2595ef9..6e7d0df8333 100644 --- a/src/Compression/CompressedReadBufferFromFile.h +++ b/src/Compression/CompressedReadBufferFromFile.h @@ -9,6 +9,8 @@ namespace DB { +class MappedFileCache; + /// Unlike CompressedReadBuffer, it can do seek. 
class CompressedReadBufferFromFile : public CompressedReadBufferBase, public BufferWithOwnMemory @@ -31,7 +33,7 @@ public: CompressedReadBufferFromFile(std::unique_ptr buf, bool allow_different_codecs_ = false); CompressedReadBufferFromFile( - const std::string & path, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, + const std::string & path, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, MappedFileCache * mmap_cache, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, bool allow_different_codecs_ = false); void seek(size_t offset_in_compressed_file, size_t offset_in_decompressed_block); diff --git a/src/Compression/tests/cached_compressed_read_buffer.cpp b/src/Compression/tests/cached_compressed_read_buffer.cpp index ed198e36e46..94e8b356bd5 100644 --- a/src/Compression/tests/cached_compressed_read_buffer.cpp +++ b/src/Compression/tests/cached_compressed_read_buffer.cpp @@ -37,7 +37,7 @@ int main(int argc, char ** argv) path, [&]() { - return createReadBufferFromFileBase(path, 0, 0, 0); + return createReadBufferFromFileBase(path, 0, 0, 0, nullptr); }, &cache ); @@ -56,7 +56,7 @@ int main(int argc, char ** argv) path, [&]() { - return createReadBufferFromFileBase(path, 0, 0, 0); + return createReadBufferFromFileBase(path, 0, 0, 0, nullptr); }, &cache ); diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp index 7586b4a28f0..589a39a5731 100644 --- a/src/Disks/DiskCacheWrapper.cpp +++ b/src/Disks/DiskCacheWrapper.cpp @@ -103,15 +103,21 @@ std::shared_ptr DiskCacheWrapper::acquireDownloadMetadata( } std::unique_ptr -DiskCacheWrapper::readFile(const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold) const +DiskCacheWrapper::readFile( + const String & path, + size_t buf_size, + size_t estimated_size, + size_t aio_threshold, + size_t mmap_threshold, + MappedFileCache * mmap_cache) const { if (!cache_file_predicate(path)) - return DiskDecorator::readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold); + return DiskDecorator::readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold, mmap_cache); LOG_DEBUG(&Poco::Logger::get("DiskCache"), "Read file {} from cache", backQuote(path)); if (cache_disk->exists(path)) - return cache_disk->readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold); + return cache_disk->readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold, mmap_cache); auto metadata = acquireDownloadMetadata(path); @@ -145,7 +151,7 @@ DiskCacheWrapper::readFile(const String & path, size_t buf_size, size_t estimate auto tmp_path = path + ".tmp"; { - auto src_buffer = DiskDecorator::readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold); + auto src_buffer = DiskDecorator::readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold, mmap_cache); auto dst_buffer = cache_disk->writeFile(tmp_path, buf_size, WriteMode::Rewrite); copyData(*src_buffer, *dst_buffer); } @@ -169,9 +175,9 @@ DiskCacheWrapper::readFile(const String & path, size_t buf_size, size_t estimate } if (metadata->status == DOWNLOADED) - return cache_disk->readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold); + return cache_disk->readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold, mmap_cache); - return DiskDecorator::readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold); + return DiskDecorator::readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold, mmap_cache); } 
std::unique_ptr @@ -191,7 +197,7 @@ DiskCacheWrapper::writeFile(const String & path, size_t buf_size, WriteMode mode [this, path, buf_size, mode]() { /// Copy file from cache to actual disk when cached buffer is finalized. - auto src_buffer = cache_disk->readFile(path, buf_size, 0, 0, 0); + auto src_buffer = cache_disk->readFile(path, buf_size, 0, 0, 0, nullptr); auto dst_buffer = DiskDecorator::writeFile(path, buf_size, mode); copyData(*src_buffer, *dst_buffer); dst_buffer->finalize(); diff --git a/src/Disks/DiskCacheWrapper.h b/src/Disks/DiskCacheWrapper.h index 46aadb78007..e7920fa25b6 100644 --- a/src/Disks/DiskCacheWrapper.h +++ b/src/Disks/DiskCacheWrapper.h @@ -32,10 +32,17 @@ public: void moveDirectory(const String & from_path, const String & to_path) override; void moveFile(const String & from_path, const String & to_path) override; void replaceFile(const String & from_path, const String & to_path) override; - std::unique_ptr - readFile(const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold) const override; - std::unique_ptr - writeFile(const String & path, size_t buf_size, WriteMode mode) override; + + std::unique_ptr readFile( + const String & path, + size_t buf_size, + size_t estimated_size, + size_t aio_threshold, + size_t mmap_threshold, + MappedFileCache * mmap_cache) const override; + + std::unique_ptr writeFile(const String & path, size_t buf_size, WriteMode mode) override; + void removeFile(const String & path) override; void removeFileIfExists(const String & path) override; void removeDirectory(const String & path) override; diff --git a/src/Disks/DiskDecorator.cpp b/src/Disks/DiskDecorator.cpp index eeb72fe1246..c12612f9e5d 100644 --- a/src/Disks/DiskDecorator.cpp +++ b/src/Disks/DiskDecorator.cpp @@ -114,9 +114,10 @@ void DiskDecorator::listFiles(const String & path, std::vector & file_na } std::unique_ptr -DiskDecorator::readFile(const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold) const +DiskDecorator::readFile( + const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, MappedFileCache * mmap_cache) const { - return delegate->readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold); + return delegate->readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold, mmap_cache); } std::unique_ptr diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h index d5ac6f0fda0..837fb4532fd 100644 --- a/src/Disks/DiskDecorator.h +++ b/src/Disks/DiskDecorator.h @@ -34,10 +34,20 @@ public: void replaceFile(const String & from_path, const String & to_path) override; void copy(const String & from_path, const std::shared_ptr & to_disk, const String & to_path) override; void listFiles(const String & path, std::vector & file_names) override; - std::unique_ptr - readFile(const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold) const override; - std::unique_ptr - writeFile(const String & path, size_t buf_size, WriteMode mode) override; + + std::unique_ptr readFile( + const String & path, + size_t buf_size, + size_t estimated_size, + size_t aio_threshold, + size_t mmap_threshold, + MappedFileCache * mmap_cache) const override; + + std::unique_ptr writeFile( + const String & path, + size_t buf_size, + WriteMode mode) override; + void removeFile(const String & path) override; void removeFileIfExists(const String & path) override; void removeDirectory(const 
String & path) override; diff --git a/src/Disks/DiskLocal.cpp b/src/Disks/DiskLocal.cpp index 5035a865191..4f7cad06456 100644 --- a/src/Disks/DiskLocal.cpp +++ b/src/Disks/DiskLocal.cpp @@ -219,9 +219,10 @@ void DiskLocal::replaceFile(const String & from_path, const String & to_path) } std::unique_ptr -DiskLocal::readFile(const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold) const +DiskLocal::readFile( + const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, MappedFileCache * mmap_cache) const { - return createReadBufferFromFileBase(disk_path + path, estimated_size, aio_threshold, mmap_threshold, buf_size); + return createReadBufferFromFileBase(disk_path + path, estimated_size, aio_threshold, mmap_threshold, mmap_cache, buf_size); } std::unique_ptr diff --git a/src/Disks/DiskLocal.h b/src/Disks/DiskLocal.h index 7dbfbe445f8..e1c9cb11d0e 100644 --- a/src/Disks/DiskLocal.h +++ b/src/Disks/DiskLocal.h @@ -76,7 +76,8 @@ public: size_t buf_size, size_t estimated_size, size_t aio_threshold, - size_t mmap_threshold) const override; + size_t mmap_threshold, + MappedFileCache * mmap_cache) const override; std::unique_ptr writeFile( const String & path, diff --git a/src/Disks/DiskMemory.cpp b/src/Disks/DiskMemory.cpp index a0905e67427..39a95357900 100644 --- a/src/Disks/DiskMemory.cpp +++ b/src/Disks/DiskMemory.cpp @@ -314,7 +314,7 @@ void DiskMemory::replaceFileImpl(const String & from_path, const String & to_pat files.insert(std::move(node)); } -std::unique_ptr DiskMemory::readFile(const String & path, size_t /*buf_size*/, size_t, size_t, size_t) const +std::unique_ptr DiskMemory::readFile(const String & path, size_t /*buf_size*/, size_t, size_t, size_t, MappedFileCache *) const { std::lock_guard lock(mutex); diff --git a/src/Disks/DiskMemory.h b/src/Disks/DiskMemory.h index 29ac4919833..e7e61393cbc 100644 --- a/src/Disks/DiskMemory.h +++ b/src/Disks/DiskMemory.h @@ -67,7 +67,8 @@ public: size_t buf_size, size_t estimated_size, size_t aio_threshold, - size_t mmap_threshold) const override; + size_t mmap_threshold, + MappedFileCache * mmap_cache) const override; std::unique_ptr writeFile( const String & path, diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 44c4fe73d37..734061a284e 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -31,6 +31,7 @@ using Reservations = std::vector; class ReadBufferFromFileBase; class WriteBufferFromFileBase; +class MappedFileCache; /** * Mode of opening a file for write. @@ -153,7 +154,8 @@ public: size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, size_t estimated_size = 0, size_t aio_threshold = 0, - size_t mmap_threshold = 0) const = 0; + size_t mmap_threshold = 0, + MappedFileCache * mmap_cache = nullptr) const = 0; /// Open the file for write and return WriteBufferFromFileBase object. 
virtual std::unique_ptr writeFile( diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index bb9966eb6ff..f23b1b0a615 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -671,7 +671,7 @@ void DiskS3::replaceFile(const String & from_path, const String & to_path) moveFile(from_path, to_path); } -std::unique_ptr DiskS3::readFile(const String & path, size_t buf_size, size_t, size_t, size_t) const +std::unique_ptr DiskS3::readFile(const String & path, size_t buf_size, size_t, size_t, size_t, MappedFileCache *) const { auto metadata = readMeta(path); diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index 5d9effa16fa..7f2698ad6d1 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -89,7 +89,8 @@ public: size_t buf_size, size_t estimated_size, size_t aio_threshold, - size_t mmap_threshold) const override; + size_t mmap_threshold, + MappedFileCache * mmap_cache) const override; std::unique_ptr writeFile( const String & path, diff --git a/src/IO/MMapReadBufferFromFileWithCache.cpp b/src/IO/MMapReadBufferFromFileWithCache.cpp index 9692e73f672..6796484b40b 100644 --- a/src/IO/MMapReadBufferFromFileWithCache.cpp +++ b/src/IO/MMapReadBufferFromFileWithCache.cpp @@ -4,13 +4,6 @@ namespace DB { -namespace -{ - /// TODO: Move to a better place, make configurable. - MappedFileCache cache(1000); -} - - namespace ErrorCodes { extern const int ARGUMENT_OUT_OF_BOUND; @@ -29,7 +22,7 @@ void MMapReadBufferFromFileWithCache::init() MMapReadBufferFromFileWithCache::MMapReadBufferFromFileWithCache( - const std::string & file_name, size_t offset, size_t length) + MappedFileCache & cache, const std::string & file_name, size_t offset, size_t length) { mapped = cache.getOrSet(cache.hash(file_name, offset, length), [&] { @@ -40,7 +33,7 @@ MMapReadBufferFromFileWithCache::MMapReadBufferFromFileWithCache( } MMapReadBufferFromFileWithCache::MMapReadBufferFromFileWithCache( - const std::string & file_name, size_t offset) + MappedFileCache & cache, const std::string & file_name, size_t offset) { mapped = cache.getOrSet(cache.hash(file_name, offset, -1), [&] { diff --git a/src/IO/MMapReadBufferFromFileWithCache.h b/src/IO/MMapReadBufferFromFileWithCache.h index a6ac8d5f678..6d03be46735 100644 --- a/src/IO/MMapReadBufferFromFileWithCache.h +++ b/src/IO/MMapReadBufferFromFileWithCache.h @@ -11,10 +11,10 @@ namespace DB class MMapReadBufferFromFileWithCache : public ReadBufferFromFileBase { public: - MMapReadBufferFromFileWithCache(const std::string & file_name, size_t offset, size_t length); + MMapReadBufferFromFileWithCache(MappedFileCache & cache, const std::string & file_name, size_t offset, size_t length); /// Map till end of file. 
- MMapReadBufferFromFileWithCache(const std::string & file_name, size_t offset); + MMapReadBufferFromFileWithCache(MappedFileCache & cache, const std::string & file_name, size_t offset); off_t getPosition() override; std::string getFileName() const override; @@ -27,5 +27,3 @@ private: }; } - - diff --git a/src/IO/MappedFileDescriptor.cpp b/src/IO/MappedFileDescriptor.cpp index 6260ffa35e8..31caf0c4728 100644 --- a/src/IO/MappedFileDescriptor.cpp +++ b/src/IO/MappedFileDescriptor.cpp @@ -58,15 +58,18 @@ void MappedFileDescriptor::set(int fd_, size_t offset_, size_t length_) offset = offset_; length = length_; - if (length) - { - void * buf = mmap(nullptr, length, PROT_READ, MAP_PRIVATE, fd, offset); - if (MAP_FAILED == buf) - throwFromErrno(fmt::format("MappedFileDescriptor: Cannot mmap {}.", ReadableSize(length)), - ErrorCodes::CANNOT_ALLOCATE_MEMORY); + if (!length) + return; - data = static_cast(buf); - } + void * buf = mmap(nullptr, length, PROT_READ, MAP_PRIVATE, fd, offset); + if (MAP_FAILED == buf) + throwFromErrno(fmt::format("MappedFileDescriptor: Cannot mmap {}.", ReadableSize(length)), + ErrorCodes::CANNOT_ALLOCATE_MEMORY); + + data = static_cast(buf); + + files_metric_increment.changeTo(1); + bytes_metric_increment.changeTo(length); } void MappedFileDescriptor::set(int fd_, size_t offset_) @@ -89,6 +92,9 @@ void MappedFileDescriptor::finish() ErrorCodes::CANNOT_MUNMAP); length = 0; + + files_metric_increment.changeTo(0); + bytes_metric_increment.changeTo(0); } MappedFileDescriptor::~MappedFileDescriptor() diff --git a/src/IO/MappedFileDescriptor.h b/src/IO/MappedFileDescriptor.h index c119b9eb2ea..fbe7fa1915c 100644 --- a/src/IO/MappedFileDescriptor.h +++ b/src/IO/MappedFileDescriptor.h @@ -1,6 +1,13 @@ #pragma once #include +#include + +namespace CurrentMetrics +{ + extern const Metric MappedFiles; + extern const Metric MappedFileBytes; +} namespace DB @@ -43,6 +50,9 @@ protected: size_t offset = 0; size_t length = 0; char * data = nullptr; + + CurrentMetrics::Increment files_metric_increment{CurrentMetrics::MappedFiles, 0}; + CurrentMetrics::Increment bytes_metric_increment{CurrentMetrics::MappedFileBytes, 0}; }; } diff --git a/src/IO/createReadBufferFromFileBase.cpp b/src/IO/createReadBufferFromFileBase.cpp index 2a303775a07..69e14169a87 100644 --- a/src/IO/createReadBufferFromFileBase.cpp +++ b/src/IO/createReadBufferFromFileBase.cpp @@ -21,7 +21,7 @@ namespace DB std::unique_ptr createReadBufferFromFileBase( const std::string & filename_, - size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, + size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, MappedFileCache * mmap_cache, size_t buffer_size_, int flags_, char * existing_memory_, size_t alignment) { #if defined(OS_LINUX) || defined(__FreeBSD__) @@ -45,11 +45,11 @@ std::unique_ptr createReadBufferFromFileBase( (void)estimated_size; #endif - if (!existing_memory_ && mmap_threshold && estimated_size >= mmap_threshold) + if (!existing_memory_ && mmap_threshold && mmap_cache && estimated_size >= mmap_threshold) { try { - auto res = std::make_unique(filename_, 0); + auto res = std::make_unique(*mmap_cache, filename_, 0); ProfileEvents::increment(ProfileEvents::CreatedReadBufferMMap); return res; } diff --git a/src/IO/createReadBufferFromFileBase.h b/src/IO/createReadBufferFromFileBase.h index 61dfde6229f..16642196d50 100644 --- a/src/IO/createReadBufferFromFileBase.h +++ b/src/IO/createReadBufferFromFileBase.h @@ -8,6 +8,9 @@ namespace DB { +class MappedFileCache; + + /** Create an object to read 
data from a file. * estimated_size - the number of bytes to read * aio_threshold - the minimum number of bytes for asynchronous reads @@ -20,6 +23,7 @@ std::unique_ptr createReadBufferFromFileBase( size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, + MappedFileCache * mmap_cache, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE, int flags_ = -1, char * existing_memory_ = nullptr, diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index 9a5c1065abb..dabcdf4b2d6 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -186,6 +187,13 @@ void AsynchronousMetrics::update() } } + { + if (auto mmap_cache = global_context.getMappedFileCache()) + { + new_values["MMapCacheCells"] = mmap_cache->count(); + } + } + #if USE_EMBEDDED_COMPILER { if (auto * compiled_expression_cache = CompiledExpressionCacheFactory::instance().tryGetCache()) diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index b2b15fb0d25..e743b8ffb90 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -341,6 +342,7 @@ struct ContextShared AccessControlManager access_control_manager; mutable UncompressedCachePtr uncompressed_cache; /// The cache of decompressed blocks. mutable MarkCachePtr mark_cache; /// Cache of marks in compressed files. + mutable MappedFileCachePtr mmap_cache; /// Cache of mmapped files to avoid frequent open/map/unmap/close and to reuse from several threads. ProcessList process_list; /// Executing queries at the moment. MergeList merge_list; /// The list of executable merge (for (Replicated)?MergeTree) ReplicatedFetchList replicated_fetch_list; @@ -1440,14 +1442,12 @@ void Context::setMarkCache(size_t cache_size_in_bytes) shared->mark_cache = std::make_shared(cache_size_in_bytes); } - MarkCachePtr Context::getMarkCache() const { auto lock = getLock(); return shared->mark_cache; } - void Context::dropMarkCache() const { auto lock = getLock(); @@ -1456,6 +1456,30 @@ void Context::dropMarkCache() const } +void Context::setMappedFileCache(size_t cache_size_in_num_entries) +{ + auto lock = getLock(); + + if (shared->mmap_cache) + throw Exception("Mapped file cache has been already created.", ErrorCodes::LOGICAL_ERROR); + + shared->mmap_cache = std::make_shared(cache_size_in_num_entries); +} + +MappedFileCachePtr Context::getMappedFileCache() const +{ + auto lock = getLock(); + return shared->mmap_cache; +} + +void Context::dropMappedFileCache() const +{ + auto lock = getLock(); + if (shared->mmap_cache) + shared->mmap_cache->reset(); +} + + void Context::dropCaches() const { auto lock = getLock(); @@ -1465,6 +1489,9 @@ void Context::dropCaches() const if (shared->mark_cache) shared->mark_cache->reset(); + + if (shared->mmap_cache) + shared->mmap_cache->reset(); } BackgroundSchedulePool & Context::getBufferFlushSchedulePool() const diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 1785f6684e4..196b38f6a98 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -68,6 +68,7 @@ class ReplicatedFetchList; class Cluster; class Compiler; class MarkCache; +class MappedFileCache; class UncompressedCache; class ProcessList; class QueryStatus; @@ -623,6 +624,11 @@ public: std::shared_ptr getMarkCache() const; void dropMarkCache() const; + /// Create a cache of mapped files to 
avoid frequent open/map/unmap/close and to reuse from several threads. + void setMappedFileCache(size_t cache_size_in_num_entries); + std::shared_ptr getMappedFileCache() const; + void dropMappedFileCache() const; + /** Clear the caches of the uncompressed blocks and marks. * This is usually done when renaming tables, changing the type of columns, deleting a table. * - since caches are linked to file names, and become incorrect. diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 7790ba03236..6a3e307bec6 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -268,6 +268,10 @@ BlockIO InterpreterSystemQuery::execute() context.checkAccess(AccessType::SYSTEM_DROP_UNCOMPRESSED_CACHE); system_context.dropUncompressedCache(); break; + case Type::DROP_MMAP_CACHE: + context.checkAccess(AccessType::SYSTEM_DROP_MMAP_CACHE); + system_context.dropMappedFileCache(); + break; #if USE_EMBEDDED_COMPILER case Type::DROP_COMPILED_EXPRESSION_CACHE: context.checkAccess(AccessType::SYSTEM_DROP_COMPILED_EXPRESSION_CACHE); @@ -632,6 +636,7 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster() } case Type::DROP_DNS_CACHE: [[fallthrough]]; case Type::DROP_MARK_CACHE: [[fallthrough]]; + case Type::DROP_MMAP_CACHE: [[fallthrough]]; #if USE_EMBEDDED_COMPILER case Type::DROP_COMPILED_EXPRESSION_CACHE: [[fallthrough]]; #endif diff --git a/src/Parsers/ASTSystemQuery.cpp b/src/Parsers/ASTSystemQuery.cpp index f3a43d7f3fd..71bda0c7709 100644 --- a/src/Parsers/ASTSystemQuery.cpp +++ b/src/Parsers/ASTSystemQuery.cpp @@ -30,6 +30,8 @@ const char * ASTSystemQuery::typeToString(Type type) return "DROP MARK CACHE"; case Type::DROP_UNCOMPRESSED_CACHE: return "DROP UNCOMPRESSED CACHE"; + case Type::DROP_MMAP_CACHE: + return "DROP MMAP CACHE"; #if USE_EMBEDDED_COMPILER case Type::DROP_COMPILED_EXPRESSION_CACHE: return "DROP COMPILED EXPRESSION CACHE"; diff --git a/src/Parsers/ASTSystemQuery.h b/src/Parsers/ASTSystemQuery.h index ad7eb664659..5bcdcc7875d 100644 --- a/src/Parsers/ASTSystemQuery.h +++ b/src/Parsers/ASTSystemQuery.h @@ -24,6 +24,7 @@ public: DROP_DNS_CACHE, DROP_MARK_CACHE, DROP_UNCOMPRESSED_CACHE, + DROP_MMAP_CACHE, #if USE_EMBEDDED_COMPILER DROP_COMPILED_EXPRESSION_CACHE, #endif diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index 7081695ad49..cf8de4456dd 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -292,7 +292,7 @@ void Service::sendPartS3Metadata(const MergeTreeData::DataPartPtr & part, WriteB writeStringBinary(it.first, out); writeBinary(file_size, out); - auto file_in = createReadBufferFromFileBase(metadata_file, 0, 0, 0, DBMS_DEFAULT_BUFFER_SIZE); + auto file_in = createReadBufferFromFileBase(metadata_file, 0, 0, 0, nullptr, DBMS_DEFAULT_BUFFER_SIZE); HashingWriteBuffer hashing_out(out); copyData(*file_in, hashing_out, blocker.getCounter()); if (blocker.isCancelled()) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 09b7dcd3a78..039b9281135 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1160,6 +1160,10 @@ void MergeTreeData::clearOldPartsFromFilesystem(bool force) DataPartsVector parts_to_remove = grabOldParts(force); clearPartsFromFilesystem(parts_to_remove); removePartsFinally(parts_to_remove); + + /// This is needed to close files to 
avoid they reside on disk after being deleted. + /// NOTE: we can drop files from cache more selectively but this is good enough. + global_context.dropMappedFileCache(); } void MergeTreeData::clearPartsFromFilesystem(const DataPartsVector & parts_to_remove) diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp index f2bbf53bd97..866980b81f5 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp @@ -395,7 +395,7 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const String & name, String mrk_path = fullPath(disk, part_path + name + marks_file_extension); String bin_path = fullPath(disk, part_path + name + DATA_FILE_EXTENSION); DB::ReadBufferFromFile mrk_in(mrk_path); - DB::CompressedReadBufferFromFile bin_in(bin_path, 0, 0, 0); + DB::CompressedReadBufferFromFile bin_in(bin_path, 0, 0, 0, nullptr); bool must_be_last = false; UInt64 offset_in_compressed_file = 0; UInt64 offset_in_decompressed_block = 0; diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 96a3dba12f7..2cbc8fbd743 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -546,6 +546,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( { .min_bytes_to_use_direct_io = settings.min_bytes_to_use_direct_io, .min_bytes_to_use_mmap_io = settings.min_bytes_to_use_mmap_io, + .mmap_cache = context.getMappedFileCache(), .max_read_buffer_size = settings.max_read_buffer_size, .save_marks_in_cache = true, .checksum_on_read = settings.checksum_on_read, @@ -555,15 +556,14 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( { MergeTreeIndexPtr index; MergeTreeIndexConditionPtr condition; - std::atomic total_granules; - std::atomic granules_dropped; + std::atomic total_granules{0}; + std::atomic granules_dropped{0}; DataSkippingIndexAndCondition(MergeTreeIndexPtr index_, MergeTreeIndexConditionPtr condition_) : index(index_) , condition(condition_) - , total_granules(0) - , granules_dropped(0) - {} + { + } }; std::list useful_indices; diff --git a/src/Storages/MergeTree/MergeTreeIOSettings.h b/src/Storages/MergeTree/MergeTreeIOSettings.h index f2469494792..5d89b00111c 100644 --- a/src/Storages/MergeTree/MergeTreeIOSettings.h +++ b/src/Storages/MergeTree/MergeTreeIOSettings.h @@ -3,13 +3,19 @@ #include #include + namespace DB { +class MappedFileCache; +using MappedFileCachePtr = std::shared_ptr; + + struct MergeTreeReaderSettings { size_t min_bytes_to_use_direct_io = 0; size_t min_bytes_to_use_mmap_io = 0; + MappedFileCachePtr mmap_cache; size_t max_read_buffer_size = DBMS_DEFAULT_BUFFER_SIZE; /// If save_marks_in_cache is false, then, if marks are not in cache, /// we will load them but won't save in the cache, to avoid evicting other data. 
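Note: the following sketch is illustrative only and is not a hunk of this patch. It merely condenses how the mmap_cache member added to MergeTreeReaderSettings above is meant to reach the disk layer, using the names defined in the hunks above (MergeTreeReaderSettings, Context::getMappedFileCache, IDisk::readFile). The variables settings, context, disk, path and sum_mark_range_bytes stand in for the objects available in MergeTreeDataSelectExecutor and MergeTreeReaderStream; the buffer size is a placeholder.

    /// Illustrative sketch, not part of the patch: wiring the mmap cache from the query
    /// context into a read of a data part file.
    MergeTreeReaderSettings reader_settings
    {
        .min_bytes_to_use_direct_io = settings.min_bytes_to_use_direct_io,
        .min_bytes_to_use_mmap_io = settings.min_bytes_to_use_mmap_io,
        .mmap_cache = context.getMappedFileCache(),   /// shared cache owned by Context
    };

    /// The reader hands a raw pointer down to the disk layer. createReadBufferFromFileBase
    /// chooses the mmap path only when the cache pointer is not null, min_bytes_to_use_mmap_io
    /// is non-zero and the estimated size reaches that threshold; otherwise it falls back
    /// to the ordinary read buffer.
    auto buffer = disk->readFile(
        path,
        reader_settings.max_read_buffer_size,
        sum_mark_range_bytes,
        reader_settings.min_bytes_to_use_direct_io,
        reader_settings.min_bytes_to_use_mmap_io,
        reader_settings.mmap_cache.get());

A null cache pointer simply disables mmap reuse, which is what the default argument of IDisk::readFile provides for callers that do not care about the cache.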
diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp index 67268e8afd8..b135ea04032 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp @@ -84,7 +84,8 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( buffer_size, 0, settings.min_bytes_to_use_direct_io, - settings.min_bytes_to_use_mmap_io); + settings.min_bytes_to_use_mmap_io, + settings.mmap_cache.get()); }, uncompressed_cache, /* allow_different_codecs = */ true); @@ -103,7 +104,12 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( auto buffer = std::make_unique( data_part->volume->getDisk()->readFile( - full_data_path, buffer_size, 0, settings.min_bytes_to_use_direct_io, settings.min_bytes_to_use_mmap_io), + full_data_path, + buffer_size, + 0, + settings.min_bytes_to_use_direct_io, + settings.min_bytes_to_use_mmap_io, + settings.mmap_cache.get()), /* allow_different_codecs = */ true); if (profile_callback_) diff --git a/src/Storages/MergeTree/MergeTreeReaderStream.cpp b/src/Storages/MergeTree/MergeTreeReaderStream.cpp index fd251497d7c..774c5bcf3d8 100644 --- a/src/Storages/MergeTree/MergeTreeReaderStream.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderStream.cpp @@ -89,7 +89,8 @@ MergeTreeReaderStream::MergeTreeReaderStream( buffer_size, sum_mark_range_bytes, settings.min_bytes_to_use_direct_io, - settings.min_bytes_to_use_mmap_io); + settings.min_bytes_to_use_mmap_io, + settings.mmap_cache.get()); }, uncompressed_cache); @@ -105,8 +106,13 @@ MergeTreeReaderStream::MergeTreeReaderStream( else { auto buffer = std::make_unique( - disk->readFile(path_prefix + data_file_extension, buffer_size, - sum_mark_range_bytes, settings.min_bytes_to_use_direct_io, settings.min_bytes_to_use_mmap_io) + disk->readFile( + path_prefix + data_file_extension, + buffer_size, + sum_mark_range_bytes, + settings.min_bytes_to_use_direct_io, + settings.min_bytes_to_use_mmap_io, + settings.mmap_cache.get()) ); if (profile_callback) From 6f9f7d118da12c9988d98f7a68db18800a04d903 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Mar 2021 04:27:26 +0300 Subject: [PATCH 246/260] Fixes --- programs/server/config.xml | 6 ++++-- src/Storages/MergeTree/MergeTreeData.cpp | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/programs/server/config.xml b/programs/server/config.xml index 543f7c54ca6..50c29e05a4c 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -297,20 +297,22 @@ --> 5368709120 + 1000 diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 039b9281135..f5007918a22 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1163,7 +1163,8 @@ void MergeTreeData::clearOldPartsFromFilesystem(bool force) /// This is needed to close files to avoid they reside on disk after being deleted. /// NOTE: we can drop files from cache more selectively but this is good enough. 
- global_context.dropMappedFileCache(); + if (!parts_to_remove.empty()) + global_context.dropMappedFileCache(); } void MergeTreeData::clearPartsFromFilesystem(const DataPartsVector & parts_to_remove) From 699af9d034bffbff5755023ba2c614f12aa68760 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Mar 2021 04:29:55 +0300 Subject: [PATCH 247/260] Add perf test --- tests/performance/mmap_io.xml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 tests/performance/mmap_io.xml diff --git a/tests/performance/mmap_io.xml b/tests/performance/mmap_io.xml new file mode 100644 index 00000000000..d9e6037c2d2 --- /dev/null +++ b/tests/performance/mmap_io.xml @@ -0,0 +1,17 @@ + + + hits_10m_single + + + + 1 + + + CREATE TABLE hits_none (WatchID UInt64 CODEC(NONE)) ENGINE = MergeTree ORDER BY tuple() + INSERT INTO hits_none SELECT WatchID FROM test.hits + OPTIMIZE TABLE hits_none FINAL + + + + DROP TABLE hits_none + From 613d1e3c17adf8a7d02520d1392e98a57b269e89 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Sun, 28 Mar 2021 04:47:27 +0300 Subject: [PATCH 248/260] Update version_date.tsv after release 21.3.4.25 --- utils/list-versions/version_date.tsv | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 231d22b50da..628806902b2 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,3 +1,4 @@ +v21.3.4.25-lts 2021-03-28 v21.3.3.14-lts 2021-03-19 v21.3.2.5-lts 2021-03-12 v21.2.6.1-stable 2021-03-15 From 771493f03a1bb23a571a653cf3328d7cb2de22a0 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Sun, 28 Mar 2021 05:03:48 +0300 Subject: [PATCH 249/260] Update version_date.tsv after release 21.2.7.11 --- utils/list-versions/version_date.tsv | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 628806902b2..799492cdd90 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,6 +1,7 @@ v21.3.4.25-lts 2021-03-28 v21.3.3.14-lts 2021-03-19 v21.3.2.5-lts 2021-03-12 +v21.2.7.11-stable 2021-03-28 v21.2.6.1-stable 2021-03-15 v21.2.5.5-stable 2021-03-02 v21.2.4.6-stable 2021-02-20 From 7cc6eeff0dcc639c309f082b3c75f0ddbb89517b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Mar 2021 05:55:41 +0300 Subject: [PATCH 250/260] Update test --- tests/queries/0_stateless/01271_show_privileges.reference | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index 8e90dbbb8a0..7928f531a7d 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -76,6 +76,7 @@ SYSTEM SHUTDOWN ['SYSTEM KILL','SHUTDOWN'] GLOBAL SYSTEM SYSTEM DROP DNS CACHE ['SYSTEM DROP DNS','DROP DNS CACHE','DROP DNS'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP MARK CACHE ['SYSTEM DROP MARK','DROP MARK CACHE','DROP MARKS'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP UNCOMPRESSED CACHE ['SYSTEM DROP UNCOMPRESSED','DROP UNCOMPRESSED CACHE','DROP UNCOMPRESSED'] GLOBAL SYSTEM DROP CACHE +SYSTEM DROP MMAP CACHE ['SYSTEM DROP MMAP','DROP MMAP CACHE','DROP MMAP'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP COMPILED EXPRESSION CACHE ['SYSTEM DROP COMPILED EXPRESSION','DROP COMPILED EXPRESSION CACHE','DROP COMPILED EXPRESSIONS'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP CACHE ['DROP CACHE'] \N SYSTEM SYSTEM RELOAD 
CONFIG ['RELOAD CONFIG'] GLOBAL SYSTEM RELOAD From 21ea7bf9ab484b794c34aeda0003239cb0eb0728 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Mar 2021 21:09:21 +0300 Subject: [PATCH 251/260] Add results from Kimmo Linna --- website/benchmark/hardware/index.html | 1 + .../benchmark/hardware/results/asus_a15.json | 54 +++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 website/benchmark/hardware/results/asus_a15.json diff --git a/website/benchmark/hardware/index.html b/website/benchmark/hardware/index.html index 92da6328f0f..a57930b279d 100644 --- a/website/benchmark/hardware/index.html +++ b/website/benchmark/hardware/index.html @@ -75,6 +75,7 @@ Results for Raspberry Pi and Digital Ocean CPU-optimized are from Fritz Wijay Results for Digitalocean (Storage-intesinve VMs) + (CPU/GP) are from Yiğit Konur and Metehan Çetinkaya of seo.do.
Results for 2x AMD EPYC 7F72 3.2 Ghz (Total 96 Cores, IBM Cloud's Bare Metal Service) from Yiğit Konur and Metehan Çetinkaya of seo.do.
Results for 2x AMD EPYC 7742 (128 physical cores, 1 TB DDR4-3200 RAM) from Yedige Davletgaliyev and Nikita Zhavoronkov of blockchair.com.
+Results for ASUS A15 (Ryzen laptop) are from Kimmo Linna.

diff --git a/website/benchmark/hardware/results/asus_a15.json b/website/benchmark/hardware/results/asus_a15.json new file mode 100644 index 00000000000..983dbde8681 --- /dev/null +++ b/website/benchmark/hardware/results/asus_a15.json @@ -0,0 +1,54 @@ +[ + { + "system": "Asus A15", + "system_full": "Asus A15 (16 × AMD Ryzen 7 4800H, 16 GiB RAM)", + "time": "2021-03-23 00:00:00", + "kind": "laptop", + "result": + [ +[0.004, 0.003, 0.003], +[0.019, 0.013, 0.012], +[0.053, 0.041, 0.037], +[0.106, 0.057, 0.056], +[0.158, 0.115, 0.110], +[0.324, 0.266, 0.262], +[0.027, 0.024, 0.026], +[0.017, 0.016, 0.017], +[0.644, 0.589, 0.582], +[0.733, 0.679, 0.679], +[0.233, 0.201, 0.197], +[0.276, 0.235, 0.236], +[1.025, 0.962, 0.962], +[1.342, 1.270, 1.264], +[1.170, 1.129, 1.124], +[1.375, 1.346, 1.351], +[3.271, 3.210, 3.242], +[1.960, 1.898, 1.907], +[5.997, 5.965, 5.983], +[0.106, 0.065, 0.055], +[1.264, 0.990, 0.989], +[1.555, 1.241, 1.239], +[3.798, 3.307, 3.280], +[1.949, 1.022, 0.995], +[0.393, 0.292, 0.292], +[0.307, 0.254, 0.255], +[0.378, 0.297, 0.290], +[1.632, 1.399, 1.386], +[2.111, 1.909, 1.900], +[3.349, 3.352, 3.357], +[0.892, 0.824, 0.816], +[1.505, 1.392, 1.378], +[9.105, 8.951, 8.914], +[5.195, 4.975, 4.919], +[5.150, 5.021, 4.955], +[1.756, 1.743, 1.749], +[0.161, 0.154, 0.158], +[0.108, 0.058, 0.055], +[0.101, 0.102, 0.052], +[0.365, 0.309, 0.334], +[0.050, 0.023, 0.023], +[0.037, 0.019, 0.015], +[0.023, 0.013, 0.018] + ] + } +] From 608d37deedd82a646fb84fece19b5577c5046c71 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Sun, 28 Mar 2021 21:30:43 +0300 Subject: [PATCH 252/260] CachedCompressedReadBuffer fix cache usage --- src/Compression/CachedCompressedReadBuffer.cpp | 18 ++++++------------ src/IO/UncompressedCache.h | 14 +++++++------- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/src/Compression/CachedCompressedReadBuffer.cpp b/src/Compression/CachedCompressedReadBuffer.cpp index 0548de07859..5a0db7ef877 100644 --- a/src/Compression/CachedCompressedReadBuffer.cpp +++ b/src/Compression/CachedCompressedReadBuffer.cpp @@ -33,33 +33,27 @@ bool CachedCompressedReadBuffer::nextImpl() /// Let's check for the presence of a decompressed block in the cache, grab the ownership of this block, if it exists. UInt128 key = cache->hash(path, file_pos); - owned_cell = cache->get(key); - if (!owned_cell) + owned_cell = cache->getOrSet(key, [&]() { - /// If not, read it from the file. initInput(); file_in->seek(file_pos, SEEK_SET); - owned_cell = std::make_shared(); + auto cell = std::make_shared(); size_t size_decompressed; size_t size_compressed_without_checksum; - owned_cell->compressed_size = readCompressedData(size_decompressed, size_compressed_without_checksum, false); + cell->compressed_size = readCompressedData(size_decompressed, size_compressed_without_checksum, false); - if (owned_cell->compressed_size) + if (cell->compressed_size) { owned_cell->additional_bytes = codec->getAdditionalSizeAtTheEndOfBuffer(); owned_cell->data.resize(size_decompressed + owned_cell->additional_bytes); decompressTo(owned_cell->data.data(), size_decompressed, size_compressed_without_checksum); - } - /// Put data into cache. 
- /// NOTE: Even if we don't read anything (compressed_size == 0) - /// because we can reuse this information and don't reopen file in future - cache->set(key, owned_cell); - } + return cell; + }); if (owned_cell->data.size() == 0) return false; diff --git a/src/IO/UncompressedCache.h b/src/IO/UncompressedCache.h index 86f1530e5b3..63c525bba50 100644 --- a/src/IO/UncompressedCache.h +++ b/src/IO/UncompressedCache.h @@ -58,16 +58,16 @@ public: return key; } - MappedPtr get(const Key & key) + template + MappedPtr getOrSet(const Key & key, LoadFunc && load) { - MappedPtr res = Base::get(key); - - if (res) - ProfileEvents::increment(ProfileEvents::UncompressedCacheHits); - else + auto result = Base::getOrSet(key, load); + if (result.second) ProfileEvents::increment(ProfileEvents::UncompressedCacheMisses); + else + ProfileEvents::increment(ProfileEvents::UncompressedCacheHits); - return res; + return result.first; } private: From 15b41fd110ba83c2911385a417bc76231341b68e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Mar 2021 22:15:13 +0300 Subject: [PATCH 253/260] Review fix --- src/IO/MMapReadBufferFromFileWithCache.cpp | 4 ++-- src/IO/MappedFileCache.h | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/IO/MMapReadBufferFromFileWithCache.cpp b/src/IO/MMapReadBufferFromFileWithCache.cpp index 6796484b40b..1bdda49b1ce 100644 --- a/src/IO/MMapReadBufferFromFileWithCache.cpp +++ b/src/IO/MMapReadBufferFromFileWithCache.cpp @@ -27,7 +27,7 @@ MMapReadBufferFromFileWithCache::MMapReadBufferFromFileWithCache( mapped = cache.getOrSet(cache.hash(file_name, offset, length), [&] { return std::make_shared(file_name, offset, length); - }).first; + }); init(); } @@ -38,7 +38,7 @@ MMapReadBufferFromFileWithCache::MMapReadBufferFromFileWithCache( mapped = cache.getOrSet(cache.hash(file_name, offset, -1), [&] { return std::make_shared(file_name, offset); - }).first; + }); init(); } diff --git a/src/IO/MappedFileCache.h b/src/IO/MappedFileCache.h index d5b35ab3060..38525bad443 100644 --- a/src/IO/MappedFileCache.h +++ b/src/IO/MappedFileCache.h @@ -44,16 +44,16 @@ public: return key; } - MappedPtr get(const Key & key) + template + MappedPtr getOrSet(const Key & key, LoadFunc && load) { - MappedPtr res = Base::get(key); - - if (res) - ProfileEvents::increment(ProfileEvents::MappedFileCacheHits); - else + auto result = Base::getOrSet(key, load); + if (result.second) ProfileEvents::increment(ProfileEvents::MappedFileCacheMisses); + else + ProfileEvents::increment(ProfileEvents::MappedFileCacheHits); - return res; + return result.first; } }; From 2a8ac01cdb98e89e8eaa4f8537da31fc2fab1e8a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Mar 2021 22:24:28 +0300 Subject: [PATCH 254/260] Rename as suggested by Kita --- programs/local/LocalServer.cpp | 2 +- programs/server/Server.cpp | 2 +- programs/server/config.xml | 4 ++-- src/Common/CurrentMetrics.cpp | 4 ++-- src/Common/ProfileEvents.cpp | 4 ++-- .../CompressedReadBufferFromFile.cpp | 2 +- .../CompressedReadBufferFromFile.h | 4 ++-- src/Disks/DiskCacheWrapper.cpp | 2 +- src/Disks/DiskCacheWrapper.h | 2 +- src/Disks/DiskDecorator.cpp | 2 +- src/Disks/DiskDecorator.h | 2 +- src/Disks/DiskLocal.cpp | 2 +- src/Disks/DiskLocal.h | 2 +- src/Disks/DiskMemory.cpp | 2 +- src/Disks/DiskMemory.h | 2 +- src/Disks/IDisk.h | 4 ++-- src/Disks/S3/DiskS3.cpp | 2 +- src/Disks/S3/DiskS3.h | 2 +- src/IO/MMapReadBufferFromFileDescriptor.h | 4 ++-- src/IO/MMapReadBufferFromFileWithCache.cpp | 8 +++---- 
src/IO/MMapReadBufferFromFileWithCache.h | 8 +++---- src/IO/{MappedFile.cpp => MMappedFile.cpp} | 14 +++++------ src/IO/{MappedFile.h => MMappedFile.h} | 10 ++++---- .../{MappedFileCache.h => MMappedFileCache.h} | 18 +++++++------- ...scriptor.cpp => MMappedFileDescriptor.cpp} | 24 +++++++++---------- ...leDescriptor.h => MMappedFileDescriptor.h} | 22 ++++++++--------- src/IO/createReadBufferFromFileBase.cpp | 2 +- src/IO/createReadBufferFromFileBase.h | 4 ++-- src/IO/ya.make | 4 ++-- src/Interpreters/AsynchronousMetrics.cpp | 4 ++-- src/Interpreters/Context.cpp | 12 +++++----- src/Interpreters/Context.h | 8 +++---- src/Interpreters/InterpreterSystemQuery.cpp | 2 +- src/Storages/MergeTree/MergeTreeData.cpp | 2 +- .../MergeTree/MergeTreeDataSelectExecutor.cpp | 2 +- src/Storages/MergeTree/MergeTreeIOSettings.h | 6 ++--- 36 files changed, 100 insertions(+), 100 deletions(-) rename src/IO/{MappedFile.cpp => MMappedFile.cpp} (78%) rename src/IO/{MappedFile.h => MMappedFile.h} (65%) rename src/IO/{MappedFileCache.h => MMappedFileCache.h} (65%) rename src/IO/{MappedFileDescriptor.cpp => MMappedFileDescriptor.cpp} (62%) rename src/IO/{MappedFileDescriptor.h => MMappedFileDescriptor.h} (67%) diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 31f9ca5b8c4..2909b838c84 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -263,7 +263,7 @@ try /// A cache for mmapped files. size_t mmap_cache_size = config().getUInt64("mmap_cache_size", 1000); /// The choice of default is arbitrary. if (mmap_cache_size) - global_context->setMappedFileCache(mmap_cache_size); + global_context->setMMappedFileCache(mmap_cache_size); /// Load global settings from default_profile and system_profile. global_context->setDefaultProfiles(config()); diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 9728fe402d2..b54e882c699 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -832,7 +832,7 @@ int Server::main(const std::vector & /*args*/) /// A cache for mmapped files. size_t mmap_cache_size = config().getUInt64("mmap_cache_size", 1000); /// The choice of default is arbitrary. if (mmap_cache_size) - global_context->setMappedFileCache(mmap_cache_size); + global_context->setMMappedFileCache(mmap_cache_size); #if USE_EMBEDDED_COMPILER size_t compiled_expression_cache_size = config().getUInt64("compiled_expression_cache_size", 500); diff --git a/programs/server/config.xml b/programs/server/config.xml index 50c29e05a4c..9c01b328290 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -305,10 +305,10 @@ and to reuse mappings from several threads and queries, the cache of mapped files is maintained. Its size is the number of mapped regions (usually equal to the number of mapped files). The amount of data in mapped files can be monitored - in system.metrics, system.metric_log by the MappedFiles, MappedFileBytes metrics + in system.metrics, system.metric_log by the MMappedFiles, MMappedFileBytes metrics and in system.asynchronous_metrics, system.asynchronous_metrics_log by the MMapCacheCells metric, and also in system.events, system.processes, system.query_log, system.query_thread_log by the - CreatedReadBufferMMap, CreatedReadBufferMMapFailed, MappedFileCacheHits, MappedFileCacheMisses events. + CreatedReadBufferMMap, CreatedReadBufferMMapFailed, MMappedFileCacheHits, MMappedFileCacheMisses events. 
Note that the amount of data in mapped files does not consume memory directly and is not accounted in query or server memory usage - because this memory can be discarded similar to OS page cache. The cache is dropped (the files are closed) automatically on removal of old parts in MergeTree, diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index e45339efe9f..abbb3c71d72 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -66,8 +66,8 @@ M(PartsWide, "Wide parts.") \ M(PartsCompact, "Compact parts.") \ M(PartsInMemory, "In-memory parts.") \ - M(MappedFiles, "Total number of mmapped files.") \ - M(MappedFileBytes, "Sum size of mmapped file regions.") \ + M(MMappedFiles, "Total number of mmapped files.") \ + M(MMappedFileBytes, "Sum size of mmapped file regions.") \ namespace CurrentMetrics { diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 948c5f3c6b8..35703790d82 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -32,8 +32,8 @@ M(UncompressedCacheHits, "") \ M(UncompressedCacheMisses, "") \ M(UncompressedCacheWeightLost, "") \ - M(MappedFileCacheHits, "") \ - M(MappedFileCacheMisses, "") \ + M(MMappedFileCacheHits, "") \ + M(MMappedFileCacheMisses, "") \ M(IOBufferAllocs, "") \ M(IOBufferAllocBytes, "") \ M(ArenaAllocChunks, "") \ diff --git a/src/Compression/CompressedReadBufferFromFile.cpp b/src/Compression/CompressedReadBufferFromFile.cpp index e3cec15d504..e14a1784b14 100644 --- a/src/Compression/CompressedReadBufferFromFile.cpp +++ b/src/Compression/CompressedReadBufferFromFile.cpp @@ -49,7 +49,7 @@ CompressedReadBufferFromFile::CompressedReadBufferFromFile( size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, - MappedFileCache * mmap_cache, + MMappedFileCache * mmap_cache, size_t buf_size, bool allow_different_codecs_) : BufferWithOwnMemory(0) diff --git a/src/Compression/CompressedReadBufferFromFile.h b/src/Compression/CompressedReadBufferFromFile.h index 6e7d0df8333..2ee7021b35a 100644 --- a/src/Compression/CompressedReadBufferFromFile.h +++ b/src/Compression/CompressedReadBufferFromFile.h @@ -9,7 +9,7 @@ namespace DB { -class MappedFileCache; +class MMappedFileCache; /// Unlike CompressedReadBuffer, it can do seek. 
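/// A usage sketch for the constructor below (illustrative only: the path, size and threshold
/// values are assumptions, not taken from this patch). With a non-null cache and an
/// estimated_size that reaches mmap_threshold, the underlying factory is expected to pick an
/// mmap-backed file buffer and reuse mappings through the cache:
///
///     MMappedFileCache mmap_cache(1000);
///     CompressedReadBufferFromFile in(
///         "data.bin",                              /// hypothetical file of compressed blocks
///         /* estimated_size */ 10 * 1024 * 1024,
///         /* aio_threshold */ 0,
///         /* mmap_threshold */ 64 * 1024,
///         &mmap_cache);
///     in.seek(0, 0);    /// offset in compressed file, offset in decompressed block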
@@ -33,7 +33,7 @@ public: CompressedReadBufferFromFile(std::unique_ptr buf, bool allow_different_codecs_ = false); CompressedReadBufferFromFile( - const std::string & path, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, MappedFileCache * mmap_cache, + const std::string & path, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, MMappedFileCache * mmap_cache, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, bool allow_different_codecs_ = false); void seek(size_t offset_in_compressed_file, size_t offset_in_decompressed_block); diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp index 589a39a5731..f101de340f1 100644 --- a/src/Disks/DiskCacheWrapper.cpp +++ b/src/Disks/DiskCacheWrapper.cpp @@ -109,7 +109,7 @@ DiskCacheWrapper::readFile( size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, - MappedFileCache * mmap_cache) const + MMappedFileCache * mmap_cache) const { if (!cache_file_predicate(path)) return DiskDecorator::readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold, mmap_cache); diff --git a/src/Disks/DiskCacheWrapper.h b/src/Disks/DiskCacheWrapper.h index e7920fa25b6..7e627b0c3c3 100644 --- a/src/Disks/DiskCacheWrapper.h +++ b/src/Disks/DiskCacheWrapper.h @@ -39,7 +39,7 @@ public: size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, - MappedFileCache * mmap_cache) const override; + MMappedFileCache * mmap_cache) const override; std::unique_ptr writeFile(const String & path, size_t buf_size, WriteMode mode) override; diff --git a/src/Disks/DiskDecorator.cpp b/src/Disks/DiskDecorator.cpp index c12612f9e5d..144dc928dd3 100644 --- a/src/Disks/DiskDecorator.cpp +++ b/src/Disks/DiskDecorator.cpp @@ -115,7 +115,7 @@ void DiskDecorator::listFiles(const String & path, std::vector & file_na std::unique_ptr DiskDecorator::readFile( - const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, MappedFileCache * mmap_cache) const + const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, MMappedFileCache * mmap_cache) const { return delegate->readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold, mmap_cache); } diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h index 837fb4532fd..918223b38d7 100644 --- a/src/Disks/DiskDecorator.h +++ b/src/Disks/DiskDecorator.h @@ -41,7 +41,7 @@ public: size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, - MappedFileCache * mmap_cache) const override; + MMappedFileCache * mmap_cache) const override; std::unique_ptr writeFile( const String & path, diff --git a/src/Disks/DiskLocal.cpp b/src/Disks/DiskLocal.cpp index 4f7cad06456..42378179103 100644 --- a/src/Disks/DiskLocal.cpp +++ b/src/Disks/DiskLocal.cpp @@ -220,7 +220,7 @@ void DiskLocal::replaceFile(const String & from_path, const String & to_path) std::unique_ptr DiskLocal::readFile( - const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, MappedFileCache * mmap_cache) const + const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, MMappedFileCache * mmap_cache) const { return createReadBufferFromFileBase(disk_path + path, estimated_size, aio_threshold, mmap_threshold, mmap_cache, buf_size); } diff --git a/src/Disks/DiskLocal.h b/src/Disks/DiskLocal.h index e1c9cb11d0e..d957fc6f847 100644 --- a/src/Disks/DiskLocal.h +++ b/src/Disks/DiskLocal.h @@ -77,7 +77,7 @@ public: 
size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, - MappedFileCache * mmap_cache) const override; + MMappedFileCache * mmap_cache) const override; std::unique_ptr writeFile( const String & path, diff --git a/src/Disks/DiskMemory.cpp b/src/Disks/DiskMemory.cpp index 39a95357900..d8b3c74d0d9 100644 --- a/src/Disks/DiskMemory.cpp +++ b/src/Disks/DiskMemory.cpp @@ -314,7 +314,7 @@ void DiskMemory::replaceFileImpl(const String & from_path, const String & to_pat files.insert(std::move(node)); } -std::unique_ptr DiskMemory::readFile(const String & path, size_t /*buf_size*/, size_t, size_t, size_t, MappedFileCache *) const +std::unique_ptr DiskMemory::readFile(const String & path, size_t /*buf_size*/, size_t, size_t, size_t, MMappedFileCache *) const { std::lock_guard lock(mutex); diff --git a/src/Disks/DiskMemory.h b/src/Disks/DiskMemory.h index e7e61393cbc..d5c57b20a4a 100644 --- a/src/Disks/DiskMemory.h +++ b/src/Disks/DiskMemory.h @@ -68,7 +68,7 @@ public: size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, - MappedFileCache * mmap_cache) const override; + MMappedFileCache * mmap_cache) const override; std::unique_ptr writeFile( const String & path, diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 734061a284e..726145cb5d2 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -31,7 +31,7 @@ using Reservations = std::vector; class ReadBufferFromFileBase; class WriteBufferFromFileBase; -class MappedFileCache; +class MMappedFileCache; /** * Mode of opening a file for write. @@ -155,7 +155,7 @@ public: size_t estimated_size = 0, size_t aio_threshold = 0, size_t mmap_threshold = 0, - MappedFileCache * mmap_cache = nullptr) const = 0; + MMappedFileCache * mmap_cache = nullptr) const = 0; /// Open the file for write and return WriteBufferFromFileBase object. 
virtual std::unique_ptr writeFile( diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index f23b1b0a615..30803c7a0c8 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -671,7 +671,7 @@ void DiskS3::replaceFile(const String & from_path, const String & to_path) moveFile(from_path, to_path); } -std::unique_ptr DiskS3::readFile(const String & path, size_t buf_size, size_t, size_t, size_t, MappedFileCache *) const +std::unique_ptr DiskS3::readFile(const String & path, size_t buf_size, size_t, size_t, size_t, MMappedFileCache *) const { auto metadata = readMeta(path); diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index 7f2698ad6d1..b5c4a94a447 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -90,7 +90,7 @@ public: size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, - MappedFileCache * mmap_cache) const override; + MMappedFileCache * mmap_cache) const override; std::unique_ptr writeFile( const String & path, diff --git a/src/IO/MMapReadBufferFromFileDescriptor.h b/src/IO/MMapReadBufferFromFileDescriptor.h index 0068b550899..03718a61a6c 100644 --- a/src/IO/MMapReadBufferFromFileDescriptor.h +++ b/src/IO/MMapReadBufferFromFileDescriptor.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include namespace DB @@ -21,7 +21,7 @@ protected: MMapReadBufferFromFileDescriptor() {} void init(); - MappedFileDescriptor mapped; + MMappedFileDescriptor mapped; public: MMapReadBufferFromFileDescriptor(int fd_, size_t offset_, size_t length_); diff --git a/src/IO/MMapReadBufferFromFileWithCache.cpp b/src/IO/MMapReadBufferFromFileWithCache.cpp index 1bdda49b1ce..0d31c29bdaa 100644 --- a/src/IO/MMapReadBufferFromFileWithCache.cpp +++ b/src/IO/MMapReadBufferFromFileWithCache.cpp @@ -22,22 +22,22 @@ void MMapReadBufferFromFileWithCache::init() MMapReadBufferFromFileWithCache::MMapReadBufferFromFileWithCache( - MappedFileCache & cache, const std::string & file_name, size_t offset, size_t length) + MMappedFileCache & cache, const std::string & file_name, size_t offset, size_t length) { mapped = cache.getOrSet(cache.hash(file_name, offset, length), [&] { - return std::make_shared(file_name, offset, length); + return std::make_shared(file_name, offset, length); }); init(); } MMapReadBufferFromFileWithCache::MMapReadBufferFromFileWithCache( - MappedFileCache & cache, const std::string & file_name, size_t offset) + MMappedFileCache & cache, const std::string & file_name, size_t offset) { mapped = cache.getOrSet(cache.hash(file_name, offset, -1), [&] { - return std::make_shared(file_name, offset); + return std::make_shared(file_name, offset); }); init(); diff --git a/src/IO/MMapReadBufferFromFileWithCache.h b/src/IO/MMapReadBufferFromFileWithCache.h index 6d03be46735..ff84f81610a 100644 --- a/src/IO/MMapReadBufferFromFileWithCache.h +++ b/src/IO/MMapReadBufferFromFileWithCache.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include @@ -11,17 +11,17 @@ namespace DB class MMapReadBufferFromFileWithCache : public ReadBufferFromFileBase { public: - MMapReadBufferFromFileWithCache(MappedFileCache & cache, const std::string & file_name, size_t offset, size_t length); + MMapReadBufferFromFileWithCache(MMappedFileCache & cache, const std::string & file_name, size_t offset, size_t length); /// Map till end of file. 
- MMapReadBufferFromFileWithCache(MappedFileCache & cache, const std::string & file_name, size_t offset); + MMapReadBufferFromFileWithCache(MMappedFileCache & cache, const std::string & file_name, size_t offset); off_t getPosition() override; std::string getFileName() const override; off_t seek(off_t offset, int whence) override; private: - MappedFileCache::MappedPtr mapped; + MMappedFileCache::MappedPtr mapped; void init(); }; diff --git a/src/IO/MappedFile.cpp b/src/IO/MMappedFile.cpp similarity index 78% rename from src/IO/MappedFile.cpp rename to src/IO/MMappedFile.cpp index 8867320d46c..edd2e5ef0ce 100644 --- a/src/IO/MappedFile.cpp +++ b/src/IO/MMappedFile.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include namespace ProfileEvents @@ -23,7 +23,7 @@ namespace ErrorCodes } -void MappedFile::open() +void MMappedFile::open() { ProfileEvents::increment(ProfileEvents::FileOpen); @@ -35,13 +35,13 @@ void MappedFile::open() } -std::string MappedFile::getFileName() const +std::string MMappedFile::getFileName() const { return file_name; } -MappedFile::MappedFile(const std::string & file_name_, size_t offset_, size_t length_) +MMappedFile::MMappedFile(const std::string & file_name_, size_t offset_, size_t length_) : file_name(file_name_) { open(); @@ -49,7 +49,7 @@ MappedFile::MappedFile(const std::string & file_name_, size_t offset_, size_t le } -MappedFile::MappedFile(const std::string & file_name_, size_t offset_) +MMappedFile::MMappedFile(const std::string & file_name_, size_t offset_) : file_name(file_name_) { open(); @@ -57,14 +57,14 @@ MappedFile::MappedFile(const std::string & file_name_, size_t offset_) } -MappedFile::~MappedFile() +MMappedFile::~MMappedFile() { if (fd != -1) close(); /// Exceptions will lead to std::terminate and that's Ok. } -void MappedFile::close() +void MMappedFile::close() { finish(); diff --git a/src/IO/MappedFile.h b/src/IO/MMappedFile.h similarity index 65% rename from src/IO/MappedFile.h rename to src/IO/MMappedFile.h index c023526fae5..6ecf988fa94 100644 --- a/src/IO/MappedFile.h +++ b/src/IO/MMappedFile.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include @@ -15,15 +15,15 @@ namespace DB { /// Opens a file and mmaps a region in it (or a whole file) into memory. Unmaps and closes in destructor. -class MappedFile : public MappedFileDescriptor +class MMappedFile : public MMappedFileDescriptor { public: - MappedFile(const std::string & file_name_, size_t offset_, size_t length_); + MMappedFile(const std::string & file_name_, size_t offset_, size_t length_); /// Map till end of file. - MappedFile(const std::string & file_name_, size_t offset_); + MMappedFile(const std::string & file_name_, size_t offset_); - ~MappedFile() override; + ~MMappedFile() override; void close(); diff --git a/src/IO/MappedFileCache.h b/src/IO/MMappedFileCache.h similarity index 65% rename from src/IO/MappedFileCache.h rename to src/IO/MMappedFileCache.h index 38525bad443..cbd1977350b 100644 --- a/src/IO/MappedFileCache.h +++ b/src/IO/MMappedFileCache.h @@ -4,13 +4,13 @@ #include #include #include -#include +#include namespace ProfileEvents { - extern const Event MappedFileCacheHits; - extern const Event MappedFileCacheMisses; + extern const Event MMappedFileCacheHits; + extern const Event MMappedFileCacheMisses; } namespace DB @@ -20,13 +20,13 @@ namespace DB /** Cache of opened and mmapped files for reading. * mmap/munmap is heavy operation and better to keep mapped file to subsequent use than to map/unmap every time. 
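  *
  * A usage sketch, mirroring the call sites in MMapReadBufferFromFileWithCache from this
  * same patch (file_name, offset and length stand for the caller's values):
  *
  *     MMappedFileCache cache(1000);
  *     auto key = cache.hash(file_name, offset, length);
  *     auto mapped = cache.getOrSet(key, [&]
  *     {
  *         return std::make_shared<MMappedFile>(file_name, offset, length);
  *     });
  *
  * getOrSet runs the loader only on a cache miss and bumps the MMappedFileCacheHits or
  * MMappedFileCacheMisses profile event accordingly.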
*/ -class MappedFileCache : public LRUCache +class MMappedFileCache : public LRUCache { private: - using Base = LRUCache; + using Base = LRUCache; public: - MappedFileCache(size_t max_size_in_bytes) + MMappedFileCache(size_t max_size_in_bytes) : Base(max_size_in_bytes) {} /// Calculate key from path to file and offset. @@ -49,15 +49,15 @@ public: { auto result = Base::getOrSet(key, load); if (result.second) - ProfileEvents::increment(ProfileEvents::MappedFileCacheMisses); + ProfileEvents::increment(ProfileEvents::MMappedFileCacheMisses); else - ProfileEvents::increment(ProfileEvents::MappedFileCacheHits); + ProfileEvents::increment(ProfileEvents::MMappedFileCacheHits); return result.first; } }; -using MappedFileCachePtr = std::shared_ptr; +using MMappedFileCachePtr = std::shared_ptr; } diff --git a/src/IO/MappedFileDescriptor.cpp b/src/IO/MMappedFileDescriptor.cpp similarity index 62% rename from src/IO/MappedFileDescriptor.cpp rename to src/IO/MMappedFileDescriptor.cpp index 31caf0c4728..db2752d14f9 100644 --- a/src/IO/MappedFileDescriptor.cpp +++ b/src/IO/MMappedFileDescriptor.cpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include namespace DB @@ -28,29 +28,29 @@ static size_t getFileSize(int fd) { struct stat stat_res {}; if (0 != fstat(fd, &stat_res)) - throwFromErrno("MappedFileDescriptor: Cannot fstat.", ErrorCodes::CANNOT_STAT); + throwFromErrno("MMappedFileDescriptor: Cannot fstat.", ErrorCodes::CANNOT_STAT); off_t file_size = stat_res.st_size; if (file_size < 0) - throw Exception("MappedFileDescriptor: fstat returned negative file size", ErrorCodes::LOGICAL_ERROR); + throw Exception("MMappedFileDescriptor: fstat returned negative file size", ErrorCodes::LOGICAL_ERROR); return file_size; } -MappedFileDescriptor::MappedFileDescriptor(int fd_, size_t offset_, size_t length_) +MMappedFileDescriptor::MMappedFileDescriptor(int fd_, size_t offset_, size_t length_) { set(fd_, offset_, length_); } -MappedFileDescriptor::MappedFileDescriptor(int fd_, size_t offset_) +MMappedFileDescriptor::MMappedFileDescriptor(int fd_, size_t offset_) : fd(fd_), offset(offset_) { set(fd_, offset_); } -void MappedFileDescriptor::set(int fd_, size_t offset_, size_t length_) +void MMappedFileDescriptor::set(int fd_, size_t offset_, size_t length_) { finish(); @@ -63,7 +63,7 @@ void MappedFileDescriptor::set(int fd_, size_t offset_, size_t length_) void * buf = mmap(nullptr, length, PROT_READ, MAP_PRIVATE, fd, offset); if (MAP_FAILED == buf) - throwFromErrno(fmt::format("MappedFileDescriptor: Cannot mmap {}.", ReadableSize(length)), + throwFromErrno(fmt::format("MMappedFileDescriptor: Cannot mmap {}.", ReadableSize(length)), ErrorCodes::CANNOT_ALLOCATE_MEMORY); data = static_cast(buf); @@ -72,23 +72,23 @@ void MappedFileDescriptor::set(int fd_, size_t offset_, size_t length_) bytes_metric_increment.changeTo(length); } -void MappedFileDescriptor::set(int fd_, size_t offset_) +void MMappedFileDescriptor::set(int fd_, size_t offset_) { size_t file_size = getFileSize(fd_); if (offset > static_cast(file_size)) - throw Exception("MappedFileDescriptor: requested offset is greater than file size", ErrorCodes::BAD_ARGUMENTS); + throw Exception("MMappedFileDescriptor: requested offset is greater than file size", ErrorCodes::BAD_ARGUMENTS); set(fd_, offset_, file_size - offset); } -void MappedFileDescriptor::finish() +void MMappedFileDescriptor::finish() { if (!length) return; if (0 != munmap(data, length)) - throwFromErrno(fmt::format("MappedFileDescriptor: Cannot munmap {}.", ReadableSize(length)), + 
throwFromErrno(fmt::format("MMappedFileDescriptor: Cannot munmap {}.", ReadableSize(length)), ErrorCodes::CANNOT_MUNMAP); length = 0; @@ -97,7 +97,7 @@ void MappedFileDescriptor::finish() bytes_metric_increment.changeTo(0); } -MappedFileDescriptor::~MappedFileDescriptor() +MMappedFileDescriptor::~MMappedFileDescriptor() { finish(); /// Exceptions will lead to std::terminate and that's Ok. } diff --git a/src/IO/MappedFileDescriptor.h b/src/IO/MMappedFileDescriptor.h similarity index 67% rename from src/IO/MappedFileDescriptor.h rename to src/IO/MMappedFileDescriptor.h index fbe7fa1915c..01dc7e1866c 100644 --- a/src/IO/MappedFileDescriptor.h +++ b/src/IO/MMappedFileDescriptor.h @@ -5,8 +5,8 @@ namespace CurrentMetrics { - extern const Metric MappedFiles; - extern const Metric MappedFileBytes; + extern const Metric MMappedFiles; + extern const Metric MMappedFileBytes; } @@ -15,16 +15,16 @@ namespace DB /// MMaps a region in file (or a whole file) into memory. Unmaps in destructor. /// Does not open or close file. -class MappedFileDescriptor +class MMappedFileDescriptor { public: - MappedFileDescriptor(int fd_, size_t offset_, size_t length_); - MappedFileDescriptor(int fd_, size_t offset_); + MMappedFileDescriptor(int fd_, size_t offset_, size_t length_); + MMappedFileDescriptor(int fd_, size_t offset_); /// Makes empty object that can be initialized with `set`. - MappedFileDescriptor() {} + MMappedFileDescriptor() {} - virtual ~MappedFileDescriptor(); + virtual ~MMappedFileDescriptor(); char * getData() { return data; } const char * getData() const { return data; } @@ -41,8 +41,8 @@ public: void set(int fd_, size_t offset_); protected: - MappedFileDescriptor(const MappedFileDescriptor &) = delete; - MappedFileDescriptor(MappedFileDescriptor &&) = delete; + MMappedFileDescriptor(const MMappedFileDescriptor &) = delete; + MMappedFileDescriptor(MMappedFileDescriptor &&) = delete; void init(); @@ -51,8 +51,8 @@ protected: size_t length = 0; char * data = nullptr; - CurrentMetrics::Increment files_metric_increment{CurrentMetrics::MappedFiles, 0}; - CurrentMetrics::Increment bytes_metric_increment{CurrentMetrics::MappedFileBytes, 0}; + CurrentMetrics::Increment files_metric_increment{CurrentMetrics::MMappedFiles, 0}; + CurrentMetrics::Increment bytes_metric_increment{CurrentMetrics::MMappedFileBytes, 0}; }; } diff --git a/src/IO/createReadBufferFromFileBase.cpp b/src/IO/createReadBufferFromFileBase.cpp index 69e14169a87..230f049b2cb 100644 --- a/src/IO/createReadBufferFromFileBase.cpp +++ b/src/IO/createReadBufferFromFileBase.cpp @@ -21,7 +21,7 @@ namespace DB std::unique_ptr createReadBufferFromFileBase( const std::string & filename_, - size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, MappedFileCache * mmap_cache, + size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, MMappedFileCache * mmap_cache, size_t buffer_size_, int flags_, char * existing_memory_, size_t alignment) { #if defined(OS_LINUX) || defined(__FreeBSD__) diff --git a/src/IO/createReadBufferFromFileBase.h b/src/IO/createReadBufferFromFileBase.h index 16642196d50..46d5b39ea44 100644 --- a/src/IO/createReadBufferFromFileBase.h +++ b/src/IO/createReadBufferFromFileBase.h @@ -8,7 +8,7 @@ namespace DB { -class MappedFileCache; +class MMappedFileCache; /** Create an object to read data from a file. 
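  *
  * A calling sketch (illustrative: the path and size values are assumptions, not taken from
  * this patch). When a cache is passed and estimated_size reaches mmap_threshold, the file is
  * expected to be read through a cached mmap region instead of an ordinary read buffer:
  *
  *     MMappedFileCache mmap_cache(1000);
  *     auto in = createReadBufferFromFileBase(
  *         "data.bin",
  *         /* estimated_size */ 1024 * 1024,
  *         /* aio_threshold */ 0,
  *         /* mmap_threshold */ 64 * 1024,
  *         &mmap_cache);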
@@ -23,7 +23,7 @@ std::unique_ptr createReadBufferFromFileBase( size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, - MappedFileCache * mmap_cache, + MMappedFileCache * mmap_cache, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE, int flags_ = -1, char * existing_memory_ = nullptr, diff --git a/src/IO/ya.make b/src/IO/ya.make index 2d282818272..01b3995ef7a 100644 --- a/src/IO/ya.make +++ b/src/IO/ya.make @@ -35,8 +35,8 @@ SRCS( MMapReadBufferFromFile.cpp MMapReadBufferFromFileDescriptor.cpp MMapReadBufferFromFileWithCache.cpp - MappedFile.cpp - MappedFileDescriptor.cpp + MMappedFile.cpp + MMappedFileDescriptor.cpp MemoryReadWriteBuffer.cpp MySQLBinlogEventReadBuffer.cpp MySQLPacketPayloadReadBuffer.cpp diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index dabcdf4b2d6..3d582293190 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include @@ -188,7 +188,7 @@ void AsynchronousMetrics::update() } { - if (auto mmap_cache = global_context.getMappedFileCache()) + if (auto mmap_cache = global_context.getMMappedFileCache()) { new_values["MMapCacheCells"] = mmap_cache->count(); } diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index e743b8ffb90..9aa8bde3421 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -55,7 +55,7 @@ #include #include #include -#include +#include #include #include #include @@ -342,7 +342,7 @@ struct ContextShared AccessControlManager access_control_manager; mutable UncompressedCachePtr uncompressed_cache; /// The cache of decompressed blocks. mutable MarkCachePtr mark_cache; /// Cache of marks in compressed files. - mutable MappedFileCachePtr mmap_cache; /// Cache of mmapped files to avoid frequent open/map/unmap/close and to reuse from several threads. + mutable MMappedFileCachePtr mmap_cache; /// Cache of mmapped files to avoid frequent open/map/unmap/close and to reuse from several threads. ProcessList process_list; /// Executing queries at the moment. 
MergeList merge_list; /// The list of executable merge (for (Replicated)?MergeTree) ReplicatedFetchList replicated_fetch_list; @@ -1456,23 +1456,23 @@ void Context::dropMarkCache() const } -void Context::setMappedFileCache(size_t cache_size_in_num_entries) +void Context::setMMappedFileCache(size_t cache_size_in_num_entries) { auto lock = getLock(); if (shared->mmap_cache) throw Exception("Mapped file cache has been already created.", ErrorCodes::LOGICAL_ERROR); - shared->mmap_cache = std::make_shared(cache_size_in_num_entries); + shared->mmap_cache = std::make_shared(cache_size_in_num_entries); } -MappedFileCachePtr Context::getMappedFileCache() const +MMappedFileCachePtr Context::getMMappedFileCache() const { auto lock = getLock(); return shared->mmap_cache; } -void Context::dropMappedFileCache() const +void Context::dropMMappedFileCache() const { auto lock = getLock(); if (shared->mmap_cache) diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 196b38f6a98..3e24c8520a4 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -68,7 +68,7 @@ class ReplicatedFetchList; class Cluster; class Compiler; class MarkCache; -class MappedFileCache; +class MMappedFileCache; class UncompressedCache; class ProcessList; class QueryStatus; @@ -625,9 +625,9 @@ public: void dropMarkCache() const; /// Create a cache of mapped files to avoid frequent open/map/unmap/close and to reuse from several threads. - void setMappedFileCache(size_t cache_size_in_num_entries); - std::shared_ptr getMappedFileCache() const; - void dropMappedFileCache() const; + void setMMappedFileCache(size_t cache_size_in_num_entries); + std::shared_ptr getMMappedFileCache() const; + void dropMMappedFileCache() const; /** Clear the caches of the uncompressed blocks and marks. * This is usually done when renaming tables, changing the type of columns, deleting a table. diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 6a3e307bec6..67576836d52 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -270,7 +270,7 @@ BlockIO InterpreterSystemQuery::execute() break; case Type::DROP_MMAP_CACHE: context.checkAccess(AccessType::SYSTEM_DROP_MMAP_CACHE); - system_context.dropMappedFileCache(); + system_context.dropMMappedFileCache(); break; #if USE_EMBEDDED_COMPILER case Type::DROP_COMPILED_EXPRESSION_CACHE: diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index f5007918a22..ed0e35b7f1a 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1164,7 +1164,7 @@ void MergeTreeData::clearOldPartsFromFilesystem(bool force) /// This is needed to close files to avoid they reside on disk after being deleted. /// NOTE: we can drop files from cache more selectively but this is good enough. 
if (!parts_to_remove.empty()) - global_context.dropMappedFileCache(); + global_context.dropMMappedFileCache(); } void MergeTreeData::clearPartsFromFilesystem(const DataPartsVector & parts_to_remove) diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 2cbc8fbd743..dcfc3293bb6 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -546,7 +546,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( { .min_bytes_to_use_direct_io = settings.min_bytes_to_use_direct_io, .min_bytes_to_use_mmap_io = settings.min_bytes_to_use_mmap_io, - .mmap_cache = context.getMappedFileCache(), + .mmap_cache = context.getMMappedFileCache(), .max_read_buffer_size = settings.max_read_buffer_size, .save_marks_in_cache = true, .checksum_on_read = settings.checksum_on_read, diff --git a/src/Storages/MergeTree/MergeTreeIOSettings.h b/src/Storages/MergeTree/MergeTreeIOSettings.h index 5d89b00111c..dd241cfd591 100644 --- a/src/Storages/MergeTree/MergeTreeIOSettings.h +++ b/src/Storages/MergeTree/MergeTreeIOSettings.h @@ -7,15 +7,15 @@ namespace DB { -class MappedFileCache; -using MappedFileCachePtr = std::shared_ptr; +class MMappedFileCache; +using MMappedFileCachePtr = std::shared_ptr; struct MergeTreeReaderSettings { size_t min_bytes_to_use_direct_io = 0; size_t min_bytes_to_use_mmap_io = 0; - MappedFileCachePtr mmap_cache; + MMappedFileCachePtr mmap_cache; size_t max_read_buffer_size = DBMS_DEFAULT_BUFFER_SIZE; /// If save_marks_in_cache is false, then, if marks are not in cache, /// we will load them but won't save in the cache, to avoid evicting other data. From d800e2b1cdea370e745c73361eeb29e28b1a77d7 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Mar 2021 22:29:32 +0300 Subject: [PATCH 255/260] Add a test --- tests/queries/0_stateless/01778_mmap_cache_infra.reference | 6 ++++++ tests/queries/0_stateless/01778_mmap_cache_infra.sql | 7 +++++++ 2 files changed, 13 insertions(+) create mode 100644 tests/queries/0_stateless/01778_mmap_cache_infra.reference create mode 100644 tests/queries/0_stateless/01778_mmap_cache_infra.sql diff --git a/tests/queries/0_stateless/01778_mmap_cache_infra.reference b/tests/queries/0_stateless/01778_mmap_cache_infra.reference new file mode 100644 index 00000000000..aa67cf13d62 --- /dev/null +++ b/tests/queries/0_stateless/01778_mmap_cache_infra.reference @@ -0,0 +1,6 @@ +MMappedFileCacheHits +MMappedFileCacheMisses +CreatedReadBufferMMap +CreatedReadBufferMMapFailed +MMappedFiles +MMappedFileBytes diff --git a/tests/queries/0_stateless/01778_mmap_cache_infra.sql b/tests/queries/0_stateless/01778_mmap_cache_infra.sql new file mode 100644 index 00000000000..ea02fc05248 --- /dev/null +++ b/tests/queries/0_stateless/01778_mmap_cache_infra.sql @@ -0,0 +1,7 @@ +-- We check the existence of queries and metrics and don't check the results (a smoke test). 
+ +SYSTEM DROP MMAP CACHE; + +SET system_events_show_zero_values = 1; +SELECT event FROM system.events WHERE event LIKE '%MMap%'; +SELECT metric FROM system.metrics WHERE metric LIKE '%MMap%'; From cb252de4d3f0b4f792ef654a0f33766e81f22c49 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Mar 2021 22:34:23 +0300 Subject: [PATCH 256/260] Just in case --- tests/queries/0_stateless/01778_mmap_cache_infra.reference | 6 +++--- tests/queries/0_stateless/01778_mmap_cache_infra.sql | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/queries/0_stateless/01778_mmap_cache_infra.reference b/tests/queries/0_stateless/01778_mmap_cache_infra.reference index aa67cf13d62..0e82b277bc1 100644 --- a/tests/queries/0_stateless/01778_mmap_cache_infra.reference +++ b/tests/queries/0_stateless/01778_mmap_cache_infra.reference @@ -1,6 +1,6 @@ -MMappedFileCacheHits -MMappedFileCacheMisses CreatedReadBufferMMap CreatedReadBufferMMapFailed -MMappedFiles +MMappedFileCacheHits +MMappedFileCacheMisses MMappedFileBytes +MMappedFiles diff --git a/tests/queries/0_stateless/01778_mmap_cache_infra.sql b/tests/queries/0_stateless/01778_mmap_cache_infra.sql index ea02fc05248..29a84c5507b 100644 --- a/tests/queries/0_stateless/01778_mmap_cache_infra.sql +++ b/tests/queries/0_stateless/01778_mmap_cache_infra.sql @@ -3,5 +3,5 @@ SYSTEM DROP MMAP CACHE; SET system_events_show_zero_values = 1; -SELECT event FROM system.events WHERE event LIKE '%MMap%'; -SELECT metric FROM system.metrics WHERE metric LIKE '%MMap%'; +SELECT event FROM system.events WHERE event LIKE '%MMap%' ORDER BY event; +SELECT metric FROM system.metrics WHERE metric LIKE '%MMap%' ORDER BY metric; From 459d00f9998ee0bf9514070f0556bbdfec343da5 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Sun, 28 Mar 2021 22:42:34 +0300 Subject: [PATCH 257/260] Fixed tests --- src/Compression/CachedCompressedReadBuffer.cpp | 6 +++--- src/IO/UncompressedCache.h | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/Compression/CachedCompressedReadBuffer.cpp b/src/Compression/CachedCompressedReadBuffer.cpp index 5a0db7ef877..d511266d139 100644 --- a/src/Compression/CachedCompressedReadBuffer.cpp +++ b/src/Compression/CachedCompressedReadBuffer.cpp @@ -47,9 +47,9 @@ bool CachedCompressedReadBuffer::nextImpl() if (cell->compressed_size) { - owned_cell->additional_bytes = codec->getAdditionalSizeAtTheEndOfBuffer(); - owned_cell->data.resize(size_decompressed + owned_cell->additional_bytes); - decompressTo(owned_cell->data.data(), size_decompressed, size_compressed_without_checksum); + cell->additional_bytes = codec->getAdditionalSizeAtTheEndOfBuffer(); + cell->data.resize(size_decompressed + cell->additional_bytes); + decompressTo(cell->data.data(), size_decompressed, size_compressed_without_checksum); } return cell; diff --git a/src/IO/UncompressedCache.h b/src/IO/UncompressedCache.h index 63c525bba50..b2f62ef2ada 100644 --- a/src/IO/UncompressedCache.h +++ b/src/IO/UncompressedCache.h @@ -61,7 +61,8 @@ public: template MappedPtr getOrSet(const Key & key, LoadFunc && load) { - auto result = Base::getOrSet(key, load); + auto result = Base::getOrSet(key, std::forward(load)); + if (result.second) ProfileEvents::increment(ProfileEvents::UncompressedCacheMisses); else From a0a3380d91670b6f4e05aacb0e50dfa6ca161ad8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Mar 2021 22:46:45 +0300 Subject: [PATCH 258/260] Remove useless headers --- src/Core/NamesAndTypes.cpp | 1 - src/Storages/IStorage.cpp | 3 --- 2 files changed, 4 
deletions(-) diff --git a/src/Core/NamesAndTypes.cpp b/src/Core/NamesAndTypes.cpp index e96ce1824d2..7b1779d4346 100644 --- a/src/Core/NamesAndTypes.cpp +++ b/src/Core/NamesAndTypes.cpp @@ -6,7 +6,6 @@ #include #include #include -#include namespace DB diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index 2cbc36e02fe..39f6d1f632e 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -1,8 +1,5 @@ #include -#include -#include - #include #include #include From c81d807dbce5caf5c118a0c99ecc4f39543b41de Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Mon, 29 Mar 2021 05:25:58 +0300 Subject: [PATCH 259/260] Update JoiningTransform.cpp --- src/Processors/Transforms/JoiningTransform.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Transforms/JoiningTransform.cpp b/src/Processors/Transforms/JoiningTransform.cpp index 2b87695db69..dea887fd7d7 100644 --- a/src/Processors/Transforms/JoiningTransform.cpp +++ b/src/Processors/Transforms/JoiningTransform.cpp @@ -38,7 +38,7 @@ void JoiningTransform::transform(Chunk & chunk) if (on_totals) { /// We have to make chunk empty before return - /// In case of using `arrayJoin` we can get more or less columns than one + /// In case of using `arrayJoin` we can get more or less rows than one auto cols = chunk.detachColumns(); for (auto & col : cols) col = col->cloneResized(1); From 0cb85b9ea89c5cc6e790ed7dd86d3f30dca01ed8 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 29 Mar 2021 17:26:48 +0300 Subject: [PATCH 260/260] Trying parallel functional tests one more time --- docker/test/stateless/run.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index 853814faae8..80199b0fe92 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -53,10 +53,11 @@ function run_tests() if [ "$NUM_TRIES" -gt "1" ]; then ADDITIONAL_OPTIONS+=('--skip') ADDITIONAL_OPTIONS+=('00000_no_tests_to_skip') - ADDITIONAL_OPTIONS+=('--jobs') - ADDITIONAL_OPTIONS+=('4') fi + ADDITIONAL_OPTIONS+=('--jobs') + ADDITIONAL_OPTIONS+=('8') + if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then ADDITIONAL_OPTIONS+=('--replicated-database') fi