Mirror of https://github.com/ClickHouse/ClickHouse.git (synced 2024-11-23 08:02:02 +00:00)

Commit f026ccf11e: Merge branch 'ClickHouse:master' into fix-named-collections-on-cluster-23.7
@@ -67,6 +67,8 @@ public:

    Message(
        const std::string & source, const std::string & text, Priority prio, const char * file, int line, std::string_view fmt_str = {});
    Message(
        std::string && source, std::string && text, Priority prio, const char * file, int line, std::string_view fmt_str);
        /// Creates a Message with the given source, text, priority,
        /// source file path and line.
        ///
@@ -60,6 +60,19 @@ Message::Message(const std::string& source, const std::string& text, Priority pr
}


Message::Message(std::string && source, std::string && text, Priority prio, const char * file, int line, std::string_view fmt_str):
    _source(std::move(source)),
    _text(std::move(text)),
    _prio(prio),
    _tid(0),
    _file(file),
    _line(line),
    _pMap(0),
    _fmt_str(fmt_str)
{
    init();
}

Message::Message(const Message& msg):
    _source(msg._source),
    _text(msg._text),
@@ -502,9 +502,10 @@ target_include_directories(_parquet SYSTEM BEFORE
    "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src"
    "${CMAKE_CURRENT_SOURCE_DIR}/cpp/src")
target_link_libraries(_parquet
    PUBLIC _arrow
    PRIVATE
    PUBLIC
        _arrow
        ch_contrib::thrift
    PRIVATE
        boost::headers_only
        boost::regex
        OpenSSL::Crypto OpenSSL::SSL)
@@ -945,6 +945,44 @@ Result:
└────────────┴───────┘
```

## toDecimalString

Converts a numeric value to a String with the number of fractional digits in the output specified by the user.

**Syntax**

``` sql
toDecimalString(number, scale)
```

**Parameters**

- `number` — Value to be represented as a String: [Int, UInt](/docs/en/sql-reference/data-types/int-uint.md), [Float](/docs/en/sql-reference/data-types/float.md), [Decimal](/docs/en/sql-reference/data-types/decimal.md).
- `scale` — Number of fractional digits, [UInt8](/docs/en/sql-reference/data-types/int-uint.md).
    * The maximum scale for [Decimal](/docs/en/sql-reference/data-types/decimal.md) and [Int, UInt](/docs/en/sql-reference/data-types/int-uint.md) types is 77 (the maximum possible number of significant digits for Decimal).
    * The maximum scale for [Float](/docs/en/sql-reference/data-types/float.md) is 60.

**Returned value**

- Input value represented as a [String](/docs/en/sql-reference/data-types/string.md) with the given number of fractional digits (scale).
  If the requested scale is smaller than the original number's scale, the number is rounded according to standard arithmetic rules.

**Example**

Query:

``` sql
SELECT toDecimalString(CAST('64.32', 'Float64'), 5);
```

Result:

```response
┌─toDecimalString(CAST('64.32', 'Float64'), 5)─┐
│ 64.32000                                     │
└──────────────────────────────────────────────┘
```
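As an extra illustration (my own example, not from the original docs): when the requested scale is smaller than the value's own scale, the result is rounded rather than truncated, per the rule above.

```sql
-- Expected to produce '64.3': 64.32 rounded to one fractional digit
SELECT toDecimalString(CAST('64.32', 'Float64'), 1);
```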

## reinterpretAsUInt(8\|16\|32\|64)

## reinterpretAsInt(8\|16\|32\|64)
@@ -414,3 +414,29 @@ Will do sync syscall.
```sql
SYSTEM SYNC FILE CACHE [ON CLUSTER cluster_name]
```

### SYSTEM STOP LISTEN

Closes the socket and gracefully terminates existing connections to the server on the specified port with the specified protocol.

However, if the corresponding protocol settings were not specified in the clickhouse-server configuration, this command will have no effect.

```sql
SYSTEM STOP LISTEN [ON CLUSTER cluster_name] [QUERIES ALL | QUERIES DEFAULT | QUERIES CUSTOM | TCP | TCP_WITH_PROXY | TCP_SECURE | HTTP | HTTPS | MYSQL | GRPC | POSTGRESQL | PROMETHEUS | CUSTOM 'protocol']
```

- If the `CUSTOM 'protocol'` modifier is specified, the custom protocol with the specified name, defined in the `protocols` section of the server configuration, is stopped.
- If the `QUERIES ALL` modifier is specified, all protocols are stopped.
- If the `QUERIES DEFAULT` modifier is specified, all default protocols are stopped.
- If the `QUERIES CUSTOM` modifier is specified, all custom protocols are stopped (a usage sketch follows below).
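A usage sketch (illustrative, not from the original page; the cluster name `default_cluster` is assumed to exist in the server configuration):

```sql
-- Stop accepting new MySQL-protocol connections on every host of the cluster
SYSTEM STOP LISTEN ON CLUSTER default_cluster MYSQL;
```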

### SYSTEM START LISTEN

Allows new connections to be established on the specified protocols.

However, if the server on the specified port and protocol was not stopped using the SYSTEM STOP LISTEN command, this command will have no effect.

```sql
SYSTEM START LISTEN [ON CLUSTER cluster_name] [QUERIES ALL | QUERIES DEFAULT | QUERIES CUSTOM | TCP | TCP_WITH_PROXY | TCP_SECURE | HTTP | HTTPS | MYSQL | GRPC | POSTGRESQL | PROMETHEUS | CUSTOM 'protocol']
```
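And the matching re-enable, continuing the same illustrative scenario:

```sql
-- Resume accepting MySQL-protocol connections stopped above
SYSTEM START LISTEN ON CLUSTER default_cluster MYSQL;
```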
@@ -762,6 +762,44 @@ SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut;
└────────────┴───────┘
```

## toDecimalString

Takes any numeric type as the first argument and returns the decimal string representation of the number, with the precision given by the second argument.

**Syntax**

``` sql
toDecimalString(number, scale)
```

**Parameters**

- `number` — A value of any numeric type: [Int, UInt](/docs/ru/sql-reference/data-types/int-uint.md), [Float](/docs/ru/sql-reference/data-types/float.md), [Decimal](/docs/ru/sql-reference/data-types/decimal.md).
- `scale` — The required number of digits after the decimal point, [UInt8](/docs/ru/sql-reference/data-types/int-uint.md).
    * The `scale` value for [Decimal](/docs/ru/sql-reference/data-types/decimal.md) and [Int, UInt](/docs/ru/sql-reference/data-types/int-uint.md) types must not exceed 77 (the largest possible number of significant digits for these types).
    * The `scale` value for [Float](/docs/ru/sql-reference/data-types/float.md) must not exceed 60.

**Returned value**

- A string ([String](/docs/en/sql-reference/data-types/string.md)) containing the decimal representation of the input number with the requested length of the fractional part.
  If necessary, the number is rounded according to standard arithmetic rules.

**Example**

Query:

``` sql
SELECT toDecimalString(CAST('64.32', 'Float64'), 5);
```

Result:

```response
┌─toDecimalString(CAST('64.32', 'Float64'), 5)─┐
│ 64.32000                                     │
└──────────────────────────────────────────────┘
```

## reinterpretAsUInt(8\|16\|32\|64) {#reinterpretasuint8163264}

## reinterpretAsInt(8\|16\|32\|64) {#reinterpretasint8163264}
@ -812,6 +812,11 @@ bool Client::processWithFuzzing(const String & full_query)
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
if (!ast_to_process)
|
||||
fmt::print(stderr,
|
||||
"Error while forming new query: {}\n",
|
||||
getCurrentExceptionMessage(true));
|
||||
|
||||
// Some functions (e.g. protocol parsers) don't throw, but
|
||||
// set last_exception instead, so we'll also do it here for
|
||||
// uniformity.
|
||||
|
@ -65,6 +65,7 @@ if (BUILD_STANDALONE_KEEPER)
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/PrometheusRequestHandler.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/PrometheusMetricsWriter.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/waitServersToFinish.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/ServerType.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/HTTPRequestHandlerFactoryMain.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/HTTP/HTTPServer.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/HTTP/ReadHeaders.cpp
|
||||
|
@ -1457,6 +1457,24 @@ try
|
||||
access_control.reload(AccessControl::ReloadMode::USERS_CONFIG_ONLY);
|
||||
});
|
||||
|
||||
global_context->setStopServersCallback([&](const ServerType & server_type)
|
||||
{
|
||||
stopServers(servers, server_type);
|
||||
});
|
||||
|
||||
global_context->setStartServersCallback([&](const ServerType & server_type)
|
||||
{
|
||||
createServers(
|
||||
config(),
|
||||
listen_hosts,
|
||||
listen_try,
|
||||
server_pool,
|
||||
async_metrics,
|
||||
servers,
|
||||
/* start_servers= */ true,
|
||||
server_type);
|
||||
});
|
||||
|
||||
/// Limit on total number of concurrently executed queries.
|
||||
global_context->getProcessList().setMaxSize(server_settings.max_concurrent_queries);
|
||||
|
||||
@ -1998,7 +2016,8 @@ void Server::createServers(
|
||||
Poco::ThreadPool & server_pool,
|
||||
AsynchronousMetrics & async_metrics,
|
||||
std::vector<ProtocolServerAdapter> & servers,
|
||||
bool start_servers)
|
||||
bool start_servers,
|
||||
const ServerType & server_type)
|
||||
{
|
||||
const Settings & settings = global_context->getSettingsRef();
|
||||
|
||||
@ -2012,6 +2031,9 @@ void Server::createServers(
|
||||
|
||||
for (const auto & protocol : protocols)
|
||||
{
|
||||
if (!server_type.shouldStart(ServerType::Type::CUSTOM, protocol))
|
||||
continue;
|
||||
|
||||
std::vector<std::string> hosts;
|
||||
if (config.has("protocols." + protocol + ".host"))
|
||||
hosts.push_back(config.getString("protocols." + protocol + ".host"));
|
||||
@ -2057,9 +2079,13 @@ void Server::createServers(
|
||||
}
|
||||
|
||||
for (const auto & listen_host : listen_hosts)
|
||||
{
|
||||
const char * port_name;
|
||||
|
||||
if (server_type.shouldStart(ServerType::Type::HTTP))
|
||||
{
|
||||
/// HTTP
|
||||
const char * port_name = "http_port";
|
||||
port_name = "http_port";
|
||||
createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
|
||||
{
|
||||
Poco::Net::ServerSocket socket;
|
||||
@ -2074,7 +2100,10 @@ void Server::createServers(
|
||||
std::make_unique<HTTPServer>(
|
||||
httpContext(), createHandlerFactory(*this, config, async_metrics, "HTTPHandler-factory"), server_pool, socket, http_params));
|
||||
});
|
||||
}
|
||||
|
||||
if (server_type.shouldStart(ServerType::Type::HTTPS))
|
||||
{
|
||||
/// HTTPS
|
||||
port_name = "https_port";
|
||||
createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
|
||||
@ -2095,7 +2124,10 @@ void Server::createServers(
|
||||
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "HTTPS protocol is disabled because Poco library was built without NetSSL support.");
|
||||
#endif
|
||||
});
|
||||
}
|
||||
|
||||
if (server_type.shouldStart(ServerType::Type::TCP))
|
||||
{
|
||||
/// TCP
|
||||
port_name = "tcp_port";
|
||||
createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
|
||||
@ -2114,7 +2146,10 @@ void Server::createServers(
|
||||
socket,
|
||||
new Poco::Net::TCPServerParams));
|
||||
});
|
||||
}
|
||||
|
||||
if (server_type.shouldStart(ServerType::Type::TCP_WITH_PROXY))
|
||||
{
|
||||
/// TCP with PROXY protocol, see https://github.com/wolfeidau/proxyv2/blob/master/docs/proxy-protocol.txt
|
||||
port_name = "tcp_with_proxy_port";
|
||||
createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
|
||||
@ -2133,7 +2168,10 @@ void Server::createServers(
|
||||
socket,
|
||||
new Poco::Net::TCPServerParams));
|
||||
});
|
||||
}
|
||||
|
||||
if (server_type.shouldStart(ServerType::Type::TCP_SECURE))
|
||||
{
|
||||
/// TCP with SSL
|
||||
port_name = "tcp_port_secure";
|
||||
createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
|
||||
@ -2157,7 +2195,10 @@ void Server::createServers(
|
||||
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.");
|
||||
#endif
|
||||
});
|
||||
}
|
||||
|
||||
if (server_type.shouldStart(ServerType::Type::MYSQL))
|
||||
{
|
||||
port_name = "mysql_port";
|
||||
createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
|
||||
{
|
||||
@ -2171,7 +2212,10 @@ void Server::createServers(
|
||||
"MySQL compatibility protocol: " + address.toString(),
|
||||
std::make_unique<TCPServer>(new MySQLHandlerFactory(*this), server_pool, socket, new Poco::Net::TCPServerParams));
|
||||
});
|
||||
}
|
||||
|
||||
if (server_type.shouldStart(ServerType::Type::POSTGRESQL))
|
||||
{
|
||||
port_name = "postgresql_port";
|
||||
createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
|
||||
{
|
||||
@ -2185,8 +2229,11 @@ void Server::createServers(
|
||||
"PostgreSQL compatibility protocol: " + address.toString(),
|
||||
std::make_unique<TCPServer>(new PostgreSQLHandlerFactory(*this), server_pool, socket, new Poco::Net::TCPServerParams));
|
||||
});
|
||||
}
|
||||
|
||||
#if USE_GRPC
|
||||
if (server_type.shouldStart(ServerType::Type::GRPC))
|
||||
{
|
||||
port_name = "grpc_port";
|
||||
createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
|
||||
{
|
||||
@ -2197,8 +2244,10 @@ void Server::createServers(
|
||||
"gRPC protocol: " + server_address.toString(),
|
||||
std::make_unique<GRPCServer>(*this, makeSocketAddress(listen_host, port, &logger())));
|
||||
});
|
||||
}
|
||||
#endif
|
||||
|
||||
if (server_type.shouldStart(ServerType::Type::PROMETHEUS))
|
||||
{
|
||||
/// Prometheus (if defined and not setup yet with http_port)
|
||||
port_name = "prometheus.port";
|
||||
createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
|
||||
@ -2216,6 +2265,7 @@ void Server::createServers(
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Server::createInterserverServers(
|
||||
Poco::Util::AbstractConfiguration & config,
|
||||
@ -2224,7 +2274,8 @@ void Server::createInterserverServers(
|
||||
Poco::ThreadPool & server_pool,
|
||||
AsynchronousMetrics & async_metrics,
|
||||
std::vector<ProtocolServerAdapter> & servers,
|
||||
bool start_servers)
|
||||
bool start_servers,
|
||||
const ServerType & server_type)
|
||||
{
|
||||
const Settings & settings = global_context->getSettingsRef();
|
||||
|
||||
@ -2235,9 +2286,13 @@ void Server::createInterserverServers(
|
||||
|
||||
/// Now iterate over interserver_listen_hosts
|
||||
for (const auto & interserver_listen_host : interserver_listen_hosts)
|
||||
{
|
||||
const char * port_name;
|
||||
|
||||
if (server_type.shouldStart(ServerType::Type::INTERSERVER_HTTP))
|
||||
{
|
||||
/// Interserver IO HTTP
|
||||
const char * port_name = "interserver_http_port";
|
||||
port_name = "interserver_http_port";
|
||||
createServer(config, interserver_listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
|
||||
{
|
||||
Poco::Net::ServerSocket socket;
|
||||
@ -2255,7 +2310,10 @@ void Server::createInterserverServers(
|
||||
socket,
|
||||
http_params));
|
||||
});
|
||||
}
|
||||
|
||||
if (server_type.shouldStart(ServerType::Type::INTERSERVER_HTTPS))
|
||||
{
|
||||
port_name = "interserver_https_port";
|
||||
createServer(config, interserver_listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
|
||||
{
|
||||
@ -2281,6 +2339,44 @@ void Server::createInterserverServers(
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Server::stopServers(
|
||||
std::vector<ProtocolServerAdapter> & servers,
|
||||
const ServerType & server_type
|
||||
) const
|
||||
{
|
||||
Poco::Logger * log = &logger();
|
||||
|
||||
/// Remove servers once all their connections are closed
|
||||
auto check_server = [&log](const char prefix[], auto & server)
|
||||
{
|
||||
if (!server.isStopping())
|
||||
return false;
|
||||
size_t current_connections = server.currentConnections();
|
||||
LOG_DEBUG(log, "Server {}{}: {} ({} connections)",
|
||||
server.getDescription(),
|
||||
prefix,
|
||||
!current_connections ? "finished" : "waiting",
|
||||
current_connections);
|
||||
return !current_connections;
|
||||
};
|
||||
|
||||
std::erase_if(servers, std::bind_front(check_server, " (from one of previous remove)"));
|
||||
|
||||
for (auto & server : servers)
|
||||
{
|
||||
if (!server.isStopping())
|
||||
{
|
||||
const std::string server_port_name = server.getPortName();
|
||||
|
||||
if (server_type.shouldStop(server_port_name))
|
||||
server.stop();
|
||||
}
|
||||
}
|
||||
|
||||
std::erase_if(servers, std::bind_front(check_server, ""));
|
||||
}
|
||||
|
||||
void Server::updateServers(
|
||||
Poco::Util::AbstractConfiguration & config,
|
||||
|
@ -3,8 +3,9 @@
|
||||
#include <Server/IServer.h>
|
||||
|
||||
#include <Daemon/BaseDaemon.h>
|
||||
#include "Server/HTTP/HTTPContext.h"
|
||||
#include <Server/HTTP/HTTPContext.h>
|
||||
#include <Server/TCPProtocolStackFactory.h>
|
||||
#include <Server/ServerType.h>
|
||||
#include <Poco/Net/HTTPServerParams.h>
|
||||
|
||||
/** Server provides three interfaces:
|
||||
@ -106,7 +107,8 @@ private:
|
||||
Poco::ThreadPool & server_pool,
|
||||
AsynchronousMetrics & async_metrics,
|
||||
std::vector<ProtocolServerAdapter> & servers,
|
||||
bool start_servers = false);
|
||||
bool start_servers = false,
|
||||
const ServerType & server_type = ServerType(ServerType::Type::QUERIES_ALL));
|
||||
|
||||
void createInterserverServers(
|
||||
Poco::Util::AbstractConfiguration & config,
|
||||
@ -115,7 +117,8 @@ private:
|
||||
Poco::ThreadPool & server_pool,
|
||||
AsynchronousMetrics & async_metrics,
|
||||
std::vector<ProtocolServerAdapter> & servers,
|
||||
bool start_servers = false);
|
||||
bool start_servers = false,
|
||||
const ServerType & server_type = ServerType(ServerType::Type::QUERIES_ALL));
|
||||
|
||||
void updateServers(
|
||||
Poco::Util::AbstractConfiguration & config,
|
||||
@ -123,6 +126,11 @@ private:
|
||||
AsynchronousMetrics & async_metrics,
|
||||
std::vector<ProtocolServerAdapter> & servers,
|
||||
std::vector<ProtocolServerAdapter> & servers_to_start_before_tables);
|
||||
|
||||
void stopServers(
|
||||
std::vector<ProtocolServerAdapter> & servers,
|
||||
const ServerType & server_type
|
||||
) const;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -187,6 +187,7 @@ enum class AccessType
|
||||
M(SYSTEM_THREAD_FUZZER, "SYSTEM START THREAD FUZZER, SYSTEM STOP THREAD FUZZER, START THREAD FUZZER, STOP THREAD FUZZER", GLOBAL, SYSTEM) \
|
||||
M(SYSTEM_UNFREEZE, "SYSTEM UNFREEZE", GLOBAL, SYSTEM) \
|
||||
M(SYSTEM_FAILPOINT, "SYSTEM ENABLE FAILPOINT, SYSTEM DISABLE FAILPOINT", GLOBAL, SYSTEM) \
|
||||
M(SYSTEM_LISTEN, "SYSTEM START LISTEN, SYSTEM STOP LISTEN", GLOBAL, SYSTEM) \
|
||||
M(SYSTEM, "", GROUP, ALL) /* allows to execute SYSTEM {SHUTDOWN|RELOAD CONFIG|...} */ \
|
||||
\
|
||||
M(dictGet, "dictHas, dictGetHierarchy, dictIsIn", DICTIONARY, ALL) /* allows to execute functions dictGet(), dictHas(), dictGetHierarchy(), dictIsIn() */\
|
||||
|
@ -267,6 +267,10 @@ add_object_library(clickhouse_processors_queryplan Processors/QueryPlan)
|
||||
add_object_library(clickhouse_processors_queryplan_optimizations Processors/QueryPlan/Optimizations)
|
||||
add_object_library(clickhouse_user_defined_functions Functions/UserDefined)
|
||||
|
||||
if (USE_PARQUET)
|
||||
add_object_library(clickhouse_processors_formats_impl_parquet Processors/Formats/Impl/Parquet)
|
||||
endif()
|
||||
|
||||
if (TARGET ch_contrib::nuraft)
|
||||
add_object_library(clickhouse_coordination Coordination)
|
||||
endif()
|
||||
|
@ -1,4 +1,26 @@
|
||||
#include "Allocator.h"
|
||||
|
||||
template class Allocator<false>;
|
||||
template class Allocator<true>;
|
||||
/** Keep definition of this constant in cpp file; otherwise its value
|
||||
* is inlined into allocator code making it impossible to override it
|
||||
* in third-party code.
|
||||
*
|
||||
* Note: extern may seem redundant, but is actually needed due to bug in GCC.
|
||||
* See also: https://gcc.gnu.org/legacy-ml/gcc-help/2017-12/msg00021.html
|
||||
*/
|
||||
#ifdef NDEBUG
|
||||
__attribute__((__weak__)) extern const size_t MMAP_THRESHOLD = 128 * (1ULL << 20);
|
||||
#else
|
||||
/**
|
||||
* In debug build, use small mmap threshold to reproduce more memory
|
||||
* stomping bugs. Along with ASLR it will hopefully detect more issues than
|
||||
* ASan. The program may fail due to the limit on number of memory mappings.
|
||||
*
|
||||
* Not too small to avoid too quick exhaust of memory mappings.
|
||||
*/
|
||||
__attribute__((__weak__)) extern const size_t MMAP_THRESHOLD = 16384;
|
||||
#endif
|
||||
|
||||
template class Allocator<false, false>;
|
||||
template class Allocator<true, false>;
|
||||
template class Allocator<false, true>;
|
||||
template class Allocator<true, true>;
|
||||
|
@ -36,26 +36,51 @@
|
||||
#include <Common/Allocator_fwd.h>
|
||||
|
||||
|
||||
/// Required for older Darwin builds, that lack definition of MAP_ANONYMOUS
|
||||
#ifndef MAP_ANONYMOUS
|
||||
#define MAP_ANONYMOUS MAP_ANON
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Many modern allocators (for example, tcmalloc) do not do a mremap for
|
||||
* realloc, even in case of large enough chunks of memory. Although this allows
|
||||
* you to increase performance and reduce memory consumption during realloc.
|
||||
* To fix this, we do mremap manually if the chunk of memory is large enough.
|
||||
* The threshold (64 MB) is chosen quite large, since changing the address
|
||||
* space is very slow, especially in the case of a large number of threads. We
|
||||
* expect that the set of operations mmap/something to do/mremap can only be
|
||||
* performed about 1000 times per second.
|
||||
*
|
||||
* P.S. This is also required, because tcmalloc can not allocate a chunk of
|
||||
* memory greater than 16 GB.
|
||||
*
|
||||
* P.P.S. Note that MMAP_THRESHOLD symbol is intentionally made weak. It allows
|
||||
* to override it during linkage when using ClickHouse as a library in
|
||||
* third-party applications which may already use own allocator doing mmaps
|
||||
* in the implementation of alloc/realloc.
|
||||
*/
|
||||
extern const size_t MMAP_THRESHOLD;
|
||||
|
||||
static constexpr size_t MALLOC_MIN_ALIGNMENT = 8;
|
||||
|
||||
namespace CurrentMetrics
|
||||
{
|
||||
extern const Metric MMappedAllocs;
|
||||
extern const Metric MMappedAllocBytes;
|
||||
}
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int BAD_ARGUMENTS;
|
||||
extern const int CANNOT_ALLOCATE_MEMORY;
|
||||
extern const int CANNOT_MUNMAP;
|
||||
extern const int CANNOT_MREMAP;
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/** Previously there was a code which tried to use manual mmap and mremap (clickhouse_mremap.h) for large allocations/reallocations (64MB+).
|
||||
* Most modern allocators (including jemalloc) don't use mremap, so the idea was to take advantage from mremap system call for large reallocs.
|
||||
* Actually jemalloc had support for mremap, but it was intentionally removed from codebase https://github.com/jemalloc/jemalloc/commit/e2deab7a751c8080c2b2cdcfd7b11887332be1bb.
|
||||
* Our performance tests also shows that without manual mmap/mremap/munmap clickhouse is overall faster for about 1-2% and up to 5-7x for some types of queries.
|
||||
* That is why we don't do manuall mmap/mremap/munmap here and completely rely on jemalloc for allocations of any size.
|
||||
*/
|
||||
|
||||
/** Responsible for allocating / freeing memory. Used, for example, in PODArray, Arena.
|
||||
* Also used in hash tables.
|
||||
* The interface is different from std::allocator
|
||||
@ -63,8 +88,10 @@ namespace ErrorCodes
|
||||
* - passing the size into the `free` method;
|
||||
* - by the presence of the `alignment` argument;
|
||||
* - the possibility of zeroing memory (used in hash tables);
|
||||
* - random hint address for mmap
|
||||
* - mmap_threshold for using mmap less or more
|
||||
*/
|
||||
template <bool clear_memory_>
|
||||
template <bool clear_memory_, bool mmap_populate>
|
||||
class Allocator
|
||||
{
|
||||
public:
|
||||
@ -82,7 +109,7 @@ public:
|
||||
try
|
||||
{
|
||||
checkSize(size);
|
||||
freeNoTrack(buf);
|
||||
freeNoTrack(buf, size);
|
||||
CurrentMemoryTracker::free(size);
|
||||
}
|
||||
catch (...)
|
||||
@ -105,26 +132,49 @@ public:
|
||||
/// nothing to do.
|
||||
/// BTW, it's not possible to change alignment while doing realloc.
|
||||
}
|
||||
else if (alignment <= MALLOC_MIN_ALIGNMENT)
|
||||
else if (old_size < MMAP_THRESHOLD && new_size < MMAP_THRESHOLD
|
||||
&& alignment <= MALLOC_MIN_ALIGNMENT)
|
||||
{
|
||||
/// Resize malloc'd memory region with no special alignment requirement.
|
||||
CurrentMemoryTracker::realloc(old_size, new_size);
|
||||
|
||||
void * new_buf = ::realloc(buf, new_size);
|
||||
if (nullptr == new_buf)
|
||||
{
|
||||
DB::throwFromErrno(
|
||||
fmt::format("Allocator: Cannot realloc from {} to {}.", ReadableSize(old_size), ReadableSize(new_size)), DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
|
||||
}
|
||||
DB::throwFromErrno(fmt::format("Allocator: Cannot realloc from {} to {}.", ReadableSize(old_size), ReadableSize(new_size)), DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
|
||||
|
||||
buf = new_buf;
|
||||
if constexpr (clear_memory)
|
||||
if (new_size > old_size)
|
||||
memset(reinterpret_cast<char *>(buf) + old_size, 0, new_size - old_size);
|
||||
}
|
||||
else if (old_size >= MMAP_THRESHOLD && new_size >= MMAP_THRESHOLD)
|
||||
{
|
||||
/// Resize mmap'd memory region.
|
||||
CurrentMemoryTracker::realloc(old_size, new_size);
|
||||
|
||||
// On apple and freebsd self-implemented mremap used (common/mremap.h)
|
||||
buf = clickhouse_mremap(buf, old_size, new_size, MREMAP_MAYMOVE,
|
||||
PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
|
||||
if (MAP_FAILED == buf)
|
||||
DB::throwFromErrno(fmt::format("Allocator: Cannot mremap memory chunk from {} to {}.",
|
||||
ReadableSize(old_size), ReadableSize(new_size)), DB::ErrorCodes::CANNOT_MREMAP);
|
||||
|
||||
/// No need for zero-fill, because mmap guarantees it.
|
||||
}
|
||||
else if (new_size < MMAP_THRESHOLD)
|
||||
{
|
||||
/// Small allocs that requires a copy. Assume there's enough memory in system. Call CurrentMemoryTracker once.
|
||||
CurrentMemoryTracker::realloc(old_size, new_size);
|
||||
|
||||
void * new_buf = allocNoTrack(new_size, alignment);
|
||||
memcpy(new_buf, buf, std::min(old_size, new_size));
|
||||
freeNoTrack(buf, old_size);
|
||||
buf = new_buf;
|
||||
}
|
||||
else
|
||||
{
|
||||
/// Big allocs that requires a copy. MemoryTracker is called inside 'alloc', 'free' methods.
|
||||
|
||||
void * new_buf = alloc(new_size, alignment);
|
||||
memcpy(new_buf, buf, std::min(old_size, new_size));
|
||||
free(buf, old_size);
|
||||
@ -142,10 +192,43 @@ protected:
|
||||
|
||||
static constexpr bool clear_memory = clear_memory_;
|
||||
|
||||
// Freshly mmapped pages are copy-on-write references to a global zero page.
|
||||
// On the first write, a page fault occurs, and an actual writable page is
|
||||
// allocated. If we are going to use this memory soon, such as when resizing
|
||||
// hash tables, it makes sense to pre-fault the pages by passing
|
||||
// MAP_POPULATE to mmap(). This takes some time, but should be faster
|
||||
// overall than having a hot loop interrupted by page faults.
|
||||
// It is only supported on Linux.
|
||||
static constexpr int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS
|
||||
#if defined(OS_LINUX)
|
||||
| (mmap_populate ? MAP_POPULATE : 0)
|
||||
#endif
|
||||
;
|
||||
|
||||
private:
|
||||
void * allocNoTrack(size_t size, size_t alignment)
|
||||
{
|
||||
void * buf;
|
||||
size_t mmap_min_alignment = ::getPageSize();
|
||||
|
||||
if (size >= MMAP_THRESHOLD)
|
||||
{
|
||||
if (alignment > mmap_min_alignment)
|
||||
throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS,
|
||||
"Too large alignment {}: more than page size when allocating {}.",
|
||||
ReadableSize(alignment), ReadableSize(size));
|
||||
|
||||
buf = mmap(getMmapHint(), size, PROT_READ | PROT_WRITE,
|
||||
mmap_flags, -1, 0);
|
||||
if (MAP_FAILED == buf)
|
||||
DB::throwFromErrno(fmt::format("Allocator: Cannot mmap {}.", ReadableSize(size)), DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
|
||||
/// No need for zero-fill, because mmap guarantees it.
|
||||
|
||||
CurrentMetrics::add(CurrentMetrics::MMappedAllocs);
|
||||
CurrentMetrics::add(CurrentMetrics::MMappedAllocBytes, size);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (alignment <= MALLOC_MIN_ALIGNMENT)
|
||||
{
|
||||
if constexpr (clear_memory)
|
||||
@ -168,13 +251,25 @@ private:
|
||||
if constexpr (clear_memory)
|
||||
memset(buf, 0, size);
|
||||
}
|
||||
}
|
||||
return buf;
|
||||
}
|
||||
|
||||
void freeNoTrack(void * buf)
|
||||
void freeNoTrack(void * buf, size_t size)
|
||||
{
|
||||
if (size >= MMAP_THRESHOLD)
|
||||
{
|
||||
if (0 != munmap(buf, size))
|
||||
DB::throwFromErrno(fmt::format("Allocator: Cannot munmap {}.", ReadableSize(size)), DB::ErrorCodes::CANNOT_MUNMAP);
|
||||
|
||||
CurrentMetrics::sub(CurrentMetrics::MMappedAllocs);
|
||||
CurrentMetrics::sub(CurrentMetrics::MMappedAllocBytes, size);
|
||||
}
|
||||
else
|
||||
{
|
||||
::free(buf);
|
||||
}
|
||||
}
|
||||
|
||||
void checkSize(size_t size)
|
||||
{
|
||||
@ -182,6 +277,21 @@ private:
|
||||
if (size >= 0x8000000000000000ULL)
|
||||
throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Too large size ({}) passed to allocator. It indicates an error.", size);
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
/// In debug builds, request mmap() at random addresses (a kind of ASLR), to
|
||||
/// reproduce more memory stomping bugs. Note that Linux doesn't do it by
|
||||
/// default. This may lead to worse TLB performance.
|
||||
void * getMmapHint()
|
||||
{
|
||||
return reinterpret_cast<void *>(std::uniform_int_distribution<intptr_t>(0x100000000000UL, 0x700000000000UL)(thread_local_rng));
|
||||
}
|
||||
#else
|
||||
void * getMmapHint()
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
@ -257,5 +367,7 @@ constexpr size_t allocatorInitialBytes<AllocatorWithStackMemory<
|
||||
|
||||
/// Prevent implicit template instantiation of Allocator
|
||||
|
||||
extern template class Allocator<false>;
|
||||
extern template class Allocator<true>;
|
||||
extern template class Allocator<false, false>;
|
||||
extern template class Allocator<true, false>;
|
||||
extern template class Allocator<false, true>;
|
||||
extern template class Allocator<true, true>;
|
||||
|
@ -3,7 +3,7 @@
|
||||
* This file provides forward declarations for Allocator.
|
||||
*/
|
||||
|
||||
template <bool clear_memory_>
|
||||
template <bool clear_memory_, bool mmap_populate = false>
|
||||
class Allocator;
|
||||
|
||||
template <typename Base, size_t N = 64, size_t Alignment = 1>
|
||||
|
@ -149,8 +149,10 @@
|
||||
M(RestartReplicaThreadsActive, "Number of threads in the RESTART REPLICA thread pool running a task.") \
|
||||
M(QueryPipelineExecutorThreads, "Number of threads in the PipelineExecutor thread pool.") \
|
||||
M(QueryPipelineExecutorThreadsActive, "Number of threads in the PipelineExecutor thread pool running a task.") \
|
||||
M(ParquetDecoderThreads, "Number of threads in the ParquetBlockInputFormat thread pool running a task.") \
|
||||
M(ParquetDecoderThreadsActive, "Number of threads in the ParquetBlockInputFormat thread pool.") \
|
||||
M(ParquetDecoderThreads, "Number of threads in the ParquetBlockInputFormat thread pool.") \
|
||||
M(ParquetDecoderThreadsActive, "Number of threads in the ParquetBlockInputFormat thread pool running a task.") \
|
||||
M(ParquetEncoderThreads, "Number of threads in ParquetBlockOutputFormat thread pool.") \
|
||||
M(ParquetEncoderThreadsActive, "Number of threads in ParquetBlockOutputFormat thread pool running a task.") \
|
||||
M(OutdatedPartsLoadingThreads, "Number of threads in the threadpool for loading Outdated data parts.") \
|
||||
M(OutdatedPartsLoadingThreadsActive, "Number of active threads in the threadpool for loading Outdated data parts.") \
|
||||
M(DistributedBytesToInsert, "Number of pending bytes to process for asynchronous insertion into Distributed tables. Number of bytes for every shard is summed.") \
|
||||
@ -173,6 +175,8 @@
|
||||
M(PartsInMemory, "In-memory parts.") \
|
||||
M(MMappedFiles, "Total number of mmapped files.") \
|
||||
M(MMappedFileBytes, "Sum size of mmapped file regions.") \
|
||||
M(MMappedAllocs, "Total number of mmapped allocations") \
|
||||
M(MMappedAllocBytes, "Sum bytes of mmapped allocations") \
|
||||
M(AsynchronousReadWait, "Number of threads waiting for asynchronous read.") \
|
||||
M(PendingAsyncInsert, "Number of asynchronous inserts that are waiting for flush.") \
|
||||
M(KafkaConsumers, "Number of active Kafka consumers") \
|
||||
|
@ -8,7 +8,7 @@
|
||||
* table, so it makes sense to pre-fault the pages so that page faults don't
|
||||
* interrupt the resize loop. Set the allocator parameter accordingly.
|
||||
*/
|
||||
using HashTableAllocator = Allocator<true /* clear_memory */>;
|
||||
using HashTableAllocator = Allocator<true /* clear_memory */, true /* mmap_populate */>;
|
||||
|
||||
template <size_t initial_bytes = 64>
|
||||
using HashTableAllocatorWithStackMemory = AllocatorWithStackMemory<HashTableAllocator, initial_bytes>;
|
||||
|
@ -27,15 +27,9 @@ struct Interval
|
||||
};
|
||||
|
||||
template <typename IntervalStorageType>
|
||||
bool operator<(const Interval<IntervalStorageType> & lhs, const Interval<IntervalStorageType> & rhs)
|
||||
auto operator<=>(const Interval<IntervalStorageType> & lhs, const Interval<IntervalStorageType> & rhs)
|
||||
{
|
||||
return std::tie(lhs.left, lhs.right) < std::tie(rhs.left, rhs.right);
|
||||
}
|
||||
|
||||
template <typename IntervalStorageType>
|
||||
bool operator<=(const Interval<IntervalStorageType> & lhs, const Interval<IntervalStorageType> & rhs)
|
||||
{
|
||||
return std::tie(lhs.left, lhs.right) <= std::tie(rhs.left, rhs.right);
|
||||
return std::tie(lhs.left, lhs.right) <=> std::tie(rhs.left, rhs.right);
|
||||
}
|
||||
|
||||
template <typename IntervalStorageType>
|
||||
@ -44,24 +38,6 @@ bool operator==(const Interval<IntervalStorageType> & lhs, const Interval<Interv
|
||||
return std::tie(lhs.left, lhs.right) == std::tie(rhs.left, rhs.right);
|
||||
}
|
||||
|
||||
template <typename IntervalStorageType>
|
||||
bool operator!=(const Interval<IntervalStorageType> & lhs, const Interval<IntervalStorageType> & rhs)
|
||||
{
|
||||
return std::tie(lhs.left, lhs.right) != std::tie(rhs.left, rhs.right);
|
||||
}
|
||||
|
||||
template <typename IntervalStorageType>
|
||||
bool operator>(const Interval<IntervalStorageType> & lhs, const Interval<IntervalStorageType> & rhs)
|
||||
{
|
||||
return std::tie(lhs.left, lhs.right) > std::tie(rhs.left, rhs.right);
|
||||
}
|
||||
|
||||
template <typename IntervalStorageType>
|
||||
bool operator>=(const Interval<IntervalStorageType> & lhs, const Interval<IntervalStorageType> & rhs)
|
||||
{
|
||||
return std::tie(lhs.left, lhs.right) >= std::tie(rhs.left, rhs.right);
|
||||
}
|
||||
|
||||
struct IntervalTreeVoidValue
|
||||
{
|
||||
};
|
||||
|
@ -43,6 +43,17 @@ struct PreformattedMessage
|
||||
operator const std::string & () const { return text; }
|
||||
operator std::string () && { return std::move(text); }
|
||||
operator fmt::format_string<> () const { UNREACHABLE(); }
|
||||
|
||||
void apply(std::string & out_text, std::string_view & out_format_string) const &
|
||||
{
|
||||
out_text = text;
|
||||
out_format_string = format_string;
|
||||
}
|
||||
void apply(std::string & out_text, std::string_view & out_format_string) &&
|
||||
{
|
||||
out_text = std::move(text);
|
||||
out_format_string = format_string;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename... Args>
|
||||
@ -99,10 +110,33 @@ template <typename T> constexpr std::string_view tryGetStaticFormatString(T && x
|
||||
}
|
||||
}
|
||||
|
||||
/// Constexpr ifs are not like ifdefs, and compiler still checks that unneeded code can be compiled
|
||||
/// This template is useful to avoid compilation failures when condition of some "constexpr if" is false
|
||||
template<bool enable> struct ConstexprIfsAreNotIfdefs
|
||||
{
|
||||
template <typename T> constexpr static std::string_view getStaticFormatString(T &&) { return {}; }
|
||||
template <typename T> static PreformattedMessage getPreformatted(T &&) { return {}; }
|
||||
};
|
||||
|
||||
template<> struct ConstexprIfsAreNotIfdefs<true>
|
||||
{
|
||||
template <typename T> consteval static std::string_view getStaticFormatString(T && x)
|
||||
{
|
||||
/// See tryGetStaticFormatString(...)
|
||||
static_assert(!std::is_same_v<std::string, std::decay_t<T>>);
|
||||
static_assert(std::is_nothrow_convertible<T, const char * const>::value);
|
||||
static_assert(!std::is_pointer<T>::value);
|
||||
return std::string_view(x);
|
||||
}
|
||||
|
||||
template <typename T> static T && getPreformatted(T && x) { return std::forward<T>(x); }
|
||||
};
|
||||
|
||||
template <typename... Ts> constexpr size_t numArgs(Ts &&...) { return sizeof...(Ts); }
|
||||
template <typename T, typename... Ts> constexpr auto firstArg(T && x, Ts &&...) { return std::forward<T>(x); }
|
||||
/// For implicit conversion of fmt::basic_runtime<> to char* for std::string ctor
|
||||
template <typename T, typename... Ts> constexpr auto firstArg(fmt::basic_runtime<T> && data, Ts &&...) { return data.str.data(); }
|
||||
template <typename T, typename... Ts> constexpr auto firstArg(const fmt::basic_runtime<T> & data, Ts &&...) { return data.str.data(); }
|
||||
|
||||
consteval ssize_t formatStringCountArgsNum(const char * const str, size_t len)
|
||||
{
|
||||
@ -142,26 +176,19 @@ consteval void formatStringCheckArgsNumImpl(std::string_view str, size_t nargs)
|
||||
functionThatFailsCompilationOfConstevalFunctions("unexpected number of arguments in a format string");
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
struct CheckArgsNumHelperImpl
|
||||
{
|
||||
template<typename T>
|
||||
consteval CheckArgsNumHelperImpl(T && str)
|
||||
consteval void formatStringCheckArgsNum(T && str, size_t nargs)
|
||||
{
|
||||
formatStringCheckArgsNumImpl(tryGetStaticFormatString(str), sizeof...(Args));
|
||||
formatStringCheckArgsNumImpl(tryGetStaticFormatString(str), nargs);
|
||||
}
|
||||
template<typename T> inline void formatStringCheckArgsNum(fmt::basic_runtime<T> &&, size_t) {}
|
||||
template<> inline void formatStringCheckArgsNum(PreformattedMessage &, size_t) {}
|
||||
template<> inline void formatStringCheckArgsNum(const PreformattedMessage &, size_t) {}
|
||||
template<> inline void formatStringCheckArgsNum(PreformattedMessage &&, size_t) {}
|
||||
|
||||
/// No checks for fmt::runtime and PreformattedMessage
|
||||
template<typename T> CheckArgsNumHelperImpl(fmt::basic_runtime<T> &&) {}
|
||||
template<> CheckArgsNumHelperImpl(PreformattedMessage &) {}
|
||||
template<> CheckArgsNumHelperImpl(const PreformattedMessage &) {}
|
||||
template<> CheckArgsNumHelperImpl(PreformattedMessage &&) {}
|
||||
|
||||
};
|
||||
|
||||
template <typename... Args> using CheckArgsNumHelper = CheckArgsNumHelperImpl<std::type_identity_t<Args>...>;
|
||||
template <typename... Args> void formatStringCheckArgsNum(CheckArgsNumHelper<Args...>, Args &&...) {}
|
||||
|
||||
template<typename T> struct FormatStringTypeInfo{ static constexpr bool is_static = true; static constexpr bool has_format = true; };
|
||||
template<typename T> struct FormatStringTypeInfo<fmt::basic_runtime<T>> { static constexpr bool is_static = false; static constexpr bool has_format = false; };
|
||||
template<> struct FormatStringTypeInfo<PreformattedMessage> { static constexpr bool is_static = false; static constexpr bool has_format = true; };
|
||||
|
||||
/// This wrapper helps to avoid too frequent and noisy log messages.
|
||||
/// For each pair (logger_name, format_string) it remembers when such a message was logged the last time.
|
||||
|
@ -15,4 +15,14 @@ template class PODArray<Int8, 4096, Allocator<false>, PADDING_FOR_SIMD - 1, PADD
|
||||
template class PODArray<Int16, 4096, Allocator<false>, PADDING_FOR_SIMD - 1, PADDING_FOR_SIMD>;
|
||||
template class PODArray<Int32, 4096, Allocator<false>, PADDING_FOR_SIMD - 1, PADDING_FOR_SIMD>;
|
||||
template class PODArray<Int64, 4096, Allocator<false>, PADDING_FOR_SIMD - 1, PADDING_FOR_SIMD>;
|
||||
|
||||
template class PODArray<UInt8, 4096, Allocator<false>, 0, 0>;
|
||||
template class PODArray<UInt16, 4096, Allocator<false>, 0, 0>;
|
||||
template class PODArray<UInt32, 4096, Allocator<false>, 0, 0>;
|
||||
template class PODArray<UInt64, 4096, Allocator<false>, 0, 0>;
|
||||
|
||||
template class PODArray<Int8, 4096, Allocator<false>, 0, 0>;
|
||||
template class PODArray<Int16, 4096, Allocator<false>, 0, 0>;
|
||||
template class PODArray<Int32, 4096, Allocator<false>, 0, 0>;
|
||||
template class PODArray<Int64, 4096, Allocator<false>, 0, 0>;
|
||||
}
|
||||
|
@ -783,4 +783,15 @@ extern template class PODArray<Int8, 4096, Allocator<false>, PADDING_FOR_SIMD -
|
||||
extern template class PODArray<Int16, 4096, Allocator<false>, PADDING_FOR_SIMD - 1, PADDING_FOR_SIMD>;
|
||||
extern template class PODArray<Int32, 4096, Allocator<false>, PADDING_FOR_SIMD - 1, PADDING_FOR_SIMD>;
|
||||
extern template class PODArray<Int64, 4096, Allocator<false>, PADDING_FOR_SIMD - 1, PADDING_FOR_SIMD>;
|
||||
|
||||
extern template class PODArray<UInt8, 4096, Allocator<false>, 0, 0>;
|
||||
extern template class PODArray<UInt16, 4096, Allocator<false>, 0, 0>;
|
||||
extern template class PODArray<UInt32, 4096, Allocator<false>, 0, 0>;
|
||||
extern template class PODArray<UInt64, 4096, Allocator<false>, 0, 0>;
|
||||
|
||||
extern template class PODArray<Int8, 4096, Allocator<false>, 0, 0>;
|
||||
extern template class PODArray<Int16, 4096, Allocator<false>, 0, 0>;
|
||||
extern template class PODArray<Int32, 4096, Allocator<false>, 0, 0>;
|
||||
extern template class PODArray<Int64, 4096, Allocator<false>, 0, 0>;
|
||||
|
||||
}
|
||||
|
@ -101,9 +101,6 @@ void ProgressIndication::writeFinalProgress()
|
||||
<< formatReadableSizeWithDecimalSuffix(progress.read_bytes * 1000000000.0 / elapsed_ns) << "/s.)";
|
||||
else
|
||||
std::cout << ". ";
|
||||
auto peak_memory_usage = getMemoryUsage().peak;
|
||||
if (peak_memory_usage >= 0)
|
||||
std::cout << "\nPeak memory usage (for query) " << formatReadableSizeWithBinarySuffix(peak_memory_usage) << ".";
|
||||
}
|
||||
|
||||
void ProgressIndication::writeProgress(WriteBufferFromFileDescriptor & message)
|
||||
|
@ -1,7 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
/// Macros for convenient usage of Poco logger.
|
||||
|
||||
#include <unistd.h>
|
||||
#include <fmt/format.h>
|
||||
#include <Poco/Logger.h>
|
||||
#include <Poco/Message.h>
|
||||
@ -28,6 +28,32 @@ namespace
|
||||
|
||||
#define LOG_IMPL_FIRST_ARG(X, ...) X
|
||||
|
||||
/// Copy-paste from contrib/libpq/include/c.h
|
||||
/// There's no easy way to count the number of arguments without evaluating these arguments...
|
||||
#define CH_VA_ARGS_NARGS(...) \
|
||||
CH_VA_ARGS_NARGS_(__VA_ARGS__, \
|
||||
63,62,61,60, \
|
||||
59,58,57,56,55,54,53,52,51,50, \
|
||||
49,48,47,46,45,44,43,42,41,40, \
|
||||
39,38,37,36,35,34,33,32,31,30, \
|
||||
29,28,27,26,25,24,23,22,21,20, \
|
||||
19,18,17,16,15,14,13,12,11,10, \
|
||||
9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
|
||||
#define CH_VA_ARGS_NARGS_( \
|
||||
_01,_02,_03,_04,_05,_06,_07,_08,_09,_10, \
|
||||
_11,_12,_13,_14,_15,_16,_17,_18,_19,_20, \
|
||||
_21,_22,_23,_24,_25,_26,_27,_28,_29,_30, \
|
||||
_31,_32,_33,_34,_35,_36,_37,_38,_39,_40, \
|
||||
_41,_42,_43,_44,_45,_46,_47,_48,_49,_50, \
|
||||
_51,_52,_53,_54,_55,_56,_57,_58,_59,_60, \
|
||||
_61,_62,_63, N, ...) \
|
||||
(N)
|
||||
|
||||
#define LINE_NUM_AS_STRING_IMPL2(x) #x
|
||||
#define LINE_NUM_AS_STRING_IMPL(x) LINE_NUM_AS_STRING_IMPL2(x)
|
||||
#define LINE_NUM_AS_STRING LINE_NUM_AS_STRING_IMPL(__LINE__)
|
||||
#define MESSAGE_FOR_EXCEPTION_ON_LOGGING "Failed to write a log message: " __FILE__ ":" LINE_NUM_AS_STRING "\n"
|
||||
|
||||
/// Logs a message to a specified logger with that level.
|
||||
/// If more than one argument is provided,
|
||||
/// the first argument is interpreted as a template with {}-substitutions
|
||||
@ -39,21 +65,48 @@ namespace
|
||||
auto _logger = ::getLogger(logger); \
|
||||
const bool _is_clients_log = (DB::CurrentThread::getGroup() != nullptr) && \
|
||||
(DB::CurrentThread::get().getClientLogsLevel() >= (priority)); \
|
||||
if (_is_clients_log || _logger->is((PRIORITY))) \
|
||||
if (!_is_clients_log && !_logger->is((PRIORITY))) \
|
||||
break; \
|
||||
\
|
||||
try \
|
||||
{ \
|
||||
std::string formatted_message = numArgs(__VA_ARGS__) > 1 ? fmt::format(__VA_ARGS__) : firstArg(__VA_ARGS__); \
|
||||
formatStringCheckArgsNum(__VA_ARGS__); \
|
||||
if (auto _channel = _logger->getChannel()) \
|
||||
{ \
|
||||
std::string file_function; \
|
||||
file_function += __FILE__; \
|
||||
file_function += "; "; \
|
||||
file_function += __PRETTY_FUNCTION__; \
|
||||
Poco::Message poco_message(_logger->name(), formatted_message, \
|
||||
(PRIORITY), file_function.c_str(), __LINE__, tryGetStaticFormatString(LOG_IMPL_FIRST_ARG(__VA_ARGS__))); \
|
||||
_channel->log(poco_message); \
|
||||
} \
|
||||
ProfileEvents::incrementForLogMessage(PRIORITY); \
|
||||
auto _channel = _logger->getChannel(); \
|
||||
if (!_channel) \
|
||||
break; \
|
||||
\
|
||||
constexpr size_t _nargs = CH_VA_ARGS_NARGS(__VA_ARGS__); \
|
||||
using LogTypeInfo = FormatStringTypeInfo<std::decay_t<decltype(LOG_IMPL_FIRST_ARG(__VA_ARGS__))>>; \
|
||||
\
|
||||
std::string_view _format_string; \
|
||||
std::string _formatted_message; \
|
||||
\
|
||||
if constexpr (LogTypeInfo::is_static) \
|
||||
{ \
|
||||
formatStringCheckArgsNum(LOG_IMPL_FIRST_ARG(__VA_ARGS__), _nargs - 1); \
|
||||
_format_string = ConstexprIfsAreNotIfdefs<LogTypeInfo::is_static>::getStaticFormatString(LOG_IMPL_FIRST_ARG(__VA_ARGS__)); \
|
||||
} \
|
||||
\
|
||||
constexpr bool is_preformatted_message = !LogTypeInfo::is_static && LogTypeInfo::has_format; \
|
||||
if constexpr (is_preformatted_message) \
|
||||
{ \
|
||||
static_assert(_nargs == 1 || !is_preformatted_message); \
|
||||
ConstexprIfsAreNotIfdefs<is_preformatted_message>::getPreformatted(LOG_IMPL_FIRST_ARG(__VA_ARGS__)).apply(_formatted_message, _format_string); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
_formatted_message = _nargs == 1 ? firstArg(__VA_ARGS__) : fmt::format(__VA_ARGS__); \
|
||||
} \
|
||||
\
|
||||
std::string _file_function = __FILE__ "; "; \
|
||||
_file_function += __PRETTY_FUNCTION__; \
|
||||
Poco::Message _poco_message(_logger->name(), std::move(_formatted_message), \
|
||||
(PRIORITY), _file_function.c_str(), __LINE__, _format_string); \
|
||||
_channel->log(_poco_message); \
|
||||
} \
|
||||
catch (...) \
|
||||
{ \
|
||||
::write(STDERR_FILENO, static_cast<const void *>(MESSAGE_FOR_EXCEPTION_ON_LOGGING), sizeof(MESSAGE_FOR_EXCEPTION_ON_LOGGING)); \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <Common/logger_useful.h>
|
||||
#include <Common/thread_local_rng.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <Poco/Logger.h>
|
||||
@ -50,3 +51,55 @@ TEST(Logger, TestLog)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static size_t global_counter = 0;
|
||||
|
||||
static std::string getLogMessage()
|
||||
{
|
||||
++global_counter;
|
||||
return "test1 " + std::to_string(thread_local_rng());
|
||||
}
|
||||
|
||||
static size_t getLogMessageParam()
|
||||
{
|
||||
++global_counter;
|
||||
return thread_local_rng();
|
||||
}
|
||||
|
||||
static PreformattedMessage getPreformatted()
|
||||
{
|
||||
++global_counter;
|
||||
return PreformattedMessage::create("test3 {}", thread_local_rng());
|
||||
}
|
||||
|
||||
static size_t getLogMessageParamOrThrow()
|
||||
{
|
||||
size_t x = thread_local_rng();
|
||||
if (x % 1000 == 0)
|
||||
return x;
|
||||
throw Poco::Exception("error", 42);
|
||||
}
|
||||
|
||||
TEST(Logger, SideEffects)
|
||||
{
|
||||
std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
|
||||
auto my_channel = Poco::AutoPtr<Poco::StreamChannel>(new Poco::StreamChannel(oss));
|
||||
auto * log = &Poco::Logger::create("Logger", my_channel.get());
|
||||
log->setLevel("trace");
|
||||
|
||||
/// Ensure that parameters are evaluated only once
|
||||
global_counter = 0;
|
||||
LOG_TRACE(log, fmt::runtime(getLogMessage()));
|
||||
EXPECT_EQ(global_counter, 1);
|
||||
LOG_TRACE(log, "test2 {}", getLogMessageParam());
|
||||
EXPECT_EQ(global_counter, 2);
|
||||
LOG_TRACE(log, getPreformatted());
|
||||
EXPECT_EQ(global_counter, 3);
|
||||
|
||||
auto var = PreformattedMessage::create("test4 {}", thread_local_rng());
|
||||
LOG_TRACE(log, var);
|
||||
EXPECT_EQ(var.text.starts_with("test4 "), true);
|
||||
EXPECT_EQ(var.format_string, "test4 {}");
|
||||
|
||||
LOG_TRACE(log, "test no throw {}", getLogMessageParamOrThrow());
|
||||
}
|
||||
|
@ -674,6 +674,7 @@ class IColumn;
|
||||
M(UInt64, remote_read_min_bytes_for_seek, 4 * DBMS_DEFAULT_BUFFER_SIZE, "Min bytes required for remote read (url, s3) to do seek, instead of read with ignore.", 0) \
|
||||
M(UInt64, merge_tree_min_bytes_per_task_for_remote_reading, 4 * DBMS_DEFAULT_BUFFER_SIZE, "Min bytes to read per task.", 0) \
|
||||
M(Bool, merge_tree_use_const_size_tasks_for_remote_reading, true, "Whether to use constant size tasks for reading from a remote table.", 0) \
|
||||
M(Bool, merge_tree_determine_task_size_by_prewhere_columns, true, "Whether to use only prewhere columns size to determine reading task size.", 0) \
|
||||
\
|
||||
M(Bool, async_insert, false, "If true, data from INSERT query is stored in queue and later flushed to table in background. If wait_for_async_insert is false, INSERT query is processed almost instantly, otherwise client will wait until data will be flushed to table", 0) \
|
||||
M(Bool, wait_for_async_insert, true, "If true wait for processing of asynchronous insertion", 0) \
|
||||
@ -953,6 +954,10 @@ class IColumn;
|
||||
M(ParquetVersion, output_format_parquet_version, "2.latest", "Parquet format version for output format. Supported versions: 1.0, 2.4, 2.6 and 2.latest (default)", 0) \
|
||||
M(ParquetCompression, output_format_parquet_compression_method, "lz4", "Compression method for Parquet output format. Supported codecs: snappy, lz4, brotli, zstd, gzip, none (uncompressed)", 0) \
|
||||
M(Bool, output_format_parquet_compliant_nested_types, true, "In parquet file schema, use name 'element' instead of 'item' for list elements. This is a historical artifact of Arrow library implementation. Generally increases compatibility, except perhaps with some old versions of Arrow.", 0) \
|
||||
M(Bool, output_format_parquet_use_custom_encoder, true, "Use experimental faster Parquet encoder implementation.", 0) \
|
||||
M(Bool, output_format_parquet_parallel_encoding, true, "Do Parquet encoding in multiple threads. Requires output_format_parquet_use_custom_encoder.", 0) \
|
||||
M(UInt64, output_format_parquet_data_page_size, 1024 * 1024, "Target page size in bytes, before compression.", 0) \
|
||||
M(UInt64, output_format_parquet_batch_size, 1024, "Check page size every this many rows. Consider decreasing if you have columns with average values size above a few KBs.", 0) \
|
||||
M(String, output_format_avro_codec, "", "Compression codec used for output. Possible values: 'null', 'deflate', 'snappy'.", 0) \
|
||||
M(UInt64, output_format_avro_sync_interval, 16 * 1024, "Sync interval in bytes.", 0) \
|
||||
M(String, output_format_avro_string_column_pattern, "", "For Avro format: regexp of String columns to select as AVRO string.", 0) \
|
||||
|
@ -130,6 +130,10 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
||||
format_settings.parquet.max_block_size = settings.input_format_parquet_max_block_size;
|
||||
format_settings.parquet.output_compression_method = settings.output_format_parquet_compression_method;
|
||||
format_settings.parquet.output_compliant_nested_types = settings.output_format_parquet_compliant_nested_types;
|
||||
format_settings.parquet.use_custom_encoder = settings.output_format_parquet_use_custom_encoder;
|
||||
format_settings.parquet.parallel_encoding = settings.output_format_parquet_parallel_encoding;
|
||||
format_settings.parquet.data_page_size = settings.output_format_parquet_data_page_size;
|
||||
format_settings.parquet.write_batch_size = settings.output_format_parquet_batch_size;
|
||||
format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8;
|
||||
format_settings.pretty.color = settings.output_format_pretty_color;
|
||||
format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width;
|
||||
@ -434,7 +438,7 @@ OutputFormatPtr FormatFactory::getOutputFormatParallelIfPossible(
|
||||
return format;
|
||||
}
|
||||
|
||||
return getOutputFormat(name, buf, sample, context, _format_settings);
|
||||
return getOutputFormat(name, buf, sample, context, format_settings);
|
||||
}
|
||||
|
||||
|
||||
@ -453,6 +457,7 @@ OutputFormatPtr FormatFactory::getOutputFormat(
|
||||
context->getQueryContext()->addQueryFactoriesInfo(Context::QueryLogFactories::Format, name);
|
||||
|
||||
auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);
|
||||
format_settings.max_threads = context->getSettingsRef().max_threads;
|
||||
|
||||
/** TODO: Materialization is needed, because formats can use the functions `IDataType`,
|
||||
* which only work with full columns.
|
||||
|
@ -100,6 +100,8 @@ struct FormatSettings
|
||||
|
||||
UInt64 max_parser_depth = DBMS_DEFAULT_MAX_PARSER_DEPTH;
|
||||
|
||||
size_t max_threads = 1;
|
||||
|
||||
enum class ArrowCompression
|
||||
{
|
||||
NONE,
|
||||
@ -233,10 +235,14 @@ struct FormatSettings
|
||||
bool output_string_as_string = false;
|
||||
bool output_fixed_string_as_fixed_byte_array = true;
|
||||
bool preserve_order = false;
|
||||
bool use_custom_encoder = true;
|
||||
bool parallel_encoding = true;
|
||||
UInt64 max_block_size = 8192;
|
||||
ParquetVersion output_version;
|
||||
ParquetCompression output_compression_method = ParquetCompression::SNAPPY;
|
||||
bool output_compliant_nested_types = true;
|
||||
size_t data_page_size = 1024 * 1024;
|
||||
size_t write_batch_size = 1024;
|
||||
} parquet;
|
||||
|
||||
struct Pretty
|
||||
|
22
src/Functions/FunctionToDecimalString.cpp
Normal file
22
src/Functions/FunctionToDecimalString.cpp
Normal file
@ -0,0 +1,22 @@
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionToDecimalString.h>
|
||||
#include <Functions/IFunction.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
REGISTER_FUNCTION(ToDecimalString)
|
||||
{
|
||||
factory.registerFunction<FunctionToDecimalString>(
|
||||
FunctionDocumentation{
|
||||
.description=R"(
|
||||
Returns string representation of a number. First argument is the number of any numeric type,
|
||||
second argument is the desired number of digits in fractional part. Returns String.
|
||||
|
||||
)",
|
||||
.examples{{"toDecimalString", "SELECT toDecimalString(2.1456,2)", ""}},
|
||||
.categories{"String"}
|
||||
}, FunctionFactory::CaseInsensitive);
|
||||
}
|
||||
|
||||
}
|
262
src/Functions/FunctionToDecimalString.h
Normal file
262
src/Functions/FunctionToDecimalString.h
Normal file
@ -0,0 +1,262 @@
|
||||
#pragma once
|
||||
|
||||
#include <Core/Types.h>
|
||||
#include <Core/DecimalFunctions.h>
|
||||
#include <Functions/IFunction.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include <Columns/ColumnsNumber.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Columns/ColumnVector.h>
|
||||
#include <Columns/ColumnDecimal.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <IO/WriteBufferFromVector.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <Interpreters/Context_fwd.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ILLEGAL_COLUMN;
|
||||
extern const int CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER;
|
||||
}
|
||||
|
||||
class FunctionToDecimalString : public IFunction
|
||||
{
|
||||
public:
|
||||
static constexpr auto name = "toDecimalString";
|
||||
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionToDecimalString>(); }
|
||||
|
||||
String getName() const override { return name; }
|
||||
|
||||
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
|
||||
|
||||
size_t getNumberOfArguments() const override { return 2; }
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
|
||||
{
|
||||
FunctionArgumentDescriptors mandatory_args = {
|
||||
{"Value", &isNumber<IDataType>, nullptr, "Number"},
|
||||
{"precision", &isNativeInteger<IDataType>, &isColumnConst, "const Integer"}
|
||||
};
|
||||
|
||||
validateFunctionArgumentTypes(*this, arguments, mandatory_args, {});
|
||||
|
||||
return std::make_shared<DataTypeString>();
|
||||
}
|
||||
|
||||
bool useDefaultImplementationForConstants() const override { return true; }
|
||||
|
||||
private:
|
||||
/// For operations with Integer/Float
|
||||
template <typename FromVectorType>
|
||||
void vectorConstant(const FromVectorType & vec_from, UInt8 precision,
|
||||
ColumnString::Chars & vec_to, ColumnString::Offsets & result_offsets) const
|
||||
{
|
||||
size_t input_rows_count = vec_from.size();
|
||||
result_offsets.resize(input_rows_count);
|
||||
|
||||
/// A buffer is used here and in the functions below because the resulting size cannot be precisely anticipated,
|
||||
/// and the buffer resizes on the go. Also, .count() provided by the buffer is convenient in this case.
|
||||
WriteBufferFromVector<ColumnString::Chars> buf_to(vec_to);
|
||||
|
||||
for (size_t i = 0; i < input_rows_count; ++i)
|
||||
{
|
||||
format(vec_from[i], buf_to, precision);
|
||||
result_offsets[i] = buf_to.count();
|
||||
}
|
||||
|
||||
buf_to.finalize();
|
||||
}
|
||||
|
||||
template <typename FirstArgVectorType>
|
||||
void vectorVector(const FirstArgVectorType & vec_from, const ColumnVector<UInt8>::Container & vec_precision,
|
||||
ColumnString::Chars & vec_to, ColumnString::Offsets & result_offsets) const
|
||||
{
|
||||
size_t input_rows_count = vec_from.size();
|
||||
result_offsets.resize(input_rows_count);
|
||||
|
||||
WriteBufferFromVector<ColumnString::Chars> buf_to(vec_to);
|
||||
|
||||
constexpr size_t max_digits = std::numeric_limits<UInt256>::digits10;
|
||||
|
||||
for (size_t i = 0; i < input_rows_count; ++i)
|
||||
{
|
||||
if (vec_precision[i] > max_digits)
|
||||
throw DB::Exception(DB::ErrorCodes::CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER,
|
||||
"Too many fractional digits requested, shall not be more than {}", max_digits);
|
||||
format(vec_from[i], buf_to, vec_precision[i]);
|
||||
result_offsets[i] = buf_to.count();
|
||||
}
|
||||
|
||||
buf_to.finalize();
|
||||
}
|
||||
|
||||
/// For operations with Decimal
|
||||
template <typename FirstArgVectorType>
|
||||
void vectorConstant(const FirstArgVectorType & vec_from, UInt8 precision,
|
||||
ColumnString::Chars & vec_to, ColumnString::Offsets & result_offsets, UInt8 from_scale) const
|
||||
{
|
||||
/// There are no more than 77 significant digits (that is the maximum length of UInt256), so we can limit it to 77.
|
||||
constexpr size_t max_digits = std::numeric_limits<UInt256>::digits10;
|
||||
if (precision > max_digits)
|
||||
throw DB::Exception(DB::ErrorCodes::CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER,
|
||||
"Too many fractional digits requested for Decimal, must not be more than {}", max_digits);
|
||||
|
||||
WriteBufferFromVector<ColumnString::Chars> buf_to(vec_to);
|
||||
size_t input_rows_count = vec_from.size();
|
||||
result_offsets.resize(input_rows_count);
|
||||
|
||||
for (size_t i = 0; i < input_rows_count; ++i)
|
||||
{
|
||||
writeText(vec_from[i], from_scale, buf_to, true, true, precision);
|
||||
writeChar(0, buf_to);
|
||||
result_offsets[i] = buf_to.count();
|
||||
}
|
||||
buf_to.finalize();
|
||||
}
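/// (For example, a Decimal32 value 2.1456 with scale 4 and precision 2 is written above as "2.15".)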
|
||||
|
||||
template <typename FirstArgVectorType>
|
||||
void vectorVector(const FirstArgVectorType & vec_from, const ColumnVector<UInt8>::Container & vec_precision,
|
||||
ColumnString::Chars & vec_to, ColumnString::Offsets & result_offsets, UInt8 from_scale) const
|
||||
{
|
||||
size_t input_rows_count = vec_from.size();
|
||||
result_offsets.resize(input_rows_count);
|
||||
|
||||
WriteBufferFromVector<ColumnString::Chars> buf_to(vec_to);
|
||||
|
||||
constexpr size_t max_digits = std::numeric_limits<UInt256>::digits10;
|
||||
|
||||
for (size_t i = 0; i < input_rows_count; ++i)
|
||||
{
|
||||
if (vec_precision[i] > max_digits)
|
||||
throw DB::Exception(DB::ErrorCodes::CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER,
|
||||
"Too many fractional digits requested for Decimal, must not be more than {}", max_digits);
|
||||
writeText(vec_from[i], from_scale, buf_to, true, true, vec_precision[i]);
|
||||
writeChar(0, buf_to);
|
||||
result_offsets[i] = buf_to.count();
|
||||
}
|
||||
buf_to.finalize();
|
||||
}
|
||||
|
||||
template <is_floating_point T>
|
||||
static void format(T value, DB::WriteBuffer & out, UInt8 precision)
|
||||
{
|
||||
/// Maximum of 60 is hard-coded in 'double-conversion/double-conversion.h' for floating point values,
|
||||
/// Catch this here to give the user a more reasonable error.
|
||||
if (precision > 60)
|
||||
throw DB::Exception(DB::ErrorCodes::CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER,
|
||||
"Too high precision requested for Float, must not be more than 60, got {}", Int8(precision));
|
||||
|
||||
DB::DoubleConverter<false>::BufferType buffer;
|
||||
double_conversion::StringBuilder builder{buffer, sizeof(buffer)};
|
||||
|
||||
const auto result = DB::DoubleConverter<false>::instance().ToFixed(value, precision, &builder);
|
||||
|
||||
if (!result)
|
||||
throw DB::Exception(DB::ErrorCodes::CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER, "Error processing number: {}", value);
|
||||
|
||||
out.write(buffer, builder.position());
|
||||
writeChar(0, out);
|
||||
}
|
||||
|
||||
template <is_integer T>
|
||||
static void format(T value, DB::WriteBuffer & out, UInt8 precision)
|
||||
{
|
||||
/// The fractional part of an Integer is just trailing zeros. Let's limit it to 77 (as with Decimals).
|
||||
constexpr size_t max_digits = std::numeric_limits<UInt256>::digits10;
|
||||
if (precision > max_digits)
|
||||
throw DB::Exception(DB::ErrorCodes::CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER,
|
||||
"Too many fractional digits requested, shall not be more than {}", max_digits);
|
||||
writeText(value, out);
|
||||
if (precision > 0) [[likely]]
|
||||
{
|
||||
writeChar('.', out);
|
||||
for (int i = 0; i < precision; ++i)
|
||||
writeChar('0', out);
|
||||
writeChar(0, out);
|
||||
}
|
||||
}
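/// (For example, format(5, out, 3) above writes "5.000" followed by the terminating zero byte.)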
|
||||
|
||||
public:
|
||||
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override
|
||||
{
|
||||
switch (arguments[0].type->getTypeId())
|
||||
{
|
||||
case TypeIndex::UInt8: return executeType<UInt8>(arguments);
|
||||
case TypeIndex::UInt16: return executeType<UInt16>(arguments);
|
||||
case TypeIndex::UInt32: return executeType<UInt32>(arguments);
|
||||
case TypeIndex::UInt64: return executeType<UInt64>(arguments);
|
||||
case TypeIndex::UInt128: return executeType<UInt128>(arguments);
|
||||
case TypeIndex::UInt256: return executeType<UInt256>(arguments);
|
||||
case TypeIndex::Int8: return executeType<Int8>(arguments);
|
||||
case TypeIndex::Int16: return executeType<Int16>(arguments);
|
||||
case TypeIndex::Int32: return executeType<Int32>(arguments);
|
||||
case TypeIndex::Int64: return executeType<Int64>(arguments);
|
||||
case TypeIndex::Int128: return executeType<Int128>(arguments);
|
||||
case TypeIndex::Int256: return executeType<Int256>(arguments);
|
||||
case TypeIndex::Float32: return executeType<Float32>(arguments);
|
||||
case TypeIndex::Float64: return executeType<Float64>(arguments);
|
||||
case TypeIndex::Decimal32: return executeType<Decimal32>(arguments);
|
||||
case TypeIndex::Decimal64: return executeType<Decimal64>(arguments);
|
||||
case TypeIndex::Decimal128: return executeType<Decimal128>(arguments);
|
||||
case TypeIndex::Decimal256: return executeType<Decimal256>(arguments);
|
||||
default:
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}",
|
||||
arguments[0].column->getName(), getName());
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
ColumnPtr executeType(const ColumnsWithTypeAndName & arguments) const
|
||||
{
|
||||
const auto * precision_col = checkAndGetColumn<ColumnVector<UInt8>>(arguments[1].column.get());
|
||||
const auto * precision_col_const = checkAndGetColumnConst<ColumnVector<UInt8>>(arguments[1].column.get());
|
||||
|
||||
auto result_col = ColumnString::create();
|
||||
auto * result_col_string = assert_cast<ColumnString *>(result_col.get());
|
||||
ColumnString::Chars & result_chars = result_col_string->getChars();
|
||||
ColumnString::Offsets & result_offsets = result_col_string->getOffsets();
|
||||
|
||||
if constexpr (is_decimal<T>)
|
||||
{
|
||||
const auto * from_col = checkAndGetColumn<ColumnDecimal<T>>(arguments[0].column.get());
|
||||
UInt8 from_scale = from_col->getScale();
|
||||
|
||||
if (from_col)
|
||||
{
|
||||
if (precision_col_const)
|
||||
vectorConstant(from_col->getData(), precision_col_const->template getValue<UInt8>(), result_chars, result_offsets, from_scale);
|
||||
else if (precision_col)
|
||||
vectorVector(from_col->getData(), precision_col->getData(), result_chars, result_offsets, from_scale);
|
||||
else
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of second argument of function formatDecimal", arguments[1].column->getName());
|
||||
}
|
||||
else
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function formatDecimal", arguments[0].column->getName());
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto * from_col = checkAndGetColumn<ColumnVector<T>>(arguments[0].column.get());
|
||||
if (from_col)
|
||||
{
|
||||
if (precision_col_const)
|
||||
vectorConstant(from_col->getData(), precision_col_const->template getValue<UInt8>(), result_chars, result_offsets);
|
||||
else if (precision_col)
|
||||
vectorVector(from_col->getData(), precision_col->getData(), result_chars, result_offsets);
|
||||
else
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of second argument of function formatDecimal", arguments[1].column->getName());
|
||||
|
||||
}
|
||||
else
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function formatDecimal", arguments[0].column->getName());
|
||||
}
|
||||
|
||||
return result_col;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
@ -905,26 +905,26 @@ inline void writeText(const IPv4 & x, WriteBuffer & buf) { writeIPv4Text(x, buf)
|
||||
inline void writeText(const IPv6 & x, WriteBuffer & buf) { writeIPv6Text(x, buf); }
|
||||
|
||||
template <typename T>
|
||||
void writeDecimalFractional(const T & x, UInt32 scale, WriteBuffer & ostr, bool trailing_zeros)
|
||||
void writeDecimalFractional(const T & x, UInt32 scale, WriteBuffer & ostr, bool trailing_zeros,
|
||||
bool fixed_fractional_length, UInt32 fractional_length)
|
||||
{
|
||||
/// If it's big integer, but the number of digits is small,
|
||||
/// use the implementation for smaller integers for more efficient arithmetic.
|
||||
|
||||
if constexpr (std::is_same_v<T, Int256>)
|
||||
{
|
||||
if (x <= std::numeric_limits<UInt32>::max())
|
||||
{
|
||||
writeDecimalFractional(static_cast<UInt32>(x), scale, ostr, trailing_zeros);
|
||||
writeDecimalFractional(static_cast<UInt32>(x), scale, ostr, trailing_zeros, fixed_fractional_length, fractional_length);
|
||||
return;
|
||||
}
|
||||
else if (x <= std::numeric_limits<UInt64>::max())
|
||||
{
|
||||
writeDecimalFractional(static_cast<UInt64>(x), scale, ostr, trailing_zeros);
|
||||
writeDecimalFractional(static_cast<UInt64>(x), scale, ostr, trailing_zeros, fixed_fractional_length, fractional_length);
|
||||
return;
|
||||
}
|
||||
else if (x <= std::numeric_limits<UInt128>::max())
|
||||
{
|
||||
writeDecimalFractional(static_cast<UInt128>(x), scale, ostr, trailing_zeros);
|
||||
writeDecimalFractional(static_cast<UInt128>(x), scale, ostr, trailing_zeros, fixed_fractional_length, fractional_length);
|
||||
return;
|
||||
}
|
||||
}
|
||||
@ -932,24 +932,36 @@ void writeDecimalFractional(const T & x, UInt32 scale, WriteBuffer & ostr, bool
|
||||
{
|
||||
if (x <= std::numeric_limits<UInt32>::max())
|
||||
{
|
||||
writeDecimalFractional(static_cast<UInt32>(x), scale, ostr, trailing_zeros);
|
||||
writeDecimalFractional(static_cast<UInt32>(x), scale, ostr, trailing_zeros, fixed_fractional_length, fractional_length);
|
||||
return;
|
||||
}
|
||||
else if (x <= std::numeric_limits<UInt64>::max())
|
||||
{
|
||||
writeDecimalFractional(static_cast<UInt64>(x), scale, ostr, trailing_zeros);
|
||||
writeDecimalFractional(static_cast<UInt64>(x), scale, ostr, trailing_zeros, fixed_fractional_length, fractional_length);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
constexpr size_t max_digits = std::numeric_limits<UInt256>::digits10;
|
||||
assert(scale <= max_digits);
|
||||
assert(fractional_length <= max_digits);
|
||||
|
||||
char buf[max_digits];
|
||||
memset(buf, '0', scale);
|
||||
memset(buf, '0', std::max(scale, fractional_length));
|
||||
|
||||
T value = x;
|
||||
Int32 last_nonzero_pos = 0;
|
||||
for (Int32 pos = scale - 1; pos >= 0; --pos)
|
||||
|
||||
if (fixed_fractional_length && fractional_length < scale)
|
||||
{
|
||||
T new_value = value / DecimalUtils::scaleMultiplier<Int256>(scale - fractional_length - 1);
|
||||
auto round_carry = new_value % 10;
|
||||
value = new_value / 10;
|
||||
if (round_carry >= 5)
|
||||
value += 1;
|
||||
}
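/// Worked example (illustrative): with scale = 4, fractional_length = 2 and a fractional part of 4567,
/// new_value = 4567 / 10^(4 - 2 - 1) = 456, round_carry = 6, value becomes 45 and is rounded up to 46,
/// so ".46" is written instead of the full ".4567".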
|
||||
|
||||
for (Int32 pos = fixed_fractional_length ? std::min(scale - 1, fractional_length - 1) : scale - 1; pos >= 0; --pos)
|
||||
{
|
||||
auto remainder = value % 10;
|
||||
value /= 10;
|
||||
@ -961,11 +973,12 @@ void writeDecimalFractional(const T & x, UInt32 scale, WriteBuffer & ostr, bool
|
||||
}
|
||||
|
||||
writeChar('.', ostr);
|
||||
ostr.write(buf, trailing_zeros ? scale : last_nonzero_pos + 1);
|
||||
ostr.write(buf, fixed_fractional_length ? fractional_length : (trailing_zeros ? scale : last_nonzero_pos + 1));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void writeText(Decimal<T> x, UInt32 scale, WriteBuffer & ostr, bool trailing_zeros)
|
||||
void writeText(Decimal<T> x, UInt32 scale, WriteBuffer & ostr, bool trailing_zeros,
|
||||
bool fixed_fractional_length = false, UInt32 fractional_length = 0)
|
||||
{
|
||||
T part = DecimalUtils::getWholePart(x, scale);
|
||||
|
||||
@ -976,7 +989,7 @@ void writeText(Decimal<T> x, UInt32 scale, WriteBuffer & ostr, bool trailing_zer
|
||||
|
||||
writeIntText(part, ostr);
|
||||
|
||||
if (scale)
|
||||
if (scale || (fixed_fractional_length && fractional_length > 0))
|
||||
{
|
||||
part = DecimalUtils::getFractionalPart(x, scale);
|
||||
if (part || trailing_zeros)
|
||||
@ -984,7 +997,7 @@ void writeText(Decimal<T> x, UInt32 scale, WriteBuffer & ostr, bool trailing_zer
|
||||
if (part < 0)
|
||||
part *= T(-1);
|
||||
|
||||
writeDecimalFractional(part, scale, ostr, trailing_zeros);
|
||||
writeDecimalFractional(part, scale, ostr, trailing_zeros, fixed_fractional_length, fractional_length);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1210,22 +1210,16 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data &
|
||||
else if (data.is_create_parameterized_view && query_parameter)
|
||||
{
|
||||
const auto data_type = DataTypeFactory::instance().get(query_parameter->type);
|
||||
/// Use getUniqueName() to allow multiple use of query parameter in the query:
|
||||
///
|
||||
/// CREATE VIEW view AS
|
||||
/// SELECT *
|
||||
/// FROM system.one
|
||||
/// WHERE dummy = {k1:Int}+1 OR dummy = {k1:Int}+2
|
||||
/// ^^ ^^
|
||||
///
|
||||
/// NOTE: query in the VIEW will not be modified this is needed
|
||||
/// only during analysis for CREATE VIEW to avoid duplicated
|
||||
/// column names.
|
||||
ColumnWithTypeAndName column(data_type, data.getUniqueName("__" + query_parameter->getColumnName()));
|
||||
/// During analysis for CREATE VIEW of a parameterized view, if a parameter is
|
||||
/// used multiple times, the column is only added once.
|
||||
if (!data.hasColumn(query_parameter->name))
|
||||
{
|
||||
ColumnWithTypeAndName column(data_type, query_parameter->name);
|
||||
data.addColumn(column);
|
||||
}
|
||||
|
||||
argument_types.push_back(data_type);
|
||||
argument_names.push_back(column.name);
|
||||
argument_names.push_back(query_parameter->name);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -21,6 +21,7 @@
|
||||
#include <Core/BackgroundSchedulePool.h>
|
||||
#include <Formats/FormatFactory.h>
|
||||
#include <Databases/IDatabase.h>
|
||||
#include <Server/ServerType.h>
|
||||
#include <Storages/IStorage.h>
|
||||
#include <Storages/MarkCache.h>
|
||||
#include <Storages/MergeTree/MergeList.h>
|
||||
@ -357,6 +358,9 @@ struct ContextSharedPart : boost::noncopyable
|
||||
|
||||
Context::ConfigReloadCallback config_reload_callback;
|
||||
|
||||
Context::StartStopServersCallback start_servers_callback;
|
||||
Context::StartStopServersCallback stop_servers_callback;
|
||||
|
||||
bool is_server_completely_started = false;
|
||||
|
||||
#if USE_ROCKSDB
|
||||
@ -3688,6 +3692,36 @@ void Context::reloadConfig() const
|
||||
shared->config_reload_callback();
|
||||
}
|
||||
|
||||
void Context::setStartServersCallback(StartStopServersCallback && callback)
|
||||
{
|
||||
/// Is initialized at server startup, so lock isn't required. Otherwise use mutex.
|
||||
shared->start_servers_callback = std::move(callback);
|
||||
}
|
||||
|
||||
void Context::setStopServersCallback(StartStopServersCallback && callback)
|
||||
{
|
||||
/// Is initialized at server startup, so lock isn't required. Otherwise use mutex.
|
||||
shared->stop_servers_callback = std::move(callback);
|
||||
}
|
||||
|
||||
void Context::startServers(const ServerType & server_type) const
|
||||
{
|
||||
/// Use mutex if callback may be changed after startup.
|
||||
if (!shared->start_servers_callback)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't start servers because start_servers_callback is not set.");
|
||||
|
||||
shared->start_servers_callback(server_type);
|
||||
}
|
||||
|
||||
void Context::stopServers(const ServerType & server_type) const
|
||||
{
|
||||
/// Use mutex if callback may be changed after startup.
|
||||
if (!shared->stop_servers_callback)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't stop servers because stop_servers_callback is not set.");
|
||||
|
||||
shared->stop_servers_callback(server_type);
|
||||
}
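A minimal wiring sketch (illustrative; the lambda bodies and helper names are assumptions, not part of this change) of how a server could register these hooks once at startup, so that SYSTEM START LISTEN / SYSTEM STOP LISTEN reach the actual listeners through Context::startServers() and Context::stopServers():

/// Hypothetical registration during server startup:
global_context->setStartServersCallback([&](const ServerType & server_type)
{
    startListenersFor(server_type);   /// hypothetical helper that (re)creates the matching listeners
});
global_context->setStopServersCallback([&](const ServerType & server_type)
{
    stopListenersFor(server_type);    /// hypothetical helper that shuts the matching listeners down
});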
|
||||
|
||||
|
||||
void Context::shutdown()
|
||||
{
|
||||
|
@ -134,6 +134,7 @@ using StoragePolicyPtr = std::shared_ptr<const IStoragePolicy>;
|
||||
using StoragePoliciesMap = std::map<String, StoragePolicyPtr>;
|
||||
class StoragePolicySelector;
|
||||
using StoragePolicySelectorPtr = std::shared_ptr<const StoragePolicySelector>;
|
||||
class ServerType;
|
||||
template <class Queue>
|
||||
class MergeTreeBackgroundExecutor;
|
||||
|
||||
@ -1057,6 +1058,13 @@ public:
|
||||
void setConfigReloadCallback(ConfigReloadCallback && callback);
|
||||
void reloadConfig() const;
|
||||
|
||||
using StartStopServersCallback = std::function<void(const ServerType &)>;
|
||||
void setStartServersCallback(StartStopServersCallback && callback);
|
||||
void setStopServersCallback(StartStopServersCallback && callback);
|
||||
|
||||
void startServers(const ServerType & server_type) const;
|
||||
void stopServers(const ServerType & server_type) const;
|
||||
|
||||
void shutdown();
|
||||
|
||||
bool isInternalQuery() const { return is_internal_query; }
|
||||
|
@ -349,6 +349,15 @@ DatabaseAndTable DatabaseCatalog::getTableImpl(
|
||||
|
||||
DatabasePtr database;
|
||||
{
|
||||
// Callers assume that this method doesn't throw exceptions, but getDatabaseName() will throw if there is no database part.
|
||||
// So, fail early and gracefully...
|
||||
if (!table_id.hasDatabase())
|
||||
{
|
||||
if (exception)
|
||||
exception->emplace(Exception(ErrorCodes::UNKNOWN_DATABASE, "Empty database name"));
|
||||
return {};
|
||||
}
|
||||
|
||||
std::lock_guard lock{databases_mutex};
|
||||
auto it = databases.find(table_id.getDatabaseName());
|
||||
if (databases.end() == it)
|
||||
|
@ -556,6 +556,14 @@ BlockIO InterpreterSystemQuery::execute()
|
||||
);
|
||||
break;
|
||||
}
|
||||
case Type::STOP_LISTEN:
|
||||
getContext()->checkAccess(AccessType::SYSTEM_LISTEN);
|
||||
getContext()->stopServers(query.server_type);
|
||||
break;
|
||||
case Type::START_LISTEN:
|
||||
getContext()->checkAccess(AccessType::SYSTEM_LISTEN);
|
||||
getContext()->startServers(query.server_type);
|
||||
break;
|
||||
case Type::FLUSH_ASYNC_INSERT_QUEUE:
|
||||
{
|
||||
getContext()->checkAccess(AccessType::SYSTEM_FLUSH_ASYNC_INSERT_QUEUE);
|
||||
@ -567,9 +575,6 @@ BlockIO InterpreterSystemQuery::execute()
|
||||
queue->flushAll();
|
||||
break;
|
||||
}
|
||||
case Type::STOP_LISTEN_QUERIES:
|
||||
case Type::START_LISTEN_QUERIES:
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "{} is not supported yet", query.type);
|
||||
case Type::STOP_THREAD_FUZZER:
|
||||
getContext()->checkAccess(AccessType::SYSTEM_THREAD_FUZZER);
|
||||
ThreadFuzzer::stop();
|
||||
@ -1181,8 +1186,12 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster()
|
||||
required_access.emplace_back(AccessType::SYSTEM_SYNC_FILE_CACHE);
|
||||
break;
|
||||
}
|
||||
case Type::STOP_LISTEN_QUERIES:
|
||||
case Type::START_LISTEN_QUERIES:
|
||||
case Type::STOP_LISTEN:
|
||||
case Type::START_LISTEN:
|
||||
{
|
||||
required_access.emplace_back(AccessType::SYSTEM_LISTEN);
|
||||
break;
|
||||
}
|
||||
case Type::STOP_THREAD_FUZZER:
|
||||
case Type::START_THREAD_FUZZER:
|
||||
case Type::ENABLE_FAILPOINT:
|
||||
|
@ -220,6 +220,17 @@ void ASTSystemQuery::formatImpl(const FormatSettings & settings, FormatState &,
|
||||
{
|
||||
settings.ostr << (settings.hilite ? hilite_none : "");
|
||||
}
|
||||
else if (type == Type::START_LISTEN || type == Type::STOP_LISTEN)
|
||||
{
|
||||
settings.ostr << (settings.hilite ? hilite_keyword : "") << " " << ServerType::serverTypeToString(server_type.type)
|
||||
<< (settings.hilite ? hilite_none : "");
|
||||
|
||||
if (server_type.type == ServerType::CUSTOM)
|
||||
{
|
||||
settings.ostr << (settings.hilite ? hilite_identifier : "") << " " << backQuoteIfNeed(server_type.custom_name);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include <Parsers/ASTQueryWithOnCluster.h>
|
||||
#include <Parsers/IAST.h>
|
||||
#include <Parsers/SyncReplicaMode.h>
|
||||
#include <Server/ServerType.h>
|
||||
|
||||
#include "config.h"
|
||||
|
||||
@ -35,8 +36,8 @@ public:
|
||||
#if USE_AWS_S3
|
||||
DROP_S3_CLIENT_CACHE,
|
||||
#endif
|
||||
STOP_LISTEN_QUERIES,
|
||||
START_LISTEN_QUERIES,
|
||||
STOP_LISTEN,
|
||||
START_LISTEN,
|
||||
RESTART_REPLICAS,
|
||||
RESTART_REPLICA,
|
||||
RESTORE_REPLICA,
|
||||
@ -116,6 +117,8 @@ public:
|
||||
|
||||
SyncReplicaMode sync_replica_mode = SyncReplicaMode::DEFAULT;
|
||||
|
||||
ServerType server_type;
|
||||
|
||||
String getID(char) const override { return "SYSTEM query"; }
|
||||
|
||||
ASTPtr clone() const override
|
||||
|
@ -442,6 +442,42 @@ bool ParserSystemQuery::parseImpl(IParser::Pos & pos, ASTPtr & node, Expected &
|
||||
break;
|
||||
}
|
||||
|
||||
case Type::START_LISTEN:
|
||||
case Type::STOP_LISTEN:
|
||||
{
|
||||
if (!parseQueryWithOnCluster(res, pos, expected))
|
||||
return false;
|
||||
|
||||
ServerType::Type current_type = ServerType::Type::END;
|
||||
std::string current_custom_name;
|
||||
|
||||
for (const auto & type : magic_enum::enum_values<ServerType::Type>())
|
||||
{
|
||||
if (ParserKeyword{ServerType::serverTypeToString(type)}.ignore(pos, expected))
|
||||
{
|
||||
current_type = type;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (current_type == ServerType::Type::END)
|
||||
return false;
|
||||
|
||||
if (current_type == ServerType::CUSTOM)
|
||||
{
|
||||
ASTPtr ast;
|
||||
|
||||
if (!ParserStringLiteral{}.parse(pos, ast, expected))
|
||||
return false;
|
||||
|
||||
current_custom_name = ast->as<ASTLiteral &>().value.get<const String &>();
|
||||
}
|
||||
|
||||
res->server_type = ServerType(current_type, current_custom_name);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
{
|
||||
if (!parseQueryWithOnCluster(res, pos, expected))
|
||||
|
@ -3,8 +3,8 @@ set(SRCS)
|
||||
clickhouse_add_executable(lexer lexer.cpp ${SRCS})
|
||||
target_link_libraries(lexer PRIVATE clickhouse_parsers)
|
||||
|
||||
clickhouse_add_executable(select_parser select_parser.cpp ${SRCS})
|
||||
clickhouse_add_executable(select_parser select_parser.cpp ${SRCS} "../../Server/ServerType.cpp")
|
||||
target_link_libraries(select_parser PRIVATE clickhouse_parsers)
|
||||
|
||||
clickhouse_add_executable(create_parser create_parser.cpp ${SRCS})
|
||||
clickhouse_add_executable(create_parser create_parser.cpp ${SRCS} "../../Server/ServerType.cpp")
|
||||
target_link_libraries(create_parser PRIVATE clickhouse_parsers)
|
||||
|
@ -684,9 +684,6 @@ namespace DB
|
||||
bool output_fixed_string_as_fixed_byte_array,
|
||||
std::unordered_map<String, MutableColumnPtr> & dictionary_values)
|
||||
{
|
||||
const String column_type_name = column_type->getFamilyName();
|
||||
WhichDataType which(column_type);
|
||||
|
||||
switch (column_type->getTypeId())
|
||||
{
|
||||
case TypeIndex::Nullable:
|
||||
@ -796,7 +793,7 @@ namespace DB
|
||||
FOR_INTERNAL_NUMERIC_TYPES(DISPATCH)
|
||||
#undef DISPATCH
|
||||
default:
|
||||
throw Exception(ErrorCodes::UNKNOWN_TYPE, "Internal type '{}' of a column '{}' is not supported for conversion into {} data format.", column_type_name, column_name, format_name);
|
||||
throw Exception(ErrorCodes::UNKNOWN_TYPE, "Internal type '{}' of a column '{}' is not supported for conversion into {} data format.", column_type->getFamilyName(), column_name, format_name);
|
||||
}
|
||||
}
|
||||
|
||||
|
628
src/Processors/Formats/Impl/Parquet/PrepareForWrite.cpp
Normal file
@ -0,0 +1,628 @@
|
||||
#include "Processors/Formats/Impl/Parquet/Write.h"
|
||||
|
||||
#include <Columns/MaskOperations.h>
|
||||
#include <Columns/ColumnFixedString.h>
|
||||
#include <Columns/ColumnNullable.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Columns/ColumnArray.h>
|
||||
#include <Columns/ColumnTuple.h>
|
||||
#include <Columns/ColumnLowCardinality.h>
|
||||
#include <Columns/ColumnMap.h>
|
||||
#include <DataTypes/DataTypeNullable.h>
|
||||
#include <DataTypes/DataTypesDecimal.h>
|
||||
#include <DataTypes/DataTypeArray.h>
|
||||
#include <DataTypes/DataTypeTuple.h>
|
||||
#include <DataTypes/DataTypeLowCardinality.h>
|
||||
#include <DataTypes/DataTypeMap.h>
|
||||
#include <DataTypes/DataTypeDateTime64.h>
|
||||
#include <DataTypes/DataTypeFixedString.h>
/// This file deals with schema conversion and with repetition and definition levels.
|
||||
|
||||
/// Schema conversion is pretty straightforward.
|
||||
|
||||
/// "Repetition and definition levels" are a somewhat tricky way of encoding information about
|
||||
/// optional fields and lists.
|
||||
///
|
||||
/// If you don't want to learn how these work, feel free to skip the updateRepDefLevels* functions.
|
||||
/// All you need to know is:
|
||||
/// * values for nulls are not encoded, so we have to filter nullable columns,
|
||||
/// * information about all array lengths and nulls is encoded in the arrays `def` and `rep`,
|
||||
/// which need to be encoded next to the data,
|
||||
/// * `def` and `rep` arrays can be longer than `primitive_column`, because they include nulls and
|
||||
/// empty arrays; the values in primitive_column correspond to positions where def[i] == max_def.
|
||||
///
|
||||
/// If you do want to learn it, dremel paper: https://research.google/pubs/pub36632/
|
||||
/// Instead of reading the whole paper, try staring at figures 2-3 for a while - it might be enough.
|
||||
/// (Why does Parquet do all this instead of just storing array lengths and null masks? I'm not
|
||||
/// really sure.)
|
||||
///
|
||||
/// We calculate the levels recursively, from inner to outer columns.
|
||||
/// This means scanning the whole array for each Array/Nullable nesting level, which is probably not
|
||||
/// the most efficient way to do it. But there's usually at most one nesting level, so it's fine.
|
||||
///
|
||||
/// Most of this is moot because ClickHouse doesn't support nullable arrays or tuples right now, so
|
||||
/// almost none of the tricky cases can happen. We implement it in full generality anyway (mostly
|
||||
/// because I only learned the previous sentence after writing most of the code).
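/// A worked example, assuming the scheme described above: for a column of type Array(Nullable(Int64))
/// with three rows [[1, NULL], [], [2]] we end up with max_rep = 1, max_def = 2 and
///     rep = [0, 1, 0, 0]   (0 starts a new row, 1 continues the current array)
///     def = [2, 1, 0, 2]   (2 = non-null value, 1 = null inside an array, 0 = empty array)
/// while primitive_column is filtered down to [1, 2], i.e. only the positions where def[i] == max_def.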
|
||||
|
||||
|
||||
namespace DB::ErrorCodes
|
||||
{
|
||||
extern const int UNKNOWN_TYPE;
|
||||
extern const int TOO_DEEP_RECURSION; // I'm 14 and this is deep
|
||||
extern const int UNKNOWN_COMPRESSION_METHOD;
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
namespace DB::Parquet
|
||||
{
|
||||
|
||||
/// Thrift structs that Parquet uses for various metadata inside the parquet file.
|
||||
namespace parq = parquet::format;
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
void assertNoDefOverflow(ColumnChunkWriteState & s)
|
||||
{
|
||||
if (s.max_def == UINT8_MAX)
|
||||
throw Exception(ErrorCodes::TOO_DEEP_RECURSION,
|
||||
"Column has more than 255 levels of nested Array/Nullable. Impressive! Unfortunately, "
|
||||
"this is not supported by this Parquet encoder (but is supported by Parquet, if you "
|
||||
"really need this for some reason).");
|
||||
}
|
||||
|
||||
void updateRepDefLevelsAndFilterColumnForNullable(ColumnChunkWriteState & s, const NullMap & null_map)
|
||||
{
|
||||
/// Increment definition levels for non-nulls.
|
||||
/// Filter the column to contain only non-null values.
|
||||
|
||||
assertNoDefOverflow(s);
|
||||
++s.max_def;
|
||||
|
||||
/// Normal case: no arrays or nullables inside this nullable.
|
||||
if (s.max_def == 1)
|
||||
{
|
||||
chassert(s.def.empty());
|
||||
s.def.resize(null_map.size());
|
||||
for (size_t i = 0; i < s.def.size(); ++i)
|
||||
s.def[i] = !null_map[i];
|
||||
|
||||
/// We could be more efficient with this:
|
||||
/// * Instead of doing the filter() here, we could defer it to writeColumnChunkBody(), at
|
||||
/// least in the simple case of Nullable(Primitive). Then it'll parallelize if the table
|
||||
/// consists of one big tuple.
|
||||
/// * Instead of filtering explicitly, we could build filtering into the data encoder.
|
||||
/// * Instead of filling out the `def` values above, we could point to null_map and build
|
||||
/// the '!' into the encoder.
|
||||
/// None of these seem worth the complexity right now.
|
||||
s.primitive_column = s.primitive_column->filter(s.def, /*result_size_hint*/ -1);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/// Weird general case: Nullable(Array), Nullable(Nullable), or any arbitrary nesting like that.
|
||||
/// This is currently not allowed in ClickHouse, but let's support it anyway just in case.
|
||||
|
||||
IColumn::Filter filter;
|
||||
size_t row_idx = static_cast<size_t>(-1);
|
||||
for (size_t i = 0; i < s.def.size(); ++i)
|
||||
{
|
||||
row_idx += s.max_rep == 0 || s.rep[i] == 0;
|
||||
if (s.def[i] == s.max_def - 1)
|
||||
filter.push_back(!null_map[row_idx]);
|
||||
s.def[i] += !null_map[row_idx];
|
||||
}
|
||||
s.primitive_column = s.primitive_column->filter(filter, /*result_size_hint*/ -1);
|
||||
}
|
||||
|
||||
void updateRepDefLevelsForArray(ColumnChunkWriteState & s, const IColumn::Offsets & offsets)
|
||||
{
|
||||
/// Increment all definition levels.
|
||||
/// For non-first elements of arrays, increment repetition levels.
|
||||
/// For empty arrays, insert a zero into repetition and definition levels arrays.
|
||||
|
||||
assertNoDefOverflow(s);
|
||||
++s.max_def;
|
||||
++s.max_rep;
|
||||
|
||||
/// Common case: no arrays or nullables inside this array.
|
||||
if (s.max_rep == 1 && s.max_def == 1)
|
||||
{
|
||||
s.def.resize_fill(s.primitive_column->size(), 1);
|
||||
s.rep.resize_fill(s.primitive_column->size(), 1);
|
||||
size_t i = 0;
|
||||
for (ssize_t row = 0; row < static_cast<ssize_t>(offsets.size()); ++row)
|
||||
{
|
||||
size_t n = offsets[row] - offsets[row - 1];
|
||||
if (n)
|
||||
{
|
||||
s.rep[i] = 0;
|
||||
i += n;
|
||||
}
|
||||
else
|
||||
{
|
||||
s.def.push_back(1);
|
||||
s.rep.push_back(1);
|
||||
s.def[i] = 0;
|
||||
s.rep[i] = 0;
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/// General case: Array(Array), Array(Nullable), or any arbitrary nesting like that.
|
||||
|
||||
for (auto & x : s.def)
|
||||
++x;
|
||||
|
||||
if (s.max_rep == 1)
|
||||
s.rep.resize_fill(s.def.size(), 1);
|
||||
else
|
||||
for (auto & x : s.rep)
|
||||
++x;
|
||||
|
||||
PaddedPODArray<UInt8> mask(s.def.size(), 1); // for inserting zeroes to rep and def
|
||||
size_t i = 0; // in the input (s.def/s.rep)
|
||||
size_t empty_arrays = 0;
|
||||
for (ssize_t row = 0; row < static_cast<ssize_t>(offsets.size()); ++row)
|
||||
{
|
||||
size_t n = offsets[row] - offsets[row - 1];
|
||||
if (n)
|
||||
{
|
||||
/// Un-increment the first rep of the array.
|
||||
/// Skip n "items" in the nested column; first element of each item has rep = 1
|
||||
/// (we incremented it above).
|
||||
chassert(s.rep[i] == 1);
|
||||
--s.rep[i];
|
||||
do
|
||||
{
|
||||
++i;
|
||||
if (i == s.rep.size())
|
||||
{
|
||||
--n;
|
||||
chassert(n == 0);
|
||||
break;
|
||||
}
|
||||
n -= s.rep[i] == 1;
|
||||
} while (n);
|
||||
}
|
||||
else
|
||||
{
|
||||
mask.push_back(1);
|
||||
mask[i + empty_arrays] = 0;
|
||||
++empty_arrays;
|
||||
}
|
||||
}
|
||||
|
||||
if (empty_arrays != 0)
|
||||
{
|
||||
expandDataByMask(s.def, mask, false);
|
||||
expandDataByMask(s.rep, mask, false);
|
||||
}
|
||||
}
|
||||
|
||||
parq::CompressionCodec::type compressionMethodToParquet(CompressionMethod c)
|
||||
{
|
||||
switch (c)
|
||||
{
|
||||
case CompressionMethod::None: return parq::CompressionCodec::UNCOMPRESSED;
|
||||
case CompressionMethod::Snappy: return parq::CompressionCodec::SNAPPY;
|
||||
case CompressionMethod::Gzip: return parq::CompressionCodec::GZIP;
|
||||
case CompressionMethod::Brotli: return parq::CompressionCodec::BROTLI;
|
||||
case CompressionMethod::Lz4: return parq::CompressionCodec::LZ4_RAW;
|
||||
case CompressionMethod::Zstd: return parq::CompressionCodec::ZSTD;
|
||||
|
||||
default:
|
||||
throw Exception(ErrorCodes::UNKNOWN_COMPRESSION_METHOD, "Compression method {} is not supported by Parquet", toContentEncodingName(c));
|
||||
}
|
||||
}
|
||||
|
||||
/// Depth-first traversal of the schema tree for this column.
|
||||
void prepareColumnRecursive(
|
||||
ColumnPtr column, DataTypePtr type, const std::string & name, const WriteOptions & options,
|
||||
ColumnChunkWriteStates & states, SchemaElements & schemas);
|
||||
|
||||
void preparePrimitiveColumn(ColumnPtr column, DataTypePtr type, const std::string & name,
|
||||
const WriteOptions & options, ColumnChunkWriteStates & states, SchemaElements & schemas)
|
||||
{
|
||||
/// Add physical column info.
|
||||
auto & state = states.emplace_back();
|
||||
state.primitive_column = column;
|
||||
state.compression = options.compression;
|
||||
|
||||
state.column_chunk.__isset.meta_data = true;
|
||||
state.column_chunk.meta_data.__set_path_in_schema({name});
|
||||
state.column_chunk.meta_data.__set_codec(compressionMethodToParquet(state.compression));
|
||||
|
||||
/// Add logical schema leaf.
|
||||
auto & schema = schemas.emplace_back();
|
||||
schema.__set_repetition_type(parq::FieldRepetitionType::REQUIRED);
|
||||
schema.__set_name(name);
|
||||
|
||||
/// Convert the type enums.
|
||||
|
||||
using T = parq::Type;
|
||||
using C = parq::ConvertedType;
|
||||
|
||||
auto types = [&](T::type type_, std::optional<C::type> converted = std::nullopt, std::optional<parq::LogicalType> logical = std::nullopt)
|
||||
{
|
||||
state.column_chunk.meta_data.__set_type(type_);
|
||||
schema.__set_type(type_);
|
||||
if (converted)
|
||||
schema.__set_converted_type(*converted);
|
||||
if (logical)
|
||||
schema.__set_logicalType(*logical);
|
||||
};
|
||||
|
||||
auto int_type = [](Int8 bits, bool signed_)
|
||||
{
|
||||
parq::LogicalType t;
|
||||
t.__isset.INTEGER = true;
|
||||
t.INTEGER.__set_bitWidth(bits);
|
||||
t.INTEGER.__set_isSigned(signed_);
|
||||
return t;
|
||||
};
|
||||
|
||||
auto fixed_string = [&](size_t size, std::optional<C::type> converted = std::nullopt, std::optional<parq::LogicalType> logical = std::nullopt)
|
||||
{
|
||||
state.column_chunk.meta_data.__set_type(parq::Type::FIXED_LEN_BYTE_ARRAY);
|
||||
schema.__set_type(parq::Type::FIXED_LEN_BYTE_ARRAY);
|
||||
schema.__set_type_length(static_cast<Int32>(size));
|
||||
if (converted)
|
||||
schema.__set_converted_type(*converted);
|
||||
if (logical)
|
||||
schema.__set_logicalType(*logical);
|
||||
};
|
||||
|
||||
auto decimal = [&](Int32 bytes, UInt32 precision, UInt32 scale)
|
||||
{
|
||||
state.column_chunk.meta_data.__set_type(parq::Type::FIXED_LEN_BYTE_ARRAY);
|
||||
schema.__set_type(parq::Type::FIXED_LEN_BYTE_ARRAY);
|
||||
schema.__set_type_length(bytes);
|
||||
schema.__set_scale(static_cast<Int32>(scale));
|
||||
schema.__set_precision(static_cast<Int32>(precision));
|
||||
schema.__set_converted_type(parq::ConvertedType::DECIMAL);
|
||||
parq::DecimalType d;
|
||||
d.__set_scale(static_cast<Int32>(scale));
|
||||
d.__set_precision(static_cast<Int32>(precision));
|
||||
parq::LogicalType t;
|
||||
t.__set_DECIMAL(d);
|
||||
schema.__set_logicalType(t);
|
||||
};
|
||||
|
||||
switch (type->getTypeId())
|
||||
{
|
||||
case TypeIndex::UInt8:
|
||||
if (isBool(type))
|
||||
{
|
||||
types(T::BOOLEAN);
|
||||
state.is_bool = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
types(T::INT32, C::UINT_8 , int_type(8 , false));
|
||||
}
|
||||
break;
|
||||
case TypeIndex::UInt16: types(T::INT32, C::UINT_16, int_type(16, false)); break;
|
||||
case TypeIndex::UInt32: types(T::INT32, C::UINT_32, int_type(32, false)); break;
|
||||
case TypeIndex::UInt64: types(T::INT64, C::UINT_64, int_type(64, false)); break;
|
||||
case TypeIndex::Int8: types(T::INT32, C::INT_8 , int_type(8 , true)); break;
|
||||
case TypeIndex::Int16: types(T::INT32, C::INT_16 , int_type(16, true)); break;
|
||||
case TypeIndex::Int32: types(T::INT32); break;
|
||||
case TypeIndex::Int64: types(T::INT64); break;
|
||||
case TypeIndex::Float32: types(T::FLOAT); break;
|
||||
case TypeIndex::Float64: types(T::DOUBLE); break;
|
||||
|
||||
/// These don't have suitable parquet logical types, so we write them as plain numbers.
|
||||
/// (Parquet has "enums" but they're just strings, with nowhere to declare all possible enum
|
||||
/// values in advance as part of the data type.)
|
||||
case TypeIndex::Enum8: types(T::INT32, C::INT_8 , int_type(8 , true)); break; // Int8
|
||||
case TypeIndex::Enum16: types(T::INT32, C::INT_16 , int_type(16, true)); break; // Int16
|
||||
case TypeIndex::IPv4: types(T::INT32, C::UINT_32, int_type(32, false)); break; // UInt32
|
||||
case TypeIndex::Date: types(T::INT32, C::UINT_16, int_type(16, false)); break; // UInt16
|
||||
case TypeIndex::DateTime: types(T::INT32, C::UINT_32, int_type(32, false)); break; // UInt32
|
||||
|
||||
case TypeIndex::Date32:
|
||||
{
|
||||
parq::LogicalType t;
|
||||
t.__set_DATE({});
|
||||
types(T::INT32, C::DATE, t);
|
||||
break;
|
||||
}
|
||||
|
||||
case TypeIndex::DateTime64:
|
||||
{
|
||||
std::optional<parq::ConvertedType::type> converted;
|
||||
std::optional<parq::TimeUnit> unit;
|
||||
switch (assert_cast<const DataTypeDateTime64 &>(*type).getScale())
|
||||
{
|
||||
case 3:
|
||||
converted = parq::ConvertedType::TIMESTAMP_MILLIS;
|
||||
unit.emplace().__set_MILLIS({});
|
||||
break;
|
||||
case 6:
|
||||
converted = parq::ConvertedType::TIMESTAMP_MICROS;
|
||||
unit.emplace().__set_MICROS({});
|
||||
break;
|
||||
case 9:
|
||||
unit.emplace().__set_NANOS({});
|
||||
break;
|
||||
}
|
||||
|
||||
std::optional<parq::LogicalType> t;
|
||||
if (unit)
|
||||
{
|
||||
parq::TimestampType tt;
|
||||
tt.__set_isAdjustedToUTC(true);
|
||||
tt.__set_unit(*unit);
|
||||
t.emplace().__set_TIMESTAMP(tt);
|
||||
}
|
||||
types(T::INT64, converted, t);
|
||||
break;
|
||||
}
|
||||
|
||||
case TypeIndex::String:
|
||||
case TypeIndex::FixedString:
|
||||
{
|
||||
if (options.output_fixed_string_as_fixed_byte_array &&
|
||||
type->getTypeId() == TypeIndex::FixedString)
|
||||
{
|
||||
fixed_string(assert_cast<const DataTypeFixedString &>(*type).getN());
|
||||
}
|
||||
else if (options.output_string_as_string)
|
||||
{
|
||||
parq::LogicalType t;
|
||||
t.__set_STRING({});
|
||||
types(T::BYTE_ARRAY, C::UTF8, t);
|
||||
}
|
||||
else
|
||||
{
|
||||
types(T::BYTE_ARRAY);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
/// Parquet doesn't have logical types for these.
|
||||
case TypeIndex::UInt128: fixed_string(16); break;
|
||||
case TypeIndex::UInt256: fixed_string(32); break;
|
||||
case TypeIndex::Int128: fixed_string(16); break;
|
||||
case TypeIndex::Int256: fixed_string(32); break;
|
||||
case TypeIndex::IPv6: fixed_string(16); break;
|
||||
|
||||
case TypeIndex::Decimal32: decimal(4 , getDecimalPrecision(*type), getDecimalScale(*type)); break;
|
||||
case TypeIndex::Decimal64: decimal(8 , getDecimalPrecision(*type), getDecimalScale(*type)); break;
|
||||
case TypeIndex::Decimal128: decimal(16, getDecimalPrecision(*type), getDecimalScale(*type)); break;
|
||||
case TypeIndex::Decimal256: decimal(32, getDecimalPrecision(*type), getDecimalScale(*type)); break;
|
||||
|
||||
default:
|
||||
throw Exception(ErrorCodes::UNKNOWN_TYPE, "Internal type '{}' of column '{}' is not supported for conversion into Parquet data format.", type->getFamilyName(), name);
|
||||
}
|
||||
}
|
||||
|
||||
void prepareColumnNullable(
|
||||
ColumnPtr column, DataTypePtr type, const std::string & name, const WriteOptions & options,
|
||||
ColumnChunkWriteStates & states, SchemaElements & schemas)
|
||||
{
|
||||
const ColumnNullable * column_nullable = assert_cast<const ColumnNullable *>(column.get());
|
||||
ColumnPtr nested_column = column_nullable->getNestedColumnPtr();
|
||||
DataTypePtr nested_type = assert_cast<const DataTypeNullable *>(type.get())->getNestedType();
|
||||
const NullMap & null_map = column_nullable->getNullMapData();
|
||||
|
||||
size_t child_states_begin = states.size();
|
||||
size_t child_schema_idx = schemas.size();
|
||||
|
||||
prepareColumnRecursive(nested_column, nested_type, name, options, states, schemas);
|
||||
|
||||
if (schemas[child_schema_idx].repetition_type == parq::FieldRepetitionType::REQUIRED)
|
||||
{
|
||||
/// Normal case: we just slap a FieldRepetitionType::OPTIONAL onto the nested column.
|
||||
schemas[child_schema_idx].repetition_type = parq::FieldRepetitionType::OPTIONAL;
|
||||
}
|
||||
else
|
||||
{
|
||||
/// Weird case: Nullable(Nullable(...)). Or Nullable(Tuple(Nullable(...))), etc.
|
||||
/// This is probably not allowed in ClickHouse, but let's support it just in case.
|
||||
auto & schema = *schemas.insert(schemas.begin() + child_schema_idx, {});
|
||||
schema.__set_repetition_type(parq::FieldRepetitionType::OPTIONAL);
|
||||
schema.__set_name("nullable");
|
||||
schema.__set_num_children(1);
|
||||
for (size_t i = child_states_begin; i < states.size(); ++i)
|
||||
{
|
||||
Strings & path = states[i].column_chunk.meta_data.path_in_schema;
|
||||
path.insert(path.begin(), schema.name + ".");
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = child_states_begin; i < states.size(); ++i)
|
||||
{
|
||||
auto & s = states[i];
|
||||
updateRepDefLevelsAndFilterColumnForNullable(s, null_map);
|
||||
}
|
||||
}
|
||||
|
||||
void prepareColumnTuple(
|
||||
ColumnPtr column, DataTypePtr type, const std::string & name, const WriteOptions & options,
|
||||
ColumnChunkWriteStates & states, SchemaElements & schemas)
|
||||
{
|
||||
const auto * column_tuple = assert_cast<const ColumnTuple *>(column.get());
|
||||
const auto * type_tuple = assert_cast<const DataTypeTuple *>(type.get());
|
||||
|
||||
auto & tuple_schema = schemas.emplace_back();
|
||||
tuple_schema.__set_repetition_type(parq::FieldRepetitionType::REQUIRED);
|
||||
tuple_schema.__set_name(name);
|
||||
tuple_schema.__set_num_children(static_cast<Int32>(type_tuple->getElements().size()));
|
||||
|
||||
size_t child_states_begin = states.size();
|
||||
|
||||
for (size_t i = 0; i < type_tuple->getElements().size(); ++i)
|
||||
prepareColumnRecursive(column_tuple->getColumnPtr(i), type_tuple->getElement(i), type_tuple->getNameByPosition(i + 1), options, states, schemas);
|
||||
|
||||
for (size_t i = child_states_begin; i < states.size(); ++i)
|
||||
{
|
||||
Strings & path = states[i].column_chunk.meta_data.path_in_schema;
|
||||
/// O(nesting_depth^2), but who cares.
|
||||
path.insert(path.begin(), name);
|
||||
}
|
||||
}
|
||||
|
||||
void prepareColumnArray(
|
||||
ColumnPtr column, DataTypePtr type, const std::string & name, const WriteOptions & options,
|
||||
ColumnChunkWriteStates & states, SchemaElements & schemas)
|
||||
{
|
||||
const auto * column_array = assert_cast<const ColumnArray *>(column.get());
|
||||
ColumnPtr nested_column = column_array->getDataPtr();
|
||||
DataTypePtr nested_type = assert_cast<const DataTypeArray *>(type.get())->getNestedType();
|
||||
const auto & offsets = column_array->getOffsets();
|
||||
|
||||
/// Schema for lists https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
|
||||
///
|
||||
/// required group `name` (List):
|
||||
/// repeated group "list":
|
||||
/// <recurse into nested type> "element"
|
||||
|
||||
/// Add the groups schema.
|
||||
|
||||
schemas.emplace_back();
|
||||
schemas.emplace_back();
|
||||
auto & list_schema = schemas[schemas.size() - 2];
|
||||
auto & item_schema = schemas[schemas.size() - 1];
|
||||
|
||||
list_schema.__set_repetition_type(parq::FieldRepetitionType::REQUIRED);
|
||||
list_schema.__set_name(name);
|
||||
list_schema.__set_num_children(1);
|
||||
list_schema.__set_converted_type(parq::ConvertedType::LIST);
|
||||
list_schema.__isset.logicalType = true;
|
||||
list_schema.logicalType.__set_LIST({});
|
||||
|
||||
item_schema.__set_repetition_type(parq::FieldRepetitionType::REPEATED);
|
||||
item_schema.__set_name("list");
|
||||
item_schema.__set_num_children(1);
|
||||
|
||||
std::array<std::string, 2> path_prefix = {list_schema.name, item_schema.name};
|
||||
size_t child_states_begin = states.size();
|
||||
|
||||
/// Recurse.
|
||||
prepareColumnRecursive(nested_column, nested_type, "element", options, states, schemas);
|
||||
|
||||
/// Update repetition+definition levels and fully-qualified column names (x -> myarray.list.x).
|
||||
for (size_t i = child_states_begin; i < states.size(); ++i)
|
||||
{
|
||||
Strings & path = states[i].column_chunk.meta_data.path_in_schema;
|
||||
path.insert(path.begin(), path_prefix.begin(), path_prefix.end());
|
||||
|
||||
updateRepDefLevelsForArray(states[i], offsets);
|
||||
}
|
||||
}
|
||||
|
||||
void prepareColumnMap(
|
||||
ColumnPtr column, DataTypePtr type, const std::string & name, const WriteOptions & options,
|
||||
ColumnChunkWriteStates & states, SchemaElements & schemas)
|
||||
{
|
||||
const auto * column_map = assert_cast<const ColumnMap *>(column.get());
|
||||
const auto * column_array = &column_map->getNestedColumn();
|
||||
const auto & offsets = column_array->getOffsets();
|
||||
ColumnPtr column_tuple = column_array->getDataPtr();
|
||||
|
||||
const auto * map_type = assert_cast<const DataTypeMap *>(type.get());
|
||||
DataTypePtr tuple_type = std::make_shared<DataTypeTuple>(map_type->getKeyValueTypes(), Strings{"key", "value"});
|
||||
|
||||
/// Map is an array of tuples
|
||||
/// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps
|
||||
///
|
||||
/// required group `name` (Map):
|
||||
/// repeated group "key_value":
|
||||
/// required <...> "key"
|
||||
/// <...> "value"
|
||||
|
||||
auto & map_schema = schemas.emplace_back();
|
||||
map_schema.__set_repetition_type(parq::FieldRepetitionType::REQUIRED);
|
||||
map_schema.__set_name(name);
|
||||
map_schema.__set_num_children(1);
|
||||
map_schema.__set_converted_type(parq::ConvertedType::MAP);
|
||||
map_schema.__set_logicalType({});
|
||||
map_schema.logicalType.__set_MAP({});
|
||||
|
||||
size_t tuple_schema_idx = schemas.size();
|
||||
size_t child_states_begin = states.size();
|
||||
|
||||
prepareColumnTuple(column_tuple, tuple_type, "key_value", options, states, schemas);
|
||||
|
||||
schemas[tuple_schema_idx].__set_repetition_type(parq::FieldRepetitionType::REPEATED);
|
||||
schemas[tuple_schema_idx].__set_converted_type(parq::ConvertedType::MAP_KEY_VALUE);
|
||||
|
||||
for (size_t i = child_states_begin; i < states.size(); ++i)
|
||||
{
|
||||
Strings & path = states[i].column_chunk.meta_data.path_in_schema;
|
||||
path.insert(path.begin(), name);
|
||||
|
||||
updateRepDefLevelsForArray(states[i], offsets);
|
||||
}
|
||||
}
|
||||
|
||||
void prepareColumnRecursive(
|
||||
ColumnPtr column, DataTypePtr type, const std::string & name, const WriteOptions & options,
|
||||
ColumnChunkWriteStates & states, SchemaElements & schemas)
|
||||
{
|
||||
switch (type->getTypeId())
|
||||
{
|
||||
case TypeIndex::Nullable: prepareColumnNullable(column, type, name, options, states, schemas); break;
|
||||
case TypeIndex::Array: prepareColumnArray(column, type, name, options, states, schemas); break;
|
||||
case TypeIndex::Tuple: prepareColumnTuple(column, type, name, options, states, schemas); break;
|
||||
case TypeIndex::Map: prepareColumnMap(column, type, name, options, states, schemas); break;
|
||||
case TypeIndex::LowCardinality:
|
||||
{
|
||||
auto nested_type = assert_cast<const DataTypeLowCardinality &>(*type).getDictionaryType();
|
||||
if (nested_type->isNullable())
|
||||
prepareColumnNullable(
|
||||
column->convertToFullColumnIfLowCardinality(), nested_type, name, options, states, schemas);
|
||||
else
|
||||
/// Use nested data type, but keep ColumnLowCardinality. The encoder can deal with it.
|
||||
preparePrimitiveColumn(column, nested_type, name, options, states, schemas);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
preparePrimitiveColumn(column, type, name, options, states, schemas);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
SchemaElements convertSchema(const Block & sample, const WriteOptions & options)
|
||||
{
|
||||
SchemaElements schema;
|
||||
auto & root = schema.emplace_back();
|
||||
root.__set_name("schema");
|
||||
root.__set_num_children(static_cast<Int32>(sample.columns()));
|
||||
|
||||
for (const auto & c : sample)
|
||||
prepareColumnForWrite(c.column, c.type, c.name, options, nullptr, &schema);
|
||||
|
||||
return schema;
|
||||
}
|
||||
|
||||
void prepareColumnForWrite(
|
||||
ColumnPtr column, DataTypePtr type, const std::string & name, const WriteOptions & options,
|
||||
ColumnChunkWriteStates * out_columns_to_write, SchemaElements * out_schema)
|
||||
{
|
||||
if (column->empty() && out_columns_to_write != nullptr)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty column passed to Parquet encoder");
|
||||
|
||||
ColumnChunkWriteStates states;
|
||||
SchemaElements schemas;
|
||||
prepareColumnRecursive(column, type, name, options, states, schemas);
|
||||
|
||||
if (out_columns_to_write)
|
||||
for (auto & s : states)
|
||||
out_columns_to_write->push_back(std::move(s));
|
||||
if (out_schema)
|
||||
out_schema->insert(out_schema->end(), schemas.begin(), schemas.end());
|
||||
|
||||
if (column->empty())
|
||||
states.clear();
|
||||
}
|
||||
|
||||
}
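A rough usage sketch (illustrative; `header`, `chunk` and `options` are assumed to come from the caller and are not part of this file) of how a writer can combine these entry points: build the file-level schema once from the header block, then prepare per-chunk column states for every block of data.

using namespace DB::Parquet;

SchemaElements schema = convertSchema(header, options);   /// converted once, later written into the file footer

ColumnChunkWriteStates columns_to_write;
const auto & columns = chunk.getColumns();
for (size_t i = 0; i < header.columns(); ++i)
    prepareColumnForWrite(
        columns[i], header.getByPosition(i).type, header.getByPosition(i).name,
        options, &columns_to_write, /* out_schema */ nullptr);
/// columns_to_write now holds one state per physical (leaf) column, ready to be encoded.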
|
35
src/Processors/Formats/Impl/Parquet/ThriftUtil.cpp
Normal file
@ -0,0 +1,35 @@
|
||||
#include <Processors/Formats/Impl/Parquet/ThriftUtil.h>
|
||||
#include <thrift/protocol/TCompactProtocol.h>
|
||||
|
||||
namespace DB::Parquet
|
||||
{
|
||||
|
||||
class WriteBufferTransport : public apache::thrift::transport::TTransport
|
||||
{
|
||||
public:
|
||||
WriteBuffer & out;
|
||||
size_t bytes = 0;
|
||||
|
||||
explicit WriteBufferTransport(WriteBuffer & out_) : out(out_) {}
|
||||
|
||||
void write(const uint8_t* buf, uint32_t len)
|
||||
{
|
||||
out.write(reinterpret_cast<const char *>(buf), len);
|
||||
bytes += len;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
size_t serializeThriftStruct(const T & obj, WriteBuffer & out)
|
||||
{
|
||||
auto trans = std::make_shared<WriteBufferTransport>(out);
|
||||
auto proto = apache::thrift::protocol::TCompactProtocolFactoryT<WriteBufferTransport>().getProtocol(trans);
|
||||
obj.write(proto.get());
|
||||
return trans->bytes;
|
||||
}
|
||||
|
||||
template size_t serializeThriftStruct<parquet::format::PageHeader>(const parquet::format::PageHeader &, WriteBuffer & out);
|
||||
template size_t serializeThriftStruct<parquet::format::ColumnChunk>(const parquet::format::ColumnChunk &, WriteBuffer & out);
|
||||
template size_t serializeThriftStruct<parquet::format::FileMetaData>(const parquet::format::FileMetaData &, WriteBuffer & out);
|
||||
|
||||
}
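A short usage sketch (illustrative; the footer layout follows the Parquet file format, but the wrapper function itself is an assumption, not part of this file):

#include <Processors/Formats/Impl/Parquet/ThriftUtil.h>
#include <IO/WriteHelpers.h>

/// Writes the Parquet footer: thrift-encoded FileMetaData, then its byte length, then the trailing magic.
void writeFooterSketch(DB::WriteBuffer & out, const parquet::format::FileMetaData & file_meta)
{
    size_t footer_size = DB::Parquet::serializeThriftStruct(file_meta, out);
    DB::writeIntBinary(static_cast<uint32_t>(footer_size), out);  /// 4-byte length of the metadata
    out.write("PAR1", 4);                                         /// Parquet magic bytes
}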
|
17
src/Processors/Formats/Impl/Parquet/ThriftUtil.h
Normal file
@ -0,0 +1,17 @@
|
||||
#pragma once
|
||||
|
||||
#include <generated/parquet_types.h> // in contrib/arrow/cpp/src/ , generated from parquet.thrift
|
||||
#include <IO/WriteBuffer.h>
|
||||
|
||||
namespace DB::Parquet
|
||||
{
|
||||
|
||||
/// Returns number of bytes written.
|
||||
template <typename T>
|
||||
size_t serializeThriftStruct(const T & obj, WriteBuffer & out);
|
||||
|
||||
extern template size_t serializeThriftStruct<parquet::format::PageHeader>(const parquet::format::PageHeader &, WriteBuffer & out);
|
||||
extern template size_t serializeThriftStruct<parquet::format::ColumnChunk>(const parquet::format::ColumnChunk &, WriteBuffer & out);
|
||||
extern template size_t serializeThriftStruct<parquet::format::FileMetaData>(const parquet::format::FileMetaData &, WriteBuffer & out);
|
||||
|
||||
}
|
911
src/Processors/Formats/Impl/Parquet/Write.cpp
Normal file
@ -0,0 +1,911 @@
|
||||
#include "Processors/Formats/Impl/Parquet/Write.h"
|
||||
#include "Processors/Formats/Impl/Parquet/ThriftUtil.h"
|
||||
#include <parquet/encoding.h>
|
||||
#include <parquet/schema.h>
|
||||
#include <arrow/util/rle_encoding.h>
|
||||
#include <lz4.h>
|
||||
#include <Columns/MaskOperations.h>
|
||||
#include <Columns/ColumnFixedString.h>
|
||||
#include <Columns/ColumnNullable.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Columns/ColumnArray.h>
|
||||
#include <Columns/ColumnDecimal.h>
|
||||
#include <Columns/ColumnTuple.h>
|
||||
#include <Columns/ColumnMap.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include "config_version.h"
|
||||
|
||||
#if USE_SNAPPY
|
||||
#include <snappy.h>
|
||||
#endif
|
||||
|
||||
namespace DB::ErrorCodes
|
||||
{
|
||||
extern const int CANNOT_COMPRESS;
|
||||
extern const int LIMIT_EXCEEDED;
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
namespace DB::Parquet
|
||||
{
|
||||
|
||||
namespace parq = parquet::format;
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
template <typename T, typename SourceType>
|
||||
struct StatisticsNumeric
|
||||
{
|
||||
T min = std::numeric_limits<T>::max();
|
||||
T max = std::numeric_limits<T>::min();
|
||||
|
||||
void add(SourceType x)
|
||||
{
|
||||
min = std::min(min, static_cast<T>(x));
|
||||
max = std::max(max, static_cast<T>(x));
|
||||
}
|
||||
|
||||
void merge(const StatisticsNumeric & s)
|
||||
{
|
||||
min = std::min(min, s.min);
|
||||
max = std::max(max, s.max);
|
||||
}
|
||||
|
||||
void clear() { *this = {}; }
|
||||
|
||||
parq::Statistics get(const WriteOptions &)
|
||||
{
|
||||
parq::Statistics s;
|
||||
s.__isset.min_value = s.__isset.max_value = true;
|
||||
s.min_value.resize(sizeof(T));
|
||||
s.max_value.resize(sizeof(T));
|
||||
memcpy(s.min_value.data(), &min, sizeof(T));
|
||||
memcpy(s.max_value.data(), &max, sizeof(T));
|
||||
|
||||
if constexpr (std::is_signed<T>::value)
|
||||
{
|
||||
s.__set_min(s.min_value);
|
||||
s.__set_max(s.max_value);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
};
|
||||
|
||||
struct StatisticsFixedStringRef
|
||||
{
|
||||
size_t fixed_string_size = UINT64_MAX;
|
||||
const uint8_t * min = nullptr;
|
||||
const uint8_t * max = nullptr;
|
||||
|
||||
void add(parquet::FixedLenByteArray a)
|
||||
{
|
||||
chassert(fixed_string_size != UINT64_MAX);
|
||||
addMin(a.ptr);
|
||||
addMax(a.ptr);
|
||||
}
|
||||
|
||||
void merge(const StatisticsFixedStringRef & s)
|
||||
{
|
||||
chassert(fixed_string_size == UINT64_MAX || fixed_string_size == s.fixed_string_size);
|
||||
fixed_string_size = s.fixed_string_size;
|
||||
if (s.min == nullptr)
|
||||
return;
|
||||
addMin(s.min);
|
||||
addMax(s.max);
|
||||
}
|
||||
|
||||
void clear() { min = max = nullptr; }
|
||||
|
||||
parq::Statistics get(const WriteOptions & options) const
|
||||
{
|
||||
parq::Statistics s;
|
||||
if (min == nullptr || fixed_string_size > options.max_statistics_size)
|
||||
return s;
|
||||
s.__set_min_value(std::string(reinterpret_cast<const char *>(min), fixed_string_size));
|
||||
s.__set_max_value(std::string(reinterpret_cast<const char *>(max), fixed_string_size));
|
||||
return s;
|
||||
}
|
||||
|
||||
void addMin(const uint8_t * p)
|
||||
{
|
||||
if (min == nullptr || memcmp(p, min, fixed_string_size) < 0)
|
||||
min = p;
|
||||
}
|
||||
void addMax(const uint8_t * p)
|
||||
{
|
||||
if (max == nullptr || memcmp(p, max, fixed_string_size) > 0)
|
||||
max = p;
|
||||
}
|
||||
};
|
||||
|
||||
template<size_t S>
|
||||
struct StatisticsFixedStringCopy
|
||||
{
|
||||
bool empty = true;
|
||||
std::array<uint8_t, S> min {};
|
||||
std::array<uint8_t, S> max {};
|
||||
|
||||
void add(parquet::FixedLenByteArray a)
|
||||
{
|
||||
addMin(a.ptr);
|
||||
addMax(a.ptr);
|
||||
empty = false;
|
||||
}
|
||||
|
||||
void merge(const StatisticsFixedStringCopy<S> & s)
|
||||
{
|
||||
if (s.empty)
|
||||
return;
|
||||
addMin(&s.min[0]);
|
||||
addMax(&s.max[0]);
|
||||
empty = false;
|
||||
}
|
||||
|
||||
void clear() { empty = true; }
|
||||
|
||||
parq::Statistics get(const WriteOptions &) const
|
||||
{
|
||||
parq::Statistics s;
|
||||
if (empty)
|
||||
return s;
|
||||
s.__set_min_value(std::string(reinterpret_cast<const char *>(min.data()), S));
|
||||
s.__set_max_value(std::string(reinterpret_cast<const char *>(max.data()), S));
|
||||
return s;
|
||||
}
|
||||
|
||||
void addMin(const uint8_t * p)
|
||||
{
|
||||
if (empty || memcmp(p, min.data(), S) < 0)
|
||||
memcpy(min.data(), p, S);
|
||||
}
|
||||
void addMax(const uint8_t * p)
|
||||
{
|
||||
if (empty || memcmp(p, max.data(), S) > 0)
|
||||
memcpy(max.data(), p, S);
|
||||
}
|
||||
};
|
||||
|
||||
struct StatisticsStringRef
|
||||
{
|
||||
parquet::ByteArray min;
|
||||
parquet::ByteArray max;
|
||||
|
||||
void add(parquet::ByteArray x)
|
||||
{
|
||||
addMin(x);
|
||||
addMax(x);
|
||||
}
|
||||
|
||||
void merge(const StatisticsStringRef & s)
|
||||
{
|
||||
if (s.min.ptr == nullptr)
|
||||
return;
|
||||
addMin(s.min);
|
||||
addMax(s.max);
|
||||
}
|
||||
|
||||
void clear() { *this = {}; }
|
||||
|
||||
parq::Statistics get(const WriteOptions & options) const
|
||||
{
|
||||
parq::Statistics s;
|
||||
if (min.ptr == nullptr)
|
||||
return s;
|
||||
if (static_cast<size_t>(min.len) <= options.max_statistics_size)
|
||||
s.__set_min_value(std::string(reinterpret_cast<const char *>(min.ptr), static_cast<size_t>(min.len)));
|
||||
if (static_cast<size_t>(max.len) <= options.max_statistics_size)
|
||||
s.__set_max_value(std::string(reinterpret_cast<const char *>(max.ptr), static_cast<size_t>(max.len)));
|
||||
return s;
|
||||
}
|
||||
|
||||
void addMin(parquet::ByteArray x)
|
||||
{
|
||||
if (min.ptr == nullptr || compare(x, min) < 0)
|
||||
min = x;
|
||||
}
|
||||
|
||||
void addMax(parquet::ByteArray x)
|
||||
{
|
||||
if (max.ptr == nullptr || compare(x, max) > 0)
|
||||
max = x;
|
||||
}
|
||||
|
||||
static int compare(parquet::ByteArray a, parquet::ByteArray b)
|
||||
{
|
||||
int t = memcmp(a.ptr, b.ptr, std::min(a.len, b.len));
|
||||
if (t != 0)
|
||||
return t;
|
||||
return a.len - b.len;
|
||||
}
|
||||
};
|
||||
|
||||
/// The column usually needs to be converted to one of Parquet physical types, e.g. UInt16 -> Int32
|
||||
/// or [element of ColumnString] -> std::string_view.
|
||||
/// We do this conversion in small batches rather than all at once, just before encoding the batch,
|
||||
/// in hopes of getting better performance through cache locality.
|
||||
/// The Converter* structs below are responsible for that.
|
||||
/// When conversion is not needed, getBatch() will just return pointer into original data.
|
||||
|
||||
template <typename Col, typename To, typename MinMaxType = typename std::conditional<
|
||||
std::is_signed<typename Col::Container::value_type>::value,
|
||||
To,
|
||||
typename std::make_unsigned<To>::type>::type>
|
||||
struct ConverterNumeric
|
||||
{
|
||||
using Statistics = StatisticsNumeric<MinMaxType, To>;
|
||||
|
||||
const Col & column;
|
||||
PODArray<To> buf;
|
||||
|
||||
explicit ConverterNumeric(const ColumnPtr & c) : column(assert_cast<const Col &>(*c)) {}
|
||||
|
||||
const To * getBatch(size_t offset, size_t count)
|
||||
{
|
||||
if constexpr (sizeof(*column.getData().data()) == sizeof(To))
|
||||
return reinterpret_cast<const To *>(column.getData().data() + offset);
|
||||
else
|
||||
{
|
||||
buf.resize(count);
|
||||
for (size_t i = 0; i < count; ++i)
|
||||
buf[i] = static_cast<To>(column.getData()[offset + i]); // NOLINT
|
||||
return buf.data();
|
||||
}
|
||||
}
|
||||
};
|
||||
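To make the batch-conversion idea above concrete, an illustrative driver loop for ConverterNumeric over a UInt16 column; the loop and batch size are illustrative only, writeColumnImpl() below is what actually drives the converters using options.write_batch_size:

void exampleBatches(const ColumnPtr & col)
{
    /// UInt16 has no matching Parquet physical type, so each batch is widened to Int32 into conv.buf.
    ConverterNumeric<ColumnVector<UInt16>, Int32> conv(col);
    const size_t batch_size = 1024;
    for (size_t offset = 0; offset < col->size(); offset += batch_size)
    {
        size_t count = std::min(batch_size, col->size() - offset);
        const Int32 * batch = conv.getBatch(offset, count);
        /// hand `batch` to the Parquet encoder here; for same-width types getBatch()
        /// would instead return a pointer directly into the column's own memory
    }
}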
|
||||
struct ConverterString
|
||||
{
|
||||
using Statistics = StatisticsStringRef;
|
||||
|
||||
const ColumnString & column;
|
||||
PODArray<parquet::ByteArray> buf;
|
||||
|
||||
explicit ConverterString(const ColumnPtr & c) : column(assert_cast<const ColumnString &>(*c)) {}
|
||||
|
||||
const parquet::ByteArray * getBatch(size_t offset, size_t count)
|
||||
{
|
||||
buf.resize(count);
|
||||
for (size_t i = 0; i < count; ++i)
|
||||
{
|
||||
StringRef s = column.getDataAt(offset + i);
|
||||
buf[i] = parquet::ByteArray(static_cast<UInt32>(s.size), reinterpret_cast<const uint8_t *>(s.data));
|
||||
}
|
||||
return buf.data();
|
||||
}
|
||||
};
|
||||
|
||||
struct ConverterFixedString
|
||||
{
|
||||
using Statistics = StatisticsFixedStringRef;
|
||||
|
||||
const ColumnFixedString & column;
|
||||
PODArray<parquet::FixedLenByteArray> buf;
|
||||
|
||||
explicit ConverterFixedString(const ColumnPtr & c) : column(assert_cast<const ColumnFixedString &>(*c)) {}
|
||||
|
||||
const parquet::FixedLenByteArray * getBatch(size_t offset, size_t count)
|
||||
{
|
||||
buf.resize(count);
|
||||
for (size_t i = 0; i < count; ++i)
|
||||
buf[i].ptr = reinterpret_cast<const uint8_t *>(column.getChars().data() + (offset + i) * column.getN());
|
||||
return buf.data();
|
||||
}
|
||||
|
||||
size_t fixedStringSize() { return column.getN(); }
|
||||
};
|
||||
|
||||
struct ConverterFixedStringAsString
|
||||
{
|
||||
using Statistics = StatisticsStringRef;
|
||||
|
||||
const ColumnFixedString & column;
|
||||
PODArray<parquet::ByteArray> buf;
|
||||
|
||||
explicit ConverterFixedStringAsString(const ColumnPtr & c) : column(assert_cast<const ColumnFixedString &>(*c)) {}
|
||||
|
||||
const parquet::ByteArray * getBatch(size_t offset, size_t count)
|
||||
{
|
||||
buf.resize(count);
|
||||
for (size_t i = 0; i < count; ++i)
|
||||
buf[i] = parquet::ByteArray(static_cast<UInt32>(column.getN()), reinterpret_cast<const uint8_t *>(column.getChars().data() + (offset + i) * column.getN()));
|
||||
return buf.data();
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct ConverterNumberAsFixedString
|
||||
{
|
||||
/// Calculate min/max statistics for little-endian fixed strings, not numbers, because parquet
|
||||
/// doesn't know it's numbers.
|
||||
using Statistics = StatisticsFixedStringCopy<sizeof(T)>;
|
||||
|
||||
const ColumnVector<T> & column;
|
||||
PODArray<parquet::FixedLenByteArray> buf;
|
||||
|
||||
explicit ConverterNumberAsFixedString(const ColumnPtr & c) : column(assert_cast<const ColumnVector<T> &>(*c)) {}
|
||||
|
||||
const parquet::FixedLenByteArray * getBatch(size_t offset, size_t count)
|
||||
{
|
||||
buf.resize(count);
|
||||
for (size_t i = 0; i < count; ++i)
|
||||
buf[i].ptr = reinterpret_cast<const uint8_t *>(column.getData().data() + offset + i);
|
||||
return buf.data();
|
||||
}
|
||||
|
||||
size_t fixedStringSize() { return sizeof(T); }
|
||||
};
|
||||
|
||||
/// Like ConverterNumberAsFixedString, but converts to big-endian. Because that's the byte order
|
||||
/// Parquet uses for decimal types and literally nothing else, for some reason.
|
||||
template <typename T>
|
||||
struct ConverterDecimal
|
||||
{
|
||||
using Statistics = StatisticsFixedStringCopy<sizeof(T)>;
|
||||
|
||||
const ColumnDecimal<T> & column;
|
||||
PODArray<uint8_t> data_buf;
|
||||
PODArray<parquet::FixedLenByteArray> ptr_buf;
|
||||
|
||||
explicit ConverterDecimal(const ColumnPtr & c) : column(assert_cast<const ColumnDecimal<T> &>(*c)) {}
|
||||
|
||||
const parquet::FixedLenByteArray * getBatch(size_t offset, size_t count)
|
||||
{
|
||||
data_buf.resize(count * sizeof(T));
|
||||
ptr_buf.resize(count);
|
||||
memcpy(data_buf.data(), reinterpret_cast<const char *>(column.getData().data() + offset), count * sizeof(T));
|
||||
for (size_t i = 0; i < count; ++i)
|
||||
{
|
||||
std::reverse(data_buf.data() + i * sizeof(T), data_buf.data() + (i + 1) * sizeof(T));
|
||||
ptr_buf[i].ptr = data_buf.data() + i * sizeof(T);
|
||||
}
|
||||
return ptr_buf.data();
|
||||
}
|
||||
|
||||
size_t fixedStringSize() { return sizeof(T); }
|
||||
};
|
||||
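A worked example of the byte reversal above, assuming a Decimal64 column with scale 2 holding the value 1.23 (raw Int64 = 123); the concrete bytes are illustrative of the general rule:

/// Raw Int64 123 as stored little-endian in ColumnDecimal memory: 7B 00 00 00 00 00 00 00
/// After std::reverse (what getBatch() hands to the encoder):     00 00 00 00 00 00 00 7B
/// i.e. a FIXED_LEN_BYTE_ARRAY(8) holding the big-endian two's-complement value, as the
/// Parquet DECIMAL logical type requires.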
|
||||
/// Returns either `source` or `scratch`.
|
||||
PODArray<char> & compress(PODArray<char> & source, PODArray<char> & scratch, CompressionMethod method)
|
||||
{
|
||||
/// We could use wrapWriteBufferWithCompressionMethod() for everything, but I worry about the
|
||||
/// overhead of creating a bunch of WriteBuffers on each page (thousands of values).
|
||||
switch (method)
|
||||
{
|
||||
case CompressionMethod::None:
|
||||
return source;
|
||||
|
||||
case CompressionMethod::Lz4:
|
||||
{
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wold-style-cast"
|
||||
|
||||
size_t max_dest_size = LZ4_COMPRESSBOUND(source.size());
|
||||
|
||||
#pragma clang diagnostic pop
|
||||
|
||||
if (max_dest_size > std::numeric_limits<int>::max())
|
||||
throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column of size {}", formatReadableSizeWithBinarySuffix(source.size()));
|
||||
|
||||
scratch.resize(max_dest_size);
|
||||
|
||||
int compressed_size = LZ4_compress_default(
|
||||
source.data(),
|
||||
scratch.data(),
|
||||
static_cast<int>(source.size()),
|
||||
static_cast<int>(max_dest_size));
|
||||
|
||||
scratch.resize(static_cast<size_t>(compressed_size));
|
||||
return scratch;
|
||||
}
|
||||
|
||||
#if USE_SNAPPY
|
||||
case CompressionMethod::Snappy:
|
||||
{
|
||||
size_t max_dest_size = snappy::MaxCompressedLength(source.size());
|
||||
|
||||
if (max_dest_size > std::numeric_limits<int>::max())
|
||||
throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column of size {}", formatReadableSizeWithBinarySuffix(source.size()));
|
||||
|
||||
scratch.resize(max_dest_size);
|
||||
|
||||
size_t compressed_size;
|
||||
snappy::RawCompress(source.data(), source.size(), scratch.data(), &compressed_size);
|
||||
|
||||
scratch.resize(static_cast<size_t>(compressed_size));
|
||||
return scratch;
|
||||
}
|
||||
#endif
|
||||
|
||||
default:
|
||||
{
|
||||
auto dest_buf = std::make_unique<WriteBufferFromVector<PODArray<char>>>(scratch);
|
||||
auto compressed_buf = wrapWriteBufferWithCompressionMethod(
|
||||
std::move(dest_buf),
|
||||
method,
|
||||
/*level*/ 3,
|
||||
source.size(),
|
||||
/*existing_memory*/ source.data());
|
||||
chassert(compressed_buf->position() == source.data());
|
||||
chassert(compressed_buf->available() == source.size());
|
||||
compressed_buf->position() += source.size();
|
||||
compressed_buf->finalize();
|
||||
return scratch;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void encodeRepDefLevelsRLE(const UInt8 * data, size_t size, UInt8 max_level, PODArray<char> & out)
|
||||
{
|
||||
using arrow::util::RleEncoder;
|
||||
|
||||
chassert(max_level > 0);
|
||||
size_t offset = out.size();
|
||||
size_t prefix_size = sizeof(Int32);
|
||||
|
||||
int bit_width = bitScanReverse(max_level) + 1;
|
||||
int max_rle_size = RleEncoder::MaxBufferSize(bit_width, static_cast<int>(size)) +
|
||||
RleEncoder::MinBufferSize(bit_width);
|
||||
|
||||
out.resize(offset + prefix_size + max_rle_size);
|
||||
|
||||
RleEncoder encoder(reinterpret_cast<uint8_t *>(out.data() + offset + prefix_size), max_rle_size, bit_width);
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
encoder.Put(data[i]);
|
||||
encoder.Flush();
|
||||
Int32 len = encoder.len();
|
||||
|
||||
memcpy(out.data() + offset, &len, prefix_size);
|
||||
out.resize(offset + prefix_size + len);
|
||||
}
|
||||
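Note on the output produced by encodeRepDefLevelsRLE() and how flush_page() below concatenates it: each level stream is a 4-byte little-endian length followed by the RLE/bit-packed data, so an uncompressed data page ends up laid out as

/// [ int32 length ][ RLE/bit-packed rep levels ]   (only if max_rep > 0)
/// [ int32 length ][ RLE/bit-packed def levels ]   (only if max_def > 0)
/// [ encoded values ]

As a small illustrative example of the levels themselves: for a Nullable(Int32) column holding [1, NULL, 3], max_def = 1, the definition levels are [1, 0, 1], and only the two non-NULL values are passed on to the data encoder.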
|
||||
void addToEncodingsUsed(ColumnChunkWriteState & s, parq::Encoding::type e)
|
||||
{
|
||||
if (!std::count(s.column_chunk.meta_data.encodings.begin(), s.column_chunk.meta_data.encodings.end(), e))
|
||||
s.column_chunk.meta_data.encodings.push_back(e);
|
||||
}
|
||||
|
||||
void writePage(const parq::PageHeader & header, const PODArray<char> & compressed, ColumnChunkWriteState & s, WriteBuffer & out)
|
||||
{
|
||||
size_t header_size = serializeThriftStruct(header, out);
|
||||
out.write(compressed.data(), compressed.size());
|
||||
|
||||
/// Remember first data page and first dictionary page.
|
||||
if (header.__isset.data_page_header && s.column_chunk.meta_data.data_page_offset == -1)
|
||||
s.column_chunk.meta_data.__set_data_page_offset(s.column_chunk.meta_data.total_compressed_size);
|
||||
if (header.__isset.dictionary_page_header && !s.column_chunk.meta_data.__isset.dictionary_page_offset)
|
||||
s.column_chunk.meta_data.__set_dictionary_page_offset(s.column_chunk.meta_data.total_compressed_size);
|
||||
|
||||
s.column_chunk.meta_data.total_uncompressed_size += header.uncompressed_page_size + header_size;
|
||||
s.column_chunk.meta_data.total_compressed_size += header.compressed_page_size + header_size;
|
||||
}
|
||||
|
||||
template <typename ParquetDType, typename Converter>
|
||||
void writeColumnImpl(
|
||||
ColumnChunkWriteState & s, const WriteOptions & options, WriteBuffer & out, Converter && converter)
|
||||
{
|
||||
size_t num_values = s.max_def > 0 ? s.def.size() : s.primitive_column->size();
|
||||
auto encoding = options.encoding;
|
||||
|
||||
typename Converter::Statistics page_statistics;
|
||||
typename Converter::Statistics total_statistics;
|
||||
|
||||
bool use_dictionary = options.use_dictionary_encoding && !s.is_bool;
|
||||
|
||||
std::optional<parquet::ColumnDescriptor> fixed_string_descr;
|
||||
if constexpr (std::is_same<ParquetDType, parquet::FLBAType>::value)
|
||||
{
|
||||
/// This just communicates one number to MakeTypedEncoder(): the fixed string length.
|
||||
fixed_string_descr.emplace(parquet::schema::PrimitiveNode::Make(
|
||||
"", parquet::Repetition::REQUIRED, parquet::Type::FIXED_LEN_BYTE_ARRAY,
|
||||
parquet::ConvertedType::NONE, static_cast<int>(converter.fixedStringSize())), 0, 0);
|
||||
|
||||
if constexpr (std::is_same<typename Converter::Statistics, StatisticsFixedStringRef>::value)
|
||||
page_statistics.fixed_string_size = converter.fixedStringSize();
|
||||
}
|
||||
|
||||
/// Could use an arena here (by passing a custom MemoryPool), to reuse memory across pages.
|
||||
/// Alternatively, we could avoid using arrow's dictionary encoding code and leverage
|
||||
/// ColumnLowCardinality instead. It would work basically the same way as what this function
|
||||
/// currently does: add values to the ColumnLowCardinality (instead of `encoder`) in batches,
|
||||
/// checking dictionary size after each batch. That might be faster.
|
||||
auto encoder = parquet::MakeTypedEncoder<ParquetDType>(
|
||||
// ignored if using dictionary
|
||||
static_cast<parquet::Encoding::type>(encoding),
|
||||
use_dictionary, fixed_string_descr ? &*fixed_string_descr : nullptr);
|
||||
|
||||
struct PageData
|
||||
{
|
||||
parq::PageHeader header;
|
||||
PODArray<char> data;
|
||||
};
|
||||
std::vector<PageData> dict_encoded_pages; // can't write them out until we have full dictionary
|
||||
|
||||
/// Reused across pages to reduce number of allocations and improve locality.
|
||||
PODArray<char> encoded;
|
||||
PODArray<char> compressed_maybe;
|
||||
|
||||
/// Start of current page.
|
||||
size_t def_offset = 0; // index in def and rep
|
||||
size_t data_offset = 0; // index in primitive_column
|
||||
|
||||
auto flush_page = [&](size_t def_count, size_t data_count)
|
||||
{
|
||||
encoded.clear();
|
||||
|
||||
/// Concatenate encoded rep, def, and data.
|
||||
|
||||
if (s.max_rep > 0)
|
||||
encodeRepDefLevelsRLE(s.rep.data() + def_offset, def_count, s.max_rep, encoded);
|
||||
if (s.max_def > 0)
|
||||
encodeRepDefLevelsRLE(s.def.data() + def_offset, def_count, s.max_def, encoded);
|
||||
|
||||
std::shared_ptr<parquet::Buffer> values = encoder->FlushValues(); // resets it for next page
|
||||
|
||||
encoded.resize(encoded.size() + values->size());
|
||||
memcpy(encoded.data() + encoded.size() - values->size(), values->data(), values->size());
|
||||
values.reset();
|
||||
|
||||
if (encoded.size() > INT32_MAX)
|
||||
throw Exception(ErrorCodes::CANNOT_COMPRESS, "Uncompressed page is too big: {}", encoded.size());
|
||||
|
||||
size_t uncompressed_size = encoded.size();
|
||||
auto & compressed = compress(encoded, compressed_maybe, s.compression);
|
||||
|
||||
if (compressed.size() > INT32_MAX)
|
||||
throw Exception(ErrorCodes::CANNOT_COMPRESS, "Compressed page is too big: {}", compressed.size());
|
||||
|
||||
parq::PageHeader header;
|
||||
header.__set_type(parq::PageType::DATA_PAGE);
|
||||
header.__set_uncompressed_page_size(static_cast<int>(uncompressed_size));
|
||||
header.__set_compressed_page_size(static_cast<int>(compressed.size()));
|
||||
header.__isset.data_page_header = true;
|
||||
auto & d = header.data_page_header;
|
||||
d.__set_num_values(static_cast<Int32>(def_count));
|
||||
d.__set_encoding(use_dictionary ? parq::Encoding::RLE_DICTIONARY : encoding);
|
||||
d.__set_definition_level_encoding(parq::Encoding::RLE);
|
||||
d.__set_repetition_level_encoding(parq::Encoding::RLE);
|
||||
/// We could also put checksum in `header.crc`, but apparently no one uses it:
|
||||
/// https://issues.apache.org/jira/browse/PARQUET-594
|
||||
|
||||
if (options.write_page_statistics)
|
||||
{
|
||||
d.__set_statistics(page_statistics.get(options));
|
||||
|
||||
if (s.max_def == 1 && s.max_rep == 0)
|
||||
d.statistics.__set_null_count(static_cast<Int64>(def_count - data_count));
|
||||
}
|
||||
|
||||
total_statistics.merge(page_statistics);
|
||||
page_statistics.clear();
|
||||
|
||||
if (use_dictionary)
|
||||
{
|
||||
dict_encoded_pages.push_back({.header = std::move(header)});
|
||||
std::swap(dict_encoded_pages.back().data, compressed);
|
||||
}
|
||||
else
|
||||
{
|
||||
writePage(header, compressed, s, out);
|
||||
}
|
||||
|
||||
def_offset += def_count;
|
||||
data_offset += data_count;
|
||||
};
|
||||
|
||||
auto flush_dict = [&] -> bool
|
||||
{
|
||||
auto * dict_encoder = dynamic_cast<parquet::DictEncoder<ParquetDType> *>(encoder.get());
|
||||
int dict_size = dict_encoder->dict_encoded_size();
|
||||
|
||||
encoded.resize(static_cast<size_t>(dict_size));
|
||||
dict_encoder->WriteDict(reinterpret_cast<uint8_t *>(encoded.data()));
|
||||
|
||||
auto & compressed = compress(encoded, compressed_maybe, s.compression);
|
||||
|
||||
if (compressed.size() > INT32_MAX)
|
||||
throw Exception(ErrorCodes::CANNOT_COMPRESS, "Compressed dictionary page is too big: {}", compressed.size());
|
||||
|
||||
parq::PageHeader header;
|
||||
header.__set_type(parq::PageType::DICTIONARY_PAGE);
|
||||
header.__set_uncompressed_page_size(dict_size);
|
||||
header.__set_compressed_page_size(static_cast<int>(compressed.size()));
|
||||
header.__isset.dictionary_page_header = true;
|
||||
header.dictionary_page_header.__set_num_values(dict_encoder->num_entries());
|
||||
header.dictionary_page_header.__set_encoding(parq::Encoding::PLAIN);
|
||||
|
||||
writePage(header, compressed, s, out);
|
||||
|
||||
for (auto & p : dict_encoded_pages)
|
||||
writePage(p.header, p.data, s, out);
|
||||
|
||||
dict_encoded_pages.clear();
|
||||
encoder.reset();
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
auto is_dict_too_big = [&] {
|
||||
auto * dict_encoder = dynamic_cast<parquet::DictEncoder<ParquetDType> *>(encoder.get());
|
||||
int dict_size = dict_encoder->dict_encoded_size();
|
||||
return static_cast<size_t>(dict_size) >= options.dictionary_size_limit;
|
||||
};
|
||||
|
||||
while (def_offset < num_values)
|
||||
{
|
||||
/// Pick enough data for a page.
|
||||
size_t next_def_offset = def_offset;
|
||||
size_t next_data_offset = data_offset;
|
||||
while (true)
|
||||
{
|
||||
/// Bite off a batch of defs and corresponding data values.
|
||||
size_t def_count = std::min(options.write_batch_size, num_values - next_def_offset);
|
||||
size_t data_count = 0;
|
||||
if (s.max_def == 0)
|
||||
data_count = def_count;
|
||||
else
|
||||
for (size_t i = 0; i < def_count; ++i)
|
||||
data_count += s.def[next_def_offset + i] == s.max_def;
|
||||
|
||||
/// Encode the data (but not the levels yet), so that we can estimate its encoded size.
|
||||
const typename ParquetDType::c_type * converted = converter.getBatch(next_data_offset, data_count);
|
||||
|
||||
if (options.write_page_statistics || options.write_column_chunk_statistics)
|
||||
/// Workaround for clang bug: https://github.com/llvm/llvm-project/issues/63630
|
||||
#ifdef MEMORY_SANITIZER
|
||||
#pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for (size_t i = 0; i < data_count; ++i)
|
||||
page_statistics.add(converted[i]);
|
||||
|
||||
encoder->Put(converted, static_cast<int>(data_count));
|
||||
|
||||
next_def_offset += def_count;
|
||||
next_data_offset += data_count;
|
||||
|
||||
if (use_dictionary && is_dict_too_big())
|
||||
{
|
||||
/// Fallback to non-dictionary encoding.
|
||||
///
|
||||
/// Discard encoded data and start over.
|
||||
/// This is different from what arrow does: arrow writes out the dictionary-encoded
|
||||
/// data, then uses non-dictionary encoding for later pages.
|
||||
/// Starting over seems better: it produces slightly smaller files (I saw 1-4%) in
|
||||
/// exchange for slight decrease in speed (I saw < 5%). This seems like a good
|
||||
/// trade because encoding speed is much less important than decoding (as evidenced
|
||||
/// by arrow not supporting parallel encoding, even though it's easy to support).
|
||||
|
||||
def_offset = 0;
|
||||
data_offset = 0;
|
||||
dict_encoded_pages.clear();
|
||||
use_dictionary = false;
|
||||
|
||||
#ifndef NDEBUG
|
||||
/// Arrow's DictEncoderImpl destructor asserts that FlushValues() was called, so we
|
||||
/// call it even though we don't need its output.
|
||||
encoder->FlushValues();
|
||||
#endif
|
||||
|
||||
encoder = parquet::MakeTypedEncoder<ParquetDType>(
|
||||
static_cast<parquet::Encoding::type>(encoding), /* use_dictionary */ false,
|
||||
fixed_string_descr ? &*fixed_string_descr : nullptr);
|
||||
break;
|
||||
}
|
||||
|
||||
if (next_def_offset == num_values ||
|
||||
static_cast<size_t>(encoder->EstimatedDataEncodedSize()) >= options.data_page_size)
|
||||
{
|
||||
flush_page(next_def_offset - def_offset, next_data_offset - data_offset);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (use_dictionary)
|
||||
flush_dict();
|
||||
|
||||
chassert(data_offset == s.primitive_column->size());
|
||||
|
||||
if (options.write_column_chunk_statistics)
|
||||
{
|
||||
s.column_chunk.meta_data.__set_statistics(total_statistics.get(options));
|
||||
|
||||
if (s.max_def == 1 && s.max_rep == 0)
|
||||
s.column_chunk.meta_data.statistics.__set_null_count(static_cast<Int64>(def_offset - data_offset));
|
||||
}
|
||||
|
||||
/// Report which encodings we've used.
|
||||
if (s.max_rep > 0 || s.max_def > 0)
|
||||
addToEncodingsUsed(s, parq::Encoding::RLE); // levels
|
||||
if (use_dictionary)
|
||||
{
|
||||
addToEncodingsUsed(s, parq::Encoding::PLAIN); // dictionary itself
|
||||
addToEncodingsUsed(s, parq::Encoding::RLE_DICTIONARY); // ids
|
||||
}
|
||||
else
|
||||
{
|
||||
addToEncodingsUsed(s, encoding);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void writeColumnChunkBody(ColumnChunkWriteState & s, const WriteOptions & options, WriteBuffer & out)
|
||||
{
|
||||
s.column_chunk.meta_data.__set_num_values(s.max_def > 0 ? s.def.size() : s.primitive_column->size());
|
||||
|
||||
/// We'll be updating these as we go.
|
||||
s.column_chunk.meta_data.__set_encodings({});
|
||||
s.column_chunk.meta_data.__set_total_compressed_size(0);
|
||||
s.column_chunk.meta_data.__set_total_uncompressed_size(0);
|
||||
s.column_chunk.meta_data.__set_data_page_offset(-1);
|
||||
|
||||
s.primitive_column = s.primitive_column->convertToFullColumnIfLowCardinality();
|
||||
|
||||
switch (s.primitive_column->getDataType())
|
||||
{
|
||||
/// Numeric conversion to Int32 or Int64.
|
||||
#define N(source_type, parquet_dtype) \
|
||||
writeColumnImpl<parquet::parquet_dtype>(s, options, out, \
|
||||
ConverterNumeric<ColumnVector<source_type>, parquet::parquet_dtype::c_type>( \
|
||||
s.primitive_column))
|
||||
|
||||
case TypeIndex::UInt8:
|
||||
if (s.is_bool)
|
||||
writeColumnImpl<parquet::BooleanType>(s, options, out,
|
||||
ConverterNumeric<ColumnVector<UInt8>, bool, bool>(s.primitive_column));
|
||||
else
|
||||
N(UInt8 , Int32Type);
|
||||
break;
|
||||
case TypeIndex::UInt16 : N(UInt16, Int32Type); break;
|
||||
case TypeIndex::UInt32 : N(UInt32, Int32Type); break;
|
||||
case TypeIndex::UInt64 : N(UInt64, Int64Type); break;
|
||||
case TypeIndex::Int8 : N(Int8 , Int32Type); break;
|
||||
case TypeIndex::Int16 : N(Int16 , Int32Type); break;
|
||||
case TypeIndex::Int32 : N(Int32 , Int32Type); break;
|
||||
case TypeIndex::Int64 : N(Int64 , Int64Type); break;
|
||||
|
||||
case TypeIndex::Enum8: N(Int8 , Int32Type); break;
|
||||
case TypeIndex::Enum16: N(Int16 , Int32Type); break;
|
||||
case TypeIndex::Date: N(UInt16, Int32Type); break;
|
||||
case TypeIndex::Date32: N(Int32 , Int32Type); break;
|
||||
case TypeIndex::DateTime: N(UInt32, Int32Type); break;
|
||||
|
||||
#undef N
|
||||
|
||||
case TypeIndex::Float32:
|
||||
writeColumnImpl<parquet::FloatType>(
|
||||
s, options, out, ConverterNumeric<ColumnVector<Float32>, Float32, Float32>(
|
||||
s.primitive_column));
|
||||
break;
|
||||
|
||||
case TypeIndex::Float64:
|
||||
writeColumnImpl<parquet::DoubleType>(
|
||||
s, options, out, ConverterNumeric<ColumnVector<Float64>, Float64, Float64>(
|
||||
s.primitive_column));
|
||||
break;
|
||||
|
||||
case TypeIndex::DateTime64:
|
||||
writeColumnImpl<parquet::Int64Type>(
|
||||
s, options, out, ConverterNumeric<ColumnDecimal<DateTime64>, Int64, Int64>(
|
||||
s.primitive_column));
|
||||
break;
|
||||
|
||||
case TypeIndex::IPv4:
|
||||
writeColumnImpl<parquet::Int32Type>(
|
||||
s, options, out, ConverterNumeric<ColumnVector<IPv4>, Int32, UInt32>(
|
||||
s.primitive_column));
|
||||
break;
|
||||
|
||||
case TypeIndex::String:
|
||||
writeColumnImpl<parquet::ByteArrayType>(
|
||||
s, options, out, ConverterString(s.primitive_column));
|
||||
break;
|
||||
|
||||
case TypeIndex::FixedString:
|
||||
if (options.output_fixed_string_as_fixed_byte_array)
|
||||
writeColumnImpl<parquet::FLBAType>(
|
||||
s, options, out, ConverterFixedString(s.primitive_column));
|
||||
else
|
||||
writeColumnImpl<parquet::ByteArrayType>(
|
||||
s, options, out, ConverterFixedStringAsString(s.primitive_column));
|
||||
break;
|
||||
|
||||
#define F(source_type) \
|
||||
writeColumnImpl<parquet::FLBAType>( \
|
||||
s, options, out, ConverterNumberAsFixedString<source_type>(s.primitive_column))
|
||||
case TypeIndex::UInt128: F(UInt128); break;
|
||||
case TypeIndex::UInt256: F(UInt256); break;
|
||||
case TypeIndex::Int128: F(Int128); break;
|
||||
case TypeIndex::Int256: F(Int256); break;
|
||||
case TypeIndex::IPv6: F(IPv6); break;
|
||||
#undef F
|
||||
|
||||
#define D(source_type) \
|
||||
writeColumnImpl<parquet::FLBAType>( \
|
||||
s, options, out, ConverterDecimal<source_type>(s.primitive_column))
|
||||
case TypeIndex::Decimal32: D(Decimal32); break;
|
||||
case TypeIndex::Decimal64: D(Decimal64); break;
|
||||
case TypeIndex::Decimal128: D(Decimal128); break;
|
||||
case TypeIndex::Decimal256: D(Decimal256); break;
|
||||
#undef D
|
||||
|
||||
default:
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected column type: {}", s.primitive_column->getFamilyName());
|
||||
}
|
||||
|
||||
/// Free some memory.
|
||||
s.primitive_column = {};
|
||||
s.def = {};
|
||||
s.rep = {};
|
||||
}
|
||||
|
||||
void writeFileHeader(WriteBuffer & out)
|
||||
{
|
||||
/// Write the magic bytes. We're a wizard now.
|
||||
out.write("PAR1", 4);
|
||||
}
|
||||
|
||||
parq::ColumnChunk finalizeColumnChunkAndWriteFooter(
|
||||
size_t offset_in_file, ColumnChunkWriteState s, const WriteOptions &, WriteBuffer & out)
|
||||
{
|
||||
if (s.column_chunk.meta_data.data_page_offset != -1)
|
||||
s.column_chunk.meta_data.data_page_offset += offset_in_file;
|
||||
if (s.column_chunk.meta_data.__isset.dictionary_page_offset)
|
||||
s.column_chunk.meta_data.dictionary_page_offset += offset_in_file;
|
||||
s.column_chunk.file_offset = offset_in_file + s.column_chunk.meta_data.total_compressed_size;
|
||||
|
||||
serializeThriftStruct(s.column_chunk, out);
|
||||
|
||||
return s.column_chunk;
|
||||
}
|
||||
|
||||
parq::RowGroup makeRowGroup(std::vector<parq::ColumnChunk> column_chunks, size_t num_rows)
|
||||
{
|
||||
parq::RowGroup r;
|
||||
r.__set_num_rows(num_rows);
|
||||
r.__set_columns(column_chunks);
|
||||
r.__set_total_compressed_size(0);
|
||||
for (auto & c : r.columns)
|
||||
{
|
||||
r.total_byte_size += c.meta_data.total_uncompressed_size;
|
||||
r.total_compressed_size += c.meta_data.total_compressed_size;
|
||||
}
|
||||
if (!r.columns.empty())
|
||||
{
|
||||
auto & m = r.columns[0].meta_data;
|
||||
r.__set_file_offset(m.__isset.dictionary_page_offset ? m.dictionary_page_offset : m.data_page_offset);
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
void writeFileFooter(std::vector<parq::RowGroup> row_groups, SchemaElements schema, const WriteOptions & options, WriteBuffer & out)
|
||||
{
|
||||
parq::FileMetaData meta;
|
||||
meta.version = 2;
|
||||
meta.schema = std::move(schema);
|
||||
meta.row_groups = std::move(row_groups);
|
||||
for (auto & r : meta.row_groups)
|
||||
meta.num_rows += r.num_rows;
|
||||
meta.__set_created_by(VERSION_NAME " " VERSION_DESCRIBE);
|
||||
|
||||
if (options.write_page_statistics || options.write_column_chunk_statistics)
|
||||
{
|
||||
meta.__set_column_orders({});
|
||||
for (auto & s : meta.schema)
|
||||
if (!s.__isset.num_children)
|
||||
meta.column_orders.emplace_back();
|
||||
for (auto & c : meta.column_orders)
|
||||
c.__set_TYPE_ORDER({});
|
||||
}
|
||||
|
||||
size_t footer_size = serializeThriftStruct(meta, out);
|
||||
|
||||
if (footer_size > INT32_MAX)
|
||||
throw Exception(ErrorCodes::LIMIT_EXCEEDED, "Parquet file metadata too big: {}", footer_size);
|
||||
|
||||
writeIntBinary(static_cast<int>(footer_size), out);
|
||||
out.write("PAR1", 4);
|
||||
}
|
||||
|
||||
}
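Taken together, writeFileHeader() and writeFileFooter() produce the standard Parquet envelope; the resulting byte layout (row-group contents abbreviated):

/// "PAR1"                                  4 bytes, leading magic   (writeFileHeader)
/// row group 0: column chunk pages + per-chunk ColumnMetaData footers
/// row group 1: ...
/// FileMetaData, Thrift compact-encoded    footer_size bytes        (writeFileFooter)
/// footer_size                             4 bytes, little-endian int32
/// "PAR1"                                  4 bytes, trailing magic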
src/Processors/Formats/Impl/Parquet/Write.h (new file, 136 lines)
@ -0,0 +1,136 @@
#pragma once

#include <Processors/Formats/Impl/Parquet/ThriftUtil.h>
#include <Columns/IColumn.h>
#include <DataTypes/IDataType.h>
#include <Common/PODArray.h>
#include <IO/CompressionMethod.h>

namespace DB::Parquet
{

/// A good resource for learning how Parquet format works is
/// contrib/arrow/cpp/src/parquet/parquet.thrift

struct WriteOptions
{
    bool output_string_as_string = false;
    bool output_fixed_string_as_fixed_byte_array = true;

    CompressionMethod compression = CompressionMethod::Lz4;

    size_t data_page_size = 1024 * 1024;
    size_t write_batch_size = 1024;

    bool use_dictionary_encoding = true;
    size_t dictionary_size_limit = 1024 * 1024;
    /// If using dictionary, this encoding is used as a fallback when dictionary gets too big.
    /// Otherwise, this is used for everything.
    parquet::format::Encoding::type encoding = parquet::format::Encoding::PLAIN;

    bool write_page_statistics = true;
    bool write_column_chunk_statistics = true;
    size_t max_statistics_size = 4096;
};

/// Information about a primitive column (leaf of the schema tree) to write to Parquet file.
struct ColumnChunkWriteState
{
    /// After writeColumnChunkBody(), offsets in this struct are relative to the start of column chunk.
    /// Then finalizeColumnChunkAndWriteFooter() fixes them up before writing to file.
    parquet::format::ColumnChunk column_chunk;

    ColumnPtr primitive_column;
    CompressionMethod compression; // must match what's inside column_chunk
    bool is_bool = false;

    /// Repetition and definition levels. Produced by prepareColumnForWrite().
    /// def is empty iff max_def == 0, which means no arrays or nullables.
    /// rep is empty iff max_rep == 0, which means no arrays.
    PaddedPODArray<UInt8> def; // definition levels
    PaddedPODArray<UInt8> rep; // repetition levels
    /// Max possible levels, according to schema. Actual max in def/rep may be smaller.
    UInt8 max_def = 0;
    UInt8 max_rep = 0;

    ColumnChunkWriteState() = default;
    /// Prevent accidental copying.
    ColumnChunkWriteState(ColumnChunkWriteState &&) = default;
    ColumnChunkWriteState & operator=(ColumnChunkWriteState &&) = default;

    /// Estimated memory usage.
    size_t allocatedBytes() const
    {
        size_t r = def.allocated_bytes() + rep.allocated_bytes();
        if (primitive_column)
            r += primitive_column->allocatedBytes();
        return r;
    }
};

using SchemaElements = std::vector<parquet::format::SchemaElement>;
using ColumnChunkWriteStates = std::vector<ColumnChunkWriteState>;

/// Parquet file consists of row groups, which consist of column chunks.
///
/// Column chunks can be encoded mostly independently of each other, in parallel.
/// But there are two small complications:
/// 1. One ClickHouse column can translate to multiple leaf columns in parquet.
///    E.g. tuples and maps.
///    If all primitive columns are in one big tuple, we'd like to encode them in parallel too,
///    even though they're one top-level ClickHouse column.
/// 2. At the end of each encoded column chunk there's a footer (struct ColumnMetaData) that
///    contains some absolute offsets in the file. We can't encode it until we know the exact
///    position in the file where the column chunk will go. So these footers have to be serialized
///    sequentially, after we know sizes of all previous column chunks.
///
/// With that in mind, here's how to write a parquet file:
///
/// (1) writeFileHeader()
/// (2) For each row group:
///  |  (3) For each ClickHouse column:
///  |      (4) Call prepareColumnForWrite().
///  |          It'll produce one or more ColumnChunkWriteStates, corresponding to primitive columns that
///  |          we need to write.
///  |          It'll also produce SchemaElements as a byproduct, describing the logical types and
///  |          groupings of the physical columns (e.g. tuples, arrays, maps).
///  |  (5) For each ColumnChunkWriteState:
///  |      (6) Call writeColumnChunkBody() to write the actual data to the given WriteBuffer.
///  |      (7) Call finalizeColumnChunkAndWriteFooter() to write the footer of the column chunk.
///  |  (8) Call makeRowGroup() using the ColumnChunk metadata structs from previous step.
/// (9) Call writeFileFooter() using the row groups from previous step and SchemaElements from
///     convertSchema().
///
/// Steps (4) and (6) can be parallelized, both within and across row groups.

/// Parquet schema is a tree of SchemaElements, flattened into a list in depth-first order.
/// Leaf nodes correspond to physical columns of primitive types. Inner nodes describe logical
/// groupings of those columns, e.g. tuples or structs.
SchemaElements convertSchema(const Block & sample, const WriteOptions & options);

void prepareColumnForWrite(
    ColumnPtr column, DataTypePtr type, const std::string & name, const WriteOptions & options,
    ColumnChunkWriteStates * out_columns_to_write, SchemaElements * out_schema = nullptr);

void writeFileHeader(WriteBuffer & out);

/// Encodes a column chunk, without the footer.
/// The ColumnChunkWriteState-s should then be passed to finalizeColumnChunkAndWriteFooter().
void writeColumnChunkBody(ColumnChunkWriteState & s, const WriteOptions & options, WriteBuffer & out);

/// Unlike most of the column chunk data, the footer (`ColumnMetaData`) needs to know its absolute
/// offset in the file. So we encode it separately, after all previous row groups and column chunks
/// have been encoded.
/// (If you're wondering if the 8-byte offset values can be patched inside the encoded blob - no,
/// they're varint-encoded and can't be padded to a fixed length.)
/// `offset_in_file` is the absolute position in the file where the writeColumnChunkBody()'s output
/// starts.
/// Returns a ColumnChunk to add to the RowGroup.
parquet::format::ColumnChunk finalizeColumnChunkAndWriteFooter(
    size_t offset_in_file, ColumnChunkWriteState s, const WriteOptions & options, WriteBuffer & out);

parquet::format::RowGroup makeRowGroup(std::vector<parquet::format::ColumnChunk> column_chunks, size_t num_rows);

void writeFileFooter(std::vector<parquet::format::RowGroup> row_groups, SchemaElements schema, const WriteOptions & options, WriteBuffer & out);

}
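To make the numbered workflow above concrete, a minimal single-threaded usage sketch of this header's API (the function name writeOneRowGroupFile is illustrative; error handling and the squashing/parallelism implemented in ParquetBlockOutputFormat are omitted; it assumes `out` starts at file position 0, otherwise subtract a base offset as the real caller does):

/// Assumed to live in namespace DB, with `using namespace DB::Parquet;` in scope.
void writeOneRowGroupFile(const Block & header, const Chunk & chunk, const WriteOptions & options, WriteBuffer & out)
{
    SchemaElements schema = convertSchema(header, options);   // logical + physical schema tree

    writeFileHeader(out);                                      // "PAR1"

    std::vector<parquet::format::ColumnChunk> column_chunks;
    for (size_t i = 0; i < header.columns(); ++i)
    {
        ColumnChunkWriteStates states;
        prepareColumnForWrite(
            chunk.getColumns()[i], header.getByPosition(i).type, header.getByPosition(i).name,
            options, &states);                                 // one state per primitive leaf column

        for (auto & state : states)
        {
            size_t offset = out.count();                       // where this chunk's data starts
            writeColumnChunkBody(state, options, out);         // pages (+ optional dictionary page)
            column_chunks.push_back(
                finalizeColumnChunkAndWriteFooter(offset, std::move(state), options, out));
        }
    }

    std::vector<parquet::format::RowGroup> row_groups;
    row_groups.push_back(makeRowGroup(std::move(column_chunks), chunk.getNumRows()));

    writeFileFooter(std::move(row_groups), std::move(schema), options, out);  // FileMetaData + length + "PAR1"
}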
src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp
@ -59,7 +59,12 @@ ParquetBlockInputFormat::ParquetBlockInputFormat(
         pool = std::make_unique<ThreadPool>(CurrentMetrics::ParquetDecoderThreads, CurrentMetrics::ParquetDecoderThreadsActive, max_decoding_threads);
 }
 
-ParquetBlockInputFormat::~ParquetBlockInputFormat() = default;
+ParquetBlockInputFormat::~ParquetBlockInputFormat()
+{
+    is_stopped = true;
+    if (pool)
+        pool->wait();
+}
 
 void ParquetBlockInputFormat::initializeIfNeeded()
 {
 
src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp
@ -3,14 +3,23 @@
|
||||
#if USE_PARQUET
|
||||
|
||||
#include <Formats/FormatFactory.h>
|
||||
#include <IO/WriteBufferFromVector.h>
|
||||
#include <parquet/arrow/writer.h>
|
||||
#include "ArrowBufferedStreams.h"
|
||||
#include "CHColumnToArrowColumn.h"
|
||||
|
||||
|
||||
namespace CurrentMetrics
|
||||
{
|
||||
extern const Metric ParquetEncoderThreads;
|
||||
extern const Metric ParquetEncoderThreadsActive;
|
||||
}
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
using namespace Parquet;
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int UNKNOWN_EXCEPTION;
|
||||
@ -59,19 +68,229 @@ namespace
|
||||
if (method == FormatSettings::ParquetCompression::GZIP)
|
||||
return parquet::Compression::type::GZIP;
|
||||
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported compression method");
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported parquet compression method");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
ParquetBlockOutputFormat::ParquetBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_)
|
||||
: IOutputFormat(header_, out_), format_settings{format_settings_}
|
||||
{
|
||||
if (format_settings.parquet.use_custom_encoder)
|
||||
{
|
||||
if (format_settings.parquet.parallel_encoding && format_settings.max_threads > 1)
|
||||
pool = std::make_unique<ThreadPool>(
|
||||
CurrentMetrics::ParquetEncoderThreads, CurrentMetrics::ParquetEncoderThreadsActive,
|
||||
format_settings.max_threads);
|
||||
|
||||
using C = FormatSettings::ParquetCompression;
|
||||
switch (format_settings.parquet.output_compression_method)
|
||||
{
|
||||
case C::NONE: options.compression = CompressionMethod::None; break;
|
||||
case C::SNAPPY: options.compression = CompressionMethod::Snappy; break;
|
||||
case C::ZSTD: options.compression = CompressionMethod::Zstd; break;
|
||||
case C::LZ4: options.compression = CompressionMethod::Lz4; break;
|
||||
case C::GZIP: options.compression = CompressionMethod::Gzip; break;
|
||||
case C::BROTLI: options.compression = CompressionMethod::Brotli; break;
|
||||
}
|
||||
options.output_string_as_string = format_settings.parquet.output_string_as_string;
|
||||
options.output_fixed_string_as_fixed_byte_array = format_settings.parquet.output_fixed_string_as_fixed_byte_array;
|
||||
options.data_page_size = format_settings.parquet.data_page_size;
|
||||
options.write_batch_size = format_settings.parquet.write_batch_size;
|
||||
|
||||
schema = convertSchema(header_, options);
|
||||
}
|
||||
}
|
||||
|
||||
void ParquetBlockOutputFormat::consumeStaged()
|
||||
ParquetBlockOutputFormat::~ParquetBlockOutputFormat()
|
||||
{
|
||||
const size_t columns_num = staging_chunks.at(0).getNumColumns();
|
||||
if (pool)
|
||||
{
|
||||
is_stopped = true;
|
||||
pool->wait();
|
||||
}
|
||||
}
|
||||
|
||||
void ParquetBlockOutputFormat::consume(Chunk chunk)
|
||||
{
|
||||
/// Poll background tasks.
|
||||
if (pool)
|
||||
{
|
||||
std::unique_lock lock(mutex);
|
||||
while (true)
|
||||
{
|
||||
/// If some row groups are ready to be written to the file, write them.
|
||||
reapCompletedRowGroups(lock);
|
||||
|
||||
if (background_exception)
|
||||
std::rethrow_exception(background_exception);
|
||||
|
||||
if (is_stopped)
|
||||
return;
|
||||
|
||||
/// If there's too much work in flight, wait for some of it to complete.
|
||||
if (row_groups.size() < 2)
|
||||
break;
|
||||
if (bytes_in_flight <= format_settings.parquet.row_group_bytes * 4 &&
|
||||
task_queue.size() <= format_settings.max_threads * 4)
|
||||
break;
|
||||
|
||||
condvar.wait(lock);
|
||||
}
|
||||
}
|
||||
|
||||
/// Do something like SquashingTransform to produce big enough row groups.
|
||||
/// Because the real SquashingTransform is only used for INSERT, not for SELECT ... INTO OUTFILE.
|
||||
/// The latter doesn't even have a pipeline where a transform could be inserted, so it's more
|
||||
/// convenient to do the squashing here. It's also parallelized here.
|
||||
|
||||
if (chunk.getNumRows() != 0)
|
||||
{
|
||||
staging_rows += chunk.getNumRows();
|
||||
staging_bytes += chunk.bytes();
|
||||
staging_chunks.push_back(std::move(chunk));
|
||||
}
|
||||
|
||||
const size_t target_rows = std::max(static_cast<UInt64>(1), format_settings.parquet.row_group_rows);
|
||||
|
||||
if (staging_rows < target_rows &&
|
||||
staging_bytes < format_settings.parquet.row_group_bytes)
|
||||
return;
|
||||
|
||||
/// In the rare case that more than `row_group_rows` rows arrived in one chunk, split the
|
||||
/// staging chunk into multiple row groups.
|
||||
if (staging_rows >= target_rows * 2)
|
||||
{
|
||||
/// Increase row group size slightly (by < 2x) to avoid a small row group at the end.
|
||||
size_t num_row_groups = std::max(static_cast<size_t>(1), staging_rows / target_rows);
|
||||
size_t row_group_size = (staging_rows - 1) / num_row_groups + 1; // round up
|
||||
|
||||
Chunk concatenated = std::move(staging_chunks[0]);
|
||||
for (size_t i = 1; i < staging_chunks.size(); ++i)
|
||||
concatenated.append(staging_chunks[i]);
|
||||
staging_chunks.clear();
|
||||
|
||||
for (size_t offset = 0; offset < staging_rows; offset += row_group_size)
|
||||
{
|
||||
size_t count = std::min(row_group_size, staging_rows - offset);
|
||||
MutableColumns columns = concatenated.cloneEmptyColumns();
|
||||
for (size_t i = 0; i < columns.size(); ++i)
|
||||
columns[i]->insertRangeFrom(*concatenated.getColumns()[i], offset, count);
|
||||
|
||||
Chunks piece;
|
||||
piece.emplace_back(std::move(columns), count, concatenated.getChunkInfo());
|
||||
writeRowGroup(std::move(piece));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
writeRowGroup(std::move(staging_chunks));
|
||||
}
|
||||
|
||||
staging_chunks.clear();
|
||||
staging_rows = 0;
|
||||
staging_bytes = 0;
|
||||
}
|
||||
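A worked example of the splitting arithmetic in consume() above (the numbers are illustrative):

/// Illustrative: target_rows = 1'000'000 and 2'500'000 staged rows have arrived.
/// staging_rows >= target_rows * 2, so the staged data is split:
///   num_row_groups = max(1, 2'500'000 / 1'000'000) = 2
///   row_group_size = (2'500'000 - 1) / 2 + 1       = 1'250'000   (rounded up)
/// giving two row groups of 1'250'000 rows instead of 1'000'000 + 1'000'000 + 500'000,
/// which is what the "increase row group size slightly (by < 2x)" comment refers to.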
|
||||
void ParquetBlockOutputFormat::finalizeImpl()
|
||||
{
|
||||
if (!staging_chunks.empty())
|
||||
writeRowGroup(std::move(staging_chunks));
|
||||
|
||||
if (format_settings.parquet.use_custom_encoder)
|
||||
{
|
||||
if (pool)
|
||||
{
|
||||
std::unique_lock lock(mutex);
|
||||
|
||||
/// Wait for background work to complete.
|
||||
while (true)
|
||||
{
|
||||
reapCompletedRowGroups(lock);
|
||||
|
||||
if (background_exception)
|
||||
std::rethrow_exception(background_exception);
|
||||
|
||||
if (is_stopped)
|
||||
return;
|
||||
|
||||
if (row_groups.empty())
|
||||
break;
|
||||
|
||||
condvar.wait(lock);
|
||||
}
|
||||
}
|
||||
|
||||
if (row_groups_complete.empty())
|
||||
{
|
||||
base_offset = out.count();
|
||||
writeFileHeader(out);
|
||||
}
|
||||
writeFileFooter(std::move(row_groups_complete), schema, options, out);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!file_writer)
|
||||
{
|
||||
Block header = materializeBlock(getPort(PortKind::Main).getHeader());
|
||||
std::vector<Chunk> chunks;
|
||||
chunks.push_back(Chunk(header.getColumns(), 0));
|
||||
writeRowGroup(std::move(chunks));
|
||||
}
|
||||
|
||||
if (file_writer)
|
||||
{
|
||||
auto status = file_writer->Close();
|
||||
if (!status.ok())
|
||||
throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while closing a table: {}", status.ToString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ParquetBlockOutputFormat::resetFormatterImpl()
|
||||
{
|
||||
if (pool)
|
||||
{
|
||||
is_stopped = true;
|
||||
pool->wait();
|
||||
is_stopped = false;
|
||||
}
|
||||
|
||||
background_exception = nullptr;
|
||||
threads_running = 0;
|
||||
task_queue.clear();
|
||||
row_groups.clear();
|
||||
file_writer.reset();
|
||||
row_groups_complete.clear();
|
||||
staging_chunks.clear();
|
||||
staging_rows = 0;
|
||||
staging_bytes = 0;
|
||||
}
|
||||
|
||||
void ParquetBlockOutputFormat::onCancel()
|
||||
{
|
||||
is_stopped = true;
|
||||
}
|
||||
|
||||
void ParquetBlockOutputFormat::writeRowGroup(std::vector<Chunk> chunks)
|
||||
{
|
||||
if (pool)
|
||||
writeRowGroupInParallel(std::move(chunks));
|
||||
else if (!format_settings.parquet.use_custom_encoder)
|
||||
writeUsingArrow(std::move(chunks));
|
||||
else
|
||||
{
|
||||
Chunk concatenated = std::move(chunks[0]);
|
||||
for (size_t i = 1; i < chunks.size(); ++i)
|
||||
concatenated.append(chunks[i]);
|
||||
chunks.clear();
|
||||
|
||||
writeRowGroupInOneThread(std::move(concatenated));
|
||||
}
|
||||
}
|
||||
|
||||
void ParquetBlockOutputFormat::writeUsingArrow(std::vector<Chunk> chunks)
|
||||
{
|
||||
const size_t columns_num = chunks.at(0).getNumColumns();
|
||||
std::shared_ptr<arrow::Table> arrow_table;
|
||||
|
||||
if (!ch_column_to_arrow_column)
|
||||
@ -85,7 +304,7 @@ void ParquetBlockOutputFormat::consumeStaged()
|
||||
format_settings.parquet.output_fixed_string_as_fixed_byte_array);
|
||||
}
|
||||
|
||||
ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, staging_chunks, columns_num);
|
||||
ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunks, columns_num);
|
||||
|
||||
if (!file_writer)
|
||||
{
|
||||
@ -112,64 +331,234 @@ void ParquetBlockOutputFormat::consumeStaged()
|
||||
file_writer = std::move(result.ValueOrDie());
|
||||
}
|
||||
|
||||
// TODO: calculate row_group_size depending on a number of rows and table size
|
||||
|
||||
// allow slightly bigger than row_group_size to avoid a very small tail row group
|
||||
auto status = file_writer->WriteTable(*arrow_table, std::max<size_t>(format_settings.parquet.row_group_rows, staging_rows));
|
||||
auto status = file_writer->WriteTable(*arrow_table, INT64_MAX);
|
||||
|
||||
if (!status.ok())
|
||||
throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while writing a table: {}", status.ToString());
|
||||
}
|
||||
|
||||
void ParquetBlockOutputFormat::consume(Chunk chunk)
|
||||
{
|
||||
/// Do something like SquashingTransform to produce big enough row groups.
|
||||
/// Because the real SquashingTransform is only used for INSERT, not for SELECT ... INTO OUTFILE.
|
||||
/// The latter doesn't even have a pipeline where a transform could be inserted, so it's more
|
||||
/// convenient to do the squashing here.
|
||||
staging_rows += chunk.getNumRows();
|
||||
staging_bytes += chunk.bytes();
|
||||
staging_chunks.push_back(std::move(chunk));
|
||||
chassert(staging_chunks.back().getNumColumns() == staging_chunks.front().getNumColumns());
|
||||
if (staging_rows < format_settings.parquet.row_group_rows &&
|
||||
staging_bytes < format_settings.parquet.row_group_bytes)
|
||||
void ParquetBlockOutputFormat::writeRowGroupInOneThread(Chunk chunk)
|
||||
{
|
||||
if (chunk.getNumRows() == 0)
|
||||
return;
|
||||
|
||||
const Block & header = getPort(PortKind::Main).getHeader();
|
||||
Parquet::ColumnChunkWriteStates columns_to_write;
|
||||
chassert(header.columns() == chunk.getNumColumns());
|
||||
for (size_t i = 0; i < header.columns(); ++i)
|
||||
prepareColumnForWrite(
|
||||
chunk.getColumns()[i], header.getByPosition(i).type, header.getByPosition(i).name,
|
||||
options, &columns_to_write);
|
||||
|
||||
if (row_groups_complete.empty())
|
||||
{
|
||||
base_offset = out.count();
|
||||
writeFileHeader(out);
|
||||
}
|
||||
|
||||
std::vector<parquet::format::ColumnChunk> column_chunks;
|
||||
for (auto & s : columns_to_write)
|
||||
{
|
||||
size_t offset = out.count() - base_offset;
|
||||
writeColumnChunkBody(s, options, out);
|
||||
auto c = finalizeColumnChunkAndWriteFooter(offset, std::move(s), options, out);
|
||||
column_chunks.push_back(std::move(c));
|
||||
}
|
||||
|
||||
auto r = makeRowGroup(std::move(column_chunks), chunk.getNumRows());
|
||||
row_groups_complete.push_back(std::move(r));
|
||||
}
|
||||
|
||||
void ParquetBlockOutputFormat::writeRowGroupInParallel(std::vector<Chunk> chunks)
|
||||
{
|
||||
std::unique_lock lock(mutex);
|
||||
|
||||
const Block & header = getPort(PortKind::Main).getHeader();
|
||||
|
||||
RowGroupState & r = row_groups.emplace_back();
|
||||
r.column_chunks.resize(header.columns());
|
||||
r.tasks_in_flight = r.column_chunks.size();
|
||||
|
||||
std::vector<Columns> columnses;
|
||||
for (auto & chunk : chunks)
|
||||
{
|
||||
chassert(header.columns() == chunk.getNumColumns());
|
||||
r.num_rows += chunk.getNumRows();
|
||||
columnses.push_back(chunk.detachColumns());
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < header.columns(); ++i)
|
||||
{
|
||||
Task & t = task_queue.emplace_back(&r, i, this);
|
||||
t.column_type = header.getByPosition(i).type;
|
||||
t.column_name = header.getByPosition(i).name;
|
||||
|
||||
/// Defer concatenating the columns to the threads.
|
||||
size_t bytes = 0;
|
||||
for (size_t j = 0; j < chunks.size(); ++j)
|
||||
{
|
||||
auto & col = columnses[j][i];
|
||||
bytes += col->allocatedBytes();
|
||||
t.column_pieces.push_back(std::move(col));
|
||||
}
|
||||
t.mem.set(bytes);
|
||||
}
|
||||
|
||||
startMoreThreadsIfNeeded(lock);
|
||||
}
|
||||
|
||||
void ParquetBlockOutputFormat::reapCompletedRowGroups(std::unique_lock<std::mutex> & lock)
|
||||
{
|
||||
while (!row_groups.empty() && row_groups.front().tasks_in_flight == 0 && !is_stopped)
|
||||
{
|
||||
RowGroupState & r = row_groups.front();
|
||||
|
||||
/// Write to the file.
|
||||
|
||||
lock.unlock();
|
||||
|
||||
if (row_groups_complete.empty())
|
||||
{
|
||||
base_offset = out.count();
|
||||
writeFileHeader(out);
|
||||
}
|
||||
|
||||
std::vector<parquet::format::ColumnChunk> metadata;
|
||||
for (auto & cols : r.column_chunks)
|
||||
{
|
||||
for (ColumnChunk & col : cols)
|
||||
{
|
||||
size_t offset = out.count() - base_offset;
|
||||
|
||||
out.write(col.serialized.data(), col.serialized.size());
|
||||
auto m = finalizeColumnChunkAndWriteFooter(offset, std::move(col.state), options, out);
|
||||
|
||||
metadata.push_back(std::move(m));
|
||||
}
|
||||
}
|
||||
|
||||
row_groups_complete.push_back(makeRowGroup(std::move(metadata), r.num_rows));
|
||||
|
||||
lock.lock();
|
||||
|
||||
row_groups.pop_front();
|
||||
}
|
||||
}
|
||||
|
||||
void ParquetBlockOutputFormat::startMoreThreadsIfNeeded(const std::unique_lock<std::mutex> &)
|
||||
{
|
||||
/// Speculate that all current threads are already working on tasks.
|
||||
size_t to_add = std::min(task_queue.size(), format_settings.max_threads - threads_running);
|
||||
for (size_t i = 0; i < to_add; ++i)
|
||||
{
|
||||
auto job = [this, thread_group = CurrentThread::getGroup()]()
|
||||
{
|
||||
if (thread_group)
|
||||
CurrentThread::attachToGroupIfDetached(thread_group);
|
||||
SCOPE_EXIT_SAFE(if (thread_group) CurrentThread::detachFromGroupIfNotDetached(););
|
||||
|
||||
try
|
||||
{
|
||||
setThreadName("ParquetEncoder");
|
||||
|
||||
threadFunction();
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
background_exception = std::current_exception();
|
||||
condvar.notify_all();
|
||||
--threads_running;
|
||||
}
|
||||
};
|
||||
|
||||
if (threads_running == 0)
|
||||
{
|
||||
/// First thread. We need it to succeed; otherwise we may get stuck.
|
||||
pool->scheduleOrThrowOnError(job);
|
||||
++threads_running;
|
||||
}
|
||||
else
|
||||
{
|
||||
consumeStaged();
|
||||
staging_chunks.clear();
|
||||
staging_rows = 0;
|
||||
staging_bytes = 0;
|
||||
/// More threads. This may be called from inside the thread pool, so avoid waiting;
|
||||
/// otherwise it may deadlock.
|
||||
if (!pool->trySchedule(job))
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ParquetBlockOutputFormat::finalizeImpl()
{
    if (!file_writer && staging_chunks.empty())
    {
        Block header = materializeBlock(getPort(PortKind::Main).getHeader());

        consume(Chunk(header.getColumns(), 0)); // this will make staging_chunks non-empty
    }

    if (!staging_chunks.empty())
    {
        consumeStaged();
        staging_chunks.clear();
        staging_rows = 0;
        staging_bytes = 0;
    }

    auto status = file_writer->Close();
    if (!status.ok())
        throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while closing a table: {}", status.ToString());
}

void ParquetBlockOutputFormat::resetFormatterImpl()
{
    file_writer.reset();
}

void ParquetBlockOutputFormat::threadFunction()
{
    std::unique_lock lock(mutex);

    while (true)
    {
        if (task_queue.empty() || is_stopped)
        {
            /// The check and the decrement need to be in the same critical section, to make sure
            /// we never get stuck with tasks but no threads.
            --threads_running;
            return;
        }

        auto task = std::move(task_queue.front());
        task_queue.pop_front();

        if (task.column_type)
        {
            lock.unlock();

            IColumn::MutablePtr concatenated = IColumn::mutate(std::move(task.column_pieces[0]));
            for (size_t i = 1; i < task.column_pieces.size(); ++i)
            {
                auto & c = task.column_pieces[i];
                concatenated->insertRangeFrom(*c, 0, c->size());
                c.reset();
            }
            task.column_pieces.clear();

            std::vector<ColumnChunkWriteState> subcolumns;
            prepareColumnForWrite(
                std::move(concatenated), task.column_type, task.column_name, options, &subcolumns);

            lock.lock();

            for (size_t i = 0; i < subcolumns.size(); ++i)
            {
                task.row_group->column_chunks[task.column_idx].emplace_back(this);
                task.row_group->tasks_in_flight += 1;

                auto & t = task_queue.emplace_back(task.row_group, task.column_idx, this);
                t.subcolumn_idx = i;
                t.state = std::move(subcolumns[i]);
                t.mem.set(t.state.allocatedBytes());
            }

            startMoreThreadsIfNeeded(lock);
        }
        else
        {
            lock.unlock();

            PODArray<char> serialized;
            {
                WriteBufferFromVector buf(serialized);
                writeColumnChunkBody(task.state, options, buf);
            }

            lock.lock();

            auto & c = task.row_group->column_chunks[task.column_idx][task.subcolumn_idx];
            c.state = std::move(task.state);
            c.serialized = std::move(serialized);
            c.mem.set(c.serialized.size() + c.state.allocatedBytes());
        }

        --task.row_group->tasks_in_flight;

        condvar.notify_all();
    }
}
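threadFunction() keeps the queue check and the threads_running decrement in one critical section, so a producer that observes threads_running > 0 can rely on a freshly pushed task being picked up. A self-contained sketch of that invariant with standard library types only (the Queue type and its members are illustrative, not ClickHouse code):

```cpp
#include <deque>
#include <mutex>
#include <thread>
#include <vector>

struct Queue
{
    std::mutex mutex;
    std::deque<int> tasks;
    size_t workers_running = 0;

    /// Producer side: if no worker is alive, one must be started while the new
    /// task is already visible, otherwise the task may never run.
    void push(int task, std::vector<std::thread> & workers)
    {
        std::lock_guard lock(mutex);
        tasks.push_back(task);
        if (workers_running == 0)
        {
            ++workers_running;
            workers.emplace_back([this] { run(); });
        }
    }

    /// Worker side: the emptiness check and the decrement share one critical
    /// section, so "tasks exist but no worker will ever see them" cannot happen.
    void run()
    {
        std::unique_lock lock(mutex);
        while (true)
        {
            if (tasks.empty())
            {
                --workers_running;
                return;
            }
            int task = tasks.front();
            tasks.pop_front();

            lock.unlock();
            (void)task;            // ... process the task outside the lock ...
            lock.lock();
        }
    }
};

int main()
{
    Queue q;
    std::vector<std::thread> workers;
    for (int i = 0; i < 10; ++i)
        q.push(i, workers);
    for (auto & t : workers)
        t.join();
    return 0;
}
```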
|
||||
|
||||
void registerOutputFormatParquet(FormatFactory & factory)
|
||||
|
@ -2,8 +2,11 @@
|
||||
#include "config.h"
|
||||
|
||||
#if USE_PARQUET
|
||||
|
||||
#include <Processors/Formats/IOutputFormat.h>
|
||||
#include <Processors/Formats/Impl/Parquet/Write.h>
|
||||
#include <Formats/FormatSettings.h>
|
||||
#include <Common/ThreadPool.h>
|
||||
|
||||
namespace arrow
|
||||
{
|
||||
@ -28,25 +31,129 @@ class ParquetBlockOutputFormat : public IOutputFormat
|
||||
{
|
||||
public:
|
||||
ParquetBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_);
|
||||
~ParquetBlockOutputFormat() override;
|
||||
|
||||
String getName() const override { return "ParquetBlockOutputFormat"; }
|
||||
|
||||
String getContentType() const override { return "application/octet-stream"; }
|
||||
|
||||
private:
|
||||
void consumeStaged();
|
||||
struct MemoryToken
|
||||
{
|
||||
ParquetBlockOutputFormat * parent;
|
||||
size_t bytes = 0;
|
||||
|
||||
explicit MemoryToken(ParquetBlockOutputFormat * p, size_t b = 0) : parent(p)
|
||||
{
|
||||
set(b);
|
||||
}
|
||||
|
||||
MemoryToken(MemoryToken && t)
|
||||
: parent(std::exchange(t.parent, nullptr)), bytes(std::exchange(t.bytes, 0)) {}
|
||||
|
||||
MemoryToken & operator=(MemoryToken && t)
|
||||
{
|
||||
parent = std::exchange(t.parent, nullptr);
|
||||
bytes = std::exchange(t.bytes, 0);
|
||||
return *this;
|
||||
}
|
||||
|
||||
~MemoryToken()
|
||||
{
|
||||
set(0);
|
||||
}
|
||||
|
||||
void set(size_t new_size)
|
||||
{
|
||||
if (new_size == bytes)
|
||||
return;
|
||||
parent->bytes_in_flight += new_size - bytes; // overflow is fine
|
||||
bytes = new_size;
|
||||
}
|
||||
};
|
||||
|
||||
struct ColumnChunk
|
||||
{
|
||||
Parquet::ColumnChunkWriteState state;
|
||||
PODArray<char> serialized;
|
||||
|
||||
MemoryToken mem;
|
||||
|
||||
ColumnChunk(ParquetBlockOutputFormat * p) : mem(p) {}
|
||||
};
|
||||
|
||||
struct RowGroupState
|
||||
{
|
||||
size_t tasks_in_flight = 0;
|
||||
std::vector<std::vector<ColumnChunk>> column_chunks;
|
||||
size_t num_rows = 0;
|
||||
};
|
||||
|
||||
struct Task
|
||||
{
|
||||
RowGroupState * row_group;
|
||||
size_t column_idx;
|
||||
size_t subcolumn_idx = 0;
|
||||
|
||||
MemoryToken mem;
|
||||
|
||||
/// If not null, we need to call prepareColumnForWrite().
|
||||
/// Otherwise we need to call writeColumnChunkBody().
|
||||
DataTypePtr column_type;
|
||||
std::string column_name;
|
||||
std::vector<ColumnPtr> column_pieces;
|
||||
|
||||
Parquet::ColumnChunkWriteState state;
|
||||
|
||||
Task(RowGroupState * rg, size_t ci, ParquetBlockOutputFormat * p)
|
||||
: row_group(rg), column_idx(ci), mem(p) {}
|
||||
};
|
||||
|
||||
void consume(Chunk) override;
|
||||
void finalizeImpl() override;
|
||||
void resetFormatterImpl() override;
|
||||
void onCancel() override;
|
||||
|
||||
void writeRowGroup(std::vector<Chunk> chunks);
|
||||
void writeUsingArrow(std::vector<Chunk> chunks);
|
||||
void writeRowGroupInOneThread(Chunk chunk);
|
||||
void writeRowGroupInParallel(std::vector<Chunk> chunks);
|
||||
|
||||
void threadFunction();
|
||||
void startMoreThreadsIfNeeded(const std::unique_lock<std::mutex> & lock);
|
||||
|
||||
/// Called in single-threaded fashion. Writes to the file.
|
||||
void reapCompletedRowGroups(std::unique_lock<std::mutex> & lock);
|
||||
|
||||
const FormatSettings format_settings;
|
||||
|
||||
/// Chunks to squash together to form a row group.
|
||||
std::vector<Chunk> staging_chunks;
|
||||
size_t staging_rows = 0;
|
||||
size_t staging_bytes = 0;
|
||||
|
||||
const FormatSettings format_settings;
|
||||
|
||||
std::unique_ptr<parquet::arrow::FileWriter> file_writer;
|
||||
std::unique_ptr<CHColumnToArrowColumn> ch_column_to_arrow_column;
|
||||
|
||||
Parquet::WriteOptions options;
|
||||
Parquet::SchemaElements schema;
|
||||
std::vector<parquet::format::RowGroup> row_groups_complete;
|
||||
size_t base_offset = 0;
|
||||
|
||||
|
||||
std::mutex mutex;
|
||||
std::condition_variable condvar; // wakes up consume()
|
||||
std::unique_ptr<ThreadPool> pool;
|
||||
|
||||
std::atomic_bool is_stopped{false};
|
||||
std::exception_ptr background_exception = nullptr;
|
||||
|
||||
/// Invariant: if there's at least one task then there's at least one thread.
|
||||
size_t threads_running = 0;
|
||||
std::atomic<size_t> bytes_in_flight{0};
|
||||
|
||||
std::deque<Task> task_queue;
|
||||
std::deque<RowGroupState> row_groups;
|
||||
};
|
||||
|
||||
}
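The MemoryToken above is manual memory accounting: each staged or encoded column chunk holds a token, the token charges its size against the shared bytes_in_flight counter, and whatever it charged is given back when the token is resized or destroyed, so the producer can throttle on a single atomic. A stripped-down sketch of the same accounting idea, assuming nothing beyond the standard library (Budget and the main() usage are hypothetical):

```cpp
#include <atomic>
#include <cstddef>
#include <utility>

/// Shared budget that all tokens charge against.
struct Budget
{
    std::atomic<size_t> bytes_in_flight{0};
};

/// RAII accounting token: whatever it charged is automatically released.
class MemoryToken
{
public:
    explicit MemoryToken(Budget * budget, size_t bytes = 0) : budget_(budget) { set(bytes); }

    MemoryToken(MemoryToken && other) noexcept
        : budget_(std::exchange(other.budget_, nullptr)), bytes_(std::exchange(other.bytes_, 0)) {}

    ~MemoryToken()
    {
        if (budget_)
            set(0);
    }

    void set(size_t new_bytes)
    {
        /// Unsigned wrap-around of the difference is fine for the atomic add,
        /// mirroring the "overflow is fine" note in the header above.
        budget_->bytes_in_flight += new_bytes - bytes_;
        bytes_ = new_bytes;
    }

private:
    Budget * budget_;
    size_t bytes_ = 0;
};

int main()
{
    Budget budget;
    {
        MemoryToken chunk(&budget, 1 << 20);     // charge 1 MiB for a buffered chunk
        chunk.set(256 << 10);                    // chunk got compressed, shrink the charge
    }                                            // token died, the budget is released
    return budget.bytes_in_flight.load() == 0 ? 0 : 1;   // expect 0
}
```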
|
||||
|
src/Server/ServerType.cpp (new file, 138 lines)
@ -0,0 +1,138 @@
|
||||
#include <Server/ServerType.h>
|
||||
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <base/types.h>
|
||||
|
||||
#include <magic_enum.hpp>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace
|
||||
{
|
||||
std::vector<std::string> getTypeIndexToTypeName()
|
||||
{
|
||||
constexpr std::size_t types_size = magic_enum::enum_count<ServerType::Type>();
|
||||
|
||||
std::vector<std::string> type_index_to_type_name;
|
||||
type_index_to_type_name.resize(types_size);
|
||||
|
||||
auto entries = magic_enum::enum_entries<ServerType::Type>();
|
||||
for (const auto & [entry, str] : entries)
|
||||
{
|
||||
auto str_copy = String(str);
|
||||
std::replace(str_copy.begin(), str_copy.end(), '_', ' ');
|
||||
type_index_to_type_name[static_cast<UInt64>(entry)] = std::move(str_copy);
|
||||
}
|
||||
|
||||
return type_index_to_type_name;
|
||||
}
|
||||
}
|
||||
|
||||
const char * ServerType::serverTypeToString(ServerType::Type type)
|
||||
{
|
||||
/** During parsing, if a SystemQuery is not parsed properly, its description is added to the Expected variants (see IParser.h).
  * The description string must therefore be statically allocated.
  */
|
||||
static std::vector<std::string> type_index_to_type_name = getTypeIndexToTypeName();
|
||||
const auto & type_name = type_index_to_type_name[static_cast<UInt64>(type)];
|
||||
return type_name.data();
|
||||
}
|
||||
|
||||
bool ServerType::shouldStart(Type server_type, const std::string & custom_name_) const
|
||||
{
|
||||
if (type == Type::QUERIES_ALL)
|
||||
return true;
|
||||
|
||||
if (type == Type::QUERIES_DEFAULT)
|
||||
{
|
||||
switch (server_type)
|
||||
{
|
||||
case Type::TCP:
|
||||
case Type::TCP_WITH_PROXY:
|
||||
case Type::TCP_SECURE:
|
||||
case Type::HTTP:
|
||||
case Type::HTTPS:
|
||||
case Type::MYSQL:
|
||||
case Type::GRPC:
|
||||
case Type::POSTGRESQL:
|
||||
case Type::PROMETHEUS:
|
||||
case Type::INTERSERVER_HTTP:
|
||||
case Type::INTERSERVER_HTTPS:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (type == Type::QUERIES_CUSTOM)
|
||||
{
|
||||
switch (server_type)
|
||||
{
|
||||
case Type::CUSTOM:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return type == server_type && custom_name == custom_name_;
|
||||
}
|
||||
|
||||
bool ServerType::shouldStop(const std::string & port_name) const
|
||||
{
|
||||
Type port_type;
|
||||
std::string port_custom_name;
|
||||
|
||||
if (port_name == "http_port")
|
||||
port_type = Type::HTTP;
|
||||
|
||||
else if (port_name == "https_port")
|
||||
port_type = Type::HTTPS;
|
||||
|
||||
else if (port_name == "tcp_port")
|
||||
port_type = Type::TCP;
|
||||
|
||||
else if (port_name == "tcp_with_proxy_port")
|
||||
port_type = Type::TCP_WITH_PROXY;
|
||||
|
||||
else if (port_name == "tcp_port_secure")
|
||||
port_type = Type::TCP_SECURE;
|
||||
|
||||
else if (port_name == "mysql_port")
|
||||
port_type = Type::MYSQL;
|
||||
|
||||
else if (port_name == "postgresql_port")
|
||||
port_type = Type::POSTGRESQL;
|
||||
|
||||
else if (port_name == "grpc_port")
|
||||
port_type = Type::GRPC;
|
||||
|
||||
else if (port_name == "prometheus.port")
|
||||
port_type = Type::PROMETHEUS;
|
||||
|
||||
else if (port_name == "interserver_http_port")
|
||||
port_type = Type::INTERSERVER_HTTP;
|
||||
|
||||
else if (port_name == "interserver_https_port")
|
||||
port_type = Type::INTERSERVER_HTTPS;
|
||||
|
||||
else if (port_name.starts_with("protocols.") && port_name.ends_with(".port"))
|
||||
{
|
||||
constexpr size_t protocols_size = std::string_view("protocols.").size();
|
||||
constexpr size_t port_size = std::string_view("protocols.").size();
|
||||
|
||||
port_type = Type::CUSTOM;
|
||||
port_custom_name = port_name.substr(protocols_size, port_name.size() - port_size);
|
||||
}
|
||||
else
|
||||
port_type = Type::UNKNOWN;
|
||||
|
||||
if (port_type == Type::UNKNOWN)
|
||||
return false;
|
||||
|
||||
return shouldStart(type, port_custom_name);
|
||||
}
|
||||
|
||||
}
|
src/Server/ServerType.h (new file, 44 lines)
@ -0,0 +1,44 @@
|
||||
#pragma once
|
||||
|
||||
#include <base/types.h>
|
||||
namespace DB
|
||||
{
|
||||
|
||||
class ServerType
|
||||
{
|
||||
public:
|
||||
|
||||
enum Type
|
||||
{
|
||||
UNKNOWN,
|
||||
TCP,
|
||||
TCP_WITH_PROXY,
|
||||
TCP_SECURE,
|
||||
HTTP,
|
||||
HTTPS,
|
||||
MYSQL,
|
||||
GRPC,
|
||||
POSTGRESQL,
|
||||
PROMETHEUS,
|
||||
CUSTOM,
|
||||
INTERSERVER_HTTP,
|
||||
INTERSERVER_HTTPS,
|
||||
QUERIES_ALL,
|
||||
QUERIES_DEFAULT,
|
||||
QUERIES_CUSTOM,
|
||||
END
|
||||
};
|
||||
|
||||
ServerType() = default;
|
||||
explicit ServerType(Type type_, const std::string & custom_name_ = "") : type(type_), custom_name(custom_name_) {}
|
||||
|
||||
static const char * serverTypeToString(Type type);
|
||||
|
||||
bool shouldStart(Type server_type, const std::string & custom_name_ = "") const;
|
||||
bool shouldStop(const std::string & port_name) const;
|
||||
|
||||
Type type;
|
||||
std::string custom_name;
|
||||
};
|
||||
|
||||
}
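Presumably a query such as SYSTEM STOP LISTEN QUERIES DEFAULT is carried around as a ServerType with Type::QUERIES_DEFAULT, and each listener then asks shouldStart()/shouldStop() whether it is affected. A hedged usage sketch of the class as declared above (the main() scaffolding is illustrative and assumes the ClickHouse include paths):

```cpp
#include <Server/ServerType.h>
#include <iostream>

using DB::ServerType;

int main()
{
    // A command targeting the built-in protocol group only.
    ServerType stop_default(ServerType::Type::QUERIES_DEFAULT);

    std::cout << ServerType::serverTypeToString(ServerType::Type::TCP_WITH_PROXY) << '\n'; // "TCP WITH PROXY"
    std::cout << stop_default.shouldStart(ServerType::Type::HTTP) << '\n';                 // 1: HTTP is a default protocol
    std::cout << stop_default.shouldStart(ServerType::Type::CUSTOM, "my_proto") << '\n';   // 0: custom protocols are excluded
    std::cout << stop_default.shouldStop("mysql_port") << '\n';                            // 1: the port name maps to Type::MYSQL
    return 0;
}
```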
|
@ -243,6 +243,15 @@ void GinIndexStore::finalize()
|
||||
{
|
||||
if (!current_postings.empty())
|
||||
writeSegment();
|
||||
|
||||
if (metadata_file_stream)
|
||||
metadata_file_stream->finalize();
|
||||
|
||||
if (dict_file_stream)
|
||||
dict_file_stream->finalize();
|
||||
|
||||
if (postings_file_stream)
|
||||
postings_file_stream->finalize();
|
||||
}
|
||||
|
||||
void GinIndexStore::initFileStreams()
|
||||
@ -319,13 +328,8 @@ void GinIndexStore::writeSegment()
|
||||
current_segment.segment_id = getNextSegmentID();
|
||||
|
||||
metadata_file_stream->sync();
|
||||
metadata_file_stream->finalize();
|
||||
|
||||
dict_file_stream->sync();
|
||||
dict_file_stream->finalize();
|
||||
|
||||
postings_file_stream->sync();
|
||||
postings_file_stream->finalize();
|
||||
}
|
||||
|
||||
GinIndexStoreDeserializer::GinIndexStoreDeserializer(const GinIndexStorePtr & store_)
|
||||
|
@ -328,7 +328,10 @@ MergeTreePrefetchedReadPool::PartsInfos MergeTreePrefetchedReadPool::getPartsInf
|
||||
for (const auto & range : part.ranges)
|
||||
part_info->sum_marks += range.end - range.begin;
|
||||
|
||||
part_info->approx_size_of_mark = getApproximateSizeOfGranule(*part_info->data_part, column_names);
|
||||
const auto & columns = settings.merge_tree_determine_task_size_by_prewhere_columns && prewhere_info
|
||||
? prewhere_info->prewhere_actions->getRequiredColumnsNames()
|
||||
: column_names;
|
||||
part_info->approx_size_of_mark = getApproximateSizeOfGranule(*part_info->data_part, columns);
|
||||
|
||||
const auto task_columns = getReadTaskColumns(
|
||||
part_reader_info,
|
||||
@ -369,9 +372,9 @@ MergeTreePrefetchedReadPool::PartsInfos MergeTreePrefetchedReadPool::getPartsInf
|
||||
}
|
||||
if (prewhere_info)
|
||||
{
|
||||
for (const auto & columns : task_columns.pre_columns)
|
||||
for (const auto & cols : task_columns.pre_columns)
|
||||
{
|
||||
for (const auto & col : columns)
|
||||
for (const auto & col : cols)
|
||||
{
|
||||
const size_t col_size = part.data_part->getColumnSize(col.name).data_compressed;
|
||||
part_info->estimated_memory_usage_for_single_prefetch += std::min<size_t>(col_size, settings.prefetch_buffer_size);
|
||||
|
@ -73,8 +73,10 @@ MergeTreeReadPool::MergeTreeReadPool(
|
||||
size_t total_marks = 0;
|
||||
for (const auto & part : parts_ranges)
|
||||
{
|
||||
total_compressed_bytes += getApproxSizeOfPart(
|
||||
*part.data_part, prewhere_info ? prewhere_info->prewhere_actions->getRequiredColumnsNames() : column_names_);
|
||||
const auto & columns = settings.merge_tree_determine_task_size_by_prewhere_columns && prewhere_info
|
||||
? prewhere_info->prewhere_actions->getRequiredColumnsNames()
|
||||
: column_names_;
|
||||
total_compressed_bytes += getApproxSizeOfPart(*part.data_part, columns);
|
||||
total_marks += part.getMarksCount();
|
||||
}
|
||||
|
||||
|
@ -156,7 +156,7 @@ public:
|
||||
void checkTableCanBeDropped() const override {}
|
||||
|
||||
private:
|
||||
mutable std::mutex nested_mutex;
|
||||
mutable std::recursive_mutex nested_mutex;
|
||||
mutable GetNestedStorageFunc get_nested;
|
||||
mutable StoragePtr nested;
|
||||
const bool add_conversion;
|
||||
|
@ -128,6 +128,7 @@
|
||||
02581_share_big_sets_between_mutation_tasks_long
|
||||
02581_share_big_sets_between_multiple_mutations_tasks_long
|
||||
00992_system_parts_race_condition_zookeeper_long
|
||||
02818_parameterized_view_with_cte_multiple_usage
|
||||
02790_optimize_skip_unused_shards_join
|
||||
01940_custom_tld_sharding_key
|
||||
02815_range_dict_no_direct_join
|
||||
|
@ -0,0 +1,16 @@
|
||||
<clickhouse>
|
||||
<remote_servers>
|
||||
<default>
|
||||
<shard>
|
||||
<replica>
|
||||
<host>node1</host>
|
||||
<port>9000</port>
|
||||
</replica>
|
||||
<replica>
|
||||
<host>node2</host>
|
||||
<port>9000</port>
|
||||
</replica>
|
||||
</shard>
|
||||
</default>
|
||||
</remote_servers>
|
||||
</clickhouse>
|
tests/integration/test_system_start_stop_listen/test.py (new file, 40 lines)
@ -0,0 +1,40 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
|
||||
import pytest
|
||||
import time
|
||||
from helpers.cluster import ClickHouseCluster
|
||||
from helpers.network import PartitionManager
|
||||
from helpers.test_tools import assert_eq_with_retry
|
||||
import random
|
||||
import string
|
||||
import json
|
||||
|
||||
cluster = ClickHouseCluster(__file__)
|
||||
node1 = cluster.add_instance(
|
||||
"node1", main_configs=["configs/cluster.xml"], with_zookeeper=True
|
||||
)
|
||||
node2 = cluster.add_instance(
|
||||
"node2", main_configs=["configs/cluster.xml"], with_zookeeper=True
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def started_cluster():
|
||||
try:
|
||||
cluster.start()
|
||||
|
||||
yield cluster
|
||||
|
||||
finally:
|
||||
cluster.shutdown()
|
||||
|
||||
|
||||
def test_system_start_stop_listen_queries(started_cluster):
|
||||
node1.query("SYSTEM STOP LISTEN QUERIES ALL")
|
||||
|
||||
assert "Connection refused" in node1.query_and_get_error("SELECT 1", timeout=3)
|
||||
|
||||
node2.query("SYSTEM START LISTEN ON CLUSTER default QUERIES ALL")
|
||||
|
||||
node1.query("SELECT 1")
|
@ -7,7 +7,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "drop table if exists mt"
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "create table mt (n int) engine=MergeTree order by n settings parts_to_throw_insert=1000"
|
||||
$CLICKHOUSE_CLIENT -q "create table mt (n int) engine=MergeTree order by n settings parts_to_throw_insert=5000"
|
||||
$CLICKHOUSE_CLIENT -q "insert into mt values (1)"
|
||||
$CLICKHOUSE_CLIENT -q "insert into mt values (2)"
|
||||
$CLICKHOUSE_CLIENT -q "insert into mt values (3)"
|
||||
|
@ -138,6 +138,7 @@ SYSTEM FLUSH [] \N SYSTEM
|
||||
SYSTEM THREAD FUZZER ['SYSTEM START THREAD FUZZER','SYSTEM STOP THREAD FUZZER','START THREAD FUZZER','STOP THREAD FUZZER'] GLOBAL SYSTEM
|
||||
SYSTEM UNFREEZE ['SYSTEM UNFREEZE'] GLOBAL SYSTEM
|
||||
SYSTEM FAILPOINT ['SYSTEM ENABLE FAILPOINT','SYSTEM DISABLE FAILPOINT'] GLOBAL SYSTEM
|
||||
SYSTEM LISTEN ['SYSTEM START LISTEN','SYSTEM STOP LISTEN'] GLOBAL SYSTEM
|
||||
SYSTEM [] \N ALL
|
||||
dictGet ['dictHas','dictGetHierarchy','dictIsIn'] DICTIONARY ALL
|
||||
displaySecretsInShowAndSelect [] GLOBAL ALL
|
||||
|
@ -2,5 +2,7 @@ CreatedReadBufferMMap
|
||||
CreatedReadBufferMMapFailed
|
||||
MMappedFileCacheHits
|
||||
MMappedFileCacheMisses
|
||||
MMappedAllocBytes
|
||||
MMappedAllocs
|
||||
MMappedFileBytes
|
||||
MMappedFiles
|
||||
|
@ -17,4 +17,3 @@ with client(name="client1>", log=log) as client1:
|
||||
client1.send("SELECT number FROM numbers(1000) FORMAT Null")
|
||||
client1.expect("Progress: 1\.00 thousand rows, 8\.00 KB .*" + end_of_block)
|
||||
client1.expect("0 rows in set. Elapsed: [\\w]{1}\.[\\w]{3} sec.")
|
||||
client1.expect("Peak memory usage \(for query\) .*B" + end_of_block)
|
||||
|
@ -9,7 +9,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
${CLICKHOUSE_CLIENT} --multiquery --query "DROP TABLE IF EXISTS t; CREATE TABLE t (x UInt64) ENGINE = Memory;"
|
||||
|
||||
# The rate limit is chosen so that the operation spends more than one second.
|
||||
seq 1 1000 | pv --quiet --rate-limit 1000 | ${CLICKHOUSE_CLIENT} --query "INSERT INTO t FORMAT TSV"
|
||||
seq 1 1000 | pv --quiet --rate-limit 500 | ${CLICKHOUSE_CLIENT} --query "INSERT INTO t FORMAT TSV"
|
||||
|
||||
# We check that the value of NetworkReceiveElapsedMicroseconds correctly includes the time spent waiting data from the client.
|
||||
${CLICKHOUSE_CLIENT} --multiquery --query "SYSTEM FLUSH LOGS;
|
||||
|
@ -297,7 +297,7 @@ CREATE TABLE system.grants
|
||||
(
|
||||
`user_name` Nullable(String),
|
||||
`role_name` Nullable(String),
|
||||
`access_type` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION' = 96, 'NAMED COLLECTION ADMIN' = 97, 'SYSTEM SHUTDOWN' = 98, 'SYSTEM DROP DNS CACHE' = 99, 'SYSTEM DROP MARK CACHE' = 100, 'SYSTEM DROP UNCOMPRESSED CACHE' = 101, 'SYSTEM DROP MMAP CACHE' = 102, 'SYSTEM DROP QUERY CACHE' = 103, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 104, 'SYSTEM DROP FILESYSTEM CACHE' = 105, 'SYSTEM DROP SCHEMA CACHE' = 106, 'SYSTEM DROP S3 CLIENT CACHE' = 107, 'SYSTEM DROP CACHE' = 108, 'SYSTEM RELOAD CONFIG' = 109, 'SYSTEM RELOAD USERS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 
133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH ASYNC INSERT QUEUE' = 135, 'SYSTEM FLUSH' = 136, 'SYSTEM THREAD FUZZER' = 137, 'SYSTEM UNFREEZE' = 138, 'SYSTEM FAILPOINT' = 139, 'SYSTEM' = 140, 'dictGet' = 141, 'displaySecretsInShowAndSelect' = 142, 'addressToLine' = 143, 'addressToLineWithInlines' = 144, 'addressToSymbol' = 145, 'demangle' = 146, 'INTROSPECTION' = 147, 'FILE' = 148, 'URL' = 149, 'REMOTE' = 150, 'MONGO' = 151, 'REDIS' = 152, 'MEILISEARCH' = 153, 'MYSQL' = 154, 'POSTGRES' = 155, 'SQLITE' = 156, 'ODBC' = 157, 'JDBC' = 158, 'HDFS' = 159, 'S3' = 160, 'HIVE' = 161, 'AZURE' = 162, 'SOURCES' = 163, 'CLUSTER' = 164, 'ALL' = 165, 'NONE' = 166),
|
||||
`access_type` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION' = 96, 'NAMED COLLECTION ADMIN' = 97, 'SYSTEM SHUTDOWN' = 98, 'SYSTEM DROP DNS CACHE' = 99, 'SYSTEM DROP MARK CACHE' = 100, 'SYSTEM DROP UNCOMPRESSED CACHE' = 101, 'SYSTEM DROP MMAP CACHE' = 102, 'SYSTEM DROP QUERY CACHE' = 103, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 104, 'SYSTEM DROP FILESYSTEM CACHE' = 105, 'SYSTEM DROP SCHEMA CACHE' = 106, 'SYSTEM DROP S3 CLIENT CACHE' = 107, 'SYSTEM DROP CACHE' = 108, 'SYSTEM RELOAD CONFIG' = 109, 'SYSTEM RELOAD USERS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 
133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH ASYNC INSERT QUEUE' = 135, 'SYSTEM FLUSH' = 136, 'SYSTEM THREAD FUZZER' = 137, 'SYSTEM UNFREEZE' = 138, 'SYSTEM FAILPOINT' = 139, 'SYSTEM LISTEN' = 140, 'SYSTEM' = 141, 'dictGet' = 142, 'displaySecretsInShowAndSelect' = 143, 'addressToLine' = 144, 'addressToLineWithInlines' = 145, 'addressToSymbol' = 146, 'demangle' = 147, 'INTROSPECTION' = 148, 'FILE' = 149, 'URL' = 150, 'REMOTE' = 151, 'MONGO' = 152, 'REDIS' = 153, 'MEILISEARCH' = 154, 'MYSQL' = 155, 'POSTGRES' = 156, 'SQLITE' = 157, 'ODBC' = 158, 'JDBC' = 159, 'HDFS' = 160, 'S3' = 161, 'HIVE' = 162, 'AZURE' = 163, 'SOURCES' = 164, 'CLUSTER' = 165, 'ALL' = 166, 'NONE' = 167),
|
||||
`database` Nullable(String),
|
||||
`table` Nullable(String),
|
||||
`column` Nullable(String),
|
||||
@ -584,10 +584,10 @@ ENGINE = SystemPartsColumns
|
||||
COMMENT 'SYSTEM TABLE is built on the fly.'
|
||||
CREATE TABLE system.privileges
|
||||
(
|
||||
`privilege` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION' = 96, 'NAMED COLLECTION ADMIN' = 97, 'SYSTEM SHUTDOWN' = 98, 'SYSTEM DROP DNS CACHE' = 99, 'SYSTEM DROP MARK CACHE' = 100, 'SYSTEM DROP UNCOMPRESSED CACHE' = 101, 'SYSTEM DROP MMAP CACHE' = 102, 'SYSTEM DROP QUERY CACHE' = 103, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 104, 'SYSTEM DROP FILESYSTEM CACHE' = 105, 'SYSTEM DROP SCHEMA CACHE' = 106, 'SYSTEM DROP S3 CLIENT CACHE' = 107, 'SYSTEM DROP CACHE' = 108, 'SYSTEM RELOAD CONFIG' = 109, 'SYSTEM RELOAD USERS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 
'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH ASYNC INSERT QUEUE' = 135, 'SYSTEM FLUSH' = 136, 'SYSTEM THREAD FUZZER' = 137, 'SYSTEM UNFREEZE' = 138, 'SYSTEM FAILPOINT' = 139, 'SYSTEM' = 140, 'dictGet' = 141, 'displaySecretsInShowAndSelect' = 142, 'addressToLine' = 143, 'addressToLineWithInlines' = 144, 'addressToSymbol' = 145, 'demangle' = 146, 'INTROSPECTION' = 147, 'FILE' = 148, 'URL' = 149, 'REMOTE' = 150, 'MONGO' = 151, 'REDIS' = 152, 'MEILISEARCH' = 153, 'MYSQL' = 154, 'POSTGRES' = 155, 'SQLITE' = 156, 'ODBC' = 157, 'JDBC' = 158, 'HDFS' = 159, 'S3' = 160, 'HIVE' = 161, 'AZURE' = 162, 'SOURCES' = 163, 'CLUSTER' = 164, 'ALL' = 165, 'NONE' = 166),
|
||||
`privilege` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION' = 96, 'NAMED COLLECTION ADMIN' = 97, 'SYSTEM SHUTDOWN' = 98, 'SYSTEM DROP DNS CACHE' = 99, 'SYSTEM DROP MARK CACHE' = 100, 'SYSTEM DROP UNCOMPRESSED CACHE' = 101, 'SYSTEM DROP MMAP CACHE' = 102, 'SYSTEM DROP QUERY CACHE' = 103, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 104, 'SYSTEM DROP FILESYSTEM CACHE' = 105, 'SYSTEM DROP SCHEMA CACHE' = 106, 'SYSTEM DROP S3 CLIENT CACHE' = 107, 'SYSTEM DROP CACHE' = 108, 'SYSTEM RELOAD CONFIG' = 109, 'SYSTEM RELOAD USERS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 
'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH ASYNC INSERT QUEUE' = 135, 'SYSTEM FLUSH' = 136, 'SYSTEM THREAD FUZZER' = 137, 'SYSTEM UNFREEZE' = 138, 'SYSTEM FAILPOINT' = 139, 'SYSTEM LISTEN' = 140, 'SYSTEM' = 141, 'dictGet' = 142, 'displaySecretsInShowAndSelect' = 143, 'addressToLine' = 144, 'addressToLineWithInlines' = 145, 'addressToSymbol' = 146, 'demangle' = 147, 'INTROSPECTION' = 148, 'FILE' = 149, 'URL' = 150, 'REMOTE' = 151, 'MONGO' = 152, 'REDIS' = 153, 'MEILISEARCH' = 154, 'MYSQL' = 155, 'POSTGRES' = 156, 'SQLITE' = 157, 'ODBC' = 158, 'JDBC' = 159, 'HDFS' = 160, 'S3' = 161, 'HIVE' = 162, 'AZURE' = 163, 'SOURCES' = 164, 'CLUSTER' = 165, 'ALL' = 166, 'NONE' = 167),
|
||||
`aliases` Array(String),
|
||||
`level` Nullable(Enum8('GLOBAL' = 0, 'DATABASE' = 1, 'TABLE' = 2, 'DICTIONARY' = 3, 'VIEW' = 4, 'COLUMN' = 5, 'NAMED_COLLECTION' = 6)),
|
||||
`parent_group` Nullable(Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION' = 96, 'NAMED COLLECTION ADMIN' = 97, 'SYSTEM SHUTDOWN' = 98, 'SYSTEM DROP DNS CACHE' = 99, 'SYSTEM DROP MARK CACHE' = 100, 'SYSTEM DROP UNCOMPRESSED CACHE' = 101, 'SYSTEM DROP MMAP CACHE' = 102, 'SYSTEM DROP QUERY CACHE' = 103, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 104, 'SYSTEM DROP FILESYSTEM CACHE' = 105, 'SYSTEM DROP SCHEMA CACHE' = 106, 'SYSTEM DROP S3 CLIENT CACHE' = 107, 'SYSTEM DROP CACHE' = 108, 'SYSTEM RELOAD CONFIG' = 109, 'SYSTEM RELOAD USERS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH 
DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH ASYNC INSERT QUEUE' = 135, 'SYSTEM FLUSH' = 136, 'SYSTEM THREAD FUZZER' = 137, 'SYSTEM UNFREEZE' = 138, 'SYSTEM FAILPOINT' = 139, 'SYSTEM' = 140, 'dictGet' = 141, 'displaySecretsInShowAndSelect' = 142, 'addressToLine' = 143, 'addressToLineWithInlines' = 144, 'addressToSymbol' = 145, 'demangle' = 146, 'INTROSPECTION' = 147, 'FILE' = 148, 'URL' = 149, 'REMOTE' = 150, 'MONGO' = 151, 'REDIS' = 152, 'MEILISEARCH' = 153, 'MYSQL' = 154, 'POSTGRES' = 155, 'SQLITE' = 156, 'ODBC' = 157, 'JDBC' = 158, 'HDFS' = 159, 'S3' = 160, 'HIVE' = 161, 'AZURE' = 162, 'SOURCES' = 163, 'CLUSTER' = 164, 'ALL' = 165, 'NONE' = 166))
|
||||
`parent_group` Nullable(Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION' = 96, 'NAMED COLLECTION ADMIN' = 97, 'SYSTEM SHUTDOWN' = 98, 'SYSTEM DROP DNS CACHE' = 99, 'SYSTEM DROP MARK CACHE' = 100, 'SYSTEM DROP UNCOMPRESSED CACHE' = 101, 'SYSTEM DROP MMAP CACHE' = 102, 'SYSTEM DROP QUERY CACHE' = 103, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 104, 'SYSTEM DROP FILESYSTEM CACHE' = 105, 'SYSTEM DROP SCHEMA CACHE' = 106, 'SYSTEM DROP S3 CLIENT CACHE' = 107, 'SYSTEM DROP CACHE' = 108, 'SYSTEM RELOAD CONFIG' = 109, 'SYSTEM RELOAD USERS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH 
DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH ASYNC INSERT QUEUE' = 135, 'SYSTEM FLUSH' = 136, 'SYSTEM THREAD FUZZER' = 137, 'SYSTEM UNFREEZE' = 138, 'SYSTEM FAILPOINT' = 139, 'SYSTEM LISTEN' = 140, 'SYSTEM' = 141, 'dictGet' = 142, 'displaySecretsInShowAndSelect' = 143, 'addressToLine' = 144, 'addressToLineWithInlines' = 145, 'addressToSymbol' = 146, 'demangle' = 147, 'INTROSPECTION' = 148, 'FILE' = 149, 'URL' = 150, 'REMOTE' = 151, 'MONGO' = 152, 'REDIS' = 153, 'MEILISEARCH' = 154, 'MYSQL' = 155, 'POSTGRES' = 156, 'SQLITE' = 157, 'ODBC' = 158, 'JDBC' = 159, 'HDFS' = 160, 'S3' = 161, 'HIVE' = 162, 'AZURE' = 163, 'SOURCES' = 164, 'CLUSTER' = 165, 'ALL' = 166, 'NONE' = 167))
|
||||
)
|
||||
ENGINE = SystemPrivileges
|
||||
COMMENT 'SYSTEM TABLE is built on the fly.'
|
||||
|
@ -5,6 +5,8 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CURDIR"/../shell_config.sh
|
||||
|
||||
set -o pipefail
|
||||
|
||||
$CLICKHOUSE_LOCAL -q "select * from numbers(10) format Parquet settings output_format_parquet_compression_method='none'" | $CLICKHOUSE_LOCAL --input-format=Parquet -q "select count() from table"
|
||||
$CLICKHOUSE_LOCAL -q "select * from numbers(10) format Parquet settings output_format_parquet_compression_method='lz4'" | $CLICKHOUSE_LOCAL --input-format=Parquet -q "select count() from table"
|
||||
$CLICKHOUSE_LOCAL -q "select * from numbers(10) format Parquet settings output_format_parquet_compression_method='snappy'" | $CLICKHOUSE_LOCAL --input-format=Parquet -q "select count() from table"
|
||||
|
tests/queries/0_stateless/02676_to_decimal_string.reference (new file, 21 lines)
@ -0,0 +1,21 @@
|
||||
2.00000000000000000000000000000000000000000000000000000000000000000000000000000
|
||||
2.12
|
||||
-2.00000000000000000000000000000000000000000000000000000000000000000000000000000
|
||||
-2.12
|
||||
2.987600000000000033395508580724708735942840576171875000000000
|
||||
2.15
|
||||
-2.987600000000000033395508580724708735942840576171875000000000
|
||||
-2.15
|
||||
64.1230010986
|
||||
64.2340000000
|
||||
-64.1230010986
|
||||
-64.2340000000
|
||||
-32.345
|
||||
32.34500000000000000000000000000000000000000000000000000000000000000000000000000
|
||||
32.46
|
||||
-64.5671232345
|
||||
128.78932312332132985464
|
||||
-128.78932312332132985464
|
||||
128.78932312332132985464000000000000000000000000000000000000000000000000000000000
|
||||
128.7893231233
|
||||
-128.78932312332132985464123123789323123321329854600000000000000000000000000000000
|
tests/queries/0_stateless/02676_to_decimal_string.sql (new file, 41 lines)
@ -0,0 +1,41 @@
|
||||
-- Regular types
|
||||
SELECT toDecimalString(2, 77); -- more digits required than exist
|
||||
SELECT toDecimalString(2.123456, 2); -- rounding
|
||||
SELECT toDecimalString(-2, 77); -- more digits required than exist
|
||||
SELECT toDecimalString(-2.123456, 2); -- rounding
|
||||
|
||||
SELECT toDecimalString(2.9876, 60); -- more digits required than exist (took 60 as it is float by default)
|
||||
SELECT toDecimalString(2.1456, 2); -- rounding
|
||||
SELECT toDecimalString(-2.9876, 60); -- more digits required than exist
|
||||
SELECT toDecimalString(-2.1456, 2); -- rounding
|
||||
|
||||
-- Float32 and Float64 tests. There is no point in testing high float precision -- the result will be a mess anyway.
|
||||
SELECT toDecimalString(64.123::Float32, 10);
|
||||
SELECT toDecimalString(64.234::Float64, 10);
|
||||
SELECT toDecimalString(-64.123::Float32, 10);
|
||||
SELECT toDecimalString(-64.234::Float64, 10);
|
||||
|
||||
-- Decimals
|
||||
SELECT toDecimalString(-32.345::Decimal32(3), 3);
|
||||
SELECT toDecimalString(32.345::Decimal32(3), 77); -- more digits required than exist
|
||||
SELECT toDecimalString(32.456::Decimal32(3), 2); -- rounding
|
||||
SELECT toDecimalString('-64.5671232345'::Decimal64(10), 10);
|
||||
SELECT toDecimalString('128.78932312332132985464'::Decimal128(20), 20);
|
||||
SELECT toDecimalString('-128.78932312332132985464123123'::Decimal128(26), 20); -- rounding
|
||||
SELECT toDecimalString('128.78932312332132985464'::Decimal128(20), 77); -- more digits required than exist
|
||||
SELECT toDecimalString('128.789323123321329854641231237893231233213298546'::Decimal256(45), 10); -- rounding
|
||||
SELECT toDecimalString('-128.789323123321329854641231237893231233213298546'::Decimal256(45), 77); -- more digits required than exist
|
||||
|
||||
-- The maximum number of fractional digits is defined as 77 for Int/UInt/Decimal and 60 for Float.
-- Requesting more digits than that shall fail.
|
||||
SELECT toDecimalString('32.32'::Float32, 61); -- {serverError CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER}
|
||||
SELECT toDecimalString('64.64'::Float64, 61); -- {serverError CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER}
|
||||
SELECT toDecimalString('88'::UInt8, 78); -- {serverError CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER}
|
||||
SELECT toDecimalString('646464'::Int256, 78); -- {serverError CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER}
|
||||
SELECT toDecimalString('-128.789323123321329854641231237893231233213298546'::Decimal256(45), 78); -- {serverError CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER}
|
||||
|
||||
-- wrong types: #52407 and similar
|
||||
SELECT toDecimalString('256.256'::Decimal256(45), *); -- {serverError ILLEGAL_COLUMN}
|
||||
SELECT toDecimalString('128.128'::Decimal128(30), 'str'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT}
|
||||
SELECT toDecimalString('64.64'::Decimal64(10)); -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH}
|
||||
SELECT toDecimalString('64.64'::Decimal64(10), 3, 3); -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH}
|
tests/queries/0_stateless/02735_parquet_encoder.reference (new file, 55 lines)
@ -0,0 +1,55 @@
|
||||
u8 Nullable(UInt8)
|
||||
u16 Nullable(UInt16)
|
||||
u32 Nullable(UInt32)
|
||||
u64 Nullable(UInt64)
|
||||
i8 Nullable(Int8)
|
||||
i16 Nullable(Int16)
|
||||
i32 Nullable(Int32)
|
||||
i64 Nullable(Int64)
|
||||
date Nullable(UInt16)
|
||||
date32 Nullable(Date32)
|
||||
datetime Nullable(UInt32)
|
||||
datetime64 Nullable(DateTime64(3, \'UTC\'))
|
||||
enum8 Nullable(Int8)
|
||||
enum16 Nullable(Int16)
|
||||
float32 Nullable(Float32)
|
||||
float64 Nullable(Float64)
|
||||
str Nullable(String)
|
||||
fstr Nullable(FixedString(12))
|
||||
u128 Nullable(FixedString(16))
|
||||
u256 Nullable(FixedString(32))
|
||||
i128 Nullable(FixedString(16))
|
||||
i256 Nullable(FixedString(32))
|
||||
decimal32 Nullable(Decimal(9, 3))
|
||||
decimal64 Nullable(Decimal(18, 10))
|
||||
decimal128 Nullable(Decimal(38, 20))
|
||||
decimal256 Nullable(Decimal(76, 40))
|
||||
ipv4 Nullable(UInt32)
|
||||
ipv6 Nullable(FixedString(16))
|
||||
0
|
||||
0
|
||||
0
|
||||
0
|
||||
1 2 1
|
||||
1 2 2
|
||||
1 3 3
|
||||
1 1000000 1
|
||||
3914219105369203805
|
||||
4 1000000 1
|
||||
(1000000,0,NULL,'100','299')
|
||||
(1000000,0,NULL,'0','-1294970296')
|
||||
(1000000,0,NULL,'-2147483296','2147481000')
|
||||
(100000,900000,NULL,'100009','999999')
|
||||
[(2,0,NULL,'','[]')]
|
||||
1 1
|
||||
0 1
|
||||
16159458007063698496
|
||||
16159458007063698496
|
||||
BYTE_ARRAY String
|
||||
FIXED_LEN_BYTE_ARRAY None
|
||||
BYTE_ARRAY None
|
||||
BYTE_ARRAY None
|
||||
BYTE_ARRAY String
|
||||
never gonna
|
||||
give you
|
||||
up
|
tests/queries/0_stateless/02735_parquet_encoder.sql (new file, 168 lines)
@ -0,0 +1,168 @@
|
||||
-- Tags: no-fasttest, no-parallel
|
||||
|
||||
set output_format_parquet_use_custom_encoder = 1;
|
||||
set output_format_parquet_row_group_size = 1000;
|
||||
set output_format_parquet_data_page_size = 800;
|
||||
set output_format_parquet_batch_size = 100;
|
||||
set output_format_parquet_row_group_size_bytes = 1000000000;
|
||||
set engine_file_truncate_on_insert=1;
|
||||
|
||||
-- Write random data to parquet file, then read from it and check that it matches what we wrote.
|
||||
-- Do this for all kinds of data types: primitive, Nullable(primitive), Array(primitive),
|
||||
-- Array(Nullable(primitive)), Array(Array(primitive)), Map(primitive, primitive), etc.
|
||||
|
||||
drop table if exists basic_types_02735;
|
||||
create temporary table basic_types_02735 as select * from generateRandom('
|
||||
u8 UInt8,
|
||||
u16 UInt16,
|
||||
u32 UInt32,
|
||||
u64 UInt64,
|
||||
i8 Int8,
|
||||
i16 Int16,
|
||||
i32 Int32,
|
||||
i64 Int64,
|
||||
date Date,
|
||||
date32 Date32,
|
||||
datetime DateTime,
|
||||
datetime64 DateTime64,
|
||||
enum8 Enum8(''x'' = 1, ''y'' = 2, ''z'' = 3),
|
||||
enum16 Enum16(''xx'' = 1000, ''yy'' = 2000, ''zz'' = 3000),
|
||||
float32 Float32,
|
||||
float64 Float64,
|
||||
str String,
|
||||
fstr FixedString(12),
|
||||
u128 UInt128,
|
||||
u256 UInt256,
|
||||
i128 Int128,
|
||||
i256 Int256,
|
||||
decimal32 Decimal32(3),
|
||||
decimal64 Decimal64(10),
|
||||
decimal128 Decimal128(20),
|
||||
decimal256 Decimal256(40),
|
||||
ipv4 IPv4,
|
||||
ipv6 IPv6') limit 10101;
|
||||
insert into function file(basic_types_02735.parquet) select * from basic_types_02735;
|
||||
desc file(basic_types_02735.parquet);
|
||||
select (select sum(cityHash64(*)) from basic_types_02735) - (select sum(cityHash64(*)) from file(basic_types_02735.parquet));
|
||||
drop table basic_types_02735;
|
||||
|
||||
|
||||
drop table if exists nullables_02735;
|
||||
create temporary table nullables_02735 as select * from generateRandom('
|
||||
u16 Nullable(UInt16),
|
||||
i64 Nullable(Int64),
|
||||
datetime64 Nullable(DateTime64),
|
||||
enum8 Nullable(Enum8(''x'' = 1, ''y'' = 2, ''z'' = 3)),
|
||||
float64 Nullable(Float64),
|
||||
str Nullable(String),
|
||||
fstr Nullable(FixedString(12)),
|
||||
i256 Nullable(Int256),
|
||||
decimal256 Nullable(Decimal256(40)),
|
||||
ipv6 Nullable(IPv6)') limit 10000;
|
||||
insert into function file(nullables_02735.parquet) select * from nullables_02735;
|
||||
select (select sum(cityHash64(*)) from nullables_02735) - (select sum(cityHash64(*)) from file(nullables_02735.parquet));
|
||||
drop table nullables_02735;
|
||||
|
||||
|
||||
-- TODO: When cityHash64() fully supports Nullable: https://github.com/ClickHouse/ClickHouse/pull/48625
|
||||
-- the next two blocks can be simplified: arrays_out_02735 intermediate table is not needed,
|
||||
-- a.csv and b.csv are not needed.
|
||||
|
||||
drop table if exists arrays_02735;
|
||||
drop table if exists arrays_out_02735;
|
||||
create table arrays_02735 engine = Memory as select * from generateRandom('
|
||||
u32 Array(UInt32),
|
||||
i8 Array(Int8),
|
||||
datetime Array(DateTime),
|
||||
enum16 Array(Enum16(''xx'' = 1000, ''yy'' = 2000, ''zz'' = 3000)),
|
||||
float32 Array(Float32),
|
||||
str Array(String),
|
||||
fstr Array(FixedString(12)),
|
||||
u128 Array(UInt128),
|
||||
decimal64 Array(Decimal64(10)),
|
||||
ipv4 Array(IPv4),
|
||||
msi Map(String, Int16),
|
||||
tup Tuple(FixedString(3), Array(String), Map(Int8, Date))') limit 10000;
|
||||
insert into function file(arrays_02735.parquet) select * from arrays_02735;
|
||||
create temporary table arrays_out_02735 as arrays_02735;
|
||||
insert into arrays_out_02735 select * from file(arrays_02735.parquet);
|
||||
select (select sum(cityHash64(*)) from arrays_02735) - (select sum(cityHash64(*)) from arrays_out_02735);
|
||||
--select (select sum(cityHash64(*)) from arrays_02735) -
|
||||
-- (select sum(cityHash64(u32, i8, datetime, enum16, float32, str, fstr, arrayMap(x->reinterpret(x, 'UInt128'), u128), decimal64, ipv4, msi, tup)) from file(arrays_02735.parquet));
|
||||
drop table arrays_02735;
|
||||
drop table arrays_out_02735;
|
||||
|
||||
|
||||
drop table if exists madness_02735;
|
||||
create temporary table madness_02735 as select * from generateRandom('
|
||||
aa Array(Array(UInt32)),
|
||||
aaa Array(Array(Array(UInt32))),
|
||||
an Array(Nullable(String)),
|
||||
aan Array(Array(Nullable(FixedString(10)))),
|
||||
l LowCardinality(String),
|
||||
ln LowCardinality(Nullable(FixedString(11))),
|
||||
al Array(LowCardinality(UInt128)),
|
||||
aaln Array(Array(LowCardinality(Nullable(String)))),
|
||||
mln Map(LowCardinality(String), Nullable(Int8)),
|
||||
t Tuple(Map(FixedString(5), Tuple(Array(UInt16), Nullable(UInt16), Array(Tuple(Int8, Decimal64(10))))), Tuple(kitchen UInt64, sink String)),
|
||||
n Nested(hello UInt64, world Tuple(first String, second FixedString(1)))
|
||||
') limit 10000;
|
||||
insert into function file(madness_02735.parquet) select * from madness_02735;
|
||||
insert into function file(a.csv) select * from madness_02735 order by tuple(*);
|
||||
insert into function file(b.csv) select aa, aaa, an, aan, l, ln, arrayMap(x->reinterpret(x, 'UInt128'), al) as al_, aaln, mln, t, n.hello, n.world from file(madness_02735.parquet) order by tuple(aa, aaa, an, aan, l, ln, al_, aaln, mln, t, n.hello, n.world);
|
||||
select (select sum(cityHash64(*)) from file(a.csv, LineAsString)) - (select sum(cityHash64(*)) from file(b.csv, LineAsString));
|
||||
--select (select sum(cityHash64(*)) from madness_02735) -
|
||||
-- (select sum(cityHash64(aa, aaa, an, aan, l, ln, map(x->reinterpret(x, 'UInt128'), al), aaln, mln, t, n.hello, n.world)) from file(madness_02735.parquet));
|
||||
drop table madness_02735;
|
||||
|
||||
|
||||
-- Merging input blocks into bigger row groups.
|
||||
insert into function file(squash_02735.parquet) select '012345' union all select '543210' settings max_block_size = 1;
|
||||
select num_columns, num_rows, num_row_groups from file(squash_02735.parquet, ParquetMetadata);
|
||||
|
||||
-- Row group size limit in bytes.
|
||||
insert into function file(row_group_bytes_02735.parquet) select '012345' union all select '543210' settings max_block_size = 1, output_format_parquet_row_group_size_bytes = 5;
|
||||
select num_columns, num_rows, num_row_groups from file(row_group_bytes_02735.parquet, ParquetMetadata);

-- Row group size limit in rows.
insert into function file(tiny_row_groups_02735.parquet) select * from numbers(3) settings output_format_parquet_row_group_size = 1;
select num_columns, num_rows, num_row_groups from file(tiny_row_groups_02735.parquet, ParquetMetadata);
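-- output_format_parquet_row_group_size = 1 should yield one row group per row (3 in total).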

-- 1M unique 8-byte values should exceed dictionary_size_limit (1 MB).
insert into function file(big_column_chunk_02735.parquet) select number from numbers(1000000) settings output_format_parquet_row_group_size = 1000000;
select num_columns, num_rows, num_row_groups from file(big_column_chunk_02735.parquet, ParquetMetadata);
select sum(cityHash64(number)) from file(big_column_chunk_02735.parquet);
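-- Past the dictionary size limit the writer should switch the column chunk away from dictionary
-- encoding; the hash check verifies the data still reads back intact.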

-- Check statistics: signed vs unsigned, null count. Use enough rows to produce multiple pages.
insert into function file(statistics_02735.parquet) select 100 + number%200 as a, toUInt32(number * 3000) as u, toInt32(number * 3000) as i, if(number % 10 == 9, toString(number), null) as s from numbers(1000000) settings output_format_parquet_row_group_size = 1000000;
select num_columns, num_rows, num_row_groups from file(statistics_02735.parquet, ParquetMetadata);
select tupleElement(c, 'statistics') from file(statistics_02735.parquet, ParquetMetadata) array join tupleElement(row_groups[1], 'columns') as c;
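-- The statistics tuple holds per-column min/max and null counts (among other fields). Column i
-- overflows Int32 and wraps negative, so its min/max must be interpreted as signed while u stays
-- unsigned; s is NULL for 9 of every 10 rows, which should be reflected in its null count.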

-- Statistics string length limit (max_statistics_size).
insert into function file(long_string_02735.parquet) select toString(range(number * 2000)) from numbers(2);
select tupleElement(tupleElement(row_groups[1], 'columns'), 'statistics') from file(long_string_02735.parquet, ParquetMetadata);
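-- One of the two values is tiny and the other is several kilobytes long; values exceeding
-- max_statistics_size are expected to be omitted from (or truncated in) the min/max statistics.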

-- Compression setting.
insert into function file(compressed_02735.parquet) select concat('aaaaaaaaaaaaaaaa', toString(number)) as s from numbers(1000) settings output_format_parquet_row_group_size = 10000, output_format_parquet_compression_method='zstd';
select total_compressed_size < 10000, total_uncompressed_size > 15000 from file(compressed_02735.parquet, ParquetMetadata);
insert into function file(compressed_02735.parquet) select concat('aaaaaaaaaaaaaaaa', toString(number)) as s from numbers(1000) settings output_format_parquet_row_group_size = 10000, output_format_parquet_compression_method='none';
select total_compressed_size < 10000, total_uncompressed_size > 15000 from file(compressed_02735.parquet, ParquetMetadata);
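-- The repeated 'aaaaaaaaaaaaaaaa' prefix compresses well, so zstd should pass both checks (1, 1),
-- while 'none' keeps the file near its uncompressed size and should flip the first check to 0.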

-- Single-threaded encoding and Arrow encoder.
drop table if exists other_encoders_02735;
create temporary table other_encoders_02735 as select number, number*2 from numbers(10000);
insert into function file(single_thread_02735.parquet) select * from other_encoders_02735 settings max_threads = 1;
select sum(cityHash64(*)) from file(single_thread_02735.parquet);
insert into function file(arrow_02735.parquet) select * from other_encoders_02735 settings output_format_parquet_use_custom_encoder = 0;
select sum(cityHash64(*)) from file(arrow_02735.parquet);
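-- Both paths, single-threaded encoding and the Arrow-based encoder selected with
-- output_format_parquet_use_custom_encoder = 0, should read back to the same hash as the source table.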

-- String -> binary vs string; FixedString -> fixed-length-binary vs binary vs string.
insert into function file(strings1_02735.parquet) select 'never', toFixedString('gonna', 5) settings output_format_parquet_string_as_string = 1, output_format_parquet_fixed_string_as_fixed_byte_array = 1;
select columns.5, columns.6 from file(strings1_02735.parquet, ParquetMetadata) array join columns;
insert into function file(strings2_02735.parquet) select 'give', toFixedString('you', 3) settings output_format_parquet_string_as_string = 0, output_format_parquet_fixed_string_as_fixed_byte_array = 0;
select columns.5, columns.6 from file(strings2_02735.parquet, ParquetMetadata) array join columns;
insert into function file(strings3_02735.parquet) select toFixedString('up', 2) settings output_format_parquet_string_as_string = 1, output_format_parquet_fixed_string_as_fixed_byte_array = 0;
select columns.5, columns.6 from file(strings3_02735.parquet, ParquetMetadata) array join columns;
select * from file(strings1_02735.parquet);
select * from file(strings2_02735.parquet);
select * from file(strings3_02735.parquet);
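-- columns.5 and columns.6 pick the 5th and 6th fields of each ParquetMetadata column entry,
-- i.e. the physical and logical type. Roughly: string_as_string = 1 should produce BYTE_ARRAY with
-- a String logical type (plain binary when 0), and fixed_string_as_fixed_byte_array = 1 should
-- produce FIXED_LEN_BYTE_ARRAY instead of falling back to the binary/string representation.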
@ -0,0 +1,17 @@
-- Based on https://github.com/ClickHouse/ClickHouse/issues/52436
-- Test that inserts performed via the Buffer table engine land in the destination table.
-- { echoOn }

DROP TABLE IF EXISTS null_table;
DROP TABLE IF EXISTS null_table_buffer;
DROP TABLE IF EXISTS null_mv;
DROP VIEW IF EXISTS number_view;
CREATE TABLE null_table (number UInt64) ENGINE = Null;
CREATE VIEW number_view as SELECT * FROM numbers(10) as tb;
CREATE MATERIALIZED VIEW null_mv Engine = Log AS SELECT * FROM null_table LEFT JOIN number_view as tb USING number;
CREATE TABLE null_table_buffer (number UInt64) ENGINE = Buffer(currentDatabase(), null_table, 1, 1, 1, 100, 200, 10000, 20000);
INSERT INTO null_table_buffer VALUES (1);
SELECT sleep(3) FORMAT Null;
-- Insert above should've landed in `null_mv`
SELECT count() FROM null_mv;
1
@ -0,0 +1,19 @@
-- Based on https://github.com/ClickHouse/ClickHouse/issues/52436
-- Test that inserts performed via the Buffer table engine land in the destination table.
-- { echoOn }

DROP TABLE IF EXISTS null_table;
DROP TABLE IF EXISTS null_table_buffer;
DROP TABLE IF EXISTS null_mv;
DROP VIEW IF EXISTS number_view;

CREATE TABLE null_table (number UInt64) ENGINE = Null;
CREATE VIEW number_view as SELECT * FROM numbers(10) as tb;
CREATE MATERIALIZED VIEW null_mv Engine = Log AS SELECT * FROM null_table LEFT JOIN number_view as tb USING number;

CREATE TABLE null_table_buffer (number UInt64) ENGINE = Buffer(currentDatabase(), null_table, 1, 1, 1, 100, 200, 10000, 20000);
INSERT INTO null_table_buffer VALUES (1);
SELECT sleep(3) FORMAT Null;

-- Insert above should've landed in `null_mv`
SELECT count() FROM null_mv;
@ -0,0 +1,2 @@
3 2
3 2 3
@ -0,0 +1,16 @@
create view test_param_view as
with {param_test_val:UInt8} as param_test_val
select param_test_val,
    arrayCount((a)->(a < param_test_val), t.arr) as cnt1
from (select [1,2,3,4,5] as arr) t;

select * from test_param_view(param_test_val = 3);

create view test_param_view2 as
with {param_test_val:UInt8} as param_test_val
select param_test_val,
    arrayCount((a)->(a < param_test_val), t.arr) as cnt1,
    arrayCount((a)->(a < param_test_val+1), t.arr) as cnt2
from (select [1,2,3,4,5] as arr) t;

select * from test_param_view2(param_test_val = 3);
@ -0,0 +1 @@
0
@ -0,0 +1,7 @@

drop table if exists t1;
create table t1 as remote('localhost', 'system.one');
rename table t1 to t2;
select * from t2;
rename table t2 to t1;
drop table t1;