Merge remote-tracking branch 'origin/master' into pr-incorrect-reverse-order

2024-09-19 16:20:50 +00:00 · 2024-09-03 17:24:59 +00:00 · 2024-09-03 17:24:59 +00:00 · e94531d5b9
commit e94531d5b9
parent 5684db7edc 959e8567bb
112 changed files with 2219 additions and 558 deletions
--- a/base/poco/Net/include/Poco/Net/HTTPSession.h
+++ b/base/poco/Net/include/Poco/Net/HTTPSession.h
@ -19,6 +19,8 @@


 #include <ios>
+#include <memory>
+#include <functional>
 #include "Poco/Any.h"
 #include "Poco/Buffer.h"
 #include "Poco/Exception.h"
@ -33,6 +35,27 @@ namespace Net
 {


+    class IHTTPSessionDataHooks
+    /// Interface to control the stream of data bytes being sent or received through the socket by HTTPSession.
+    /// It allows monitoring, throttling and scheduling of data streams with syscall granularity.
+    {
+    public:
+        virtual ~IHTTPSessionDataHooks() = default;
+
+        virtual void atStart(int bytes) = 0;
+        /// Called before sending/receiving data `bytes` to/from socket.
+
+        virtual void atFinish(int bytes) = 0;
+        /// Called when sending/receiving of data `bytes` is successfully finished.
+
+        virtual void atFail() = 0;
+        /// If an error occurred during send/receive, `atFail()` is called instead of `atFinish()`.
+    };
+
+
+    using HTTPSessionDataHooksPtr = std::shared_ptr<IHTTPSessionDataHooks>;
+
+
    class Net_API HTTPSession
    /// HTTPSession implements basic HTTP session management
    /// for both HTTP clients and HTTP servers.
@ -73,6 +96,12 @@ namespace Net
        Poco::Timespan getReceiveTimeout() const;
        /// Returns receive timeout for the HTTP session.

+        void setSendDataHooks(const HTTPSessionDataHooksPtr & sendDataHooks = {});
+        /// Sets data hooks that will be called on every send to the socket.
+
+        void setReceiveDataHooks(const HTTPSessionDataHooksPtr & receiveDataHooks = {});
+        /// Sets data hooks that will be called on every receive from the socket.
+
        bool connected() const;
        /// Returns true if the underlying socket is connected.

@ -211,6 +240,10 @@ namespace Net
        Poco::Exception * _pException;
        Poco::Any _data;

+        // Data hooks
+        HTTPSessionDataHooksPtr _sendDataHooks;
+        HTTPSessionDataHooksPtr _receiveDataHooks;
+
        friend class HTTPStreamBuf;
        friend class HTTPHeaderStreamBuf;
        friend class HTTPFixedLengthStreamBuf;
@ -246,6 +279,16 @@ namespace Net
        return _receiveTimeout;
    }

+    inline void HTTPSession::setSendDataHooks(const HTTPSessionDataHooksPtr & sendDataHooks)
+    {
+        _sendDataHooks = sendDataHooks;
+    }
+
+    inline void HTTPSession::setReceiveDataHooks(const HTTPSessionDataHooksPtr & receiveDataHooks)
+    {
+        _receiveDataHooks = receiveDataHooks;
+    }
+
    inline StreamSocket & HTTPSession::socket()
    {
        return _socket;
--- a/base/poco/Net/src/HTTPSession.cpp
+++ b/base/poco/Net/src/HTTPSession.cpp
@ -128,14 +128,14 @@ int HTTPSession::get()
 {
 	if (_pCurrent == _pEnd)
 		refill();
-	
+
 	if (_pCurrent < _pEnd)
 		return *_pCurrent++;
 	else
 		return std::char_traits<char>::eof();
 }

-	
+
 int HTTPSession::peek()
 {
 	if (_pCurrent == _pEnd)
@ -147,7 +147,7 @@ int HTTPSession::peek()
 		return std::char_traits<char>::eof();
 }

-	
+
 int HTTPSession::read(char* buffer, std::streamsize length)
 {
 	if (_pCurrent < _pEnd)
@ -166,10 +166,17 @@ int HTTPSession::write(const char* buffer, std::streamsize length)
 {
 	try
 	{
-		return _socket.sendBytes(buffer, (int) length);
+		if (_sendDataHooks)
+			_sendDataHooks->atStart((int) length);
+		int result = _socket.sendBytes(buffer, (int) length);
+		if (_sendDataHooks)
+			_sendDataHooks->atFinish(result);
+		return result;
 	}
 	catch (Poco::Exception& exc)
 	{
+		if (_sendDataHooks)
+			_sendDataHooks->atFail();
 		setException(exc);
 		throw;
 	}
@ -180,10 +187,17 @@ int HTTPSession::receive(char* buffer, int length)
 {
 	try
 	{
-		return _socket.receiveBytes(buffer, length);
+		if (_receiveDataHooks)
+			_receiveDataHooks->atStart(length);
+		int result = _socket.receiveBytes(buffer, length);
+		if (_receiveDataHooks)
+			_receiveDataHooks->atFinish(result);
+		return result;
 	}
 	catch (Poco::Exception& exc)
 	{
+		if (_receiveDataHooks)
+			_receiveDataHooks->atFail();
 		setException(exc);
 		throw;
 	}
--- a/base/poco/Net/src/SocketImpl.cpp
+++ b/base/poco/Net/src/SocketImpl.cpp
@ -63,7 +63,7 @@ bool checkIsBrokenTimeout()

 SocketImpl::SocketImpl():
 	_sockfd(POCO_INVALID_SOCKET),
-	_blocking(true), 
+	_blocking(true),
 	_isBrokenTimeout(checkIsBrokenTimeout())
 {
 }
@ -82,7 +82,7 @@ SocketImpl::~SocketImpl()
 	close();
 }

-	
+
 SocketImpl* SocketImpl::acceptConnection(SocketAddress& clientAddr)
 {
 	if (_sockfd == POCO_INVALID_SOCKET) throw InvalidSocketException();
@ -118,7 +118,7 @@ void SocketImpl::connect(const SocketAddress& address)
 		rc = ::connect(_sockfd, address.addr(), address.length());
 	}
 	while (rc != 0 && lastError() == POCO_EINTR);
-	if (rc != 0) 
+	if (rc != 0)
 	{
 		int err = lastError();
 		error(err, address.toString());
@ -205,7 +205,7 @@ void SocketImpl::bind6(const SocketAddress& address, bool reuseAddress, bool reu
 #if defined(POCO_HAVE_IPv6)
 	if (address.family() != SocketAddress::IPv6)
 		throw Poco::InvalidArgumentException("SocketAddress must be an IPv6 address");
-		
+
 	if (_sockfd == POCO_INVALID_SOCKET)
 	{
 		init(address.af());
@ -226,11 +226,11 @@ void SocketImpl::bind6(const SocketAddress& address, bool reuseAddress, bool reu
 #endif
 }

-	
+
 void SocketImpl::listen(int backlog)
 {
 	if (_sockfd == POCO_INVALID_SOCKET) throw InvalidSocketException();
-	
+
 	int rc = ::listen(_sockfd, backlog);
 	if (rc != 0) error();
 }
@ -254,7 +254,7 @@ void SocketImpl::shutdownReceive()
 	if (rc != 0) error();
 }

-	
+
 void SocketImpl::shutdownSend()
 {
 	if (_sockfd == POCO_INVALID_SOCKET) throw InvalidSocketException();
@ -263,7 +263,7 @@ void SocketImpl::shutdownSend()
 	if (rc != 0) error();
 }

-	
+
 void SocketImpl::shutdown()
 {
 	if (_sockfd == POCO_INVALID_SOCKET) throw InvalidSocketException();
@ -318,7 +318,7 @@ int SocketImpl::receiveBytes(void* buffer, int length, int flags)
 				throw TimeoutException();
 		}
 	}
-	
+
 	int rc;
 	do
 	{
@ -326,7 +326,7 @@ int SocketImpl::receiveBytes(void* buffer, int length, int flags)
 		rc = ::recv(_sockfd, reinterpret_cast<char*>(buffer), length, flags);
 	}
 	while (blocking && rc < 0 && lastError() == POCO_EINTR);
-	if (rc < 0) 
+	if (rc < 0)
 	{
 		int err = lastError();
 		if ((err == POCO_EAGAIN || err == POCO_EWOULDBLOCK) && !blocking)
@ -364,7 +364,7 @@ int SocketImpl::receiveFrom(void* buffer, int length, SocketAddress& address, in
 				throw TimeoutException();
 		}
 	}
-	
+
 	sockaddr_storage abuffer;
 	struct sockaddr* pSA = reinterpret_cast<struct sockaddr*>(&abuffer);
 	poco_socklen_t saLen = sizeof(abuffer);
@ -451,7 +451,7 @@ bool SocketImpl::pollImpl(Poco::Timespan& remainingTime, int mode)
 	}
 	while (rc < 0 && lastError() == POCO_EINTR);
 	if (rc < 0) error();
-	return rc > 0; 
+	return rc > 0;

 #else

@ -494,7 +494,7 @@ bool SocketImpl::pollImpl(Poco::Timespan& remainingTime, int mode)
 	}
 	while (rc < 0 && errorCode == POCO_EINTR);
 	if (rc < 0) error(errorCode);
-	return rc > 0; 
+	return rc > 0;

 #endif // POCO_HAVE_FD_POLL
 }
@ -504,13 +504,13 @@ bool SocketImpl::poll(const Poco::Timespan& timeout, int mode)
 	Poco::Timespan remainingTime(timeout);
 	return pollImpl(remainingTime, mode);
 }
-	
+
 void SocketImpl::setSendBufferSize(int size)
 {
 	setOption(SOL_SOCKET, SO_SNDBUF, size);
 }

-	
+
 int SocketImpl::getSendBufferSize()
 {
 	int result;
@ -524,7 +524,7 @@ void SocketImpl::setReceiveBufferSize(int size)
 	setOption(SOL_SOCKET, SO_RCVBUF, size);
 }

-	
+
 int SocketImpl::getReceiveBufferSize()
 {
 	int result;
@ -570,7 +570,7 @@ Poco::Timespan SocketImpl::getReceiveTimeout()
 	return result;
 }

-	
+
 SocketAddress SocketImpl::address()
 {
 	if (_sockfd == POCO_INVALID_SOCKET) throw InvalidSocketException();
@ -581,7 +581,7 @@ SocketAddress SocketImpl::address()
 	int rc = ::getsockname(_sockfd, pSA, &saLen);
 	if (rc == 0)
 		return SocketAddress(pSA, saLen);
-	else 
+	else
 		error();
 	return SocketAddress();
 }
--- a/docker/keeper/Dockerfile
+++ b/docker/keeper/Dockerfile
@ -34,7 +34,7 @@ RUN arch=${TARGETARCH:-amd64} \
 # lts / testing / prestable / etc
 ARG REPO_CHANNEL="stable"
 ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}"
-ARG VERSION="24.8.2.3"
+ARG VERSION="24.8.3.59"
 ARG PACKAGES="clickhouse-keeper"
 ARG DIRECT_DOWNLOAD_URLS=""

--- a/docker/server/Dockerfile.alpine
+++ b/docker/server/Dockerfile.alpine
@ -32,7 +32,7 @@ RUN arch=${TARGETARCH:-amd64} \
 # lts / testing / prestable / etc
 ARG REPO_CHANNEL="stable"
 ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}"
-ARG VERSION="24.8.2.3"
+ARG VERSION="24.8.3.59"
 ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"
 ARG DIRECT_DOWNLOAD_URLS=""

--- a/docker/server/Dockerfile.ubuntu
+++ b/docker/server/Dockerfile.ubuntu
@ -28,7 +28,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list

 ARG REPO_CHANNEL="stable"
 ARG REPOSITORY="deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb ${REPO_CHANNEL} main"
-ARG VERSION="24.8.2.3"
+ARG VERSION="24.8.3.59"
 ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"

 #docker-official-library:off
--- a/docs/changelogs/v24.3.10.33-lts.md
+++ b/docs/changelogs/v24.3.10.33-lts.md
@ -0,0 +1,32 @@
+---
+sidebar_position: 1
+sidebar_label: 2024
+---
+
+# 2024 Changelog
+
+### ClickHouse release v24.3.10.33-lts (37b6502ebf0) FIXME as compared to v24.3.9.5-lts (a939270465e)
+
+#### Improvement
+* Backported in [#68870](https://github.com/ClickHouse/ClickHouse/issues/68870): Make allow_experimental_analyzer be controlled by the initiator for distributed queries. This ensures compatibility and correctness during operations in mixed version clusters. [#65777](https://github.com/ClickHouse/ClickHouse/pull/65777) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
+* Backported in [#69095](https://github.com/ClickHouse/ClickHouse/issues/69095): Support for the Spanish language in the embedded dictionaries. [#69035](https://github.com/ClickHouse/ClickHouse/pull/69035) ([Vasily Okunev](https://github.com/VOkunev)).
+
+#### Bug Fix (user-visible misbehavior in an official stable release)
+* Backported in [#68995](https://github.com/ClickHouse/ClickHouse/issues/68995): Fix the upper bound of the function `fromModifiedJulianDay`. It was supposed to be `9999-12-31` but was mistakenly set to `9999-01-01`. [#67583](https://github.com/ClickHouse/ClickHouse/pull/67583) ([PHO](https://github.com/depressed-pho)).
+* Backported in [#68844](https://github.com/ClickHouse/ClickHouse/issues/68844): Fixed crash in Parquet filtering when data types in the file substantially differ from requested types (e.g. `... FROM file('a.parquet', Parquet, 'x String')`, but the file has `x Int64`). Without this fix, use `input_format_parquet_filter_push_down = 0` as a workaround. [#68131](https://github.com/ClickHouse/ClickHouse/pull/68131) ([Michael Kolupaev](https://github.com/al13n321)).
+* Backported in [#68881](https://github.com/ClickHouse/ClickHouse/issues/68881): Fixes [#50868](https://github.com/ClickHouse/ClickHouse/issues/50868). Small DateTime64 constant values returned by a nested subquery inside a distributed query were wrongly transformed to Nulls, thus causing errors and possible incorrect query results. [#68323](https://github.com/ClickHouse/ClickHouse/pull/68323) ([Shankar](https://github.com/shiyer7474)).
+* Backported in [#69054](https://github.com/ClickHouse/ClickHouse/issues/69054): Added back virtual columns `_table` and `_database` to distributed tables. They were available until version 24.3. [#68672](https://github.com/ClickHouse/ClickHouse/pull/68672) ([Anton Popov](https://github.com/CurtizJ)).
+* Backported in [#68856](https://github.com/ClickHouse/ClickHouse/issues/68856): Fix possible error `Size of permutation (0) is less than required (...)` during Variant column permutation. [#68681](https://github.com/ClickHouse/ClickHouse/pull/68681) ([Kruglov Pavel](https://github.com/Avogar)).
+* Backported in [#69152](https://github.com/ClickHouse/ClickHouse/issues/69152): Fix possible wrong result during anyHeavy state merge. [#68950](https://github.com/ClickHouse/ClickHouse/pull/68950) ([Raúl Marín](https://github.com/Algunenano)).
+* Backported in [#69112](https://github.com/ClickHouse/ClickHouse/issues/69112): Fix logical error when we have empty async insert. [#69080](https://github.com/ClickHouse/ClickHouse/pull/69080) ([Han Fei](https://github.com/hanfei1991)).
+
+#### NO CL CATEGORY
+
+* Backported in [#68938](https://github.com/ClickHouse/ClickHouse/issues/68938):. [#68897](https://github.com/ClickHouse/ClickHouse/pull/68897) ([Alexander Gololobov](https://github.com/davenger)).
+
+#### NOT FOR CHANGELOG / INSIGNIFICANT
+
+* Backported in [#68826](https://github.com/ClickHouse/ClickHouse/issues/68826): Turn off fault injection for insert in `01396_inactive_replica_cleanup_nodes_zookeeper`. [#68715](https://github.com/ClickHouse/ClickHouse/pull/68715) ([alesapin](https://github.com/alesapin)).
+* Backported in [#68754](https://github.com/ClickHouse/ClickHouse/issues/68754): To make patch release possible from every commit on release branch, package_debug build is required and must not be skipped. [#68750](https://github.com/ClickHouse/ClickHouse/pull/68750) ([Max K.](https://github.com/maxknv)).
+* Backported in [#69044](https://github.com/ClickHouse/ClickHouse/issues/69044): Fix 01114_database_atomic flakiness. [#68930](https://github.com/ClickHouse/ClickHouse/pull/68930) ([Raúl Marín](https://github.com/Algunenano)).
+
--- a/docs/changelogs/v24.5.7.31-stable.md
+++ b/docs/changelogs/v24.5.7.31-stable.md
@ -0,0 +1,29 @@
+---
+sidebar_position: 1
+sidebar_label: 2024
+---
+
+# 2024 Changelog
+
+### ClickHouse release v24.5.7.31-stable (6c185e9aec1) FIXME as compared to v24.5.6.45-stable (bdca8604c29)
+
+#### Bug Fix (user-visible misbehavior in an official stable release)
+* Backported in [#68564](https://github.com/ClickHouse/ClickHouse/issues/68564): Fix indexHint function case found by fuzzer. [#66286](https://github.com/ClickHouse/ClickHouse/pull/66286) ([Anton Popov](https://github.com/CurtizJ)).
+* Backported in [#68996](https://github.com/ClickHouse/ClickHouse/issues/68996): Fix the upper bound of the function `fromModifiedJulianDay`. It was supposed to be `9999-12-31` but was mistakenly set to `9999-01-01`. [#67583](https://github.com/ClickHouse/ClickHouse/pull/67583) ([PHO](https://github.com/depressed-pho)).
+* Backported in [#68865](https://github.com/ClickHouse/ClickHouse/issues/68865): Fixed crash in Parquet filtering when data types in the file substantially differ from requested types (e.g. `... FROM file('a.parquet', Parquet, 'x String')`, but the file has `x Int64`). Without this fix, use `input_format_parquet_filter_push_down = 0` as a workaround. [#68131](https://github.com/ClickHouse/ClickHouse/pull/68131) ([Michael Kolupaev](https://github.com/al13n321)).
+* Backported in [#69004](https://github.com/ClickHouse/ClickHouse/issues/69004): After https://github.com/ClickHouse/ClickHouse/pull/61984 `schema_inference_make_columns_nullable=0` still can make columns `Nullable` in Parquet/Arrow formats. The change was backward incompatible and users noticed the changes in the behaviour. This PR makes `schema_inference_make_columns_nullable=0` to work as before (no Nullable columns will be inferred) and introduces new value `auto` for this setting that will make columns `Nullable` only if data has information about nullability. [#68298](https://github.com/ClickHouse/ClickHouse/pull/68298) ([Kruglov Pavel](https://github.com/Avogar)).
+* Backported in [#68882](https://github.com/ClickHouse/ClickHouse/issues/68882): Fixes [#50868](https://github.com/ClickHouse/ClickHouse/issues/50868). Small DateTime64 constant values returned by a nested subquery inside a distributed query were wrongly transformed to Nulls, thus causing errors and possible incorrect query results. [#68323](https://github.com/ClickHouse/ClickHouse/pull/68323) ([Shankar](https://github.com/shiyer7474)).
+* Backported in [#69023](https://github.com/ClickHouse/ClickHouse/issues/69023): Added back virtual columns `_table` and `_database` to distributed tables. They were available until version 24.3. [#68672](https://github.com/ClickHouse/ClickHouse/pull/68672) ([Anton Popov](https://github.com/CurtizJ)).
+* Backported in [#68858](https://github.com/ClickHouse/ClickHouse/issues/68858): Fix possible error `Size of permutation (0) is less than required (...)` during Variant column permutation. [#68681](https://github.com/ClickHouse/ClickHouse/pull/68681) ([Kruglov Pavel](https://github.com/Avogar)).
+* Backported in [#68784](https://github.com/ClickHouse/ClickHouse/issues/68784): Fix issue with materialized constant keys when hashing maps with arrays as keys in functions `sipHash(64/128)Keyed`. [#68731](https://github.com/ClickHouse/ClickHouse/pull/68731) ([Salvatore Mesoraca](https://github.com/aiven-sal)).
+* Backported in [#69154](https://github.com/ClickHouse/ClickHouse/issues/69154): Fix possible wrong result during anyHeavy state merge. [#68950](https://github.com/ClickHouse/ClickHouse/pull/68950) ([Raúl Marín](https://github.com/Algunenano)).
+
+#### NO CL CATEGORY
+
+* Backported in [#68940](https://github.com/ClickHouse/ClickHouse/issues/68940):. [#68897](https://github.com/ClickHouse/ClickHouse/pull/68897) ([Alexander Gololobov](https://github.com/davenger)).
+
+#### NOT FOR CHANGELOG / INSIGNIFICANT
+
+* Backported in [#68828](https://github.com/ClickHouse/ClickHouse/issues/68828): Turn off fault injection for insert in `01396_inactive_replica_cleanup_nodes_zookeeper`. [#68715](https://github.com/ClickHouse/ClickHouse/pull/68715) ([alesapin](https://github.com/alesapin)).
+* Backported in [#69046](https://github.com/ClickHouse/ClickHouse/issues/69046): Fix 01114_database_atomic flakiness. [#68930](https://github.com/ClickHouse/ClickHouse/pull/68930) ([Raúl Marín](https://github.com/Algunenano)).
+
--- a/docs/changelogs/v24.6.5.30-stable.md
+++ b/docs/changelogs/v24.6.5.30-stable.md
@ -0,0 +1,29 @@
+---
+sidebar_position: 1
+sidebar_label: 2024
+---
+
+# 2024 Changelog
+
+### ClickHouse release v24.6.5.30-stable (e6e196c92d6) FIXME as compared to v24.6.4.42-stable (c534bb4b4dd)
+
+#### Bug Fix (user-visible misbehavior in an official stable release)
+* Backported in [#68969](https://github.com/ClickHouse/ClickHouse/issues/68969): Fix the upper bound of the function `fromModifiedJulianDay`. It was supposed to be `9999-12-31` but was mistakenly set to `9999-01-01`. [#67583](https://github.com/ClickHouse/ClickHouse/pull/67583) ([PHO](https://github.com/depressed-pho)).
+* Backported in [#68814](https://github.com/ClickHouse/ClickHouse/issues/68814): Fixed crash in Parquet filtering when data types in the file substantially differ from requested types (e.g. `... FROM file('a.parquet', Parquet, 'x String')`, but the file has `x Int64`). Without this fix, use `input_format_parquet_filter_push_down = 0` as a workaround. [#68131](https://github.com/ClickHouse/ClickHouse/pull/68131) ([Michael Kolupaev](https://github.com/al13n321)).
+* Backported in [#69005](https://github.com/ClickHouse/ClickHouse/issues/69005): After https://github.com/ClickHouse/ClickHouse/pull/61984 `schema_inference_make_columns_nullable=0` still can make columns `Nullable` in Parquet/Arrow formats. The change was backward incompatible and users noticed the changes in the behaviour. This PR makes `schema_inference_make_columns_nullable=0` to work as before (no Nullable columns will be inferred) and introduces new value `auto` for this setting that will make columns `Nullable` only if data has information about nullability. [#68298](https://github.com/ClickHouse/ClickHouse/pull/68298) ([Kruglov Pavel](https://github.com/Avogar)).
+* Backported in [#68883](https://github.com/ClickHouse/ClickHouse/issues/68883): Fixes [#50868](https://github.com/ClickHouse/ClickHouse/issues/50868). Small DateTime64 constant values returned by a nested subquery inside a distributed query were wrongly transformed to Nulls, thus causing errors and possible incorrect query results. [#68323](https://github.com/ClickHouse/ClickHouse/pull/68323) ([Shankar](https://github.com/shiyer7474)).
+* Backported in [#69025](https://github.com/ClickHouse/ClickHouse/issues/69025): Added back virtual columns `_table` and `_database` to distributed tables. They were available until version 24.3. [#68672](https://github.com/ClickHouse/ClickHouse/pull/68672) ([Anton Popov](https://github.com/CurtizJ)).
+* Backported in [#68860](https://github.com/ClickHouse/ClickHouse/issues/68860): Fix possible error `Size of permutation (0) is less than required (...)` during Variant column permutation. [#68681](https://github.com/ClickHouse/ClickHouse/pull/68681) ([Kruglov Pavel](https://github.com/Avogar)).
+* Backported in [#68786](https://github.com/ClickHouse/ClickHouse/issues/68786): Fix issue with materialized constant keys when hashing maps with arrays as keys in functions `sipHash(64/128)Keyed`. [#68731](https://github.com/ClickHouse/ClickHouse/pull/68731) ([Salvatore Mesoraca](https://github.com/aiven-sal)).
+* Backported in [#69156](https://github.com/ClickHouse/ClickHouse/issues/69156): Fix possible wrong result during anyHeavy state merge. [#68950](https://github.com/ClickHouse/ClickHouse/pull/68950) ([Raúl Marín](https://github.com/Algunenano)).
+* Backported in [#69116](https://github.com/ClickHouse/ClickHouse/issues/69116): Fix logical error when we have empty async insert. [#69080](https://github.com/ClickHouse/ClickHouse/pull/69080) ([Han Fei](https://github.com/hanfei1991)).
+
+#### NO CL CATEGORY
+
+* Backported in [#68942](https://github.com/ClickHouse/ClickHouse/issues/68942):. [#68897](https://github.com/ClickHouse/ClickHouse/pull/68897) ([Alexander Gololobov](https://github.com/davenger)).
+
+#### NOT FOR CHANGELOG / INSIGNIFICANT
+
+* Backported in [#68830](https://github.com/ClickHouse/ClickHouse/issues/68830): Turn off fault injection for insert in `01396_inactive_replica_cleanup_nodes_zookeeper`. [#68715](https://github.com/ClickHouse/ClickHouse/pull/68715) ([alesapin](https://github.com/alesapin)).
+* Backported in [#69048](https://github.com/ClickHouse/ClickHouse/issues/69048): Fix 01114_database_atomic flakiness. [#68930](https://github.com/ClickHouse/ClickHouse/pull/68930) ([Raúl Marín](https://github.com/Algunenano)).
+
--- a/docs/changelogs/v24.8.3.59-lts.md
+++ b/docs/changelogs/v24.8.3.59-lts.md
@ -0,0 +1,50 @@
+---
+sidebar_position: 1
+sidebar_label: 2024
+---
+
+# 2024 Changelog
+
+### ClickHouse release v24.8.3.59-lts (e729b9fa40e) FIXME as compared to v24.8.2.3-lts (b54f79ed323)
+
+#### New Feature
+* Backported in [#68710](https://github.com/ClickHouse/ClickHouse/issues/68710): Query cache entries can now be dropped by tag. For example, the query cache entry created by `SELECT 1 SETTINGS use_query_cache = true, query_cache_tag = 'abc'` can now be dropped by `SYSTEM DROP QUERY CACHE TAG 'abc'` (or of course just: `SYSTEM DROP QUERY CACHE` which will clear the entire query cache). [#68477](https://github.com/ClickHouse/ClickHouse/pull/68477) ([Michał Tabaszewski](https://github.com/pinsvin00)).
+
+#### Improvement
+* Backported in [#69097](https://github.com/ClickHouse/ClickHouse/issues/69097): Support for the Spanish language in the embedded dictionaries. [#69035](https://github.com/ClickHouse/ClickHouse/pull/69035) ([Vasily Okunev](https://github.com/VOkunev)).
+
+#### Bug Fix (user-visible misbehavior in an official stable release)
+* Backported in [#68973](https://github.com/ClickHouse/ClickHouse/issues/68973): Fix the upper bound of the function `fromModifiedJulianDay`. It was supposed to be `9999-12-31` but was mistakenly set to `9999-01-01`. [#67583](https://github.com/ClickHouse/ClickHouse/pull/67583) ([PHO](https://github.com/depressed-pho)).
+* Backported in [#68818](https://github.com/ClickHouse/ClickHouse/issues/68818): Fixed crash in Parquet filtering when data types in the file substantially differ from requested types (e.g. `... FROM file('a.parquet', Parquet, 'x String')`, but the file has `x Int64`). Without this fix, use `input_format_parquet_filter_push_down = 0` as a workaround. [#68131](https://github.com/ClickHouse/ClickHouse/pull/68131) ([Michael Kolupaev](https://github.com/al13n321)).
+* Backported in [#68893](https://github.com/ClickHouse/ClickHouse/issues/68893): After https://github.com/ClickHouse/ClickHouse/pull/61984 `schema_inference_make_columns_nullable=0` still can make columns `Nullable` in Parquet/Arrow formats. The change was backward incompatible and users noticed the changes in the behaviour. This PR makes `schema_inference_make_columns_nullable=0` to work as before (no Nullable columns will be inferred) and introduces new value `auto` for this setting that will make columns `Nullable` only if data has information about nullability. [#68298](https://github.com/ClickHouse/ClickHouse/pull/68298) ([Kruglov Pavel](https://github.com/Avogar)).
+* Backported in [#68721](https://github.com/ClickHouse/ClickHouse/issues/68721): Fixes [#50868](https://github.com/ClickHouse/ClickHouse/issues/50868). Small DateTime64 constant values returned by a nested subquery inside a distributed query were wrongly transformed to Nulls, thus causing errors and possible incorrect query results. [#68323](https://github.com/ClickHouse/ClickHouse/pull/68323) ([Shankar](https://github.com/shiyer7474)).
+* Backported in [#69029](https://github.com/ClickHouse/ClickHouse/issues/69029): Added back virtual columns `_table` and `_database` to distributed tables. They were available until version 24.3. [#68672](https://github.com/ClickHouse/ClickHouse/pull/68672) ([Anton Popov](https://github.com/CurtizJ)).
+* Backported in [#68864](https://github.com/ClickHouse/ClickHouse/issues/68864): Fix possible error `Size of permutation (0) is less than required (...)` during Variant column permutation. [#68681](https://github.com/ClickHouse/ClickHouse/pull/68681) ([Kruglov Pavel](https://github.com/Avogar)).
+* Backported in [#68854](https://github.com/ClickHouse/ClickHouse/issues/68854): Fix possible error `DB::Exception: Block structure mismatch in joined block stream: different columns:` with new JSON column. [#68686](https://github.com/ClickHouse/ClickHouse/pull/68686) ([Kruglov Pavel](https://github.com/Avogar)).
+* Backported in [#68790](https://github.com/ClickHouse/ClickHouse/issues/68790): Fix issue with materialized constant keys when hashing maps with arrays as keys in functions `sipHash(64/128)Keyed`. [#68731](https://github.com/ClickHouse/ClickHouse/pull/68731) ([Salvatore Mesoraca](https://github.com/aiven-sal)).
+* Backported in [#69108](https://github.com/ClickHouse/ClickHouse/issues/69108): TODO. [#68744](https://github.com/ClickHouse/ClickHouse/pull/68744) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
+* Backported in [#68850](https://github.com/ClickHouse/ClickHouse/issues/68850): Fix resolving dynamic subcolumns from subqueries in analyzer. [#68824](https://github.com/ClickHouse/ClickHouse/pull/68824) ([Kruglov Pavel](https://github.com/Avogar)).
+* Backported in [#68911](https://github.com/ClickHouse/ClickHouse/issues/68911): Fix complex types metadata parsing in DeltaLake. Closes [#68739](https://github.com/ClickHouse/ClickHouse/issues/68739). [#68836](https://github.com/ClickHouse/ClickHouse/pull/68836) ([Kseniia Sumarokova](https://github.com/kssenii)).
+* Backported in [#69160](https://github.com/ClickHouse/ClickHouse/issues/69160): Fix possible wrong result during anyHeavy state merge. [#68950](https://github.com/ClickHouse/ClickHouse/pull/68950) ([Raúl Marín](https://github.com/Algunenano)).
+* Backported in [#69072](https://github.com/ClickHouse/ClickHouse/issues/69072): Fixed writing to Materialized Views with enabled setting `optimize_functions_to_subcolumns`. [#68951](https://github.com/ClickHouse/ClickHouse/pull/68951) ([Anton Popov](https://github.com/CurtizJ)).
+* Backported in [#69016](https://github.com/ClickHouse/ClickHouse/issues/69016): Don't use serializations cache in const Dynamic column methods. It could lead to use-of-uninitialized values or even a race condition during aggregations. [#68953](https://github.com/ClickHouse/ClickHouse/pull/68953) ([Kruglov Pavel](https://github.com/Avogar)).
+* Backported in [#69120](https://github.com/ClickHouse/ClickHouse/issues/69120): Fix logical error when we have empty async insert. [#69080](https://github.com/ClickHouse/ClickHouse/pull/69080) ([Han Fei](https://github.com/hanfei1991)).
+
+#### NO CL CATEGORY
+
+* Backported in [#68947](https://github.com/ClickHouse/ClickHouse/issues/68947):. [#68897](https://github.com/ClickHouse/ClickHouse/pull/68897) ([Alexander Gololobov](https://github.com/davenger)).
+
+#### NOT FOR CHANGELOG / INSIGNIFICANT
+
+* Backported in [#68704](https://github.com/ClickHouse/ClickHouse/issues/68704): Fix enumerating dynamic subcolumns. [#68582](https://github.com/ClickHouse/ClickHouse/pull/68582) ([Kruglov Pavel](https://github.com/Avogar)).
+* Backported in [#69000](https://github.com/ClickHouse/ClickHouse/issues/69000): Prioritizing of virtual columns in hive partitioning. [#68606](https://github.com/ClickHouse/ClickHouse/pull/68606) ([Yarik Briukhovetskyi](https://github.com/yariks5s)).
+* Backported in [#68799](https://github.com/ClickHouse/ClickHouse/issues/68799): CI: Disable SQLLogic job. [#68654](https://github.com/ClickHouse/ClickHouse/pull/68654) ([Max K.](https://github.com/maxknv)).
+* Backported in [#68834](https://github.com/ClickHouse/ClickHouse/issues/68834): Turn off fault injection for insert in `01396_inactive_replica_cleanup_nodes_zookeeper`. [#68715](https://github.com/ClickHouse/ClickHouse/pull/68715) ([alesapin](https://github.com/alesapin)).
+* Backported in [#68781](https://github.com/ClickHouse/ClickHouse/issues/68781): Fix flaky test 00989_parallel_parts_loading. [#68737](https://github.com/ClickHouse/ClickHouse/pull/68737) ([alesapin](https://github.com/alesapin)).
+* Backported in [#68762](https://github.com/ClickHouse/ClickHouse/issues/68762): To make patch release possible from every commit on release branch, package_debug build is required and must not be skipped. [#68750](https://github.com/ClickHouse/ClickHouse/pull/68750) ([Max K.](https://github.com/maxknv)).
+* Backported in [#68810](https://github.com/ClickHouse/ClickHouse/issues/68810): Try to disable rerun check if job triggered manually. [#68751](https://github.com/ClickHouse/ClickHouse/pull/68751) ([Max K.](https://github.com/maxknv)).
+* Backported in [#68962](https://github.com/ClickHouse/ClickHouse/issues/68962): Fix 2477 timeout. [#68752](https://github.com/ClickHouse/ClickHouse/pull/68752) ([jsc0218](https://github.com/jsc0218)).
+* Backported in [#68977](https://github.com/ClickHouse/ClickHouse/issues/68977): Check setting use_json_alias_for_old_object_type in runtime. [#68793](https://github.com/ClickHouse/ClickHouse/pull/68793) ([Kruglov Pavel](https://github.com/Avogar)).
+* Backported in [#68852](https://github.com/ClickHouse/ClickHouse/issues/68852): Make dynamic structure selection more consistent. [#68802](https://github.com/ClickHouse/ClickHouse/pull/68802) ([Kruglov Pavel](https://github.com/Avogar)).
+* Backported in [#69052](https://github.com/ClickHouse/ClickHouse/issues/69052): Fix 01114_database_atomic flakiness. [#68930](https://github.com/ClickHouse/ClickHouse/pull/68930) ([Raúl Marín](https://github.com/Algunenano)).
+
--- a/docs/en/operations/storing-data.md
+++ b/docs/en/operations/storing-data.md
@ -499,7 +499,7 @@ Required parameters:
 - `type` — `encrypted`. Otherwise the encrypted disk is not created.
 - `disk` — Type of disk for data storage.
 - `key` — The key for encryption and decryption. Type: [Uint64](/docs/en/sql-reference/data-types/int-uint.md). You can use `key_hex` parameter to encode the key in hexadecimal form.
-    You can specify multiple keys using the `id` attribute (see example above).
+    You can specify multiple keys using the `id` attribute (see example below).

 Optional parameters:

--- a/docs/en/sql-reference/transactions.md
+++ b/docs/en/sql-reference/transactions.md
@ -8,14 +8,14 @@ slug: /en/guides/developer/transactional
 This is transactional (ACID) if the inserted rows are packed and inserted as a single block (see Notes):
 - Atomic: an INSERT succeeds or is rejected as a whole: if a confirmation is sent to the client, then all rows were inserted; if an error is sent to the client, then no rows were inserted.
 - Consistent: if there are no table constraints violated, then all rows in an INSERT are inserted and the INSERT succeeds; if constraints are violated, then no rows are inserted.
- Isolated: concurrent clients observe a consistent snapshot of the table–the state of the table either as it was before the INSERT attempt, or after the successful INSERT; no partial state is seen
+- Isolated: concurrent clients observe a consistent snapshot of the table: the state of the table either as it was before the INSERT attempt, or after the successful INSERT; no partial state is seen. Clients inside of another transaction have [snapshot isolation](https://en.wikipedia.org/wiki/Snapshot_isolation), while clients outside of a transaction have [read uncommitted](https://en.wikipedia.org/wiki/Isolation_(database_systems)#Read_uncommitted) isolation level.
 - Durable: a successful INSERT is written to the filesystem before answering to the client, on a single replica or multiple replicas (controlled by the `insert_quorum` setting), and ClickHouse can ask the OS to sync the filesystem data on the storage media (controlled by the `fsync_after_insert` setting).
 - INSERT into multiple tables with one statement is possible if materialized views are involved (the INSERT from the client is to a table which has associate materialized views).

 ## Case 2: INSERT into multiple partitions, of one table, of the MergeTree* family

 Same as Case 1 above, with this detail:
- If table has many partitions and INSERT covers many partitions–then insertion into every partition is transactional on its own
+- If table has many partitions and INSERT covers many partitions, then insertion into every partition is transactional on its own


 ## Case 3: INSERT into one distributed table of the MergeTree* family
@ -38,7 +38,7 @@ Same as Case 1 above, with this detail:
  - the insert format is column-based (like Native, Parquet, ORC, etc) and the data contains only one block of data
 - the size of the inserted block in general may depend on many settings (for example: `max_block_size`, `max_insert_block_size`, `min_insert_block_size_rows`, `min_insert_block_size_bytes`, `preferred_block_size_bytes`, etc)
 - if the client did not receive an answer from the server, the client does not know if the transaction succeeded, and it can repeat the transaction, using exactly-once insertion properties
- ClickHouse is using MVCC with snapshot isolation internally
+- ClickHouse is using [MVCC](https://en.wikipedia.org/wiki/Multiversion_concurrency_control) with [snapshot isolation](https://en.wikipedia.org/wiki/Snapshot_isolation) internally for concurrent transactions
 - all ACID properties are valid even in the case of server kill/crash
 - either insert_quorum into different AZ or fsync should be enabled to ensure durable inserts in the typical setup
 - "consistency" in ACID terms does not cover the semantics of distributed systems, see https://jepsen.io/consistency which is controlled by different settings (select_sequential_consistency)
@ -260,7 +260,7 @@ FROM mergetree_table
 ### Transactions introspection

 You can inspect transactions by querying the `system.transactions` table, but note that you cannot query that
-table from a session that is in a transaction–open a second `clickhouse client` session to query that table.
+table from a session that is in a transaction. Open a second `clickhouse client` session to query that table.

 ```sql
 SELECT *
--- a/src/AggregateFunctions/AggregateFunctionAnyHeavy.cpp
+++ b/src/AggregateFunctions/AggregateFunctionAnyHeavy.cpp
@ -68,7 +68,10 @@ public:
        if (data().isEqualTo(to.data()))
            counter += to.counter;
        else if (!data().has() || counter < to.counter)
+        {
            data().set(to.data(), arena);
+            counter = to.counter - counter;
+        }
        else
            counter -= to.counter;
    }
--- a/src/Client/ClientBase.cpp
+++ b/src/Client/ClientBase.cpp
@ -2699,14 +2699,6 @@ bool ClientBase::processMultiQueryFromFile(const String & file_name)
    ReadBufferFromFile in(file_name);
    readStringUntilEOF(queries_from_file, in);

-    if (!getClientConfiguration().has("log_comment"))
-    {
-        Settings settings = client_context->getSettingsCopy();
-        /// NOTE: cannot use even weakly_canonical() since it fails for /dev/stdin due to resolving of "pipe:[X]"
-        settings.log_comment = fs::absolute(fs::path(file_name));
-        client_context->setSettings(settings);
-    }
-
    return executeMultiQuery(queries_from_file);
 }

--- a/src/Common/CurrentMetrics.cpp
+++ b/src/Common/CurrentMetrics.cpp
@ -75,9 +75,9 @@
    M(GlobalThread, "Number of threads in global thread pool.") \
    M(GlobalThreadActive, "Number of threads in global thread pool running a task.") \
    M(GlobalThreadScheduled, "Number of queued or active jobs in global thread pool.") \
-    M(LocalThread, "Number of threads in local thread pools. The threads in local thread pools are taken from the global thread pool.") \
-    M(LocalThreadActive, "Number of threads in local thread pools running a task.") \
-    M(LocalThreadScheduled, "Number of queued or active jobs in local thread pools.") \
+    M(LocalThread, "Obsolete. Number of threads in local thread pools. The threads in local thread pools are taken from the global thread pool.") \
+    M(LocalThreadActive, "Obsolete. Number of threads in local thread pools running a task.") \
+    M(LocalThreadScheduled, "Obsolete. Number of queued or active jobs in local thread pools.") \
    M(MergeTreeDataSelectExecutorThreads, "Number of threads in the MergeTreeDataSelectExecutor thread pool.") \
    M(MergeTreeDataSelectExecutorThreadsActive, "Number of threads in the MergeTreeDataSelectExecutor thread pool running a task.") \
    M(MergeTreeDataSelectExecutorThreadsScheduled, "Number of queued or active jobs in the MergeTreeDataSelectExecutor thread pool.") \
@ -292,6 +292,9 @@
    M(DistrCacheWriteRequests, "Number of executed Write requests to Distributed Cache") \
    M(DistrCacheServerConnections, "Number of open connections to ClickHouse server from Distributed Cache") \
    \
+    M(SchedulerIOReadScheduled, "Number of IO reads are being scheduled currently") \
+    M(SchedulerIOWriteScheduled, "Number of IO writes are being scheduled currently") \
+    \
    M(StorageConnectionsStored, "Total count of sessions stored in the session pool for storages") \
    M(StorageConnectionsTotal, "Total count of all sessions: stored in the pool and actively used right now for storages") \
    \
--- a/src/Common/CurrentThread.cpp
+++ b/src/Common/CurrentThread.cpp
@ -113,6 +113,56 @@ std::string_view CurrentThread::getQueryId()
    return current_thread->getQueryId();
 }

+/// Stores `link` as the read resource link of the current thread.
+/// Does nothing when `current_thread` is not set; throws LOGICAL_ERROR if a read link is already attached.
+void CurrentThread::attachReadResource(ResourceLink link)
+{
+    if (unlikely(!current_thread))
+        return;
+
+    auto & attached_link = current_thread->read_resource_link;
+    if (attached_link)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Thread #{} has been already attached to read resource", std::to_string(getThreadId()));
+
+    attached_link = link;
+}
+
+/// Clears the read resource link of the current thread.
+/// Does nothing when `current_thread` is not set; throws LOGICAL_ERROR if no read link is attached.
+void CurrentThread::detachReadResource()
+{
+    if (unlikely(!current_thread))
+        return;
+
+    auto & attached_link = current_thread->read_resource_link;
+    if (!attached_link)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Thread #{} has not been attached to read resource", std::to_string(getThreadId()));
+
+    attached_link.reset();
+}
+
+/// Returns the read resource link of the current thread, or an empty link when `current_thread` is not set.
+ResourceLink CurrentThread::getReadResourceLink()
+{
+    return unlikely(!current_thread) ? ResourceLink{} : current_thread->read_resource_link;
+}
+
+/// Stores `link` as the write resource link of the current thread.
+/// Does nothing when `current_thread` is not set; throws LOGICAL_ERROR if a write link is already attached.
+void CurrentThread::attachWriteResource(ResourceLink link)
+{
+    if (unlikely(!current_thread))
+        return;
+
+    auto & attached_link = current_thread->write_resource_link;
+    if (attached_link)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Thread #{} has been already attached to write resource", std::to_string(getThreadId()));
+
+    attached_link = link;
+}
+
+/// Clears the write resource link of the current thread.
+/// Does nothing when `current_thread` is not set; throws LOGICAL_ERROR if no write link is attached.
+void CurrentThread::detachWriteResource()
+{
+    if (unlikely(!current_thread))
+        return;
+
+    auto & attached_link = current_thread->write_resource_link;
+    if (!attached_link)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Thread #{} has not been attached to write resource", std::to_string(getThreadId()));
+
+    attached_link.reset();
+}
+
+/// Returns the write resource link of the current thread, or an empty link when `current_thread` is not set.
+ResourceLink CurrentThread::getWriteResourceLink()
+{
+    return unlikely(!current_thread) ? ResourceLink{} : current_thread->write_resource_link;
+}
+
 MemoryTracker * CurrentThread::getUserMemoryTracker()
 {
    if (unlikely(!current_thread))
--- a/src/Common/CurrentThread.h
+++ b/src/Common/CurrentThread.h
@ -2,6 +2,7 @@

 #include <Interpreters/Context_fwd.h>
 #include <Common/ThreadStatus.h>
+#include <Common/Scheduler/ResourceLink.h>

 #include <memory>
 #include <string>
@ -23,7 +24,6 @@ class QueryStatus;
 struct Progress;
 class InternalTextLogsQueue;

-
 /** Collection of static methods to work with thread-local objects.
  * Allows to attach and detach query/process (thread group) to a thread
  * (to calculate query-related metrics and to allow to obtain query-related data from a thread).
@ -92,6 +92,14 @@ public:

    static std::string_view getQueryId();

+    // For IO Scheduling
+    static void attachReadResource(ResourceLink link);
+    static void detachReadResource();
+    static ResourceLink getReadResourceLink();
+    static void attachWriteResource(ResourceLink link);
+    static void detachWriteResource();
+    static ResourceLink getWriteResourceLink();
+
    /// Initializes query with current thread as master thread in constructor, and detaches it in destructor
    struct QueryScope : private boost::noncopyable
    {
@ -102,6 +110,39 @@ public:
        void logPeakMemoryUsage();
        bool log_peak_memory_usage_in_destructor = true;
    };
+
+    /// Scoped attach/detach of IO resource links.
+    /// The constructor attaches every non-empty link to the current thread; the destructor detaches
+    /// exactly those links that this scope has attached.
+    struct IOScope : private boost::noncopyable
+    {
+        /// Attaches `read_resource_link` and `write_resource_link` (each only if non-empty).
+        /// If attaching the write link throws, the read link attached by this constructor is detached again.
+        explicit IOScope(ResourceLink read_resource_link, ResourceLink write_resource_link)
+        {
+            if (read_resource_link)
+            {
+                attachReadResource(read_resource_link);
+                read_attached = true;
+            }
+            if (write_resource_link)
+            {
+                try
+                {
+                    attachWriteResource(write_resource_link);
+                }
+                catch (...)
+                {
+                    /// The destructor does not run for an object whose constructor throws,
+                    /// so undo the read attachment here instead of leaving it on the thread.
+                    if (read_attached)
+                        detachReadResource();
+                    throw;
+                }
+                write_attached = true;
+            }
+        }
+
+        /// Convenience overload taking both links from `settings`.
+        explicit IOScope(const IOSchedulingSettings & settings)
+            : IOScope(settings.read_resource_link, settings.write_resource_link)
+        {}
+
+        ~IOScope()
+        {
+            if (read_attached)
+                detachReadResource();
+            if (write_attached)
+                detachWriteResource();
+        }
+
+        bool read_attached = false;  /// True if the constructor attached the read link
+        bool write_attached = false; /// True if the constructor attached the write link
+    };
 };

 }
--- a/src/Common/HTTPConnectionPool.cpp
+++ b/src/Common/HTTPConnectionPool.cpp
@ -2,6 +2,7 @@
 #include <Common/HostResolvePool.h>

 #include <Common/ProfileEvents.h>
+#include <Common/Stopwatch.h>
 #include <Common/CurrentMetrics.h>
 #include <Common/logger_useful.h>
 #include <Common/Exception.h>
@ -9,6 +10,7 @@
 #include <Common/ProxyConfiguration.h>
 #include <Common/MemoryTrackerSwitcher.h>
 #include <Common/SipHash.h>
+#include <Common/Scheduler/ResourceGuard.h>
 #include <Common/proxyConfigurationToPocoProxyConfig.h>

 #include <Poco/Net/HTTPChunkedStream.h>
@ -236,6 +238,59 @@ public:
 };


+// Session data hooks implementation for integration with resource scheduler.
+// Hooks are created per every request-response pair and are registered/unregistered in HTTP session.
+// * `atStart()` sends a resource request to the scheduler every time the HTTP session is going to send or receive
+//   data to/from the socket. It then waits for the scheduler confirmation. This way the scheduler might
+//   throttle and/or schedule socket data streams.
+// * `atFinish()` hook is called on successful socket read/write operation.
+//   It informs the scheduler that operation is complete, which allows the scheduler to control the total
+//   amount of in-flight bytes and/or operations.
+// * `atFail()` hook is called on failure of socket operation. The purpose is to correct the amount of bytes
+//   passed through the scheduler queue to ensure fair bandwidth allocation even in presence of errors.
+// Forwards HTTP session data hooks to a single `ResourceGuard::Request` bound to `link`.
+struct ResourceGuardSessionDataHooks : public Poco::Net::IHTTPSessionDataHooks
+{
+    // `method` and `uri` are only used to build the request description printed in the slow-request log message.
+    ResourceGuardSessionDataHooks(ResourceLink link_, const ResourceGuard::Metrics * metrics, LoggerPtr log_, const String & method, const String & uri)
+        : link(link_)
+        , log(log_)
+        , http_request(method + " " + uri)
+    {
+        chassert(link);
+        request.metrics = metrics;
+    }
+
+    ~ResourceGuardSessionDataHooks() override
+    {
+        // Never destruct with an active request
+        request.assertFinished();
+    }
+
+    // Enqueues a request for `bytes` and blocks in `request.wait()`; logs if that took 5000 ms or more.
+    void atStart(int bytes) override
+    {
+        Stopwatch watch;
+        request.enqueue(bytes, link);
+        request.wait();
+        watch.stop();
+
+        const auto elapsed_ms = watch.elapsedMilliseconds();
+        if (elapsed_ms >= 5000)
+            LOG_INFO(log, "Resource request took too long to finish: {} ms for {}", elapsed_ms, http_request);
+    }
+
+    // Finishes the request reporting `bytes` as the real cost.
+    void atFinish(int bytes) override
+    {
+        request.finish(bytes, link);
+    }
+
+    // Finishes the request reporting zero real cost.
+    void atFail() override
+    {
+        request.finish(0, link);
+    }
+
+    ResourceLink link;
+    ResourceGuard::Request request;
+    LoggerPtr log;
+    String http_request; // "<method> <uri>", used in log messages
+};
+
+
 // EndpointConnectionPool manage connections to the endpoint
 // Features:
 // - it uses HostResolver for address selecting. See Common/HostResolver.h for more info.
@ -246,8 +301,6 @@ public:
 // - `Session::reconnect()` uses the pool as well
 // - comprehensive sensors
 // - session is reused according its inner state, automatically
-
-
 template <class Session>
 class EndpointConnectionPool : public std::enable_shared_from_this<EndpointConnectionPool<Session>>, public IExtendedPool
 {
@ -337,6 +390,13 @@ private:
        std::ostream & sendRequest(Poco::Net::HTTPRequest & request) override
        {
            auto idle = idleTime();
+
+            // Set data hooks for IO scheduling
+            if (ResourceLink link = CurrentThread::getReadResourceLink())
+                Session::setReceiveDataHooks(std::make_shared<ResourceGuardSessionDataHooks>(link, ResourceGuard::Metrics::getIORead(), log, request.getMethod(), request.getURI()));
+            if (ResourceLink link = CurrentThread::getWriteResourceLink())
+                Session::setSendDataHooks(std::make_shared<ResourceGuardSessionDataHooks>(link, ResourceGuard::Metrics::getIOWrite(), log, request.getMethod(), request.getURI()));
+
            std::ostream & result = Session::sendRequest(request);
            result.exceptions(std::ios::badbit);

@ -393,6 +453,8 @@ private:
                }
            }
            response_stream = nullptr;
+            Session::setSendDataHooks();
+            Session::setReceiveDataHooks();

            group->atConnectionDestroy();

--- a/src/Common/ProfileEvents.cpp
+++ b/src/Common/ProfileEvents.cpp
@ -86,6 +86,20 @@
    M(NetworkReceiveBytes, "Total number of bytes received from network. Only ClickHouse-related network interaction is included, not by 3rd party libraries.") \
    M(NetworkSendBytes, "Total number of bytes send to network. Only ClickHouse-related network interaction is included, not by 3rd party libraries.") \
    \
+    M(GlobalThreadPoolExpansions, "Counts the total number of times new threads have been added to the global thread pool. This metric indicates the frequency of expansions in the global thread pool to accommodate increased processing demands.") \
+    M(GlobalThreadPoolShrinks, "Counts the total number of times the global thread pool has shrunk by removing threads. This occurs when the number of idle threads exceeds max_thread_pool_free_size, indicating adjustments in the global thread pool size in response to decreased thread utilization.") \
+    M(GlobalThreadPoolThreadCreationMicroseconds, "Total time spent waiting for new threads to start.") \
+    M(GlobalThreadPoolLockWaitMicroseconds, "Total time threads have spent waiting for locks in the global thread pool.") \
+    M(GlobalThreadPoolJobs, "Counts the number of jobs that have been pushed to the global thread pool.") \
+    M(GlobalThreadPoolJobWaitTimeMicroseconds, "Measures the elapsed time from when a job is scheduled in the thread pool to when it is picked up for execution by a worker thread. This metric helps identify delays in job processing, indicating the responsiveness of the thread pool to new tasks.") \
+    M(LocalThreadPoolExpansions, "Counts the total number of times threads have been borrowed from the global thread pool to expand local thread pools.") \
+    M(LocalThreadPoolShrinks, "Counts the total number of times threads have been returned to the global thread pool from local thread pools.") \
+    M(LocalThreadPoolThreadCreationMicroseconds, "Total time local thread pools have spent waiting to borrow a thread from the global pool.") \
+    M(LocalThreadPoolLockWaitMicroseconds, "Total time threads have spent waiting for locks in the local thread pools.") \
+    M(LocalThreadPoolJobs, "Counts the number of jobs that have been pushed to the local thread pools.") \
+    M(LocalThreadPoolBusyMicroseconds, "Total time threads have spent executing the actual work.") \
+    M(LocalThreadPoolJobWaitTimeMicroseconds, "Measures the elapsed time from when a job is scheduled in the thread pool to when it is picked up for execution by a worker thread. This metric helps identify delays in job processing, indicating the responsiveness of the thread pool to new tasks.") \
+    \
    M(DiskS3GetRequestThrottlerCount, "Number of DiskS3 GET and SELECT requests passed through throttler.") \
    M(DiskS3GetRequestThrottlerSleepMicroseconds, "Total time a query was sleeping to conform DiskS3 GET and SELECT request throttling.") \
    M(DiskS3PutRequestThrottlerCount, "Number of DiskS3 PUT, COPY, POST and LIST requests passed through throttler.") \
@ -106,6 +120,13 @@
    M(PartsWithAppliedMutationsOnFly, "Total number of parts for which there was any mutation applied on fly") \
    M(MutationsAppliedOnFlyInAllParts, "The sum of number of applied mutations on-fly for part among all read parts") \
    \
+    M(SchedulerIOReadRequests, "Resource requests passed through scheduler for IO reads.") \
+    M(SchedulerIOReadBytes, "Bytes passed through scheduler for IO reads.") \
+    M(SchedulerIOReadWaitMicroseconds, "Total time a query was waiting on resource requests for IO reads.") \
+    M(SchedulerIOWriteRequests, "Resource requests passed through scheduler for IO writes.") \
+    M(SchedulerIOWriteBytes, "Bytes passed through scheduler for IO writes.") \
+    M(SchedulerIOWriteWaitMicroseconds, "Total time a query was waiting on resource requests for IO writes.") \
+    \
    M(QueryMaskingRulesMatch, "Number of times query masking rules was successfully matched.") \
    \
    M(ReplicatedPartFetches, "Number of times a data part was downloaded from replica of a ReplicatedMergeTree table.") \
--- a/src/Common/Scheduler/ISchedulerQueue.h
+++ b/src/Common/Scheduler/ISchedulerQueue.h
@ -22,10 +22,13 @@ public:
    {}

    // Wrapper for `enqueueRequest()` that should be used to account for available resource budget
-    void enqueueRequestUsingBudget(ResourceRequest * request)
+    // Returns `estimated_cost` that should be passed later to `adjustBudget()`
+    [[ nodiscard ]] ResourceCost enqueueRequestUsingBudget(ResourceRequest * request)
    {
-        request->cost = budget.ask(request->cost);
+        ResourceCost estimated_cost = request->cost;
+        request->cost = budget.ask(estimated_cost);
        enqueueRequest(request);
+        return estimated_cost;
    }

    // Should be called to account for difference between real and estimated costs
@ -34,18 +37,6 @@ public:
        budget.adjust(estimated_cost, real_cost);
    }

-    // Adjust budget to account for extra consumption of `cost` resource units
-    void consumeBudget(ResourceCost cost)
-    {
-        adjustBudget(0, cost);
-    }
-
-    // Adjust budget to account for requested, but not consumed `cost` resource units
-    void accumulateBudget(ResourceCost cost)
-    {
-        adjustBudget(cost, 0);
-    }
-
    /// Enqueue new request to be executed using underlying resource.
    /// Should be called outside of scheduling subsystem, implementation must be thread-safe.
    virtual void enqueueRequest(ResourceRequest * request) = 0;
--- a/src/Common/Scheduler/Nodes/tests/ResourceTest.h
+++ b/src/Common/Scheduler/Nodes/tests/ResourceTest.h
@ -232,12 +232,13 @@ struct ResourceTestManager : public ResourceTestBase
        ResourceTestManager & t;

        Guard(ResourceTestManager & t_, ResourceLink link_, ResourceCost cost)
-            : ResourceGuard(link_, cost, PostponeLocking)
+            : ResourceGuard(ResourceGuard::Metrics::getIOWrite(), link_, cost, Lock::Defer)
            , t(t_)
        {
            t.onEnqueue(link);
            lock();
            t.onExecute(link);
+            consume(cost);
        }
    };

@ -310,8 +311,9 @@ struct ResourceTestManager : public ResourceTestBase
    // NOTE: actually leader's request(s) make their own small busy period.
    void blockResource(ResourceLink link)
    {
-        ResourceGuard g(link, 1, ResourceGuard::PostponeLocking);
+        ResourceGuard g(ResourceGuard::Metrics::getIOWrite(), link, 1, ResourceGuard::Lock::Defer);
        g.lock();
+        g.consume(1);
        // NOTE: at this point we assume resource to be blocked by single request (<max_requests>1</max_requests>)
        busy_period.arrive_and_wait(); // (1) notify all followers that resource is blocked
        busy_period.arrive_and_wait(); // (2) wait all followers to enqueue their requests
@ -320,10 +322,11 @@ struct ResourceTestManager : public ResourceTestBase
    {
        getLinkData(link).left += total_requests + 1;
        busy_period.arrive_and_wait(); // (1) wait leader to block resource
-        ResourceGuard g(link, cost, ResourceGuard::PostponeLocking);
+        ResourceGuard g(ResourceGuard::Metrics::getIOWrite(), link, cost, ResourceGuard::Lock::Defer);
        onEnqueue(link);
        busy_period.arrive_and_wait(); // (2) notify leader to unblock
        g.lock();
+        g.consume(cost);
        onExecute(link);
    }
 };
--- a/src/Common/Scheduler/Nodes/tests/gtest_dynamic_resource_manager.cpp
+++ b/src/Common/Scheduler/Nodes/tests/gtest_dynamic_resource_manager.cpp
@ -36,11 +36,16 @@ TEST(SchedulerDynamicResourceManager, Smoke)

    for (int i = 0; i < 10; i++)
    {
-        ResourceGuard gA(cA->get("res1"), ResourceGuard::PostponeLocking);
+        ResourceGuard gA(ResourceGuard::Metrics::getIOWrite(), cA->get("res1"), 1, ResourceGuard::Lock::Defer);
        gA.lock();
+        gA.consume(1);
        gA.unlock();

-        ResourceGuard gB(cB->get("res1"));
+        ResourceGuard gB(ResourceGuard::Metrics::getIOWrite(), cB->get("res1"));
+        gB.unlock();
+
+        ResourceGuard gC(ResourceGuard::Metrics::getIORead(), cB->get("res1"));
+        gC.consume(2); // Report consumption on the still-locked `gC`; `gB` has already been unlocked above
    }
 }

--- a/src/Common/Scheduler/Nodes/tests/gtest_resource_scheduler.cpp
+++ b/src/Common/Scheduler/Nodes/tests/gtest_resource_scheduler.cpp
@ -1,11 +1,13 @@
 #include <gtest/gtest.h>

-#include <Common/Scheduler/SchedulerRoot.h>
-
 #include <Common/Scheduler/Nodes/tests/ResourceTest.h>

+#include <Common/Scheduler/SchedulerRoot.h>
+#include <Common/randomSeed.h>
+
 #include <barrier>
 #include <future>
+#include <pcg_random.hpp>

 using namespace DB;

@ -22,6 +24,17 @@ struct ResourceTest : public ResourceTestBase
    {
        scheduler.stop(true);
    }
+
+    std::mutex rng_mutex;
+    pcg64 rng{randomSeed()};
+
+    /// Draws a value from `std::uniform_int_distribution<T>(from, to)` using the shared `rng` under `rng_mutex`.
+    template <typename T>
+    T randomInt(T from, T to)
+    {
+        std::uniform_int_distribution<T> uniform(from, to);
+        std::lock_guard guard(rng_mutex);
+        return uniform(rng);
+    }
 };

 struct ResourceHolder
@ -109,26 +122,55 @@ TEST(SchedulerRoot, Smoke)
    r2.registerResource();

    {
-        ResourceGuard rg(a);
+        ResourceGuard rg(ResourceGuard::Metrics::getIOWrite(), a);
        EXPECT_TRUE(fc1->requests.contains(&rg.request));
+        rg.consume(1);
    }

    {
-        ResourceGuard rg(b);
+        ResourceGuard rg(ResourceGuard::Metrics::getIOWrite(), b);
        EXPECT_TRUE(fc1->requests.contains(&rg.request));
+        rg.consume(1);
    }

    {
-        ResourceGuard rg(c);
+        ResourceGuard rg(ResourceGuard::Metrics::getIOWrite(), c);
        EXPECT_TRUE(fc2->requests.contains(&rg.request));
+        rg.consume(1);
    }

    {
-        ResourceGuard rg(d);
+        ResourceGuard rg(ResourceGuard::Metrics::getIOWrite(), d);
        EXPECT_TRUE(fc2->requests.contains(&rg.request));
+        rg.consume(1);
    }
 }

+/// Issues requests whose real cost differs from the estimated one and checks that
+/// the dequeued cost minus the remaining queue budget equals the total real cost.
+TEST(SchedulerRoot, Budget)
+{
+    ResourceTest t;
+
+    ResourceHolder holder(t);
+    holder.add<ConstraintTest>("/", "<max_requests>1</max_requests>");
+    holder.add<PriorityPolicy>("/prio");
+    auto queue_a = holder.addQueue("/prio/A", "");
+    holder.registerResource();
+
+    int total_requests = 10;
+    ResourceCost total_real_cost = 0;
+    for (int request_no = 0; request_no < total_requests; ++request_no)
+    {
+        ResourceCost est_cost = t.randomInt(1, 10);
+        ResourceCost real_cost = t.randomInt(0, 10);
+        total_real_cost += real_cost;
+
+        ResourceGuard guard(ResourceGuard::Metrics::getIOWrite(), queue_a, est_cost);
+        guard.consume(real_cost);
+    }
+
+    EXPECT_EQ(total_requests, queue_a.queue->dequeued_requests);
+    EXPECT_EQ(total_real_cost, queue_a.queue->dequeued_cost - queue_a.queue->getBudget());
+}
+
 TEST(SchedulerRoot, Cancel)
 {
    ResourceTest t;
--- a/src/Common/Scheduler/ResouceLink.cpp
+++ b/src/Common/Scheduler/ResouceLink.cpp
@ -1,25 +0,0 @@
-#include <Common/Scheduler/ISchedulerQueue.h>
-#include <Common/Scheduler/ResourceLink.h>
-#include <Common/Scheduler/ResourceRequest.h>
-
-namespace DB
-{
-void ResourceLink::adjust(ResourceCost estimated_cost, ResourceCost real_cost) const
-{
-    if (queue)
-        queue->adjustBudget(estimated_cost, real_cost);
-}
-
-void ResourceLink::consumed(ResourceCost cost) const
-{
-    if (queue)
-        queue->consumeBudget(cost);
-}
-
-void ResourceLink::accumulate(DB::ResourceCost cost) const
-{
-    if (queue)
-        queue->accumulateBudget(cost);
-}
-}
-
--- a/src/Common/Scheduler/ResourceGuard.h
+++ b/src/Common/Scheduler/ResourceGuard.h
@ -7,10 +7,30 @@
 #include <Common/Scheduler/ResourceRequest.h>
 #include <Common/Scheduler/ResourceLink.h>

+#include <Common/CurrentThread.h>
+#include <Common/ProfileEvents.h>
+#include <Common/CurrentMetrics.h>
+
 #include <condition_variable>
 #include <mutex>


+namespace ProfileEvents
+{
+    extern const Event SchedulerIOReadRequests;
+    extern const Event SchedulerIOReadBytes;
+    extern const Event SchedulerIOReadWaitMicroseconds;
+    extern const Event SchedulerIOWriteRequests;
+    extern const Event SchedulerIOWriteBytes;
+    extern const Event SchedulerIOWriteWaitMicroseconds;
+}
+
+namespace CurrentMetrics
+{
+    extern const Metric SchedulerIOReadScheduled;
+    extern const Metric SchedulerIOWriteScheduled;
+}
+
 namespace DB
 {

@ -22,12 +42,42 @@ namespace DB
 class ResourceGuard
 {
 public:
-    enum ResourceGuardCtor
+    enum class Lock
    {
-        LockStraightAway, /// Locks inside constructor (default)
+        Default, /// Locks inside constructor

        // WARNING: Only for tests. It is not exception-safe because `lock()` must be called after construction.
-        PostponeLocking /// Don't lock in constructor, but send request
+        Defer /// Don't lock in constructor, but send request
+    };
+
+    /// Profile events and current metric that a request reports to.
+    /// Every field defaults to the corresponding `end()` value.
+    struct Metrics
+    {
+        const ProfileEvents::Event requests = ProfileEvents::end();
+        const ProfileEvents::Event cost = ProfileEvents::end();
+        const ProfileEvents::Event wait_microseconds = ProfileEvents::end();
+        const CurrentMetrics::Metric scheduled_count = CurrentMetrics::end();
+
+        /// Returns the shared set of metrics for IO reads.
+        static const Metrics * getIORead()
+        {
+            static const Metrics io_read_metrics{
+                .requests = ProfileEvents::SchedulerIOReadRequests,
+                .cost = ProfileEvents::SchedulerIOReadBytes,
+                .wait_microseconds = ProfileEvents::SchedulerIOReadWaitMicroseconds,
+                .scheduled_count = CurrentMetrics::SchedulerIOReadScheduled
+            };
+            return &io_read_metrics;
+        }
+
+        /// Returns the shared set of metrics for IO writes.
+        static const Metrics * getIOWrite()
+        {
+            static const Metrics io_write_metrics{
+                .requests = ProfileEvents::SchedulerIOWriteRequests,
+                .cost = ProfileEvents::SchedulerIOWriteBytes,
+                .wait_microseconds = ProfileEvents::SchedulerIOWriteWaitMicroseconds,
+                .scheduled_count = CurrentMetrics::SchedulerIOWriteScheduled
+            };
+            return &io_write_metrics;
+        }
+    };

    enum RequestState
@ -46,60 +96,74 @@ public:
            chassert(state == Finished);
            state = Enqueued;
            ResourceRequest::reset(cost_);
-            link_.queue->enqueueRequestUsingBudget(this);
+            estimated_cost = link_.queue->enqueueRequestUsingBudget(this); // NOTE: it modifies `cost` and enqueues request
        }

        // This function is executed inside scheduler thread and wakes thread issued this `request`.
        // That thread will continue execution and do real consumption of requested resource synchronously.
        void execute() override
        {
-            {
-                std::unique_lock lock(mutex);
-                chassert(state == Enqueued);
-                state = Dequeued;
-            }
+            std::unique_lock lock(mutex);
+            chassert(state == Enqueued);
+            state = Dequeued;
            dequeued_cv.notify_one();
        }

        void wait()
        {
+            CurrentMetrics::Increment scheduled(metrics->scheduled_count);
+            auto timer = CurrentThread::getProfileEvents().timer(metrics->wait_microseconds);
            std::unique_lock lock(mutex);
            dequeued_cv.wait(lock, [this] { return state == Dequeued; });
        }

-        void finish()
+        void finish(ResourceCost real_cost_, ResourceLink link_)
        {
            // lock(mutex) is not required because `Dequeued` request cannot be used by the scheduler thread
            chassert(state == Dequeued);
            state = Finished;
+            if (estimated_cost != real_cost_)
+                link_.queue->adjustBudget(estimated_cost, real_cost_);
            ResourceRequest::finish();
+            ProfileEvents::increment(metrics->requests);
+            ProfileEvents::increment(metrics->cost, real_cost_);
        }

-        static Request & local()
+        void assertFinished()
+        {
+            // lock(mutex) is not required because `Finished` request cannot be used by the scheduler thread
+            chassert(state == Finished);
+        }
+
+        static Request & local(const Metrics * metrics)
        {
            // Since single thread cannot use more than one resource request simultaneously,
            // we can reuse thread-local request to avoid allocations
            static thread_local Request instance;
+            instance.metrics = metrics;
            return instance;
        }

+        const Metrics * metrics = nullptr; // Must be initialized before use
+
    private:
+        ResourceCost estimated_cost = 0; // Stores initial `cost` value in case budget was used to modify it
        std::mutex mutex;
        std::condition_variable dequeued_cv;
        RequestState state = Finished;
    };

-    /// Creates pending request for resource; blocks while resource is not available (unless `PostponeLocking`)
-    explicit ResourceGuard(ResourceLink link_, ResourceCost cost = 1, ResourceGuardCtor ctor = LockStraightAway)
+    /// Creates pending request for resource; blocks while resource is not available (unless `Lock::Defer`)
+    explicit ResourceGuard(const Metrics * metrics, ResourceLink link_, ResourceCost cost = 1, ResourceGuard::Lock type = ResourceGuard::Lock::Default)
        : link(link_)
-        , request(Request::local())
+        , request(Request::local(metrics))
    {
        if (cost == 0)
-            link.queue = nullptr; // Ignore zero-cost requests
-        else if (link.queue)
+            link.reset(); // Ignore zero-cost requests
+        else if (link)
        {
            request.enqueue(cost, link);
-            if (ctor == LockStraightAway)
+            if (type == Lock::Default)
                request.wait();
        }
    }
@ -112,22 +176,29 @@ public:
    /// Blocks until resource is available
    void lock()
    {
-        if (link.queue)
+        if (link)
            request.wait();
    }

-    /// Report resource consumption has finished
-    void unlock()
+    void consume(ResourceCost cost)
    {
-        if (link.queue)
+        real_cost += cost;
+    }
+
+    /// Report resource consumption has finished
+    void unlock(ResourceCost consumed = 0)
+    {
+        consume(consumed);
+        if (link)
        {
-            request.finish();
-            link.queue = nullptr;
+            request.finish(real_cost, link);
+            link.reset();
        }
    }

    ResourceLink link;
    Request & request;
+    ResourceCost real_cost = 0;
 };

 }
--- a/src/Common/Scheduler/ResourceLink.h
+++ b/src/Common/Scheduler/ResourceLink.h
@ -13,13 +13,28 @@ using ResourceCost = Int64;
 struct ResourceLink
 {
    ISchedulerQueue * queue = nullptr;
+
    bool operator==(const ResourceLink &) const = default;
+    explicit operator bool() const { return queue != nullptr; }

-    void adjust(ResourceCost estimated_cost, ResourceCost real_cost) const;
+    void reset()
+    {
+        queue = nullptr;
+    }
+};

-    void consumed(ResourceCost cost) const;
+/*
+ * Everything required for IO scheduling.
+ * Note that raw pointers are stored inside, so make sure that the `ClassifierPtr` that produced
+ * the resource links will outlive them. Usually the classifier is stored in the query `Context`.
+ */
+struct IOSchedulingSettings
+{
+    ResourceLink read_resource_link;
+    ResourceLink write_resource_link;

-    void accumulate(ResourceCost cost) const;
+    bool operator==(const IOSchedulingSettings &) const = default;
+    explicit operator bool() const { return read_resource_link && write_resource_link; }
 };

 }
--- a/src/Common/Scheduler/ResourceRequest.h
+++ b/src/Common/Scheduler/ResourceRequest.h
@ -45,7 +45,7 @@ constexpr ResourceCost ResourceCostMax = std::numeric_limits<int>::max();
 class ResourceRequest : public boost::intrusive::list_base_hook<>
 {
 public:
-    /// Cost of request execution; should be filled before request enqueueing.
+    /// Cost of request execution; should be filled before request enqueueing and remain constant until `finish()`.
    /// NOTE: If cost is not known in advance, ResourceBudget should be used (note that every ISchedulerQueue has it)
    ResourceCost cost;

--- a/src/Common/ThreadPool.cpp
+++ b/src/Common/ThreadPool.cpp
@ -1,4 +1,5 @@
 #include <Common/ThreadPool.h>
+#include <Common/ProfileEvents.h>
 #include <Common/setThreadName.h>
 #include <Common/Exception.h>
 #include <Common/getNumberOfPhysicalCPUCores.h>
@ -27,6 +28,25 @@ namespace CurrentMetrics
    extern const Metric GlobalThreadScheduled;
 }

+namespace ProfileEvents
+{
+    extern const Event GlobalThreadPoolExpansions;
+    extern const Event GlobalThreadPoolShrinks;
+    extern const Event GlobalThreadPoolThreadCreationMicroseconds;
+    extern const Event GlobalThreadPoolLockWaitMicroseconds;
+    extern const Event GlobalThreadPoolJobs;
+    extern const Event GlobalThreadPoolJobWaitTimeMicroseconds;
+
+    extern const Event LocalThreadPoolExpansions;
+    extern const Event LocalThreadPoolShrinks;
+    extern const Event LocalThreadPoolThreadCreationMicroseconds;
+    extern const Event LocalThreadPoolLockWaitMicroseconds;
+    extern const Event LocalThreadPoolJobs;
+    extern const Event LocalThreadPoolBusyMicroseconds;
+    extern const Event LocalThreadPoolJobWaitTimeMicroseconds;
+
+}
+
 class JobWithPriority
 {
 public:
@ -40,6 +60,7 @@ public:
    /// Call stacks of all jobs' schedulings leading to this one
    std::vector<StackTrace::FramePointers> frame_pointers;
    bool enable_job_stack_trace = false;
+    Stopwatch job_create_time;

    JobWithPriority(
        Job job_, Priority priority_, CurrentMetrics::Metric metric,
@ -59,6 +80,13 @@ public:
    {
        return priority > rhs.priority; // Reversed for `priority_queue` max-heap to yield minimum value (i.e. highest priority) first
    }
+
+    UInt64 elapsedMicroseconds() const  /// Microseconds reported by the `job_create_time` stopwatch; the worker reports this value as the job wait time right after dequeuing the job.
+    {
+        return job_create_time.elapsedMicroseconds();
+    }
+
+
 };

 static constexpr auto DEFAULT_THREAD_NAME = "ThreadPool";
@ -180,14 +208,18 @@ ReturnType ThreadPoolImpl<Thread>::scheduleImpl(Job job, Priority priority, std:
    };

    {
+        Stopwatch watch;
        std::unique_lock lock(mutex);
+        ProfileEvents::increment(
+            std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolLockWaitMicroseconds : ProfileEvents::LocalThreadPoolLockWaitMicroseconds,
+            watch.elapsedMicroseconds());

        if (CannotAllocateThreadFaultInjector::injectFault())
            return on_error("fault injected");

        auto pred = [this] { return !queue_size || scheduled_jobs < queue_size || shutdown; };

-        if (wait_microseconds)  /// Check for optional. Condition is true if the optional is set and the value is zero.
+        if (wait_microseconds)  /// Check for optional. Condition is true if the optional is set. Even if the value is zero.
        {
            if (!job_finished.wait_for(lock, std::chrono::microseconds(*wait_microseconds), pred))
                return on_error(fmt::format("no free thread (timeout={})", *wait_microseconds));
@ -216,7 +248,13 @@ ReturnType ThreadPoolImpl<Thread>::scheduleImpl(Job job, Priority priority, std:

            try
            {
+                Stopwatch watch2;
                threads.front() = Thread([this, it = threads.begin()] { worker(it); });
+                ProfileEvents::increment(
+                    std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolThreadCreationMicroseconds : ProfileEvents::LocalThreadPoolThreadCreationMicroseconds,
+                    watch2.elapsedMicroseconds());
+                ProfileEvents::increment(
+                    std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolExpansions : ProfileEvents::LocalThreadPoolExpansions);
            }
            catch (...)
            {
@ -239,6 +277,8 @@ ReturnType ThreadPoolImpl<Thread>::scheduleImpl(Job job, Priority priority, std:
    /// Wake up a free thread to run the new job.
    new_job_or_shutdown.notify_one();

+    ProfileEvents::increment(std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolJobs : ProfileEvents::LocalThreadPoolJobs);
+
    return static_cast<ReturnType>(true);
 }

@ -262,7 +302,14 @@ void ThreadPoolImpl<Thread>::startNewThreadsNoLock()

        try
        {
+            Stopwatch watch;
            threads.front() = Thread([this, it = threads.begin()] { worker(it); });
+            ProfileEvents::increment(
+                std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolThreadCreationMicroseconds : ProfileEvents::LocalThreadPoolThreadCreationMicroseconds,
+                watch.elapsedMicroseconds());
+            ProfileEvents::increment(
+                std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolExpansions : ProfileEvents::LocalThreadPoolExpansions);
+
        }
        catch (...)
        {
@ -293,7 +340,11 @@ void ThreadPoolImpl<Thread>::scheduleOrThrow(Job job, Priority priority, uint64_
 template <typename Thread>
 void ThreadPoolImpl<Thread>::wait()
 {
+    Stopwatch watch;
    std::unique_lock lock(mutex);
+    ProfileEvents::increment(
+        std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolLockWaitMicroseconds : ProfileEvents::LocalThreadPoolLockWaitMicroseconds,
+        watch.elapsedMicroseconds());
    /// Signal here just in case.
    /// If threads are waiting on condition variables, but there are some jobs in the queue
    /// then it will prevent us from deadlock.
@ -334,7 +385,11 @@ void ThreadPoolImpl<Thread>::finalize()

    /// Wait for all currently running jobs to finish (we don't wait for all scheduled jobs here like the function wait() does).
    for (auto & thread : threads)
+    {
        thread.join();
+        ProfileEvents::increment(
+            std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolShrinks : ProfileEvents::LocalThreadPoolShrinks);
+    }

    threads.clear();
 }
@ -391,7 +446,11 @@ void ThreadPoolImpl<Thread>::worker(typename std::list<Thread>::iterator thread_
        std::optional<JobWithPriority> job_data;

        {
+            Stopwatch watch;
            std::unique_lock lock(mutex);
+            ProfileEvents::increment(
+                std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolLockWaitMicroseconds : ProfileEvents::LocalThreadPoolLockWaitMicroseconds,
+                watch.elapsedMicroseconds());

            // Finish with previous job if any
            if (job_is_done)
@ -424,6 +483,8 @@ void ThreadPoolImpl<Thread>::worker(typename std::list<Thread>::iterator thread_
                {
                    thread_it->detach();
                    threads.erase(thread_it);
+                    ProfileEvents::increment(
+                        std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolShrinks : ProfileEvents::LocalThreadPoolShrinks);
                }
                return;
            }
@ -433,6 +494,10 @@ void ThreadPoolImpl<Thread>::worker(typename std::list<Thread>::iterator thread_
            job_data = std::move(const_cast<JobWithPriority &>(jobs.top()));
            jobs.pop();

+            ProfileEvents::increment(
+                std::is_same_v<Thread, std::thread> ? ProfileEvents::GlobalThreadPoolJobWaitTimeMicroseconds : ProfileEvents::LocalThreadPoolJobWaitTimeMicroseconds,
+                job_data->elapsedMicroseconds());
+
            /// We don't run jobs after `shutdown` is set, but we have to properly dequeue all jobs and finish them.
            if (shutdown)
            {
@ -459,7 +524,22 @@ void ThreadPoolImpl<Thread>::worker(typename std::list<Thread>::iterator thread_

            CurrentMetrics::Increment metric_active_pool_threads(metric_active_threads);

-            job_data->job();
+            if constexpr (!std::is_same_v<Thread, std::thread>)
+            {
+                Stopwatch watch;
+                job_data->job();
+                // This metric is less relevant for the global thread pool, as it would show large values (time while
+                // a thread was used by local pools) and increment only when local pools are destroyed.
+                //
+                // In cases where global pool threads are used directly (without a local thread pool), distinguishing
+                // them is difficult.
+                ProfileEvents::increment(ProfileEvents::LocalThreadPoolBusyMicroseconds, watch.elapsedMicroseconds());
+            }
+            else
+            {
+                job_data->job();
+            }
+

            if (thread_trace_context.root_span.isTraceEnabled())
            {
--- a/src/Common/ThreadPool.h
+++ b/src/Common/ThreadPool.h
@ -131,7 +131,7 @@ private:
    bool threads_remove_themselves = true;
    const bool shutdown_on_exception = true;

-    boost::heap::priority_queue<JobWithPriority> jobs;
+    boost::heap::priority_queue<JobWithPriority,boost::heap::stable<true>> jobs;
    std::list<Thread> threads;
    std::exception_ptr first_exception;
    std::stack<OnDestroyCallback> on_destroy_callbacks;
--- a/src/Common/ThreadStatus.h
+++ b/src/Common/ThreadStatus.h
@ -7,11 +7,11 @@
 #include <Common/MemoryTracker.h>
 #include <Common/ProfileEvents.h>
 #include <Common/Stopwatch.h>
+#include <Common/Scheduler/ResourceLink.h>

 #include <boost/noncopyable.hpp>

 #include <functional>
-#include <map>
 #include <memory>
 #include <mutex>
 #include <unordered_set>
@ -188,6 +188,10 @@ public:
    Progress progress_in;
    Progress progress_out;

+    /// IO scheduling
+    ResourceLink read_resource_link;
+    ResourceLink write_resource_link;
+
 private:
    /// Group of threads, to which this thread attached
    ThreadGroupPtr thread_group;
--- a/src/Core/ExternalTable.cpp
+++ b/src/Core/ExternalTable.cpp
@ -17,11 +17,12 @@

 #include <Core/ExternalTable.h>
 #include <Core/Settings.h>
-#include <Poco/Net/MessageHeader.h>
 #include <Parsers/ASTNameTypePair.h>
+#include <Parsers/IdentifierQuotingStyle.h>
 #include <Parsers/ParserCreateQuery.h>
 #include <Parsers/parseQuery.h>
 #include <base/scope_guard.h>
+#include <Poco/Net/MessageHeader.h>


 namespace DB
@ -85,7 +86,15 @@ void BaseExternalTable::parseStructureFromStructureField(const std::string & arg
        /// We use `formatWithPossiblyHidingSensitiveData` instead of `getColumnNameWithoutAlias` because `column->type` is an ASTFunction.
        /// `getColumnNameWithoutAlias` will return name of the function with `(arguments)` even if arguments is empty.
        if (column)
-            structure.emplace_back(column->name, column->type->formatWithPossiblyHidingSensitiveData(0, true, true, false));
+            structure.emplace_back(
+                column->name,
+                column->type->formatWithPossiblyHidingSensitiveData(
+                    /*max_length=*/0,
+                    /*one_line=*/true,
+                    /*show_secrets=*/true,
+                    /*print_pretty_type_names=*/false,
+                    /*always_quote_identifiers=*/false,
+                    /*identifier_quoting_style=*/IdentifierQuotingStyle::Backticks));
        else
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Error while parsing table structure: expected column definition, got {}", child->formatForErrorMessage());
    }
@ -102,7 +111,15 @@ void BaseExternalTable::parseStructureFromTypesField(const std::string & argumen
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Error while parsing table structure: {}", error);

    for (size_t i = 0; i < type_list_raw->children.size(); ++i)
-        structure.emplace_back("_" + toString(i + 1), type_list_raw->children[i]->formatWithPossiblyHidingSensitiveData(0, true, true, false));
+        structure.emplace_back(
+            "_" + toString(i + 1),
+            type_list_raw->children[i]->formatWithPossiblyHidingSensitiveData(
+                /*max_length=*/0,
+                /*one_line=*/true,
+                /*show_secrets=*/true,
+                /*print_pretty_type_names=*/false,
+                /*always_quote_identifiers=*/false,
+                /*identifier_quoting_style=*/IdentifierQuotingStyle::Backticks));
 }

 void BaseExternalTable::initSampleBlock()
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -1296,6 +1296,9 @@ class IColumn;
    M(Bool, precise_float_parsing, false, "Prefer more precise (but slower) float parsing algorithm", 0) \
    M(DateTimeOverflowBehavior, date_time_overflow_behavior, "ignore", "Overflow mode for Date, Date32, DateTime, DateTime64 types. Possible values: 'ignore', 'throw', 'saturate'.", 0) \
    M(Bool, validate_experimental_and_suspicious_types_inside_nested_types, true, "Validate usage of experimental and suspicious types inside nested types like Array/Map/Tuple", 0) \
+    \
+    M(Bool, output_format_always_quote_identifiers, false, "Always quote identifiers", 0) \
+    M(IdentifierQuotingStyle, output_format_identifier_quoting_style, IdentifierQuotingStyle::Backticks, "Set the quoting style for identifiers", 0) \


 // End of FORMAT_FACTORY_SETTINGS
--- a/src/Core/SettingsChangesHistory.cpp
+++ b/src/Core/SettingsChangesHistory.cpp
@ -75,6 +75,8 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
            {"join_output_by_rowlist_perkey_rows_threshold", 0, 5, "The lower limit of per-key average rows in the right table to determine whether to output by row list in hash join."},
            {"create_if_not_exists", false, false, "New setting."},
            {"allow_materialized_view_with_bad_select", true, true, "Support (but not enable yet) stricter validation in CREATE MATERIALIZED VIEW"},
+            {"output_format_always_quote_identifiers", false, false, "New setting."},
+            {"output_format_identifier_quoting_style", "Backticks", "Backticks", "New setting."}
        }
    },
    {"24.8",
--- a/src/Core/SettingsEnums.cpp
+++ b/src/Core/SettingsEnums.cpp
@ -244,4 +244,10 @@ IMPLEMENT_SETTING_ENUM(
    GroupArrayActionWhenLimitReached,
    ErrorCodes::BAD_ARGUMENTS,
    {{"throw", GroupArrayActionWhenLimitReached::THROW}, {"discard", GroupArrayActionWhenLimitReached::DISCARD}})
+
+IMPLEMENT_SETTING_ENUM(IdentifierQuotingStyle, ErrorCodes::BAD_ARGUMENTS,
+    {{"None", IdentifierQuotingStyle::None},
+     {"Backticks", IdentifierQuotingStyle::Backticks},
+     {"DoubleQuotes", IdentifierQuotingStyle::DoubleQuotes},
+     {"BackticksMySQL", IdentifierQuotingStyle::BackticksMySQL}})
 }
--- a/src/Core/SettingsEnums.h
+++ b/src/Core/SettingsEnums.h
@ -10,6 +10,7 @@
 #include <Formats/FormatSettings.h>
 #include <IO/ReadSettings.h>
 #include <Parsers/ASTSQLSecurity.h>
+#include <Parsers/IdentifierQuotingStyle.h>
 #include <QueryPipeline/SizeLimits.h>
 #include <Common/ShellCommandSettings.h>

@ -351,6 +352,8 @@ DECLARE_SETTING_ENUM_WITH_RENAME(DateTimeOverflowBehavior, FormatSettings::DateT

 DECLARE_SETTING_ENUM(SQLSecurityType)

+DECLARE_SETTING_ENUM(IdentifierQuotingStyle)
+
 enum class GroupArrayActionWhenLimitReached : uint8_t
 {
    THROW,
--- a/src/Core/SettingsFields.cpp
+++ b/src/Core/SettingsFields.cpp
@ -210,7 +210,7 @@ namespace
 {
    UInt64 stringToMaxThreads(const String & str)
    {
-        if (startsWith(str, "auto"))
+        if (startsWith(str, "auto") || startsWith(str, "'auto"))
            return 0;
        return parseFromString<UInt64>(str);
    }
@ -237,7 +237,8 @@ SettingFieldMaxThreads & SettingFieldMaxThreads::operator=(const Field & f)
 String SettingFieldMaxThreads::toString() const
 {
    if (is_auto)
-        return "auto(" + ::DB::toString(value) + ")";
+        /// Removing quotes here will introduce an incompatibility between replicas with different versions.
+        return "'auto(" + ::DB::toString(value) + ")'";
    else
        return ::DB::toString(value);
 }
--- a/src/DataTypes/DataTypeObject.cpp
+++ b/src/DataTypes/DataTypeObject.cpp
@ -519,10 +519,10 @@ static DataTypePtr createJSON(const ASTPtr & arguments)
    if (!context)
        context = Context::getGlobalContextInstance();

-    if (context->getSettingsRef().use_json_alias_for_old_object_type)
+    if (context->getSettingsRef().allow_experimental_object_type && context->getSettingsRef().use_json_alias_for_old_object_type)
    {
        if (arguments && !arguments->children.empty())
-            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Experimental Object type doesn't support any arguments. If you want to use new JSON type, set setting allow_experimental_json_type = 1");
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Experimental Object type doesn't support any arguments. If you want to use new JSON type, set settings allow_experimental_json_type = 1 and use_json_alias_for_old_object_type = 0");

        return std::make_shared<DataTypeObjectDeprecated>("JSON", false);
    }
--- a/src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp
+++ b/src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp
@ -8,6 +8,7 @@
 #include <IO/ReadBufferFromString.h>
 #include <Common/logger_useful.h>
 #include <Common/Throttler.h>
+#include <Common/Scheduler/ResourceGuard.h>
 #include <base/sleep.h>
 #include <Common/ProfileEvents.h>
 #include <IO/SeekableReadBuffer.h>
@ -113,7 +114,9 @@ bool ReadBufferFromAzureBlobStorage::nextImpl()
    {
        try
        {
+            ResourceGuard rlock(ResourceGuard::Metrics::getIORead(), read_settings.io_scheduling.read_resource_link, to_read_bytes);
            bytes_read = data_stream->ReadToCount(reinterpret_cast<uint8_t *>(data_ptr), to_read_bytes);
+            rlock.unlock(bytes_read); // Do not hold resource under bandwidth throttler
            if (read_settings.remote_throttler)
                read_settings.remote_throttler->add(bytes_read, ProfileEvents::RemoteReadThrottlerBytes, ProfileEvents::RemoteReadThrottlerSleepMicroseconds);
            break;
--- a/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp
+++ b/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp
@ -101,15 +101,13 @@ void WriteBufferFromAzureBlobStorage::execWithRetry(std::function<void()> func,
    {
        try
        {
-            ResourceGuard rlock(write_settings.resource_link, cost); // Note that zero-cost requests are ignored
+            ResourceGuard rlock(ResourceGuard::Metrics::getIOWrite(), write_settings.io_scheduling.write_resource_link, cost); // Note that zero-cost requests are ignored
            func();
+            rlock.unlock(cost);
            break;
        }
        catch (const Azure::Core::RequestFailedException & e)
        {
-            if (cost)
-                write_settings.resource_link.accumulate(cost); // Accumulate resource for later use, because we have failed to consume it
-
            if (i == num_tries - 1 || !isRetryableAzureException(e))
                throw;

@ -117,8 +115,6 @@ void WriteBufferFromAzureBlobStorage::execWithRetry(std::function<void()> func,
        }
        catch (...)
        {
-            if (cost)
-                write_settings.resource_link.accumulate(cost); // We assume no resource was used in case of failure
            throw;
        }
    }
--- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp
@ -461,14 +461,17 @@ DiskObjectStoragePtr DiskObjectStorage::createDiskObjectStorage()
 }

 template <class Settings>
-static inline Settings updateResourceLink(const Settings & settings, const String & resource_name)
+static inline Settings updateIOSchedulingSettings(const Settings & settings, const String & read_resource_name, const String & write_resource_name)
 {
-    if (resource_name.empty())
+    if (read_resource_name.empty() && write_resource_name.empty())
        return settings;
    if (auto query_context = CurrentThread::getQueryContext())
    {
        Settings result(settings);
-        result.resource_link = query_context->getWorkloadClassifier()->get(resource_name);
+        if (!read_resource_name.empty())
+            result.io_scheduling.read_resource_link = query_context->getWorkloadClassifier()->get(read_resource_name);
+        if (!write_resource_name.empty())
+            result.io_scheduling.write_resource_link = query_context->getWorkloadClassifier()->get(write_resource_name);
        return result;
    }
    return settings;
@ -500,7 +503,7 @@ std::unique_ptr<ReadBufferFromFileBase> DiskObjectStorage::readFile(

    return object_storage->readObjects(
        storage_objects,
-        updateResourceLink(settings, getReadResourceName()),
+        updateIOSchedulingSettings(settings, getReadResourceName(), getWriteResourceName()),
        read_hint,
        file_size);
 }
@ -513,7 +516,7 @@ std::unique_ptr<WriteBufferFromFileBase> DiskObjectStorage::writeFile(
 {
    LOG_TEST(log, "Write file: {}", path);

-    WriteSettings write_settings = updateResourceLink(settings, getWriteResourceName());
+    WriteSettings write_settings = updateIOSchedulingSettings(settings, getReadResourceName(), getWriteResourceName());
    auto transaction = createObjectStorageTransaction();
    return transaction->writeFile(path, buf_size, mode, write_settings);
 }
--- a/src/IO/ReadBufferFromS3.cpp
+++ b/src/IO/ReadBufferFromS3.cpp
@ -6,7 +6,6 @@

 #include <IO/ReadBufferFromIStream.h>
 #include <IO/ReadBufferFromS3.h>
-#include <Common/Scheduler/ResourceGuard.h>
 #include <IO/S3/getObjectInfo.h>
 #include <IO/S3/Requests.h>

@ -423,22 +422,13 @@ Aws::S3::Model::GetObjectResult ReadBufferFromS3::sendRequest(size_t attempt, si
    ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::ReadBufferFromS3InitMicroseconds);

    // We do not know in advance how many bytes we are going to consume, to avoid blocking estimated it from below
-    constexpr ResourceCost estimated_cost = 1;
-    ResourceGuard rlock(read_settings.resource_link, estimated_cost);
-
+    CurrentThread::IOScope io_scope(read_settings.io_scheduling);
    Aws::S3::Model::GetObjectOutcome outcome = client_ptr->GetObject(req);

-    rlock.unlock();
-
    if (outcome.IsSuccess())
-    {
-        ResourceCost bytes_read = outcome.GetResult().GetContentLength();
-        read_settings.resource_link.adjust(estimated_cost, bytes_read);
        return outcome.GetResultWithOwnership();
-    }
    else
    {
-        read_settings.resource_link.accumulate(estimated_cost);
        const auto & error = outcome.GetError();
        throw S3Exception(error.GetMessage(), error.GetErrorType());
    }
--- a/src/IO/ReadSettings.h
+++ b/src/IO/ReadSettings.h
@ -118,8 +118,7 @@ struct ReadSettings
    ThrottlerPtr remote_throttler;
    ThrottlerPtr local_throttler;

-    // Resource to be used during reading
-    ResourceLink resource_link;
+    IOSchedulingSettings io_scheduling;

    size_t http_max_tries = 10;
    size_t http_retry_initial_backoff_ms = 100;
--- a/src/IO/WriteBufferFromS3.cpp
+++ b/src/IO/WriteBufferFromS3.cpp
@ -11,7 +11,6 @@
 #include <Common/Throttler.h>
 #include <Interpreters/Cache/FileCache.h>

-#include <Common/Scheduler/ResourceGuard.h>
 #include <IO/WriteHelpers.h>
 #include <IO/S3Common.h>
 #include <IO/S3/Requests.h>
@ -558,12 +557,11 @@ void WriteBufferFromS3::writePart(WriteBufferFromS3::PartData && data)

        auto & request = std::get<0>(*worker_data);

-        ResourceCost cost = request.GetContentLength();
-        ResourceGuard rlock(write_settings.resource_link, cost);
+        CurrentThread::IOScope io_scope(write_settings.io_scheduling);
+
        Stopwatch watch;
        auto outcome = client_ptr->UploadPart(request);
        watch.stop();
-        rlock.unlock(); // Avoid acquiring other locks under resource lock

        ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds());

@ -577,7 +575,6 @@ void WriteBufferFromS3::writePart(WriteBufferFromS3::PartData && data)
        if (!outcome.IsSuccess())
        {
            ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1);
-            write_settings.resource_link.accumulate(cost); // We assume no resource was used in case of failure
            throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType());
        }

@ -715,12 +712,11 @@ void WriteBufferFromS3::makeSinglepartUpload(WriteBufferFromS3::PartData && data
            if (client_ptr->isClientForDisk())
                ProfileEvents::increment(ProfileEvents::DiskS3PutObject);

-            ResourceCost cost = request.GetContentLength();
-            ResourceGuard rlock(write_settings.resource_link, cost);
+            CurrentThread::IOScope io_scope(write_settings.io_scheduling);
+
            Stopwatch watch;
            auto outcome = client_ptr->PutObject(request);
            watch.stop();
-            rlock.unlock();

            ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds());
            if (blob_log)
@ -734,7 +730,6 @@ void WriteBufferFromS3::makeSinglepartUpload(WriteBufferFromS3::PartData && data
            }

            ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1);
-            write_settings.resource_link.accumulate(cost); // We assume no resource was used in case of failure

            if (outcome.GetError().GetErrorType() == Aws::S3::S3Errors::NO_SUCH_KEY)
            {
--- a/src/IO/WriteSettings.h
+++ b/src/IO/WriteSettings.h
@ -13,8 +13,7 @@ struct WriteSettings
    ThrottlerPtr remote_throttler;
    ThrottlerPtr local_throttler;

-    // Resource to be used during reading
-    ResourceLink resource_link;
+    IOSchedulingSettings io_scheduling;

    /// Filesystem cache settings
    bool enable_filesystem_cache_on_write_operations = false;
--- a/src/Interpreters/ActionsDAG.cpp
+++ b/src/Interpreters/ActionsDAG.cpp
@ -396,7 +396,7 @@ const ActionsDAG::Node * ActionsDAG::tryFindInOutputs(const std::string & name)
    return nullptr;
 }

-ActionsDAG::NodeRawConstPtrs ActionsDAG::findInOutpus(const Names & names) const
+ActionsDAG::NodeRawConstPtrs ActionsDAG::findInOutputs(const Names & names) const
 {
    NodeRawConstPtrs required_nodes;
    required_nodes.reserve(names.size());
@ -524,7 +524,7 @@ void ActionsDAG::removeUnusedActions(const NameSet & required_names, bool allow_

 void ActionsDAG::removeUnusedActions(const Names & required_names, bool allow_remove_inputs, bool allow_constant_folding)
 {
-    auto required_nodes = findInOutpus(required_names);
+    auto required_nodes = findInOutputs(required_names);
    outputs.swap(required_nodes);
    removeUnusedActions(allow_remove_inputs, allow_constant_folding);
 }
--- a/src/Interpreters/ActionsDAG.h
+++ b/src/Interpreters/ActionsDAG.h
@ -156,7 +156,7 @@ public:
    const Node * tryFindInOutputs(const std::string & name) const;

    /// Same, but for the list of names.
-    NodeRawConstPtrs findInOutpus(const Names & names) const;
+    NodeRawConstPtrs findInOutputs(const Names & names) const;

    /// Find first node with the same name in output nodes and replace it.
    /// If was not found, add node to outputs end.
@ -436,7 +436,7 @@ public:
    /// Returns a list of nodes representing atomic predicates.
    static NodeRawConstPtrs extractConjunctionAtoms(const Node * predicate);

-    /// Get a list of nodes. For every node, check if it can be compused using allowed subset of inputs.
+    /// Get a list of nodes. For every node, check if it can be computed using allowed subset of inputs.
    /// Returns only those nodes from the list which can be computed.
    static NodeRawConstPtrs filterNodesByAllowedInputs(
        NodeRawConstPtrs nodes,
--- a/src/Interpreters/AsynchronousInsertQueue.cpp
+++ b/src/Interpreters/AsynchronousInsertQueue.cpp
@ -33,6 +33,8 @@
 #include <Common/SensitiveDataMasker.h>
 #include <Common/SipHash.h>
 #include <Common/logger_useful.h>
+#include <Parsers/ASTExpressionList.h>
+#include <Parsers/ASTIdentifier.h>

 namespace CurrentMetrics
 {
@ -308,6 +310,7 @@ void AsynchronousInsertQueue::preprocessInsertQuery(const ASTPtr & query, const
        /* no_squash */ false,
        /* no_destination */ false,
        /* async_insert */ false);
+
    auto table = interpreter.getTable(insert_query);
    auto sample_block = InterpreterInsertQuery::getSampleBlock(insert_query, table, table->getInMemoryMetadataPtr(), query_context);

@ -318,6 +321,10 @@ void AsynchronousInsertQueue::preprocessInsertQuery(const ASTPtr & query, const
    /// InterpreterInsertQuery::getTable() -> ITableFunction::execute().
    if (insert_query.table_id)
        query_context->checkAccess(AccessType::INSERT, insert_query.table_id, sample_block.getNames());
+
+    insert_query.columns = std::make_shared<ASTExpressionList>();
+    for (const auto & column : sample_block)
+        insert_query.columns->children.push_back(std::make_shared<ASTIdentifier>(column.name));
 }

 AsynchronousInsertQueue::PushResult
@ -696,6 +703,17 @@ catch (...)
    tryLogCurrentException("AsynchronousInsertQueue", "Failed to add elements to AsynchronousInsertLog");
 }

+void convertBlockToHeader(Block & block, const Block & header)  /// Builds converting actions from the columns of `block` to the columns of `header` and executes them on `block`.
+{
+    auto converting_dag = ActionsDAG::makeConvertingActions(
+        block.getColumnsWithTypeAndName(),
+        header.getColumnsWithTypeAndName(),
+        ActionsDAG::MatchColumnsMode::Name);  /// Columns of `block` and `header` are matched by name.
+
+    auto converting_actions = std::make_shared<ExpressionActions>(std::move(converting_dag));
+    converting_actions->execute(block);  /// Runs on `block` itself, which is passed by non-const reference; nothing is returned.
+}
+
 String serializeQuery(const IAST & query, size_t max_length)
 {
    return query.hasSecretParts()
@ -791,6 +809,61 @@ try
    if (async_insert_log)
        log_elements.reserve(data->entries.size());

+    auto add_entry_to_asynchronous_insert_log = [&, query_by_format = NameToNameMap{}](
+        const InsertData::EntryPtr & entry,
+        const String & parsing_exception,
+        size_t num_rows,
+        size_t num_bytes) mutable
+    {
+        if (!async_insert_log)
+            return;
+
+        AsynchronousInsertLogElement elem;
+        elem.event_time = timeInSeconds(entry->create_time);
+        elem.event_time_microseconds = timeInMicroseconds(entry->create_time);
+        elem.database = query_database;
+        elem.table = query_table;
+        elem.format = entry->format;
+        elem.query_id = entry->query_id;
+        elem.bytes = num_bytes;
+        elem.rows = num_rows;
+        elem.exception = parsing_exception;
+        elem.data_kind = entry->chunk.getDataKind();
+        elem.timeout_milliseconds = data->timeout_ms.count();
+        elem.flush_query_id = insert_query_id;
+
+        auto get_query_by_format = [&](const String & format) -> const String &
+        {
+            auto [it, inserted] = query_by_format.try_emplace(format);
+            if (!inserted)
+                return it->second;
+
+            auto query = key.query->clone();
+            assert_cast<ASTInsertQuery &>(*query).format = format;
+            it->second = serializeQuery(*query, insert_context->getSettingsRef().log_queries_cut_to_length);
+            return it->second;
+        };
+
+        if (entry->chunk.getDataKind() == DataKind::Parsed)
+            elem.query_for_logging = key.query_str;
+        else
+            elem.query_for_logging = get_query_by_format(entry->format);
+
+        /// If there was a parsing error,
+        /// the entry won't be flushed anyway,
+        /// so add the log element immediately.
+        if (!elem.exception.empty())
+        {
+            elem.status = AsynchronousInsertLogElement::ParsingError;
+            async_insert_log->add(std::move(elem));
+        }
+        else
+        {
+            elem.status = AsynchronousInsertLogElement::Ok;
+            log_elements.push_back(std::move(elem));
+        }
+    };
+
    try
    {
        interpreter = std::make_unique<InterpreterInsertQuery>(
@ -819,49 +892,20 @@ try
    catch (...)
    {
        logExceptionBeforeStart(query_for_logging, insert_context, key.query, query_span, start_watch.elapsedMilliseconds());
+
+        if (async_insert_log)
+        {
+            for (const auto & entry : data->entries)
+                add_entry_to_asynchronous_insert_log(entry, /*parsing_exception=*/ "", /*num_rows=*/ 0, entry->chunk.byteSize());
+
+            auto exception = getCurrentExceptionMessage(false);
+            auto flush_time = std::chrono::system_clock::now();
+            appendElementsToLogSafe(*async_insert_log, std::move(log_elements), flush_time, exception);
+        }
        throw;
    }

-    auto add_entry_to_asynchronous_insert_log = [&](const auto & entry,
-                                                    const auto & entry_query_for_logging,
-                                                    const auto & exception,
-                                                    size_t num_rows,
-                                                    size_t num_bytes,
-                                                    Milliseconds timeout_ms)
-    {
-        if (!async_insert_log)
-            return;
-
-        AsynchronousInsertLogElement elem;
-        elem.event_time = timeInSeconds(entry->create_time);
-        elem.event_time_microseconds = timeInMicroseconds(entry->create_time);
-        elem.query_for_logging = entry_query_for_logging;
-        elem.database = query_database;
-        elem.table = query_table;
-        elem.format = entry->format;
-        elem.query_id = entry->query_id;
-        elem.bytes = num_bytes;
-        elem.rows = num_rows;
-        elem.exception = exception;
-        elem.data_kind = entry->chunk.getDataKind();
-        elem.timeout_milliseconds = timeout_ms.count();
-        elem.flush_query_id = insert_query_id;
-
-        /// If there was a parsing error,
-        /// the entry won't be flushed anyway,
-        /// so add the log element immediately.
-        if (!elem.exception.empty())
-        {
-            elem.status = AsynchronousInsertLogElement::ParsingError;
-            async_insert_log->add(std::move(elem));
-        }
-        else
-        {
-            log_elements.push_back(elem);
-        }
-    };
-
-    auto finish_entries = [&]
+    auto finish_entries = [&](size_t num_rows, size_t num_bytes)
    {
        for (const auto & entry : data->entries)
        {
@ -874,20 +918,7 @@ try
            auto flush_time = std::chrono::system_clock::now();
            appendElementsToLogSafe(*async_insert_log, std::move(log_elements), flush_time, "");
        }
-    };

-    Chunk chunk;
-    auto header = pipeline.getHeader();
-
-    if (key.data_kind == DataKind::Parsed)
-        chunk = processEntriesWithParsing(key, data, header, insert_context, log, add_entry_to_asynchronous_insert_log);
-    else
-        chunk = processPreprocessedEntries(key, data, header, insert_context, add_entry_to_asynchronous_insert_log);
-
-    ProfileEvents::increment(ProfileEvents::AsyncInsertRows, chunk.getNumRows());
-
-    auto log_and_add_finish_to_query_log = [&](size_t num_rows, size_t num_bytes)
-    {
        LOG_DEBUG(log, "Flushed {} rows, {} bytes for query '{}'", num_rows, num_bytes, key.query_str);
        queue_shard_flush_time_history.updateWithCurrentTime();

@ -896,16 +927,24 @@ try
            query_log_elem, insert_context, key.query, pipeline, pulling_pipeline, query_span, QueryCache::Usage::None, internal);
    };

-
-    if (chunk.getNumRows() == 0)
-    {
-        finish_entries();
-        log_and_add_finish_to_query_log(0, 0);
-        return;
-    }
-
    try
    {
+        Chunk chunk;
+        auto header = pipeline.getHeader();
+
+        if (key.data_kind == DataKind::Parsed)
+            chunk = processEntriesWithParsing(key, data, header, insert_context, log, add_entry_to_asynchronous_insert_log);
+        else
+            chunk = processPreprocessedEntries(data, header, add_entry_to_asynchronous_insert_log);
+
+        ProfileEvents::increment(ProfileEvents::AsyncInsertRows, chunk.getNumRows());
+
+        if (chunk.getNumRows() == 0)
+        {
+            finish_entries(/*num_rows=*/ 0, /*num_bytes=*/ 0);
+            return;
+        }
+
        size_t num_rows = chunk.getNumRows();
        size_t num_bytes = chunk.bytes();

@ -915,7 +954,7 @@ try
        CompletedPipelineExecutor completed_executor(pipeline);
        completed_executor.execute();

-        log_and_add_finish_to_query_log(num_rows, num_bytes);
+        finish_entries(num_rows, num_bytes);
    }
    catch (...)
    {
@ -929,8 +968,6 @@ try
        }
        throw;
    }
-
-    finish_entries();
 }
 catch (const Exception & e)
 {
@ -991,7 +1028,6 @@ Chunk AsynchronousInsertQueue::processEntriesWithParsing(

    StreamingFormatExecutor executor(header, format, std::move(on_error), std::move(adding_defaults_transform));
    auto chunk_info = std::make_shared<AsyncInsertInfo>();
-    auto query_for_logging = serializeQuery(*key.query, insert_context->getSettingsRef().log_queries_cut_to_length);

    for (const auto & entry : data->entries)
    {
@ -1009,7 +1045,8 @@ Chunk AsynchronousInsertQueue::processEntriesWithParsing(
        size_t num_rows = executor.execute(*buffer);

        total_rows += num_rows;
-        /// for some reason, client can pass zero rows and bytes to server.
+
+        /// For some reason, client can pass zero rows and bytes to server.
        /// We don't update offsets in this case, because we assume every insert has some rows during dedup
        /// but we have nothing to deduplicate for this insert.
        if (num_rows > 0)
@ -1018,8 +1055,7 @@ Chunk AsynchronousInsertQueue::processEntriesWithParsing(
            chunk_info->tokens.push_back(entry->async_dedup_token);
        }

-        add_to_async_insert_log(entry, query_for_logging, current_exception, num_rows, num_bytes, data->timeout_ms);
-
+        add_to_async_insert_log(entry, current_exception, num_rows, num_bytes);
        current_exception.clear();
        entry->resetChunk();
    }
@ -1031,30 +1067,14 @@ Chunk AsynchronousInsertQueue::processEntriesWithParsing(

 template <typename LogFunc>
 Chunk AsynchronousInsertQueue::processPreprocessedEntries(
-    const InsertQuery & key,
    const InsertDataPtr & data,
    const Block & header,
-    const ContextPtr & insert_context,
    LogFunc && add_to_async_insert_log)
 {
    size_t total_rows = 0;
    auto chunk_info = std::make_shared<AsyncInsertInfo>();
    auto result_columns = header.cloneEmptyColumns();

-    std::unordered_map<String, String> format_to_query;
-
-    auto get_query_by_format = [&](const String & format) -> const String &
-    {
-        auto [it, inserted] = format_to_query.try_emplace(format);
-        if (!inserted)
-            return it->second;
-
-        auto query = key.query->clone();
-        assert_cast<ASTInsertQuery &>(*query).format = format;
-        it->second = serializeQuery(*query, insert_context->getSettingsRef().log_queries_cut_to_length);
-        return it->second;
-    };
-
    for (const auto & entry : data->entries)
    {
        const auto * block = entry->chunk.asBlock();
@ -1062,23 +1082,26 @@ Chunk AsynchronousInsertQueue::processPreprocessedEntries(
            throw Exception(ErrorCodes::LOGICAL_ERROR,
                "Expected entry with data kind Preprocessed. Got: {}", entry->chunk.getDataKind());

-        auto columns = block->getColumns();
+        Block block_to_insert = *block;
+        if (!isCompatibleHeader(block_to_insert, header))
+            convertBlockToHeader(block_to_insert, header);
+
+        auto columns = block_to_insert.getColumns();
        for (size_t i = 0, s = columns.size(); i < s; ++i)
            result_columns[i]->insertRangeFrom(*columns[i], 0, columns[i]->size());

-        total_rows += block->rows();
-        /// for some reason, client can pass zero rows and bytes to server.
+        total_rows += block_to_insert.rows();
+
+        /// For some reason, client can pass zero rows and bytes to server.
        /// We don't update offsets in this case, because we assume every insert has some rows during dedup,
        /// but we have nothing to deduplicate for this insert.
-        if (block->rows())
+        if (block_to_insert.rows() > 0)
        {
            chunk_info->offsets.push_back(total_rows);
            chunk_info->tokens.push_back(entry->async_dedup_token);
        }

-        const auto & query_for_logging = get_query_by_format(entry->format);
-        add_to_async_insert_log(entry, query_for_logging, "", block->rows(), block->bytes(), data->timeout_ms);
-
+        add_to_async_insert_log(entry, /*parsing_exception=*/ "", block_to_insert.rows(), block_to_insert.bytes());
        entry->resetChunk();
    }

--- a/src/Interpreters/AsynchronousInsertQueue.h
+++ b/src/Interpreters/AsynchronousInsertQueue.h
@ -288,10 +288,8 @@ private:

    template <typename LogFunc>
    static Chunk processPreprocessedEntries(
-        const InsertQuery & key,
        const InsertDataPtr & data,
        const Block & header,
-        const ContextPtr & insert_context,
        LogFunc && add_to_async_insert_log);

    template <typename E>
--- a/src/Interpreters/InterpreterCreateQuery.cpp
+++ b/src/Interpreters/InterpreterCreateQuery.cpp
@ -821,6 +821,19 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti
        {
            properties.indices = as_storage_metadata->getSecondaryIndices();
            properties.projections = as_storage_metadata->getProjections().clone();
+
+            /// CREATE TABLE AS should copy PRIMARY KEY, ORDER BY, and similar clauses.
+            if (!create.storage->primary_key && as_storage_metadata->isPrimaryKeyDefined() && as_storage_metadata->hasPrimaryKey())
+                create.storage->set(create.storage->primary_key, as_storage_metadata->getPrimaryKeyAST()->clone());
+
+            if (!create.storage->partition_by && as_storage_metadata->isPartitionKeyDefined() && as_storage_metadata->hasPartitionKey())
+                create.storage->set(create.storage->partition_by, as_storage_metadata->getPartitionKeyAST()->clone());
+
+            if (!create.storage->order_by && as_storage_metadata->isSortingKeyDefined() && as_storage_metadata->hasSortingKey())
+                create.storage->set(create.storage->order_by, as_storage_metadata->getSortingKeyAST()->clone());
+
+            if (!create.storage->sample_by && as_storage_metadata->isSamplingKeyDefined() && as_storage_metadata->hasSamplingKey())
+                create.storage->set(create.storage->sample_by, as_storage_metadata->getSamplingKeyAST()->clone());
        }
        else
        {
--- a/src/Interpreters/ProcessorsProfileLog.cpp
+++ b/src/Interpreters/ProcessorsProfileLog.cpp
@ -30,6 +30,8 @@ ColumnsDescription ProcessorProfileLogElement::getColumnsDescription()
        {"id", std::make_shared<DataTypeUInt64>(), "ID of processor."},
        {"parent_ids", std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt64>()), "Parent processors IDs."},
        {"plan_step", std::make_shared<DataTypeUInt64>(), "ID of the query plan step which created this processor. The value is zero if the processor was not added from any step."},
+        {"plan_step_name", std::make_shared<DataTypeString>(), "Name of the query plan step which created this processor. The value is empty if the processor was not added from any step."},
+        {"plan_step_description", std::make_shared<DataTypeString>(), "Description of the query plan step which created this processor. The value is empty if the processor was not added from any step."},
        {"plan_group", std::make_shared<DataTypeUInt64>(), "Group of the processor if it was created by query plan step. A group is a logical partitioning of processors added from the same query plan step. Group is used only for beautifying the result of EXPLAIN PIPELINE result."},

        {"initial_query_id", std::make_shared<DataTypeString>(), "ID of the initial query (for distributed query execution)."},
@ -64,6 +66,8 @@ void ProcessorProfileLogElement::appendToBlock(MutableColumns & columns) const
    }

    columns[i++]->insert(plan_step);
+    columns[i++]->insert(plan_step_name);
+    columns[i++]->insert(plan_step_description);
    columns[i++]->insert(plan_group);
    columns[i++]->insertData(initial_query_id.data(), initial_query_id.size());
    columns[i++]->insertData(query_id.data(), query_id.size());
--- a/src/Interpreters/ProcessorsProfileLog.h
+++ b/src/Interpreters/ProcessorsProfileLog.h
@ -19,6 +19,8 @@ struct ProcessorProfileLogElement

    UInt64 plan_step{};
    UInt64 plan_group{};
+    String plan_step_name;
+    String plan_step_description;

    String initial_query_id;
    String query_id;
--- a/src/Interpreters/executeQuery.cpp
+++ b/src/Interpreters/executeQuery.cpp
@ -478,6 +478,8 @@ void logQueryFinish(
                    processor_elem.parent_ids = std::move(parents);

                    processor_elem.plan_step = reinterpret_cast<std::uintptr_t>(processor->getQueryPlanStep());
+                    processor_elem.plan_step_name = processor->getPlanStepName();
+                    processor_elem.plan_step_description = processor->getPlanStepDescription();
                    processor_elem.plan_group = processor->getQueryPlanStepGroup();

                    processor_elem.processor_name = processor->getName();
@ -793,7 +795,7 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
            /// Verify that AST formatting is consistent:
            /// If you format AST, parse it back, and format it again, you get the same string.

-            String formatted1 = ast->formatWithPossiblyHidingSensitiveData(0, true, true, false);
+            String formatted1 = ast->formatWithPossiblyHidingSensitiveData(0, true, true, false, false, IdentifierQuotingStyle::Backticks);

            /// The query can become more verbose after formatting, so:
            size_t new_max_query_size = max_query_size > 0 ? (1000 + 2 * max_query_size) : 0;
@ -818,7 +820,7 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(

            chassert(ast2);

-            String formatted2 = ast2->formatWithPossiblyHidingSensitiveData(0, true, true, false);
+            String formatted2 = ast2->formatWithPossiblyHidingSensitiveData(0, true, true, false, false, IdentifierQuotingStyle::Backticks);

            if (formatted1 != formatted2)
                throw Exception(ErrorCodes::LOGICAL_ERROR,
--- a/src/Interpreters/formatWithPossiblyHidingSecrets.h
+++ b/src/Interpreters/formatWithPossiblyHidingSecrets.h
@ -26,7 +26,12 @@ inline String format(const SecretHidingFormatSettings & settings)
        && settings.ctx->getAccess()->isGranted(AccessType::displaySecretsInShowAndSelect);

    return settings.query.formatWithPossiblyHidingSensitiveData(
-        settings.max_length, settings.one_line, show_secrets, settings.ctx->getSettingsRef().print_pretty_type_names);
+        settings.max_length,
+        settings.one_line,
+        show_secrets,
+        settings.ctx->getSettingsRef().print_pretty_type_names,
+        settings.ctx->getSettingsRef().output_format_always_quote_identifiers,
+        settings.ctx->getSettingsRef().output_format_identifier_quoting_style);
 }

 }
--- a/src/Parsers/ASTColumnDeclaration.cpp
+++ b/src/Parsers/ASTColumnDeclaration.cpp
@ -66,8 +66,8 @@ void ASTColumnDeclaration::formatImpl(const FormatSettings & format_settings, Fo
 {
    frame.need_parens = false;

-    /// We have to always backquote column names to avoid ambiguity with INDEX and other declarations in CREATE query.
-    format_settings.ostr << backQuote(name);
+    /// We have to always quote column names to avoid ambiguity with INDEX and other declarations in CREATE query.
+    format_settings.quoteIdentifier(name);

    if (type)
    {
--- a/src/Parsers/ASTDictionaryAttributeDeclaration.cpp
+++ b/src/Parsers/ASTDictionaryAttributeDeclaration.cpp
@ -35,7 +35,7 @@ void ASTDictionaryAttributeDeclaration::formatImpl(const FormatSettings & settin
 {
    frame.need_parens = false;

-    settings.ostr << backQuote(name);
+    settings.quoteIdentifier(name);

    if (type)
    {
--- a/src/Parsers/ASTIndexDeclaration.cpp
+++ b/src/Parsers/ASTIndexDeclaration.cpp
@ -79,7 +79,7 @@ void ASTIndexDeclaration::formatImpl(const FormatSettings & s, FormatState & sta
        }
        else
        {
-            s.ostr << backQuoteIfNeed(name);
+            s.writeIdentifier(name);
            s.ostr << " ";
            expr->formatImpl(s, state, frame);
        }
--- a/src/Parsers/ASTProjectionDeclaration.cpp
+++ b/src/Parsers/ASTProjectionDeclaration.cpp
@ -17,7 +17,7 @@ ASTPtr ASTProjectionDeclaration::clone() const

 void ASTProjectionDeclaration::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
 {
-    settings.ostr << backQuoteIfNeed(name);
+    settings.writeIdentifier(name);
    std::string indent_str = settings.one_line ? "" : std::string(4u * frame.indent, ' ');
    std::string nl_or_nothing = settings.one_line ? "" : "\n";
    settings.ostr << settings.nl_or_ws << indent_str << "(" << nl_or_nothing;
--- a/src/Parsers/ASTTableOverrides.cpp
+++ b/src/Parsers/ASTTableOverrides.cpp
@ -22,10 +22,8 @@ ASTPtr ASTTableOverride::clone() const
    return res;
 }

-void ASTTableOverride::formatImpl(const FormatSettings & settings_, FormatState & state, FormatStateStacked frame) const
+void ASTTableOverride::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
 {
-    FormatSettings settings = settings_;
-    settings.always_quote_identifiers = true;
    String nl_or_nothing = settings.one_line ? "" : "\n";
    String nl_or_ws = settings.one_line ? " " : "\n";
    String hl_keyword = settings.hilite ? hilite_keyword : "";
--- a/src/Parsers/IAST.cpp
+++ b/src/Parsers/IAST.cpp
@ -165,12 +165,21 @@ size_t IAST::checkDepthImpl(size_t max_depth) const
    return res;
 }

-String IAST::formatWithPossiblyHidingSensitiveData(size_t max_length, bool one_line, bool show_secrets, bool print_pretty_type_names) const
+String IAST::formatWithPossiblyHidingSensitiveData(
+    size_t max_length,
+    bool one_line,
+    bool show_secrets,
+    bool print_pretty_type_names,
+    bool always_quote_identifiers,
+    IdentifierQuotingStyle identifier_quoting_style) const
 {
+
    WriteBufferFromOwnString buf;
    FormatSettings settings(buf, one_line);
    settings.show_secrets = show_secrets;
    settings.print_pretty_type_names = print_pretty_type_names;
+    settings.always_quote_identifiers = always_quote_identifiers;
+    settings.identifier_quoting_style = identifier_quoting_style;
    format(settings);
    return wipeSensitiveDataAndCutToLength(buf.str(), max_length);
 }
@ -248,6 +257,34 @@ void IAST::FormatSettings::writeIdentifier(const String & name) const
    }
 }

+
+/// Writes `name` to `ostr` as a quoted identifier, choosing the quoting
+/// function from `identifier_quoting_style`. Unlike a style of `None` might
+/// suggest, the identifier is still quoted in that case: `None` is handled
+/// exactly like `Backticks`.
+void IAST::FormatSettings::quoteIdentifier(const String & name) const
+{
+    switch (identifier_quoting_style)
+    {
+        /// `None` deliberately shares the backtick implementation.
+        case IdentifierQuotingStyle::None:
+        case IdentifierQuotingStyle::Backticks:
+            writeBackQuotedString(name, ostr);
+            return;
+
+        case IdentifierQuotingStyle::DoubleQuotes:
+            writeDoubleQuotedString(name, ostr);
+            return;
+
+        case IdentifierQuotingStyle::BackticksMySQL:
+            writeBackQuotedStringMySQL(name, ostr);
+            return;
+    }
+}
+
 void IAST::dumpTree(WriteBuffer & ostr, size_t indent) const
 {
    String indent_str(indent, '-');
--- a/src/Parsers/IAST.h
+++ b/src/Parsers/IAST.h
@ -238,6 +238,9 @@ public:
        }

        void writeIdentifier(const String & name) const;
+        // Quote identifier `name` even when `always_quote_identifiers` is false.
+        // If `identifier_quoting_style` is `IdentifierQuotingStyle::None`, quote it with `IdentifierQuotingStyle::Backticks`
+        void quoteIdentifier(const String & name) const;
    };

    /// State. For example, a set of nodes can be remembered, which we already walk through.
@ -278,7 +281,13 @@ public:

    /// Secrets are displayed regarding show_secrets, then SensitiveDataMasker is applied.
    /// You can use Interpreters/formatWithPossiblyHidingSecrets.h for convenience.
-    String formatWithPossiblyHidingSensitiveData(size_t max_length, bool one_line, bool show_secrets, bool print_pretty_type_names) const;
+    String formatWithPossiblyHidingSensitiveData(
+        size_t max_length,
+        bool one_line,
+        bool show_secrets,
+        bool print_pretty_type_names,
+        bool always_quote_identifiers,
+        IdentifierQuotingStyle identifier_quoting_style) const;

    /** formatForLogging and formatForErrorMessage always hide secrets. This inconsistent
      * behaviour is due to the fact such functions are called from Client which knows nothing about
@ -287,12 +296,12 @@ public:
      */
    String formatForLogging(size_t max_length = 0) const
    {
-        return formatWithPossiblyHidingSensitiveData(max_length, true, false, false);
+        return formatWithPossiblyHidingSensitiveData(max_length, true, false, false, false, IdentifierQuotingStyle::Backticks);
    }

    String formatForErrorMessage() const
    {
-        return formatWithPossiblyHidingSensitiveData(0, true, false, false);
+        return formatWithPossiblyHidingSensitiveData(0, true, false, false, false, IdentifierQuotingStyle::Backticks);
    }

    virtual bool hasSecretParts() const { return childrenHaveSecretParts(); }
--- a/src/Parsers/tests/gtest_Parser.cpp
+++ b/src/Parsers/tests/gtest_Parser.cpp
@ -231,47 +231,47 @@ INSTANTIATE_TEST_SUITE_P(ParserCreateDatabaseQuery, ParserTest,
        },
        {
            "CREATE DATABASE db ENGINE=MaterializeMySQL('addr:port', 'db', 'user', 'pw') TABLE OVERRIDE `tbl`\n(PARTITION BY toYYYYMM(created))",
-            "CREATE DATABASE db\nENGINE = MaterializeMySQL('addr:port', 'db', 'user', 'pw')\nTABLE OVERRIDE `tbl`\n(\n    PARTITION BY toYYYYMM(`created`)\n)"
+            "CREATE DATABASE db\nENGINE = MaterializeMySQL('addr:port', 'db', 'user', 'pw')\nTABLE OVERRIDE tbl\n(\n    PARTITION BY toYYYYMM(created)\n)"
        },
        {
            "CREATE DATABASE db ENGINE=Foo TABLE OVERRIDE `tbl` (), TABLE OVERRIDE a (COLUMNS (_created DateTime MATERIALIZED now())), TABLE OVERRIDE b (PARTITION BY rand())",
-            "CREATE DATABASE db\nENGINE = Foo\nTABLE OVERRIDE `tbl`\n(\n\n),\nTABLE OVERRIDE `a`\n(\n    COLUMNS\n    (\n        `_created` DateTime MATERIALIZED now()\n    )\n),\nTABLE OVERRIDE `b`\n(\n    PARTITION BY rand()\n)"
+            "CREATE DATABASE db\nENGINE = Foo\nTABLE OVERRIDE tbl\n(\n\n),\nTABLE OVERRIDE a\n(\n    COLUMNS\n    (\n        `_created` DateTime MATERIALIZED now()\n    )\n),\nTABLE OVERRIDE b\n(\n    PARTITION BY rand()\n)"
        },
        {
            "CREATE DATABASE db ENGINE=MaterializeMySQL('addr:port', 'db', 'user', 'pw') TABLE OVERRIDE tbl (COLUMNS (id UUID) PARTITION BY toYYYYMM(created))",
-            "CREATE DATABASE db\nENGINE = MaterializeMySQL('addr:port', 'db', 'user', 'pw')\nTABLE OVERRIDE `tbl`\n(\n    COLUMNS\n    (\n        `id` UUID\n    )\n    PARTITION BY toYYYYMM(`created`)\n)"
+            "CREATE DATABASE db\nENGINE = MaterializeMySQL('addr:port', 'db', 'user', 'pw')\nTABLE OVERRIDE tbl\n(\n    COLUMNS\n    (\n        `id` UUID\n    )\n    PARTITION BY toYYYYMM(created)\n)"
        },
        {
            "CREATE DATABASE db TABLE OVERRIDE tbl (COLUMNS (INDEX foo foo TYPE minmax GRANULARITY 1) PARTITION BY if(_staged = 1, 'staging', toYYYYMM(created)))",
-            "CREATE DATABASE db\nTABLE OVERRIDE `tbl`\n(\n    COLUMNS\n    (\n        INDEX foo `foo` TYPE minmax GRANULARITY 1\n    )\n    PARTITION BY if(`_staged` = 1, 'staging', toYYYYMM(`created`))\n)"
+            "CREATE DATABASE db\nTABLE OVERRIDE tbl\n(\n    COLUMNS\n    (\n        INDEX foo foo TYPE minmax GRANULARITY 1\n    )\n    PARTITION BY if(_staged = 1, 'staging', toYYYYMM(created))\n)"
        },
        {
            "CREATE DATABASE db TABLE OVERRIDE t1 (TTL inserted + INTERVAL 1 MONTH DELETE), TABLE OVERRIDE t2 (TTL `inserted` + INTERVAL 2 MONTH DELETE)",
-            "CREATE DATABASE db\nTABLE OVERRIDE `t1`\n(\n    TTL `inserted` + toIntervalMonth(1)\n),\nTABLE OVERRIDE `t2`\n(\n    TTL `inserted` + toIntervalMonth(2)\n)"
+            "CREATE DATABASE db\nTABLE OVERRIDE t1\n(\n    TTL inserted + toIntervalMonth(1)\n),\nTABLE OVERRIDE t2\n(\n    TTL inserted + toIntervalMonth(2)\n)"
        },
        {
            "CREATE DATABASE db ENGINE = MaterializeMySQL('127.0.0.1:3306', 'db', 'root', 'pw') SETTINGS allows_query_when_mysql_lost = 1 TABLE OVERRIDE tab3 (COLUMNS (_staged UInt8 MATERIALIZED 1) PARTITION BY (c3) TTL c3 + INTERVAL 10 minute), TABLE OVERRIDE tab5 (PARTITION BY (c3) TTL c3 + INTERVAL 10 minute)",
-            "CREATE DATABASE db\nENGINE = MaterializeMySQL('127.0.0.1:3306', 'db', 'root', 'pw')\nSETTINGS allows_query_when_mysql_lost = 1\nTABLE OVERRIDE `tab3`\n(\n    COLUMNS\n    (\n        `_staged` UInt8 MATERIALIZED 1\n    )\n    PARTITION BY `c3`\n    TTL `c3` + toIntervalMinute(10)\n),\nTABLE OVERRIDE `tab5`\n(\n    PARTITION BY `c3`\n    TTL `c3` + toIntervalMinute(10)\n)"
+            "CREATE DATABASE db\nENGINE = MaterializeMySQL('127.0.0.1:3306', 'db', 'root', 'pw')\nSETTINGS allows_query_when_mysql_lost = 1\nTABLE OVERRIDE tab3\n(\n    COLUMNS\n    (\n        `_staged` UInt8 MATERIALIZED 1\n    )\n    PARTITION BY c3\n    TTL c3 + toIntervalMinute(10)\n),\nTABLE OVERRIDE tab5\n(\n    PARTITION BY c3\n    TTL c3 + toIntervalMinute(10)\n)"
        },
        {
            "CREATE DATABASE db TABLE OVERRIDE tbl (PARTITION BY toYYYYMM(created) COLUMNS (created DateTime CODEC(Delta)))",
-            "CREATE DATABASE db\nTABLE OVERRIDE `tbl`\n(\n    COLUMNS\n    (\n        `created` DateTime CODEC(Delta)\n    )\n    PARTITION BY toYYYYMM(`created`)\n)"
+            "CREATE DATABASE db\nTABLE OVERRIDE tbl\n(\n    COLUMNS\n    (\n        `created` DateTime CODEC(Delta)\n    )\n    PARTITION BY toYYYYMM(created)\n)"
        },
        {
-            "CREATE DATABASE db ENGINE = Foo() SETTINGS a = 1",
+            "CREATE DATABASE db ENGINE = Foo() SETTINGS a = 1", 
            "CREATE DATABASE db\nENGINE = Foo\nSETTINGS a = 1"
        },
        {
-            "CREATE DATABASE db ENGINE = Foo() SETTINGS a = 1, b = 2",
+            "CREATE DATABASE db ENGINE = Foo() SETTINGS a = 1, b = 2", 
            "CREATE DATABASE db\nENGINE = Foo\nSETTINGS a = 1, b = 2"
        },
        {
            "CREATE DATABASE db ENGINE = Foo() SETTINGS a = 1, b = 2 TABLE OVERRIDE a (ORDER BY (id, version))",
-            "CREATE DATABASE db\nENGINE = Foo\nSETTINGS a = 1, b = 2\nTABLE OVERRIDE `a`\n(\n    ORDER BY (`id`, `version`)\n)"
+            "CREATE DATABASE db\nENGINE = Foo\nSETTINGS a = 1, b = 2\nTABLE OVERRIDE a\n(\n    ORDER BY (id, version)\n)"
        },
        {
            "CREATE DATABASE db ENGINE = Foo() SETTINGS a = 1, b = 2 COMMENT 'db comment' TABLE OVERRIDE a (ORDER BY (id, version))",
-            "CREATE DATABASE db\nENGINE = Foo\nSETTINGS a = 1, b = 2\nTABLE OVERRIDE `a`\n(\n    ORDER BY (`id`, `version`)\n)\nCOMMENT 'db comment'"
+            "CREATE DATABASE db\nENGINE = Foo\nSETTINGS a = 1, b = 2\nTABLE OVERRIDE a\n(\n    ORDER BY (id, version)\n)\nCOMMENT 'db comment'"
        }
 })));

--- a/src/Planner/PlannerJoins.cpp
+++ b/src/Planner/PlannerJoins.cpp
@ -494,6 +494,12 @@ JoinClausesAndActions buildJoinClausesAndActions(
            necessary_names.push_back(name);
    };

+    bool is_join_with_special_storage = false;
+    if (const auto * right_table_node = join_node.getRightTableExpression()->as<TableNode>())
+    {
+        is_join_with_special_storage = dynamic_cast<const StorageJoin *>(right_table_node->getStorage().get());
+    }
+
    for (auto & join_clause : result.join_clauses)
    {
        const auto & left_filter_condition_nodes = join_clause.getLeftFilterConditionNodes();
@ -561,7 +567,7 @@ JoinClausesAndActions buildJoinClausesAndActions(
                if (!left_key_node->result_type->equals(*common_type))
                    left_key_node = &left_join_actions.addCast(*left_key_node, common_type, {});

-                if (!right_key_node->result_type->equals(*common_type))
+                if (!is_join_with_special_storage && !right_key_node->result_type->equals(*common_type))
                    right_key_node = &right_join_actions.addCast(*right_key_node, common_type, {});
            }

--- a/src/Processors/IProcessor.cpp
+++ b/src/Processors/IProcessor.cpp
@ -1,5 +1,6 @@
 #include <iostream>
 #include <Processors/IProcessor.h>
+#include <Processors/QueryPlan/IQueryPlanStep.h>

 #include <Common/logger_useful.h>
 #include <IO/WriteHelpers.h>
@ -9,6 +10,17 @@
 namespace DB
 {

+/// Records the query plan step (and the group within it) that this processor
+/// was created from. The step's name and description are copied into the
+/// processor so they stay readable from the processor itself.
+///
+/// Passing a null `step` detaches the processor from any step: the stored
+/// name and description are reset to empty strings so they never describe a
+/// step the processor no longer points to.
+void IProcessor::setQueryPlanStep(IQueryPlanStep * step, size_t group)
+{
+    query_plan_step = step;
+    query_plan_step_group = group;
+
+    if (step)
+    {
+        plan_step_name = step->getName();
+        plan_step_description = step->getStepDescription();
+    }
+    else
+    {
+        /// Without this, a processor re-set to "no step" would keep reporting
+        /// the previous step's name and description.
+        plan_step_name.clear();
+        plan_step_description.clear();
+    }
+}
+
 void IProcessor::cancel() noexcept
 {

--- a/src/Processors/IProcessor.h
+++ b/src/Processors/IProcessor.h
@ -311,14 +311,12 @@ public:
    constexpr static size_t NO_STREAM = std::numeric_limits<size_t>::max();

    /// Step of QueryPlan from which processor was created.
-    void setQueryPlanStep(IQueryPlanStep * step, size_t group = 0)
-    {
-        query_plan_step = step;
-        query_plan_step_group = group;
-    }
+    void setQueryPlanStep(IQueryPlanStep * step, size_t group = 0);

    IQueryPlanStep * getQueryPlanStep() const { return query_plan_step; }
    size_t getQueryPlanStepGroup() const { return query_plan_step_group; }
+    const String & getPlanStepName() const { return plan_step_name; }
+    const String & getPlanStepDescription() const { return plan_step_description; }

    uint64_t getElapsedNs() const { return elapsed_ns; }
    uint64_t getInputWaitElapsedNs() const { return input_wait_elapsed_ns; }
@ -410,6 +408,8 @@ private:

    IQueryPlanStep * query_plan_step = nullptr;
    size_t query_plan_step_group = 0;
+    String plan_step_name;
+    String plan_step_description;
 };


--- a/src/Processors/QueryPlan/Optimizations/useDataParallelAggregation.cpp
+++ b/src/Processors/QueryPlan/Optimizations/useDataParallelAggregation.cpp
@ -155,7 +155,7 @@ bool isPartitionKeySuitsGroupByKey(
        return false;

    /// We are interested only in calculations required to obtain group by keys (and not aggregate function arguments for example).
-    auto key_nodes = group_by_actions.findInOutpus(aggregating.getParams().keys);
+    auto key_nodes = group_by_actions.findInOutputs(aggregating.getParams().keys);
    auto group_by_key_actions = ActionsDAG::cloneSubDAG(key_nodes, /*remove_aliases=*/ true);

    const auto & gb_key_required_columns = group_by_key_actions.getRequiredColumnsNames();
--- a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp
+++ b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp
@ -83,7 +83,11 @@ void WriteBufferFromHTTPServerResponse::finishSendHeaders()
        return;

    if (!headers_started_sending)
+    {
+        if (compression_method != CompressionMethod::None)
+            response.set("Content-Encoding", toContentEncodingName(compression_method));
        startSendHeaders();
+    }

    writeHeaderSummary();
    writeExceptionCode();
@ -105,7 +109,13 @@ void WriteBufferFromHTTPServerResponse::nextImpl()
        initialized = true;

        if (compression_method != CompressionMethod::None)
-            response.set("Content-Encoding", toContentEncodingName(compression_method));
+        {
+            /// If we've already sent headers, just send the `Content-Encoding` down the socket directly
+            if (headers_started_sending)
+                socketSendStr("Content-Encoding: " + toContentEncodingName(compression_method) + "\r\n");
+            else
+                response.set("Content-Encoding", toContentEncodingName(compression_method));
+        }

        startSendHeaders();
        finishSendHeaders();
@ -177,8 +187,12 @@ void WriteBufferFromHTTPServerResponse::finalizeImpl()
        /// If no body data just send header
        startSendHeaders();

+        /// `finalizeImpl` must be idempotent, so set `initialized` here to not send stuff twice
        if (!initialized && offset() && compression_method != CompressionMethod::None)
+        {
+            initialized = true;
            socketSendStr("Content-Encoding: " + toContentEncodingName(compression_method) + "\r\n");
+        }

        finishSendHeaders();
    }
--- a/src/Storages/ObjectStorage/HDFS/ReadBufferFromHDFS.cpp
+++ b/src/Storages/ObjectStorage/HDFS/ReadBufferFromHDFS.cpp
@ -119,27 +119,16 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl : public BufferWithOwnMemory<S
            return false;
        }

-        ResourceGuard rlock(read_settings.resource_link, num_bytes_to_read);
-        int bytes_read;
-        try
-        {
-            bytes_read = hdfsRead(fs.get(), fin, internal_buffer.begin(), safe_cast<int>(num_bytes_to_read));
-        }
-        catch (...)
-        {
-            read_settings.resource_link.accumulate(num_bytes_to_read); // We assume no resource was used in case of failure
-            throw;
-        }
-        rlock.unlock();
+        ResourceGuard rlock(ResourceGuard::Metrics::getIORead(), read_settings.io_scheduling.read_resource_link, num_bytes_to_read);
+        int bytes_read = hdfsRead(fs.get(), fin, internal_buffer.begin(), safe_cast<int>(num_bytes_to_read));
+        rlock.unlock(std::max(0, bytes_read));

        if (bytes_read < 0)
        {
-            read_settings.resource_link.accumulate(num_bytes_to_read); // We assume no resource was used in case of failure
            throw Exception(ErrorCodes::NETWORK_ERROR,
                "Fail to read from HDFS: {}, file path: {}. Error: {}",
                hdfs_uri, hdfs_file_path, std::string(hdfsGetLastError()));
        }
-        read_settings.resource_link.adjust(num_bytes_to_read, bytes_read);

        if (bytes_read)
        {
--- a/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.cpp
+++ b/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.cpp
@ -66,25 +66,12 @@ struct WriteBufferFromHDFS::WriteBufferFromHDFSImpl

    int write(const char * start, size_t size)
    {
-        ResourceGuard rlock(write_settings.resource_link, size);
-        int bytes_written;
-        try
-        {
-            bytes_written = hdfsWrite(fs.get(), fout, start, safe_cast<int>(size));
-        }
-        catch (...)
-        {
-            write_settings.resource_link.accumulate(size); // We assume no resource was used in case of failure
-            throw;
-        }
-        rlock.unlock();
+        ResourceGuard rlock(ResourceGuard::Metrics::getIOWrite(), write_settings.io_scheduling.write_resource_link, size);
+        int bytes_written = hdfsWrite(fs.get(), fout, start, safe_cast<int>(size));
+        rlock.unlock(std::max(0, bytes_written));

        if (bytes_written < 0)
-        {
-            write_settings.resource_link.accumulate(size); // We assume no resource was used in case of failure
            throw Exception(ErrorCodes::NETWORK_ERROR, "Fail to write HDFS file: {} {}", hdfs_uri, std::string(hdfsGetLastError()));
-        }
-        write_settings.resource_link.adjust(size, bytes_written);

        if (write_settings.remote_throttler)
            write_settings.remote_throttler->add(bytes_written, ProfileEvents::RemoteWriteThrottlerBytes, ProfileEvents::RemoteWriteThrottlerSleepMicroseconds);
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@ -221,14 +221,17 @@ void StorageReplicatedMergeTree::setZooKeeper()
    /// strange effects. So we always use only one session for all tables.
    /// (excluding auxiliary zookeepers)

-    std::lock_guard lock(current_zookeeper_mutex);
    if (zookeeper_name == default_zookeeper_name)
    {
-        current_zookeeper = getContext()->getZooKeeper();
+        auto new_keeper = getContext()->getZooKeeper();
+        std::lock_guard lock(current_zookeeper_mutex);
+        current_zookeeper = new_keeper;
    }
    else
    {
-        current_zookeeper = getContext()->getAuxiliaryZooKeeper(zookeeper_name);
+        auto new_keeper = getContext()->getAuxiliaryZooKeeper(zookeeper_name);
+        std::lock_guard lock(current_zookeeper_mutex);
+        current_zookeeper = new_keeper;
    }
 }

@ -365,7 +368,7 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree(
    bool has_zookeeper = getContext()->hasZooKeeper() || getContext()->hasAuxiliaryZooKeeper(zookeeper_name);
    if (has_zookeeper)
    {
-        /// It's possible for getZooKeeper() to timeout if  zookeeper host(s) can't
+        /// It's possible for getZooKeeper() to timeout if zookeeper host(s) can't
        /// be reached. In such cases Poco::Exception is thrown after a connection
        /// timeout - refer to src/Common/ZooKeeper/ZooKeeperImpl.cpp:866 for more info.
        ///
--- a/tests/ci/changelog.py
+++ b/tests/ci/changelog.py
@ -288,7 +288,7 @@ def generate_description(item: PullRequest, repo: Repository) -> Optional[Descri
    # Normalize bug fixes
    if (
        re.match(
-            r".*(?i)bug\Wfix",
+            r"(?i).*bug\Wfix",
            category,
        )
        # Map "Critical Bug Fix" to "Bug fix" category for changelog
--- a/tests/ci/ci_config.py
+++ b/tests/ci/ci_config.py
@ -427,6 +427,7 @@ class CI:
            pr_only=True,
            # TODO: approach with reference job names does not work because digest may not be calculated if job skipped in wf
            # reference_job_name=JobNames.INTEGRATION_TEST_TSAN,
+            timeout=4 * 3600,  # to be able to process many updated tests
        ),
        JobNames.COMPATIBILITY_TEST: CommonJobConfigs.COMPATIBILITY_TEST.with_properties(
            required_builds=[BuildNames.PACKAGE_RELEASE],
--- a/tests/ci/integration_tests_runner.py
+++ b/tests/ci/integration_tests_runner.py
@ -33,7 +33,7 @@ CLICKHOUSE_BINARY_PATH = "usr/bin/clickhouse"
 CLICKHOUSE_ODBC_BRIDGE_BINARY_PATH = "usr/bin/clickhouse-odbc-bridge"
 CLICKHOUSE_LIBRARY_BRIDGE_BINARY_PATH = "usr/bin/clickhouse-library-bridge"

-FLAKY_TRIES_COUNT = 10  # run whole pytest several times
+FLAKY_TRIES_COUNT = 3  # run whole pytest several times
 FLAKY_REPEAT_COUNT = 5  # runs test case in single module several times
 MAX_TIME_SECONDS = 3600

@ -782,47 +782,71 @@ class ClickhouseIntegrationTestsRunner:
        logging.info("Found '%s' tests to run", " ".join(tests_to_run))
        result_state = "success"
        description_prefix = "No flaky tests: "
-        start = time.time()
        logging.info("Starting check with retries")
        final_retry = 0
-        logs = []
-        tries_num = 1 if should_fail else FLAKY_TRIES_COUNT
-        for i in range(tries_num):
-            if timeout_expired:
-                print("Timeout expired - break flaky check execution")
-                break
-            final_retry += 1
-            logging.info("Running tests for the %s time", i)
-            counters, tests_times, log_paths = self.try_run_test_group(
-                repo_path,
-                "bugfix" if should_fail else "flaky",
-                tests_to_run,
-                1,
-                1,
-                FLAKY_REPEAT_COUNT,
-            )
-            logs += log_paths
-            if counters["FAILED"]:
-                logging.info("Found failed tests: %s", " ".join(counters["FAILED"]))
-                description_prefix = "Failed tests found: "
-                result_state = "failure"
-                if not should_fail:
+        counters = {
+            "ERROR": [],
+            "PASSED": [],
+            "FAILED": [],
+            "SKIPPED": [],
+            "BROKEN": [],
+            "NOT_FAILED": [],
+        }  # type: Dict
+        tests_times = defaultdict(float)  # type: Dict
+        tests_log_paths = defaultdict(list)
+        id_counter = 0
+        for test_to_run in tests_to_run:
+            tries_num = 1 if should_fail else FLAKY_TRIES_COUNT
+            for i in range(tries_num):
+                if timeout_expired:
+                    print("Timeout expired - break flaky check execution")
                    break
-            if counters["ERROR"]:
-                description_prefix = "Failed tests found: "
-                logging.info("Found error tests: %s", " ".join(counters["ERROR"]))
-                # NOTE "error" result state will restart the whole test task,
-                # so we use "failure" here
-                result_state = "failure"
-                if not should_fail:
+                final_retry += 1
+                logging.info("Running tests for the %s time", i)
+                group_counters, group_test_times, log_paths = self.try_run_test_group(
+                    repo_path,
+                    f"bugfix_{id_counter}" if should_fail else f"flaky{id_counter}",
+                    [test_to_run],
+                    1,
+                    1,
+                    FLAKY_REPEAT_COUNT,
+                )
+                id_counter = id_counter + 1
+                for counter, value in group_counters.items():
+                    logging.info(
+                        "Tests from group %s stats, %s count %s",
+                        test_to_run,
+                        counter,
+                        len(value),
+                    )
+                    counters[counter] += value
+
+                for test_name, test_time in group_test_times.items():
+                    tests_times[test_name] = test_time
+                    tests_log_paths[test_name] = log_paths
+                if not should_fail and (
+                    group_counters["FAILED"] or group_counters["ERROR"]
+                ):
+                    logging.info(
+                        "Unexpected failure in group %s. Fail fast for current group",
+                        test_to_run,
+                    )
                    break
-            logging.info("Try is OK, all tests passed, going to clear env")
-            clear_ip_tables_and_restart_daemons()
-            logging.info("And going to sleep for some time")
-            if time.time() - start > MAX_TIME_SECONDS:
-                logging.info("Timeout reached, going to finish flaky check")
-                break
-            time.sleep(5)
+
+        if counters["FAILED"]:
+            logging.info("Found failed tests: %s", " ".join(counters["FAILED"]))
+            description_prefix = "Failed tests found: "
+            result_state = "failure"
+        if counters["ERROR"]:
+            description_prefix = "Failed tests found: "
+            logging.info("Found error tests: %s", " ".join(counters["ERROR"]))
+            # NOTE "error" result state will restart the whole test task,
+            # so we use "failure" here
+            result_state = "failure"
+        logging.info("Try is OK, all tests passed, going to clear env")
+        clear_ip_tables_and_restart_daemons()
+        logging.info("And going to sleep for some time")
+        time.sleep(5)

        test_result = []
        for state in ("ERROR", "FAILED", "PASSED", "SKIPPED"):
@ -833,13 +857,10 @@ class ClickhouseIntegrationTestsRunner:
            else:
                text_state = state
            test_result += [
-                (
-                    c + " (✕" + str(final_retry) + ")",
-                    text_state,
-                    f"{tests_times[c]:.2f}",
-                )
+                (c, text_state, f"{tests_times[c]:.2f}", tests_log_paths[c])
                for c in counters[state]
            ]
+
        status_text = description_prefix + ", ".join(
            [
                str(n).lower().replace("failed", "fail") + ": " + str(len(c))
@ -847,26 +868,50 @@ class ClickhouseIntegrationTestsRunner:
            ]
        )

-        return result_state, status_text, test_result, logs
+        return result_state, status_text, test_result, tests_log_paths

    def run_impl(self, repo_path, build_path):
        stopwatch = Stopwatch()
        if self.flaky_check or self.bugfix_validate_check:
-            return self.run_flaky_check(
-                repo_path, build_path, should_fail=self.bugfix_validate_check
+            result_state, status_text, test_result, tests_log_paths = (
+                self.run_flaky_check(
+                    repo_path, build_path, should_fail=self.bugfix_validate_check
+                )
+            )
+        else:
+            result_state, status_text, test_result, tests_log_paths = (
+                self.run_normal_check(build_path, repo_path)  # argument order is (build_path, repo_path), the reverse of run_flaky_check
            )

-        self._install_clickhouse(build_path)
+        if self.soft_deadline_time < time.time():  # the soft deadline has already passed
+            status_text = "Timeout, " + status_text
+            result_state = "failure"

+        if timeout_expired:
+            logging.error(
+                "Job killed by external timeout signal - setting status to failure!"
+            )
+            status_text = "Job timeout expired, " + status_text
+            result_state = "failure"
+            # add mock test case to make timeout visible in job report and in ci db
+            test_result.insert(
+                0, (JOB_TIMEOUT_TEST_NAME, "FAIL", f"{stopwatch.duration_seconds}", "")
+            )
+
+        if "(memory)" in self.params["context_name"]:  # checked last, so it overrides every failure set above
+            result_state = "success"
+
+        return result_state, status_text, test_result, tests_log_paths
+
+    def run_normal_check(self, build_path, repo_path):
+        self._install_clickhouse(build_path)
        logging.info("Pulling images")
        self._pre_pull_images(repo_path)
-
        logging.info(
            "Dump iptables before run %s",
            subprocess.check_output("sudo iptables -nvL", shell=True),
        )
        all_tests = self._get_all_tests(repo_path)
-
        if self.run_by_hash_total != 0:
            grouped_tests = self.group_test_by_file(all_tests)
            all_filtered_by_hash_tests = []
@ -874,7 +919,6 @@ class ClickhouseIntegrationTestsRunner:
                if stringhash(group) % self.run_by_hash_total == self.run_by_hash_num:
                    all_filtered_by_hash_tests += tests_in_group
            all_tests = all_filtered_by_hash_tests
-
        parallel_skip_tests = self._get_parallel_tests_skip_list(repo_path)
        logging.info(
            "Found %s tests first 3 %s", len(all_tests), " ".join(all_tests[:3])
@ -906,14 +950,12 @@ class ClickhouseIntegrationTestsRunner:
            len(not_found_tests),
            " ".join(not_found_tests[:3]),
        )
-
        grouped_tests = self.group_test_by_file(filtered_sequential_tests)
        i = 0
        for par_group in chunks(filtered_parallel_tests, PARALLEL_GROUP_SIZE):
            grouped_tests[f"parallel{i}"] = par_group
            i += 1
        logging.info("Found %s tests groups", len(grouped_tests))
-
        counters = {
            "ERROR": [],
            "PASSED": [],
@ -924,14 +966,11 @@ class ClickhouseIntegrationTestsRunner:
        }  # type: Dict
        tests_times = defaultdict(float)
        tests_log_paths = defaultdict(list)
-
        items_to_run = list(grouped_tests.items())
-
        logging.info("Total test groups %s", len(items_to_run))
        if self.shuffle_test_groups():
            logging.info("Shuffling test groups")
            random.shuffle(items_to_run)
-
        for group, tests in items_to_run:
            if timeout_expired:
                print("Timeout expired - break tests execution")
@ -959,7 +998,6 @@ class ClickhouseIntegrationTestsRunner:
            if len(counters["FAILED"]) + len(counters["ERROR"]) >= 20:
                logging.info("Collected more than 20 failed/error tests, stopping")
                break
-
        if counters["FAILED"] or counters["ERROR"]:
            logging.info(
                "Overall status failure, because we have tests in FAILED or ERROR state"
@ -968,7 +1006,6 @@ class ClickhouseIntegrationTestsRunner:
        else:
            logging.info("Overall success!")
            result_state = "success"
-
        test_result = []
        for state in (
            "ERROR",
@ -988,33 +1025,14 @@ class ClickhouseIntegrationTestsRunner:
                (c, text_state, f"{tests_times[c]:.2f}", tests_log_paths[c])
                for c in counters[state]
            ]
-
        failed_sum = len(counters["FAILED"]) + len(counters["ERROR"])
        status_text = f"fail: {failed_sum}, passed: {len(counters['PASSED'])}"

-        if self.soft_deadline_time < time.time():
-            status_text = "Timeout, " + status_text
-            result_state = "failure"
-
-        if timeout_expired:
-            logging.error(
-                "Job killed by external timeout signal - setting status to failure!"
-            )
-            status_text = "Job timeout expired, " + status_text
-            result_state = "failure"
-            # add mock test case to make timeout visible in job report and in ci db
-            test_result.insert(
-                0, (JOB_TIMEOUT_TEST_NAME, "FAIL", f"{stopwatch.duration_seconds}", "")
-            )
-
        if not counters or sum(len(counter) for counter in counters.values()) == 0:
            status_text = "No tests found for some reason! It's a bug"
            result_state = "failure"

-        if "(memory)" in self.params["context_name"]:
-            result_state = "success"
-
-        return result_state, status_text, test_result, []
+        return result_state, status_text, test_result, tests_log_paths


 def write_results(results_file, status_file, results, status):
@ -1047,7 +1065,9 @@ def run():
        logging.info("Clearing dmesg before run")
        subprocess.check_call("sudo -E dmesg --clear", shell=True)

-    state, description, test_results, _ = runner.run_impl(repo_path, build_path)
+    state, description, test_results, _test_log_paths = runner.run_impl(
+        repo_path, build_path
+    )
    logging.info("Tests finished")

    if IS_CI:
--- a/tests/integration/helpers/postgres_utility.py
+++ b/tests/integration/helpers/postgres_utility.py
@ -245,9 +245,9 @@ class PostgresManager:
    ):
        postgres_database = self.database_or_default(postgres_database)
        self.created_materialized_postgres_db_list.add(materialized_database)
-        self.instance.query(f"DROP DATABASE IF EXISTS {materialized_database}")
+        self.instance.query(f"DROP DATABASE IF EXISTS `{materialized_database}`")

-        create_query = f"CREATE DATABASE {materialized_database} ENGINE = MaterializedPostgreSQL('{ip}:{port}', '{postgres_database}', '{user}', '{password}')"
+        create_query = f"CREATE DATABASE `{materialized_database}` ENGINE = MaterializedPostgreSQL('{ip}:{port}', '{postgres_database}', '{user}', '{password}')"
        if len(settings) > 0:
            create_query += " SETTINGS "
            for i in range(len(settings)):
@ -259,7 +259,7 @@ class PostgresManager:
        assert materialized_database in self.instance.query("SHOW DATABASES")

    def drop_materialized_db(self, materialized_database="test_database"):
-        self.instance.query(f"DROP DATABASE IF EXISTS {materialized_database} SYNC")
+        self.instance.query(f"DROP DATABASE IF EXISTS `{materialized_database}` SYNC")
        if materialized_database in self.created_materialized_postgres_db_list:
            self.created_materialized_postgres_db_list.remove(materialized_database)

@ -329,11 +329,15 @@ def assert_nested_table_is_created(
        table = schema_name + "." + table_name

    print(f"Checking table {table} exists in {materialized_database}")
-    database_tables = instance.query(f"SHOW TABLES FROM {materialized_database}")
+    database_tables = instance.query(
+        f"SHOW TABLES FROM `{materialized_database}` WHERE name = '{table}'"
+    )

    while table not in database_tables:
        time.sleep(0.2)
-        database_tables = instance.query(f"SHOW TABLES FROM {materialized_database}")
+        database_tables = instance.query(
+            f"SHOW TABLES FROM `{materialized_database}` WHERE name = '{table}'"
+        )

    assert table in database_tables

@ -366,9 +370,9 @@ def check_tables_are_synchronized(

    table_path = ""
    if len(schema_name) == 0:
-        table_path = f"{materialized_database}.{table_name}"
+        table_path = f"`{materialized_database}`.`{table_name}`"
    else:
-        table_path = f"{materialized_database}.`{schema_name}.{table_name}`"
+        table_path = f"`{materialized_database}`.`{schema_name}.{table_name}`"

    print(f"Checking table is synchronized: {table_path}")
    result_query = f"select * from {table_path} order by {order_by};"
--- a/tests/integration/test_alter_settings_on_cluster/test.py
+++ b/tests/integration/test_alter_settings_on_cluster/test.py
@ -73,3 +73,8 @@ def test_default_database_on_cluster(started_cluster):
            database="test_default_database",
            sql="SHOW CREATE test_local_table FORMAT TSV",
        ).endswith("old_parts_lifetime = 100\n")
+
+    ch1.query(
+        database="test_default_database",
+        sql="DROP TABLE test_local_table ON CLUSTER 'cluster' SYNC",
+    )
--- a/tests/integration/test_always_fetch_merged/test.py
+++ b/tests/integration/test_always_fetch_merged/test.py
@ -80,3 +80,6 @@ def test_replica_always_download(started_cluster):

    assert int(node1_parts) < 10
    assert int(node2_parts) < 10
+
+    node1.query_with_retry("DROP TABLE test_table SYNC")
+    node2.query_with_retry("DROP TABLE test_table SYNC")
--- a/tests/integration/test_async_insert_adaptive_busy_timeout/test.py
+++ b/tests/integration/test_async_insert_adaptive_busy_timeout/test.py
@ -104,7 +104,7 @@ def test_with_merge_tree():
    _insert_queries_sequentially(
        table_name,
        _query_settings,
-        iterations=100,
+        iterations=10,
        max_values_size=1000,
        array_size_range=[10, 50],
    )
@ -125,7 +125,7 @@ def test_with_merge_tree_multithread():
        table_name,
        _query_settings,
        thread_num=15,
-        tasks=1000,
+        tasks=100,
        max_values_size=1000,
        array_size_range=[10, 15],
    )
@ -152,12 +152,12 @@ def test_with_replicated_merge_tree():
    _insert_queries_sequentially(
        table_name,
        settings,
-        iterations=100,
+        iterations=10,
        max_values_size=1000,
        array_size_range=[10, 50],
    )

-    node.query("DROP TABLE IF EXISTS {}".format(table_name))
+    node.query("DROP TABLE {} SYNC".format(table_name))


 def test_with_replicated_merge_tree_multithread():
@ -180,12 +180,12 @@ def test_with_replicated_merge_tree_multithread():
        table_name,
        _query_settings,
        thread_num=15,
-        tasks=1000,
+        tasks=100,
        max_values_size=1000,
        array_size_range=[10, 15],
    )

-    node.query("DROP TABLE IF EXISTS {}".format(table_name))
+    node.query("DROP TABLE {} SYNC".format(table_name))


 # Ensure that the combined duration of inserts with adaptive timeouts is less than
@ -200,13 +200,13 @@ def test_compare_sequential_inserts_durations_for_adaptive_and_fixed_async_timeo

    fixed_tm_settings = copy.copy(_query_settings)
    fixed_tm_settings["async_insert_use_adaptive_busy_timeout"] = 0
-    fixed_tm_settings["async_insert_busy_timeout_ms"] = 200
+    fixed_tm_settings["async_insert_busy_timeout_ms"] = 100

    fixed_tm_run_duration = timeit.timeit(
        lambda: _insert_queries_sequentially(
            fixed_tm_table_name,
            fixed_tm_settings,
-            iterations=100,
+            iterations=50,
            max_values_size=1000,
            array_size_range=[10, 50],
        ),
@ -231,13 +231,13 @@ def test_compare_sequential_inserts_durations_for_adaptive_and_fixed_async_timeo

    adaptive_tm_settings = copy.copy(_query_settings)
    adaptive_tm_settings["async_insert_busy_timeout_min_ms"] = 10
-    adaptive_tm_settings["async_insert_busy_timeout_max_ms"] = 1000
+    adaptive_tm_settings["async_insert_busy_timeout_max_ms"] = 500

    adaptive_tm_run_duration = timeit.timeit(
        lambda: _insert_queries_sequentially(
            adaptive_tm_table_name,
            adaptive_tm_settings,
-            iterations=100,
+            iterations=50,
            max_values_size=1000,
            array_size_range=[10, 50],
        ),
@ -268,14 +268,14 @@ def test_compare_parallel_inserts_durations_for_adaptive_and_fixed_async_timeout

    fixed_tm_settings = copy.copy(_query_settings)
    fixed_tm_settings["async_insert_use_adaptive_busy_timeout"] = 0
-    fixed_tm_settings["async_insert_busy_timeout_ms"] = 200
+    fixed_tm_settings["async_insert_busy_timeout_ms"] = 500

    fixed_tm_run_duration = timeit.timeit(
        lambda: _insert_queries_in_parallel(
            fixed_tm_table_name,
            fixed_tm_settings,
            thread_num=15,
-            tasks=1000,
+            tasks=150,
            max_values_size=1000,
            array_size_range=[10, 50],
        ),
@ -300,14 +300,14 @@ def test_compare_parallel_inserts_durations_for_adaptive_and_fixed_async_timeout

    adaptive_tm_settings = copy.copy(_query_settings)
    adaptive_tm_settings["async_insert_busy_timeout_min_ms"] = 10
-    adaptive_tm_settings["async_insert_busy_timeout_max_ms"] = 200
+    adaptive_tm_settings["async_insert_busy_timeout_max_ms"] = 500

    adaptive_tm_run_duration = timeit.timeit(
        lambda: _insert_queries_in_parallel(
            adaptive_tm_table_name,
            adaptive_tm_settings,
            thread_num=15,
-            tasks=100,
+            tasks=150,
            max_values_size=1000,
            array_size_range=[10, 50],
        ),
@ -344,29 +344,34 @@ def test_change_queries_frequency():

    settings = copy.copy(_query_settings)
    min_ms = 50
-    settings["async_insert_busy_timeout_min_ms"] = min_ms
-    settings["async_insert_busy_timeout_max_ms"] = 2000
+    max_ms = 200

-    _insert_queries_in_parallel(
-        table_name,
-        settings,
-        thread_num=15,
-        tasks=2000,
-        max_values_size=1000,
-        array_size_range=[10, 15],
-    )
+    settings["async_insert_busy_timeout_min_ms"] = min_ms
+    settings["async_insert_busy_timeout_max_ms"] = max_ms

    _insert_queries_sequentially(
        table_name,
        settings,
-        iterations=200,
+        iterations=50,
        max_values_size=1000,
        array_size_range=[10, 50],
    )
-
-    select_log_query = "SELECT timeout_milliseconds FROM system.asynchronous_insert_log ORDER BY event_time DESC LIMIT 50"
+    node.query("SYSTEM FLUSH LOGS")
+    select_log_query = f"SELECT countIf(timeout_milliseconds - {min_ms} < 25) FROM (SELECT timeout_milliseconds FROM system.asynchronous_insert_log ORDER BY event_time DESC LIMIT 10)"
    res = node.query(select_log_query)
-    for line in res.splitlines():
-        assert int(line) == min_ms
+    assert int(res) >= 5

-    node.query("DROP TABLE IF EXISTS {}".format(table_name))
+    _insert_queries_in_parallel(
+        table_name,
+        settings,
+        thread_num=10,
+        tasks=1000,
+        max_values_size=1000,
+        array_size_range=[10, 15],
+    )
+    node.query("SYSTEM FLUSH LOGS")
+    select_log_query = f"SELECT countIf({max_ms} - timeout_milliseconds < 100) FROM (SELECT timeout_milliseconds FROM system.asynchronous_insert_log ORDER BY event_time DESC LIMIT 10)"
+    res = node.query(select_log_query)
+    assert int(res) >= 5
+
+    node.query("DROP TABLE IF EXISTS {} SYNC".format(table_name))
--- a/tests/integration/test_async_load_databases/test.py
+++ b/tests/integration/test_async_load_databases/test.py
@ -28,9 +28,6 @@ def started_cluster():
            """
            CREATE DATABASE IF NOT EXISTS dict ENGINE=Dictionary;
            CREATE DATABASE IF NOT EXISTS test;
-            DROP TABLE IF EXISTS test.elements;
-            CREATE TABLE test.elements (id UInt64, a String, b Int32, c Float64) ENGINE=Log;
-            INSERT INTO test.elements VALUES (0, 'water', 10, 1), (1, 'air', 40, 0.01), (2, 'earth', 100, 1.7);
            """
        )

@ -49,6 +46,13 @@ def get_status(dictionary_name):
 def test_dict_get_data(started_cluster):
    query = instance.query

+    query(
+        "CREATE TABLE test.elements (id UInt64, a String, b Int32, c Float64) ENGINE=Log;"
+    )
+    query(
+        "INSERT INTO test.elements VALUES (0, 'water', 10, 1), (1, 'air', 40, 0.01), (2, 'earth', 100, 1.7);"
+    )
+
    # dictionaries_lazy_load == false, so these dictionary are not loaded.
    assert get_status("dep_x") == "NOT_LOADED"
    assert get_status("dep_y") == "NOT_LOADED"
@ -97,6 +101,8 @@ def test_dict_get_data(started_cluster):
    assert query("SELECT dictGetString('dep_x', 'a', toUInt64(4))") == "XX\n"
    assert query("SELECT dictGetString('dep_y', 'a', toUInt64(4))") == "ether\n"
    assert query("SELECT dictGetString('dep_z', 'a', toUInt64(4))") == "ZZ\n"
+    query("DROP TABLE IF EXISTS test.elements;")
+    instance.restart_clickhouse()


 def dependent_tables_assert():
@ -175,3 +181,5 @@ def test_multiple_tables(started_cluster):
    random.shuffle(order)
    for i in order:
        assert query(f"select count() from test.table_{i}") == "100\n"
+    for i in range(tables_count):
+        query(f"drop table test.table_{i} sync")
--- a/tests/integration/test_asynchronous_metric_log_table/test.py
+++ b/tests/integration/test_asynchronous_metric_log_table/test.py
@ -26,26 +26,24 @@ def started_cluster():
 # asynchronous_metric_update_period_s is being set to 2s so that the metrics are populated faster and
 # are available for querying during the test.
 def test_event_time_microseconds_field(started_cluster):
-    try:
-        cluster.start()
-        node1.query("SET log_queries = 1;")
-        node1.query("CREATE DATABASE replica;")
-        query_create = """CREATE TABLE replica.test
-        (
-           id Int64,
-           event_time DateTime
-        )
-        Engine=MergeTree()
-        PARTITION BY toYYYYMMDD(event_time)
-        ORDER BY id;"""
-        time.sleep(2)
-        node1.query(query_create)
-        node1.query("""INSERT INTO replica.test VALUES (1, now())""")
-        node1.query("SYSTEM FLUSH LOGS;")
+    node1.query("SET log_queries = 1;")
+    node1.query("CREATE DATABASE replica;")
+    query_create = """CREATE TABLE replica.test
+    (
+        id Int64,
+        event_time DateTime
+    )
+    Engine=MergeTree()
+    PARTITION BY toYYYYMMDD(event_time)
+    ORDER BY id;"""
+    time.sleep(2)
+    node1.query(query_create)
+    node1.query("""INSERT INTO replica.test VALUES (1, now())""")
+    node1.query("SYSTEM FLUSH LOGS;")

-        test_query = (
-            "SELECT count() > 0 ? 'ok' : 'fail' FROM system.asynchronous_metric_log"
-        )
-        assert "ok\n" in node1.query(test_query)
-    finally:
-        cluster.shutdown()
+    test_query = (
+        "SELECT count() > 0 ? 'ok' : 'fail' FROM system.asynchronous_metric_log"
+    )
+    assert "ok\n" in node1.query(test_query)
+    node1.query("DROP TABLE replica.test")
+    node1.query("DROP DATABASE replica")
--- a/tests/integration/test_backward_compatibility/test_functions.py
+++ b/tests/integration/test_backward_compatibility/test_functions.py
@ -67,6 +67,11 @@ def test_aggregate_states(start_cluster):
            f"select hex(initializeAggregation('{function_name}State', 'foo'))"
        ).strip()

+    def get_final_value_unhex(node, function_name, value):
+        return node.query(
+            f"select finalizeAggregation(unhex('{value}')::AggregateFunction({function_name}, String))"
+        ).strip()
+
    for aggregate_function in aggregate_functions:
        logging.info("Checking %s", aggregate_function)

@ -99,13 +104,39 @@ def test_aggregate_states(start_cluster):

        upstream_state = get_aggregate_state_hex(upstream, aggregate_function)
        if upstream_state != backward_state:
-            logging.info(
-                "Failed %s, %s (backward) != %s (upstream)",
-                aggregate_function,
-                backward_state,
-                upstream_state,
-            )
-            failed += 1
+            allowed_changes_if_result_is_the_same = ["anyHeavy"]
+
+            if aggregate_function in allowed_changes_if_result_is_the_same:
+                backward_final_from_upstream = get_final_value_unhex(
+                    backward, aggregate_function, upstream_state
+                )
+                upstream_final_from_backward = get_final_value_unhex(
+                    upstream, aggregate_function, backward_state
+                )
+
+                if backward_final_from_upstream == upstream_final_from_backward:
+                    logging.info(
+                        "OK %s (but different intermediate states)", aggregate_function
+                    )
+                    passed += 1
+                else:
+                    logging.error(
+                        "Failed %s, Intermediate: %s (backward) != %s (upstream). Final from intermediate: %s (backward from upstream state) != %s (upstream from backward state)",
+                        aggregate_function,
+                        backward_state,
+                        upstream_state,
+                        backward_final_from_upstream,
+                        upstream_final_from_backward,
+                    )
+                    failed += 1
+            else:
+                logging.error(
+                    "Failed %s, %s (backward) != %s (upstream)",
+                    aggregate_function,
+                    backward_state,
+                    upstream_state,
+                )
+                failed += 1
        else:
            logging.info("OK %s", aggregate_function)
            passed += 1
--- a/tests/integration/test_cgroup_limit/test.py
+++ b/tests/integration/test_cgroup_limit/test.py
@ -46,7 +46,7 @@ def test_cgroup_cpu_limit():
            "clickhouse local -q \"select value from system.settings where name='max_threads'\"",
            num_cpus,
        )
-        expect_output = (r"auto({})".format(math.ceil(num_cpus))).encode()
+        expect_output = (r"\'auto({})\'".format(math.ceil(num_cpus))).encode()
        assert (
            result.strip() == expect_output
        ), f"fail for cpu limit={num_cpus}, result={result.strip()}, expect={expect_output}"
--- a/tests/integration/test_filesystem_layout/test.py
+++ b/tests/integration/test_filesystem_layout/test.py
@ -79,3 +79,7 @@ def test_file_path_escaping(started_cluster):
            "test -f /var/lib/clickhouse/shadow/2/store/123/12345678-1000-4000-8000-000000000001/1_1_1_0/%7EId.bin",
        ]
    )
+    node.query("DROP TABLE test.`T.a_b,l-e!` SYNC")
+    node.query("DROP TABLE `test 2`.`T.a_b,l-e!` SYNC")
+    node.query("DROP DATABASE test")
+    node.query("DROP DATABASE `test 2`")
--- a/tests/integration/test_grant_and_revoke/test_with_table_engine_grant.py
+++ b/tests/integration/test_grant_and_revoke/test_with_table_engine_grant.py
@ -359,6 +359,8 @@ def test_implicit_create_view_grant():
    instance.query("GRANT CREATE VIEW ON test.* TO B", user="A")
    instance.query("CREATE VIEW test.view_2 AS SELECT 1", user="B")
    assert instance.query("SELECT * FROM test.view_2") == "1\n"
+    instance.query("DROP USER A")
+    instance.query("DROP VIEW test.view_2")


 def test_implicit_create_temporary_table_grant():
@ -530,6 +532,7 @@ def test_current_database():
    assert "Not enough privileges" in instance.query_and_get_error(
        "SELECT * FROM table", user="A"
    )
+    instance.query("DROP TABLE default.table SYNC")


 def test_grant_with_replace_option():
--- a/tests/integration/test_inserts_with_keeper_retries/configs/storage_conf.xml
+++ b/tests/integration/test_inserts_with_keeper_retries/configs/storage_conf.xml
@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="utf-8"?>
+
+<clickhouse>
+    <logger>
+        <level>test</level>
+    </logger>
+
+</clickhouse>
--- a/tests/integration/test_inserts_with_keeper_retries/test.py
+++ b/tests/integration/test_inserts_with_keeper_retries/test.py
@ -3,6 +3,7 @@
 import pytest
 import time
 import threading
+import uuid
 from helpers.cluster import ClickHouseCluster
 from multiprocessing.dummy import Pool
 from helpers.network import PartitionManager
@ -10,8 +11,12 @@ from helpers.client import QueryRuntimeException
 from helpers.test_tools import assert_eq_with_retry

 cluster = ClickHouseCluster(__file__)
-
-node1 = cluster.add_instance("node1", with_zookeeper=True)
+node1 = cluster.add_instance(
+    "node1",
+    main_configs=["configs/storage_conf.xml"],
+    with_zookeeper=True,
+    with_minio=True,
+)


@pytest.fixture(scope="module")
@ -25,10 +30,16 @@ def started_cluster():
        cluster.shutdown()


-def test_replica_inserts_with_keeper_restart(started_cluster):
+@pytest.mark.parametrize(
+    "engine,storage_policy",
+    [
+        ("ReplicatedMergeTree", "default"),
+    ],
+)
+def test_replica_inserts_with_keeper_restart(started_cluster, engine, storage_policy):
    try:
        node1.query(
-            "CREATE TABLE r (a UInt64, b String) ENGINE=ReplicatedMergeTree('/test/r', '0') ORDER BY tuple()"
+            f"CREATE TABLE r (a UInt64, b String) ENGINE={engine}('/test/r', '0') ORDER BY tuple() SETTINGS storage_policy='{storage_policy}'"
        )

        p = Pool(1)
@ -60,10 +71,18 @@ def test_replica_inserts_with_keeper_restart(started_cluster):
        node1.query("DROP TABLE IF EXISTS r SYNC")


-def test_replica_inserts_with_keeper_disconnect(started_cluster):
+@pytest.mark.parametrize(
+    "engine,storage_policy",
+    [
+        ("ReplicatedMergeTree", "default"),
+    ],
+)
+def test_replica_inserts_with_keeper_disconnect(
+    started_cluster, engine, storage_policy
+):
    try:
        node1.query(
-            "CREATE TABLE r (a UInt64, b String) ENGINE=ReplicatedMergeTree('/test/r', '0') ORDER BY tuple()"
+            f"CREATE TABLE r2 (a UInt64, b String) ENGINE={engine}('/test/r2', '0') ORDER BY tuple() SETTINGS storage_policy='{storage_policy}'"
        )

        p = Pool(1)
@ -84,26 +103,32 @@ def test_replica_inserts_with_keeper_disconnect(started_cluster):
        disconnect_event.wait(90)

        node1.query(
-            "INSERT INTO r SELECT number, toString(number) FROM numbers(10) SETTINGS insert_keeper_max_retries=20"
+            "INSERT INTO r2 SELECT number, toString(number) FROM numbers(10) SETTINGS insert_keeper_max_retries=20"
        )
        node1.query(
-            "INSERT INTO r SELECT number, toString(number) FROM numbers(10, 10) SETTINGS insert_keeper_max_retries=20"
+            "INSERT INTO r2 SELECT number, toString(number) FROM numbers(10, 10) SETTINGS insert_keeper_max_retries=20"
        )

        job.wait()
        p.close()
        p.join()

-        assert node1.query("SELECT COUNT() FROM r") == "20\n"
+        assert node1.query("SELECT COUNT() FROM r2") == "20\n"

    finally:
-        node1.query("DROP TABLE IF EXISTS r SYNC")
+        node1.query("DROP TABLE IF EXISTS r2 SYNC")


-def test_query_timeout_with_zk_down(started_cluster):
+@pytest.mark.parametrize(
+    "engine,storage_policy",
+    [
+        ("ReplicatedMergeTree", "default"),
+    ],
+)
+def test_query_timeout_with_zk_down(started_cluster, engine, storage_policy):
    try:
        node1.query(
-            "CREATE TABLE zk_down (a UInt64, b String) ENGINE=ReplicatedMergeTree('/test/zk_down', '0') ORDER BY tuple()"
+            f"CREATE TABLE zk_down (a UInt64, b String) ENGINE={engine}('/test/zk_down', '0') ORDER BY tuple() SETTINGS storage_policy='{storage_policy}'"
        )

        cluster.stop_zookeeper_nodes(["zoo1", "zoo2", "zoo3"])
@ -118,3 +143,45 @@ def test_query_timeout_with_zk_down(started_cluster):
    finally:
        cluster.start_zookeeper_nodes(["zoo1", "zoo2", "zoo3"])
        node1.query("DROP TABLE IF EXISTS zk_down SYNC")
+
+
+@pytest.mark.parametrize(
+    "engine,storage_policy",
+    [
+        ("ReplicatedMergeTree", "default"),
+    ],
+)
+def test_retries_should_not_wait_for_global_connection(
+    started_cluster, engine, storage_policy
+):
+    pm = PartitionManager()
+    try:
+        # Use a dedicated Keeper path: '/test/zk_down' (replica '0') already belongs to test_query_timeout_with_zk_down
+        node1.query(
+            f"CREATE TABLE zk_down_retries (a UInt64, b String) ENGINE={engine}('/test/zk_down_retries', '0') ORDER BY tuple() SETTINGS storage_policy='{storage_policy}'"
+        )
+
+        cluster.stop_zookeeper_nodes(["zoo1", "zoo2", "zoo3"])
+        # Apart from stopping keepers, we introduce a network delay to make connection retries slower
+        # We want to check that retries are not blocked during that time
+        pm.add_network_delay(node1, 1000)
+
+        query_id = uuid.uuid4()
+        with pytest.raises(QueryRuntimeException):
+            node1.query(
+                "INSERT INTO zk_down_retries SELECT number, toString(number) FROM numbers(10) SETTINGS insert_keeper_max_retries=10, insert_keeper_retry_max_backoff_ms=100",
+                query_id=str(query_id),
+            )
+        pm.heal_all()
+        # Use query_log for execution time since we want to ignore the network delay introduced (also in client)
+        node1.query("SYSTEM FLUSH LOGS")
+        res = node1.query(
+            f"SELECT query_duration_ms FROM system.query_log WHERE type != 'QueryStart' AND query_id = '{query_id}'"
+        )
+        query_duration = int(res)
+        # It should be around 1 second; 5 seconds is a generous bound (debug builds and so on). It used to take 35 seconds without the fix
+        assert query_duration < 5000
+    finally:
+        pm.heal_all()
+        cluster.start_zookeeper_nodes(["zoo1", "zoo2", "zoo3"])
+        node1.query("DROP TABLE IF EXISTS zk_down_retries SYNC")
--- a/tests/integration/test_materialized_mysql_database/materialized_with_ddl.py
+++ b/tests/integration/test_materialized_mysql_database/materialized_with_ddl.py
@ -2353,7 +2353,7 @@ def table_overrides(clickhouse_node, mysql_node, service_name):
    )
    check_query(clickhouse_node, "SELECT count() FROM table_overrides.t1", "1001\n")
    show_db = clickhouse_node.query("SHOW CREATE DATABASE table_overrides")
-    assert "TABLE OVERRIDE `t1`\\n(\\n\\n)" in show_db, show_db
+    assert "TABLE OVERRIDE t1\\n(\\n\\n)" in show_db, show_db

    clickhouse_node.query("DROP DATABASE IF EXISTS table_overrides")
    mysql_node.query("DROP DATABASE IF EXISTS table_overrides")
--- a/tests/integration/test_parallel_replicas_failover/test.py
+++ b/tests/integration/test_parallel_replicas_failover/test.py
@ -1,5 +1,5 @@
 import pytest
-
+import uuid
 from helpers.cluster import ClickHouseCluster

 cluster = ClickHouseCluster(__file__)
@ -25,19 +25,15 @@ def start_cluster():


 def create_tables(cluster, table_name, skip_last_replica):
-    node1.query(f"DROP TABLE IF EXISTS {table_name} SYNC")
-    node2.query(f"DROP TABLE IF EXISTS {table_name} SYNC")
-    node3.query(f"DROP TABLE IF EXISTS {table_name} SYNC")
-
    node1.query(
-        f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r1') ORDER BY (key)"
+        f"CREATE TABLE {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r1') ORDER BY (key)"
    )
    node2.query(
-        f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r2') ORDER BY (key)"
+        f"CREATE TABLE {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r2') ORDER BY (key)"
    )
    if not skip_last_replica:
        node3.query(
-            f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r3') ORDER BY (key)"
+            f"CREATE TABLE {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r3') ORDER BY (key)"
        )

    # populate data
@ -67,7 +63,7 @@ def test_skip_replicas_without_table(start_cluster):
    for i in range(4):
        expected_result += f"{i}\t1000\n"

-    log_comment = "5230b069-9574-407d-9b80-891b5a175f41"
+    log_comment = uuid.uuid4()
    assert (
        node1.query(
            f"SELECT key, count() FROM {table_name} GROUP BY key ORDER BY key",
@ -88,6 +84,8 @@ def test_skip_replicas_without_table(start_cluster):
        )
        == "1\t1\n"
    )
+    node1.query(f"DROP TABLE {table_name} SYNC")
+    node2.query(f"DROP TABLE {table_name} SYNC")


 def test_skip_unresponsive_replicas(start_cluster):
@ -112,3 +110,6 @@ def test_skip_unresponsive_replicas(start_cluster):
        )
        == expected_result
    )
+    node1.query(f"DROP TABLE {table_name} SYNC")
+    node2.query(f"DROP TABLE {table_name} SYNC")
+    node3.query(f"DROP TABLE {table_name} SYNC")
--- a/tests/integration/test_parallel_replicas_invisible_parts/test.py
+++ b/tests/integration/test_parallel_replicas_invisible_parts/test.py
@ -35,11 +35,10 @@ def start_cluster():


 def _create_tables(table_name, table_size, index_granularity):
-    nodes[0].query(f"DROP TABLE IF EXISTS {table_name} ON CLUSTER {cluster_name}")

    nodes[0].query(
        f"""
-        CREATE TABLE IF NOT EXISTS {table_name} ON CLUSTER '{cluster_name}' (key Int64, value String)
+        CREATE TABLE {table_name} ON CLUSTER '{cluster_name}' (key Int64, value String)
        Engine=ReplicatedMergeTree('/test_parallel_replicas/shard/{table_name}/', '{{replica}}')
        ORDER BY (key)
        SETTINGS index_granularity = {index_granularity}, max_bytes_to_merge_at_max_space_in_pool = 0, max_bytes_to_merge_at_max_space_in_pool = 1
@ -128,3 +127,4 @@ def test_reading_with_invisible_parts(
        )
        == f"{expected}\n"
    )
+    nodes[0].query(f"DROP TABLE {table_name} ON CLUSTER {cluster_name} SYNC")
--- a/tests/integration/test_postgresql_replica_database_engine_2/test.py
+++ b/tests/integration/test_postgresql_replica_database_engine_2/test.py
@ -1,5 +1,6 @@
 import pytest

+import uuid
 import time
 import psycopg2
 import os.path as p
@ -59,7 +60,6 @@ instance2 = cluster.add_instance(
 pg_manager = PostgresManager()
 pg_manager2 = PostgresManager()
 pg_manager_instance2 = PostgresManager()
-pg_manager3 = PostgresManager()


@pytest.fixture(scope="module")
@ -82,12 +82,6 @@ def started_cluster():
        pg_manager2.init(
            instance2, cluster.postgres_ip, cluster.postgres_port, "postgres_database2"
        )
-        pg_manager3.init(
-            instance,
-            cluster.postgres_ip,
-            cluster.postgres_port,
-            default_database="postgres-postgres",
-        )

        yield cluster

@ -924,16 +918,27 @@ def test_failed_load_from_snapshot(started_cluster):


 def test_symbols_in_publication_name(started_cluster):
-    table = "test_symbols_in_publication_name"
+    id = uuid.uuid4()
+    db = f"test_{id}"
+    table = f"test_symbols_in_publication_name"
+
+    pg_manager3 = PostgresManager()
+    pg_manager3.init(
+        instance,
+        cluster.postgres_ip,
+        cluster.postgres_port,
+        default_database=db,
+    )

    pg_manager3.create_postgres_table(table)
    instance.query(
-        f"INSERT INTO `{pg_manager3.get_default_database()}`.`{table}` SELECT number, number from numbers(0, 50)"
+        f"INSERT INTO `{db}`.`{table}` SELECT number, number from numbers(0, 50)"
    )

    pg_manager3.create_materialized_db(
        ip=started_cluster.postgres_ip,
        port=started_cluster.postgres_port,
+        materialized_database=db,
        settings=[
            f"materialized_postgresql_tables_list = '{table}'",
            "materialized_postgresql_backoff_min_ms = 100",
@ -941,8 +946,10 @@ def test_symbols_in_publication_name(started_cluster):
        ],
    )
    check_tables_are_synchronized(
-        instance, table, postgres_database=pg_manager3.get_default_database()
+        instance, table, materialized_database=db, postgres_database=db
    )
+    pg_manager3.drop_materialized_db(db)
+    pg_manager3.execute(f'drop table "{table}"')


 def test_generated_columns(started_cluster):
--- a/tests/integration/test_scheduler/test.py
+++ b/tests/integration/test_scheduler/test.py
@ -69,6 +69,124 @@ def update_workloads_config(**settings):
    node.query("system reload config")


+def check_profile_event_for_query(workload, profile_event, amount=1):
+    # Flush first so that finished queries are visible in system.query_log.
+    node.query("system flush logs")
+    # Backslash-escape the quotes so the pattern can sit inside a SQL string literal.
+    raw_pattern = f"workload='{workload}'"
+    query_pattern = raw_pattern.replace("'", "\\'")
+    # Read the event counter of the most recently started finished query that matches.
+    latest_value = node.query(
+        f"select ProfileEvents['{profile_event}'] from system.query_log where query ilike '%{query_pattern}%' and type = 'QueryFinish' order by query_start_time_microseconds desc limit 1"
+    )
+    assert int(latest_value) == amount
+
+
+def test_s3_resource_request_granularity():
+    node.query(
+        f"""
+        drop table if exists data;
+        create table data (key UInt64 CODEC(NONE), value String CODEC(NONE)) engine=MergeTree() order by key settings min_bytes_for_wide_part=1e9, storage_policy='s3';
+    """
+    )
+    # Checks how many scheduler resource requests, and of what size, one insert and one select on this table produce.
+    total_bytes = 50000000  # Approximate data size: 5 rows x 10000000 random bytes (see the insert below)
+    max_bytes_per_request = 2000000  # Should be ~1MB or less in general
+    min_bytes_per_request = 6000  # Small requests are ok, but we don't want to hurt performance with too frequent resource requests
+    # Snapshot the 'network_write' counters of '/prio/admin' before the insert.
+    writes_before = int(
+        node.query(
+            f"select dequeued_requests from system.scheduler where resource='network_write' and path='/prio/admin'"
+        ).strip()
+    )
+    write_bytes_before = int(
+        node.query(
+            f"select dequeued_cost from system.scheduler where resource='network_write' and path='/prio/admin'"
+        ).strip()
+    )
+    write_budget_before = int(
+        node.query(
+            f"select budget from system.scheduler where resource='network_write' and path='/prio/admin'"
+        ).strip()
+    )
+    node.query(
+        f"insert into data select number, randomString(10000000) from numbers(5) SETTINGS workload='admin'"
+    )
+    writes_after = int(
+        node.query(
+            f"select dequeued_requests from system.scheduler where resource='network_write' and path='/prio/admin'"
+        ).strip()
+    )
+    write_bytes_after = int(
+        node.query(
+            f"select dequeued_cost from system.scheduler where resource='network_write' and path='/prio/admin'"
+        ).strip()
+    )
+    write_budget_after = int(
+        node.query(
+            f"select budget from system.scheduler where resource='network_write' and path='/prio/admin'"
+        ).strip()
+    )
+    # Deltas over the insert; write_bytes is the dequeued_cost growth minus the budget growth.
+    write_requests = writes_after - writes_before
+    write_bytes = (write_bytes_after - write_bytes_before) - (
+        write_budget_after - write_budget_before
+    )
+    assert write_bytes > 1.0 * total_bytes
+    assert write_bytes < 1.05 * total_bytes
+    assert write_bytes / write_requests < max_bytes_per_request
+    assert write_bytes / write_requests > min_bytes_per_request
+    check_profile_event_for_query("admin", "SchedulerIOWriteRequests", write_requests)
+    check_profile_event_for_query("admin", "SchedulerIOWriteBytes", write_bytes)
+
+    node.query(f"optimize table data final")
+    # Same measurement for the 'network_read' counters, taken around a select over all columns.
+    reads_before = int(
+        node.query(
+            f"select dequeued_requests from system.scheduler where resource='network_read' and path='/prio/admin'"
+        ).strip()
+    )
+    read_bytes_before = int(
+        node.query(
+            f"select dequeued_cost from system.scheduler where resource='network_read' and path='/prio/admin'"
+        ).strip()
+    )
+    read_budget_before = int(
+        node.query(
+            f"select budget from system.scheduler where resource='network_read' and path='/prio/admin'"
+        ).strip()
+    )
+    node.query(
+        f"select count() from data where not ignore(*) SETTINGS workload='admin'"
+    )
+    reads_after = int(
+        node.query(
+            f"select dequeued_requests from system.scheduler where resource='network_read' and path='/prio/admin'"
+        ).strip()
+    )
+    read_bytes_after = int(
+        node.query(
+            f"select dequeued_cost from system.scheduler where resource='network_read' and path='/prio/admin'"
+        ).strip()
+    )
+    read_budget_after = int(
+        node.query(
+            f"select budget from system.scheduler where resource='network_read' and path='/prio/admin'"
+        ).strip()
+    )
+    # Deltas over the select; read_bytes is the dequeued_cost growth minus the budget growth.
+    read_bytes = (read_bytes_after - read_bytes_before) - (
+        read_budget_after - read_budget_before
+    )
+    read_requests = reads_after - reads_before
+    assert read_bytes > 1.0 * total_bytes
+    assert read_bytes < 1.05 * total_bytes
+    assert read_bytes / read_requests < max_bytes_per_request
+    assert read_bytes / read_requests > min_bytes_per_request
+    check_profile_event_for_query("admin", "SchedulerIOReadRequests", read_requests)
+    check_profile_event_for_query("admin", "SchedulerIOReadBytes", read_bytes)
+
+
 def test_s3_disk():
    node.query(
        f"""
--- a/tests/queries/0_stateless/01056_create_table_as_with_sorting_clauses.reference
+++ b/tests/queries/0_stateless/01056_create_table_as_with_sorting_clauses.reference
@ -0,0 +1,70 @@
+-------------- Test copy sorting clauses from source table --------------
+CREATE TABLE default.x
+(
+    `CounterID` UInt32,
+    `EventDate` Date,
+    `UserID` UInt64
+)
+ENGINE = MergeTree
+PARTITION BY toYYYYMM(EventDate)
+ORDER BY (CounterID, EventDate, intHash32(UserID))
+SAMPLE BY intHash32(UserID)
+SETTINGS index_granularity = 8192
+-------------------------------------------------------------------------
+CREATE TABLE default.x_as
+(
+    `CounterID` UInt32,
+    `EventDate` Date,
+    `UserID` UInt64
+)
+ENGINE = MergeTree
+PARTITION BY toYYYYMM(EventDate)
+ORDER BY (CounterID, EventDate, intHash32(UserID))
+SAMPLE BY intHash32(UserID)
+SETTINGS enable_block_number_column = 1, enable_block_offset_column = 1, index_granularity = 8192
+-------------- Test copy sorting clauses from destination table (source table without the same type clauses) --------------
+CREATE TABLE default.x
+(
+    `CounterID` UInt32,
+    `EventDate` Date,
+    `UserID` UInt64
+)
+ENGINE = MergeTree
+PRIMARY KEY (CounterID, EventDate, intHash32(UserID))
+ORDER BY (CounterID, EventDate, intHash32(UserID))
+SETTINGS index_granularity = 8192
+-------------------------------------------------------------------------
+CREATE TABLE default.x_as
+(
+    `CounterID` UInt32,
+    `EventDate` Date,
+    `UserID` UInt64
+)
+ENGINE = MergeTree
+PARTITION BY toYYYYMM(EventDate)
+PRIMARY KEY (CounterID, EventDate, intHash32(UserID))
+ORDER BY (CounterID, EventDate, intHash32(UserID))
+SAMPLE BY intHash32(UserID)
+SETTINGS enable_block_number_column = 1, enable_block_offset_column = 1, index_granularity = 8192
+-------------- Test copy sorting clauses from destination table (source table with the same type clauses) --------------
+CREATE TABLE default.x
+(
+    `CounterID` UInt32,
+    `EventDate` Date,
+    `UserID` UInt64
+)
+ENGINE = MergeTree
+ORDER BY CounterID
+SETTINGS index_granularity = 8192
+-------------------------------------------------------------------------
+CREATE TABLE default.x_as
+(
+    `CounterID` UInt32,
+    `EventDate` Date,
+    `UserID` UInt64
+)
+ENGINE = MergeTree
+PARTITION BY toYYYYMM(EventDate)
+ORDER BY (CounterID, EventDate, intHash32(UserID))
+SAMPLE BY intHash32(UserID)
+SETTINGS enable_block_number_column = 1, enable_block_offset_column = 1, index_granularity = 8192
--- a/tests/queries/0_stateless/01056_create_table_as_with_sorting_clauses.sql
+++ b/tests/queries/0_stateless/01056_create_table_as_with_sorting_clauses.sql
@ -0,0 +1,37 @@
+DROP TABLE IF EXISTS x;
+DROP TABLE IF EXISTS x_as;
+
+SELECT '-------------- Test copy sorting clauses from source table --------------';
+CREATE TABLE x (`CounterID` UInt32, `EventDate` Date, `UserID` UInt64) ENGINE = MergeTree PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID);
+CREATE TABLE x_as AS x ENGINE = MergeTree SETTINGS enable_block_number_column = 1, enable_block_offset_column = 1;
+
+SHOW CREATE TABLE x FORMAT TSVRaw;
+SELECT '-------------------------------------------------------------------------';
+SHOW CREATE TABLE x_as FORMAT TSVRaw;
+
+DROP TABLE x;
+DROP TABLE x_as;
+
+SELECT '-------------- Test copy sorting clauses from destination table (source table without the same type clauses) --------------';
+CREATE TABLE x (`CounterID` UInt32, `EventDate` Date, `UserID` UInt64) ENGINE = MergeTree PRIMARY KEY (CounterID, EventDate, intHash32(UserID));
+CREATE TABLE x_as AS x ENGINE = MergeTree PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS enable_block_number_column = 1, enable_block_offset_column = 1;
+
+SHOW CREATE TABLE x FORMAT TSVRaw;
+SELECT '-------------------------------------------------------------------------';
+SHOW CREATE TABLE x_as FORMAT TSVRaw;
+
+DROP TABLE x;
+DROP TABLE x_as;
+
+SELECT '-------------- Test copy sorting clauses from destination table (source table with the same type clauses) --------------';
+CREATE TABLE x (`CounterID` UInt32, `EventDate` Date, `UserID` UInt64) ENGINE = MergeTree ORDER BY (CounterID);
+CREATE TABLE x_as AS x ENGINE = MergeTree PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS enable_block_number_column = 1, enable_block_offset_column = 1;
+
+SHOW CREATE TABLE x FORMAT TSVRaw;
+SELECT '-------------------------------------------------------------------------';
+SHOW CREATE TABLE x_as FORMAT TSVRaw;
+
+DROP TABLE x;
+DROP TABLE x_as;
+
+
--- a/tests/queries/0_stateless/01541_max_memory_usage_for_user_long.sh
+++ b/tests/queries/0_stateless/01541_max_memory_usage_for_user_long.sh
@ -1,5 +1,6 @@
 #!/usr/bin/env bash
-# Tags: long, no-replicated-database, no-parallel, no-fasttest
+# Tags: long, no-replicated-database, no-parallel, no-fasttest, no-tsan, no-asan, no-msan, no-ubsan
+# no sanitizers -- memory consumption is unpredictable with sanitizers

 CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 # shellcheck source=../shell_config.sh
--- a/tests/queries/0_stateless/01710_projection_vertical_merges.sql
+++ b/tests/queries/0_stateless/01710_projection_vertical_merges.sql
@ -1,5 +1,5 @@
-- Tags: long, no-parallel
-- set no-parallel tag is to prevent timeout of this test
+-- Tags: long, no-parallel, no-msan, no-tsan, no-asan
+-- The no-parallel and no-sanitizer tags are set to prevent this test from timing out

 drop table if exists t;

--- a/tests/queries/0_stateless/02790_async_queries_in_query_log.reference
+++ b/tests/queries/0_stateless/02790_async_queries_in_query_log.reference
@ -9,7 +9,7 @@ written_rows:   0
 written_bytes:  0
 result_rows:    0
 result_bytes:   0
-query:          INSERT INTO default.async_insert_landing SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
+query:          INSERT INTO default.async_insert_landing (id) SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
 query_kind:     AsyncInsertFlush
 databases:      ['default']
 tables:         ['default.async_insert_landing']
@ -26,7 +26,7 @@ written_rows:   4
 written_bytes:  16
 result_rows:    4
 result_bytes:   16
-query:          INSERT INTO default.async_insert_landing SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
+query:          INSERT INTO default.async_insert_landing (id) SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
 query_kind:     AsyncInsertFlush
 databases:      ['default']
 tables:         ['default.async_insert_landing']
@ -54,7 +54,7 @@ written_rows:   0
 written_bytes:  0
 result_rows:    0
 result_bytes:   0
-query:          INSERT INTO default.async_insert_landing SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
+query:          INSERT INTO default.async_insert_landing (id) SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
 query_kind:     AsyncInsertFlush
 databases:      ['default']
 tables:         ['default.async_insert_landing','default.async_insert_target']
@ -71,7 +71,7 @@ written_rows:   6
 written_bytes:  24
 result_rows:    6
 result_bytes:   24
-query:          INSERT INTO default.async_insert_landing SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
+query:          INSERT INTO default.async_insert_landing (id) SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
 query_kind:     AsyncInsertFlush
 databases:      ['default']
 tables:         ['default.async_insert_landing','default.async_insert_target']
@ -118,7 +118,7 @@ written_rows:   0
 written_bytes:  0
 result_rows:    0
 result_bytes:   0
-query:          INSERT INTO default.async_insert_landing SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
+query:          INSERT INTO default.async_insert_landing (id) SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
 query_kind:     AsyncInsertFlush
 databases:      ['default']
 tables:         ['default.async_insert_landing','default.async_insert_target']
@ -135,7 +135,7 @@ written_rows:   3
 written_bytes:  12
 result_rows:    0
 result_bytes:   0
-query:          INSERT INTO default.async_insert_landing SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
+query:          INSERT INTO default.async_insert_landing (id) SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
 query_kind:     AsyncInsertFlush
 databases:      ['default']
 tables:         ['default.async_insert_landing','default.async_insert_target']
--- a/tests/queries/0_stateless/02930_client_file_log_comment.reference
+++ b/tests/queries/0_stateless/02930_client_file_log_comment.reference
@ -1,6 +0,0 @@
-42
-4242
-424242
-select 42	clickhouse.default-1.sql
-select 4242	clickhouse.default-2.sql
-select 424242\n	foo
--- a/tests/queries/0_stateless/02930_client_file_log_comment.sh
+++ b/tests/queries/0_stateless/02930_client_file_log_comment.sh
@ -1,24 +0,0 @@
-#!/usr/bin/env bash
-
-CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
-# reset --log_comment, because the test has to set its own
-CLICKHOUSE_LOG_COMMENT=
-# shellcheck source=../shell_config.sh
-. "$CUR_DIR"/../shell_config.sh
-
-file1="$CUR_DIR/clickhouse.${CLICKHOUSE_DATABASE}-1.sql"
-echo -n 'select 42' >> "$file1"
-file2="$CUR_DIR/clickhouse.${CLICKHOUSE_DATABASE}-2.sql"
-echo -n 'select 4242' >> "$file2"
-
-$CLICKHOUSE_CLIENT --queries-file "$file1" "$file2" <<<'select 42'
-$CLICKHOUSE_CLIENT --log_comment foo --queries-file /dev/stdin <<<'select 424242'
-
-$CLICKHOUSE_CLIENT -m -q "
-    system flush logs;
-    select query, log_comment from system.query_log where current_database = '$CLICKHOUSE_DATABASE' and event_date >= yesterday() and query = 'select 42' and type != 'QueryStart';
-    select query, log_comment from system.query_log where current_database = '$CLICKHOUSE_DATABASE' and event_date >= yesterday() and query = 'select 4242' and type != 'QueryStart';
-    select query, log_comment from system.query_log where current_database = '$CLICKHOUSE_DATABASE' and event_date >= yesterday() and query = 'select 424242\n' and type != 'QueryStart';
-" | sed "s#$CUR_DIR/##"
-
-rm "$file1" "$file2"
--- a/tests/queries/0_stateless/03148_async_queries_in_query_log_errors.reference
+++ b/tests/queries/0_stateless/03148_async_queries_in_query_log_errors.reference
@ -4,7 +4,7 @@ Row 1:
 ──────
 database:                 default
 table:                    async_insert_landing
-query:                    INSERT INTO default.async_insert_landing SETTINGS wait_for_async_insert = 0, async_insert = 1 FORMAT Values
+query:                    INSERT INTO default.async_insert_landing (id) SETTINGS wait_for_async_insert = 0, async_insert = 1 FORMAT Values
 format:                   Values
 error:                    DB::Exc*****on: Cannot parse string 'Invalid' as UInt32:
 populated_flush_query_id: 1
@ -18,7 +18,7 @@ written_rows:   0
 written_bytes:  0
 result_rows:    0
 result_bytes:   0
-query:          INSERT INTO default.async_insert_landing SETTINGS wait_for_async_insert = 0, async_insert = 1 FORMAT Values
+query:          INSERT INTO default.async_insert_landing (id) SETTINGS wait_for_async_insert = 0, async_insert = 1 FORMAT Values
 query_kind:     AsyncInsertFlush
 databases:      ['default']
 tables:         ['default.async_insert_landing']
@ -35,7 +35,7 @@ written_rows:   0
 written_bytes:  0
 result_rows:    0
 result_bytes:   0
-query:          INSERT INTO default.async_insert_landing SETTINGS wait_for_async_insert = 0, async_insert = 1 FORMAT Values
+query:          INSERT INTO default.async_insert_landing (id) SETTINGS wait_for_async_insert = 0, async_insert = 1 FORMAT Values
 query_kind:     AsyncInsertFlush
 databases:      ['default']
 tables:         ['default.async_insert_landing']
--- a/tests/queries/0_stateless/03172_http_content_encoding.reference
+++ b/tests/queries/0_stateless/03172_http_content_encoding.reference
@ -0,0 +1,2 @@
+< Content-Encoding: zstd
+< Content-Encoding: zstd
--- a/tests/queries/0_stateless/03172_http_content_encoding.sh
+++ b/tests/queries/0_stateless/03172_http_content_encoding.sh
@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+URL="${CLICKHOUSE_PORT_HTTP_PROTO}://${CLICKHOUSE_HOST}:${CLICKHOUSE_PORT_HTTP}/"
+
+# with progress
+${CLICKHOUSE_CURL} -vsS "${URL}?send_progress_in_http_headers=1&enable_http_compression=1&wait_end_of_query=0" -o /dev/null \
+  -H 'Accept-Encoding: zstd' --compressed --data-binary @- <<< "select distinct sleep(.1),name from generateRandom('name String',1,1000,2) limit 100009 format TSV" 2>&1 \
+  | perl -lnE 'print if /Content-Encoding/';
+# no progress
+${CLICKHOUSE_CURL} -vsS "${URL}?send_progress_in_http_headers=0&enable_http_compression=1&wait_end_of_query=0" -o /dev/null \
+  -H 'Accept-Encoding: zstd' --compressed --data-binary @- <<< "select distinct sleep(.1),name from generateRandom('name String',1,1000,2) limit 100009 format TSV" 2>&1 \
+  | perl -lnE 'print if /Content-Encoding/';
--- a/tests/queries/0_stateless/03208_multiple_joins_with_storage_join.reference
+++ b/tests/queries/0_stateless/03208_multiple_joins_with_storage_join.reference
@ -0,0 +1,23 @@
+-----
+1	1	1	a	1	A	1	A
+2	2	2	b	2	B	2	B
+-----
+\N	\N	\N		0		3	B
+1	1	1	a	1	A	1	A
+2	2	2	b	2	B	2	B
+-----
+1	1	1	a	1	A	1	A
+2	2	2	b	2	B	2	B
+\N	\N	\N	\N	3	B	\N	\N
+\N	\N	\N	\N	\N	\N	3	B
+-----
+\N	\N	\N		3	3	B	0	0	
+\N	\N	\N		0	0		3	3	B
+1	1	1	a	1	1	A	1	1	A
+2	2	2	b	2	2	B	2	2	B
+-----
+3	3	\N		B	B
+1	1	1	a	A	A
+2	2	2	b	B	B
+-----
+7
--- a/Show More
+++ b/Show More