From 5f1991fbef2f959f1d55c62194d948814d199fa9 Mon Sep 17 00:00:00 2001
From: lgbo-ustc <lgbo.ustc@gmail.com>
Date: Tue, 12 Mar 2024 15:53:28 +0800
Subject: [PATCH] too big translation unit in Aggregator

---
 src/Common/HashTable/FixedHashMap.h           |    3 +
 .../HashTable/TwoLevelStringHashTable.h       |    1 +
 src/Interpreters/AggregatedData.h             |  142 +++
 src/Interpreters/AggregatedDataVariants.cpp   |  255 ++++
 src/Interpreters/AggregatedDataVariants.h     |  320 +++++
 src/Interpreters/AggregationMethod.cpp        |  215 ++++
 src/Interpreters/AggregationMethod.h          |  320 +++++
 src/Interpreters/Aggregator.cpp               |  512 ++++----
 src/Interpreters/Aggregator.h                 | 1035 +----------------
 9 files changed, 1541 insertions(+), 1262 deletions(-)
 create mode 100644 src/Interpreters/AggregatedData.h
 create mode 100644 src/Interpreters/AggregatedDataVariants.cpp
 create mode 100644 src/Interpreters/AggregatedDataVariants.h
 create mode 100644 src/Interpreters/AggregationMethod.cpp
 create mode 100644 src/Interpreters/AggregationMethod.h
diff --git a/src/Common/HashTable/FixedHashMap.h b/src/Common/HashTable/FixedHashMap.h
index e835a6fba94..537f37a9e6c 100644
--- a/src/Common/HashTable/FixedHashMap.h
+++ b/src/Common/HashTable/FixedHashMap.h
@@ -109,6 +109,9 @@ public:
 
     using Base::Base;
 
+    FixedHashMap() = default;
+    FixedHashMap(size_t ) {} /// NOLINT
+
     template <typename Func, bool>
     void ALWAYS_INLINE mergeToViaEmplace(Self & that, Func && func)
     {
diff --git a/src/Common/HashTable/TwoLevelStringHashTable.h b/src/Common/HashTable/TwoLevelStringHashTable.h
index 54c208c5b60..1ce6b3d02e3 100644
--- a/src/Common/HashTable/TwoLevelStringHashTable.h
+++ b/src/Common/HashTable/TwoLevelStringHashTable.h
@@ -38,6 +38,7 @@ public:
     Impl impls[NUM_BUCKETS];
 
     TwoLevelStringHashTable() = default;
+    TwoLevelStringHashTable(size_t ) {} /// NOLINT
 
     template <typename Source>
     explicit TwoLevelStringHashTable(const Source & src)
diff --git a/src/Interpreters/AggregatedData.h b/src/Interpreters/AggregatedData.h
new file mode 100644
index 00000000000..6cd6b190801
--- /dev/null
+++ b/src/Interpreters/AggregatedData.h
@@ -0,0 +1,142 @@
+#pragma once
+#include <AggregateFunctions/IAggregateFunction.h>
+
+#include <Common/HashTable/FixedHashMap.h>
+#include <Common/HashTable/StringHashMap.h>
+#include <Common/HashTable/TwoLevelHashMap.h>
+#include <Common/HashTable/TwoLevelStringHashMap.h>
+namespace DB
+{
+/** Different data structures that can be used for aggregation
+  * For efficiency, the aggregation data itself is put into the pool.
+  * Data and pool ownership (states of aggregate functions)
+  *  is acquired later - in `convertToBlocks` function, by the ColumnAggregateFunction object.
+  *
+  * Most data structures exist in two versions: normal and two-level (TwoLevel).
+  * A two-level hash table works a little slower with a small number of different keys,
+  *  but with a large number of different keys scales better, because it allows
+  *  parallelize some operations (merging, post-processing) in a natural way.
+  *
+  * To ensure efficient work over a wide range of conditions,
+  *  first single-level hash tables are used,
+  *  and when the number of different keys is large enough,
+  *  they are converted to two-level ones.
+  *
+  * PS. There are many different approaches to the effective implementation of parallel and distributed aggregation,
+  *  best suited for different cases, and this approach is just one of them, chosen for a combination of reasons.
+  */
+
+using AggregatedDataWithoutKey = AggregateDataPtr;
+
+using AggregatedDataWithUInt8Key = FixedImplicitZeroHashMapWithCalculatedSize<UInt8, AggregateDataPtr>;
+using AggregatedDataWithUInt16Key = FixedImplicitZeroHashMap<UInt16, AggregateDataPtr>;
+
+using AggregatedDataWithUInt32Key = HashMap<UInt32, AggregateDataPtr, HashCRC32<UInt32>>;
+using AggregatedDataWithUInt64Key = HashMap<UInt64, AggregateDataPtr, HashCRC32<UInt64>>;
+
+using AggregatedDataWithShortStringKey = StringHashMap<AggregateDataPtr>;
+
+using AggregatedDataWithStringKey = HashMapWithSavedHash<StringRef, AggregateDataPtr>;
+
+using AggregatedDataWithKeys128 = HashMap<UInt128, AggregateDataPtr, UInt128HashCRC32>;
+using AggregatedDataWithKeys256 = HashMap<UInt256, AggregateDataPtr, UInt256HashCRC32>;
+
+using AggregatedDataWithUInt32KeyTwoLevel = TwoLevelHashMap<UInt32, AggregateDataPtr, HashCRC32<UInt32>>;
+using AggregatedDataWithUInt64KeyTwoLevel = TwoLevelHashMap<UInt64, AggregateDataPtr, HashCRC32<UInt64>>;
+
+using AggregatedDataWithShortStringKeyTwoLevel = TwoLevelStringHashMap<AggregateDataPtr>;
+
+using AggregatedDataWithStringKeyTwoLevel = TwoLevelHashMapWithSavedHash<StringRef, AggregateDataPtr>;
+
+using AggregatedDataWithKeys128TwoLevel = TwoLevelHashMap<UInt128, AggregateDataPtr, UInt128HashCRC32>;
+using AggregatedDataWithKeys256TwoLevel = TwoLevelHashMap<UInt256, AggregateDataPtr, UInt256HashCRC32>;
+
+/** Variants with better hash function, using more than 32 bits for hash.
+  * Using for merging phase of external aggregation, where number of keys may be far greater than 4 billion,
+  *  but we keep in memory and merge only sub-partition of them simultaneously.
+  * TODO We need to switch for better hash function not only for external aggregation,
+  *  but also for huge aggregation results on machines with terabytes of RAM.
+  */
+
+using AggregatedDataWithUInt64KeyHash64 = HashMap<UInt64, AggregateDataPtr, DefaultHash<UInt64>>;
+using AggregatedDataWithStringKeyHash64 = HashMapWithSavedHash<StringRef, AggregateDataPtr, StringRefHash64>;
+using AggregatedDataWithKeys128Hash64 = HashMap<UInt128, AggregateDataPtr, UInt128Hash>;
+using AggregatedDataWithKeys256Hash64 = HashMap<UInt256, AggregateDataPtr, UInt256Hash>;
+
+template <typename Base>
+struct AggregationDataWithNullKey : public Base
+{
+    using Base::Base;
+
+    bool & hasNullKeyData() { return has_null_key_data; }
+    AggregateDataPtr & getNullKeyData() { return null_key_data; }
+    bool hasNullKeyData() const { return has_null_key_data; }
+    const AggregateDataPtr & getNullKeyData() const { return null_key_data; }
+    size_t size() const { return Base::size() + (has_null_key_data ? 1 : 0); }
+    bool empty() const { return Base::empty() && !has_null_key_data; }
+    void clear()
+    {
+        Base::clear();
+        has_null_key_data = false;
+    }
+    void clearAndShrink()
+    {
+        Base::clearAndShrink();
+        has_null_key_data = false;
+    }
+
+private:
+    bool has_null_key_data = false;
+    AggregateDataPtr null_key_data = nullptr;
+};
+
+template <typename Base>
+struct AggregationDataWithNullKeyTwoLevel : public Base
+{
+    using Base::Base;
+    using Base::impls;
+
+    AggregationDataWithNullKeyTwoLevel() = default;
+
+    template <typename Other>
+    explicit AggregationDataWithNullKeyTwoLevel(const Other & other) : Base(other)
+    {
+        impls[0].hasNullKeyData() = other.hasNullKeyData();
+        impls[0].getNullKeyData() = other.getNullKeyData();
+    }
+
+    bool & hasNullKeyData() { return impls[0].hasNullKeyData(); }
+    AggregateDataPtr & getNullKeyData() { return impls[0].getNullKeyData(); }
+    bool hasNullKeyData() const { return impls[0].hasNullKeyData(); }
+    const AggregateDataPtr & getNullKeyData() const { return impls[0].getNullKeyData(); }
+};
+
+template <typename ... Types>
+using HashTableWithNullKey = AggregationDataWithNullKey<HashMapTable<Types ...>>;
+template <typename ... Types>
+using StringHashTableWithNullKey = AggregationDataWithNullKey<StringHashMap<Types ...>>;
+
+using AggregatedDataWithNullableUInt8Key = AggregationDataWithNullKey<AggregatedDataWithUInt8Key>;
+using AggregatedDataWithNullableUInt16Key = AggregationDataWithNullKey<AggregatedDataWithUInt16Key>;
+using AggregatedDataWithNullableUInt32Key = AggregationDataWithNullKey<AggregatedDataWithUInt32Key>;
+
+
+using AggregatedDataWithNullableUInt64Key = AggregationDataWithNullKey<AggregatedDataWithUInt64Key>;
+using AggregatedDataWithNullableStringKey = AggregationDataWithNullKey<AggregatedDataWithStringKey>;
+using AggregatedDataWithNullableShortStringKey = AggregationDataWithNullKey<AggregatedDataWithShortStringKey>;
+
+
+using AggregatedDataWithNullableUInt32KeyTwoLevel = AggregationDataWithNullKeyTwoLevel<
+    TwoLevelHashMap<UInt32, AggregateDataPtr, HashCRC32<UInt32>,
+                    TwoLevelHashTableGrower<>, HashTableAllocator, HashTableWithNullKey>>;
+using AggregatedDataWithNullableUInt64KeyTwoLevel = AggregationDataWithNullKeyTwoLevel<
+        TwoLevelHashMap<UInt64, AggregateDataPtr, HashCRC32<UInt64>,
+        TwoLevelHashTableGrower<>, HashTableAllocator, HashTableWithNullKey>>;
+
+using AggregatedDataWithNullableShortStringKeyTwoLevel = AggregationDataWithNullKeyTwoLevel<
+        TwoLevelStringHashMap<AggregateDataPtr, HashTableAllocator, StringHashTableWithNullKey>>;
+
+using AggregatedDataWithNullableStringKeyTwoLevel = AggregationDataWithNullKeyTwoLevel<
+        TwoLevelHashMapWithSavedHash<StringRef, AggregateDataPtr, DefaultHash<StringRef>,
+        TwoLevelHashTableGrower<>, HashTableAllocator, HashTableWithNullKey>>;
+}
diff --git a/src/Interpreters/AggregatedDataVariants.cpp b/src/Interpreters/AggregatedDataVariants.cpp
new file mode 100644
index 00000000000..0c86c58bd3e
--- /dev/null
+++ b/src/Interpreters/AggregatedDataVariants.cpp
@@ -0,0 +1,255 @@
+#include <Interpreters/AggregatedDataVariants.h>
+#include <Interpreters/Aggregator.h>
+
+namespace ProfileEvents
+{
+    extern const Event AggregationPreallocatedElementsInHashTables;
+}
+
+namespace DB
+{
+namespace ErrorCodes
+{
+    extern const int UNKNOWN_AGGREGATED_DATA_VARIANT;
+    extern const int LOGICAL_ERROR;
+
+}
+using ColumnsHashing::HashMethodContext;
+using ColumnsHashing::HashMethodContextPtr;
+using ColumnsHashing::LastElementCacheStats;
+
+AggregatedDataVariants::AggregatedDataVariants() : aggregates_pools(1, std::make_shared<Arena>()), aggregates_pool(aggregates_pools.back().get()) {}
+
+AggregatedDataVariants::~AggregatedDataVariants()
+{
+    if (aggregator && !aggregator->all_aggregates_has_trivial_destructor)
+    {
+        try
+        {
+            aggregator->destroyAllAggregateStates(*this);
+        }
+        catch (...)
+        {
+            tryLogCurrentException(__PRETTY_FUNCTION__);
+        }
+    }
+}
+
+// The std::is_constructible trait isn't suitable here because some classes have template constructors with semantics different from providing size hints.
+// Also string hash table variants are not supported due to the fact that both local perf tests and tests in CI showed slowdowns for them.
+template <typename...>
+struct HasConstructorOfNumberOfElements : std::false_type
+{
+};
+
+template <typename... Ts>
+struct HasConstructorOfNumberOfElements<HashMapTable<Ts...>> : std::true_type
+{
+};
+
+template <typename Key, typename Cell, typename Hash, typename Grower, typename Allocator, template <typename...> typename ImplTable>
+struct HasConstructorOfNumberOfElements<TwoLevelHashMapTable<Key, Cell, Hash, Grower, Allocator, ImplTable>> : std::true_type
+{
+};
+
+template <typename... Ts>
+struct HasConstructorOfNumberOfElements<HashTable<Ts...>> : std::true_type
+{
+};
+
+template <typename... Ts>
+struct HasConstructorOfNumberOfElements<TwoLevelHashTable<Ts...>> : std::true_type
+{
+};
+
+template <template <typename> typename Method, typename Base>
+struct HasConstructorOfNumberOfElements<Method<Base>> : HasConstructorOfNumberOfElements<Base>
+{
+};
+
+template <typename Method>
+auto constructWithReserveIfPossible(size_t size_hint)
+{
+    if constexpr (HasConstructorOfNumberOfElements<typename Method::Data>::value)
+    {
+        ProfileEvents::increment(ProfileEvents::AggregationPreallocatedElementsInHashTables, size_hint);
+        return std::make_unique<Method>(size_hint);
+    }
+    else
+        return std::make_unique<Method>();
+}
+
+void AggregatedDataVariants::init(Type type_, std::optional<size_t> size_hint)
+{
+    switch (type_)
+    {
+        case Type::EMPTY:
+        case Type::without_key:
+            break;
+
+    #define M(NAME, IS_TWO_LEVEL) \
+        case Type::NAME: \
+            if (size_hint) \
+                (NAME) = constructWithReserveIfPossible<decltype(NAME)::element_type>(*size_hint); \
+            else \
+                (NAME) = std::make_unique<decltype(NAME)::element_type>(); \
+            break;
+        APPLY_FOR_AGGREGATED_VARIANTS(M)
+#undef M
+    }
+
+    type = type_;
+}
+
+size_t AggregatedDataVariants::size() const
+{
+    switch (type)
+    {
+        case Type::EMPTY:
+            return 0;
+        case Type::without_key:
+            return 1;
+
+    #define M(NAME, IS_TWO_LEVEL) \
+        case Type::NAME: \
+            return (NAME)->data.size() + (without_key != nullptr);
+        APPLY_FOR_AGGREGATED_VARIANTS(M)
+    #undef M
+    }
+
+    UNREACHABLE();
+}
+
+size_t AggregatedDataVariants::sizeWithoutOverflowRow() const
+{
+    switch (type)
+    {
+        case Type::EMPTY:
+        return 0;
+        case Type::without_key:
+        return 1;
+
+    #define M(NAME, IS_TWO_LEVEL) \
+        case Type::NAME: \
+            return (NAME)->data.size();
+        APPLY_FOR_AGGREGATED_VARIANTS(M)
+    #undef M
+    }
+
+    UNREACHABLE();
+}
+
+const char * AggregatedDataVariants::getMethodName() const
+{
+    switch (type)
+    {
+        case Type::EMPTY:
+        return "EMPTY";
+        case Type::without_key:
+        return "without_key";
+
+    #define M(NAME, IS_TWO_LEVEL) \
+        case Type::NAME: \
+            return #NAME;
+        APPLY_FOR_AGGREGATED_VARIANTS(M)
+    #undef M
+    }
+
+    UNREACHABLE();
+}
+
+bool AggregatedDataVariants::isTwoLevel() const
+{
+    switch (type)
+    {
+        case Type::EMPTY:
+        return false;
+        case Type::without_key:
+        return false;
+
+    #define M(NAME, IS_TWO_LEVEL) \
+        case Type::NAME: \
+            return IS_TWO_LEVEL;
+        APPLY_FOR_AGGREGATED_VARIANTS(M)
+    #undef M
+    }
+
+    UNREACHABLE();
+}
+
+bool AggregatedDataVariants::isConvertibleToTwoLevel() const
+{
+    switch (type)
+    {
+    #define M(NAME) \
+        case Type::NAME: \
+            return true;
+
+        APPLY_FOR_VARIANTS_CONVERTIBLE_TO_TWO_LEVEL(M)
+
+    #undef M
+        default:
+        return false;
+    }
+}
+
+void AggregatedDataVariants::convertToTwoLevel()
+{
+    if (aggregator)
+        LOG_TRACE(aggregator->log, "Converting aggregation data to two-level.");
+
+    switch (type)
+    {
+#define M(NAME) \
+        case Type::NAME: \
+            NAME ## _two_level = std::make_unique<decltype(NAME ## _two_level)::element_type>(*(NAME)); \
+            (NAME).reset(); \
+            type = Type::NAME ## _two_level; \
+            break;
+
+        APPLY_FOR_VARIANTS_CONVERTIBLE_TO_TWO_LEVEL(M)
+
+    #undef M
+
+        default:
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong data variant passed.");
+    }
+}
+
+bool AggregatedDataVariants::isLowCardinality() const
+{
+    switch (type)
+    {
+    #define M(NAME) \
+        case Type::NAME: \
+            return true;
+
+        APPLY_FOR_LOW_CARDINALITY_VARIANTS(M)
+    #undef M
+        default:
+            return false;
+    }
+}
+
+HashMethodContextPtr AggregatedDataVariants::createCache(Type type, const HashMethodContext::Settings & settings)
+{
+    switch (type)
+    {
+        case Type::without_key:
+            return nullptr;
+
+    #define M(NAME, IS_TWO_LEVEL) \
+        case Type::NAME: { \
+            using TPtr##NAME = decltype(AggregatedDataVariants::NAME); \
+            using T##NAME = typename TPtr##NAME ::element_type; \
+            return T##NAME ::State::createContext(settings); \
+        }
+
+            APPLY_FOR_AGGREGATED_VARIANTS(M)
+    #undef M
+
+        default:
+            throw Exception(ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT, "Unknown aggregated data variant.");
+    }
+}
+}
diff --git a/src/Interpreters/AggregatedDataVariants.h b/src/Interpreters/AggregatedDataVariants.h
new file mode 100644
index 00000000000..8b82c5d9842
--- /dev/null
+++ b/src/Interpreters/AggregatedDataVariants.h
@@ -0,0 +1,320 @@
+#pragma once
+#include <boost/noncopyable.hpp>
+#include <memory.h>
+#include <AggregateFunctions/IAggregateFunction.h>
+#include <Common/ColumnsHashing.h>
+#include <Interpreters/AggregatedData.h>
+#include <Interpreters/AggregationMethod.h>
+
+namespace DB
+{
+class Arena;
+class Aggregator;
+
+struct AggregatedDataVariants : private boost::noncopyable
+{
+    /** Working with states of aggregate functions in the pool is arranged in the following (inconvenient) way:
+      * - when aggregating, states are created in the pool using IAggregateFunction::create (inside - `placement new` of arbitrary structure);
+      * - they must then be destroyed using IAggregateFunction::destroy (inside - calling the destructor of arbitrary structure);
+      * - if aggregation is complete, then, in the Aggregator::convertToBlocks function, pointers to the states of aggregate functions
+      *   are written to ColumnAggregateFunction; ColumnAggregateFunction "acquires ownership" of them, that is - calls `destroy` in its destructor.
+      * - if during the aggregation, before call to Aggregator::convertToBlocks, an exception was thrown,
+      *   then the states of aggregate functions must still be destroyed,
+      *   otherwise, for complex states (eg, AggregateFunctionUniq), there will be memory leaks;
+      * - in this case, to destroy states, the destructor calls Aggregator::destroyAggregateStates method,
+      *   but only if the variable aggregator (see below) is not nullptr;
+      * - that is, until you transfer ownership of the aggregate function states in the ColumnAggregateFunction, set the variable `aggregator`,
+      *   so that when an exception occurs, the states are correctly destroyed.
+      *
+      * PS. This can be corrected by making a pool that knows about which states of aggregate functions and in which order are put in it, and knows how to destroy them.
+      * But this can hardly be done simply because it is planned to put variable-length strings into the same pool.
+      * In this case, the pool will not be able to know with what offsets objects are stored.
+      */
+    const Aggregator * aggregator = nullptr;
+
+    size_t keys_size{};  /// Number of keys. NOTE do we need this field?
+    Sizes key_sizes;     /// Dimensions of keys, if keys of fixed length
+
+    /// Pools for states of aggregate functions. Ownership will be later transferred to ColumnAggregateFunction.
+    using ArenaPtr = std::shared_ptr<Arena>;
+    using Arenas = std::vector<ArenaPtr>;
+    Arenas aggregates_pools;
+    Arena * aggregates_pool{};    /// The pool that is currently used for allocation.
+
+    /** Specialization for the case when there are no keys, and for keys not fitted into max_rows_to_group_by.
+      */
+    AggregatedDataWithoutKey without_key = nullptr;
+
+    /// Stats of a cache for consecutive keys optimization.
+    /// Stats can be used to disable the cache in case of a lot of misses.
+    ColumnsHashing::LastElementCacheStats consecutive_keys_cache_stats;
+
+    // Disable consecutive key optimization for Uint8/16, because they use a FixedHashMap
+    // and the lookup there is almost free, so we don't need to cache the last lookup result
+    std::unique_ptr<AggregationMethodOneNumber<UInt8, AggregatedDataWithUInt8Key, false>>           key8;
+    std::unique_ptr<AggregationMethodOneNumber<UInt16, AggregatedDataWithUInt16Key, false>>         key16;
+
+    std::unique_ptr<AggregationMethodOneNumber<UInt32, AggregatedDataWithUInt64Key>>         key32;
+    std::unique_ptr<AggregationMethodOneNumber<UInt64, AggregatedDataWithUInt64Key>>         key64;
+    std::unique_ptr<AggregationMethodStringNoCache<AggregatedDataWithShortStringKey>>               key_string;
+    std::unique_ptr<AggregationMethodFixedStringNoCache<AggregatedDataWithShortStringKey>>          key_fixed_string;
+    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithUInt16Key, false, false, false>>  keys16;
+    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithUInt32Key>>                   keys32;
+    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithUInt64Key>>                   keys64;
+    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys128>>                   keys128;
+    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys256>>                   keys256;
+    std::unique_ptr<AggregationMethodSerialized<AggregatedDataWithStringKey>>                          serialized;
+    std::unique_ptr<AggregationMethodNullableSerialized<AggregatedDataWithStringKey>>                  nullable_serialized;
+    std::unique_ptr<AggregationMethodPreallocSerialized<AggregatedDataWithStringKey>>                  prealloc_serialized;
+    std::unique_ptr<AggregationMethodNullablePreallocSerialized<AggregatedDataWithStringKey>>          nullable_prealloc_serialized;
+
+    std::unique_ptr<AggregationMethodOneNumber<UInt32, AggregatedDataWithUInt64KeyTwoLevel>> key32_two_level;
+    std::unique_ptr<AggregationMethodOneNumber<UInt64, AggregatedDataWithUInt64KeyTwoLevel>> key64_two_level;
+    std::unique_ptr<AggregationMethodStringNoCache<AggregatedDataWithShortStringKeyTwoLevel>>       key_string_two_level;
+    std::unique_ptr<AggregationMethodFixedStringNoCache<AggregatedDataWithShortStringKeyTwoLevel>>  key_fixed_string_two_level;
+    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithUInt32KeyTwoLevel>>           keys32_two_level;
+    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithUInt64KeyTwoLevel>>           keys64_two_level;
+    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys128TwoLevel>>           keys128_two_level;
+    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys256TwoLevel>>           keys256_two_level;
+    std::unique_ptr<AggregationMethodSerialized<AggregatedDataWithStringKeyTwoLevel>>                  serialized_two_level;
+    std::unique_ptr<AggregationMethodNullableSerialized<AggregatedDataWithStringKeyTwoLevel>>          nullable_serialized_two_level;
+    std::unique_ptr<AggregationMethodPreallocSerialized<AggregatedDataWithStringKeyTwoLevel>>          prealloc_serialized_two_level;
+    std::unique_ptr<AggregationMethodNullablePreallocSerialized<AggregatedDataWithStringKeyTwoLevel>>  nullable_prealloc_serialized_two_level;
+
+    std::unique_ptr<AggregationMethodOneNumber<UInt64, AggregatedDataWithUInt64KeyHash64>>   key64_hash64;
+    std::unique_ptr<AggregationMethodString<AggregatedDataWithStringKeyHash64>>              key_string_hash64;
+    std::unique_ptr<AggregationMethodFixedString<AggregatedDataWithStringKeyHash64>>         key_fixed_string_hash64;
+    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys128Hash64>>             keys128_hash64;
+    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys256Hash64>>             keys256_hash64;
+    std::unique_ptr<AggregationMethodSerialized<AggregatedDataWithStringKeyHash64>>                  serialized_hash64;
+    std::unique_ptr<AggregationMethodNullableSerialized<AggregatedDataWithStringKeyHash64>>          nullable_serialized_hash64;
+    std::unique_ptr<AggregationMethodPreallocSerialized<AggregatedDataWithStringKeyHash64>>          prealloc_serialized_hash64;
+    std::unique_ptr<AggregationMethodNullablePreallocSerialized<AggregatedDataWithStringKeyHash64>>  nullable_prealloc_serialized_hash64;
+
+    /// Support for nullable keys.
+    std::unique_ptr<AggregationMethodOneNumber<UInt8, AggregatedDataWithNullableUInt8Key, false, true>>         nullable_key8;
+    std::unique_ptr<AggregationMethodOneNumber<UInt16, AggregatedDataWithNullableUInt16Key, false, true>>         nullable_key16;
+    std::unique_ptr<AggregationMethodOneNumber<UInt32, AggregatedDataWithNullableUInt32Key, true, true>>         nullable_key32;
+    std::unique_ptr<AggregationMethodOneNumber<UInt64, AggregatedDataWithNullableUInt64Key, true, true>>         nullable_key64;
+    std::unique_ptr<AggregationMethodOneNumber<UInt32, AggregatedDataWithNullableUInt32KeyTwoLevel, true, true>>         nullable_key32_two_level;
+    std::unique_ptr<AggregationMethodOneNumber<UInt64, AggregatedDataWithNullableUInt64KeyTwoLevel, true, true>>         nullable_key64_two_level;
+
+    std::unique_ptr<AggregationMethodStringNoCache<AggregatedDataWithNullableShortStringKey, true>> nullable_key_string;
+    std::unique_ptr<AggregationMethodFixedStringNoCache<AggregatedDataWithNullableShortStringKey, true>> nullable_key_fixed_string;
+    std::unique_ptr<AggregationMethodStringNoCache<AggregatedDataWithNullableShortStringKeyTwoLevel, true>> nullable_key_string_two_level;
+    std::unique_ptr<AggregationMethodFixedStringNoCache<AggregatedDataWithNullableShortStringKeyTwoLevel, true>> nullable_key_fixed_string_two_level;
+
+    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys128, true>>             nullable_keys128;
+    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys256, true>>             nullable_keys256;
+    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys128TwoLevel, true>>     nullable_keys128_two_level;
+    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys256TwoLevel, true>>     nullable_keys256_two_level;
+
+    /// Support for low cardinality.
+    std::unique_ptr<AggregationMethodSingleLowCardinalityColumn<AggregationMethodOneNumber<UInt8, AggregatedDataWithNullableUInt8Key, false>>> low_cardinality_key8;
+    std::unique_ptr<AggregationMethodSingleLowCardinalityColumn<AggregationMethodOneNumber<UInt16, AggregatedDataWithNullableUInt16Key, false>>> low_cardinality_key16;
+    std::unique_ptr<AggregationMethodSingleLowCardinalityColumn<AggregationMethodOneNumber<UInt32, AggregatedDataWithNullableUInt64Key>>> low_cardinality_key32;
+    std::unique_ptr<AggregationMethodSingleLowCardinalityColumn<AggregationMethodOneNumber<UInt64, AggregatedDataWithNullableUInt64Key>>> low_cardinality_key64;
+    std::unique_ptr<AggregationMethodSingleLowCardinalityColumn<AggregationMethodString<AggregatedDataWithNullableStringKey>>> low_cardinality_key_string;
+    std::unique_ptr<AggregationMethodSingleLowCardinalityColumn<AggregationMethodFixedString<AggregatedDataWithNullableStringKey>>> low_cardinality_key_fixed_string;
+
+    std::unique_ptr<AggregationMethodSingleLowCardinalityColumn<AggregationMethodOneNumber<UInt32, AggregatedDataWithNullableUInt64KeyTwoLevel>>> low_cardinality_key32_two_level;
+    std::unique_ptr<AggregationMethodSingleLowCardinalityColumn<AggregationMethodOneNumber<UInt64, AggregatedDataWithNullableUInt64KeyTwoLevel>>> low_cardinality_key64_two_level;
+    std::unique_ptr<AggregationMethodSingleLowCardinalityColumn<AggregationMethodString<AggregatedDataWithNullableStringKeyTwoLevel>>> low_cardinality_key_string_two_level;
+    std::unique_ptr<AggregationMethodSingleLowCardinalityColumn<AggregationMethodFixedString<AggregatedDataWithNullableStringKeyTwoLevel>>> low_cardinality_key_fixed_string_two_level;
+
+    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys128, false, true>>      low_cardinality_keys128;
+    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys256, false, true>>      low_cardinality_keys256;
+    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys128TwoLevel, false, true>> low_cardinality_keys128_two_level;
+    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys256TwoLevel, false, true>> low_cardinality_keys256_two_level;
+
+    /// In this and similar macros, the option without_key is not considered.
+    #define APPLY_FOR_AGGREGATED_VARIANTS(M) \
+        M(key8,                       false) \
+        M(key16,                      false) \
+        M(key32,                      false) \
+        M(key64,                      false) \
+        M(key_string,                 false) \
+        M(key_fixed_string,           false) \
+        M(keys16,                    false) \
+        M(keys32,                    false) \
+        M(keys64,                    false) \
+        M(keys128,                    false) \
+        M(keys256,                    false) \
+        M(serialized,                   false) \
+        M(nullable_serialized,          false) \
+        M(prealloc_serialized,          false) \
+        M(nullable_prealloc_serialized, false) \
+        M(key32_two_level,            true) \
+        M(key64_two_level,            true) \
+        M(key_string_two_level,       true) \
+        M(key_fixed_string_two_level, true) \
+        M(keys32_two_level,          true) \
+        M(keys64_two_level,          true) \
+        M(keys128_two_level,          true) \
+        M(keys256_two_level,          true) \
+        M(serialized_two_level,                   true) \
+        M(nullable_serialized_two_level,          true) \
+        M(prealloc_serialized_two_level,          true) \
+        M(nullable_prealloc_serialized_two_level, true) \
+        M(key64_hash64,               false) \
+        M(key_string_hash64,          false) \
+        M(key_fixed_string_hash64,    false) \
+        M(keys128_hash64,             false) \
+        M(keys256_hash64,             false) \
+        M(serialized_hash64,                   false) \
+        M(nullable_serialized_hash64,          false) \
+        M(prealloc_serialized_hash64,          false) \
+        M(nullable_prealloc_serialized_hash64, false) \
+        M(nullable_key8,             false) \
+        M(nullable_key16,             false) \
+        M(nullable_key32,             false) \
+        M(nullable_key64,             false) \
+        M(nullable_key32_two_level,   true) \
+        M(nullable_key64_two_level,   true) \
+        M(nullable_key_string,        false) \
+        M(nullable_key_fixed_string,  false) \
+        M(nullable_key_string_two_level, true) \
+        M(nullable_key_fixed_string_two_level, true) \
+        M(nullable_keys128,           false) \
+        M(nullable_keys256,           false) \
+        M(nullable_keys128_two_level, true) \
+        M(nullable_keys256_two_level, true) \
+        M(low_cardinality_key8, false) \
+        M(low_cardinality_key16, false) \
+        M(low_cardinality_key32, false) \
+        M(low_cardinality_key64, false) \
+        M(low_cardinality_keys128, false) \
+        M(low_cardinality_keys256, false) \
+        M(low_cardinality_key_string, false) \
+        M(low_cardinality_key_fixed_string, false) \
+        M(low_cardinality_key32_two_level, true) \
+        M(low_cardinality_key64_two_level, true) \
+        M(low_cardinality_keys128_two_level, true) \
+        M(low_cardinality_keys256_two_level, true) \
+        M(low_cardinality_key_string_two_level, true) \
+        M(low_cardinality_key_fixed_string_two_level, true) \
+
+    #define APPLY_FOR_VARIANTS_CONVERTIBLE_TO_TWO_LEVEL(M) \
+        M(key32)            \
+        M(key64)            \
+        M(key_string)       \
+        M(key_fixed_string) \
+        M(keys32)           \
+        M(keys64)           \
+        M(keys128)          \
+        M(keys256)          \
+        M(serialized)       \
+        M(nullable_serialized) \
+        M(prealloc_serialized) \
+        M(nullable_prealloc_serialized) \
+        M(nullable_key32) \
+        M(nullable_key64) \
+        M(nullable_key_string) \
+        M(nullable_key_fixed_string) \
+        M(nullable_keys128) \
+        M(nullable_keys256) \
+        M(low_cardinality_key32) \
+        M(low_cardinality_key64) \
+        M(low_cardinality_keys128) \
+        M(low_cardinality_keys256) \
+        M(low_cardinality_key_string) \
+        M(low_cardinality_key_fixed_string) \
+
+    /// NOLINTNEXTLINE
+    #define APPLY_FOR_VARIANTS_NOT_CONVERTIBLE_TO_TWO_LEVEL(M) \
+        M(key8)             \
+        M(key16)            \
+        M(nullable_key8) \
+        M(nullable_key16) \
+        M(keys16)           \
+        M(key64_hash64)     \
+        M(key_string_hash64)\
+        M(key_fixed_string_hash64) \
+        M(keys128_hash64)   \
+        M(keys256_hash64)   \
+        M(serialized_hash64) \
+        M(nullable_serialized_hash64) \
+        M(prealloc_serialized_hash64) \
+        M(nullable_prealloc_serialized_hash64) \
+        M(low_cardinality_key8) \
+        M(low_cardinality_key16) \
+
+    /// NOLINTNEXTLINE
+    #define APPLY_FOR_VARIANTS_SINGLE_LEVEL(M) \
+        APPLY_FOR_VARIANTS_NOT_CONVERTIBLE_TO_TWO_LEVEL(M) \
+        APPLY_FOR_VARIANTS_CONVERTIBLE_TO_TWO_LEVEL(M) \
+
+    /// NOLINTNEXTLINE
+    #define APPLY_FOR_VARIANTS_TWO_LEVEL(M) \
+        M(key32_two_level)            \
+        M(key64_two_level)            \
+        M(key_string_two_level)       \
+        M(key_fixed_string_two_level) \
+        M(keys32_two_level)           \
+        M(keys64_two_level)           \
+        M(keys128_two_level)          \
+        M(keys256_two_level)          \
+        M(serialized_two_level)       \
+        M(nullable_serialized_two_level)       \
+        M(prealloc_serialized_two_level)       \
+        M(nullable_prealloc_serialized_two_level)       \
+        M(nullable_key32_two_level) \
+        M(nullable_key64_two_level) \
+        M(nullable_key_string_two_level) \
+        M(nullable_key_fixed_string_two_level) \
+        M(nullable_keys128_two_level) \
+        M(nullable_keys256_two_level) \
+        M(low_cardinality_key32_two_level) \
+        M(low_cardinality_key64_two_level) \
+        M(low_cardinality_keys128_two_level) \
+        M(low_cardinality_keys256_two_level) \
+        M(low_cardinality_key_string_two_level) \
+        M(low_cardinality_key_fixed_string_two_level) \
+
+    #define APPLY_FOR_LOW_CARDINALITY_VARIANTS(M) \
+        M(low_cardinality_key8) \
+        M(low_cardinality_key16) \
+        M(low_cardinality_key32) \
+        M(low_cardinality_key64) \
+        M(low_cardinality_keys128) \
+        M(low_cardinality_keys256) \
+        M(low_cardinality_key_string) \
+        M(low_cardinality_key_fixed_string) \
+        M(low_cardinality_key32_two_level) \
+        M(low_cardinality_key64_two_level) \
+        M(low_cardinality_keys128_two_level) \
+        M(low_cardinality_keys256_two_level) \
+        M(low_cardinality_key_string_two_level) \
+        M(low_cardinality_key_fixed_string_two_level)
+
+    enum class Type
+    {
+        EMPTY = 0,
+        without_key,
+
+    #define M(NAME, IS_TWO_LEVEL) NAME,
+        APPLY_FOR_AGGREGATED_VARIANTS(M)
+    #undef M
+    };
+    Type type = Type::EMPTY;
+    AggregatedDataVariants();
+    ~AggregatedDataVariants();
+    bool empty() const { return type == Type::EMPTY; }
+    void invalidate() { type = Type::EMPTY; }
+    void init(Type type_, std::optional<size_t> size_hint = std::nullopt);
+    /// Number of rows (different keys).
+    size_t size() const;
+    size_t sizeWithoutOverflowRow() const;
+    const char * getMethodName() const;
+    bool isTwoLevel() const;
+    bool isConvertibleToTwoLevel() const;
+    void convertToTwoLevel();
+    bool isLowCardinality() const;
+    static ColumnsHashing::HashMethodContextPtr createCache(Type type, const ColumnsHashing::HashMethodContext::Settings & settings);
+
+};
+
+using AggregatedDataVariantsPtr = std::shared_ptr<AggregatedDataVariants>;
+using ManyAggregatedDataVariants = std::vector<AggregatedDataVariantsPtr>;
+using ManyAggregatedDataVariantsPtr = std::shared_ptr<ManyAggregatedDataVariants>;
+}
diff --git a/src/Interpreters/AggregationMethod.cpp b/src/Interpreters/AggregationMethod.cpp
new file mode 100644
index 00000000000..3ff4f0cae43
--- /dev/null
+++ b/src/Interpreters/AggregationMethod.cpp
@@ -0,0 +1,215 @@
+#include <Interpreters/AggregationMethod.h>
+
+namespace DB
+{
+template <typename FieldType, typename TData, bool consecutive_keys_optimization, bool nullable>
+void AggregationMethodOneNumber<FieldType, TData, consecutive_keys_optimization, nullable>::insertKeyIntoColumns(
+    const AggregationMethodOneNumber::Key & key, std::vector<IColumn *> & key_columns, const Sizes & /*key_sizes*/)
+{
+    ColumnFixedSizeHelper * column;
+    if constexpr (nullable)
+    {
+        ColumnNullable & nullable_col = assert_cast<ColumnNullable &>(*key_columns[0]);
+        ColumnUInt8 * null_map = assert_cast<ColumnUInt8 *>(&nullable_col.getNullMapColumn());
+        null_map->insertDefault();
+        column = static_cast<ColumnFixedSizeHelper *>(&nullable_col.getNestedColumn());
+    }
+    else
+    {
+        column = static_cast<ColumnFixedSizeHelper *>(key_columns[0]);
+    }
+    static_assert(sizeof(FieldType) <= sizeof(Key));
+    const auto * key_holder = reinterpret_cast<const char *>(&key);
+    if constexpr (sizeof(FieldType) < sizeof(Key) && std::endian::native == std::endian::big)
+        column->insertRawData<sizeof(FieldType)>(key_holder + (sizeof(Key) - sizeof(FieldType)));
+    else
+        column->insertRawData<sizeof(FieldType)>(key_holder);
+}
+
+template struct AggregationMethodOneNumber<UInt8, AggregatedDataWithUInt8Key, false>;
+template struct AggregationMethodOneNumber<UInt16, AggregatedDataWithUInt16Key, false>;
+template struct AggregationMethodOneNumber<UInt32, AggregatedDataWithUInt64Key>;
+template struct AggregationMethodOneNumber<UInt64, AggregatedDataWithUInt64Key>;
+template struct AggregationMethodOneNumber<UInt32, AggregatedDataWithUInt64KeyTwoLevel>;
+template struct AggregationMethodOneNumber<UInt64, AggregatedDataWithUInt64KeyTwoLevel>;
+template struct AggregationMethodOneNumber<UInt64, AggregatedDataWithUInt64KeyHash64>;
+template struct AggregationMethodOneNumber<UInt8, AggregatedDataWithNullableUInt8Key, false, true>;
+template struct AggregationMethodOneNumber<UInt16, AggregatedDataWithNullableUInt16Key, false, true>;
+template struct AggregationMethodOneNumber<UInt32, AggregatedDataWithNullableUInt32Key, true, true>;
+template struct AggregationMethodOneNumber<UInt64, AggregatedDataWithNullableUInt64Key, true, true>;
+template struct AggregationMethodOneNumber<UInt32, AggregatedDataWithNullableUInt32KeyTwoLevel, true, true>;
+template struct AggregationMethodOneNumber<UInt64, AggregatedDataWithNullableUInt64KeyTwoLevel, true, true>;
+template struct AggregationMethodOneNumber<UInt8, AggregatedDataWithNullableUInt8Key, false>;
+template struct AggregationMethodOneNumber<UInt16, AggregatedDataWithNullableUInt16Key, false>;
+template struct AggregationMethodOneNumber<UInt32, AggregatedDataWithNullableUInt64Key>;
+template struct AggregationMethodOneNumber<UInt64, AggregatedDataWithNullableUInt64Key>;
+template struct AggregationMethodOneNumber<UInt32, AggregatedDataWithNullableUInt64KeyTwoLevel>;
+template struct AggregationMethodOneNumber<UInt64, AggregatedDataWithNullableUInt64KeyTwoLevel>;
+
+template <typename TData, bool nullable>
+void AggregationMethodStringNoCache<TData, nullable>::insertKeyIntoColumns(StringRef key, std::vector<IColumn *> & key_columns, const Sizes &)
+{
+    if constexpr (nullable)
+    {
+        ColumnNullable & column_nullable = assert_cast<ColumnNullable &>(*key_columns[0]);
+        assert_cast<ColumnString &>(column_nullable.getNestedColumn()).insertData(key.data, key.size);
+        column_nullable.getNullMapData().push_back(0);
+    }
+    else
+    {
+        assert_cast<ColumnString &>(*key_columns[0]).insertData(key.data, key.size);
+    }
+}
+template struct AggregationMethodStringNoCache<AggregatedDataWithShortStringKey>;
+template struct AggregationMethodStringNoCache<AggregatedDataWithShortStringKeyTwoLevel>;
+template struct AggregationMethodStringNoCache<AggregatedDataWithNullableShortStringKey, true>;
+template struct AggregationMethodStringNoCache<AggregatedDataWithNullableShortStringKeyTwoLevel, true>;
+
+template <typename TData>
+void AggregationMethodFixedString<TData>::insertKeyIntoColumns(StringRef key, std::vector<IColumn *> & key_columns, const Sizes &)
+{
+    assert_cast<ColumnFixedString &>(*key_columns[0]).insertData(key.data, key.size);
+}
+template struct AggregationMethodFixedString<AggregatedDataWithStringKeyHash64>;
+template struct AggregationMethodFixedString<AggregatedDataWithNullableStringKey>;
+template struct AggregationMethodFixedString<AggregatedDataWithNullableStringKeyTwoLevel>;
+
+
+template <typename TData, bool nullable>
+void AggregationMethodFixedStringNoCache<TData, nullable>::insertKeyIntoColumns(StringRef key, std::vector<IColumn *> & key_columns, const Sizes &)
+{
+    if constexpr (nullable)
+        assert_cast<ColumnNullable &>(*key_columns[0]).insertData(key.data, key.size);
+    else
+        assert_cast<ColumnFixedString &>(*key_columns[0]).insertData(key.data, key.size);
+}
+template struct AggregationMethodFixedStringNoCache<AggregatedDataWithShortStringKey>;
+template struct AggregationMethodFixedStringNoCache<AggregatedDataWithShortStringKeyTwoLevel>;
+template struct AggregationMethodFixedStringNoCache<AggregatedDataWithNullableShortStringKey, true>;
+template struct AggregationMethodFixedStringNoCache<AggregatedDataWithNullableShortStringKeyTwoLevel, true>;
+
+
+template <typename SingleColumnMethod>
+void AggregationMethodSingleLowCardinalityColumn<SingleColumnMethod>::insertKeyIntoColumns(
+    const Key & key, std::vector<IColumn *> & key_columns_low_cardinality, const Sizes & /*key_sizes*/)
+{
+    auto * col = assert_cast<ColumnLowCardinality *>(key_columns_low_cardinality[0]);
+
+    if constexpr (std::is_same_v<Key, StringRef>)
+        col->insertData(key.data, key.size);
+    else
+        col->insertData(reinterpret_cast<const char *>(&key), sizeof(key));
+}
+template struct AggregationMethodSingleLowCardinalityColumn<AggregationMethodOneNumber<UInt8, AggregatedDataWithNullableUInt8Key, false>>;
+template struct AggregationMethodSingleLowCardinalityColumn<AggregationMethodOneNumber<UInt16, AggregatedDataWithNullableUInt16Key, false>>;
+template struct AggregationMethodSingleLowCardinalityColumn<AggregationMethodOneNumber<UInt32, AggregatedDataWithNullableUInt64Key>>;
+template struct AggregationMethodSingleLowCardinalityColumn<AggregationMethodOneNumber<UInt64, AggregatedDataWithNullableUInt64Key>>;
+template struct AggregationMethodSingleLowCardinalityColumn<AggregationMethodString<AggregatedDataWithNullableStringKey>>;
+template struct AggregationMethodSingleLowCardinalityColumn<AggregationMethodFixedString<AggregatedDataWithNullableStringKey>>;
+template struct AggregationMethodSingleLowCardinalityColumn<AggregationMethodOneNumber<UInt32, AggregatedDataWithNullableUInt64KeyTwoLevel>>;
+template struct AggregationMethodSingleLowCardinalityColumn<AggregationMethodOneNumber<UInt64, AggregatedDataWithNullableUInt64KeyTwoLevel>>;
+template struct AggregationMethodSingleLowCardinalityColumn<AggregationMethodString<AggregatedDataWithNullableStringKeyTwoLevel>>;
+template struct AggregationMethodSingleLowCardinalityColumn<AggregationMethodFixedString<AggregatedDataWithNullableStringKeyTwoLevel>>;
+
+
+template <typename TData, bool has_nullable_keys, bool has_low_cardinality, bool consecutive_keys_optimization>
+void AggregationMethodKeysFixed<TData, has_nullable_keys, has_low_cardinality,consecutive_keys_optimization>::insertKeyIntoColumns(const Key & key, std::vector<IColumn *> & key_columns, const Sizes & key_sizes)
+{
+    size_t keys_size = key_columns.size();
+
+    static constexpr auto bitmap_size = has_nullable_keys ? std::tuple_size<KeysNullMap<Key>>::value : 0;
+    /// In any hash key value, column values to be read start just after the bitmap, if it exists.
+    size_t pos = bitmap_size;
+
+    for (size_t i = 0; i < keys_size; ++i)
+    {
+        IColumn * observed_column;
+        ColumnUInt8 * null_map;
+
+        bool column_nullable = false;
+        if constexpr (has_nullable_keys)
+            column_nullable = isColumnNullable(*key_columns[i]);
+
+        /// If we have a nullable column, get its nested column and its null map.
+        if (column_nullable)
+        {
+            ColumnNullable & nullable_col = assert_cast<ColumnNullable &>(*key_columns[i]);
+            observed_column = &nullable_col.getNestedColumn();
+            null_map = assert_cast<ColumnUInt8 *>(&nullable_col.getNullMapColumn());
+        }
+        else
+        {
+            observed_column = key_columns[i];
+            null_map = nullptr;
+        }
+
+        bool is_null = false;
+        if (column_nullable)
+        {
+            /// The current column is nullable. Check if the value of the
+            /// corresponding key is nullable. Update the null map accordingly.
+            size_t bucket = i / 8;
+            size_t offset = i % 8;
+            UInt8 val = (reinterpret_cast<const UInt8 *>(&key)[bucket] >> offset) & 1;
+            null_map->insertValue(val);
+            is_null = val == 1;
+        }
+
+        if (has_nullable_keys && is_null)
+            observed_column->insertDefault();
+        else
+        {
+            size_t size = key_sizes[i];
+            size_t offset_to = pos;
+            if constexpr (std::endian::native == std::endian::big)
+                offset_to = sizeof(Key) - size - pos;
+            observed_column->insertData(reinterpret_cast<const char *>(&key) + offset_to, size);
+            pos += size;
+        }
+    }
+}
+template struct AggregationMethodKeysFixed<AggregatedDataWithUInt16Key, false, false, false>;
+template struct AggregationMethodKeysFixed<AggregatedDataWithUInt32Key>;
+template struct AggregationMethodKeysFixed<AggregatedDataWithUInt64Key>;
+template struct AggregationMethodKeysFixed<AggregatedDataWithKeys128>;
+template struct AggregationMethodKeysFixed<AggregatedDataWithKeys256>;
+template struct AggregationMethodKeysFixed<AggregatedDataWithUInt32KeyTwoLevel>;
+template struct AggregationMethodKeysFixed<AggregatedDataWithUInt64KeyTwoLevel>;
+template struct AggregationMethodKeysFixed<AggregatedDataWithKeys128TwoLevel>;
+template struct AggregationMethodKeysFixed<AggregatedDataWithKeys256TwoLevel>;
+template struct AggregationMethodKeysFixed<AggregatedDataWithKeys128Hash64>;
+template struct AggregationMethodKeysFixed<AggregatedDataWithKeys256Hash64>;
+template struct AggregationMethodKeysFixed<AggregatedDataWithKeys128, true>;
+template struct AggregationMethodKeysFixed<AggregatedDataWithKeys256, true>;
+template struct AggregationMethodKeysFixed<AggregatedDataWithKeys128TwoLevel, true>;
+template struct AggregationMethodKeysFixed<AggregatedDataWithKeys256TwoLevel, true>;
+template struct AggregationMethodKeysFixed<AggregatedDataWithKeys128, false, true>;
+template struct AggregationMethodKeysFixed<AggregatedDataWithKeys256, false, true>;
+template struct AggregationMethodKeysFixed<AggregatedDataWithKeys128TwoLevel, false, true>;
+template struct AggregationMethodKeysFixed<AggregatedDataWithKeys256TwoLevel, false, true>;
+
+
+template <typename TData, bool nullable, bool prealloc>
+void AggregationMethodSerialized<TData, nullable, prealloc>::insertKeyIntoColumns(StringRef key, std::vector<IColumn *> & key_columns, const Sizes &)
+{
+    const auto * pos = key.data;
+    for (auto & column : key_columns)
+        pos = column->deserializeAndInsertFromArena(pos);
+}
+template struct AggregationMethodSerialized<AggregatedDataWithStringKey>;
+template struct AggregationMethodSerialized<AggregatedDataWithStringKeyTwoLevel>;
+template struct AggregationMethodSerialized<AggregatedDataWithStringKeyHash64>;
+// AggregationMethodNullableSerialized
+template struct AggregationMethodSerialized<AggregatedDataWithStringKey, true, false>;
+template struct AggregationMethodSerialized<AggregatedDataWithStringKeyTwoLevel, true, false>;
+template struct AggregationMethodSerialized<AggregatedDataWithStringKeyHash64, true, false>;
+// AggregationMethodPreallocSerialized
+template struct AggregationMethodSerialized<AggregatedDataWithStringKey, false, true>;
+template struct AggregationMethodSerialized<AggregatedDataWithStringKeyTwoLevel, false, true>;
+template struct AggregationMethodSerialized<AggregatedDataWithStringKeyHash64, false, true>;
+// AggregationMethodNullablePreallocSerialized
+template struct AggregationMethodSerialized<AggregatedDataWithStringKey, true, true>;
+template struct AggregationMethodSerialized<AggregatedDataWithStringKeyTwoLevel, true, true>;
+template struct AggregationMethodSerialized<AggregatedDataWithStringKeyHash64, true, true>;
+
+}
diff --git a/src/Interpreters/AggregationMethod.h b/src/Interpreters/AggregationMethod.h
new file mode 100644
index 00000000000..41345d91990
--- /dev/null
+++ b/src/Interpreters/AggregationMethod.h
@@ -0,0 +1,320 @@
+#pragma once
+#include <vector>
+#include <Common/ColumnsHashing.h>
+#include <Interpreters/AggregationCommon.h>
+#include <Interpreters/AggregatedData.h>
+
+#include <Columns/ColumnString.h>
+#include <Columns/ColumnFixedString.h>
+#include <Columns/ColumnAggregateFunction.h>
+#include <Columns/ColumnVector.h>
+#include <Columns/ColumnNullable.h>
+#include <Columns/ColumnLowCardinality.h>
+
+namespace DB
+{
+class IColumn;
+/// For the case where there is one numeric key.
+/// FieldType is UInt8/16/32/64 for any type with corresponding bit width.
+template <typename FieldType, typename TData,
+        bool consecutive_keys_optimization = true, bool nullable = false>
+struct AggregationMethodOneNumber
+{
+    using Data = TData;
+    using Key = typename Data::key_type;
+    using Mapped = typename Data::mapped_type;
+
+    Data data;
+
+    AggregationMethodOneNumber() = default;
+
+    explicit AggregationMethodOneNumber(size_t size_hint) : data(size_hint) { }
+
+    template <typename Other>
+    explicit AggregationMethodOneNumber(const Other & other) : data(other.data)
+    {
+    }
+
+    /// To use one `Method` in different threads, use different `State`.
+    template <bool use_cache>
+    using StateImpl = ColumnsHashing::HashMethodOneNumber<
+        typename Data::value_type,
+        Mapped,
+        FieldType,
+        use_cache && consecutive_keys_optimization,
+        /*need_offset=*/ false,
+        nullable>;
+
+    using State = StateImpl<true>;
+    using StateNoCache = StateImpl<false>;
+
+    /// Use optimization for low cardinality.
+    static const bool low_cardinality_optimization = false;
+    static const bool one_key_nullable_optimization = nullable;
+
+    /// Shuffle key columns before `insertKeyIntoColumns` call if needed.
+    std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> &, const Sizes &) { return {}; }
+
+    // Insert the key from the hash table into columns.
+    static void insertKeyIntoColumns(const Key & key, std::vector<IColumn *> & key_columns, const Sizes & /*key_sizes*/);
+};
+
+/// For the case where there is one string key.
+template <typename TData>
+struct AggregationMethodString
+{
+    using Data = TData;
+    using Key = typename Data::key_type;
+    using Mapped = typename Data::mapped_type;
+
+    Data data;
+
+    AggregationMethodString() = default;
+
+    template <typename Other>
+    explicit AggregationMethodString(const Other & other) : data(other.data)
+    {
+    }
+
+    explicit AggregationMethodString(size_t size_hint) : data(size_hint) { }
+
+    template <bool use_cache>
+    using StateImpl = ColumnsHashing::HashMethodString<typename Data::value_type, Mapped, /*place_string_to_arena=*/ true, use_cache>;
+
+    using State = StateImpl<true>;
+    using StateNoCache = StateImpl<false>;
+
+    static const bool low_cardinality_optimization = false;
+    static const bool one_key_nullable_optimization = false;
+
+    std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> &, const Sizes &) { return {}; }
+
+    static void insertKeyIntoColumns(StringRef key, std::vector<IColumn *> & key_columns, const Sizes &)
+    {
+        static_cast<ColumnString *>(key_columns[0])->insertData(key.data, key.size);
+    }
+};
+
+/// Same as above but without cache
+template <typename TData, bool nullable = false>
+struct AggregationMethodStringNoCache
+{
+    using Data = TData;
+    using Key = typename Data::key_type;
+    using Mapped = typename Data::mapped_type;
+
+    Data data;
+
+    AggregationMethodStringNoCache() = default;
+
+    explicit AggregationMethodStringNoCache(size_t size_hint) : data(size_hint) { }
+
+    template <typename Other>
+    explicit AggregationMethodStringNoCache(const Other & other) : data(other.data)
+    {
+    }
+
+    template <bool use_cache>
+    using StateImpl = ColumnsHashing::HashMethodString<typename Data::value_type, Mapped, true, false, false, nullable>;
+
+    using State = StateImpl<true>;
+    using StateNoCache = StateImpl<false>;
+
+    static const bool low_cardinality_optimization = false;
+    static const bool one_key_nullable_optimization = nullable;
+
+    std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> &, const Sizes &) { return {}; }
+
+    static void insertKeyIntoColumns(StringRef key, std::vector<IColumn *> & key_columns, const Sizes &);
+};
+
+/// For the case where there is one fixed-length string key.
+template <typename TData>
+struct AggregationMethodFixedString
+{
+    using Data = TData;
+    using Key = typename Data::key_type;
+    using Mapped = typename Data::mapped_type;
+
+    Data data;
+
+    AggregationMethodFixedString() = default;
+
+    explicit AggregationMethodFixedString(size_t size_hint) : data(size_hint) { }
+
+    template <typename Other>
+    explicit AggregationMethodFixedString(const Other & other) : data(other.data)
+    {
+    }
+
+    template <bool use_cache>
+    using StateImpl = ColumnsHashing::HashMethodFixedString<typename Data::value_type, Mapped, /*place_string_to_arena=*/ true, use_cache>;
+
+    using State = StateImpl<true>;
+    using StateNoCache = StateImpl<false>;
+
+    static const bool low_cardinality_optimization = false;
+    static const bool one_key_nullable_optimization = false;
+
+    std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> &, const Sizes &) { return {}; }
+
+    static void insertKeyIntoColumns(StringRef key, std::vector<IColumn *> & key_columns, const Sizes &);
+};
+
+/// Same as above but without cache
+template <typename TData, bool nullable = false>
+struct AggregationMethodFixedStringNoCache
+{
+    using Data = TData;
+    using Key = typename Data::key_type;
+    using Mapped = typename Data::mapped_type;
+
+    Data data;
+
+    AggregationMethodFixedStringNoCache() = default;
+
+    explicit AggregationMethodFixedStringNoCache(size_t size_hint) : data(size_hint) { }
+
+    template <typename Other>
+    explicit AggregationMethodFixedStringNoCache(const Other & other) : data(other.data)
+    {
+    }
+
+    template <bool use_cache>
+    using StateImpl = ColumnsHashing::HashMethodFixedString<typename Data::value_type, Mapped, true, false, false, nullable>;
+
+    using State = StateImpl<true>;
+    using StateNoCache = StateImpl<false>;
+
+    static const bool low_cardinality_optimization = false;
+    static const bool one_key_nullable_optimization = nullable;
+
+    std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> &, const Sizes &) { return {}; }
+
+    static void insertKeyIntoColumns(StringRef key, std::vector<IColumn *> & key_columns, const Sizes &);
+};
+
+/// Single low cardinality column.
+template <typename SingleColumnMethod>
+struct AggregationMethodSingleLowCardinalityColumn : public SingleColumnMethod
+{
+    using Base = SingleColumnMethod;
+    using Data = typename Base::Data;
+    using Key = typename Base::Key;
+    using Mapped = typename Base::Mapped;
+    using Base::data;
+
+    template <bool use_cache>
+    using BaseStateImpl = typename Base::template StateImpl<use_cache>;
+
+    AggregationMethodSingleLowCardinalityColumn() = default;
+
+    template <typename Other>
+    explicit AggregationMethodSingleLowCardinalityColumn(const Other & other) : Base(other) {}
+
+    template <bool use_cache>
+    using StateImpl = ColumnsHashing::HashMethodSingleLowCardinalityColumn<BaseStateImpl<use_cache>, Mapped, use_cache>;
+
+    using State = StateImpl<true>;
+    using StateNoCache = StateImpl<false>;
+
+    static const bool low_cardinality_optimization = true;
+
+    std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> &, const Sizes &) { return {}; }
+
+    static void insertKeyIntoColumns(const Key & key,
+         std::vector<IColumn *> & key_columns_low_cardinality, const Sizes & /*key_sizes*/);
+};
+
+/// For the case where all keys are of fixed length, and they fit in N (for example, 128) bits.
+template <typename TData, bool has_nullable_keys_ = false, bool has_low_cardinality_ = false, bool consecutive_keys_optimization = false>
+struct AggregationMethodKeysFixed
+{
+    using Data = TData;
+    using Key = typename Data::key_type;
+    using Mapped = typename Data::mapped_type;
+    static constexpr bool has_nullable_keys = has_nullable_keys_;
+    static constexpr bool has_low_cardinality = has_low_cardinality_;
+
+    Data data;
+
+    AggregationMethodKeysFixed() = default;
+
+    explicit AggregationMethodKeysFixed(size_t size_hint) : data(size_hint) { }
+
+    template <typename Other>
+    explicit AggregationMethodKeysFixed(const Other & other) : data(other.data)
+    {
+    }
+
+    template <bool use_cache>
+    using StateImpl = ColumnsHashing::HashMethodKeysFixed<
+        typename Data::value_type,
+        Key,
+        Mapped,
+        has_nullable_keys,
+        has_low_cardinality,
+        use_cache && consecutive_keys_optimization>;
+
+    using State = StateImpl<true>;
+    using StateNoCache = StateImpl<false>;
+
+    static const bool low_cardinality_optimization = false;
+    static const bool one_key_nullable_optimization = false;
+
+    std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> & key_columns, const Sizes & key_sizes)
+    {
+        return State::shuffleKeyColumns(key_columns, key_sizes);
+    }
+
+    static void insertKeyIntoColumns(const Key & key, std::vector<IColumn *> & key_columns, const Sizes & key_sizes);
+};
+
+/** Aggregates by concatenating serialized key values.
+  * The serialized value differs in that it uniquely allows to deserialize it, having only the position with which it starts.
+  * That is, for example, for strings, it contains first the serialized length of the string, and then the bytes.
+  * Therefore, when aggregating by several strings, there is no ambiguity.
+  */
+template <typename TData, bool nullable = false, bool prealloc = false>
+struct AggregationMethodSerialized
+{
+    using Data = TData;
+    using Key = typename Data::key_type;
+    using Mapped = typename Data::mapped_type;
+
+    Data data;
+
+    AggregationMethodSerialized() = default;
+
+    explicit AggregationMethodSerialized(size_t size_hint) : data(size_hint) { }
+
+    template <typename Other>
+    explicit AggregationMethodSerialized(const Other & other) : data(other.data)
+    {
+    }
+
+    template <bool use_cache>
+    using StateImpl = ColumnsHashing::HashMethodSerialized<typename Data::value_type, Mapped, nullable, prealloc>;
+
+    using State = StateImpl<true>;
+    using StateNoCache = StateImpl<false>;
+
+    static const bool low_cardinality_optimization = false;
+    static const bool one_key_nullable_optimization = false;
+
+    std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> &, const Sizes &) { return {}; }
+
+    static void insertKeyIntoColumns(StringRef key, std::vector<IColumn *> & key_columns, const Sizes &);
+};
+
+template <typename TData>
+using AggregationMethodNullableSerialized = AggregationMethodSerialized<TData, true>;
+
+template <typename TData>
+using AggregationMethodPreallocSerialized = AggregationMethodSerialized<TData, false, true>;
+
+template <typename TData>
+using AggregationMethodNullablePreallocSerialized = AggregationMethodSerialized<TData, true, true>;
+
+
+}
diff --git a/src/Interpreters/Aggregator.cpp b/src/Interpreters/Aggregator.cpp
index 80a98683867..3596e2fcc4a 100644
--- a/src/Interpreters/Aggregator.cpp
+++ b/src/Interpreters/Aggregator.cpp
@@ -48,7 +48,6 @@ namespace ProfileEvents
     extern const Event ExternalAggregationUncompressedBytes;
     extern const Event ExternalProcessingCompressedBytesTotal;
     extern const Event ExternalProcessingUncompressedBytesTotal;
-    extern const Event AggregationPreallocatedElementsInHashTables;
     extern const Event AggregationHashTablesInitializedAsTwoLevel;
     extern const Event OverflowThrow;
     extern const Event OverflowBreak;
@@ -269,50 +268,6 @@ void updateStatistics(const DB::ManyAggregatedDataVariants & data_variants, cons
     getHashTablesStatistics().update(sum_of_sizes, *median_size, params);
 }
 
-// The std::is_constructible trait isn't suitable here because some classes have template constructors with semantics different from providing size hints.
-// Also string hash table variants are not supported due to the fact that both local perf tests and tests in CI showed slowdowns for them.
-template <typename...>
-struct HasConstructorOfNumberOfElements : std::false_type
-{
-};
-
-template <typename... Ts>
-struct HasConstructorOfNumberOfElements<HashMapTable<Ts...>> : std::true_type
-{
-};
-
-template <typename Key, typename Cell, typename Hash, typename Grower, typename Allocator, template <typename...> typename ImplTable>
-struct HasConstructorOfNumberOfElements<TwoLevelHashMapTable<Key, Cell, Hash, Grower, Allocator, ImplTable>> : std::true_type
-{
-};
-
-template <typename... Ts>
-struct HasConstructorOfNumberOfElements<HashTable<Ts...>> : std::true_type
-{
-};
-
-template <typename... Ts>
-struct HasConstructorOfNumberOfElements<TwoLevelHashTable<Ts...>> : std::true_type
-{
-};
-
-template <template <typename> typename Method, typename Base>
-struct HasConstructorOfNumberOfElements<Method<Base>> : HasConstructorOfNumberOfElements<Base>
-{
-};
-
-template <typename Method>
-auto constructWithReserveIfPossible(size_t size_hint)
-{
-    if constexpr (HasConstructorOfNumberOfElements<typename Method::Data>::value)
-    {
-        ProfileEvents::increment(ProfileEvents::AggregationPreallocatedElementsInHashTables, size_hint);
-        return std::make_unique<Method>(size_hint);
-    }
-    else
-        return std::make_unique<Method>();
-}
-
 DB::ColumnNumbers calculateKeysPositions(const DB::Block & header, const DB::Aggregator::Params & params)
 {
     DB::ColumnNumbers keys_positions(params.keys_size);
@@ -345,71 +300,11 @@ size_t getMinBytesForPrefetch()
 namespace DB
 {
 
-AggregatedDataVariants::~AggregatedDataVariants()
-{
-    if (aggregator && !aggregator->all_aggregates_has_trivial_destructor)
-    {
-        try
-        {
-            aggregator->destroyAllAggregateStates(*this);
-        }
-        catch (...)
-        {
-            tryLogCurrentException(__PRETTY_FUNCTION__);
-        }
-    }
-}
-
 std::optional<HashTablesCacheStatistics> getHashTablesCacheStatistics()
 {
     return getHashTablesStatistics().getCacheStats();
 }
 
-void AggregatedDataVariants::convertToTwoLevel()
-{
-    if (aggregator)
-        LOG_TRACE(aggregator->log, "Converting aggregation data to two-level.");
-
-    switch (type)
-    {
-#define M(NAME) \
-        case Type::NAME: \
-            NAME ## _two_level = std::make_unique<decltype(NAME ## _two_level)::element_type>(*(NAME)); \
-            (NAME).reset(); \
-            type = Type::NAME ## _two_level; \
-            break;
-
-        APPLY_FOR_VARIANTS_CONVERTIBLE_TO_TWO_LEVEL(M)
-
-    #undef M
-
-        default:
-            throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong data variant passed.");
-    }
-}
-
-void AggregatedDataVariants::init(Type type_, std::optional<size_t> size_hint)
-{
-    switch (type_)
-    {
-        case Type::EMPTY:
-        case Type::without_key:
-            break;
-
-#define M(NAME, IS_TWO_LEVEL) \
-    case Type::NAME: \
-        if (size_hint) \
-            (NAME) = constructWithReserveIfPossible<decltype(NAME)::element_type>(*size_hint); \
-        else \
-            (NAME) = std::make_unique<decltype(NAME)::element_type>(); \
-        break;
-            APPLY_FOR_AGGREGATED_VARIANTS(M)
-#undef M
-    }
-
-    type = type_;
-}
-
 Aggregator::Params::StatsCollectingParams::StatsCollectingParams() = default;
 
 Aggregator::Params::StatsCollectingParams::StatsCollectingParams(
@@ -1121,30 +1016,30 @@ void NO_INLINE Aggregator::executeImpl(
         if (compiled_aggregate_functions_holder && !hasSparseArguments(aggregate_instructions))
         {
             if (prefetch)
-                executeImplBatch<false, true, true>(
-                    method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, all_keys_are_const, overflow_row);
+                executeImplBatch<true>(
+                    method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, false, all_keys_are_const, true, overflow_row);
             else
-                executeImplBatch<false, true, false>(
-                    method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, all_keys_are_const, overflow_row);
+                executeImplBatch<false>(
+                    method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, false, all_keys_are_const, true, overflow_row);
         }
         else
 #endif
         {
             if (prefetch)
-                executeImplBatch<false, false, true>(
-                    method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, all_keys_are_const, overflow_row);
+                executeImplBatch<true>(
+                    method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, false, all_keys_are_const, false, overflow_row);
             else
-                executeImplBatch<false, false, false>(
-                    method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, all_keys_are_const, overflow_row);
+                executeImplBatch<false>(
+                    method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, false, all_keys_are_const, false, overflow_row);
         }
     }
     else
     {
-        executeImplBatch<true, false, false>(method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, all_keys_are_const, overflow_row);
+        executeImplBatch<false>(method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, true, all_keys_are_const, false, overflow_row);
     }
 }
 
-template <bool no_more_keys, bool use_compiled_functions, bool prefetch, typename Method, typename State>
+template <bool prefetch, typename Method, typename State>
 void NO_INLINE Aggregator::executeImplBatch(
     Method & method,
     State & state,
@@ -1152,7 +1047,9 @@ void NO_INLINE Aggregator::executeImplBatch(
     size_t row_begin,
     size_t row_end,
     AggregateFunctionInstruction * aggregate_instructions,
+    bool no_more_keys,
     bool all_keys_are_const,
+    bool use_compiled_functions,
     AggregateDataPtr overflow_row) const
 {
     using KeyHolder = decltype(state.getKeyHolder(0, std::declval<Arena &>()));
@@ -1164,7 +1061,7 @@ void NO_INLINE Aggregator::executeImplBatch(
     /// Optimization for special case when there are no aggregate functions.
     if (params.aggregates_size == 0)
     {
-        if constexpr (no_more_keys)
+        if (no_more_keys)
             return;
 
         /// This pointer is unused, but the logic will compare it for nullptr to check if the cell is set.
@@ -1197,39 +1094,42 @@ void NO_INLINE Aggregator::executeImplBatch(
     }
 
     /// Optimization for special case when aggregating by 8bit key.
-    if constexpr (!no_more_keys && std::is_same_v<Method, typename decltype(AggregatedDataVariants::key8)::element_type>)
+    if (!no_more_keys)
     {
-        /// We use another method if there are aggregate functions with -Array combinator.
-        bool has_arrays = false;
-        for (AggregateFunctionInstruction * inst = aggregate_instructions; inst->that; ++inst)
-        {
-            if (inst->offsets)
-            {
-                has_arrays = true;
-                break;
-            }
-        }
-
-        if (!has_arrays && !hasSparseArguments(aggregate_instructions) && !all_keys_are_const)
+        if constexpr (std::is_same_v<Method, typename decltype(AggregatedDataVariants::key8)::element_type>)
         {
+            /// We use another method if there are aggregate functions with -Array combinator.
+            bool has_arrays = false;
             for (AggregateFunctionInstruction * inst = aggregate_instructions; inst->that; ++inst)
             {
-                inst->batch_that->addBatchLookupTable8(
-                    row_begin,
-                    row_end,
-                    reinterpret_cast<AggregateDataPtr *>(method.data.data()),
-                    inst->state_offset,
-                    [&](AggregateDataPtr & aggregate_data)
-                    {
-                        AggregateDataPtr place = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states);
-                        createAggregateStates(place);
-                        aggregate_data = place;
-                    },
-                    state.getKeyData(),
-                    inst->batch_arguments,
-                    aggregates_pool);
+                if (inst->offsets)
+                {
+                    has_arrays = true;
+                    break;
+                }
+            }
+
+            if (!has_arrays && !hasSparseArguments(aggregate_instructions) && !all_keys_are_const)
+            {
+                for (AggregateFunctionInstruction * inst = aggregate_instructions; inst->that; ++inst)
+                {
+                    inst->batch_that->addBatchLookupTable8(
+                        row_begin,
+                        row_end,
+                        reinterpret_cast<AggregateDataPtr *>(method.data.data()),
+                        inst->state_offset,
+                        [&](AggregateDataPtr & aggregate_data)
+                        {
+                            AggregateDataPtr place = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states);
+                            createAggregateStates(place);
+                            aggregate_data = place;
+                        },
+                        state.getKeyData(),
+                        inst->batch_arguments,
+                        aggregates_pool);
+                }
+                return;
             }
-            return;
         }
     }
 
@@ -1255,12 +1155,12 @@ void NO_INLINE Aggregator::executeImplBatch(
     state.resetCache();
 
     /// For all rows.
-    for (size_t i = key_start; i < key_end; ++i)
+    if (!no_more_keys)
     {
-        AggregateDataPtr aggregate_data = nullptr;
-
-        if constexpr (!no_more_keys)
+        for (size_t i = key_start; i < key_end; ++i)
         {
+            AggregateDataPtr aggregate_data = nullptr;
+
             if constexpr (prefetch && HasPrefetchMemberFunc<decltype(method.data), KeyHolder>)
             {
                 if (i == key_start + prefetching.iterationsToMeasure())
@@ -1284,7 +1184,7 @@ void NO_INLINE Aggregator::executeImplBatch(
                 aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states);
 
 #if USE_EMBEDDED_COMPILER
-                if constexpr (use_compiled_functions)
+                if (use_compiled_functions)
                 {
                     const auto & compiled_aggregate_functions = compiled_aggregate_functions_holder->compiled_aggregate_functions;
                     compiled_aggregate_functions.create_aggregate_states_function(aggregate_data);
@@ -1297,7 +1197,8 @@ void NO_INLINE Aggregator::executeImplBatch(
 #if defined(MEMORY_SANITIZER)
 
                     /// We compile only functions that do not allocate some data in Arena. Only store necessary state in AggregateData place.
-                    for (size_t aggregate_function_index = 0; aggregate_function_index < aggregate_functions.size(); ++aggregate_function_index)
+                    for (size_t aggregate_function_index = 0; aggregate_function_index < aggregate_functions.size();
+                         ++aggregate_function_index)
                     {
                         if (!is_aggregate_function_compiled[aggregate_function_index])
                             continue;
@@ -1320,26 +1221,51 @@ void NO_INLINE Aggregator::executeImplBatch(
                 aggregate_data = emplace_result.getMapped();
 
             assert(aggregate_data != nullptr);
+            places[i] = aggregate_data;
         }
-        else
+    }
+    else
+    {
+        for (size_t i = key_start; i < key_end; ++i)
         {
+            AggregateDataPtr aggregate_data = nullptr;
             /// Add only if the key already exists.
             auto find_result = state.findKey(method.data, i, *aggregates_pool);
             if (find_result.isFound())
-            {
                 aggregate_data = find_result.getMapped();
-            }
             else
-            {
                 aggregate_data = overflow_row;
-            }
+            places[i] = aggregate_data;
         }
-
-        places[i] = aggregate_data;
     }
 
+    executeAggregateInstructions(
+        aggregates_pool,
+        row_begin,
+        row_end,
+        aggregate_instructions,
+        places,
+        key_start,
+        state.hasOnlyOneValueSinceLastReset(),
+        no_more_keys,
+        all_keys_are_const,
+        use_compiled_functions);
+}
+
+void Aggregator::executeAggregateInstructions(
+    Arena * aggregates_pool,
+    size_t row_begin,
+    size_t row_end,
+    AggregateFunctionInstruction * aggregate_instructions,
+    const std::unique_ptr<AggregateDataPtr[]> &places,
+    size_t key_start,
+    bool has_only_one_value_since_last_reset,
+    bool no_more_keys,
+    bool all_keys_are_const,
+    bool use_compiled_functions) const
+{
 #if USE_EMBEDDED_COMPILER
-    if constexpr (use_compiled_functions)
+    if (use_compiled_functions)
     {
         std::vector<ColumnData> columns_data;
 
@@ -1355,7 +1281,7 @@ void NO_INLINE Aggregator::executeImplBatch(
                 columns_data.emplace_back(getColumnData(inst->batch_arguments[argument_index]));
         }
 
-        if (all_keys_are_const || (!no_more_keys && state.hasOnlyOneValueSinceLastReset()))
+        if (all_keys_are_const || (!no_more_keys && has_only_one_value_since_last_reset))
         {
             auto add_into_aggregate_states_function_single_place = compiled_aggregate_functions_holder->compiled_aggregate_functions.add_into_aggregate_states_function_single_place;
             add_into_aggregate_states_function_single_place(row_begin, row_end, columns_data.data(), places[key_start]);
@@ -1372,33 +1298,34 @@ void NO_INLINE Aggregator::executeImplBatch(
     for (size_t i = 0; i < aggregate_functions.size(); ++i)
     {
 #if USE_EMBEDDED_COMPILER
-        if constexpr (use_compiled_functions)
+        if (use_compiled_functions)
             if (is_aggregate_function_compiled[i])
                 continue;
 #endif
 
         AggregateFunctionInstruction * inst = aggregate_instructions + i;
 
-        if (all_keys_are_const || (!no_more_keys && state.hasOnlyOneValueSinceLastReset()))
+        if (all_keys_are_const || (!no_more_keys && has_only_one_value_since_last_reset))
             addBatchSinglePlace(row_begin, row_end, inst, places[key_start] + inst->state_offset, aggregates_pool);
         else
             addBatch(row_begin, row_end, inst, places.get(), aggregates_pool);
     }
+
 }
 
 
-template <bool use_compiled_functions>
 void NO_INLINE Aggregator::executeWithoutKeyImpl(
     AggregatedDataWithoutKey & res,
     size_t row_begin, size_t row_end,
     AggregateFunctionInstruction * aggregate_instructions,
-    Arena * arena) const
+    Arena * arena,
+    bool use_compiled_functions) const
 {
     if (row_begin == row_end)
         return;
 
 #if USE_EMBEDDED_COMPILER
-    if constexpr (use_compiled_functions)
+    if (use_compiled_functions)
     {
         std::vector<ColumnData> columns_data;
 
@@ -1441,7 +1368,7 @@ void NO_INLINE Aggregator::executeWithoutKeyImpl(
         AggregateFunctionInstruction * inst = aggregate_instructions + i;
 
 #if USE_EMBEDDED_COMPILER
-        if constexpr (use_compiled_functions)
+        if (use_compiled_functions)
             if (is_aggregate_function_compiled[i])
                 continue;
 #endif
@@ -1707,12 +1634,12 @@ bool Aggregator::executeOnBlock(Columns columns,
 // #if USE_EMBEDDED_COMPILER
 //         if (compiled_aggregate_functions_holder)
 //         {
-//             executeWithoutKeyImpl<true>(result.without_key, row_begin, row_end, aggregate_functions_instructions.data(), result.aggregates_pool);
+//             executeWithoutKeyImpl(result.without_key, row_begin, row_end, aggregate_functions_instructions.data(), result.aggregates_pool,true);
 //         }
 //         else
 // #endif
         {
-            executeWithoutKeyImpl<false>(result.without_key, row_begin, row_end, aggregate_functions_instructions.data(), result.aggregates_pool);
+            executeWithoutKeyImpl(result.without_key, row_begin, row_end, aggregate_functions_instructions.data(), result.aggregates_pool, false);
         }
     }
     else
@@ -1832,8 +1759,14 @@ Block Aggregator::convertOneBucketToBlock(
 {
     // Used in ConvertingAggregatedToChunksSource -> ConvertingAggregatedToChunksTransform (expects single chunk for each bucket_id).
     constexpr bool return_single_block = true;
-    Block block = convertToBlockImpl<return_single_block>(
-        method, method.data.impls[bucket], arena, data_variants.aggregates_pools, final, method.data.impls[bucket].size());
+    Block block = std::get<Block>(convertToBlockImpl(
+        method,
+        method.data.impls[bucket],
+        arena,
+        data_variants.aggregates_pools,
+        final,
+        method.data.impls[bucket].size(),
+        return_single_block));
 
     block.info.bucket_num = static_cast<int>(bucket);
     return block;
@@ -1953,35 +1886,33 @@ bool Aggregator::checkLimits(size_t result_size, bool & no_more_keys) const
 }
 
 
-template <bool return_single_block, typename Method, typename Table>
-Aggregator::ConvertToBlockRes<return_single_block>
-Aggregator::convertToBlockImpl(Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, bool final, size_t rows) const
+template <typename Method, typename Table>
+Aggregator::ConvertToBlockResVariant
+Aggregator::convertToBlockImpl(Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, bool final,size_t rows, bool return_single_block) const
 {
     if (data.empty())
     {
         auto && out_cols = prepareOutputBlockColumns(params, aggregate_functions, getHeader(final), aggregates_pools, final, rows);
         return {finalizeBlock(params, getHeader(final), std::move(out_cols), final, rows)};
     }
-
-    ConvertToBlockRes<return_single_block> res;
-
+    ConvertToBlockResVariant res;
     if (final)
     {
 #if USE_EMBEDDED_COMPILER
         if (compiled_aggregate_functions_holder)
         {
             static constexpr bool use_compiled_functions = !Method::low_cardinality_optimization;
-            res = convertToBlockImplFinal<Method, use_compiled_functions, return_single_block>(method, data, arena, aggregates_pools, rows);
+            res = convertToBlockImplFinal<Method>(method, data, arena, aggregates_pools, use_compiled_functions, true);
         }
         else
 #endif
         {
-            res = convertToBlockImplFinal<Method, false, return_single_block>(method, data, arena, aggregates_pools, rows);
+            res = convertToBlockImplFinal<Method>(method, data, arena, aggregates_pools, false, return_single_block);
         }
     }
     else
     {
-        res = convertToBlockImplNotFinal<return_single_block>(method, data, aggregates_pools, rows);
+        res = convertToBlockImplNotFinal(method, data, aggregates_pools, rows, return_single_block);
     }
 
     /// In order to release memory early.
@@ -2146,19 +2077,27 @@ Block Aggregator::insertResultsIntoColumns(PaddedPODArray<AggregateDataPtr> & pl
     return finalizeBlock(params, getHeader(/* final */ true), std::move(out_cols), /* final */ true, places.size());
 }
 
-template <typename Method, bool use_compiled_functions, bool return_single_block, typename Table>
-Aggregator::ConvertToBlockRes<return_single_block> NO_INLINE
-Aggregator::convertToBlockImplFinal(Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, size_t) const
+// template <typename Method, bool use_compiled_functions, bool return_single_block, typename Table>
+// Aggregator::ConvertToBlockRes<return_single_block> NO_INLINE
+// Aggregator::convertToBlockImplFinal(Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, size_t) const
+template <typename Method, typename Table>
+Aggregator::ConvertToBlockResVariant Aggregator::convertToBlockImplFinal(
+    Method & method,
+    Table & data,
+    Arena * arena,
+    Arenas & aggregates_pools,
+    bool use_compiled_functions,
+    bool return_single_block) const
 {
     /// +1 for nullKeyData, if `data` doesn't have it - not a problem, just some memory for one excessive row will be preallocated
     const size_t max_block_size = (return_single_block ? data.size() : std::min(params.max_block_size, data.size())) + 1;
     const bool final = true;
-    ConvertToBlockRes<return_single_block> res;
 
     std::optional<OutputBlockColumns> out_cols;
     std::optional<Sizes> shuffled_key_sizes;
     PaddedPODArray<AggregateDataPtr> places;
     bool has_null_key_data = false;
+    BlocksList blocks;
 
     auto init_out_cols = [&]()
     {
@@ -2187,51 +2126,78 @@ Aggregator::convertToBlockImplFinal(Method & method, Table & data, Arena * arena
     // should be invoked at least once, because null data might be the only content of the `data`
     init_out_cols();
 
-    data.forEachValue(
-        [&](const auto & key, auto & mapped)
-        {
-            if (!out_cols.has_value())
-                init_out_cols();
-
-            const auto & key_sizes_ref = shuffled_key_sizes ? *shuffled_key_sizes : key_sizes;
-            method.insertKeyIntoColumns(key, out_cols->raw_key_columns, key_sizes_ref);
-            places.emplace_back(mapped);
-
-            /// Mark the cell as destroyed so it will not be destroyed in destructor.
-            mapped = nullptr;
-
-            if constexpr (!return_single_block)
+    if (return_single_block)
+    {
+        data.forEachValue(
+            [&](const auto & key, auto & mapped)
             {
+                if (!out_cols.has_value())
+                    init_out_cols();
+
+                const auto & key_sizes_ref = shuffled_key_sizes ? *shuffled_key_sizes : key_sizes;
+                method.insertKeyIntoColumns(key, out_cols->raw_key_columns, key_sizes_ref);
+                places.emplace_back(mapped);
+
+                /// Mark the cell as destroyed so it will not be destroyed in destructor.
+                mapped = nullptr;
+            });
+    }
+    else
+    {
+        data.forEachValue(
+            [&](const auto & key, auto & mapped)
+            {
+                if (!out_cols.has_value())
+                    init_out_cols();
+
+                const auto & key_sizes_ref = shuffled_key_sizes ? *shuffled_key_sizes : key_sizes;
+                method.insertKeyIntoColumns(key, out_cols->raw_key_columns, key_sizes_ref);
+                places.emplace_back(mapped);
+
+                /// Mark the cell as destroyed so it will not be destroyed in destructor.
+                mapped = nullptr;
+
                 if (places.size() >= max_block_size)
                 {
-                    res.emplace_back(insertResultsIntoColumns<use_compiled_functions>(places, std::move(out_cols.value()), arena, has_null_key_data));
+                    if (use_compiled_functions)
+                        blocks.emplace_back(insertResultsIntoColumns<true>(places, std::move(out_cols.value()), arena, has_null_key_data));
+                    else
+                        blocks.emplace_back(insertResultsIntoColumns<false>(places, std::move(out_cols.value()), arena, has_null_key_data));
                     places.clear();
                     out_cols.reset();
                     has_null_key_data = false;
                 }
-            }
-        });
+            });
+    }
 
-    if constexpr (return_single_block)
+    if (return_single_block)
     {
-        return insertResultsIntoColumns<use_compiled_functions>(places, std::move(out_cols.value()), arena, has_null_key_data);
+        if (use_compiled_functions)
+            return insertResultsIntoColumns<true>(places, std::move(out_cols.value()), arena, has_null_key_data);
+        else
+            return insertResultsIntoColumns<false>(places, std::move(out_cols.value()), arena, has_null_key_data);
     }
     else
     {
         if (out_cols.has_value())
-            res.emplace_back(insertResultsIntoColumns<use_compiled_functions>(places, std::move(out_cols.value()), arena, has_null_key_data));
-        return res;
+        {
+            if (use_compiled_functions)
+                blocks.emplace_back(insertResultsIntoColumns<true>(places, std::move(out_cols.value()), arena, has_null_key_data));
+            else
+                blocks.emplace_back(insertResultsIntoColumns<false>(places, std::move(out_cols.value()), arena, has_null_key_data));
+        }
+        return blocks;
     }
 }
 
-template <bool return_single_block, typename Method, typename Table>
-Aggregator::ConvertToBlockRes<return_single_block> NO_INLINE
-Aggregator::convertToBlockImplNotFinal(Method & method, Table & data, Arenas & aggregates_pools, size_t) const
+template <typename Method, typename Table>
+Aggregator::ConvertToBlockResVariant NO_INLINE
+Aggregator::convertToBlockImplNotFinal(Method & method, Table & data, Arenas & aggregates_pools, size_t, bool return_single_block) const
 {
     /// +1 for nullKeyData, if `data` doesn't have it - not a problem, just some memory for one excessive row will be preallocated
     const size_t max_block_size = (return_single_block ? data.size() : std::min(params.max_block_size, data.size())) + 1;
     const bool final = false;
-    ConvertToBlockRes<return_single_block> res;
+    BlocksList res_blocks;
 
     std::optional<OutputBlockColumns> out_cols;
     std::optional<Sizes> shuffled_key_sizes;
@@ -2261,46 +2227,65 @@ Aggregator::convertToBlockImplNotFinal(Method & method, Table & data, Arenas & a
 
     // should be invoked at least once, because null data might be the only content of the `data`
     init_out_cols();
-
-    data.forEachValue(
-        [&](const auto & key, auto & mapped)
-        {
-            if (!out_cols.has_value())
-                init_out_cols();
-
-            const auto & key_sizes_ref = shuffled_key_sizes ? *shuffled_key_sizes : key_sizes;
-            method.insertKeyIntoColumns(key, out_cols->raw_key_columns, key_sizes_ref);
-
-            /// reserved, so push_back does not throw exceptions
-            for (size_t i = 0; i < params.aggregates_size; ++i)
-                out_cols->aggregate_columns_data[i]->push_back(mapped + offsets_of_aggregate_states[i]);
-
-            mapped = nullptr;
-
-            ++rows_in_current_block;
-
-            if constexpr (!return_single_block)
+    if (return_single_block)
+    {
+        data.forEachValue(
+            [&](const auto & key, auto & mapped)
             {
+                if (!out_cols.has_value())
+                    init_out_cols();
+
+                const auto & key_sizes_ref = shuffled_key_sizes ? *shuffled_key_sizes : key_sizes;
+                method.insertKeyIntoColumns(key, out_cols->raw_key_columns, key_sizes_ref);
+
+                /// reserved, so push_back does not throw exceptions
+                for (size_t i = 0; i < params.aggregates_size; ++i)
+                    out_cols->aggregate_columns_data[i]->push_back(mapped + offsets_of_aggregate_states[i]);
+
+                mapped = nullptr;
+
+                ++rows_in_current_block;
+            });
+    }
+    else
+    {
+        data.forEachValue(
+            [&](const auto & key, auto & mapped)
+            {
+                if (!out_cols.has_value())
+                    init_out_cols();
+
+                const auto & key_sizes_ref = shuffled_key_sizes ? *shuffled_key_sizes : key_sizes;
+                method.insertKeyIntoColumns(key, out_cols->raw_key_columns, key_sizes_ref);
+
+                /// reserved, so push_back does not throw exceptions
+                for (size_t i = 0; i < params.aggregates_size; ++i)
+                    out_cols->aggregate_columns_data[i]->push_back(mapped + offsets_of_aggregate_states[i]);
+
+                mapped = nullptr;
+
+                ++rows_in_current_block;
+
                 if (rows_in_current_block >= max_block_size)
                 {
-                    res.emplace_back(finalizeBlock(params, getHeader(final), std::move(out_cols.value()), final, rows_in_current_block));
+                    res_blocks.emplace_back(finalizeBlock(params, getHeader(final), std::move(out_cols.value()), final, rows_in_current_block));
                     out_cols.reset();
                     rows_in_current_block = 0;
                 }
-            }
-        });
+            });
+    }
 
-    if constexpr (return_single_block)
+    if (return_single_block)
     {
         return finalizeBlock(params, getHeader(final), std::move(out_cols).value(), final, rows_in_current_block);
     }
     else
     {
         if (rows_in_current_block)
-            res.emplace_back(finalizeBlock(params, getHeader(final), std::move(out_cols).value(), final, rows_in_current_block));
-        return res;
+            res_blocks.emplace_back(finalizeBlock(params, getHeader(final), std::move(out_cols).value(), final, rows_in_current_block));
+        return res_blocks;
     }
-    return res;
+    return res_blocks;
 }
 
 void Aggregator::addSingleKeyToAggregateColumns(
@@ -2406,18 +2391,23 @@ template <bool return_single_block>
 Aggregator::ConvertToBlockRes<return_single_block>
 Aggregator::prepareBlockAndFillSingleLevel(AggregatedDataVariants & data_variants, bool final) const
 {
+    ConvertToBlockResVariant res_variant;
     const size_t rows = data_variants.sizeWithoutOverflowRow();
 #define M(NAME) \
     else if (data_variants.type == AggregatedDataVariants::Type::NAME) \
     { \
-        return convertToBlockImpl<return_single_block>( \
-            *data_variants.NAME, data_variants.NAME->data, data_variants.aggregates_pool, data_variants.aggregates_pools, final, rows); \
+        res_variant = convertToBlockImpl( \
+            *data_variants.NAME, data_variants.NAME->data, data_variants.aggregates_pool, data_variants.aggregates_pools, final, rows, return_single_block); \
     }
 
     if (false) {} // NOLINT
     APPLY_FOR_VARIANTS_SINGLE_LEVEL(M)
 #undef M
     else throw Exception(ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT, "Unknown aggregated data variant.");
+    if constexpr (return_single_block)
+        return std::get<Block>(res_variant);
+    else
+        return std::get<BlocksList>(res_variant);
 }
 
 
@@ -2543,7 +2533,7 @@ BlocksList Aggregator::convertToBlocks(AggregatedDataVariants & data_variants, b
     if (data_variants.type != AggregatedDataVariants::Type::without_key)
     {
         if (!data_variants.isTwoLevel())
-            blocks.splice(blocks.end(), prepareBlockAndFillSingleLevel</* return_single_block */ false>(data_variants, final));
+            blocks.splice(blocks.end(), prepareBlockAndFillSingleLevel<false>(data_variants, final));
         else
             blocks.splice(blocks.end(), prepareBlocksAndFillTwoLevel(data_variants, final, thread_pool.get()));
     }
@@ -2609,8 +2599,8 @@ void NO_INLINE Aggregator::mergeDataNullKey(
     }
 }
 
-template <typename Method, bool use_compiled_functions, bool prefetch, typename Table>
-void NO_INLINE Aggregator::mergeDataImpl(Table & table_dst, Table & table_src, Arena * arena) const
+template <typename Method, typename Table>
+void NO_INLINE Aggregator::mergeDataImpl(Table & table_dst, Table & table_src, Arena * arena, bool use_compiled_functions, bool prefetch) const
 {
     if constexpr (Method::low_cardinality_optimization || Method::one_key_nullable_optimization)
         mergeDataNullKey<Method, Table>(table_dst, table_src, arena);
@@ -2633,11 +2623,14 @@ void NO_INLINE Aggregator::mergeDataImpl(Table & table_dst, Table & table_src, A
         src = nullptr;
     };
 
-    table_src.template mergeToViaEmplace<decltype(merge), prefetch>(table_dst, std::move(merge));
+    if (prefetch)
+        table_src.template mergeToViaEmplace<decltype(merge), true>(table_dst, std::move(merge));
+    else
+        table_src.template mergeToViaEmplace<decltype(merge), false>(table_dst, std::move(merge));
     table_src.clearAndShrink();
 
 #if USE_EMBEDDED_COMPILER
-    if constexpr (use_compiled_functions)
+    if (use_compiled_functions)
     {
         const auto & compiled_functions = compiled_aggregate_functions_holder->compiled_aggregate_functions;
         compiled_functions.merge_aggregate_states_function(dst_places.data(), src_places.data(), dst_places.size());
@@ -2790,22 +2783,14 @@ void NO_INLINE Aggregator::mergeSingleLevelDataImpl(
 #if USE_EMBEDDED_COMPILER
             if (compiled_aggregate_functions_holder)
             {
-                if (prefetch)
-                    mergeDataImpl<Method, true, true>(
-                        getDataVariant<Method>(*res).data, getDataVariant<Method>(current).data, res->aggregates_pool);
-                else
-                    mergeDataImpl<Method, true, false>(
-                        getDataVariant<Method>(*res).data, getDataVariant<Method>(current).data, res->aggregates_pool);
+                mergeDataImpl<Method>(
+                    getDataVariant<Method>(*res).data, getDataVariant<Method>(current).data, res->aggregates_pool, true, prefetch);
             }
             else
 #endif
             {
-                if (prefetch)
-                    mergeDataImpl<Method, false, true>(
-                        getDataVariant<Method>(*res).data, getDataVariant<Method>(current).data, res->aggregates_pool);
-                else
-                    mergeDataImpl<Method, false, false>(
-                        getDataVariant<Method>(*res).data, getDataVariant<Method>(current).data, res->aggregates_pool);
+                mergeDataImpl<Method>(
+                    getDataVariant<Method>(*res).data, getDataVariant<Method>(current).data, res->aggregates_pool, false, prefetch);
             }
         }
         else if (res->without_key)
@@ -2854,22 +2839,18 @@ void NO_INLINE Aggregator::mergeBucketImpl(
 #if USE_EMBEDDED_COMPILER
         if (compiled_aggregate_functions_holder)
         {
-            if (prefetch)
-                mergeDataImpl<Method, true, true>(
-                    getDataVariant<Method>(*res).data.impls[bucket], getDataVariant<Method>(current).data.impls[bucket], arena);
-            else
-                mergeDataImpl<Method, true, false>(
-                    getDataVariant<Method>(*res).data.impls[bucket], getDataVariant<Method>(current).data.impls[bucket], arena);
+            mergeDataImpl<Method>(
+                getDataVariant<Method>(*res).data.impls[bucket], getDataVariant<Method>(current).data.impls[bucket], arena, true, prefetch);
         }
         else
 #endif
         {
-            if (prefetch)
-                mergeDataImpl<Method, false, true>(
-                    getDataVariant<Method>(*res).data.impls[bucket], getDataVariant<Method>(current).data.impls[bucket], arena);
-            else
-                mergeDataImpl<Method, false, false>(
-                    getDataVariant<Method>(*res).data.impls[bucket], getDataVariant<Method>(current).data.impls[bucket], arena);
+            mergeDataImpl<Method>(
+                getDataVariant<Method>(*res).data.impls[bucket],
+                getDataVariant<Method>(current).data.impls[bucket],
+                arena,
+                false,
+                prefetch);
         }
     }
 }
@@ -3589,7 +3570,4 @@ void Aggregator::destroyAllAggregateStates(AggregatedDataVariants & result) cons
         throw Exception(ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT, "Unknown aggregated data variant.");
 }
 
-
-template Aggregator::ConvertToBlockRes<false>
-Aggregator::prepareBlockAndFillSingleLevel<false>(AggregatedDataVariants & data_variants, bool final) const;
 }
diff --git a/src/Interpreters/Aggregator.h b/src/Interpreters/Aggregator.h
index 375b8986101..1d887d2a2c3 100644
--- a/src/Interpreters/Aggregator.h
+++ b/src/Interpreters/Aggregator.h
@@ -4,6 +4,7 @@
 #include <memory>
 #include <mutex>
 #include <type_traits>
+#include <variant>
 
 
 #include <base/StringRef.h>
@@ -38,998 +39,21 @@
 
 #include <Parsers/IAST_fwd.h>
 
+#include <Interpreters/AggregationMethod.h>
+#include <Interpreters/AggregatedData.h>
+#include <Interpreters/AggregatedDataVariants.h>
+
 namespace DB
 {
 
-namespace ErrorCodes
-{
-    extern const int UNKNOWN_AGGREGATED_DATA_VARIANT;
-}
-
 class Arena;
 using ArenaPtr = std::shared_ptr<Arena>;
 using Arenas = std::vector<ArenaPtr>;
 
-/** Different data structures that can be used for aggregation
-  * For efficiency, the aggregation data itself is put into the pool.
-  * Data and pool ownership (states of aggregate functions)
-  *  is acquired later - in `convertToBlocks` function, by the ColumnAggregateFunction object.
-  *
-  * Most data structures exist in two versions: normal and two-level (TwoLevel).
-  * A two-level hash table works a little slower with a small number of different keys,
-  *  but with a large number of different keys scales better, because it allows
-  *  parallelize some operations (merging, post-processing) in a natural way.
-  *
-  * To ensure efficient work over a wide range of conditions,
-  *  first single-level hash tables are used,
-  *  and when the number of different keys is large enough,
-  *  they are converted to two-level ones.
-  *
-  * PS. There are many different approaches to the effective implementation of parallel and distributed aggregation,
-  *  best suited for different cases, and this approach is just one of them, chosen for a combination of reasons.
-  */
-
-using AggregatedDataWithoutKey = AggregateDataPtr;
-
-using AggregatedDataWithUInt8Key = FixedImplicitZeroHashMapWithCalculatedSize<UInt8, AggregateDataPtr>;
-using AggregatedDataWithUInt16Key = FixedImplicitZeroHashMap<UInt16, AggregateDataPtr>;
-
-using AggregatedDataWithUInt32Key = HashMap<UInt32, AggregateDataPtr, HashCRC32<UInt32>>;
-using AggregatedDataWithUInt64Key = HashMap<UInt64, AggregateDataPtr, HashCRC32<UInt64>>;
-
-using AggregatedDataWithShortStringKey = StringHashMap<AggregateDataPtr>;
-
-using AggregatedDataWithStringKey = HashMapWithSavedHash<StringRef, AggregateDataPtr>;
-
-using AggregatedDataWithKeys128 = HashMap<UInt128, AggregateDataPtr, UInt128HashCRC32>;
-using AggregatedDataWithKeys256 = HashMap<UInt256, AggregateDataPtr, UInt256HashCRC32>;
-
-using AggregatedDataWithUInt32KeyTwoLevel = TwoLevelHashMap<UInt32, AggregateDataPtr, HashCRC32<UInt32>>;
-using AggregatedDataWithUInt64KeyTwoLevel = TwoLevelHashMap<UInt64, AggregateDataPtr, HashCRC32<UInt64>>;
-
-using AggregatedDataWithShortStringKeyTwoLevel = TwoLevelStringHashMap<AggregateDataPtr>;
-
-using AggregatedDataWithStringKeyTwoLevel = TwoLevelHashMapWithSavedHash<StringRef, AggregateDataPtr>;
-
-using AggregatedDataWithKeys128TwoLevel = TwoLevelHashMap<UInt128, AggregateDataPtr, UInt128HashCRC32>;
-using AggregatedDataWithKeys256TwoLevel = TwoLevelHashMap<UInt256, AggregateDataPtr, UInt256HashCRC32>;
-
-/** Variants with better hash function, using more than 32 bits for hash.
-  * Using for merging phase of external aggregation, where number of keys may be far greater than 4 billion,
-  *  but we keep in memory and merge only sub-partition of them simultaneously.
-  * TODO We need to switch for better hash function not only for external aggregation,
-  *  but also for huge aggregation results on machines with terabytes of RAM.
-  */
-
-using AggregatedDataWithUInt64KeyHash64 = HashMap<UInt64, AggregateDataPtr, DefaultHash<UInt64>>;
-using AggregatedDataWithStringKeyHash64 = HashMapWithSavedHash<StringRef, AggregateDataPtr, StringRefHash64>;
-using AggregatedDataWithKeys128Hash64 = HashMap<UInt128, AggregateDataPtr, UInt128Hash>;
-using AggregatedDataWithKeys256Hash64 = HashMap<UInt256, AggregateDataPtr, UInt256Hash>;
-
-template <typename Base>
-struct AggregationDataWithNullKey : public Base
-{
-    using Base::Base;
-
-    bool & hasNullKeyData() { return has_null_key_data; }
-    AggregateDataPtr & getNullKeyData() { return null_key_data; }
-    bool hasNullKeyData() const { return has_null_key_data; }
-    const AggregateDataPtr & getNullKeyData() const { return null_key_data; }
-    size_t size() const { return Base::size() + (has_null_key_data ? 1 : 0); }
-    bool empty() const { return Base::empty() && !has_null_key_data; }
-    void clear()
-    {
-        Base::clear();
-        has_null_key_data = false;
-    }
-    void clearAndShrink()
-    {
-        Base::clearAndShrink();
-        has_null_key_data = false;
-    }
-
-private:
-    bool has_null_key_data = false;
-    AggregateDataPtr null_key_data = nullptr;
-};
-
-template <typename Base>
-struct AggregationDataWithNullKeyTwoLevel : public Base
-{
-    using Base::Base;
-    using Base::impls;
-
-    AggregationDataWithNullKeyTwoLevel() = default;
-
-    template <typename Other>
-    explicit AggregationDataWithNullKeyTwoLevel(const Other & other) : Base(other)
-    {
-        impls[0].hasNullKeyData() = other.hasNullKeyData();
-        impls[0].getNullKeyData() = other.getNullKeyData();
-    }
-
-    bool & hasNullKeyData() { return impls[0].hasNullKeyData(); }
-    AggregateDataPtr & getNullKeyData() { return impls[0].getNullKeyData(); }
-    bool hasNullKeyData() const { return impls[0].hasNullKeyData(); }
-    const AggregateDataPtr & getNullKeyData() const { return impls[0].getNullKeyData(); }
-};
-
-template <typename ... Types>
-using HashTableWithNullKey = AggregationDataWithNullKey<HashMapTable<Types ...>>;
-template <typename ... Types>
-using StringHashTableWithNullKey = AggregationDataWithNullKey<StringHashMap<Types ...>>;
-
-using AggregatedDataWithNullableUInt8Key = AggregationDataWithNullKey<AggregatedDataWithUInt8Key>;
-using AggregatedDataWithNullableUInt16Key = AggregationDataWithNullKey<AggregatedDataWithUInt16Key>;
-using AggregatedDataWithNullableUInt32Key = AggregationDataWithNullKey<AggregatedDataWithUInt32Key>;
-
-
-using AggregatedDataWithNullableUInt64Key = AggregationDataWithNullKey<AggregatedDataWithUInt64Key>;
-using AggregatedDataWithNullableStringKey = AggregationDataWithNullKey<AggregatedDataWithStringKey>;
-using AggregatedDataWithNullableShortStringKey = AggregationDataWithNullKey<AggregatedDataWithShortStringKey>;
-
-
-using AggregatedDataWithNullableUInt32KeyTwoLevel = AggregationDataWithNullKeyTwoLevel<
-    TwoLevelHashMap<UInt32, AggregateDataPtr, HashCRC32<UInt32>,
-                    TwoLevelHashTableGrower<>, HashTableAllocator, HashTableWithNullKey>>;
-using AggregatedDataWithNullableUInt64KeyTwoLevel = AggregationDataWithNullKeyTwoLevel<
-        TwoLevelHashMap<UInt64, AggregateDataPtr, HashCRC32<UInt64>,
-        TwoLevelHashTableGrower<>, HashTableAllocator, HashTableWithNullKey>>;
-
-using AggregatedDataWithNullableShortStringKeyTwoLevel = AggregationDataWithNullKeyTwoLevel<
-        TwoLevelStringHashMap<AggregateDataPtr, HashTableAllocator, StringHashTableWithNullKey>>;
-
-using AggregatedDataWithNullableStringKeyTwoLevel = AggregationDataWithNullKeyTwoLevel<
-        TwoLevelHashMapWithSavedHash<StringRef, AggregateDataPtr, DefaultHash<StringRef>,
-        TwoLevelHashTableGrower<>, HashTableAllocator, HashTableWithNullKey>>;
-
-/// For the case where there is one numeric key.
-/// FieldType is UInt8/16/32/64 for any type with corresponding bit width.
-template <typename FieldType, typename TData,
-        bool consecutive_keys_optimization = true, bool nullable = false>
-struct AggregationMethodOneNumber
-{
-    using Data = TData;
-    using Key = typename Data::key_type;
-    using Mapped = typename Data::mapped_type;
-
-    Data data;
-
-    AggregationMethodOneNumber() = default;
-
-    explicit AggregationMethodOneNumber(size_t size_hint) : data(size_hint) { }
-
-    template <typename Other>
-    explicit AggregationMethodOneNumber(const Other & other) : data(other.data)
-    {
-    }
-
-    /// To use one `Method` in different threads, use different `State`.
-    template <bool use_cache>
-    using StateImpl = ColumnsHashing::HashMethodOneNumber<
-        typename Data::value_type,
-        Mapped,
-        FieldType,
-        use_cache && consecutive_keys_optimization,
-        /*need_offset=*/ false,
-        nullable>;
-
-    using State = StateImpl<true>;
-    using StateNoCache = StateImpl<false>;
-
-    /// Use optimization for low cardinality.
-    static const bool low_cardinality_optimization = false;
-    static const bool one_key_nullable_optimization = nullable;
-
-    /// Shuffle key columns before `insertKeyIntoColumns` call if needed.
-    std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> &, const Sizes &) { return {}; }
-
-    // Insert the key from the hash table into columns.
-    static void insertKeyIntoColumns(const Key & key, std::vector<IColumn *> & key_columns, const Sizes & /*key_sizes*/)
-    {
-        ColumnFixedSizeHelper * column;
-        if constexpr (nullable)
-        {
-            ColumnNullable & nullable_col = assert_cast<ColumnNullable &>(*key_columns[0]);
-            ColumnUInt8 * null_map = assert_cast<ColumnUInt8 *>(&nullable_col.getNullMapColumn());
-            null_map->insertDefault();
-            column = static_cast<ColumnFixedSizeHelper *>(&nullable_col.getNestedColumn());
-        }
-        else
-        {
-            column = static_cast<ColumnFixedSizeHelper *>(key_columns[0]);
-        }
-        static_assert(sizeof(FieldType) <= sizeof(Key));
-        const auto * key_holder = reinterpret_cast<const char *>(&key);
-        if constexpr (sizeof(FieldType) < sizeof(Key) && std::endian::native == std::endian::big)
-            column->insertRawData<sizeof(FieldType)>(key_holder + (sizeof(Key) - sizeof(FieldType)));
-        else
-            column->insertRawData<sizeof(FieldType)>(key_holder);
-    }
-};
-
-
-/// For the case where there is one string key.
-template <typename TData>
-struct AggregationMethodString
-{
-    using Data = TData;
-    using Key = typename Data::key_type;
-    using Mapped = typename Data::mapped_type;
-
-    Data data;
-
-    AggregationMethodString() = default;
-
-    template <typename Other>
-    explicit AggregationMethodString(const Other & other) : data(other.data)
-    {
-    }
-
-    explicit AggregationMethodString(size_t size_hint) : data(size_hint) { }
-
-    template <bool use_cache>
-    using StateImpl = ColumnsHashing::HashMethodString<typename Data::value_type, Mapped, /*place_string_to_arena=*/ true, use_cache>;
-
-    using State = StateImpl<true>;
-    using StateNoCache = StateImpl<false>;
-
-    static const bool low_cardinality_optimization = false;
-    static const bool one_key_nullable_optimization = false;
-
-    std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> &, const Sizes &) { return {}; }
-
-    static void insertKeyIntoColumns(StringRef key, std::vector<IColumn *> & key_columns, const Sizes &)
-    {
-        static_cast<ColumnString *>(key_columns[0])->insertData(key.data, key.size);
-    }
-};
-
-
-/// Same as above but without cache
-template <typename TData, bool nullable = false>
-struct AggregationMethodStringNoCache
-{
-    using Data = TData;
-    using Key = typename Data::key_type;
-    using Mapped = typename Data::mapped_type;
-
-    Data data;
-
-    AggregationMethodStringNoCache() = default;
-
-    explicit AggregationMethodStringNoCache(size_t size_hint) : data(size_hint) { }
-
-    template <typename Other>
-    explicit AggregationMethodStringNoCache(const Other & other) : data(other.data)
-    {
-    }
-
-    template <bool use_cache>
-    using StateImpl = ColumnsHashing::HashMethodString<typename Data::value_type, Mapped, true, false, false, nullable>;
-
-    using State = StateImpl<true>;
-    using StateNoCache = StateImpl<false>;
-
-    static const bool low_cardinality_optimization = false;
-    static const bool one_key_nullable_optimization = nullable;
-
-    std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> &, const Sizes &) { return {}; }
-
-    static void insertKeyIntoColumns(StringRef key, std::vector<IColumn *> & key_columns, const Sizes &)
-    {
-        if constexpr (nullable)
-        {
-            ColumnNullable & column_nullable = assert_cast<ColumnNullable &>(*key_columns[0]);
-            assert_cast<ColumnString &>(column_nullable.getNestedColumn()).insertData(key.data, key.size);
-            column_nullable.getNullMapData().push_back(0);
-        }
-        else
-        {
-            assert_cast<ColumnString &>(*key_columns[0]).insertData(key.data, key.size);
-        }
-    }
-};
-
-
-/// For the case where there is one fixed-length string key.
-template <typename TData>
-struct AggregationMethodFixedString
-{
-    using Data = TData;
-    using Key = typename Data::key_type;
-    using Mapped = typename Data::mapped_type;
-
-    Data data;
-
-    AggregationMethodFixedString() = default;
-
-    explicit AggregationMethodFixedString(size_t size_hint) : data(size_hint) { }
-
-    template <typename Other>
-    explicit AggregationMethodFixedString(const Other & other) : data(other.data)
-    {
-    }
-
-    template <bool use_cache>
-    using StateImpl = ColumnsHashing::HashMethodFixedString<typename Data::value_type, Mapped, /*place_string_to_arena=*/ true, use_cache>;
-
-    using State = StateImpl<true>;
-    using StateNoCache = StateImpl<false>;
-
-    static const bool low_cardinality_optimization = false;
-    static const bool one_key_nullable_optimization = false;
-
-    std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> &, const Sizes &) { return {}; }
-
-    static void insertKeyIntoColumns(StringRef key, std::vector<IColumn *> & key_columns, const Sizes &)
-    {
-        assert_cast<ColumnFixedString &>(*key_columns[0]).insertData(key.data, key.size);
-    }
-};
-
-/// Same as above but without cache
-template <typename TData, bool nullable = false>
-struct AggregationMethodFixedStringNoCache
-{
-    using Data = TData;
-    using Key = typename Data::key_type;
-    using Mapped = typename Data::mapped_type;
-
-    Data data;
-
-    AggregationMethodFixedStringNoCache() = default;
-
-    explicit AggregationMethodFixedStringNoCache(size_t size_hint) : data(size_hint) { }
-
-    template <typename Other>
-    explicit AggregationMethodFixedStringNoCache(const Other & other) : data(other.data)
-    {
-    }
-
-    template <bool use_cache>
-    using StateImpl = ColumnsHashing::HashMethodFixedString<typename Data::value_type, Mapped, true, false, false, nullable>;
-
-    using State = StateImpl<true>;
-    using StateNoCache = StateImpl<false>;
-
-    static const bool low_cardinality_optimization = false;
-    static const bool one_key_nullable_optimization = nullable;
-
-    std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> &, const Sizes &) { return {}; }
-
-    static void insertKeyIntoColumns(StringRef key, std::vector<IColumn *> & key_columns, const Sizes &)
-    {
-        if constexpr (nullable)
-        {
-            assert_cast<ColumnNullable &>(*key_columns[0]).insertData(key.data, key.size);
-        }
-        else
-        {
-            assert_cast<ColumnFixedString &>(*key_columns[0]).insertData(key.data, key.size);
-        }
-    }
-};
-
-
-/// Single low cardinality column.
-template <typename SingleColumnMethod>
-struct AggregationMethodSingleLowCardinalityColumn : public SingleColumnMethod
-{
-    using Base = SingleColumnMethod;
-    using Data = typename Base::Data;
-    using Key = typename Base::Key;
-    using Mapped = typename Base::Mapped;
-    using Base::data;
-
-    template <bool use_cache>
-    using BaseStateImpl = typename Base::template StateImpl<use_cache>;
-
-    AggregationMethodSingleLowCardinalityColumn() = default;
-
-    template <typename Other>
-    explicit AggregationMethodSingleLowCardinalityColumn(const Other & other) : Base(other) {}
-
-    template <bool use_cache>
-    using StateImpl = ColumnsHashing::HashMethodSingleLowCardinalityColumn<BaseStateImpl<use_cache>, Mapped, use_cache>;
-
-    using State = StateImpl<true>;
-    using StateNoCache = StateImpl<false>;
-
-    static const bool low_cardinality_optimization = true;
-
-    std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> &, const Sizes &) { return {}; }
-
-    static void insertKeyIntoColumns(const Key & key,
-         std::vector<IColumn *> & key_columns_low_cardinality, const Sizes & /*key_sizes*/)
-    {
-        auto * col = assert_cast<ColumnLowCardinality *>(key_columns_low_cardinality[0]);
-
-        if constexpr (std::is_same_v<Key, StringRef>)
-        {
-            col->insertData(key.data, key.size);
-        }
-        else
-        {
-            col->insertData(reinterpret_cast<const char *>(&key), sizeof(key));
-        }
-    }
-};
-
-
-/// For the case where all keys are of fixed length, and they fit in N (for example, 128) bits.
-template <typename TData, bool has_nullable_keys_ = false, bool has_low_cardinality_ = false, bool consecutive_keys_optimization = false>
-struct AggregationMethodKeysFixed
-{
-    using Data = TData;
-    using Key = typename Data::key_type;
-    using Mapped = typename Data::mapped_type;
-    static constexpr bool has_nullable_keys = has_nullable_keys_;
-    static constexpr bool has_low_cardinality = has_low_cardinality_;
-
-    Data data;
-
-    AggregationMethodKeysFixed() = default;
-
-    explicit AggregationMethodKeysFixed(size_t size_hint) : data(size_hint) { }
-
-    template <typename Other>
-    explicit AggregationMethodKeysFixed(const Other & other) : data(other.data)
-    {
-    }
-
-    template <bool use_cache>
-    using StateImpl = ColumnsHashing::HashMethodKeysFixed<
-        typename Data::value_type,
-        Key,
-        Mapped,
-        has_nullable_keys,
-        has_low_cardinality,
-        use_cache && consecutive_keys_optimization>;
-
-    using State = StateImpl<true>;
-    using StateNoCache = StateImpl<false>;
-
-    static const bool low_cardinality_optimization = false;
-    static const bool one_key_nullable_optimization = false;
-
-    std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> & key_columns, const Sizes & key_sizes)
-    {
-        return State::shuffleKeyColumns(key_columns, key_sizes);
-    }
-
-    static void insertKeyIntoColumns(const Key & key, std::vector<IColumn *> & key_columns, const Sizes & key_sizes)
-    {
-        size_t keys_size = key_columns.size();
-
-        static constexpr auto bitmap_size = has_nullable_keys ? std::tuple_size<KeysNullMap<Key>>::value : 0;
-        /// In any hash key value, column values to be read start just after the bitmap, if it exists.
-        size_t pos = bitmap_size;
-
-        for (size_t i = 0; i < keys_size; ++i)
-        {
-            IColumn * observed_column;
-            ColumnUInt8 * null_map;
-
-            bool column_nullable = false;
-            if constexpr (has_nullable_keys)
-                column_nullable = isColumnNullable(*key_columns[i]);
-
-            /// If we have a nullable column, get its nested column and its null map.
-            if (column_nullable)
-            {
-                ColumnNullable & nullable_col = assert_cast<ColumnNullable &>(*key_columns[i]);
-                observed_column = &nullable_col.getNestedColumn();
-                null_map = assert_cast<ColumnUInt8 *>(&nullable_col.getNullMapColumn());
-            }
-            else
-            {
-                observed_column = key_columns[i];
-                null_map = nullptr;
-            }
-
-            bool is_null = false;
-            if (column_nullable)
-            {
-                /// The current column is nullable. Check if the value of the
-                /// corresponding key is nullable. Update the null map accordingly.
-                size_t bucket = i / 8;
-                size_t offset = i % 8;
-                UInt8 val = (reinterpret_cast<const UInt8 *>(&key)[bucket] >> offset) & 1;
-                null_map->insertValue(val);
-                is_null = val == 1;
-            }
-
-            if (has_nullable_keys && is_null)
-                observed_column->insertDefault();
-            else
-            {
-                size_t size = key_sizes[i];
-                size_t offset_to = pos;
-                if constexpr (std::endian::native == std::endian::big)
-                   offset_to = sizeof(Key) - size - pos;
-                observed_column->insertData(reinterpret_cast<const char *>(&key) + offset_to, size);
-                pos += size;
-            }
-        }
-    }
-};
-
-
-/** Aggregates by concatenating serialized key values.
-  * The serialized value differs in that it uniquely allows to deserialize it, having only the position with which it starts.
-  * That is, for example, for strings, it contains first the serialized length of the string, and then the bytes.
-  * Therefore, when aggregating by several strings, there is no ambiguity.
-  */
-template <typename TData, bool nullable = false, bool prealloc = false>
-struct AggregationMethodSerialized
-{
-    using Data = TData;
-    using Key = typename Data::key_type;
-    using Mapped = typename Data::mapped_type;
-
-    Data data;
-
-    AggregationMethodSerialized() = default;
-
-    explicit AggregationMethodSerialized(size_t size_hint) : data(size_hint) { }
-
-    template <typename Other>
-    explicit AggregationMethodSerialized(const Other & other) : data(other.data)
-    {
-    }
-
-    template <bool use_cache>
-    using StateImpl = ColumnsHashing::HashMethodSerialized<typename Data::value_type, Mapped, nullable, prealloc>;
-
-    using State = StateImpl<true>;
-    using StateNoCache = StateImpl<false>;
-
-    static const bool low_cardinality_optimization = false;
-    static const bool one_key_nullable_optimization = false;
-
-    std::optional<Sizes> shuffleKeyColumns(std::vector<IColumn *> &, const Sizes &) { return {}; }
-
-    static void insertKeyIntoColumns(StringRef key, std::vector<IColumn *> & key_columns, const Sizes &)
-    {
-        const auto * pos = key.data;
-        for (auto & column : key_columns)
-            pos = column->deserializeAndInsertFromArena(pos);
-    }
-};
-
-template <typename TData>
-using AggregationMethodNullableSerialized = AggregationMethodSerialized<TData, true>;
-
-template <typename TData>
-using AggregationMethodPreallocSerialized = AggregationMethodSerialized<TData, false, true>;
-
-template <typename TData>
-using AggregationMethodNullablePreallocSerialized = AggregationMethodSerialized<TData, true, true>;
-
-class Aggregator;
-
 using ColumnsHashing::HashMethodContext;
 using ColumnsHashing::HashMethodContextPtr;
 using ColumnsHashing::LastElementCacheStats;
 
-struct AggregatedDataVariants : private boost::noncopyable
-{
-    /** Working with states of aggregate functions in the pool is arranged in the following (inconvenient) way:
-      * - when aggregating, states are created in the pool using IAggregateFunction::create (inside - `placement new` of arbitrary structure);
-      * - they must then be destroyed using IAggregateFunction::destroy (inside - calling the destructor of arbitrary structure);
-      * - if aggregation is complete, then, in the Aggregator::convertToBlocks function, pointers to the states of aggregate functions
-      *   are written to ColumnAggregateFunction; ColumnAggregateFunction "acquires ownership" of them, that is - calls `destroy` in its destructor.
-      * - if during the aggregation, before call to Aggregator::convertToBlocks, an exception was thrown,
-      *   then the states of aggregate functions must still be destroyed,
-      *   otherwise, for complex states (eg, AggregateFunctionUniq), there will be memory leaks;
-      * - in this case, to destroy states, the destructor calls Aggregator::destroyAggregateStates method,
-      *   but only if the variable aggregator (see below) is not nullptr;
-      * - that is, until you transfer ownership of the aggregate function states in the ColumnAggregateFunction, set the variable `aggregator`,
-      *   so that when an exception occurs, the states are correctly destroyed.
-      *
-      * PS. This can be corrected by making a pool that knows about which states of aggregate functions and in which order are put in it, and knows how to destroy them.
-      * But this can hardly be done simply because it is planned to put variable-length strings into the same pool.
-      * In this case, the pool will not be able to know with what offsets objects are stored.
-      */
-    const Aggregator * aggregator = nullptr;
-
-    size_t keys_size{};  /// Number of keys. NOTE do we need this field?
-    Sizes key_sizes;     /// Dimensions of keys, if keys of fixed length
-
-    /// Pools for states of aggregate functions. Ownership will be later transferred to ColumnAggregateFunction.
-    Arenas aggregates_pools;
-    Arena * aggregates_pool{};    /// The pool that is currently used for allocation.
-
-    /** Specialization for the case when there are no keys, and for keys not fitted into max_rows_to_group_by.
-      */
-    AggregatedDataWithoutKey without_key = nullptr;
-
-    /// Stats of a cache for consecutive keys optimization.
-    /// Stats can be used to disable the cache in case of a lot of misses.
-    LastElementCacheStats consecutive_keys_cache_stats;
-
-    // Disable consecutive key optimization for Uint8/16, because they use a FixedHashMap
-    // and the lookup there is almost free, so we don't need to cache the last lookup result
-    std::unique_ptr<AggregationMethodOneNumber<UInt8, AggregatedDataWithUInt8Key, false>>           key8;
-    std::unique_ptr<AggregationMethodOneNumber<UInt16, AggregatedDataWithUInt16Key, false>>         key16;
-
-    std::unique_ptr<AggregationMethodOneNumber<UInt32, AggregatedDataWithUInt64Key>>         key32;
-    std::unique_ptr<AggregationMethodOneNumber<UInt64, AggregatedDataWithUInt64Key>>         key64;
-    std::unique_ptr<AggregationMethodStringNoCache<AggregatedDataWithShortStringKey>>               key_string;
-    std::unique_ptr<AggregationMethodFixedStringNoCache<AggregatedDataWithShortStringKey>>          key_fixed_string;
-    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithUInt16Key, false, false, false>>  keys16;
-    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithUInt32Key>>                   keys32;
-    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithUInt64Key>>                   keys64;
-    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys128>>                   keys128;
-    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys256>>                   keys256;
-    std::unique_ptr<AggregationMethodSerialized<AggregatedDataWithStringKey>>                          serialized;
-    std::unique_ptr<AggregationMethodNullableSerialized<AggregatedDataWithStringKey>>                  nullable_serialized;
-    std::unique_ptr<AggregationMethodPreallocSerialized<AggregatedDataWithStringKey>>                  prealloc_serialized;
-    std::unique_ptr<AggregationMethodNullablePreallocSerialized<AggregatedDataWithStringKey>>          nullable_prealloc_serialized;
-
-    std::unique_ptr<AggregationMethodOneNumber<UInt32, AggregatedDataWithUInt64KeyTwoLevel>> key32_two_level;
-    std::unique_ptr<AggregationMethodOneNumber<UInt64, AggregatedDataWithUInt64KeyTwoLevel>> key64_two_level;
-    std::unique_ptr<AggregationMethodStringNoCache<AggregatedDataWithShortStringKeyTwoLevel>>       key_string_two_level;
-    std::unique_ptr<AggregationMethodFixedStringNoCache<AggregatedDataWithShortStringKeyTwoLevel>>  key_fixed_string_two_level;
-    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithUInt32KeyTwoLevel>>           keys32_two_level;
-    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithUInt64KeyTwoLevel>>           keys64_two_level;
-    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys128TwoLevel>>           keys128_two_level;
-    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys256TwoLevel>>           keys256_two_level;
-    std::unique_ptr<AggregationMethodSerialized<AggregatedDataWithStringKeyTwoLevel>>                  serialized_two_level;
-    std::unique_ptr<AggregationMethodNullableSerialized<AggregatedDataWithStringKeyTwoLevel>>          nullable_serialized_two_level;
-    std::unique_ptr<AggregationMethodPreallocSerialized<AggregatedDataWithStringKeyTwoLevel>>          prealloc_serialized_two_level;
-    std::unique_ptr<AggregationMethodNullablePreallocSerialized<AggregatedDataWithStringKeyTwoLevel>>  nullable_prealloc_serialized_two_level;
-
-    std::unique_ptr<AggregationMethodOneNumber<UInt64, AggregatedDataWithUInt64KeyHash64>>   key64_hash64;
-    std::unique_ptr<AggregationMethodString<AggregatedDataWithStringKeyHash64>>              key_string_hash64;
-    std::unique_ptr<AggregationMethodFixedString<AggregatedDataWithStringKeyHash64>>         key_fixed_string_hash64;
-    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys128Hash64>>             keys128_hash64;
-    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys256Hash64>>             keys256_hash64;
-    std::unique_ptr<AggregationMethodSerialized<AggregatedDataWithStringKeyHash64>>                  serialized_hash64;
-    std::unique_ptr<AggregationMethodNullableSerialized<AggregatedDataWithStringKeyHash64>>          nullable_serialized_hash64;
-    std::unique_ptr<AggregationMethodPreallocSerialized<AggregatedDataWithStringKeyHash64>>          prealloc_serialized_hash64;
-    std::unique_ptr<AggregationMethodNullablePreallocSerialized<AggregatedDataWithStringKeyHash64>>  nullable_prealloc_serialized_hash64;
-
-    /// Support for nullable keys.
-    std::unique_ptr<AggregationMethodOneNumber<UInt8, AggregatedDataWithNullableUInt8Key, false, true>>         nullable_key8;
-    std::unique_ptr<AggregationMethodOneNumber<UInt16, AggregatedDataWithNullableUInt16Key, false, true>>         nullable_key16;
-    std::unique_ptr<AggregationMethodOneNumber<UInt32, AggregatedDataWithNullableUInt32Key, true, true>>         nullable_key32;
-    std::unique_ptr<AggregationMethodOneNumber<UInt64, AggregatedDataWithNullableUInt64Key, true, true>>         nullable_key64;
-    std::unique_ptr<AggregationMethodOneNumber<UInt32, AggregatedDataWithNullableUInt32KeyTwoLevel, true, true>>         nullable_key32_two_level;
-    std::unique_ptr<AggregationMethodOneNumber<UInt64, AggregatedDataWithNullableUInt64KeyTwoLevel, true, true>>         nullable_key64_two_level;
-
-    std::unique_ptr<AggregationMethodStringNoCache<AggregatedDataWithNullableShortStringKey, true>> nullable_key_string;
-    std::unique_ptr<AggregationMethodFixedStringNoCache<AggregatedDataWithNullableShortStringKey, true>> nullable_key_fixed_string;
-    std::unique_ptr<AggregationMethodStringNoCache<AggregatedDataWithNullableShortStringKeyTwoLevel, true>> nullable_key_string_two_level;
-    std::unique_ptr<AggregationMethodFixedStringNoCache<AggregatedDataWithNullableShortStringKeyTwoLevel, true>> nullable_key_fixed_string_two_level;
-
-    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys128, true>>             nullable_keys128;
-    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys256, true>>             nullable_keys256;
-    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys128TwoLevel, true>>     nullable_keys128_two_level;
-    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys256TwoLevel, true>>     nullable_keys256_two_level;
-
-    /// Support for low cardinality.
-    std::unique_ptr<AggregationMethodSingleLowCardinalityColumn<AggregationMethodOneNumber<UInt8, AggregatedDataWithNullableUInt8Key, false>>> low_cardinality_key8;
-    std::unique_ptr<AggregationMethodSingleLowCardinalityColumn<AggregationMethodOneNumber<UInt16, AggregatedDataWithNullableUInt16Key, false>>> low_cardinality_key16;
-    std::unique_ptr<AggregationMethodSingleLowCardinalityColumn<AggregationMethodOneNumber<UInt32, AggregatedDataWithNullableUInt64Key>>> low_cardinality_key32;
-    std::unique_ptr<AggregationMethodSingleLowCardinalityColumn<AggregationMethodOneNumber<UInt64, AggregatedDataWithNullableUInt64Key>>> low_cardinality_key64;
-    std::unique_ptr<AggregationMethodSingleLowCardinalityColumn<AggregationMethodString<AggregatedDataWithNullableStringKey>>> low_cardinality_key_string;
-    std::unique_ptr<AggregationMethodSingleLowCardinalityColumn<AggregationMethodFixedString<AggregatedDataWithNullableStringKey>>> low_cardinality_key_fixed_string;
-
-    std::unique_ptr<AggregationMethodSingleLowCardinalityColumn<AggregationMethodOneNumber<UInt32, AggregatedDataWithNullableUInt64KeyTwoLevel>>> low_cardinality_key32_two_level;
-    std::unique_ptr<AggregationMethodSingleLowCardinalityColumn<AggregationMethodOneNumber<UInt64, AggregatedDataWithNullableUInt64KeyTwoLevel>>> low_cardinality_key64_two_level;
-    std::unique_ptr<AggregationMethodSingleLowCardinalityColumn<AggregationMethodString<AggregatedDataWithNullableStringKeyTwoLevel>>> low_cardinality_key_string_two_level;
-    std::unique_ptr<AggregationMethodSingleLowCardinalityColumn<AggregationMethodFixedString<AggregatedDataWithNullableStringKeyTwoLevel>>> low_cardinality_key_fixed_string_two_level;
-
-    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys128, false, true>>      low_cardinality_keys128;
-    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys256, false, true>>      low_cardinality_keys256;
-    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys128TwoLevel, false, true>> low_cardinality_keys128_two_level;
-    std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys256TwoLevel, false, true>> low_cardinality_keys256_two_level;
-
-    /// In this and similar macros, the option without_key is not considered.
-    #define APPLY_FOR_AGGREGATED_VARIANTS(M) \
-        M(key8,                       false) \
-        M(key16,                      false) \
-        M(key32,                      false) \
-        M(key64,                      false) \
-        M(key_string,                 false) \
-        M(key_fixed_string,           false) \
-        M(keys16,                    false) \
-        M(keys32,                    false) \
-        M(keys64,                    false) \
-        M(keys128,                    false) \
-        M(keys256,                    false) \
-        M(serialized,                   false) \
-        M(nullable_serialized,          false) \
-        M(prealloc_serialized,          false) \
-        M(nullable_prealloc_serialized, false) \
-        M(key32_two_level,            true) \
-        M(key64_two_level,            true) \
-        M(key_string_two_level,       true) \
-        M(key_fixed_string_two_level, true) \
-        M(keys32_two_level,          true) \
-        M(keys64_two_level,          true) \
-        M(keys128_two_level,          true) \
-        M(keys256_two_level,          true) \
-        M(serialized_two_level,                   true) \
-        M(nullable_serialized_two_level,          true) \
-        M(prealloc_serialized_two_level,          true) \
-        M(nullable_prealloc_serialized_two_level, true) \
-        M(key64_hash64,               false) \
-        M(key_string_hash64,          false) \
-        M(key_fixed_string_hash64,    false) \
-        M(keys128_hash64,             false) \
-        M(keys256_hash64,             false) \
-        M(serialized_hash64,                   false) \
-        M(nullable_serialized_hash64,          false) \
-        M(prealloc_serialized_hash64,          false) \
-        M(nullable_prealloc_serialized_hash64, false) \
-        M(nullable_key8,             false) \
-        M(nullable_key16,             false) \
-        M(nullable_key32,             false) \
-        M(nullable_key64,             false) \
-        M(nullable_key32_two_level,   true) \
-        M(nullable_key64_two_level,   true) \
-        M(nullable_key_string,        false) \
-        M(nullable_key_fixed_string,  false) \
-        M(nullable_key_string_two_level, true) \
-        M(nullable_key_fixed_string_two_level, true) \
-        M(nullable_keys128,           false) \
-        M(nullable_keys256,           false) \
-        M(nullable_keys128_two_level, true) \
-        M(nullable_keys256_two_level, true) \
-        M(low_cardinality_key8, false) \
-        M(low_cardinality_key16, false) \
-        M(low_cardinality_key32, false) \
-        M(low_cardinality_key64, false) \
-        M(low_cardinality_keys128, false) \
-        M(low_cardinality_keys256, false) \
-        M(low_cardinality_key_string, false) \
-        M(low_cardinality_key_fixed_string, false) \
-        M(low_cardinality_key32_two_level, true) \
-        M(low_cardinality_key64_two_level, true) \
-        M(low_cardinality_keys128_two_level, true) \
-        M(low_cardinality_keys256_two_level, true) \
-        M(low_cardinality_key_string_two_level, true) \
-        M(low_cardinality_key_fixed_string_two_level, true) \
-
-    enum class Type
-    {
-        EMPTY = 0,
-        without_key,
-
-    #define M(NAME, IS_TWO_LEVEL) NAME,
-        APPLY_FOR_AGGREGATED_VARIANTS(M)
-    #undef M
-    };
-    Type type = Type::EMPTY;
-
-    AggregatedDataVariants() : aggregates_pools(1, std::make_shared<Arena>()), aggregates_pool(aggregates_pools.back().get()) {}
-    bool empty() const { return type == Type::EMPTY; }
-    void invalidate() { type = Type::EMPTY; }
-
-    ~AggregatedDataVariants();
-
-    void init(Type type_, std::optional<size_t> size_hint = std::nullopt);
-
-    /// Number of rows (different keys).
-    size_t size() const
-    {
-        switch (type)
-        {
-            case Type::EMPTY:       return 0;
-            case Type::without_key: return 1;
-
-        #define M(NAME, IS_TWO_LEVEL) \
-            case Type::NAME: return (NAME)->data.size() + (without_key != nullptr);
-            APPLY_FOR_AGGREGATED_VARIANTS(M)
-        #undef M
-        }
-
-        UNREACHABLE();
-    }
-
-    /// The size without taking into account the row in which data is written for the calculation of TOTALS.
-    size_t sizeWithoutOverflowRow() const
-    {
-        switch (type)
-        {
-            case Type::EMPTY:       return 0;
-            case Type::without_key: return 1;
-
-            #define M(NAME, IS_TWO_LEVEL) \
-            case Type::NAME: return (NAME)->data.size();
-            APPLY_FOR_AGGREGATED_VARIANTS(M)
-            #undef M
-        }
-
-        UNREACHABLE();
-    }
-
-    const char * getMethodName() const
-    {
-        switch (type)
-        {
-            case Type::EMPTY:       return "EMPTY";
-            case Type::without_key: return "without_key";
-
-        #define M(NAME, IS_TWO_LEVEL) \
-            case Type::NAME: return #NAME;
-            APPLY_FOR_AGGREGATED_VARIANTS(M)
-        #undef M
-        }
-
-        UNREACHABLE();
-    }
-
-    bool isTwoLevel() const
-    {
-        switch (type)
-        {
-            case Type::EMPTY:       return false;
-            case Type::without_key: return false;
-
-        #define M(NAME, IS_TWO_LEVEL) \
-            case Type::NAME: return IS_TWO_LEVEL;
-            APPLY_FOR_AGGREGATED_VARIANTS(M)
-        #undef M
-        }
-
-        UNREACHABLE();
-    }
-
-    #define APPLY_FOR_VARIANTS_CONVERTIBLE_TO_TWO_LEVEL(M) \
-        M(key32)            \
-        M(key64)            \
-        M(key_string)       \
-        M(key_fixed_string) \
-        M(keys32)           \
-        M(keys64)           \
-        M(keys128)          \
-        M(keys256)          \
-        M(serialized)       \
-        M(nullable_serialized) \
-        M(prealloc_serialized) \
-        M(nullable_prealloc_serialized) \
-        M(nullable_key32) \
-        M(nullable_key64) \
-        M(nullable_key_string) \
-        M(nullable_key_fixed_string) \
-        M(nullable_keys128) \
-        M(nullable_keys256) \
-        M(low_cardinality_key32) \
-        M(low_cardinality_key64) \
-        M(low_cardinality_keys128) \
-        M(low_cardinality_keys256) \
-        M(low_cardinality_key_string) \
-        M(low_cardinality_key_fixed_string) \
-
-    /// NOLINTNEXTLINE
-    #define APPLY_FOR_VARIANTS_NOT_CONVERTIBLE_TO_TWO_LEVEL(M) \
-        M(key8)             \
-        M(key16)            \
-        M(nullable_key8) \
-        M(nullable_key16) \
-        M(keys16)           \
-        M(key64_hash64)     \
-        M(key_string_hash64)\
-        M(key_fixed_string_hash64) \
-        M(keys128_hash64)   \
-        M(keys256_hash64)   \
-        M(serialized_hash64) \
-        M(nullable_serialized_hash64) \
-        M(prealloc_serialized_hash64) \
-        M(nullable_prealloc_serialized_hash64) \
-        M(low_cardinality_key8) \
-        M(low_cardinality_key16) \
-
-    /// NOLINTNEXTLINE
-    #define APPLY_FOR_VARIANTS_SINGLE_LEVEL(M) \
-        APPLY_FOR_VARIANTS_NOT_CONVERTIBLE_TO_TWO_LEVEL(M) \
-        APPLY_FOR_VARIANTS_CONVERTIBLE_TO_TWO_LEVEL(M) \
-
-    bool isConvertibleToTwoLevel() const
-    {
-        switch (type)
-        {
-        #define M(NAME) \
-            case Type::NAME: return true;
-
-            APPLY_FOR_VARIANTS_CONVERTIBLE_TO_TWO_LEVEL(M)
-
-        #undef M
-            default:
-                return false;
-        }
-    }
-
-    void convertToTwoLevel();
-
-    /// NOLINTNEXTLINE
-    #define APPLY_FOR_VARIANTS_TWO_LEVEL(M) \
-        M(key32_two_level)            \
-        M(key64_two_level)            \
-        M(key_string_two_level)       \
-        M(key_fixed_string_two_level) \
-        M(keys32_two_level)           \
-        M(keys64_two_level)           \
-        M(keys128_two_level)          \
-        M(keys256_two_level)          \
-        M(serialized_two_level)       \
-        M(nullable_serialized_two_level)       \
-        M(prealloc_serialized_two_level)       \
-        M(nullable_prealloc_serialized_two_level)       \
-        M(nullable_key32_two_level) \
-        M(nullable_key64_two_level) \
-        M(nullable_key_string_two_level) \
-        M(nullable_key_fixed_string_two_level) \
-        M(nullable_keys128_two_level) \
-        M(nullable_keys256_two_level) \
-        M(low_cardinality_key32_two_level) \
-        M(low_cardinality_key64_two_level) \
-        M(low_cardinality_keys128_two_level) \
-        M(low_cardinality_keys256_two_level) \
-        M(low_cardinality_key_string_two_level) \
-        M(low_cardinality_key_fixed_string_two_level) \
-
-    #define APPLY_FOR_LOW_CARDINALITY_VARIANTS(M) \
-        M(low_cardinality_key8) \
-        M(low_cardinality_key16) \
-        M(low_cardinality_key32) \
-        M(low_cardinality_key64) \
-        M(low_cardinality_keys128) \
-        M(low_cardinality_keys256) \
-        M(low_cardinality_key_string) \
-        M(low_cardinality_key_fixed_string) \
-        M(low_cardinality_key32_two_level) \
-        M(low_cardinality_key64_two_level) \
-        M(low_cardinality_keys128_two_level) \
-        M(low_cardinality_keys256_two_level) \
-        M(low_cardinality_key_string_two_level) \
-        M(low_cardinality_key_fixed_string_two_level)
-
-    bool isLowCardinality() const
-    {
-        switch (type)
-        {
-        #define M(NAME) \
-            case Type::NAME: return true;
-
-            APPLY_FOR_LOW_CARDINALITY_VARIANTS(M)
-        #undef M
-            default:
-                return false;
-        }
-    }
-
-    static HashMethodContextPtr createCache(Type type, const HashMethodContext::Settings & settings)
-    {
-        switch (type)
-        {
-            case Type::without_key: return nullptr;
-
-            #define M(NAME, IS_TWO_LEVEL) \
-            case Type::NAME: \
-            { \
-                using TPtr ## NAME = decltype(AggregatedDataVariants::NAME); \
-                using T ## NAME = typename TPtr ## NAME ::element_type; \
-                return T ## NAME ::State::createContext(settings); \
-            }
-
-            APPLY_FOR_AGGREGATED_VARIANTS(M)
-            #undef M
-
-            default:
-                throw Exception(ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT, "Unknown aggregated data variant.");
-        }
-    }
-};
-
-using AggregatedDataVariantsPtr = std::shared_ptr<AggregatedDataVariants>;
-using ManyAggregatedDataVariants = std::vector<AggregatedDataVariantsPtr>;
-using ManyAggregatedDataVariantsPtr = std::shared_ptr<ManyAggregatedDataVariants>;
-
 class CompiledAggregateFunctionsHolder;
 class NativeWriter;
 struct OutputBlockColumns;
@@ -1395,7 +419,7 @@ private:
         AggregateDataPtr overflow_row) const;
 
     /// Specialization for a particular value no_more_keys.
-    template <bool no_more_keys, bool use_compiled_functions, bool prefetch, typename Method, typename State>
+    template <bool prefetch, typename Method, typename State>
     void executeImplBatch(
         Method & method,
         State & state,
@@ -1403,17 +427,31 @@ private:
         size_t row_begin,
         size_t row_end,
         AggregateFunctionInstruction * aggregate_instructions,
+        bool no_more_keys,
         bool all_keys_are_const,
+        bool use_compiled_functions,
         AggregateDataPtr overflow_row) const;
 
+    void executeAggregateInstructions(
+        Arena * aggregates_pool,
+        size_t row_begin,
+        size_t row_end,
+        AggregateFunctionInstruction * aggregate_instructions,
+        const std::unique_ptr<AggregateDataPtr[]> & places,
+        size_t key_start,
+        bool has_only_one_value_since_last_reset,
+        bool no_more_keys,
+        bool all_keys_are_const,
+        bool use_compiled_functions) const;
+
     /// For case when there are no keys (all aggregate into one row).
-    template <bool use_compiled_functions>
     void executeWithoutKeyImpl(
         AggregatedDataWithoutKey & res,
         size_t row_begin,
         size_t row_end,
         AggregateFunctionInstruction * aggregate_instructions,
-        Arena * arena) const;
+        Arena * arena,
+        bool use_compiled_functions) const;
 
     template <typename Method>
     void writeToTemporaryFileImpl(
@@ -1429,8 +467,9 @@ private:
             Arena * arena) const;
 
     /// Merge data from hash table `src` into `dst`.
-    template <typename Method, bool use_compiled_functions, bool prefetch, typename Table>
-    void mergeDataImpl(Table & table_dst, Table & table_src, Arena * arena) const;
+    // template <typename Method, bool use_compiled_functions, booal prefetch, typename Table>
+    template <typename Method, typename Table>
+    void mergeDataImpl(Table & table_dst, Table & table_src, Arena * arena, bool use_compiled_functions, bool prefetch) const;
 
     /// Merge data from hash table `src` into `dst`, but only for keys that already exist in dst. In other cases, merge the data into `overflows`.
     template <typename Method, typename Table>
@@ -1456,10 +495,11 @@ private:
 
     template <bool return_single_block>
     using ConvertToBlockRes = std::conditional_t<return_single_block, Block, BlocksList>;
+    using ConvertToBlockResVariant = std::variant<Block, BlocksList>;
 
-    template <bool return_single_block, typename Method, typename Table>
-    ConvertToBlockRes<return_single_block>
-    convertToBlockImpl(Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, bool final, size_t rows) const;
+    template <typename Method, typename Table>
+    ConvertToBlockResVariant
+    convertToBlockImpl(Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, bool final, size_t rows, bool return_single_block) const;
 
     template <typename Mapped>
     void insertAggregatesIntoColumns(
@@ -1470,13 +510,18 @@ private:
     template <bool use_compiled_functions>
     Block insertResultsIntoColumns(PaddedPODArray<AggregateDataPtr> & places, OutputBlockColumns && out_cols, Arena * arena, bool has_null_key_data) const;
 
-    template <typename Method, bool use_compiled_functions, bool return_single_block, typename Table>
-    ConvertToBlockRes<return_single_block>
-    convertToBlockImplFinal(Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, size_t rows) const;
+    template <typename Method, typename Table>
+    ConvertToBlockResVariant convertToBlockImplFinal(
+        Method & method,
+        Table & data,
+        Arena * arena,
+        Arenas & aggregates_pools,
+        bool use_compiled_functions,
+        bool return_single_block) const;
 
-    template <bool return_single_block, typename Method, typename Table>
-    ConvertToBlockRes<return_single_block>
-    convertToBlockImplNotFinal(Method & method, Table & data, Arenas & aggregates_pools, size_t rows) const;
+    template <typename Method, typename Table>
+    ConvertToBlockResVariant
+    convertToBlockImplNotFinal(Method & method, Table & data, Arenas & aggregates_pools, size_t rows, bool return_single_block) const;
 
     template <typename Method>
     Block convertOneBucketToBlock(