From cb9039e91d073942ff07a68e10336f8db38f4a79 Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Tue, 6 Aug 2024 19:05:12 +0800 Subject: [PATCH] Add thread pool and cancellation if merge data with key This patch will add thread pool and cancellation if merge data with key. During the merge, if the data size is large, we may convert the singleLevelHash to twoLevelHash and merge in parallel. Test the patch with 2 x 80 vCPUs, Q8 and Q9 has got 10.3% and 7.6% performance improvement. Signed-off-by: Jiebin Sun --- .../AggregateFunctionUniq.h | 2 ++ src/AggregateFunctions/IAggregateFunction.h | 12 +++++++--- src/AggregateFunctions/UniqExactSet.h | 7 ++++++ src/Interpreters/Aggregator.cpp | 24 ++++++++++++------- src/Interpreters/Aggregator.h | 4 ++-- .../Transforms/AggregatingTransform.cpp | 2 +- 6 files changed, 36 insertions(+), 15 deletions(-) diff --git a/src/AggregateFunctions/AggregateFunctionUniq.h b/src/AggregateFunctions/AggregateFunctionUniq.h index cef23f766c7..35d6e599e38 100644 --- a/src/AggregateFunctions/AggregateFunctionUniq.h +++ b/src/AggregateFunctions/AggregateFunctionUniq.h @@ -459,6 +459,8 @@ public: bool isParallelizeMergePrepareNeeded() const override { return is_parallelize_merge_prepare_needed; } + constexpr static bool parallelizeMergeWithKey() { return true; } + void parallelizeMergePrepare(AggregateDataPtrs & places, ThreadPool & thread_pool, std::atomic & is_cancelled) const override { if constexpr (is_parallelize_merge_prepare_needed) diff --git a/src/AggregateFunctions/IAggregateFunction.h b/src/AggregateFunctions/IAggregateFunction.h index ee227db6d9d..f8e7051d635 100644 --- a/src/AggregateFunctions/IAggregateFunction.h +++ b/src/AggregateFunctions/IAggregateFunction.h @@ -145,6 +145,8 @@ public: virtual bool isParallelizeMergePrepareNeeded() const { return false; } + constexpr static bool parallelizeMergeWithKey() { return false; } + virtual void parallelizeMergePrepare(AggregateDataPtrs & /*places*/, ThreadPool & /*thread_pool*/, std::atomic & /*is_cancelled*/) const { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "parallelizeMergePrepare() with thread pool parameter isn't implemented for {} ", getName()); @@ -169,7 +171,7 @@ public: /// Merges states (on which src places points to) with other states (on which dst places points to) of current aggregation function /// then destroy states (on which src places points to). - virtual void mergeAndDestroyBatch(AggregateDataPtr * dst_places, AggregateDataPtr * src_places, size_t size, size_t offset, Arena * arena) const = 0; + virtual void mergeAndDestroyBatch(AggregateDataPtr * dst_places, AggregateDataPtr * src_places, size_t size, size_t offset, ThreadPool & thread_pool, std::atomic & is_cancelled, Arena * arena) const = 0; /// Serializes state (to transmit it over the network, for example). virtual void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional version = std::nullopt) const = 0; /// NOLINT @@ -499,11 +501,15 @@ public: static_cast(this)->merge(places[i] + place_offset, rhs[i], arena); } - void mergeAndDestroyBatch(AggregateDataPtr * dst_places, AggregateDataPtr * rhs_places, size_t size, size_t offset, Arena * arena) const override + void mergeAndDestroyBatch(AggregateDataPtr * dst_places, AggregateDataPtr * rhs_places, size_t size, size_t offset, ThreadPool & thread_pool, std::atomic & is_cancelled, Arena * arena) const override { for (size_t i = 0; i < size; ++i) { - static_cast(this)->merge(dst_places[i] + offset, rhs_places[i] + offset, arena); + if constexpr (Derived::parallelizeMergeWithKey()) + static_cast(this)->merge(dst_places[i] + offset, rhs_places[i] + offset, thread_pool, is_cancelled, arena); + else + static_cast(this)->merge(dst_places[i] + offset, rhs_places[i] + offset, arena); + static_cast(this)->destroy(rhs_places[i] + offset); } } diff --git a/src/AggregateFunctions/UniqExactSet.h b/src/AggregateFunctions/UniqExactSet.h index 2ae8c3a8386..25c6f7ac55f 100644 --- a/src/AggregateFunctions/UniqExactSet.h +++ b/src/AggregateFunctions/UniqExactSet.h @@ -101,6 +101,13 @@ public: auto merge(const UniqExactSet & other, ThreadPool * thread_pool = nullptr, std::atomic * is_cancelled = nullptr) { + /// If the size is large, we may convert the singleLevelHash to twoLevelHash and merge in parallel. + if (other.size() > 40000) + { + if (isSingleLevel()) + convertToTwoLevel(); + } + if (isSingleLevel() && other.isTwoLevel()) convertToTwoLevel(); diff --git a/src/Interpreters/Aggregator.cpp b/src/Interpreters/Aggregator.cpp index e073b7a49b6..71be3c1c30c 100644 --- a/src/Interpreters/Aggregator.cpp +++ b/src/Interpreters/Aggregator.cpp @@ -2371,7 +2371,7 @@ void NO_INLINE Aggregator::mergeDataNullKey( template void NO_INLINE Aggregator::mergeDataImpl( - Table & table_dst, Table & table_src, Arena * arena, bool use_compiled_functions [[maybe_unused]], bool prefetch) const + Table & table_dst, Table & table_src, Arena * arena, bool use_compiled_functions [[maybe_unused]], bool prefetch, ThreadPool & thread_pool, std::atomic & is_cancelled) const { if constexpr (Method::low_cardinality_optimization || Method::one_key_nullable_optimization) mergeDataNullKey(table_dst, table_src, arena); @@ -2410,7 +2410,7 @@ void NO_INLINE Aggregator::mergeDataImpl( { if (!is_aggregate_function_compiled[i]) aggregate_functions[i]->mergeAndDestroyBatch( - dst_places.data(), src_places.data(), dst_places.size(), offsets_of_aggregate_states[i], arena); + dst_places.data(), src_places.data(), dst_places.size(), offsets_of_aggregate_states[i], thread_pool, is_cancelled, arena); } return; @@ -2420,7 +2420,7 @@ void NO_INLINE Aggregator::mergeDataImpl( for (size_t i = 0; i < params.aggregates_size; ++i) { aggregate_functions[i]->mergeAndDestroyBatch( - dst_places.data(), src_places.data(), dst_places.size(), offsets_of_aggregate_states[i], arena); + dst_places.data(), src_places.data(), dst_places.size(), offsets_of_aggregate_states[i], thread_pool, is_cancelled, arena); } } @@ -2535,8 +2535,10 @@ void NO_INLINE Aggregator::mergeWithoutKeyDataImpl( template void NO_INLINE Aggregator::mergeSingleLevelDataImpl( - ManyAggregatedDataVariants & non_empty_data) const + ManyAggregatedDataVariants & non_empty_data, std::atomic & is_cancelled) const { + ThreadPool thread_pool{CurrentMetrics::AggregatorThreads, CurrentMetrics::AggregatorThreadsActive, CurrentMetrics::AggregatorThreadsScheduled, params.max_threads}; + AggregatedDataVariantsPtr & res = non_empty_data[0]; bool no_more_keys = false; @@ -2557,13 +2559,13 @@ void NO_INLINE Aggregator::mergeSingleLevelDataImpl( if (compiled_aggregate_functions_holder) { mergeDataImpl( - getDataVariant(*res).data, getDataVariant(current).data, res->aggregates_pool, true, prefetch); + getDataVariant(*res).data, getDataVariant(current).data, res->aggregates_pool, true, prefetch, thread_pool, is_cancelled); } else #endif { mergeDataImpl( - getDataVariant(*res).data, getDataVariant(current).data, res->aggregates_pool, false, prefetch); + getDataVariant(*res).data, getDataVariant(current).data, res->aggregates_pool, false, prefetch, thread_pool, is_cancelled); } } else if (res->without_key) @@ -2589,7 +2591,7 @@ void NO_INLINE Aggregator::mergeSingleLevelDataImpl( #define M(NAME) \ template void NO_INLINE Aggregator::mergeSingleLevelDataImpl( \ - ManyAggregatedDataVariants & non_empty_data) const; + ManyAggregatedDataVariants & non_empty_data, std::atomic & is_cancelled) const; APPLY_FOR_VARIANTS_SINGLE_LEVEL(M) #undef M @@ -2597,6 +2599,8 @@ template void NO_INLINE Aggregator::mergeBucketImpl( ManyAggregatedDataVariants & data, Int32 bucket, Arena * arena, std::atomic & is_cancelled) const { + ThreadPool thread_pool{CurrentMetrics::AggregatorThreads, CurrentMetrics::AggregatorThreadsActive, CurrentMetrics::AggregatorThreadsScheduled, params.max_threads}; + /// We merge all aggregation results to the first. AggregatedDataVariantsPtr & res = data[0]; @@ -2613,7 +2617,7 @@ void NO_INLINE Aggregator::mergeBucketImpl( if (compiled_aggregate_functions_holder) { mergeDataImpl( - getDataVariant(*res).data.impls[bucket], getDataVariant(current).data.impls[bucket], arena, true, prefetch); + getDataVariant(*res).data.impls[bucket], getDataVariant(current).data.impls[bucket], arena, true, prefetch, thread_pool, is_cancelled); } else #endif @@ -2623,7 +2627,9 @@ void NO_INLINE Aggregator::mergeBucketImpl( getDataVariant(current).data.impls[bucket], arena, false, - prefetch); + prefetch, + thread_pool, + is_cancelled); } } } diff --git a/src/Interpreters/Aggregator.h b/src/Interpreters/Aggregator.h index f4f1e9a1df3..c357da2d9de 100644 --- a/src/Interpreters/Aggregator.h +++ b/src/Interpreters/Aggregator.h @@ -455,7 +455,7 @@ private: /// Merge data from hash table `src` into `dst`. template - void mergeDataImpl(Table & table_dst, Table & table_src, Arena * arena, bool use_compiled_functions, bool prefetch) const; + void mergeDataImpl(Table & table_dst, Table & table_src, Arena * arena, bool use_compiled_functions, bool prefetch, ThreadPool & thread_pool, std::atomic & is_cancelled) const; /// Merge data from hash table `src` into `dst`, but only for keys that already exist in dst. In other cases, merge the data into `overflows`. template @@ -478,7 +478,7 @@ private: template void mergeSingleLevelDataImpl( - ManyAggregatedDataVariants & non_empty_data) const; + ManyAggregatedDataVariants & non_empty_data, std::atomic & is_cancelled) const; template using ConvertToBlockRes = std::conditional_t; diff --git a/src/Processors/Transforms/AggregatingTransform.cpp b/src/Processors/Transforms/AggregatingTransform.cpp index cdbe194cfac..5e7fba6d80c 100644 --- a/src/Processors/Transforms/AggregatingTransform.cpp +++ b/src/Processors/Transforms/AggregatingTransform.cpp @@ -495,7 +495,7 @@ private: #define M(NAME) \ else if (first->type == AggregatedDataVariants::Type::NAME) \ - params->aggregator.mergeSingleLevelDataImplNAME)::element_type>(*data); + params->aggregator.mergeSingleLevelDataImplNAME)::element_type>(*data, shared_data->is_cancelled); if (false) {} // NOLINT APPLY_FOR_VARIANTS_SINGLE_LEVEL(M) #undef M