From 35231662b30d7ae1f4ed7b093735599a21b37d64 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Mon, 16 Nov 2020 21:50:25 +0300
Subject: [PATCH] Improve performance of AggregatingMergeTree w/ SimpleAggregateFunction(String)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While reading from an AggregatingMergeTree with SimpleAggregateFunction(String) in the primary key
and optimize_aggregation_in_order enabled, perf top shows:

    Samples: 1M of event 'cycles', 4000 Hz, Event count (approx.): 287759760270 lost: 0/0 drop: 0/0
      Children      Self  Shared Object  Symbol
    +   12.64%    11.39%  clickhouse     [.] memcpy
    +    9.08%     0.23%  [unknown]      [.] 0000000000000000
    +    8.45%     8.40%  clickhouse     [.] ProfileEvents::increment  # <-- this, and in debug it has not 0.08x overhead, but 5.8x overhead
    +    7.68%     7.67%  clickhouse     [.] LZ4_compress_fast_extState
    +    5.29%     5.22%  clickhouse     [.] DB::IAggregateFunctionHelper<...>::addFree

The reason is obvious: ProfileEvents are atomic counters (and they are also nested):
```
Samples: 7M of event 'cycles', 4000 Hz, Event count (approx.): 450726149337
ProfileEvents::increment  /usr/bin/clickhouse [Percent: local period]
Percent│
       │
       │    Disassembly of section .text:
       │
       │    00000000078d8900 <ProfileEvents::increment(unsigned long, unsigned long)>:
       │    ProfileEvents::increment(unsigned long, unsigned long):
  0.17 │      push   %rbp
  0.00 │      mov    %rsi,%rbp
  0.04 │      push   %rbx
  0.20 │      mov    %rdi,%rbx
  0.17 │      sub    $0x8,%rsp
  0.26 │    → callq  DB::CurrentThread::getProfileEvents
       │    ProfileEvents::Counters::increment(unsigned long, unsigned long):
  0.00 │      lea    0x0(,%rbx,8),%rdi
  0.05 │      nop
       │    unsigned long std::__1::__cxx_atomic_fetch_add<unsigned long>(std::__1::__cxx_atomic_base_impl<unsigned long>*, unsigned long, std::__1::memory_order):
  1.02 │      mov    (%rax),%rdx
 97.04 │      lock   add    %rbp,(%rdx,%rdi,1)
       │    ProfileEvents::Counters::increment(unsigned long, unsigned long):
  0.21 │      mov    0x10(%rax),%rax
  0.04 │      test   %rax,%rax
  0.00 │    → jne    78d8920
       │    ProfileEvents::increment(unsigned long, unsigned long):
  0.38 │      add    $0x8,%rsp
  0.00 │      pop    %rbx
  0.04 │      pop    %rbp
  0.38 │    ← retq
```
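To make the nesting above concrete, here is a rough stand-alone sketch of what such nested event counters boil down to. It is only an illustration (the struct, member names, and event-table size are invented, not the actual ClickHouse ProfileEvents implementation): every increment does one LOCK-prefixed atomic add per nesting level while walking a parent chain, which is exactly the `lock add` / `jne` loop visible in the annotation.

```cpp
#include <atomic>
#include <cstddef>
#include <cstdint>

/// Illustrative only: a simplified model of nested event counters
/// (e.g. per-thread -> per-query -> global). Every increment is an
/// atomic read-modify-write per level, so the hot path pays one
/// LOCK-prefixed add for each level of nesting.
struct Counters
{
    static constexpr size_t NUM_EVENTS = 256;

    std::atomic<uint64_t> counters[NUM_EVENTS] {};
    Counters * parent = nullptr;

    void increment(size_t event, uint64_t amount = 1)
    {
        /// Walk up the parent chain, doing one atomic add per level.
        for (Counters * current = this; current != nullptr; current = current->parent)
            current->counters[event].fetch_add(amount, std::memory_order_relaxed);
    }
};

int main()
{
    Counters global;
    Counters thread_counters;
    thread_counters.parent = &global;

    /// One call -> two LOCK-prefixed adds (thread-level and global).
    thread_counters.increment(/* event = */ 42, /* amount = */ 1);
}
```

The patch below does not change this mechanism; it simply avoids hitting it millions of times per second from Arena construction.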
These ProfileEvents were ArenaAllocChunks (perf shows ~1.5M events per second), and the reason
is that the table has SimpleAggregateFunction(String) in the PK, which requires an Arena.
But most of the time the Arena wasn't even used, so avoid this cost by re-creating the Arena
only if it was actually "used" (i.e. it has new chunks). A stand-alone sketch of this pattern
is included after the diff.

Another possibility is to avoid populating Arena::head in the ctor, but that would make the
Arena code more complex, so for now this approach was preferred.

Also, as a long-term solution, it is worth looking into implementing these counters via RCU
(to move the extra overhead out of the write code path and into the read side).
---
 .../Algorithms/AggregatingSortedAlgorithm.cpp | 20 ++++++++++++++++--
 .../Algorithms/AggregatingSortedAlgorithm.h   |  1 +
 ..._tree_simple_aggregate_function_string.xml | 21 +++++++++++++++++++
 3 files changed, 40 insertions(+), 2 deletions(-)
 create mode 100644 tests/performance/aggregating_merge_tree_simple_aggregate_function_string.xml

diff --git a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp
index b834ed82729..6b4746b1320 100644
--- a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp
+++ b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp
@@ -195,7 +195,14 @@ AggregatingSortedAlgorithm::AggregatingMergedData::AggregatingMergedData(
     MutableColumns columns_, UInt64 max_block_size_, ColumnsDefinition & def_)
     : MergedData(std::move(columns_), false, max_block_size_), def(def_)
 {
-    initAggregateDescription();
+    initAggregateDescription();
+
+    /// Just to make startGroup() simpler.
+    if (def.allocates_memory_in_arena)
+    {
+        arena = std::make_unique<Arena>();
+        arena_size = arena->size();
+    }
 }
 
 void AggregatingSortedAlgorithm::AggregatingMergedData::startGroup(const ColumnRawPtrs & raw_columns, size_t row)
@@ -212,8 +219,17 @@ void AggregatingSortedAlgorithm::AggregatingMergedData::startGroup(const ColumnR
     for (auto & desc : def.columns_to_simple_aggregate)
         desc.createState();
 
-    if (def.allocates_memory_in_arena)
+    /// Re-create the arena if and only if:
+    /// - an arena is required (i.e. SimpleAggregateFunction(any, String) in PK),
+    /// - the arena was used, since otherwise it is too costly to increment atomic counters inside Arena:
+    ///   a SELECT with SimpleAggregateFunction(String) in PK and lots of groups
+    ///   may produce ~1.5M ArenaAllocChunks atomic increments,
+    ///   while the LOCK prefix is too costly for the CPU (~10% overhead here).
+    if (def.allocates_memory_in_arena && arena->size() > arena_size)
+    {
         arena = std::make_unique<Arena>();
+        arena_size = arena->size();
+    }
 
     is_group_started = true;
 }
diff --git a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.h b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.h
index da4ec876b69..e572ed7d526 100644
--- a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.h
+++ b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.h
@@ -73,6 +73,7 @@ private:
         /// Memory pool for SimpleAggregateFunction
         /// (only when allocates_memory_in_arena == true).
         std::unique_ptr<Arena> arena;
+        size_t arena_size = 0;
 
         bool is_group_started = false;
 
diff --git a/tests/performance/aggregating_merge_tree_simple_aggregate_function_string.xml b/tests/performance/aggregating_merge_tree_simple_aggregate_function_string.xml
new file mode 100644
index 00000000000..d62f8d8e088
--- /dev/null
+++ b/tests/performance/aggregating_merge_tree_simple_aggregate_function_string.xml
@@ -0,0 +1,21 @@
+<test>
+    <create_query>
+        CREATE TABLE bench
+        ENGINE = AggregatingMergeTree()
+        ORDER BY key
+        SETTINGS index_granularity = 8192
+        AS
+        SELECT CAST(reinterpretAsString(number), 'SimpleAggregateFunction(any, String)') AS key
+        FROM numbers_mt(toUInt64(10e6))
+    </create_query>
+
+    <query>
+        SELECT *
+        FROM bench
+        GROUP BY key
+        SETTINGS optimize_aggregation_in_order = 1, max_threads = 16
+        FORMAT Null
+    </query>
+
+    <drop_query>DROP TABLE IF EXISTS bench</drop_query>
+</test>
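For readers outside the ClickHouse codebase, the pattern the patch applies can be sketched in isolation. This is a minimal illustration under stated assumptions: `Arena` below is a hypothetical stand-in (not the real DB::Arena API), and `GroupState::startGroup()` only mimics the relevant part of AggregatingMergedData::startGroup(). The idea: remember the arena's size right after it is (re)created, and pay for a fresh arena only when the previous group actually allocated from it.

```cpp
#include <cstddef>
#include <memory>
#include <vector>

/// Hypothetical stand-in for an arena allocator; the only property that
/// matters here is that size() grows whenever a new chunk is allocated
/// (each such allocation is what the ArenaAllocChunks event counts).
struct Arena
{
    std::vector<std::unique_ptr<char[]>> chunks;
    size_t bytes = 0;

    char * alloc(size_t n)
    {
        chunks.emplace_back(new char[n]);
        bytes += n;
        return chunks.back().get();
    }

    size_t size() const { return bytes; }
};

/// The pattern from the patch: keep one arena across groups and re-create it
/// only if the previous group actually allocated from it, instead of paying
/// the allocation (and profiling-counter) cost unconditionally per group.
struct GroupState
{
    std::unique_ptr<Arena> arena = std::make_unique<Arena>();
    size_t arena_size = arena->size();

    void startGroup()
    {
        if (arena->size() > arena_size)  /// the arena was used -> throw it away
        {
            arena = std::make_unique<Arena>();
            arena_size = arena->size();
        }
        /// otherwise keep the untouched arena for the next group
    }
};

int main()
{
    GroupState state;
    state.startGroup();      /// nothing allocated yet -> arena is kept
    state.arena->alloc(64);  /// this group used the arena
    state.startGroup();      /// -> arena is re-created exactly once
}
```

With this check, a query where most groups never touch the arena allocates it once instead of once per group, which is where the ~1.5M/s ArenaAllocChunks increments (and the `lock add` hotspot above) go away.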