Merge pull request #17109 from azat/perf-AggregatingMergeTree-INSERT

Improve performance of AggregatingMergeTree w/ SimpleAggregateFunction(String) in PK
This commit is contained in:
Alexander Kuzmenkov 2020-12-01 16:27:36 +03:00 committed by GitHub
commit 5ad15e2018
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 45 additions and 2 deletions

View File

@ -195,7 +195,14 @@ AggregatingSortedAlgorithm::AggregatingMergedData::AggregatingMergedData(
MutableColumns columns_, UInt64 max_block_size_, ColumnsDefinition & def_)
: MergedData(std::move(columns_), false, max_block_size_), def(def_)
{
initAggregateDescription();
initAggregateDescription();
/// Just to make startGroup() simpler.
if (def.allocates_memory_in_arena)
{
arena = std::make_unique<Arena>();
arena_size = arena->size();
}
}
void AggregatingSortedAlgorithm::AggregatingMergedData::startGroup(const ColumnRawPtrs & raw_columns, size_t row)
@ -212,8 +219,19 @@ void AggregatingSortedAlgorithm::AggregatingMergedData::startGroup(const ColumnR
for (auto & desc : def.columns_to_simple_aggregate)
desc.createState();
if (def.allocates_memory_in_arena)
/// Frequent Arena creation may be too costly, because the atomic ProfileEvents
/// counters have to be incremented whenever the first Chunk is created -- e.g. a
/// SELECT with SimpleAggregateFunction(String) in PK and lots of groups may produce
/// ~1.5M ArenaAllocChunks atomic increments, and LOCK is too costly for the CPU
/// (~10% overhead here).
/// To avoid this, reset the arena if and only if:
/// - the arena is required (i.e. SimpleAggregateFunction(any, String) in PK),
/// - the arena was used in the previous groups.
if (def.allocates_memory_in_arena && arena->size() > arena_size)
{
arena = std::make_unique<Arena>();
arena_size = arena->size();
}
is_group_started = true;
}

View File

@ -73,6 +73,7 @@ private:
/// Memory pool for SimpleAggregateFunction
/// (only when allocates_memory_in_arena == true).
std::unique_ptr<Arena> arena;
size_t arena_size = 0;
bool is_group_started = false;

View File

@ -0,0 +1,24 @@
<test>
<!--
Performance test for AggregatingMergeTree whose sorting key is a
SimpleAggregateFunction(any, String) column.
Steps: create a table of 5e6 rows from numbers_mt, run OPTIMIZE TABLE on it,
then run a GROUP BY over the key with optimize_aggregation_in_order = 1.
-->
<create_query>
CREATE TABLE bench
ENGINE = AggregatingMergeTree()
ORDER BY key
SETTINGS index_granularity = 8192
AS
SELECT CAST(reinterpretAsString(number), 'SimpleAggregateFunction(any, String)') AS key
FROM numbers_mt(toUInt64(5e6))
SETTINGS max_insert_threads = 16
</create_query>
<fill_query>OPTIMIZE TABLE bench</fill_query>
<!-- The output is discarded (FORMAT Null). -->
<query>
SELECT *
FROM bench
GROUP BY key
SETTINGS optimize_aggregation_in_order = 1, max_threads = 16
FORMAT Null
</query>
<drop_query>DROP TABLE IF EXISTS bench</drop_query>
</test>