Do not use more then 98K of memory for uniqCombined*

uniqCombined() uses hashtable for medium cardinality, and since
HashTable resize by the power of 2 (well actually HashTableGrower grows
double by the power of 2, hence HashTableGrower::increaseSize() should
be overwritten to change this), with 1<<13 (default for uniqCombined)
and UInt64 HashValueType, the HashTable will takes:

  getBufferSizeInBytes() == 131072

While it should be not greater then sizeof(HLL) ~= 98K, so reduce the
maximum cardinality for hashtable to 1<<12 with UInt64 HashValueType and
to 1<13 with UInt32, overwrite HashTableGrower::increaseSize() and cover
this using max_memory_usage.

Refs: https://github.com/ClickHouse/ClickHouse/pull/7221#issuecomment-539672742

v2: cover uniqCombined() with non-default K
This commit is contained in:
Azat Khuzhin 2019-10-08 23:26:31 +03:00
parent 15deedb420
commit e373862c83
3 changed files with 71 additions and 3 deletions

View File

@ -66,6 +66,11 @@ namespace detail
} }
// Unlike HashTableGrower always grows to power of 2.
struct UniqCombinedHashTableGrower : public HashTableGrower<>
{
void increaseSize() { ++size_degree; }
};
template <typename Key, UInt8 K> template <typename Key, UInt8 K>
struct AggregateFunctionUniqCombinedDataWithKey struct AggregateFunctionUniqCombinedDataWithKey
@ -76,7 +81,7 @@ struct AggregateFunctionUniqCombinedDataWithKey
// We want to migrate from |HashSet| to |HyperLogLogCounter| when the sizes in memory become almost equal. // We want to migrate from |HashSet| to |HyperLogLogCounter| when the sizes in memory become almost equal.
// The size per element in |HashSet| is sizeof(Key)*2 bytes, and the overall size of |HyperLogLogCounter| is 2^K * 6 bits. // The size per element in |HashSet| is sizeof(Key)*2 bytes, and the overall size of |HyperLogLogCounter| is 2^K * 6 bits.
// For Key=UInt32 we can calculate: 2^X * 4 * 2 ≤ 2^(K-3) * 6 ⇒ X ≤ K-4. // For Key=UInt32 we can calculate: 2^X * 4 * 2 ≤ 2^(K-3) * 6 ⇒ X ≤ K-4.
using Set = CombinedCardinalityEstimator<Key, HashSet<Key, TrivialHash, HashTableGrower<>>, 16, K - 4, K, TrivialHash, Key>; using Set = CombinedCardinalityEstimator<Key, HashSet<Key, TrivialHash, UniqCombinedHashTableGrower>, 16, K - 5 + (sizeof(Key) == sizeof(UInt32)), K, TrivialHash, Key>;
Set set; Set set;
}; };
@ -85,9 +90,9 @@ template <typename Key>
struct AggregateFunctionUniqCombinedDataWithKey<Key, 17> struct AggregateFunctionUniqCombinedDataWithKey<Key, 17>
{ {
using Set = CombinedCardinalityEstimator<Key, using Set = CombinedCardinalityEstimator<Key,
HashSet<Key, TrivialHash, HashTableGrower<>>, HashSet<Key, TrivialHash, UniqCombinedHashTableGrower>,
16, 16,
13, 12 + (sizeof(Key) == sizeof(UInt32)),
17, 17,
TrivialHash, TrivialHash,
Key, Key,

View File

@ -0,0 +1,14 @@
UInt32
819200
UInt64
409600
K=16
UInt32
409600
UInt64
204800
K=18
UInt32
1638400
UInt64
819200

View File

@ -0,0 +1,49 @@
-- each uniqCombined state should not use > sizeof(HLL) in memory,
-- sizeof(HLL) is (2^K * 6 / 8)
-- hence max_memory_usage for 100 rows = (96<<10)*100 = 9830400
-- HashTable for UInt32 (used until (1<<13) elements), hence 8192 elements
SELECT 'UInt32';
SET max_memory_usage = 4000000;
SELECT sum(u) FROM (SELECT intDiv(number, 8192) AS k, uniqCombined(number % 8192) u FROM numbers(toUInt64(8192 * 100)) GROUP BY k); -- { serverError 241 }
SET max_memory_usage = 9830400;
SELECT sum(u) FROM (SELECT intDiv(number, 8192) AS k, uniqCombined(number % 8192) u FROM numbers(toUInt64(8192 * 100)) GROUP BY k);
-- HashTable for UInt64 (used until (1<<12) elements), hence 4096 elements
SELECT 'UInt64';
SET max_memory_usage = 4000000;
SELECT sum(u) FROM (SELECT intDiv(number, 4096) AS k, uniqCombined(reinterpretAsString(number % 4096)) u FROM numbers(toUInt64(4096 * 100)) GROUP BY k); -- { serverError 241 }
SET max_memory_usage = 9830400;
SELECT sum(u) FROM (SELECT intDiv(number, 4096) AS k, uniqCombined(reinterpretAsString(number % 4096)) u FROM numbers(toUInt64(4096 * 100)) GROUP BY k);
SELECT 'K=16';
-- HashTable for UInt32 (used until (1<<12) elements), hence 4096 elements
SELECT 'UInt32';
SET max_memory_usage = 2000000;
SELECT sum(u) FROM (SELECT intDiv(number, 4096) AS k, uniqCombined(16)(number % 4096) u FROM numbers(toUInt64(4096 * 100)) GROUP BY k); -- { serverError 241 }
SET max_memory_usage = 4915200;
SELECT sum(u) FROM (SELECT intDiv(number, 4096) AS k, uniqCombined(16)(number % 4096) u FROM numbers(toUInt64(4096 * 100)) GROUP BY k);
-- HashTable for UInt64 (used until (1<<11) elements), hence 2048 elements
SELECT 'UInt64';
SET max_memory_usage = 2000000;
SELECT sum(u) FROM (SELECT intDiv(number, 2048) AS k, uniqCombined(16)(reinterpretAsString(number % 2048)) u FROM numbers(toUInt64(2048 * 100)) GROUP BY k); -- { serverError 241 }
SET max_memory_usage = 4915200;
SELECT sum(u) FROM (SELECT intDiv(number, 2048) AS k, uniqCombined(16)(reinterpretAsString(number % 2048)) u FROM numbers(toUInt64(2048 * 100)) GROUP BY k);
SELECT 'K=18';
-- HashTable for UInt32 (used until (1<<14) elements), hence 16384 elements
SELECT 'UInt32';
SET max_memory_usage = 8000000;
SELECT sum(u) FROM (SELECT intDiv(number, 16384) AS k, uniqCombined(18)(number % 16384) u FROM numbers(toUInt64(16384 * 100)) GROUP BY k); -- { serverError 241 }
SET max_memory_usage = 19660800;
SELECT sum(u) FROM (SELECT intDiv(number, 16384) AS k, uniqCombined(18)(number % 16384) u FROM numbers(toUInt64(16384 * 100)) GROUP BY k);
-- HashTable for UInt64 (used until (1<<13) elements), hence 8192 elements
SELECT 'UInt64';
SET max_memory_usage = 8000000;
SELECT sum(u) FROM (SELECT intDiv(number, 8192) AS k, uniqCombined(18)(reinterpretAsString(number % 8192)) u FROM numbers(toUInt64(8192 * 100)) GROUP BY k); -- { serverError 241 }
SET max_memory_usage = 19660800;
SELECT sum(u) FROM (SELECT intDiv(number, 8192) AS k, uniqCombined(18)(reinterpretAsString(number % 8192)) u FROM numbers(toUInt64(8192 * 100)) GROUP BY k);