Speed up reading uniqState (#41089)

* Speed up reading UniquesHashSet

* Improve uniq serialization tests
This commit is contained in:
Raúl Marín 2022-09-15 23:41:15 +02:00 committed by GitHub
parent fe4a485da8
commit 6dac509739
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 98 additions and 11 deletions

View File

@ -424,14 +424,30 @@ public:
alloc(new_size_degree);
for (size_t i = 0; i < m_size; ++i)
if (m_size <= 1)
{
HashValue x = 0;
DB::readIntBinary(x, rb);
if (x == 0)
has_zero = true;
else
reinsertImpl(x);
for (size_t i = 0; i < m_size; ++i)
{
HashValue x = 0;
DB::readIntBinary(x, rb);
if (x == 0)
has_zero = true;
else
reinsertImpl(x);
}
}
else
{
auto hs = std::make_unique<HashValue[]>(m_size);
rb.readStrict(reinterpret_cast<char *>(hs.get()), m_size * sizeof(HashValue));
for (size_t i = 0; i < m_size; ++i)
{
if (hs[i] == 0)
has_zero = true;
else
reinsertImpl(hs[i]);
}
}
}
@ -458,11 +474,24 @@ public:
resize(new_size_degree);
}
for (size_t i = 0; i < rhs_size; ++i)
if (rhs_size <= 1)
{
HashValue x = 0;
DB::readIntBinary(x, rb);
insertHash(x);
for (size_t i = 0; i < rhs_size; ++i)
{
HashValue x = 0;
DB::readIntBinary(x, rb);
insertHash(x);
}
}
else
{
auto hs = std::make_unique<HashValue[]>(rhs_size);
rb.readStrict(reinterpret_cast<char *>(hs.get()), rhs_size * sizeof(HashValue));
for (size_t i = 0; i < rhs_size; ++i)
{
insertHash(hs[i]);
}
}
}

View File

@ -0,0 +1,58 @@
<test>
<!-- Performance test for uniqMerge over stored AggregateFunction(uniq, ...) states.
     Two tables are filled with pre-aggregated uniqState values of very different
     sizes (one element per state vs. thousands of elements per state), and the
     measured queries then merge those stored states on a single thread. -->

<!-- matview_1: uniq states over UInt64 values. Its fill query groups by the same
     column it aggregates, so every stored state holds exactly one value. -->
<create_query>
create table matview_1
(
a String,
b_count AggregateFunction(uniq, UInt64)
) Engine=MergeTree partition by tuple()
ORDER by tuple()
SETTINGS index_granularity = 1024;
</create_query>
<!-- matview_10000: uniq states over String values. Its fill query produces one state
     per key "a", each built from strings that take at most 10000 distinct values. -->
<create_query>
create table matview_10000
(
a String,
b_count AggregateFunction(uniq, String)
) Engine=MergeTree partition by tuple()
ORDER by tuple()
SETTINGS index_granularity = 1024;
</create_query>
<drop_query>DROP TABLE IF EXISTS matview_1</drop_query>
<drop_query>DROP TABLE IF EXISTS matview_10000</drop_query>
<!-- Large states: 20,000,000 source rows, keys drawn from rand() % 1000 (at most 1000
     groups), values drawn from number % 10000 (at most 10000 distinct strings). -->
<fill_query>
INSERT INTO matview_10000
SELECT a, uniqState(b) b_count
FROM
(
SELECT toString(rand() % 1000) a, toString(number % 10000) b
FROM numbers_mt(20000000)
)
GROUP BY a
SETTINGS max_insert_threads=8;
</fill_query>
<!-- Force a merge of the inserted parts before the measured queries run. -->
<fill_query>OPTIMIZE TABLE matview_10000 FINAL</fill_query>
<!-- Tiny states: GROUP BY number over numbers_mt(2000000) yields 2,000,000 rows,
     each carrying a state built from a single value. Column "a" is the constant '1'. -->
<fill_query>
INSERT INTO matview_1
SELECT '1', uniqState(number) b_count
FROM
(
SELECT *
FROM numbers_mt(2000000)
)
GROUP BY number
SETTINGS max_insert_threads=8;
</fill_query>
<!-- Force a merge of the inserted parts before the measured queries run. -->
<fill_query>OPTIMIZE TABLE matview_1 FINAL</fill_query>
<!-- Measured queries with up to ~10000 elements per state: first merging only the
     states stored under the key '55', then merging every stored state in the table.
     All measured queries run with max_threads=1 and write to FORMAT Null. -->
<query>select a, uniqMerge(b_count) as b_count from matview_10000 prewhere a='55' group by a FORMAT Null SETTINGS max_threads=1;</query>
<query>select uniqMerge(b_count) as b_count from matview_10000 FORMAT Null SETTINGS max_threads=1;</query>
<!-- Measured query with exactly 1 element per state: merges all single-value states. -->
<query>select uniqMerge(b_count) as b_count FROM matview_1 FORMAT Null SETTINGS max_threads=1;</query>
</test>