mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-23 08:02:02 +00:00
Merge pull request #6058 from yandex/uniq-fix-non-deterministic-result
Fix non-deterministic result of "uniq" aggregate function in extreme rare cases.
This commit is contained in:
commit
6f96102fe9
@ -126,20 +126,32 @@ private:
|
||||
{
|
||||
for (size_t i = 0; i < buf_size(); ++i)
|
||||
{
|
||||
if (buf[i] && !good(buf[i]))
|
||||
if (buf[i])
|
||||
{
|
||||
buf[i] = 0;
|
||||
--m_size;
|
||||
if (!good(buf[i]))
|
||||
{
|
||||
buf[i] = 0;
|
||||
--m_size;
|
||||
}
|
||||
/** After removing the elements, there may have been room for items,
|
||||
* which were placed further than necessary, due to a collision.
|
||||
* You need to move them.
|
||||
*/
|
||||
else if (i != place(buf[i]))
|
||||
{
|
||||
HashValue x = buf[i];
|
||||
buf[i] = 0;
|
||||
reinsertImpl(x);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** After removing the elements, there may have been room for items,
|
||||
* which were placed further than necessary, due to a collision.
|
||||
* You need to move them.
|
||||
/** We must process first collision resolution chain once again.
|
||||
* Look at the comment in "resize" function.
|
||||
*/
|
||||
for (size_t i = 0; i < buf_size(); ++i)
|
||||
for (size_t i = 0; i < buf_size() && buf[i]; ++i)
|
||||
{
|
||||
if (unlikely(buf[i] && i != place(buf[i])))
|
||||
if (i != place(buf[i]))
|
||||
{
|
||||
HashValue x = buf[i];
|
||||
buf[i] = 0;
|
||||
|
@ -0,0 +1,2 @@
|
||||
80041
|
||||
80041
|
133
dbms/tests/queries/0_stateless/00973_uniq_non_associativity.sql
Normal file
133
dbms/tests/queries/0_stateless/00973_uniq_non_associativity.sql
Normal file
@ -0,0 +1,133 @@
|
||||
/* Aggregate function 'uniq' is intended to be associative and provide deterministic results regardless to the schedule of query execution threads and remote servers in a cluster.
|
||||
* But due to subtle bug in implementation it is not associative in very rare cases.
|
||||
* In this test we fill data structure with specific pattern that reproduces this behaviour.
|
||||
*/
|
||||
|
||||
DROP TABLE IF EXISTS part_a;
|
||||
DROP TABLE IF EXISTS part_b;
|
||||
DROP TABLE IF EXISTS part_c;
|
||||
DROP TABLE IF EXISTS part_d;
|
||||
|
||||
/* Create values that will resize hash table to the maximum (131072 cells) and fill it with less than max_fill (65536 cells)
|
||||
* and occupy cells near the end except last 10 cells:
|
||||
* [ ----------- ]
|
||||
* Pick values that will vanish if table will be rehashed.
|
||||
*/
|
||||
CREATE TABLE part_a ENGINE = TinyLog AS SELECT * FROM
|
||||
(
|
||||
WITH
|
||||
number AS k1,
|
||||
bitXor(k1, bitShiftRight(k1, 33)) AS k2,
|
||||
k2 * 0xff51afd7ed558ccd AS k3,
|
||||
bitXor(k3, bitShiftRight(k3, 33)) AS k4,
|
||||
k4 * 0xc4ceb9fe1a85ec53 AS k5,
|
||||
bitXor(k5, bitShiftRight(k5, 33)) AS k6,
|
||||
k6 AS hash,
|
||||
bitShiftRight(hash, 15) % 0x20000 AS place,
|
||||
hash % 2 = 0 AS will_remain
|
||||
SELECT hash, number, place FROM system.numbers WHERE place >= 90000 AND place < 131062 AND NOT will_remain LIMIT 1 BY place LIMIT 41062
|
||||
) ORDER BY place;
|
||||
|
||||
/* Create values that will resize hash table to the maximum (131072 cells) and fill it with less than max_fill (65536 cells),
|
||||
* but if we use both "a" and "b", it will force rehash.
|
||||
* [ ----------- ]
|
||||
* Pick values that will remain after rehash.
|
||||
*/
|
||||
CREATE TABLE part_b ENGINE = TinyLog AS SELECT * FROM
|
||||
(
|
||||
WITH
|
||||
number AS k1,
|
||||
bitXor(k1, bitShiftRight(k1, 33)) AS k2,
|
||||
k2 * 0xff51afd7ed558ccd AS k3,
|
||||
bitXor(k3, bitShiftRight(k3, 33)) AS k4,
|
||||
k4 * 0xc4ceb9fe1a85ec53 AS k5,
|
||||
bitXor(k5, bitShiftRight(k5, 33)) AS k6,
|
||||
k6 AS hash,
|
||||
bitShiftRight(hash, 15) % 0x20000 AS place,
|
||||
hash % 2 = 0 AS will_remain
|
||||
SELECT hash, number, place FROM system.numbers WHERE place >= 50000 AND place < 90000 AND will_remain LIMIT 1 BY place LIMIT 40000
|
||||
) ORDER BY place;
|
||||
|
||||
/* Occupy 10 cells near the end of "a":
|
||||
* a: [ ----------- ]
|
||||
* c: [ -- ]
|
||||
* If we insert "a" then "c", these values will be placed at the end of hash table due to collision resolution:
|
||||
* a + c: [ aaaaaaaaaaacc]
|
||||
*/
|
||||
CREATE TABLE part_c ENGINE = TinyLog AS SELECT * FROM
|
||||
(
|
||||
WITH
|
||||
number AS k1,
|
||||
bitXor(k1, bitShiftRight(k1, 33)) AS k2,
|
||||
k2 * 0xff51afd7ed558ccd AS k3,
|
||||
bitXor(k3, bitShiftRight(k3, 33)) AS k4,
|
||||
k4 * 0xc4ceb9fe1a85ec53 AS k5,
|
||||
bitXor(k5, bitShiftRight(k5, 33)) AS k6,
|
||||
k6 AS hash,
|
||||
bitShiftRight(hash, 15) % 0x20000 AS place,
|
||||
hash % 2 = 0 AS will_remain
|
||||
SELECT hash, number, place FROM system.numbers WHERE place >= 131052 AND place < 131062 AND will_remain AND hash NOT IN (SELECT hash FROM part_a) LIMIT 1 BY place LIMIT 10
|
||||
) ORDER BY place;
|
||||
|
||||
/* Occupy 10 cells at the end of hash table, after "a":
|
||||
* a: [ ----------- ]
|
||||
* d: [ --]
|
||||
* a + d: [ aaaaaaaaaaadd]
|
||||
* But if we insert "a" then "c" then "d", these values will be placed at the beginning of the hash table due to collision resolution:
|
||||
* a+c+d: [dd aaaaaaaaaaacc]
|
||||
*/
|
||||
CREATE TABLE part_d ENGINE = TinyLog AS SELECT * FROM
|
||||
(
|
||||
WITH
|
||||
number AS k1,
|
||||
bitXor(k1, bitShiftRight(k1, 33)) AS k2,
|
||||
k2 * 0xff51afd7ed558ccd AS k3,
|
||||
bitXor(k3, bitShiftRight(k3, 33)) AS k4,
|
||||
k4 * 0xc4ceb9fe1a85ec53 AS k5,
|
||||
bitXor(k5, bitShiftRight(k5, 33)) AS k6,
|
||||
k6 AS hash,
|
||||
bitShiftRight(hash, 15) % 0x20000 AS place,
|
||||
hash % 2 = 0 AS will_remain
|
||||
SELECT hash, number, place FROM system.numbers WHERE place >= 131062 AND will_remain LIMIT 1 BY place LIMIT 10
|
||||
) ORDER BY place;
|
||||
|
||||
/** What happens if we insert a then c then d then b?
|
||||
* Insertion of b forces rehash.
|
||||
* a will be removed, but c, d, b remain:
|
||||
* [dd bbbbbbbbbb cc]
|
||||
* Then we go through hash table and move elements to better places in collision resolution chain.
|
||||
* c will be moved left to their right place:
|
||||
* [dd bbbbbbbbbb cc ]
|
||||
*
|
||||
* And d must be moved also:
|
||||
* [ bbbbbbbbbb ccdd]
|
||||
* But our algorithm was incorrect and it doesn't happen.
|
||||
*
|
||||
* If we insert d again, it will be placed twice because original d will not found:
|
||||
* [dd bbbbbbbbbb ccdd]
|
||||
* This will lead to slightly higher return value of "uniq" aggregate function and it is dependent on insertion order.
|
||||
*/
|
||||
|
||||
|
||||
SET max_threads = 1;
|
||||
|
||||
/** Results of these two queries must match: */
|
||||
|
||||
SELECT uniq(number) FROM (
|
||||
SELECT * FROM part_a
|
||||
UNION ALL SELECT * FROM part_c
|
||||
UNION ALL SELECT * FROM part_d
|
||||
UNION ALL SELECT * FROM part_b);
|
||||
|
||||
SELECT uniq(number) FROM (
|
||||
SELECT * FROM part_a
|
||||
UNION ALL SELECT * FROM part_c
|
||||
UNION ALL SELECT * FROM part_d
|
||||
UNION ALL SELECT * FROM part_b
|
||||
UNION ALL SELECT * FROM part_d);
|
||||
|
||||
|
||||
DROP TABLE part_a;
|
||||
DROP TABLE part_b;
|
||||
DROP TABLE part_c;
|
||||
DROP TABLE part_d;
|
Loading…
Reference in New Issue
Block a user