From faa07c8440520a66d817bf5fa8512604c8ae81c5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 19 Jul 2019 18:37:45 +0300 Subject: [PATCH] Added a test --- .../00973_uniq_non_associativity.reference | 2 + .../00973_uniq_non_associativity.sql | 133 ++++++++++++++++++ 2 files changed, 135 insertions(+) create mode 100644 dbms/tests/queries/0_stateless/00973_uniq_non_associativity.reference create mode 100644 dbms/tests/queries/0_stateless/00973_uniq_non_associativity.sql diff --git a/dbms/tests/queries/0_stateless/00973_uniq_non_associativity.reference b/dbms/tests/queries/0_stateless/00973_uniq_non_associativity.reference new file mode 100644 index 00000000000..9e0c113389f --- /dev/null +++ b/dbms/tests/queries/0_stateless/00973_uniq_non_associativity.reference @@ -0,0 +1,2 @@ +80041 +80041 diff --git a/dbms/tests/queries/0_stateless/00973_uniq_non_associativity.sql b/dbms/tests/queries/0_stateless/00973_uniq_non_associativity.sql new file mode 100644 index 00000000000..91d62a134f8 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00973_uniq_non_associativity.sql @@ -0,0 +1,133 @@ +/* Aggregate function 'uniq' is intended to be associative and provide deterministic results regardless to the schedule of query execution threads and remote servers in a cluster. + * But due to subtle bug in implementation it is not associative in very rare cases. + * In this test we fill data structure with specific pattern that reproduces this behaviour. + */ + +DROP TABLE IF EXISTS part_a; +DROP TABLE IF EXISTS part_b; +DROP TABLE IF EXISTS part_c; +DROP TABLE IF EXISTS part_d; + +/* Create values that will resize hash table to the maximum (131072 cells) and fill it with less than max_fill (65536 cells) + * and occupy cells near the end except last 10 cells: + * [ ----------- ] + * Pick values that will vanish if table will be rehashed. + */ +CREATE TABLE part_a ENGINE = TinyLog AS SELECT * FROM +( +WITH + number AS k1, + bitXor(k1, bitShiftRight(k1, 33)) AS k2, + k2 * 0xff51afd7ed558ccd AS k3, + bitXor(k3, bitShiftRight(k3, 33)) AS k4, + k4 * 0xc4ceb9fe1a85ec53 AS k5, + bitXor(k5, bitShiftRight(k5, 33)) AS k6, + k6 AS hash, + bitShiftRight(hash, 15) % 0x20000 AS place, + hash % 2 = 0 AS will_remain +SELECT hash, number, place FROM system.numbers WHERE place >= 90000 AND place < 131062 AND NOT will_remain LIMIT 1 BY place LIMIT 41062 +) ORDER BY place; + +/* Create values that will resize hash table to the maximum (131072 cells) and fill it with less than max_fill (65536 cells), + * but if we use both "a" and "b", it will force rehash. + * [ ----------- ] + * Pick values that will remain after rehash. + */ +CREATE TABLE part_b ENGINE = TinyLog AS SELECT * FROM +( +WITH + number AS k1, + bitXor(k1, bitShiftRight(k1, 33)) AS k2, + k2 * 0xff51afd7ed558ccd AS k3, + bitXor(k3, bitShiftRight(k3, 33)) AS k4, + k4 * 0xc4ceb9fe1a85ec53 AS k5, + bitXor(k5, bitShiftRight(k5, 33)) AS k6, + k6 AS hash, + bitShiftRight(hash, 15) % 0x20000 AS place, + hash % 2 = 0 AS will_remain +SELECT hash, number, place FROM system.numbers WHERE place >= 50000 AND place < 90000 AND will_remain LIMIT 1 BY place LIMIT 40000 +) ORDER BY place; + +/* Occupy 10 cells near the end of "a": + * a: [ ----------- ] + * c: [ -- ] + * If we insert "a" then "c", these values will be placed at the end of hash table due to collision resolution: + * a + c: [ aaaaaaaaaaacc] + */ +CREATE TABLE part_c ENGINE = TinyLog AS SELECT * FROM +( +WITH + number AS k1, + bitXor(k1, bitShiftRight(k1, 33)) AS k2, + k2 * 0xff51afd7ed558ccd AS k3, + bitXor(k3, bitShiftRight(k3, 33)) AS k4, + k4 * 0xc4ceb9fe1a85ec53 AS k5, + bitXor(k5, bitShiftRight(k5, 33)) AS k6, + k6 AS hash, + bitShiftRight(hash, 15) % 0x20000 AS place, + hash % 2 = 0 AS will_remain +SELECT hash, number, place FROM system.numbers WHERE place >= 131052 AND place < 131062 AND will_remain AND hash NOT IN (SELECT hash FROM part_a) LIMIT 1 BY place LIMIT 10 +) ORDER BY place; + +/* Occupy 10 cells at the end of hash table, after "a": + * a: [ ----------- ] + * d: [ --] + * a + d: [ aaaaaaaaaaadd] + * But if we insert "a" then "c" then "d", these values will be placed at the beginning of the hash table due to collision resolution: + * a+c+d: [dd aaaaaaaaaaacc] + */ +CREATE TABLE part_d ENGINE = TinyLog AS SELECT * FROM +( +WITH + number AS k1, + bitXor(k1, bitShiftRight(k1, 33)) AS k2, + k2 * 0xff51afd7ed558ccd AS k3, + bitXor(k3, bitShiftRight(k3, 33)) AS k4, + k4 * 0xc4ceb9fe1a85ec53 AS k5, + bitXor(k5, bitShiftRight(k5, 33)) AS k6, + k6 AS hash, + bitShiftRight(hash, 15) % 0x20000 AS place, + hash % 2 = 0 AS will_remain +SELECT hash, number, place FROM system.numbers WHERE place >= 131062 AND will_remain LIMIT 1 BY place LIMIT 10 +) ORDER BY place; + +/** What happens if we insert a then c then d then b? + * Insertion of b forces rehash. + * a will be removed, but c, d, b remain: + * [dd bbbbbbbbbb cc] + * Then we go through hash table and move elements to better places in collision resolution chain. + * c will be moved left to their right place: + * [dd bbbbbbbbbb cc ] + * + * And d must be moved also: + * [ bbbbbbbbbb ccdd] + * But our algorithm was incorrect and it doesn't happen. + * + * If we insert d again, it will be placed twice because original d will not found: + * [dd bbbbbbbbbb ccdd] + * This will lead to slightly higher return value of "uniq" aggregate function and it is dependent on insertion order. + */ + + +SET max_threads = 1; + +/** Results of these two queries must match: */ + +SELECT uniq(number) FROM ( + SELECT * FROM part_a +UNION ALL SELECT * FROM part_c +UNION ALL SELECT * FROM part_d +UNION ALL SELECT * FROM part_b); + +SELECT uniq(number) FROM ( + SELECT * FROM part_a +UNION ALL SELECT * FROM part_c +UNION ALL SELECT * FROM part_d +UNION ALL SELECT * FROM part_b +UNION ALL SELECT * FROM part_d); + + +DROP TABLE part_a; +DROP TABLE part_b; +DROP TABLE part_c; +DROP TABLE part_d;