Merge pull request #35311 from vdimir/pmj-dup-rows

Attempt to fix merge join duplicate rows
This commit is contained in:
Kruglov Pavel 2022-03-16 12:48:29 +01:00 committed by GitHub
commit 4a579768ef
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 66 additions and 0 deletions

View File

@ -881,6 +881,7 @@ bool MergeJoin::leftJoin(MergeJoinCursor & left_cursor, const Block & left_block
{
right_cursor.nextN(range.right_length);
right_block_info.skip = right_cursor.position();
left_cursor.nextN(range.left_length);
return false;
}
}

View File

@ -0,0 +1,28 @@
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1

View File

@ -0,0 +1,37 @@
-- Tags: long
-- Regression test for the partial merge join algorithm: an ALL LEFT JOIN whose
-- right side has unique keys must emit every left row exactly once, for many
-- combinations of max_block_size and max_joined_block_size_rows.
-- Expected output per loop iteration: nothing from the duplicate check and a
-- single 1 from the consistency check.
DROP TABLE IF EXISTS left;
DROP TABLE IF EXISTS right;
SET join_algorithm = 'partial_merge';
-- Outer loop: the value used for max_block_size. The sizes come in adjacent
-- pairs (10/11, 128/129, 65505/65506) plus 70000.
-- NOTE(review): 65505 presumably matches the server default max_block_size - confirm.
-- Inner loop: max_joined_block_size_rows takes the four values from
-- block_size - 2 up to block_size + 1 (the upper bound of range is exclusive).
{% for block_size in [10, 11, 128, 129, 65505, 65506, 70000] -%}
{% for join_block_size in range(block_size - 2, block_size + 2) -%}
-- Recreate both tables on every iteration so each case starts from empty tables.
CREATE OR REPLACE TABLE left ( key UInt32, value String ) ENGINE = Memory;
CREATE OR REPLACE TABLE right ( key UInt32, value String ) ENGINE = Memory;
-- Keys are consecutive integers starting at 0. The left table holds
-- 2 * block_size + 1 rows and the right table holds four more keys than that,
-- so every left key has exactly one match on the right.
INSERT INTO left SELECT number, toString(number) FROM numbers({{ block_size * 2 + 1 }});
INSERT INTO right SELECT number, toString(number) FROM numbers({{ block_size * 2 + 5 }});
SET max_joined_block_size_rows = {{ join_block_size }};
SET max_block_size = {{ block_size }};
-- Duplicate check: list every key that occurs more than once in the join
-- result. Left keys are unique and the right side is grouped by key, so a
-- correct join returns no rows here.
SELECT key, count(1) AS cnt
FROM (
SELECT *
FROM ( SELECT key FROM left AS s ) AS data
ALL LEFT JOIN ( SELECT key FROM right GROUP BY key ) AS promo ON promo.key = data.key
) GROUP BY key HAVING count(1) > 1
;
-- Consistency check: returns 1 only when the join produced exactly as many
-- rows as the left table has and every row carries a right key equal to its
-- left key; otherwise returns 0.
SELECT count() == (SELECT count() from left) AND min(key == promo.key) == 1
FROM ( SELECT key FROM left AS s ) AS data
ALL LEFT JOIN ( SELECT key FROM right GROUP BY key ) AS promo ON promo.key = data.key
;
{% endfor -%}
{% endfor -%}
-- Final cleanup of the tables used by the test.
DROP TABLE IF EXISTS left;
DROP TABLE IF EXISTS right;