ClickHouse/tests/queries/0_stateless/01654_test_writer_block_sequence.python

#!/usr/bin/env python3
import os
import sys
import random
import string

CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers'))

from pure_http_client import ClickHouseClient

def get_random_string(length):
    """Return a random string of ``length`` uppercase ASCII letters and digits.

    Each character is drawn independently with ``random.choice`` from
    ``string.ascii_uppercase + string.digits``.  A ``length`` of zero (or a
    negative one, since ``range`` is then empty) yields an empty string.
    """
    # Build the alphabet once rather than re-concatenating it per character.
    alphabet = string.ascii_uppercase + string.digits
    return ''.join(random.choice(alphabet) for _ in range(length))

# Module-level client; insert_block() and the test body below both issue their queries through it.
client = ClickHouseClient()

def insert_block(table_name, block_granularity_rows, block_rows):
    """Insert ``block_rows`` random rows into ``table_name`` with a single INSERT.

    Every row has the form ``(1, '<random string>')``.  The string length is
    derived from a 10 MiB budget split evenly across
    ``block_granularity_rows`` rows, minus one character per row.
    Presumably this makes the server pick a granule of about
    ``block_granularity_rows`` rows for the block -- confirm against the
    table's granularity settings.

    Raises ``ZeroDivisionError`` if ``block_granularity_rows`` is zero.
    """
    granule_budget_bytes = 10 * 1024 * 1024
    payload_length = granule_budget_bytes // block_granularity_rows - 1

    # Render all value tuples straight into one comma-separated VALUES list.
    rows_sql = ", ".join(
        "(1, '" + get_random_string(payload_length) + "')"
        for _ in range(block_rows)
    )
    statement = "INSERT INTO {} VALUES {}".format(table_name, rows_sql)
    client.query(statement)

# Test body: create table `t`, insert three blocks with merges paused, force a
# final merge, then print the resulting row count.
try:
    client.query("DROP TABLE IF EXISTS t")
    client.query("CREATE TABLE t (v UInt8, data String) ENGINE = MergeTree() ORDER BY tuple() SETTINGS min_bytes_for_wide_part = 0")

    # Pause merges while the blocks are inserted; they are re-enabled just before OPTIMIZE below.
    client.query("SYSTEM STOP MERGES t")

    # These block sizes were taken from a real table that reproduces the error
    # where we get a granule with more rows than the fixed granularity after a horizontal merge.
    # About 10k rows when max is 8912.
    # NOTE(review): 8912 is presumably a typo for 8192 (the default index_granularity) -- confirm.
    #
    # Why are these blocks special?
    # 1) The first one contains 1811 rows, but its granule should have 6853.
    # So we write 1811 and get an unfinished granule with 6853 - 1811 = 5042 rows to write from the next blocks.
    #
    # 2) The second block has fewer rows than the rows left in the unfinished granule (3094 < 5042).
    # It can be written entirely into this unfinished granule and we will still have some rows left. But its granularity
    # should be smaller than the rows left in the granule (3094 < 5042), so ClickHouse will adjust (make smaller) this last unfinished granule.
    # This adjustment logic contained a bug: we adjusted not to the new block's granularity (3094), but to the difference of the new block's granularity and
    # the already written rows (3094 - 1811 = 1283). This led to several unsigned integer overflows in the code and huge granules as a result.
    #
    # 3) The last block just triggers the check that each granule has fewer rows than the fixed granularity rows. If the bug from 2) exists then it will fail.
    insert_block("t", block_granularity_rows=6853, block_rows=1811)
    insert_block("t", block_granularity_rows=3094, block_rows=3094)
    insert_block("t", block_granularity_rows=6092, block_rows=6092)

    # Re-enable merges and force one over everything inserted above.
    client.query("SYSTEM START MERGES t")
    client.query("OPTIMIZE TABLE t FINAL")

    # Print the total row count; the three inserts above add 1811 + 3094 + 6092 = 10997 rows.
    print(client.query_return_df("SELECT COUNT() as C FROM t FORMAT TabSeparatedWithNames")['C'][0])
finally:
    # Always drop the table, even if one of the steps above raised.
    client.query("DROP TABLE IF EXISTS t")
Fix stupid error 2021-01-15 18:50:30 +00:00			`#!/usr/bin/env python3`
			`import os`
			`import sys`
			`import random`
			`import string`

			`CURDIR = os.path.dirname(os.path.realpath(__file__))`
			`sys.path.insert(0, os.path.join(CURDIR, 'helpers'))`

			`from pure_http_client import ClickHouseClient`

			`def get_random_string(length):`
			`return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length))`

			`client = ClickHouseClient()`

			`def insert_block(table_name, block_granularity_rows, block_rows):`
			`global client`
			`block_data = []`
			`index_granularity_bytes = 10 * 1024 * 1024`
			`row_bytes = index_granularity_bytes // block_granularity_rows`
			`for _ in range(block_rows):`
			`block_data.append(get_random_string(row_bytes - 1))`

			`values_row = ", ".join("(1, '" + row + "')" for row in block_data)`
			`client.query("INSERT INTO {} VALUES {}".format(table_name, values_row))`

			`try:`
			`client.query("DROP TABLE IF EXISTS t")`
			`client.query("CREATE TABLE t (v UInt8, data String) ENGINE = MergeTree() ORDER BY tuple() SETTINGS min_bytes_for_wide_part = 0")`

			`client.query("SYSTEM STOP MERGES t")`

			`# These block sizes were taken from a real table that reproduces the error`
			`# where we get a granule with more rows than the fixed granularity after a horizontal merge.`
			`# About 10k rows when max is 8912.`
			`#`
			`# Why these blocks are special?`
			`# 1) The first one contains 1811 rows, but its granule should have 6853.`
			`# So we write 1811 and get unfinished granule with 6853 - 1811 = 5042 rows to write from the next blocks.`
			`#`
			`# 2) The second block has fewer rows than rows left in the unfinished granule (3094 < 5042).`
			`# It can be written entirely in this unfinished granule and we will still have some rows left. But its granularity`
			`# should be smaller than rows left in granule (3094 < 5042), so clickhouse will adjust (make smaller) this last unfinished granule.`
			`# This adjust logic contained a bug: we adjust not to the new block's granularity (3094), but to the difference of the new block granularity and`
			`# already written rows (3094 - 1811 = 1283). This led to several unsigned integer overflows in code and huge granules as result.`
			`#`
			`# 3) Last block just triggers the check that each granule has fewer rows than fixed granularity rows. If the bug from 2) exists then it will fail.`
			`insert_block("t", block_granularity_rows=6853, block_rows=1811)`
			`insert_block("t", block_granularity_rows=3094, block_rows=3094)`
			`insert_block("t", block_granularity_rows=6092, block_rows=6092)`

			`client.query("SYSTEM START MERGES t")`
			`client.query("OPTIMIZE TABLE t FINAL")`

			`print(client.query_return_df("SELECT COUNT() as C FROM t FORMAT TabSeparatedWithNames")['C'][0])`
			`finally:`
			`client.query("DROP TABLE IF EXISTS t")`