ClickHouse/tests/queries/0_stateless/01654_test_writer_block_sequence.python
2023-03-23 15:33:23 +00:00

71 lines
2.7 KiB
Python

#!/usr/bin/env python3
import os
import sys
import random
import string
CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, "helpers"))
from pure_http_client import ClickHouseClient
def get_random_string(length):
    """Return a random string of `length` characters.

    Characters are drawn one at a time with `random.choice` from the
    uppercase ASCII letters and the decimal digits. A non-positive
    `length` yields an empty string.
    """
    alphabet = string.ascii_uppercase + string.digits
    picked = [random.choice(alphabet) for _ in range(length)]
    return "".join(picked)
# Module-level client (from the pure_http_client helper); insert_block() and
# the test scenario at the bottom of this file send every query through it.
client = ClickHouseClient()
def insert_block(table_name, block_granularity_rows, block_rows):
    """Insert one block of `block_rows` random rows into `table_name`.

    Every row is `(1, '<random string>')`. The string length is chosen so
    that `block_granularity_rows` such rows add up to roughly 10 MiB.
    NOTE(review): the 10 MiB budget presumably mirrors the table's
    index_granularity_bytes setting - confirm.

    All rows are sent in a single INSERT statement through the module-level
    `client`.
    """
    granule_budget_bytes = 10 * 1024 * 1024
    # One character less than the per-row share of the byte budget.
    payload_len = granule_budget_bytes // block_granularity_rows - 1
    tuples = [
        "(1, '" + get_random_string(payload_len) + "')" for _ in range(block_rows)
    ]
    statement = "INSERT INTO {} VALUES {}".format(table_name, ", ".join(tuples))
    client.query(statement)
try:
    # Recreate the table from scratch and keep merges paused while the
    # blocks below are inserted.
    client.query("DROP TABLE IF EXISTS t")
    client.query(
        "CREATE TABLE t (v UInt8, data String) ENGINE = MergeTree() ORDER BY tuple() SETTINGS min_bytes_for_wide_part = 0"
    )
    client.query("SYSTEM STOP MERGES t")

    # The block sizes are taken from a real table that reproduced the error:
    # after a horizontal merge a granule ended up with more rows than the
    # fixed granularity allows (about 10k rows; the original note gives the
    # maximum as 8912 - presumably 8192, TODO confirm).
    #
    # Why these blocks are special:
    # 1) The first block has 1811 rows, but its granule should hold 6853.
    #    After writing 1811 rows the granule is unfinished, with
    #    6853 - 1811 = 5042 rows still to be filled from the next blocks.
    #
    # 2) The second block has fewer rows than the unfinished granule still
    #    needs (3094 < 5042), so it fits entirely and rows are still missing.
    #    Its granularity is also smaller than the remaining rows
    #    (3094 < 5042), so ClickHouse shrinks the unfinished granule.
    #    That adjustment contained the bug: the granule was adjusted not to
    #    the new block's granularity (3094) but to the difference between
    #    the new granularity and the rows already written
    #    (3094 - 1811 = 1283). This led to several unsigned integer
    #    overflows and, as a result, huge granules.
    #
    # 3) The last block just triggers the check that every granule has fewer
    #    rows than the fixed granularity. If the bug from 2) is present,
    #    the check fails.
    block_specs = (
        (6853, 1811),
        (3094, 3094),
        (6092, 6092),
    )
    for granularity_rows, rows_in_block in block_specs:
        insert_block(
            "t", block_granularity_rows=granularity_rows, block_rows=rows_in_block
        )

    # Resume merges and force a final merge of everything inserted above.
    client.query("SYSTEM START MERGES t")
    client.query("OPTIMIZE TABLE t FINAL")

    # Print the total row count of the merged table.
    count_frame = client.query_return_df(
        "SELECT COUNT() as C FROM t FORMAT TabSeparatedWithNames"
    )
    print(count_frame["C"][0])
finally:
    # Always remove the table, even when one of the steps above raised.
    client.query("DROP TABLE IF EXISTS t")