#!/usr/bin/env python3

import os
import sys
import random
import queue
import time
from threading import Thread

CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, "helpers"))

from pure_http_client import ClickHouseClient
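
# ClickHouseClient is a small test helper; judging by its module name it sends
# each query to the server over the HTTP interface (assumption -- the helper
# itself is not shown here).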
client = ClickHouseClient()

# Test deduplication on a table without a partition key.
client.query("DROP TABLE IF EXISTS t_async_insert_dedup_no_part SYNC")
client.query(
    """
CREATE TABLE t_async_insert_dedup_no_part (
    KeyID UInt32
) Engine = ReplicatedMergeTree('/clickhouse/tables/{shard}/{database}/t_async_insert_dedup', '{replica}')
ORDER BY (KeyID)
"""
)
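
# wait_for_async_insert=1 makes the INSERT return only after the buffered data
# has been flushed and committed, so the count printed below is deterministic.
# Keeper fault injection is disabled to keep the test stable.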
client.query(
    "insert into t_async_insert_dedup_no_part values (1), (2), (3), (4), (5)",
    settings={
        "async_insert": 1,
        "wait_for_async_insert": 1,
        "insert_keeper_fault_injection_probability": 0,
    },
)

result = client.query("select count(*) from t_async_insert_dedup_no_part")
print(result, flush=True)
client.query("DROP TABLE IF EXISTS t_async_insert_dedup_no_part SYNC")


# Generate INSERT statements and push them onto the queue until total_number
# keys have been produced.
def generate_data(q, total_number):
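    # Each queue item is a complete INSERT statement that a consumer thread
    # executes verbatim; duplicate statements are expected to be collapsed by
    # the server's deduplication.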
    old_data = []
    max_chunk_size = 30
    partitions = ["2022-11-11 10:10:10", "2022-12-12 10:10:10"]
    last_number = 0
    while True:
        dup_simulate = random.randint(0, 3)
        # Randomly re-insert old data: about 25% of the statements are duplicates.
        if dup_simulate == 0:
            last_idx = len(old_data) - 1
            if last_idx < 0:
                continue
            idx = last_idx - random.randint(0, 50)
            if idx < 0:
                idx = 0
            q.put(old_data[idx])
        else:
            # Insert new data.
            chunk_size = random.randint(1, max_chunk_size)
            insert_stmt = "insert into t_async_insert_dedup values "
            start = last_number + 1
            end = start + chunk_size
            if end > total_number:
                end = total_number
            for i in range(start, end + 1):
                partition = partitions[random.randint(0, 1)]
                insert_stmt += "('{}', {}),".format(partition, i)
            # Drop the trailing comma.
            insert_stmt = insert_stmt[:-1]
            q.put(insert_stmt)
            old_data.append(insert_stmt)
            last_number = end
            if end >= total_number:
                break
    # Wait until all queued inserts have been processed.
    q.join()


def fetch_and_insert_data(q, client):
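    # Consumers flush with wait_for_async_insert=0 (fire and forget): the query
    # returns as soon as the data is buffered, and correctness is verified later
    # by polling the table.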
    while True:
        insert = q.get()
        client.query(
            insert,
            settings={
                "async_insert": 1,
                "async_insert_deduplicate": 1,
                "wait_for_async_insert": 0,
                "async_insert_busy_timeout_ms": 1500,
                "insert_keeper_fault_injection_probability": 0,
            },
        )
        q.task_done()
        # Pause 50-500 ms between inserts, presumably to spread them across
        # async-insert flush windows.
        sleep_time = random.randint(50, 500)
        time.sleep(sleep_time / 1000.0)
2022-11-14 18:01:40 +00:00
|
|
|
|
|
|
|
# main process
|
2023-05-03 18:06:46 +00:00
|
|
|
client.query("DROP TABLE IF EXISTS t_async_insert_dedup SYNC")
client.query(
    """
CREATE TABLE t_async_insert_dedup (
    EventDate DateTime,
    KeyID UInt32
) Engine = ReplicatedMergeTree('/clickhouse/tables/{shard}/{database}/t_async_insert_dedup', '{replica}')
PARTITION BY toYYYYMM(EventDate)
ORDER BY (KeyID, EventDate) SETTINGS use_async_block_ids_cache = 1
"""
)
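
# use_async_block_ids_cache=1 keeps recently committed async-insert block ids
# cached in memory; the AsyncInsertCacheSize / AsyncInsertCacheHits checks at
# the end of the script depend on this table setting.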

q = queue.Queue(100)
total_number = 10000

gen = Thread(target=generate_data, args=[q, total_number])
gen.start()

for i in range(3):
    insert = Thread(target=fetch_and_insert_data, args=[q, client])
    insert.start()
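
# The three consumer threads drain the queue concurrently; they loop forever
# by design and are never joined.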

gen.join()

retry = 0
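
# Poll the table until it holds exactly one row per key. The final inserts may
# still be in flight, so a mismatch is retried several times before the error
# is reported.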
while True:
    time.sleep(5)
    result = client.query("select KeyID from t_async_insert_dedup order by KeyID")
    result = result.split()
    err = False
    errMsg = ""
    if len(result) != total_number:
        err = True
        errMsg = f"the size of the result is {len(result)}, but we expect {total_number}."
    else:
        for i in range(total_number):
            expect = str(i + 1)
            real = result[i]
            if expect != real:
                err = True
                errMsg = f"real value {real} does not equal the expected value {expect} for the {i}-th element"
                break
    # Retry several times to get a stable result before reporting an error.
    if err and retry >= 5:
        print(errMsg, flush=True)
    elif err:
        retry += 1
        continue
    else:
        print(len(result), flush=True)
    break
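
# Sanity-check that the async-insert block-id cache was actually exercised.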
result = client.query(
    "SELECT value FROM system.metrics where metric = 'AsyncInsertCacheSize'"
)
result = int(result.split()[0])
if result <= 0:
    raise Exception(f"AsyncInsertCacheSize should be > 0, but got {result}")
result = client.query(
    "SELECT value FROM system.events where event = 'AsyncInsertCacheHits'"
)
result = int(result.split()[0])
if result <= 0:
    raise Exception(f"AsyncInsertCacheHits should be > 0, but got {result}")

client.query("DROP TABLE IF EXISTS t_async_insert_dedup SYNC")
os._exit(os.EX_OK)