2023-05-02 14:27:53 +00:00
|
|
|
import io
|
2024-09-27 10:19:39 +00:00
|
|
|
import json
|
2023-05-01 12:50:20 +00:00
|
|
|
import logging
|
|
|
|
import random
|
2024-08-14 14:56:02 +00:00
|
|
|
import string
|
2023-05-01 12:50:20 +00:00
|
|
|
import time
|
2024-10-01 11:08:31 +00:00
|
|
|
import uuid
|
2024-12-03 17:17:25 +00:00
|
|
|
from multiprocessing.dummy import Pool
|
2023-05-01 12:50:20 +00:00
|
|
|
|
|
|
|
import pytest
|
2024-09-27 10:19:39 +00:00
|
|
|
|
2023-05-04 07:04:04 +00:00
|
|
|
from helpers.client import QueryRuntimeException
|
2023-05-01 12:50:20 +00:00
|
|
|
from helpers.cluster import ClickHouseCluster, ClickHouseInstance
|
2023-05-10 15:17:26 +00:00
|
|
|
|
2023-09-08 13:49:34 +00:00
|
|
|
# Queue processing modes; used below to parametrize tests over both of them.
AVAILABLE_MODES = ["unordered", "ordered"]

# Credential arguments joined with "," into the S3Queue engine definition by
# create_table(); each element already carries its own SQL quotes.
DEFAULT_AUTH = ["'minio'", "'minio123'"]

# Alternative auth argument list holding the NOSIGN keyword
# (not referenced in this part of the file).
NO_AUTH = ["NOSIGN"]
|
|
|
|
|
|
|
|
|
|
|
|
def prepare_public_s3_bucket(started_cluster):
    """Create a fresh MinIO bucket with a wildcard-principal access policy.

    The bucket is named ``<started_cluster.minio_bucket>-public``; the name is
    stored on ``started_cluster.minio_public_bucket``. If a bucket with that
    name already exists it is removed and created again before the policy is
    applied.
    """

    def allow_everyone(actions, resource):
        # One policy statement granting `actions` on `resource` to any principal.
        return {
            "Sid": "",
            "Effect": "Allow",
            "Principal": "*",
            "Action": actions,
            "Resource": resource,
        }

    client = started_cluster.minio_client
    public_bucket = f"{started_cluster.minio_bucket}-public"
    started_cluster.minio_public_bucket = public_bucket

    policy = {
        "Version": "2012-10-17",
        "Statement": [
            # Bucket-level permissions.
            allow_everyone(
                ["s3:GetBucketLocation", "s3:ListBucket"],
                f"arn:aws:s3:::{public_bucket}",
            ),
            # Object-level permissions.
            allow_everyone(
                ["s3:GetObject", "s3:PutObject", "s3:DeleteObject"],
                f"arn:aws:s3:::{public_bucket}/*",
            ),
        ],
    }

    if client.bucket_exists(public_bucket):
        client.remove_bucket(public_bucket)
    client.make_bucket(public_bucket)
    client.set_bucket_policy(public_bucket, json.dumps(policy))
|
2023-05-01 12:50:20 +00:00
|
|
|
|
2023-05-02 14:27:53 +00:00
|
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def s3_queue_setup_teardown(started_cluster):
    """Reset shared state before every test.

    Recreates the ``default`` database on ``instance`` and ``instance2``,
    removes every object from the MinIO bucket and, when the Azurite
    container exists, deletes every blob in it. Nothing runs after the yield.
    """
    nodes = [started_cluster.instances[name] for name in ("instance", "instance2")]
    for node in nodes:
        node.query("DROP DATABASE IF EXISTS default; CREATE DATABASE default;")

    s3 = started_cluster.minio_client
    # Materialize the listing first, then delete.
    stale_objects = list(s3.list_objects(started_cluster.minio_bucket, recursive=True))
    for stale in stale_objects:
        s3.remove_object(started_cluster.minio_bucket, stale.object_name)

    container = started_cluster.blob_service_client.get_container_client(
        started_cluster.azurite_container
    )
    if container.exists():
        blob_names = [blob.name for blob in container.list_blobs()]
        logging.debug(f"Deleting blobs: {blob_names}")
        for blob_name in blob_names:
            container.delete_blob(blob_name)

    yield  # run test
|
2023-05-01 12:50:20 +00:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="module")
def started_cluster():
    """Module-scoped fixture: configure, start and yield the test cluster.

    Eight instances are registered (see the ``add_instance`` calls below).
    The cluster is shut down in ``finally`` whether configuration, startup or
    the tests themselves fail.
    """
    # Constructed outside the try block: if the constructor itself raises,
    # there is nothing to shut down, and referencing an unbound `cluster` in
    # `finally` would replace the real error with an UnboundLocalError.
    cluster = ClickHouseCluster(__file__)
    try:
        cluster.add_instance(
            "instance",
            user_configs=["configs/users.xml"],
            with_minio=True,
            with_azurite=True,
            with_zookeeper=True,
            keeper_required_feature_flags=["create_if_not_exists"],
            main_configs=[
                "configs/zookeeper.xml",
                "configs/s3queue_log.xml",
            ],
            stay_alive=True,
        )
        cluster.add_instance(
            "instance2",
            user_configs=["configs/users.xml"],
            with_minio=True,
            with_zookeeper=True,
            keeper_required_feature_flags=["create_if_not_exists"],
            main_configs=[
                "configs/s3queue_log.xml",
            ],
            stay_alive=True,
        )
        # Instance pinned to the 23.12 server image.
        cluster.add_instance(
            "old_instance",
            with_zookeeper=True,
            keeper_required_feature_flags=["create_if_not_exists"],
            image="clickhouse/clickhouse-server",
            tag="23.12",
            stay_alive=True,
            with_installed_binary=True,
            use_old_analyzer=True,
        )
        cluster.add_instance(
            "node1",
            with_zookeeper=True,
            keeper_required_feature_flags=["create_if_not_exists"],
            stay_alive=True,
            main_configs=[
                "configs/zookeeper.xml",
                "configs/s3queue_log.xml",
                "configs/remote_servers.xml",
            ],
        )
        cluster.add_instance(
            "node2",
            with_zookeeper=True,
            keeper_required_feature_flags=["create_if_not_exists"],
            stay_alive=True,
            main_configs=[
                "configs/zookeeper.xml",
                "configs/s3queue_log.xml",
                "configs/remote_servers.xml",
            ],
        )
        cluster.add_instance(
            "instance_too_many_parts",
            user_configs=["configs/users.xml"],
            with_minio=True,
            with_zookeeper=True,
            keeper_required_feature_flags=["create_if_not_exists"],
            main_configs=[
                "configs/s3queue_log.xml",
                "configs/merge_tree.xml",
            ],
            stay_alive=True,
        )
        # Instance pinned to the 24.5 server image.
        cluster.add_instance(
            "instance_24.5",
            with_zookeeper=True,
            keeper_required_feature_flags=["create_if_not_exists"],
            image="clickhouse/clickhouse-server",
            tag="24.5",
            stay_alive=True,
            user_configs=[
                "configs/users.xml",
            ],
            with_installed_binary=True,
            use_old_analyzer=True,
        )
        cluster.add_instance(
            "node_cloud_mode",
            with_zookeeper=True,
            keeper_required_feature_flags=["create_if_not_exists"],
            stay_alive=True,
            main_configs=[
                "configs/zookeeper.xml",
                "configs/s3queue_log.xml",
            ],
            user_configs=["configs/cloud_mode.xml"],
        )

        logging.info("Starting cluster...")
        cluster.start()
        logging.info("Cluster started")

        yield cluster
    finally:
        cluster.shutdown()
|
|
|
|
|
|
|
|
|
|
|
|
def run_query(instance, query, stdin=None, settings=None):
    # type: (ClickHouseInstance, str, object, dict) -> str
    """Run ``query`` on ``instance`` with logging around the call.

    ``stdin`` and ``settings`` are forwarded unchanged to ``instance.query``.
    Returns whatever ``instance.query`` returns.
    """
    # Lazy %-style arguments: the message is only formatted when the record
    # is actually emitted, and the emitted text is the same as before.
    logging.info("Running query '%s'...", query)
    result = instance.query(query, stdin=stdin, settings=settings)
    logging.info("Query finished")

    return result
|
|
|
|
|
|
|
|
|
2024-12-03 17:17:25 +00:00
|
|
|
def random_str(length=6):
    """Return ``length`` characters drawn from lowercase ASCII letters and digits.

    Characters are chosen with ``random.SystemRandom``, so the result is not
    affected by ``random.seed``. ``length=0`` yields an empty string.
    """
    alphabet = string.ascii_lowercase + string.digits
    # One generator for the whole string instead of a new one per character.
    rng = random.SystemRandom()
    return "".join(rng.choice(alphabet) for _ in range(length))
|
|
|
|
|
|
|
|
|
2023-09-08 13:49:34 +00:00
|
|
|
def generate_random_files(
    started_cluster,
    files_path,
    count,
    storage="s3",
    column_num=3,
    row_num=10,
    start_ind=0,
    bucket=None,
    use_random_names=False,
):
    """Upload ``count`` CSV files of random integers and return all their rows.

    Files are named ``<files_path>/test_<i>.csv`` for ``i`` starting at
    ``start_ind``, or ``<files_path>/<10 random chars>.csv`` when
    ``use_random_names`` is set. Each file holds ``row_num`` rows of
    ``column_num`` integers in ``[0, 1000]``. Files are uploaded in
    name-sorted order via ``put_s3_file_content`` when ``storage == "s3"``
    and via ``put_azure_file_content`` otherwise.

    Returns the rows of every file as one flat list of lists, in upload order.
    """
    indices = range(start_ind, start_ind + count)
    if use_random_names:
        files = [(f"{files_path}/{random_str(10)}.csv", i) for i in indices]
    else:
        files = [(f"{files_path}/test_{i}.csv", i) for i in indices]
    files.sort(key=lambda entry: entry[0])

    print(f"Generating files: {files}")

    uploader = put_s3_file_content if storage == "s3" else put_azure_file_content

    total_values = []
    for filename, _ in files:
        rows = [
            [random.randint(0, 1000) for _ in range(column_num)]
            for _ in range(row_num)
        ]
        total_values.extend(rows)
        csv_lines = [",".join(str(cell) for cell in row) for row in rows]
        payload = ("\n".join(csv_lines) + "\n").encode()
        uploader(started_cluster, filename, payload, bucket)
    return total_values
|
|
|
|
|
|
|
|
|
2024-01-07 01:16:29 +00:00
|
|
|
def put_s3_file_content(started_cluster, filename, data, bucket=None):
    """Store ``data`` (bytes) as object ``filename`` through the cluster's MinIO client.

    When ``bucket`` is None the object goes to ``started_cluster.minio_bucket``.
    """
    if bucket is None:
        bucket = started_cluster.minio_bucket
    stream = io.BytesIO(data)
    started_cluster.minio_client.put_object(bucket, filename, stream, len(data))
|
2023-09-08 13:49:34 +00:00
|
|
|
|
2024-06-19 14:37:46 +00:00
|
|
|
|
2024-06-19 14:16:37 +00:00
|
|
|
def put_azure_file_content(started_cluster, filename, data, bucket=None):
    """Upload ``data`` (bytes) as blob ``filename`` into the cluster's Azurite container.

    ``bucket`` is accepted but not used; the target container is always
    ``started_cluster.azurite_container``.
    """
    blob_client = started_cluster.blob_service_client.get_blob_client(
        started_cluster.azurite_container, filename
    )
    stream = io.BytesIO(data)
    blob_client.upload_blob(stream, "BlockBlob", len(data))
|
|
|
|
|
2023-09-08 13:49:34 +00:00
|
|
|
|
|
|
|
def create_table(
    started_cluster,
    node,
    table_name,
    mode,
    files_path,
    engine_name="S3Queue",
    format="column1 UInt32, column2 UInt32, column3 UInt32",
    additional_settings=None,
    file_format="CSV",
    auth=DEFAULT_AUTH,
    bucket=None,
    expect_error=False,
    database_name="default",
):
    """Drop and re-create a queue table on ``node``.

    Args:
        started_cluster: cluster object providing MinIO / Azurite coordinates.
        node: instance the DROP and CREATE queries are sent to.
        table_name: name of the table to (re)create.
        mode: value of the ``mode`` table setting.
        files_path: path prefix the engine reads from (a trailing ``/`` is added).
        engine_name: ``"S3Queue"`` builds an S3 URL engine definition; any other
            value builds a definition from the Azurite connection string and
            container.
        format: column list placed in the CREATE TABLE statement.
        additional_settings: optional dict merged over the default table
            settings (``s3queue_loading_retries``, ``after_processing``,
            ``keeper_path``, ``mode``).
        file_format: data format name passed to the engine.
        auth: list of engine auth arguments, joined with ``","`` (S3Queue only).
        bucket: bucket name; defaults to ``started_cluster.minio_bucket``.
        expect_error: when true, run the CREATE through
            ``node.query_and_get_error`` and return its result.
        database_name: database used in the CREATE TABLE statement.

    Returns:
        The result of ``node.query_and_get_error`` when ``expect_error`` is
        set, otherwise None.
    """
    auth_params = ",".join(auth)
    bucket = started_cluster.minio_bucket if bucket is None else bucket

    settings = {
        "s3queue_loading_retries": 0,
        "after_processing": "keep",
        "keeper_path": f"/clickhouse/test_{table_name}",
        "mode": f"{mode}",
    }
    # `None` default instead of a shared mutable `{}`.
    if additional_settings:
        settings.update(additional_settings)

    if engine_name == "S3Queue":
        url = f"http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{files_path}/"
        engine_def = f"{engine_name}('{url}', {auth_params}, {file_format})"
    else:
        connection_string = started_cluster.env_variables["AZURITE_CONNECTION_STRING"]
        # Honour `file_format` here as well (it used to be hard-coded to 'CSV');
        # the default produces the same definition as before.
        engine_def = (
            f"{engine_name}('{connection_string}', "
            f"'{started_cluster.azurite_container}', '{files_path}/', '{file_format}')"
        )

    # NOTE(review): the DROP is not qualified with `database_name` while the
    # CREATE is; they only refer to the same table when the query's current
    # database equals `database_name`. Left unchanged - confirm intent.
    node.query(f"DROP TABLE IF EXISTS {table_name}")

    settings_clause = ",".join(k + "=" + repr(v) for k, v in settings.items())
    create_query = f"""
        CREATE TABLE {database_name}.{table_name} ({format})
        ENGINE = {engine_def}
        SETTINGS {settings_clause}
        """

    if expect_error:
        return node.query_and_get_error(create_query)

    node.query(create_query)
|
|
|
|
|
|
|
|
|
|
|
|
def create_mv(
    node,
    src_table_name,
    dst_table_name,
    format="column1 UInt32, column2 UInt32, column3 UInt32",
):
    """Re-create a MergeTree destination table and a materialized view feeding it.

    The destination table has the columns from ``format`` plus ``_path String``
    and is ordered by ``column1``. The view is named ``<dst_table_name>_mv``
    and selects ``*, _path`` from ``src_table_name``. Both are dropped first
    if they exist; everything is sent to ``node`` as a single multi-statement
    query.
    """
    mv_name = f"{dst_table_name}_mv"
    setup_sql = f"""
        DROP TABLE IF EXISTS {dst_table_name};
        DROP TABLE IF EXISTS {mv_name};

        CREATE TABLE {dst_table_name} ({format}, _path String)
        ENGINE = MergeTree()
        ORDER BY column1;

        CREATE MATERIALIZED VIEW {mv_name} TO {dst_table_name} AS SELECT *, _path FROM {src_table_name};
        """
    node.query(setup_sql)
|
|
|
|
|
2023-09-08 13:49:34 +00:00
|
|
|
|
2024-08-14 14:56:02 +00:00
|
|
|
def generate_random_string(length=6):
    """Return ``length`` lowercase ASCII letters picked with ``random.choice``."""
    letters = [random.choice(string.ascii_lowercase) for _ in range(length)]
    return "".join(letters)
|
|
|
|
|
|
|
|
|
2024-06-25 11:56:01 +00:00
|
|
|
@pytest.mark.parametrize("mode", ["unordered", "ordered"])
@pytest.mark.parametrize("engine_name", ["S3Queue", "AzureQueue"])
def test_delete_after_processing(started_cluster, mode, engine_name):
    """With ``after_processing = 'delete'`` all rows arrive and the source files are gone.

    Uploads 5 files of 10 rows, streams them through a materialized view,
    checks row count, distinct paths, row contents and the queue log table,
    then checks that the storage no longer lists the files.
    """
    node = started_cluster.instances["instance"]
    table_name = (
        f"delete_after_processing_{mode}_{engine_name}_{generate_random_string()}"
    )
    dst_table_name = f"{table_name}_dst"
    files_path = f"{table_name}_data"
    files_num = 5
    row_num = 10
    # A unique path is necessary for repeatable tests
    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
    uses_s3 = engine_name == "S3Queue"
    storage = "s3" if uses_s3 else "azure"

    total_values = generate_random_files(
        started_cluster, files_path, files_num, row_num=row_num, storage=storage
    )
    create_table(
        started_cluster,
        node,
        table_name,
        mode,
        files_path,
        additional_settings={"after_processing": "delete", "keeper_path": keeper_path},
        engine_name=engine_name,
    )
    create_mv(node, table_name, dst_table_name)

    def loaded_rows():
        return int(node.query(f"SELECT count() FROM {dst_table_name}"))

    # Poll for up to 100 attempts, one second apart, until every row is visible.
    expected_count = files_num * row_num
    for _ in range(100):
        count = loaded_rows()
        print(f"{count}/{expected_count}")
        if count == expected_count:
            break
        time.sleep(1)

    assert loaded_rows() == expected_count
    assert int(node.query(f"SELECT uniq(_path) FROM {dst_table_name}")) == files_num

    result_lines = node.query(
        f"SELECT column1, column2, column3 FROM {dst_table_name} ORDER BY column1, column2, column3"
    ).splitlines()
    actual_rows = [list(map(int, line.split())) for line in result_lines]
    assert actual_rows == sorted(total_values, key=lambda x: (x[0], x[1], x[2]))

    node.query("system flush logs")

    system_table_name = "s3queue_log" if uses_s3 else "azure_queue_log"
    rows_in_log = int(
        node.query(
            f"SELECT sum(rows_processed) FROM system.{system_table_name} WHERE table = '{table_name}'"
        )
    )
    assert rows_in_log == files_num * row_num

    if uses_s3:
        minio = started_cluster.minio_client
        remaining = list(minio.list_objects(started_cluster.minio_bucket, recursive=True))
        assert len(remaining) == 0
    else:
        client = started_cluster.blob_service_client.get_container_client(
            started_cluster.azurite_container
        )
        # Any blob still listed for `files_path` is a failure.
        for _leftover in client.list_blobs(files_path):
            assert False
|
|
|
|
|
|
|
|
|
2024-06-25 11:56:01 +00:00
|
|
|
@pytest.mark.parametrize("mode", ["unordered", "ordered"])
@pytest.mark.parametrize("engine_name", ["S3Queue", "AzureQueue"])
def test_failed_retry(started_cluster, mode, engine_name):
    """A file that cannot be loaded is retried ``retries_num`` times and yields no rows.

    Uploads a single row whose first field is the text ``failed`` while the
    table declares UInt32 columns, then inspects the ``failed`` node under the
    table's keeper path until its ``retries`` counter equals ``retries_num``.
    """
    node = started_cluster.instances["instance"]
    table_name = f"failed_retry_{mode}_{engine_name}"
    dst_table_name = f"{table_name}_dst"
    files_path = f"{table_name}_data"
    file_path = f"{files_path}/trash_test.csv"
    # A unique path is necessary for repeatable tests
    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
    retries_num = 3

    values = [
        ["failed", 1, 1],
    ]
    csv_lines = [",".join(str(cell) for cell in row) for row in values]
    values_csv = ("\n".join(csv_lines) + "\n").encode()
    if engine_name == "S3Queue":
        put_s3_file_content(started_cluster, file_path, values_csv)
    else:
        put_azure_file_content(started_cluster, file_path, values_csv)

    create_table(
        started_cluster,
        node,
        table_name,
        mode,
        files_path,
        additional_settings={
            "s3queue_loading_retries": retries_num,
            "keeper_path": keeper_path,
            "polling_max_timeout_ms": 5000,
            "polling_backoff_ms": 1000,
        },
        engine_name=engine_name,
    )
    create_mv(node, table_name, dst_table_name)

    # No early exit: all 20 polls run, and the node name seen on the most
    # recent non-empty listing is the one inspected below.
    failed_node_path = ""
    for _ in range(20):
        zk = started_cluster.get_kazoo_client("zoo1")
        children = zk.get_children(f"{keeper_path}/failed/")
        if len(children) > 0:
            assert len(children) == 1
            failed_node_path = f"{keeper_path}/failed/{children[0]}"
        time.sleep(1)

    assert failed_node_path != ""

    # Re-read the node's JSON metadata until the retry counter reaches the limit.
    retries = 0
    for _ in range(20):
        payload, _stat = zk.get(failed_node_path)
        json_data = json.loads(payload)
        print(f"Failed node metadata: {json_data}")
        assert json_data["file_path"] == file_path
        retries = int(json_data["retries"])
        if retries == retries_num:
            break
        time.sleep(1)

    assert retries == retries_num
    assert 0 == int(node.query(f"SELECT count() FROM {dst_table_name}"))
|
2023-05-10 15:17:26 +00:00
|
|
|
|
|
|
|
|
2023-05-02 14:27:53 +00:00
|
|
|
@pytest.mark.parametrize("mode", AVAILABLE_MODES)
def test_direct_select_file(started_cluster, mode):
    """Direct SELECTs from queue tables sharing or not sharing a keeper path.

    Expectations encoded below: of three tables on one keeper path only the
    first SELECT returns the file's rows; a fourth table on the same path
    returns nothing; re-created on a new path it returns the rows again; after
    a second file ``t.csv`` is uploaded, ``unordered`` mode returns its row
    and ``ordered`` mode returns nothing.
    """
    node = started_cluster.instances["instance"]
    table_name = f"direct_select_file_{mode}"
    # A unique path is necessary for repeatable tests
    keeper_path = f"/clickhouse/test_{table_name}_{mode}_{generate_random_string()}"
    files_path = f"{table_name}_data"
    file_path = f"{files_path}/test.csv"

    def to_csv(rows):
        return ("\n".join(",".join(map(str, row)) for row in rows) + "\n").encode()

    def make_table(name, path):
        create_table(
            started_cluster,
            node,
            name,
            mode,
            files_path,
            additional_settings={
                "keeper_path": path,
                "s3queue_processing_threads_num": 1,
            },
        )

    def select_rows(name):
        output = node.query(f"SELECT * FROM {name}")
        return [list(map(int, line.split())) for line in output.splitlines()]

    values = [
        [12549, 2463, 19893],
        [64021, 38652, 66703],
        [81611, 39650, 83516],
    ]
    put_s3_file_content(started_cluster, file_path, to_csv(values))

    for i in range(3):
        make_table(f"{table_name}_{i + 1}", keeper_path)

    assert select_rows(f"{table_name}_1") == values
    assert select_rows(f"{table_name}_2") == []
    assert select_rows(f"{table_name}_3") == []

    # New table with same zookeeper path
    make_table(f"{table_name}_4", keeper_path)
    assert select_rows(f"{table_name}_4") == []

    # New table with different zookeeper path
    keeper_path = f"{keeper_path}_2"
    make_table(f"{table_name}_4", keeper_path)
    assert select_rows(f"{table_name}_4") == values

    values = [
        [1, 1, 1],
    ]
    file_path = f"{files_path}/t.csv"
    put_s3_file_content(started_cluster, file_path, to_csv(values))

    if mode == "unordered":
        assert select_rows(f"{table_name}_4") == values
    elif mode == "ordered":
        assert select_rows(f"{table_name}_4") == []
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("mode", AVAILABLE_MODES)
def test_direct_select_multiple_files(started_cluster, mode):
    """Each direct SELECT returns exactly the files uploaded since the previous one.

    Five files are uploaded one at a time, each followed by a SELECT that must
    return that file's rows; then four more files are uploaded together and a
    single SELECT must return the set of all their rows.
    """
    node = started_cluster.instances["instance"]
    table_name = f"direct_select_multiple_files_{mode}"
    files_path = f"{table_name}_data"
    # A unique path is necessary for repeatable tests
    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"

    create_table(
        started_cluster,
        node,
        table_name,
        mode,
        files_path,
        additional_settings={"keeper_path": keeper_path, "processing_threads_num": 3},
    )

    def select_lines():
        return node.query(f"SELECT * FROM {table_name}").splitlines()

    for i in range(5):
        rand_values = [[random.randint(0, 50) for _ in range(3)] for _ in range(10)]
        csv_text = "\n".join(",".join(map(str, row)) for row in rand_values) + "\n"
        file_path = f"{files_path}/test_{i}.csv"
        put_s3_file_content(started_cluster, file_path, csv_text.encode())

        assert [list(map(int, line.split())) for line in select_lines()] == rand_values

    total_values = generate_random_files(started_cluster, files_path, 4, start_ind=5)
    selected = {tuple(map(int, line.split())) for line in select_lines()}
    assert selected == set([tuple(i) for i in total_values])
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("mode", AVAILABLE_MODES)
def test_streaming_to_view(started_cluster, mode):
    """Rows of 10 uploaded files reach the destination table through one materialized view."""
    node = started_cluster.instances["instance"]
    table_name = f"streaming_to_view_{mode}"
    dst_table_name = f"{table_name}_dst"
    files_path = f"{table_name}_data"
    # A unique path is necessary for repeatable tests
    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"

    total_values = generate_random_files(started_cluster, files_path, 10)
    create_table(
        started_cluster,
        node,
        table_name,
        mode,
        files_path,
        additional_settings={"keeper_path": keeper_path},
    )
    create_mv(node, table_name, dst_table_name)

    expected_values = set([tuple(i) for i in total_values])
    # Up to 10 polls, one second apart; the last snapshot is asserted afterwards.
    for i in range(10):
        output = node.query(f"SELECT column1, column2, column3 FROM {dst_table_name}")
        selected_values = {tuple(map(int, line.split())) for line in output.splitlines()}
        if selected_values == expected_values:
            break
        time.sleep(1)
    assert selected_values == expected_values
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("mode", AVAILABLE_MODES)
def test_streaming_to_many_views(started_cluster, mode):
    """Three queue tables on one keeper path feed one destination table.

    ``create_mv`` is called once per source table with the same destination
    name. The destination must end up holding exactly the set of rows from the
    5 uploaded files.
    """
    node = started_cluster.instances["instance"]
    table_name = f"streaming_to_many_views_{mode}"
    dst_table_name = f"{table_name}_dst"
    # A unique path is necessary for repeatable tests
    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
    files_path = f"{table_name}_data"

    for suffix in (1, 2, 3):
        table = f"{table_name}_{suffix}"
        create_table(
            started_cluster,
            node,
            table,
            mode,
            files_path,
            additional_settings={
                "keeper_path": keeper_path,
            },
        )
        create_mv(node, table, dst_table_name)

    total_values = generate_random_files(started_cluster, files_path, 5)
    expected_values = set([tuple(i) for i in total_values])

    def select():
        output = node.query(f"SELECT column1, column2, column3 FROM {dst_table_name}")
        return {tuple(map(int, line.split())) for line in output.splitlines()}

    # Up to 20 polls, one second apart, then one final check.
    for _ in range(20):
        if select() == expected_values:
            break
        time.sleep(1)
    assert select() == expected_values
|
2023-05-04 07:04:04 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_multiple_tables_meta_mismatch(started_cluster):
    """A second table on the same keeper path must match the stored metadata.

    Creates an ``ordered`` table, then checks that creating a copy on the same
    keeper path is rejected when the mode, the columns, or the file format
    differ, and finally that a copy with matching parameters is accepted.
    """
    node = started_cluster.instances["instance"]
    table_name = "multiple_tables_meta_mismatch"
    # A unique path is necessary for repeatable tests
    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
    files_path = f"{table_name}_data"
    copy_name = f"{table_name}_copy"
    four_columns = "column1 UInt32, column2 UInt32, column3 UInt32, column4 UInt32"

    def create_copy(mode, **overrides):
        create_table(
            started_cluster,
            node,
            copy_name,
            mode,
            files_path,
            additional_settings={
                "keeper_path": keeper_path,
            },
            **overrides,
        )

    def assert_rejected(expected_message, mode, **overrides):
        # Each check fails on its own when no exception is raised. Previously
        # one shared `failed` flag stayed True after the first check, so the
        # later checks could not detect a missing error.
        try:
            create_copy(mode, **overrides)
        except QueryRuntimeException as e:
            assert expected_message in str(e)
        else:
            raise AssertionError(
                f"CREATE TABLE {copy_name} unexpectedly succeeded; expected error: {expected_message}"
            )

    create_table(
        started_cluster,
        node,
        table_name,
        "ordered",
        files_path,
        additional_settings={
            "keeper_path": keeper_path,
        },
    )

    # check mode
    assert_rejected(
        "Existing table metadata in ZooKeeper differs in engine mode",
        "unordered",
    )

    # check columns
    assert_rejected(
        "Existing table metadata in ZooKeeper differs in columns",
        "ordered",
        format=four_columns,
    )

    # check format
    assert_rejected(
        "Existing table metadata in ZooKeeper differs in format name",
        "ordered",
        format=four_columns,
        file_format="TSV",
    )

    # create working engine
    create_copy("ordered")
|
|
|
|
|
|
|
|
|
2023-11-06 13:43:10 +00:00
|
|
|
# TODO: Update the modes for this test to include "ordered" once PR #55795 is finished.
@pytest.mark.parametrize("mode", ["unordered"])
def test_multiple_tables_streaming_sync(started_cluster, mode):
    """Check that several tables sharing one keeper path process each file once.

    Three source tables are created on the same instance with the same
    ``keeper_path``; each one feeds its own destination table through a
    materialized view. Every generated file holds one row, so the combined row
    count of the destination tables must reach ``files_to_generate`` and must
    not grow afterwards.
    """
    node = started_cluster.instances["instance"]
    table_name = f"multiple_tables_streaming_sync_{mode}"
    dst_table_name = f"{table_name}_dst"
    # A unique path is necessary for repeatable tests
    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
    files_path = f"{table_name}_data"
    files_to_generate = 300
    tables_num = 3
    dst_tables = [f"{dst_table_name}_{i + 1}" for i in range(tables_num)]

    for i, dst_table in enumerate(dst_tables):
        table = f"{table_name}_{i + 1}"
        create_table(
            started_cluster,
            node,
            table,
            mode,
            files_path,
            additional_settings={
                "keeper_path": keeper_path,
            },
        )
        create_mv(node, table, dst_table)

    total_values = generate_random_files(
        started_cluster, files_path, files_to_generate, row_num=1
    )

    def get_count(table_name):
        return int(run_query(node, f"SELECT count() FROM {table_name}"))

    def get_total_count():
        # Rows are spread over the destination tables, only the sum is meaningful.
        return sum(get_count(dst_table) for dst_table in dst_tables)

    # Poll for up to ~100 seconds until all files have been consumed.
    for _ in range(100):
        if get_total_count() == files_to_generate:
            break
        time.sleep(1)

    total_count = get_total_count()
    if total_count != files_to_generate:
        # keeper_path ends with a random suffix, so the pattern needs a trailing
        # wildcard to actually match the rows that belong to this test.
        info = node.query(
            f"SELECT * FROM system.s3queue WHERE zookeeper_path like '%{table_name}%' ORDER BY file_name FORMAT Vertical"
        )
        logging.debug(info)
        assert False, f"Processed {total_count} rows, expected {files_to_generate}"

    processed_rows = []
    for dst_table in dst_tables:
        processed_rows.extend(
            list(map(int, line.split()))
            for line in node.query(
                f"SELECT column1, column2, column3 FROM {dst_table}"
            ).splitlines()
        )
    assert {tuple(row) for row in processed_rows} == {
        tuple(row) for row in total_values
    }

    # Checking that all files were processed only once
    time.sleep(10)
    assert get_total_count() == files_to_generate
|
|
|
|
|
2023-05-08 07:26:52 +00:00
|
|
|
|
|
|
|
@pytest.mark.parametrize("mode", AVAILABLE_MODES)
def test_multiple_tables_streaming_sync_distributed(started_cluster, mode):
    """Check that two instances sharing one keeper path split the files between them.

    The same table is created on both instances, then random files are
    generated. The test passes when the two destination tables together hold
    exactly ``total_rows`` rows, both instances have processed something, the
    values match the generated ones and no extra rows show up later.
    """
    first_node = started_cluster.instances["instance"]
    second_node = started_cluster.instances["instance2"]
    instances = [first_node, second_node]

    # A unique table name is necessary for repeatable tests
    table_name = (
        f"multiple_tables_streaming_sync_distributed_{mode}_{generate_random_string()}"
    )
    dst_table_name = f"{table_name}_dst"
    keeper_path = f"/clickhouse/test_{table_name}"
    files_path = f"{table_name}_data"
    files_to_generate = 300
    row_num = 50
    total_rows = row_num * files_to_generate

    settings = {
        "keeper_path": keeper_path,
        "s3queue_buckets": 2,
        "polling_max_timeout_ms": 2000,
        "polling_backoff_ms": 1000,
    }
    if mode == "ordered":
        # Parallel processing is disabled for the ordered mode: with more than
        # one processing thread the load between the nodes was too uneven and
        # one node could occasionally take all the work, making the test flaky.
        settings["s3queue_processing_threads_num"] = 1

    for instance in instances:
        create_table(
            started_cluster,
            instance,
            table_name,
            mode,
            files_path,
            additional_settings=settings,
        )

    for instance in instances:
        create_mv(instance, table_name, dst_table_name)

    total_values = generate_random_files(
        started_cluster, files_path, files_to_generate, row_num=row_num
    )

    def get_count(instance, table):
        return int(run_query(instance, f"SELECT count() FROM {table}"))

    def rows_on_both_nodes():
        return get_count(first_node, dst_table_name) + get_count(
            second_node, dst_table_name
        )

    # Poll for up to ~150 seconds until every row has arrived.
    for _ in range(150):
        if rows_on_both_nodes() == total_rows:
            break
        time.sleep(1)

    if rows_on_both_nodes() != total_rows:
        queue_state = first_node.query(
            f"SELECT * FROM system.s3queue WHERE zookeeper_path like '%{table_name}' ORDER BY file_name FORMAT Vertical"
        )
        logging.debug(queue_state)
        assert False

    get_query = f"SELECT column1, column2, column3 FROM {dst_table_name}"

    def read_rows(instance):
        return [
            list(map(int, line.split()))
            for line in run_query(instance, get_query).splitlines()
        ]

    res1 = read_rows(first_node)
    res2 = read_rows(second_node)

    logging.debug(
        f"res1 size: {len(res1)}, res2 size: {len(res2)}, total_rows: {total_rows}"
    )

    assert len(res1) + len(res2) == total_rows

    # Checking that all engines have made progress
    assert len(res1) > 0
    assert len(res2) > 0

    assert {tuple(row) for row in res1 + res2} == {tuple(row) for row in total_values}

    # Checking that all files were processed only once
    time.sleep(10)
    assert rows_on_both_nodes() == total_rows
|
2023-07-30 11:27:01 +00:00
|
|
|
|
2023-05-08 07:26:52 +00:00
|
|
|
|
2023-09-14 16:41:31 +00:00
|
|
|
def test_max_set_age(started_cluster):
    """Check re-processing of files once their tracking entries have aged out.

    The table is created in "unordered" mode with ``tracked_file_ttl_sec`` set
    to ``max_age``. The test then verifies, in order:

    * every generated file is loaded once, then a second time (the row count
      doubles while the number of distinct ``_path`` values stays the same) —
      presumably because the tracking entries expire after ``max_age``;
    * a file with unparsable content increments the
      ``ObjectStorageQueueFailedFiles`` event, is reported with a
      "Cannot parse input" exception, and fails again later;
    * after a server restart the row count doubles once more.
    """
    node = started_cluster.instances["instance"]
    table_name = "max_set_age"
    dst_table_name = f"{table_name}_dst"
    # A unique path is necessary for repeatable tests
    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
    files_path = f"{table_name}_data"
    max_age = 20
    files_to_generate = 10

    create_table(
        started_cluster,
        node,
        table_name,
        "unordered",
        files_path,
        additional_settings={
            "keeper_path": keeper_path,
            "tracked_file_ttl_sec": max_age,
            # NOTE(review): max_age is expressed in seconds while these two
            # settings are named *_ms — confirm the intended unit.
            "cleanup_interval_min_ms": max_age / 3,
            "cleanup_interval_max_ms": max_age / 3,
            "loading_retries": 0,
            "polling_max_timeout_ms": 5000,
            "polling_backoff_ms": 1000,
            "processing_threads_num": 1,
        },
    )
    create_mv(node, table_name, dst_table_name)

    # Only the side effect (files uploaded to storage) matters here.
    generate_random_files(started_cluster, files_path, files_to_generate, row_num=1)

    expected_rows = files_to_generate

    node.wait_for_log_line("Checking node limits")
    node.wait_for_log_line("Node limits check finished")

    def get_count():
        return int(node.query(f"SELECT count() FROM {dst_table_name}"))

    def get_unique_paths_count():
        return int(node.query(f"SELECT uniq(_path) from {dst_table_name}"))

    def wait_for_condition(
        check_function, max_wait_time=1.5 * max_age, description="condition"
    ):
        """Poll ``check_function`` every 0.25s; fail the test on timeout."""
        started_at = time.time()
        while time.time() - started_at < max_wait_time:
            if check_function():
                return
            time.sleep(0.25)
        assert False, f"Timed out after {max_wait_time}s waiting for {description}"

    # First pass: every file is loaded exactly once.
    wait_for_condition(
        lambda: get_count() == expected_rows, description="the first load of all files"
    )
    assert files_to_generate == get_unique_paths_count()

    # Second pass: the same files are loaded again.
    expected_rows *= 2
    wait_for_condition(
        lambda: get_count() == expected_rows, description="the second load of all files"
    )
    assert files_to_generate == get_unique_paths_count()

    paths_count = [
        int(x)
        for x in node.query(
            f"SELECT count() from {dst_table_name} GROUP BY _path"
        ).splitlines()
    ]
    assert files_to_generate == len(paths_count)
    for path_count in paths_count:
        assert 2 == path_count

    def get_object_storage_failures():
        return int(
            node.query(
                "SELECT value FROM system.events WHERE name = 'ObjectStorageQueueFailedFiles' SETTINGS system_events_show_zero_values=1"
            )
        )

    failed_count = get_object_storage_failures()

    # A row that is expected to be rejected with "Cannot parse input".
    values = [
        ["failed", 1, 1],
    ]
    values_csv = (
        "\n".join((",".join(map(str, row)) for row in values)) + "\n"
    ).encode()

    # use a different filename for each test to allow running a bunch of them sequentially with --count
    file_with_error = f"max_set_age_fail_{uuid.uuid4().hex[:8]}.csv"
    put_s3_file_content(started_cluster, f"{files_path}/{file_with_error}", values_csv)

    wait_for_condition(
        lambda: failed_count + 1 == get_object_storage_failures(),
        description="the first failure of the broken file",
    )

    node.query("SYSTEM FLUSH LOGS")
    assert "Cannot parse input" in node.query(
        f"SELECT exception FROM system.s3queue WHERE file_name ilike '%{file_with_error}'"
    )
    assert "Cannot parse input" in node.query(
        f"SELECT exception FROM system.s3queue_log WHERE file_name ilike '%{file_with_error}' ORDER BY processing_end_time DESC LIMIT 1"
    )

    assert 1 == int(
        node.query(
            f"SELECT count() FROM system.s3queue_log WHERE file_name ilike '%{file_with_error}' AND notEmpty(exception)"
        )
    )

    wait_for_condition(
        lambda: failed_count + 2 == get_object_storage_failures(),
        description="the second failure of the broken file",
    )

    node.query("SYSTEM FLUSH LOGS")
    assert "Cannot parse input" in node.query(
        f"SELECT exception FROM system.s3queue WHERE file_name ilike '%{file_with_error}' ORDER BY processing_end_time DESC LIMIT 1"
    )
    assert 1 < int(
        node.query(
            f"SELECT count() FROM system.s3queue_log WHERE file_name ilike '%{file_with_error}' AND notEmpty(exception)"
        )
    )

    node.restart_clickhouse()

    # After the restart the files are expected to be loaded again.
    expected_rows *= 2
    wait_for_condition(
        lambda: get_count() == expected_rows,
        description="the load of all files after restart",
    )
    assert files_to_generate == get_unique_paths_count()
|
|
|
|
|
2023-09-14 16:41:31 +00:00
|
|
|
|
2023-05-08 07:26:52 +00:00
|
|
|
def test_max_set_size(started_cluster):
    """Check the limit on the number of tracked files in "unordered" mode.

    Ten files are generated while ``s3queue_tracked_files_limit`` is 9. The
    test reads the table directly, expects exactly 9 nodes under
    ``{keeper_path}/processed/`` and then expects consecutive reads to return
    the first and the second generated row — presumably because files evicted
    from the tracked set are read again.
    """
    node = started_cluster.instances["instance"]
    table_name = "max_set_size"
    # A unique path is necessary for repeatable tests
    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
    files_path = f"{table_name}_data"
    files_to_generate = 10

    queue_settings = {
        "keeper_path": keeper_path,
        "s3queue_tracked_files_limit": 9,
        "s3queue_cleanup_interval_min_ms": 0,
        "s3queue_cleanup_interval_max_ms": 0,
        "s3queue_processing_threads_num": 1,
    }
    create_table(
        started_cluster,
        node,
        table_name,
        "unordered",
        files_path,
        additional_settings=queue_settings,
    )
    total_values = generate_random_files(
        started_cluster, files_path, files_to_generate, start_ind=0, row_num=1
    )

    get_query = f"SELECT * FROM {table_name} ORDER BY column1, column2, column3"

    def read_rows():
        output = run_query(node, get_query)
        return [list(map(int, line.split())) for line in output.splitlines()]

    # The first direct read returns every generated row.
    rows = read_rows()
    assert rows == sorted(total_values, key=lambda row: (row[0], row[1], row[2]))
    print(total_values)

    time.sleep(10)

    zk = started_cluster.get_kazoo_client("zoo1")
    processed_nodes = zk.get_children(f"{keeper_path}/processed/")
    assert len(processed_nodes) == 9

    rows = read_rows()
    assert rows == [total_values[0]]

    time.sleep(10)
    rows = read_rows()
    assert rows == [total_values[1]]
|
2023-11-06 14:46:29 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_drop_table(started_cluster):
    """Check that a table can be dropped while it is still processing files.

    Large files are generated, the test waits until the server log shows that
    rows from the test's data path are being read, drops the table with SYNC
    and expects one of the two shutdown messages in the server log.
    """
    node = started_cluster.instances["instance"]
    table_name = "test_drop"
    dst_table_name = f"{table_name}_dst"
    # A unique path is necessary for repeatable tests
    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
    files_path = f"{table_name}_data"
    files_to_generate = 300

    queue_settings = {
        "keeper_path": keeper_path,
        "s3queue_processing_threads_num": 5,
    }
    create_table(
        started_cluster,
        node,
        table_name,
        "unordered",
        files_path,
        additional_settings=queue_settings,
    )
    # Only the uploaded files matter; the generated values are not checked.
    generate_random_files(
        started_cluster, files_path, files_to_generate, start_ind=0, row_num=100000
    )
    create_mv(node, table_name, dst_table_name)

    # Make sure processing has actually started before dropping the table.
    node.wait_for_log_line("rows from file: test_drop_data")
    node.query(f"DROP TABLE {table_name} SYNC")

    log_prefix = f"StorageS3Queue (default.{table_name})"
    dropped = node.contains_in_log(f"{log_prefix}: Table is being dropped")
    assert dropped or node.contains_in_log(
        f"{log_prefix}: Shutdown was called, stopping sync"
    )
|
2024-01-07 01:16:29 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_s3_client_reused(started_cluster):
    """Check that processing new files does not create additional S3 clients.

    The 'S3Clients' event counter must grow by exactly one when the table is
    created and must stay unchanged while ten files are uploaded and processed
    one by one.
    """
    node = started_cluster.instances["instance"]
    table_name = "test_s3_client_reused"
    dst_table_name = f"{table_name}_dst"
    files_path = f"{table_name}_data"
    # A unique path is necessary for repeatable tests
    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
    row_num = 10
    public_bucket_files = 10

    def get_created_s3_clients_count():
        raw_value = node.query(
            "SELECT value FROM system.events WHERE event='S3Clients'"
        ).strip()
        # An empty result is treated as a zero counter.
        if raw_value == "":
            return 0
        return int(raw_value)

    def count_dst_rows():
        return int(node.query(f"SELECT count() FROM {dst_table_name}"))

    def wait_all_processed(files_num):
        expected_count = files_num * row_num
        for _ in range(100):
            count = count_dst_rows()
            print(f"{count}/{expected_count}")
            if count == expected_count:
                break
            time.sleep(1)
        assert count_dst_rows() == expected_count

    prepare_public_s3_bucket(started_cluster)

    clients_before_create = get_created_s3_clients_count()

    create_table(
        started_cluster,
        node,
        table_name,
        "ordered",
        files_path,
        additional_settings={
            "after_processing": "delete",
            "s3queue_processing_threads_num": 1,
            "keeper_path": keeper_path,
        },
        auth=NO_AUTH,
        bucket=started_cluster.minio_public_bucket,
    )

    clients_after_create = get_created_s3_clients_count()
    assert clients_before_create + 1 == clients_after_create

    create_mv(node, table_name, dst_table_name)

    for file_index in range(public_bucket_files):
        clients_before_file = get_created_s3_clients_count()

        generate_random_files(
            started_cluster,
            files_path,
            count=1,
            start_ind=file_index,
            row_num=row_num,
            bucket=started_cluster.minio_public_bucket,
        )

        wait_all_processed(file_index + 1)

        clients_after_file = get_created_s3_clients_count()

        assert clients_before_file == clients_after_file
|
2024-01-24 15:04:00 +00:00
|
|
|
|
|
|
|
|
2024-05-29 19:37:42 +00:00
|
|
|
def get_processed_files(node, table_name):
    """Return the base names of the files reported as processed for a table.

    Queries ``system.s3queue`` on ``node`` for rows whose ``zookeeper_path``
    contains ``table_name`` (case-insensitively) and whose status is
    'Processed'.

    :param node: object exposing ``query(sql) -> str``
    :param table_name: substring to look for in ``zookeeper_path``
    :return: list of file names (last path component), ordered by name;
        an empty list when nothing has been processed
    """
    output = node.query(
        f"""
        select splitByChar('/', file_name)[-1] as file
        from system.s3queue where zookeeper_path ilike '%{table_name}%' and status = 'Processed' order by file
        """
    ).strip()
    # "".split("\n") yields [""], which would be counted as one processed file.
    if not output:
        return []
    return output.split("\n")
|
|
|
|
|
2024-05-29 19:37:42 +00:00
|
|
|
|
|
|
|
def get_unprocessed_files(node, table_name, files_num=300):
    """Return the expected file names that are not reported as processed.

    Builds the names ``test_0.csv`` .. ``test_{files_num - 1}.csv`` and keeps
    those that have no 'Processed' row in ``system.s3queue`` for a
    ``zookeeper_path`` containing ``table_name``.

    :param node: object exposing ``query(sql) -> str``
    :param table_name: substring to look for in ``zookeeper_path``
    :param files_num: how many ``test_<N>.csv`` names to check; defaults to
        300, the value that used to be hard-coded in the query
    :return: the raw query output, one file name per line
    """
    return node.query(
        f"""
        select concat('test_', toString(number), '.csv') as file from numbers({files_num})
        where file not
        in (select splitByChar('/', file_name)[-1] from system.s3queue where zookeeper_path ilike '%{table_name}%' and status = 'Processed')
        """
    )
|
|
|
|
|
2024-05-29 19:44:40 +00:00
|
|
|
|
2024-01-24 15:04:00 +00:00
|
|
|
@pytest.mark.parametrize("mode", ["unordered", "ordered"])
def test_processing_threads(started_cluster, mode):
    """Check that a table with many processing threads loads every file.

    One table is created with 32 processing threads; all generated single-row
    files must end up in the destination table. In "ordered" mode the number
    of children under ``{keeper_path}/buckets/`` must equal the number of
    processing threads.
    """
    node = started_cluster.instances["instance"]
    table_name = f"processing_threads_{mode}"
    dst_table_name = f"{table_name}_dst"
    # A unique path is necessary for repeatable tests
    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
    files_path = f"{table_name}_data"
    files_to_generate = 300
    processing_threads = 32

    queue_settings = {
        "keeper_path": keeper_path,
        "s3queue_processing_threads_num": processing_threads,
    }
    create_table(
        started_cluster,
        node,
        table_name,
        mode,
        files_path,
        additional_settings=queue_settings,
    )
    create_mv(node, table_name, dst_table_name)

    total_values = generate_random_files(
        started_cluster, files_path, files_to_generate, row_num=1
    )

    def get_count(table):
        return int(run_query(node, f"SELECT count() FROM {table}"))

    # Poll for up to ~50 seconds until every file has been loaded.
    for _ in range(50):
        if get_count(dst_table_name) == files_to_generate:
            break
        time.sleep(1)

    if get_count(dst_table_name) != files_to_generate:
        # Dump what the queue considers processed to help debugging.
        processed_files = get_processed_files(node, table_name)
        unprocessed_files = get_unprocessed_files(node, table_name)
        current_count = get_count(dst_table_name)
        logging.debug(
            f"Processed files: {len(processed_files)}/{files_to_generate}, unprocessed files: {unprocessed_files}, count: {current_count}"
        )
        assert False

    loaded_rows = []
    for line in node.query(
        f"SELECT column1, column2, column3 FROM {dst_table_name}"
    ).splitlines():
        loaded_rows.append(list(map(int, line.split())))
    assert {tuple(row) for row in loaded_rows} == {tuple(row) for row in total_values}

    if mode == "ordered":
        zk = started_cluster.get_kazoo_client("zoo1")
        nodes = zk.get_children(keeper_path)
        print(f"Metadata nodes: {nodes}")
        processed_nodes = zk.get_children(f"{keeper_path}/buckets/")
        assert len(processed_nodes) == processing_threads
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "mode, processing_threads",
    [
        pytest.param("unordered", 1),
        pytest.param("unordered", 8),
        pytest.param("ordered", 1),
        pytest.param("ordered", 8),
    ],
)
def test_shards(started_cluster, mode, processing_threads):
    """Check that three tables with a shared keeper path and buckets load each file once.

    Three tables on the same instance share ``keeper_path`` and are created
    with ``s3queue_buckets`` equal to the number of tables. The combined row
    count of their destination tables must reach the number of generated
    single-row files and stay there; in "ordered" mode the number of children
    under ``{keeper_path}/buckets/`` must equal ``shards_num``.
    """
    node = started_cluster.instances["instance"]
    table_name = f"test_shards_{mode}_{processing_threads}"
    dst_table_name = f"{table_name}_dst"
    # A unique path is necessary for repeatable tests
    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
    files_path = f"{table_name}_data"
    files_to_generate = 300
    shards_num = 3
    shard_ids = range(1, shards_num + 1)

    for shard_id in shard_ids:
        table = f"{table_name}_{shard_id}"
        dst_table = f"{dst_table_name}_{shard_id}"
        create_table(
            started_cluster,
            node,
            table,
            mode,
            files_path,
            additional_settings={
                "keeper_path": keeper_path,
                "s3queue_processing_threads_num": processing_threads,
                "s3queue_buckets": shards_num,
            },
        )
        create_mv(node, table, dst_table)

    total_values = generate_random_files(
        started_cluster, files_path, files_to_generate, row_num=1
    )

    def get_count(table):
        return int(run_query(node, f"SELECT count() FROM {table}"))

    def count_all_shards():
        return sum(get_count(f"{dst_table_name}_{shard_id}") for shard_id in shard_ids)

    # Poll for up to ~30 seconds until every file has been loaded.
    for _ in range(30):
        count = count_all_shards()
        if count == files_to_generate:
            break
        print(f"Current {count}/{files_to_generate}")
        time.sleep(1)

    if count_all_shards() != files_to_generate:
        # Collect as much diagnostic information as possible before failing.
        processed_output = node.query(
            f"""
            select splitByChar('/', file_name)[-1] as file from system.s3queue
            where zookeeper_path ilike '%{table_name}%' and status = 'Processed' and rows_processed > 0 order by file
            """
        )
        processed_files = processed_output.strip().split("\n")
        logging.debug(
            f"Processed files: {len(processed_files)}/{files_to_generate}: {processed_files}"
        )

        count = count_all_shards()
        logging.debug(f"Processed rows: {count}/{files_to_generate}")

        info = node.query(
            f"""
            select concat('test_', toString(number), '.csv') as file from numbers(300)
            where file not in (select splitByChar('/', file_name)[-1] from system.s3queue
            where zookeeper_path ilike '%{table_name}%' and status = 'Processed' and rows_processed > 0)
            """
        )
        logging.debug(f"Unprocessed files: {info}")

        assert False

    loaded_rows = []
    for shard_id in shard_ids:
        shard_output = node.query(
            f"SELECT column1, column2, column3 FROM {dst_table_name}_{shard_id}"
        )
        loaded_rows += [list(map(int, line.split())) for line in shard_output.splitlines()]
    assert {tuple(row) for row in loaded_rows} == {tuple(row) for row in total_values}

    # Checking that all files were processed only once
    time.sleep(10)
    assert count_all_shards() == files_to_generate

    if mode == "ordered":
        zk = started_cluster.get_kazoo_client("zoo1")
        processed_nodes = zk.get_children(f"{keeper_path}/buckets/")
        assert len(processed_nodes) == shards_num
|
2024-01-24 15:04:00 +00:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "mode, processing_threads",
    [
        pytest.param("unordered", 1),
        pytest.param("unordered", 8),
        pytest.param("ordered", 1),
        pytest.param("ordered", 2),
    ],
)
def test_shards_distributed(started_cluster, mode, processing_threads):
    """Check that two instances sharing one keeper path split the work.

    Both instances create the same queue table (same keeper path, two
    buckets) with a materialized view into a local destination table.
    The test then verifies that:

    * the destination tables together receive exactly ``total_rows`` rows,
    * each instance processed at least one row,
    * the set of inserted rows equals the set of generated rows,
    * the row count stays the same after a pause and after restarting the
      first instance (i.e. nothing is processed twice),
    * in ``ordered`` mode the keeper ``buckets`` node has ``shards_num``
      children.
    """
    node = started_cluster.instances["instance"]
    node_2 = started_cluster.instances["instance2"]
    table_name = f"test_shards_distributed_{mode}_{processing_threads}"
    dst_table_name = f"{table_name}_dst"
    # A unique path is necessary for repeatable tests
    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
    files_path = f"{table_name}_data"
    files_to_generate = 600
    row_num = 1000
    total_rows = row_num * files_to_generate
    shards_num = 2

    for instance in [node, node_2]:
        create_table(
            started_cluster,
            instance,
            table_name,
            mode,
            files_path,
            additional_settings={
                "keeper_path": keeper_path,
                "s3queue_processing_threads_num": processing_threads,
                "s3queue_buckets": shards_num,
                "polling_max_timeout_ms": 1000,
                "polling_backoff_ms": 0,
            },
        )

    for instance in [node, node_2]:
        create_mv(instance, table_name, dst_table_name)

    total_values = generate_random_files(
        started_cluster, files_path, files_to_generate, row_num=row_num
    )

    def get_count(node, table_name):
        return int(run_query(node, f"SELECT count() FROM {table_name}"))

    def get_total_count():
        # Rows inserted into the destination tables of both instances.
        return get_count(node, dst_table_name) + get_count(node_2, dst_table_name)

    def get_processed_files(instance):
        # splitlines() returns [] for an empty result, so the reported
        # number of processed files is 0 (not 1) when nothing was processed.
        return instance.query(
            f"""
            select splitByChar('/', file_name)[-1] as file from system.s3queue
            where zookeeper_path ilike '%{table_name}%' and status = 'Processed' and rows_processed > 0
            order by file
            """
        ).splitlines()

    def print_debug_info():
        # Diagnostics only: dumps per-node progress to the debug log.
        files1 = get_processed_files(node)
        files2 = get_processed_files(node_2)
        logging.debug(f"Processed files by node 1: {len(files1)}/{files_to_generate}")
        logging.debug(f"Processed files by node 2: {len(files2)}/{files_to_generate}")
        logging.debug(f"Processed rows: {get_total_count()}/{total_rows}")

        # Cover every generated file, not just a fixed prefix of them.
        # NOTE(review): assumes generate_random_files names files
        # test_<index>.csv starting from index 0 - confirm against the helper.
        info = node.query(
            f"""
            select concat('test_', toString(number), '.csv') as file from numbers({files_to_generate})
            where file not in (select splitByChar('/', file_name)[-1] from clusterAllReplicas(default, system.s3queue)
            where zookeeper_path ilike '%{table_name}%' and status = 'Processed' and rows_processed > 0)
            """
        )
        logging.debug(f"Unprocessed files: {info}")

        # Files reported as processed by both nodes at once.
        intersecting = sorted(set(files1) & set(files2))
        logging.debug(f"Intersecting files: {intersecting}")

    # Poll for up to ~30 seconds until all rows have arrived.
    for _ in range(30):
        if get_total_count() == total_rows:
            break
        time.sleep(1)

    count = get_total_count()
    if count != total_rows:
        print_debug_info()
    assert count == total_rows, f"Processed rows: {count}/{total_rows}"

    get_query = f"SELECT column1, column2, column3 FROM {dst_table_name}"
    res1 = [list(map(int, l.split())) for l in run_query(node, get_query).splitlines()]
    res2 = [
        list(map(int, l.split())) for l in run_query(node_2, get_query).splitlines()
    ]

    # Dump diagnostics only when one of the assertions below is going to fail.
    if len(res1) + len(res2) != total_rows or not res1 or not res2:
        logging.debug(
            f"res1 size: {len(res1)}, res2 size: {len(res2)}, total_rows: {total_rows}"
        )
        print_debug_info()

    assert len(res1) + len(res2) == total_rows

    # Checking that all engines have made progress
    assert len(res1) > 0
    assert len(res2) > 0

    assert {tuple(v) for v in res1 + res2} == {tuple(i) for i in total_values}

    # Checking that all files were processed only once
    time.sleep(10)
    assert get_total_count() == total_rows

    if mode == "ordered":
        zk = started_cluster.get_kazoo_client("zoo1")
        processed_nodes = zk.get_children(f"{keeper_path}/buckets/")
        assert len(processed_nodes) == shards_num

    # A restart must not cause any file to be processed again.
    node.restart_clickhouse()
    time.sleep(10)
    assert get_total_count() == total_rows
|
|
|
|
|
2024-01-29 11:36:00 +00:00
|
|
|
|
|
|
|
def test_settings_check(started_cluster):
    """Creating a second table on the same keeper path with a different
    ``s3queue_buckets`` value must be rejected with a metadata-mismatch error.
    """
    instances = started_cluster.instances
    first = instances["instance"]
    second = instances["instance2"]

    table_name = "test_settings_check"
    # A unique path is necessary for repeatable tests
    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
    files_path = f"{table_name}_data"
    mode = "ordered"

    def settings_with_buckets(buckets):
        # Identical settings for both tables except for the bucket count.
        return {
            "keeper_path": keeper_path,
            "s3queue_processing_threads_num": 5,
            "s3queue_buckets": buckets,
        }

    create_table(
        started_cluster,
        first,
        table_name,
        mode,
        files_path,
        additional_settings=settings_with_buckets(2),
    )

    # With expect_error=True the result of create_table is searched for the
    # expected error text.
    error = create_table(
        started_cluster,
        second,
        table_name,
        mode,
        files_path,
        additional_settings=settings_with_buckets(3),
        expect_error=True,
    )
    expected_message = "Existing table metadata in ZooKeeper differs in buckets setting. Stored in ZooKeeper: 2, local: 3"
    assert expected_message in error

    first.query(f"DROP TABLE {table_name} SYNC")
|
2024-01-31 20:28:18 +00:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("processing_threads", [1, 5])
def test_processed_file_setting(started_cluster, processing_threads):
    """Check ``s3queue_last_processed_path`` on a single instance.

    The table is created in ordered mode with the last processed path set to
    ``<files_path>/test_5.csv``; ten one-row files are then generated and
    exactly four rows are expected in the destination table, both before and
    after a server restart.
    """
    node = started_cluster.instances["instance"]
    table_name = f"test_processed_file_setting_{processing_threads}"
    dst_table_name = f"{table_name}_dst"
    # A unique path is necessary for repeatable tests
    random_suffix = generate_random_string()
    keeper_path = f"/clickhouse/test_{table_name}_{processing_threads}_{random_suffix}"
    files_path = f"{table_name}_data"
    files_to_generate = 10

    queue_settings = {
        "keeper_path": keeper_path,
        "s3queue_processing_threads_num": processing_threads,
        "s3queue_last_processed_path": f"{files_path}/test_5.csv",
    }
    create_table(
        started_cluster,
        node,
        table_name,
        "ordered",
        files_path,
        additional_settings=queue_settings,
    )
    generate_random_files(
        started_cluster, files_path, files_to_generate, start_ind=0, row_num=1
    )

    create_mv(node, table_name, dst_table_name)

    def get_count():
        return int(node.query(f"SELECT count() FROM {dst_table_name}"))

    def wait_for_rows(expected):
        # Poll once per second, at most 20 times, then assert on a fresh count.
        attempts_left = 20
        while attempts_left > 0 and expected != get_count():
            time.sleep(1)
            attempts_left -= 1
        assert expected == get_count()

    wait_for_rows(4)

    node.restart_clickhouse()
    # Leave time for any unexpected re-processing to show up in the count.
    time.sleep(10)

    wait_for_rows(4)
|
2024-01-31 21:59:25 +00:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("processing_threads", [1, 5])
def test_processed_file_setting_distributed(started_cluster, processing_threads):
    """Check ``s3queue_last_processed_path`` with two instances and two buckets.

    Both instances create the same ordered-mode table on a shared keeper path
    with the last processed path set to ``<files_path>/test_5.csv``. After ten
    one-row files are generated, the two destination tables must hold four
    rows in total, before and after restarting both servers.
    """
    node = started_cluster.instances["instance"]
    node_2 = started_cluster.instances["instance2"]
    both_nodes = [node, node_2]

    table_name = f"test_processed_file_setting_distributed_{processing_threads}"
    dst_table_name = f"{table_name}_dst"
    # A unique path is necessary for repeatable tests
    random_suffix = generate_random_string()
    keeper_path = f"/clickhouse/test_{table_name}_{processing_threads}_{random_suffix}"
    files_path = f"{table_name}_data"
    files_to_generate = 10

    for instance in both_nodes:
        create_table(
            started_cluster,
            instance,
            table_name,
            "ordered",
            files_path,
            additional_settings={
                "keeper_path": keeper_path,
                "s3queue_processing_threads_num": processing_threads,
                "s3queue_last_processed_path": f"{files_path}/test_5.csv",
                "s3queue_buckets": 2,
                "polling_max_timeout_ms": 2000,
                "polling_backoff_ms": 1000,
            },
        )

    generate_random_files(
        started_cluster, files_path, files_to_generate, start_ind=0, row_num=1
    )

    for instance in both_nodes:
        create_mv(instance, table_name, dst_table_name)

    def get_count():
        # Sum of the rows in the destination tables of both instances.
        query = f"SELECT count() FROM {dst_table_name}"
        first_count = int(node.query(query))
        second_count = int(node_2.query(query))
        return first_count + second_count

    def wait_for_rows(expected):
        # Poll once per second, at most 20 times, then assert on a fresh count.
        attempts_left = 20
        while attempts_left > 0 and expected != get_count():
            time.sleep(1)
            attempts_left -= 1
        assert expected == get_count()

    wait_for_rows(4)

    for instance in both_nodes:
        instance.restart_clickhouse()

    # Leave time for any unexpected re-processing to show up in the count.
    time.sleep(10)
    wait_for_rows(4)
|
2024-02-12 19:23:21 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_upgrade(started_cluster):
    """Rows loaded by the ``old_instance`` server must still be there, with
    the same count, after restarting it with the latest version.
    """
    node = started_cluster.instances["old_instance"]

    table_name = "test_upgrade"
    dst_table_name = f"{table_name}_dst"
    # A unique path is necessary for repeatable tests
    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
    files_path = f"{table_name}_data"
    files_to_generate = 10

    queue_settings = {
        "keeper_path": keeper_path,
        "after_processing": "keep",
    }
    create_table(
        started_cluster,
        node,
        table_name,
        "ordered",
        files_path,
        additional_settings=queue_settings,
    )
    generate_random_files(
        started_cluster, files_path, files_to_generate, start_ind=0, row_num=1
    )

    create_mv(node, table_name, dst_table_name)

    def get_count():
        return int(node.query(f"SELECT count() FROM {dst_table_name}"))

    expected_rows = 10

    # Poll once per second, at most 20 times, until all rows have arrived.
    attempts_left = 20
    while attempts_left > 0 and expected_rows != get_count():
        time.sleep(1)
        attempts_left -= 1

    assert expected_rows == get_count()

    node.restart_with_latest_version()

    assert expected_rows == get_count()
|
2024-06-10 11:03:50 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_exception_during_insert(started_cluster):
    """Files whose insert fails with "Too many parts" are processed later.

    The test waits for the "Too many parts" failure in the server log and in
    ``system.s3queue``, then rewrites ``parts_to_throw_insert`` from 0 to 10
    in the merge tree config, restarts the server and expects all ten rows
    to arrive. The config change is reverted in ``finally``.
    """
    node = started_cluster.instances["instance_too_many_parts"]

    # A unique table name is necessary for repeatable tests
    table_name = f"test_exception_during_insert_{generate_random_string()}"
    dst_table_name = f"{table_name}_dst"
    keeper_path = f"/clickhouse/test_{table_name}"
    files_path = f"{table_name}_data"
    files_to_generate = 10

    create_table(
        started_cluster,
        node,
        table_name,
        "unordered",
        files_path,
        additional_settings={"keeper_path": keeper_path},
    )
    node.rotate_logs()
    generate_random_files(
        started_cluster, files_path, files_to_generate, start_ind=0, row_num=1
    )

    create_mv(node, table_name, dst_table_name)

    node.wait_for_log_line(
        "Failed to process data: Code: 252. DB::Exception: Too many parts"
    )

    time.sleep(2)
    exception = node.query(
        f"SELECT exception FROM system.s3queue WHERE zookeeper_path ilike '%{table_name}%' and notEmpty(exception)"
    )
    assert "Too many parts" in exception

    original_parts_to_throw_insert = 0
    modified_parts_to_throw_insert = 10

    def switch_parts_to_throw_insert(old_value, new_value):
        # Textual replacement of the setting value in the server config.
        node.replace_in_config(
            "/etc/clickhouse-server/config.d/merge_tree.xml",
            f"parts_to_throw_insert>{old_value}",
            f"parts_to_throw_insert>{new_value}",
        )

    def get_count():
        return int(node.query(f"SELECT count() FROM {dst_table_name}"))

    switch_parts_to_throw_insert(
        original_parts_to_throw_insert, modified_parts_to_throw_insert
    )
    try:
        node.restart_clickhouse()

        expected_rows = 10
        # Poll once per second, at most 20 times.
        attempts_left = 20
        while attempts_left > 0 and expected_rows != get_count():
            time.sleep(1)
            attempts_left -= 1
        assert expected_rows == get_count()
    finally:
        # Always put the original config back so other tests are unaffected.
        switch_parts_to_throw_insert(
            modified_parts_to_throw_insert, original_parts_to_throw_insert
        )
        node.restart_clickhouse()
|
2024-06-14 12:24:06 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_commit_on_limit(started_cluster):
    """Check a queue with ``s3queue_max_processed_files_before_commit`` = 10
    and one unparsable file among the inputs.

    Ten random files plus five hand-written ones are uploaded; one of the
    hand-written files (``test_9999.csv``) contains a non-numeric value. The
    test expects that file to be reported as failed exactly once (log line,
    ``ObjectStorageQueueFailedFiles`` event, ``system.s3queue`` status) while
    the generated files and ``test_999999.csv`` are reported as processed.
    """
    node = started_cluster.instances["instance"]

    # A unique table name is necessary for repeatable tests
    table_name = f"test_commit_on_limit_{generate_random_string()}"
    dst_table_name = f"{table_name}_dst"
    keeper_path = f"/clickhouse/test_{table_name}"
    files_path = f"{table_name}_data"
    files_to_generate = 10

    def failed_files_event():
        # Current value of the server-wide failed-files counter.
        return int(
            node.query(
                "SELECT value FROM system.events WHERE name = 'ObjectStorageQueueFailedFiles' SETTINGS system_events_show_zero_values=1"
            )
        )

    failed_files_event_before = failed_files_event()

    create_table(
        started_cluster,
        node,
        table_name,
        "ordered",
        files_path,
        additional_settings={
            "keeper_path": keeper_path,
            "s3queue_processing_threads_num": 1,
            "s3queue_loading_retries": 0,
            "s3queue_max_processed_files_before_commit": 10,
        },
    )
    generate_random_files(
        started_cluster, files_path, files_to_generate, start_ind=0, row_num=1
    )

    def to_csv_bytes(rows):
        # One comma-separated line per row, newline-terminated, as bytes.
        lines = [",".join(str(cell) for cell in row) for row in rows]
        return ("\n".join(lines) + "\n").encode()

    incorrect_values_csv = to_csv_bytes([["failed", 1, 1]])
    correct_values_csv = to_csv_bytes([[1, 1, 1]])

    extra_files = [
        ("test_99.csv", correct_values_csv),
        ("test_999.csv", correct_values_csv),
        ("test_9999.csv", incorrect_values_csv),
        ("test_99999.csv", correct_values_csv),
        ("test_999999.csv", correct_values_csv),
    ]
    for file_name, content in extra_files:
        put_s3_file_content(started_cluster, f"{files_path}/{file_name}", content)

    create_mv(node, table_name, dst_table_name)

    def get_processed_files():
        output = node.query(
            f"SELECT file_name FROM system.s3queue WHERE zookeeper_path ilike '%{table_name}%' and status = 'Processed' and rows_processed > 0 "
        )
        return output.strip().split("\n")

    def get_failed_files():
        output = node.query(
            f"SELECT file_name FROM system.s3queue WHERE zookeeper_path ilike '%{table_name}%' and status = 'Failed'"
        )
        return output.strip().split("\n")

    # Poll once per second, at most 30 times, for the last extra file.
    attempts_left = 30
    while attempts_left > 0 and "test_999999.csv" not in get_processed_files():
        time.sleep(1)
        attempts_left -= 1

    assert "test_999999.csv" in get_processed_files()

    assert 1 == int(
        node.count_in_log(f"Setting file {files_path}/test_9999.csv as failed")
    )
    assert failed_files_event_before + 1 == failed_files_event()

    processed = get_processed_files()
    for index in range(files_to_generate):
        assert "test_" + str(index) + ".csv" in processed

    failed = get_failed_files()
    for bad_file in ["test_9999.csv"]:
        assert bad_file not in processed
        assert bad_file in failed
|
2024-09-09 11:41:48 +00:00
|
|
|
|
|
|
|
|
2024-09-19 11:58:59 +00:00
|
|
|
def test_upgrade_2(started_cluster):
    """A table created on the ``instance_24.5`` server with the
    ``s3queue_current_shard_num`` setting must still be listed by
    ``SHOW TABLES`` after restarting with the latest version.
    """
    node = started_cluster.instances["instance_24.5"]

    # The random suffix keeps the table name (and the keeper path derived
    # from it) unique, which is necessary for repeatable tests.
    table_name = f"test_upgrade_2_{uuid.uuid4().hex[:8]}"
    dst_table_name = f"{table_name}_dst"
    keeper_path = f"/clickhouse/test_{table_name}"
    files_path = f"{table_name}_data"
    files_to_generate = 10

    queue_settings = {
        "keeper_path": keeper_path,
        "s3queue_current_shard_num": 0,
        "s3queue_processing_threads_num": 2,
    }
    create_table(
        started_cluster,
        node,
        table_name,
        "ordered",
        files_path,
        additional_settings=queue_settings,
    )
    generate_random_files(
        started_cluster, files_path, files_to_generate, start_ind=0, row_num=1
    )

    create_mv(node, table_name, dst_table_name)

    def get_count():
        return int(node.query(f"SELECT count() FROM {dst_table_name}"))

    expected_rows = 10

    # Poll once per second, at most 20 times, until all rows have arrived.
    attempts_left = 20
    while attempts_left > 0 and expected_rows != get_count():
        time.sleep(1)
        attempts_left -= 1

    assert expected_rows == get_count()

    node.restart_with_latest_version()
    tables_after_upgrade = node.query("SHOW TABLES")
    assert table_name in tables_after_upgrade
|
2024-09-26 10:43:54 +00:00
|
|
|
|
|
|
|
|
2024-09-09 11:41:48 +00:00
|
|
|
def test_replicated(started_cluster):
    """Check a queue table created inside a Replicated database ``r``.

    The table is created through ``node1`` only; materialized views are then
    created on both nodes, and the destination tables across all replicas
    must end up holding one row per generated file.
    """
    node1 = started_cluster.instances["node1"]
    node2 = started_cluster.instances["node2"]
    replicas = [("node1", node1), ("node2", node2)]

    table_name = f"test_replicated_{uuid.uuid4().hex[:8]}"
    dst_table_name = f"{table_name}_dst"
    keeper_path = f"/clickhouse/test_{table_name}"
    files_path = f"{table_name}_data"
    files_to_generate = 1000

    # Start from a clean state on both replicas before re-creating the database.
    for _, replica in replicas:
        replica.query("DROP DATABASE IF EXISTS r")

    for replica_name, replica in replicas:
        replica.query(
            f"CREATE DATABASE r ENGINE=Replicated('/clickhouse/databases/replicateddb', 'shard1', '{replica_name}')"
        )

    create_table(
        started_cluster,
        node1,
        table_name,
        "ordered",
        files_path,
        additional_settings={"keeper_path": keeper_path},
        database_name="r",
    )

    # No thread count was passed above; the metadata stored in keeper is
    # expected to contain 16.
    keeper_metadata = node1.query(
        f"SELECT * FROM system.zookeeper WHERE path = '{keeper_path}'"
    )
    assert '"processing_threads_num":16' in keeper_metadata

    generate_random_files(
        started_cluster, files_path, files_to_generate, start_ind=0, row_num=1
    )

    for _, replica in replicas:
        create_mv(replica, f"r.{table_name}", dst_table_name)

    def get_count():
        # Total rows in the destination tables across all replicas.
        result = node1.query(
            f"SELECT count() FROM clusterAllReplicas(cluster, default.{dst_table_name})"
        )
        return int(result)

    expected_rows = files_to_generate
    # Poll once per second, at most 20 times.
    attempts_left = 20
    while attempts_left > 0 and expected_rows != get_count():
        time.sleep(1)
        attempts_left -= 1
    assert expected_rows == get_count()
|
2024-10-16 14:07:39 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_bad_settings(started_cluster):
    """Creating an ordered-mode table with ``processing_threads_num`` = 1 and
    ``buckets`` = 0 on the ``node_cloud_mode`` instance must fail with the
    "Ordered mode in cloud without either" error.
    """
    node = started_cluster.instances["node_cloud_mode"]

    table_name = f"test_bad_settings_{uuid.uuid4().hex[:8]}"
    keeper_path = f"/clickhouse/test_{table_name}"
    files_path = f"{table_name}_data"

    # pytest.raises reports "DID NOT RAISE" when table creation unexpectedly
    # succeeds, instead of a confusing message-mismatch failure.
    with pytest.raises(Exception) as exc_info:
        create_table(
            started_cluster,
            node,
            table_name,
            "ordered",
            files_path,
            additional_settings={
                "keeper_path": keeper_path,
                "processing_threads_num": 1,
                "buckets": 0,
            },
        )

    assert "Ordered mode in cloud without either" in str(exc_info.value)
|
2024-10-17 17:41:52 +00:00
|
|
|
|
|
|
|
|
2024-10-18 15:12:08 +00:00
|
|
|
def test_processing_threads(started_cluster):
    """A table created without an explicit thread count must report
    ``processing_threads_num`` = 16 in keeper metadata, in
    ``system.s3_queue_settings`` and in the server log, and must still load
    all generated rows.
    """
    node = started_cluster.instances["node1"]

    # The random suffix keeps the table name (and the keeper path derived
    # from it) unique, which is necessary for repeatable tests.
    table_name = f"test_processing_threads_{uuid.uuid4().hex[:8]}"
    dst_table_name = f"{table_name}_dst"
    keeper_path = f"/clickhouse/test_{table_name}"
    files_path = f"{table_name}_data"
    files_to_generate = 10

    create_table(
        started_cluster,
        node,
        table_name,
        "ordered",
        files_path,
        additional_settings={"keeper_path": keeper_path},
    )

    keeper_metadata = node.query(
        f"SELECT * FROM system.zookeeper WHERE path = '{keeper_path}'"
    )
    assert '"processing_threads_num":16' in keeper_metadata

    local_threads = node.query(
        f"SELECT value FROM system.s3_queue_settings WHERE table = '{table_name}' and name = 'processing_threads_num'"
    )
    assert 16 == int(local_threads)

    generate_random_files(
        started_cluster, files_path, files_to_generate, start_ind=0, row_num=1
    )

    create_mv(node, table_name, dst_table_name)

    def get_count():
        return int(node.query(f"SELECT count() FROM {dst_table_name}"))

    expected_rows = 10
    # Poll once per second, at most 20 times, until all rows have arrived.
    attempts_left = 20
    while attempts_left > 0 and expected_rows != get_count():
        time.sleep(1)
        attempts_left -= 1

    assert expected_rows == get_count()

    expected_log_line = (
        f"StorageS3Queue (default.{table_name}): Using 16 processing threads"
    )
    assert node.contains_in_log(expected_log_line)
|
2024-10-24 18:17:47 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_alter_settings(started_cluster):
    """Check ``ALTER TABLE ... MODIFY SETTING`` / ``RESET SETTING`` on a queue
    table inside a Replicated database.

    After each ALTER the expected values are checked on both nodes, before
    and after a server restart. Settings listed in ``with_keeper`` are also
    looked up in the keeper metadata; all settings are looked up in
    ``system.s3_queue_settings``.
    """
    node1 = started_cluster.instances["node1"]
    node2 = started_cluster.instances["node2"]
    replicas = [("node1", node1), ("node2", node2)]

    table_name = f"test_alter_settings_{uuid.uuid4().hex[:8]}"
    dst_table_name = f"{table_name}_dst"
    keeper_path = f"/clickhouse/test_{table_name}"
    files_path = f"{table_name}_data"
    files_to_generate = 1000

    # Start from a clean state on both replicas before re-creating the database.
    for _, replica in replicas:
        replica.query("DROP DATABASE IF EXISTS r")

    for replica_name, replica in replicas:
        replica.query(
            f"CREATE DATABASE r ENGINE=Replicated('/clickhouse/databases/{table_name}', 'shard1', '{replica_name}')"
        )

    create_table(
        started_cluster,
        node1,
        table_name,
        "unordered",
        files_path,
        additional_settings={
            "keeper_path": keeper_path,
            "processing_threads_num": 10,
            "loading_retries": 20,
        },
        database_name="r",
    )

    def keeper_metadata(instance):
        # Raw dump of the keeper node that stores the table metadata.
        return instance.query(
            f"SELECT * FROM system.zookeeper WHERE path = '{keeper_path}'"
        )

    def local_setting(instance, name):
        # Value of one setting as reported by the instance itself.
        return instance.query(
            f"SELECT value FROM system.s3_queue_settings WHERE name = '{name}' and table = '{table_name}'"
        ).strip()

    initial_fragments = (
        '"processing_threads_num":10',
        '"loading_retries":20',
        '"after_processing":"keep"',
    )
    for fragment in initial_fragments:
        assert fragment in keeper_metadata(node1)

    generate_random_files(
        started_cluster, files_path, files_to_generate, start_ind=0, row_num=1
    )

    for _, replica in replicas:
        create_mv(replica, f"r.{table_name}", dst_table_name)

    def get_count():
        # Total rows in the destination tables across all replicas.
        result = node1.query(
            f"SELECT count() FROM clusterAllReplicas(cluster, default.{dst_table_name})"
        )
        return int(result)

    expected_rows = files_to_generate
    # Poll once per second, at most 20 times, until all rows have arrived.
    attempts_left = 20
    while attempts_left > 0 and expected_rows != get_count():
        time.sleep(1)
        attempts_left -= 1
    assert expected_rows == get_count()

    node1.query(
        f"""
        ALTER TABLE r.{table_name}
        MODIFY SETTING processing_threads_num=5,
        loading_retries=10,
        after_processing='delete',
        tracked_files_limit=50,
        tracked_file_ttl_sec=10000,
        polling_min_timeout_ms=222,
        polling_max_timeout_ms=333,
        polling_backoff_ms=111
        """
    )

    int_settings = {
        "processing_threads_num": 5,
        "loading_retries": 10,
        "tracked_files_ttl_sec": 10000,
        "tracked_files_limit": 50,
        "polling_min_timeout_ms": 222,
        "polling_max_timeout_ms": 333,
        "polling_backoff_ms": 111,
    }
    string_settings = {"after_processing": "delete"}

    def with_keeper(setting):
        # Settings that are additionally looked up in the keeper metadata.
        return setting in {
            "after_processing",
            "loading_retries",
            "processing_threads_num",
            "tracked_files_limit",
            "tracked_files_ttl_sec",
        }

    def check_int_settings(instance, settings):
        for name, expected in settings.items():
            if with_keeper(name):
                assert f'"{name}":{expected}' in keeper_metadata(instance)
            # The settings table is queried with the singular "file" spelling
            # for this one setting.
            local_name = (
                "tracked_file_ttl_sec" if name == "tracked_files_ttl_sec" else name
            )
            assert str(expected) == local_setting(instance, local_name)

    def check_string_settings(instance, settings):
        for name, expected in settings.items():
            if with_keeper(name):
                # String values are quoted in the keeper metadata.
                assert f'"{name}":"{expected}"' in keeper_metadata(instance)
            assert str(expected) == local_setting(instance, name)

    for _, replica in replicas:
        check_int_settings(replica, int_settings)
        check_string_settings(replica, string_settings)

        replica.restart_clickhouse()

        check_int_settings(replica, int_settings)
        check_string_settings(replica, string_settings)

    node1.query(
        f"""
        ALTER TABLE r.{table_name} RESET SETTING after_processing, tracked_file_ttl_sec
        """
    )

    int_settings = {
        "processing_threads_num": 5,
        "loading_retries": 10,
        "tracked_files_ttl_sec": 0,
        "tracked_files_limit": 50,
    }
    string_settings = {"after_processing": "keep"}

    for _, replica in replicas:
        check_int_settings(replica, int_settings)
        check_string_settings(replica, string_settings)

        replica.restart_clickhouse()
        assert expected_rows == get_count()

        check_int_settings(replica, int_settings)
        check_string_settings(replica, string_settings)
|
2024-12-03 17:17:25 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_list_and_delete_race(started_cluster):
    """Exercise the list & delete race between two instances sharing a queue.

    The same table (identical ``keeper_path``, ``after_processing='delete'``,
    ``tracked_files_limit=1``) is created on ``instance`` and ``instance2``.
    One batch of files is uploaded synchronously and ``threads`` more batches
    are uploaded from a thread pool while ``create_mv`` is called on both
    instances, so files are listed and deleted while new ones keep arriving.

    The test passes when:
      * the destination tables of both instances together hold exactly
        ``row_num * files_to_generate * (threads + 1)`` rows, and
      * at least one instance logged
        "because of the race with list & delete".
    """
    node = started_cluster.instances["instance"]
    node_2 = started_cluster.instances["instance2"]
    table_name = f"list_and_delete_race_{generate_random_string()}"
    dst_table_name = f"{table_name}_dst"
    keeper_path = f"/clickhouse/test_{table_name}"
    files_path = f"{table_name}_data"
    files_to_generate = 1000
    row_num = 10

    for instance in [node, node_2]:
        create_table(
            started_cluster,
            instance,
            table_name,
            "unordered",
            files_path,
            additional_settings={
                "keeper_path": keeper_path,
                "tracked_files_limit": 1,
                "polling_max_timeout_ms": 0,
                "processing_threads_num": 1,
                "polling_min_timeout_ms": 200,
                "cleanup_interval_min_ms": 0,
                "cleanup_interval_max_ms": 0,
                "polling_backoff_ms": 100,
                "after_processing": "delete",
            },
        )

    # Number of concurrent generator batches; one extra batch is produced
    # synchronously below, hence the `threads + 1` in the expected total.
    threads = 10
    total_rows = row_num * files_to_generate * (threads + 1)

    def generate(_):
        generate_random_files(
            started_cluster,
            files_path,
            files_to_generate,
            row_num=row_num,
            use_random_names=True,
        )

    # Seed the bucket before any view exists.
    generate(0)

    # The context manager tears the pool down even if a step below raises,
    # so worker threads do not outlive the test.
    with Pool(threads) as busy_pool:
        pending = busy_pool.map_async(generate, range(threads))

        # Attach the views while uploads are still in flight.
        create_mv(node, table_name, dst_table_name)
        time.sleep(2)
        create_mv(node_2, table_name, dst_table_name)

        # get() (unlike wait()) re-raises an exception from any worker, so a
        # failed upload is reported directly instead of as a row-count timeout.
        pending.get()

    def get_count(instance, table):
        return int(run_query(instance, f"SELECT count() FROM {table}"))

    def get_total_count():
        return get_count(node, dst_table_name) + get_count(node_2, dst_table_name)

    # Poll for up to ~150 seconds until every generated row has been consumed.
    for _ in range(150):
        if get_total_count() == total_rows:
            break
        time.sleep(1)

    assert get_total_count() == total_rows

    get_query = f"SELECT column1, column2, column3 FROM {dst_table_name}"
    res1 = [
        list(map(int, line.split()))
        for line in run_query(node, get_query).splitlines()
    ]
    res2 = [
        list(map(int, line.split()))
        for line in run_query(node_2, get_query).splitlines()
    ]

    logging.debug(
        "res1 size: %s, res2 size: %s, total_rows: %s",
        len(res1),
        len(res2),
        total_rows,
    )

    assert len(res1) + len(res2) == total_rows

    # The scenario is only meaningful if the race was actually hit on at
    # least one of the instances.
    race_message = "because of the race with list & delete"
    assert node.contains_in_log(race_message) or node_2.contains_in_log(
        race_message
    )
|