2021-12-21 12:07:10 +00:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
2023-08-10 20:41:41 +00:00
|
|
|
|
import argparse
|
2021-12-21 12:07:10 +00:00
|
|
|
|
import logging
|
|
|
|
|
import os
|
2021-12-24 10:31:34 +00:00
|
|
|
|
import sys
|
2023-08-10 20:41:41 +00:00
|
|
|
|
import time
|
|
|
|
|
from pathlib import Path
|
2023-09-22 11:16:46 +00:00
|
|
|
|
from typing import Any, List
|
2022-11-04 13:40:13 +00:00
|
|
|
|
|
2022-11-15 13:52:40 +00:00
|
|
|
|
import boto3 # type: ignore
|
2024-05-17 13:28:37 +00:00
|
|
|
|
|
2024-01-16 18:12:09 +00:00
|
|
|
|
from build_download_helper import (
|
|
|
|
|
read_build_urls,
|
|
|
|
|
)
|
2023-01-03 14:23:19 +00:00
|
|
|
|
from compress_files import compress_fast
|
2024-07-24 17:51:34 +00:00
|
|
|
|
from env_helper import REPO_COPY, REPORT_PATH, TEMP_PATH
|
2024-01-04 15:35:09 +00:00
|
|
|
|
from get_robot_token import get_parameter_from_ssm
|
2021-12-21 12:07:10 +00:00
|
|
|
|
from pr_info import PRInfo
|
2024-02-06 12:39:34 +00:00
|
|
|
|
from report import FAILURE, SUCCESS, JobReport, TestResult, TestResults
|
2023-01-03 14:23:19 +00:00
|
|
|
|
from ssh import SSHKey
|
|
|
|
|
from stopwatch import Stopwatch
|
|
|
|
|
from tee_popen import TeePopen
|
2024-06-10 09:18:03 +00:00
|
|
|
|
from ci_config import CI
|
2024-01-04 15:35:09 +00:00
|
|
|
|
|
2022-03-22 16:39:58 +00:00
|
|
|
|
# Name of the autoscaling group that is scaled up/down for the Jepsen nodes;
# passed as AutoScalingGroupName to the autoscaling client below.
JEPSEN_GROUP_NAME = "jepsen_group"

# Desired group capacity requested for each tested program (see main()).
KEEPER_DESIRED_INSTANCE_COUNT = 3
SERVER_DESIRED_INSTANCE_COUNT = 4

# Docker image and CI check name used when the "keeper" program is tested.
KEEPER_IMAGE_NAME = "clickhouse/keeper-jepsen-test"
KEEPER_CHECK_NAME = CI.JobNames.JEPSEN_KEEPER

# Docker image and CI check name used when the "server" program is tested.
SERVER_IMAGE_NAME = "clickhouse/server-jepsen-test"
SERVER_CHECK_NAME = CI.JobNames.JEPSEN_SERVER

# Substrings that _parse_jepsen_output() looks for in the Jepsen log; a line
# containing one of them selects the status assigned to the test lines that
# follow it.
SUCCESSFUL_TESTS_ANCHOR = "# Successful tests"
# NOTE(review): the name is spelled INTERMINATE while the text says
# "Indeterminate"; the name is kept because other code refers to it.
INTERMINATE_TESTS_ANCHOR = "# Indeterminate tests"
CRASHED_TESTS_ANCHOR = "# Crashed tests"
FAILED_TESTS_ANCHOR = "# Failed tests"
|
|
|
|
|
|
2022-03-22 16:39:58 +00:00
|
|
|
|
|
2023-09-22 11:16:46 +00:00
|
|
|
|
def _parse_jepsen_output(path: Path) -> TestResults:
    """Collect per-test results from a Jepsen summary log.

    The file at ``path`` is read line by line as UTF-8. A line containing one
    of the section anchors switches the status given to the entries that
    follow ("OK", "ERROR" or "FAIL"). A line starting with "store/clickhouse"
    or "clickhouse" is recorded, stripped, as a test with the current status.
    Such lines met before any anchor are skipped.

    Returns:
        The list of TestResult objects in the order they appear in the log.
    """
    # Checked in this order; the first anchor found in a line wins.
    section_statuses = (
        (SUCCESSFUL_TESTS_ANCHOR, "OK"),
        (INTERMINATE_TESTS_ANCHOR, "ERROR"),
        (CRASHED_TESTS_ANCHOR, "ERROR"),
        (FAILED_TESTS_ANCHOR, "FAIL"),
    )
    parsed = []  # type: TestResults
    status = ""
    with open(path, "r", encoding="utf-8") as log:
        for raw in log:
            for anchor, anchor_status in section_statuses:
                if anchor in raw:
                    status = anchor_status
                    break

            if not status:
                # No section header seen yet, nothing can be attributed.
                continue
            if raw.startswith(("store/clickhouse", "clickhouse")):
                parsed.append(TestResult(raw.strip(), status))

    return parsed
|
|
|
|
|
|
2022-03-22 16:39:58 +00:00
|
|
|
|
|
2021-12-21 12:07:10 +00:00
|
|
|
|
def get_autoscaling_group_instances_ids(asg_client, group_name):
    """Return the IDs of the group's instances that are ready for use.

    Args:
        asg_client: client object exposing ``describe_auto_scaling_groups``.
        group_name: name of the autoscaling group to inspect.

    Returns:
        A list with the "InstanceId" of every instance of the first group in
        the response whose "LifecycleState" is "InService" and whose
        "HealthStatus" is "Healthy".
    """
    response = asg_client.describe_auto_scaling_groups(
        AutoScalingGroupNames=[group_name]
    )
    group = response["AutoScalingGroups"][0]
    return [
        member["InstanceId"]
        for member in group["Instances"]
        if member["LifecycleState"] == "InService"
        and member["HealthStatus"] == "Healthy"
    ]
|
|
|
|
|
|
2022-03-22 16:39:58 +00:00
|
|
|
|
|
2021-12-21 12:07:10 +00:00
|
|
|
|
def get_instances_addresses(ec2_client, instance_ids):
    """Return the private IP addresses of the given EC2 instances.

    Args:
        ec2_client: client object exposing ``describe_instances``.
        instance_ids: list of instance IDs to look up.

    Returns:
        A list with the "PrivateIpAddress" of every instance in every
        reservation of the response, in response order. An empty list is
        returned, without calling the API, when ``instance_ids`` is empty.
    """
    if not instance_ids:
        # An empty InstanceIds list is sent as "no filter", which would make
        # EC2 describe every visible instance instead of none.
        return []

    response = ec2_client.describe_instances(InstanceIds=instance_ids)
    return [
        instance["PrivateIpAddress"]
        for reservation in response["Reservations"]
        for instance in reservation["Instances"]
    ]
|
|
|
|
|
|
|
|
|
|
|
2022-11-04 14:12:30 +00:00
|
|
|
|
def prepare_autoscaling_group_and_get_hostnames(count):
    """Scale the Jepsen autoscaling group to ``count`` and return node addresses.

    Sets the desired capacity of JEPSEN_GROUP_NAME (region us-east-1) to
    ``count`` and polls every 5 seconds until at least ``count`` instances
    are reported by get_autoscaling_group_instances_ids().

    Args:
        count: number of instances to request and wait for.

    Returns:
        The result of get_instances_addresses() for the ready instances.

    Raises:
        RuntimeError: if the group is still too small after 31 re-polls.
    """
    asg_client = boto3.client("autoscaling", region_name="us-east-1")
    asg_client.set_desired_capacity(
        AutoScalingGroupName=JEPSEN_GROUP_NAME, DesiredCapacity=count
    )

    instances = get_autoscaling_group_instances_ids(asg_client, JEPSEN_GROUP_NAME)
    polls = 0
    while len(instances) < count:
        # The limit is checked only while the group is still too small, so a
        # final poll that succeeds is not reported as a timeout.
        if polls > 30:
            raise RuntimeError("Cannot wait autoscaling group")
        time.sleep(5)
        instances = get_autoscaling_group_instances_ids(asg_client, JEPSEN_GROUP_NAME)
        polls += 1

    ec2_client = boto3.client("ec2", region_name="us-east-1")
    return get_instances_addresses(ec2_client, instances)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clear_autoscaling_group():
    """Scale the Jepsen autoscaling group down to zero and wait for it to empty.

    Sets the desired capacity of JEPSEN_GROUP_NAME (region us-east-1) to 0
    and polls every 5 seconds until get_autoscaling_group_instances_ids()
    reports no instances.

    Raises:
        RuntimeError: if instances are still reported after 31 re-polls.
    """
    asg_client = boto3.client("autoscaling", region_name="us-east-1")
    asg_client.set_desired_capacity(
        AutoScalingGroupName=JEPSEN_GROUP_NAME, DesiredCapacity=0
    )

    instances = get_autoscaling_group_instances_ids(asg_client, JEPSEN_GROUP_NAME)
    polls = 0
    while instances:
        # The limit is checked only while instances remain, so a final poll
        # that finds the group empty is not reported as a timeout.
        if polls > 30:
            raise RuntimeError("Cannot wait autoscaling group")
        time.sleep(5)
        instances = get_autoscaling_group_instances_ids(asg_client, JEPSEN_GROUP_NAME)
        polls += 1
|
2021-12-21 12:07:10 +00:00
|
|
|
|
|
|
|
|
|
|
2023-09-22 11:16:46 +00:00
|
|
|
|
def save_nodes_to_file(instances: List[Any], temp_path: Path) -> Path:
    """Write the node list to ``nodes.txt`` inside ``temp_path``.

    The entries of ``instances`` are joined with newlines (no trailing
    newline) and written as UTF-8, replacing any existing file.

    Returns:
        The path of the written file.
    """
    target = temp_path / "nodes.txt"
    with target.open("w", encoding="utf-8") as out:
        # Leaving the with-block closes the file, which flushes it.
        out.write("\n".join(instances))
    return target
|
|
|
|
|
|
2022-03-22 16:39:58 +00:00
|
|
|
|
|
|
|
|
|
def get_run_command(
    ssh_auth_sock,
    ssh_sock_dir,
    pr_info,
    nodes_path,
    repo_path,
    build_url,
    result_path,
    extra_args,
    docker_image,
):
    """Build the ``docker run`` command line that starts the Jepsen test.

    Args:
        ssh_auth_sock: value exported to the container as SSH_AUTH_SOCK.
        ssh_sock_dir: directory mounted into the container at the same path.
        pr_info: object whose ``number`` and ``sha`` attributes are exported
            as PR_TO_TEST and SHA_TO_TEST.
        nodes_path: host file mounted as /nodes.txt.
        repo_path: host directory mounted as /ch.
        build_url: value exported as CLICKHOUSE_PACKAGE.
        result_path: host directory mounted as /test_output.
        extra_args: extra text inserted verbatim before the image name.
        docker_image: image to run.

    Returns:
        The command as a single string.
    """
    # Every piece is separated by exactly one space, so an empty
    # ``extra_args`` leaves two spaces before the image name.
    pieces = [
        "docker run",
        "--network=host",
        f"-v '{ssh_sock_dir}:{ssh_sock_dir}'",
        f"-e SSH_AUTH_SOCK={ssh_auth_sock}",
        f"-e PR_TO_TEST={pr_info.number}",
        f"-e SHA_TO_TEST={pr_info.sha}",
        f"-v '{nodes_path}:/nodes.txt'",
        f"-v {result_path}:/test_output",
        f"-e 'CLICKHOUSE_PACKAGE={build_url}'",
        f"-v '{repo_path}:/ch'",
        "-e 'CLICKHOUSE_REPO_PATH=/ch'",
        "-e NODES_USERNAME=ubuntu",
        f"{extra_args}",
        f"{docker_image}",
    ]
    return " ".join(pieces)
|
|
|
|
|
|
2021-12-21 12:07:10 +00:00
|
|
|
|
|
2023-09-22 11:16:46 +00:00
|
|
|
|
def main():
    """Run the Jepsen check for Keeper or Server and dump a job report.

    The single positional argument ``program`` selects "keeper" or "server";
    any other value logs a warning and exits with code 0. The run is also
    skipped (exit code 0) for a PR with a non-zero number that does not carry
    the "jepsen-test" label.

    Otherwise the function scales up the autoscaling group, writes the node
    list, finds the build URL ending in "clickhouse", runs the Jepsen docker
    image with an SSH key loaded, parses the Jepsen log and dumps a JobReport.
    The autoscaling group is scaled back to zero even when one of these steps
    raises.

    Raises:
        AssertionError: if no build URL ending in "clickhouse" is found.
    """
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(
        prog="Jepsen Check",
        description="Check that uses Jepsen. Both Keeper and Server can be tested.",
    )
    parser.add_argument(
        "program", help='What should be tested. Valid values "keeper", "server"'
    )
    args = parser.parse_args()

    if args.program not in ("server", "keeper"):
        logging.warning("Invalid argument '%s'", args.program)
        sys.exit(0)

    stopwatch = Stopwatch()
    # Also covers TEMP_PATH itself, so no separate os.makedirs is needed.
    temp_path = Path(TEMP_PATH)
    temp_path.mkdir(parents=True, exist_ok=True)

    pr_info = PRInfo()

    logging.info(
        "Start at PR number %s, commit sha %s labels %s",
        pr_info.number,
        pr_info.sha,
        pr_info.labels,
    )

    if pr_info.number != 0 and "jepsen-test" not in pr_info.labels:
        logging.info("Not jepsen test label in labels list, skipping")
        sys.exit(0)

    check_name = KEEPER_CHECK_NAME if args.program == "keeper" else SERVER_CHECK_NAME

    result_path = temp_path / "result_path"
    result_path.mkdir(parents=True, exist_ok=True)

    # Everything from the scale-up onwards is guarded so that the instances
    # are released even if a later step fails.
    try:
        instances = prepare_autoscaling_group_and_get_hostnames(
            KEEPER_DESIRED_INSTANCE_COUNT
            if args.program == "keeper"
            else SERVER_DESIRED_INSTANCE_COUNT
        )
        # Only the first KEEPER_DESIRED_INSTANCE_COUNT hosts go to the nodes
        # file; for "server" the last host is passed separately as KEEPER_NODE.
        nodes_path = save_nodes_to_file(
            instances[:KEEPER_DESIRED_INSTANCE_COUNT], temp_path
        )

        # always use latest
        docker_image = (
            KEEPER_IMAGE_NAME if args.program == "keeper" else SERVER_IMAGE_NAME
        )

        # binary_release assumed to be always ready on the master as it's part of the merge queue workflow
        build_name = CI.get_required_build_name(check_name)
        urls = read_build_urls(build_name, REPORT_PATH)
        build_url = None
        for url in urls:
            # The last matching URL wins.
            if url.endswith("clickhouse"):
                build_url = url
        assert build_url, "No build url found in the report"

        extra_args = ""
        if args.program == "server":
            extra_args = f"-e KEEPER_NODE={instances[-1]}"

        with SSHKey(key_value=get_parameter_from_ssm("jepsen_ssh_key") + "\n"):
            # SSH_AUTH_SOCK is read inside the SSHKey context and its
            # directory is mounted into the container.
            ssh_auth_sock = os.environ["SSH_AUTH_SOCK"]
            auth_sock_dir = os.path.dirname(ssh_auth_sock)
            cmd = get_run_command(
                ssh_auth_sock,
                auth_sock_dir,
                pr_info,
                nodes_path,
                REPO_COPY,
                build_url,
                result_path,
                extra_args,
                docker_image,
            )
            logging.info("Going to run jepsen: %s", cmd)

            run_log_path = temp_path / "run.log"

            with TeePopen(cmd, run_log_path) as process:
                retcode = process.wait()
                if retcode == 0:
                    logging.info("Run successfully")
                else:
                    logging.info("Run failed")

        status = SUCCESS
        description = "No invalid analysis found ヽ(‘ー`)ノ"
        jepsen_log_path = result_path / "jepsen_run_all_tests.log"
        additional_data = []
        try:
            test_result = _parse_jepsen_output(jepsen_log_path)
            if any(r.status == "FAIL" for r in test_result):
                status = FAILURE
                description = "Found invalid analysis (ノಥ益ಥ)ノ ┻━┻"

            compress_fast(result_path / "store", result_path / "jepsen_store.tar.zst")
            additional_data.append(result_path / "jepsen_store.tar.zst")
        except Exception as ex:
            # Any failure while reading the log or archiving the store is
            # reported as a single failed pseudo-test.
            print("Exception", ex)
            status = FAILURE
            description = "No Jepsen output log"
            test_result = [TestResult("No Jepsen output log", "FAIL")]

        JobReport(
            description=description,
            test_results=test_result,
            status=status,
            start_time=stopwatch.start_time_str,
            duration=stopwatch.duration_seconds,
            additional_files=[run_log_path] + additional_data,
            check_name=check_name,
        ).dump()
    finally:
        clear_autoscaling_group()
|
2023-09-22 11:16:46 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|