ClickHouse/tests/ci/libfuzzer_test_check.py

#!/usr/bin/env python3

import argparse
import logging
import os
import re
import sys
import zipfile
from pathlib import Path
from typing import List

from botocore.exceptions import ClientError

from build_download_helper import download_fuzzers
from clickhouse_helper import CiLogsCredentials
from docker_images_helper import DockerImage, get_docker_image, pull_image
from env_helper import REPO_COPY, REPORT_PATH, S3_BUILDS_BUCKET, TEMP_PATH
from pr_info import PRInfo
from report import FAILURE, SUCCESS, JobReport, TestResult
from s3_helper import S3Helper
from stopwatch import Stopwatch
from tee_popen import TeePopen

# Passed to the fuzzer container as the TIMEOUT environment variable (see main()).
# NOTE(review): presumably seconds (5 minutes) -- confirm against tests/fuzz/runner.py.
TIMEOUT = 60 * 5
# NOTE(review): not referenced anywhere in the visible part of this file.
NO_CHANGES_MSG = "Nothing to run"
# Module-level S3 helper shared by download_corpus() and upload_corpus();
# it is instantiated at import time.
s3 = S3Helper()


def zipdir(path, ziph):
    """Add every file found under ``path`` to an open zip archive.

    Archive member names are computed relative to the parent of ``path``,
    so the last component of ``path`` is kept as the top-level folder
    inside the archive.

    Args:
        path: Directory to walk recursively.
        ziph: Writable ``zipfile.ZipFile`` handle that receives the files.
    """
    archive_base = os.path.join(path, "..")
    for current_dir, _, filenames in os.walk(path):
        for filename in filenames:
            source = os.path.join(current_dir, filename)
            ziph.write(source, os.path.relpath(source, archive_base))


def get_additional_envs(check_name, run_by_hash_num, run_by_hash_total):
    """Build the list of ``NAME=value`` environment settings for a check.

    Each known marker substring found in ``check_name`` contributes its
    environment entries, in a fixed order.  When ``run_by_hash_total`` is
    non-zero, the ``RUN_BY_HASH_NUM`` / ``RUN_BY_HASH_TOTAL`` pair is
    appended at the end.

    Args:
        check_name: Name of the CI check being run.
        run_by_hash_num: Value exported as ``RUN_BY_HASH_NUM``.
        run_by_hash_total: Value exported as ``RUN_BY_HASH_TOTAL``; zero
            means the pair is omitted.

    Returns:
        A list of ``NAME=value`` strings.
    """
    # Order matters: it defines the order of the entries in the result.
    envs_by_marker = (
        ("DatabaseReplicated", ("USE_DATABASE_REPLICATED=1",)),
        ("DatabaseOrdinary", ("USE_DATABASE_ORDINARY=1",)),
        ("wide parts enabled", ("USE_POLYMORPHIC_PARTS=1",)),
        ("ParallelReplicas", ("USE_PARALLEL_REPLICAS=1",)),
        (
            "s3 storage",
            ("USE_S3_STORAGE_FOR_MERGE_TREE=1", "RANDOMIZE_OBJECT_KEY_TYPE=1"),
        ),
        ("analyzer", ("USE_OLD_ANALYZER=1",)),
    )

    envs = []
    for marker, entries in envs_by_marker:
        if marker in check_name:
            envs.extend(entries)

    if run_by_hash_total != 0:
        envs.extend(
            (
                f"RUN_BY_HASH_NUM={run_by_hash_num}",
                f"RUN_BY_HASH_TOTAL={run_by_hash_total}",
            )
        )

    return envs


def get_run_command(
    fuzzers_path: Path,
    repo_path: Path,
    result_path: Path,
    additional_envs: List[str],
    ci_logs_args: str,
    image: DockerImage,
) -> str:
    """Compose the ``docker run`` command line that launches the fuzz runner.

    Args:
        fuzzers_path: Host directory mounted at ``/fuzzers`` (also the workdir).
        repo_path: Repository root; its ``tests`` directory is mounted at
            ``/usr/share/clickhouse-test``.
        result_path: Host directory mounted at ``/test_output``.
        additional_envs: ``NAME=value`` strings, each passed with ``-e``.
        ci_logs_args: Extra docker arguments inserted right after ``docker run``.
        image: Docker image to run (formatted into the command as text).

    Returns:
        The full command as a single shell string.
    """
    runner_options = " ".join(["--hung-check", "--print-time"])
    additional_options_str = f'-e ADDITIONAL_OPTIONS="{runner_options}"'

    env_flags = [
        # a static link, don't use S3_URL or S3_DOWNLOAD
        '-e S3_URL="https://s3.amazonaws.com"',
    ]
    env_flags.extend(f"-e {env}" for env in additional_envs)
    env_str = " ".join(env_flags)

    # Run as the invoking user so that files written to the mounted
    # volumes carry the caller's uid/gid.
    owner = f"{os.getuid()}:{os.getgid()}"

    command_parts = [
        "docker run",
        f"{ci_logs_args}",
        f"--user {owner}",
        "--workdir=/fuzzers",
        f"--volume={fuzzers_path}:/fuzzers",
        f"--volume={repo_path}/tests:/usr/share/clickhouse-test",
        f"--volume={result_path}:/test_output",
        "--security-opt seccomp=unconfined",  # required to issue io_uring sys-calls
        "--cap-add=SYS_PTRACE",
        env_str,
        additional_options_str,
        f"{image}",
        "python3 /usr/share/clickhouse-test/fuzz/runner.py",
    ]
    return " ".join(command_parts)


def parse_args():
    """Parse the command line, which takes one positional ``check_name``.

    Returns:
        The ``argparse.Namespace`` with the ``check_name`` attribute set.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("check_name")
    parsed = cli.parse_args()
    return parsed


def download_corpus(path: str):
    """Fetch ``fuzzer/corpus.zip`` from the builds bucket and unpack it.

    The archive is downloaded into ``path``, extracted there, and then
    removed.  If the bucket has no corpus object (``NoSuchKey``), the
    function logs that fact and returns without touching the directory.

    Args:
        path: Directory that receives the downloaded and extracted corpus.

    Raises:
        ClientError: For any S3 error other than ``NoSuchKey``.
    """
    logging.info("Download corpus...")

    try:
        s3.download_file(
            bucket=S3_BUILDS_BUCKET,
            s3_path="fuzzer/corpus.zip",
            local_file_path=path,
        )
    except ClientError as e:
        if e.response["Error"]["Code"] != "NoSuchKey":
            raise
        # No corpus has been uploaded yet: there is no archive to unpack,
        # so stop here instead of failing on the missing corpus.zip below.
        logging.debug("No active corpus exists")
        return

    archive_path = f"{path}/corpus.zip"
    with zipfile.ZipFile(archive_path, "r") as zipf:
        zipf.extractall(path)
    os.remove(archive_path)

    # Counts every file under `path` after extraction.
    units = sum(len(files) for _, _, files in os.walk(path))

    logging.info("...downloaded %d units", units)


def upload_corpus(path: str):
    """Zip ``{path}/corpus/`` and upload it as ``fuzzer/corpus.zip``.

    The archive is written to ``{path}/corpus.zip`` (deflate-compressed)
    and then uploaded to the builds bucket.

    Args:
        path: Directory that contains the ``corpus`` folder.
    """
    archive_path = f"{path}/corpus.zip"
    corpus_dir = f"{path}/corpus/"

    with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as archive:
        zipdir(corpus_dir, archive)

    s3.upload_file(
        bucket=S3_BUILDS_BUCKET,
        file_path=archive_path,
        s3_path="fuzzer/corpus.zip",
    )


def process_error(path: Path) -> list:
    """Extract the error report from a fuzzer output file.

    Scans the file for the first line matching
    ``==<digits>==ERROR: <source>: <reason>`` and returns that line together
    with every line that follows it, with trailing newlines stripped.

    Args:
        path: Path of the output file to read (decoded as UTF-8).

    Returns:
        The error lines, or an empty list when no error line is present.
    """
    error_start = re.compile(r"^==\d+==\s?ERROR: (\S+): (.*)")

    with open(path, "r", encoding="utf-8") as log:
        stripped = (raw.rstrip("\n") for raw in log)
        for text in stripped:
            if error_start.search(text):
                # Everything after the first error line belongs to the report;
                # drain the rest of the file while it is still open.
                return [text, *stripped]

    return []


def read_status(status_path: Path):
    """Read a status file into a list of lines.

    Args:
        status_path: Path of the file to read (decoded as UTF-8).

    Returns:
        One string per line of the file, with trailing newlines removed.
    """
    with open(status_path, "r", encoding="utf-8") as status_file:
        return [entry.rstrip("\n") for entry in status_file]


def process_results(result_path: Path):
    """Collect per-fuzzer results from the ``*.status`` files in a directory.

    For each ``<fuzzer>.status`` file the first line is taken as the status
    and the third line as a float passed to ``TestResult``.  Sibling files
    ``<fuzzer>.unit``, ``<fuzzer>.out`` and ``<fuzzer>.stdout`` are attached
    when present:

    * ``OK``    -- nothing is attached;
    * ``ERROR`` -- the first existing of ``.out``, ``.stdout``;
    * anything else (counted as a failure) -- the error lines extracted from
      ``.out`` become the raw logs, and the first existing of ``.unit``,
      ``.out``, ``.stdout`` is attached.

    Args:
        result_path: Directory that holds the status and output files.

    Returns:
        ``[ok_count, error_count, fail_count, test_results]``.
    """
    ok_count = 0
    error_count = 0
    fail_count = 0
    collected = []

    for status_file in result_path.glob("*.status"):
        name = status_file.stem
        base = status_file.parent / name
        unit_file = base.with_suffix(".unit")
        out_file = base.with_suffix(".out")
        stdout_file = base.with_suffix(".stdout")

        status_lines = read_status(status_file)
        verdict = status_lines[0]
        entry = TestResult(name, verdict, float(status_lines[2]))

        # Candidate attachments in priority order; only the first one that
        # exists is attached.
        if verdict == "OK":
            ok_count += 1
            candidates = ()
        elif verdict == "ERROR":
            error_count += 1
            candidates = (out_file, stdout_file)
        else:
            fail_count += 1
            if out_file.exists():
                entry.set_raw_logs("\n".join(process_error(out_file)))
            candidates = (unit_file, out_file, stdout_file)

        attachment = next((item for item in candidates if item.exists()), None)
        if attachment is not None:
            entry.set_log_files(f"['{attachment}']")

        collected.append(entry)

    return [ok_count, error_count, fail_count, collected]


def main():
    """Run the libFuzzer check end to end.

    Steps, in order: prepare the temp directory, pull the
    ``clickhouse/libfuzzer`` image, download the corpus and the fuzzer
    binaries, unpack seed corpora, run the fuzz runner in docker, upload
    the corpus when running on master, then turn the ``*.status`` files
    into a ``JobReport``.

    Exits with status 1 when any fuzzer result is counted as an error or a
    failure.
    """
    logging.basicConfig(level=logging.INFO)

    stopwatch = Stopwatch()

    temp_path = Path(TEMP_PATH)
    reports_path = Path(REPORT_PATH)
    temp_path.mkdir(parents=True, exist_ok=True)
    repo_path = Path(REPO_COPY)

    args = parse_args()
    check_name = args.check_name

    pr_info = PRInfo()

    # NOTE(review): repeats the mkdir above; harmless because of exist_ok=True.
    temp_path.mkdir(parents=True, exist_ok=True)

    # Both values default to 0 when RUN_BY_HASH_NUM is not set; a total of 0
    # makes get_additional_envs() omit the RUN_BY_HASH_* pair.
    if "RUN_BY_HASH_NUM" in os.environ:
        run_by_hash_num = int(os.getenv("RUN_BY_HASH_NUM", "0"))
        run_by_hash_total = int(os.getenv("RUN_BY_HASH_TOTAL", "0"))
    else:
        run_by_hash_num = 0
        run_by_hash_total = 0

    docker_image = pull_image(get_docker_image("clickhouse/libfuzzer"))

    fuzzers_path = temp_path / "fuzzers"
    fuzzers_path.mkdir(parents=True, exist_ok=True)

    # The corpus and the fuzzers both land in the same directory, which is
    # later mounted into the container as /fuzzers.
    download_corpus(fuzzers_path)
    download_fuzzers(check_name, reports_path, fuzzers_path)

    for file in os.listdir(fuzzers_path):
        if file.endswith("_fuzzer"):
            # Make the fuzzer binary executable (and writable) for everyone.
            os.chmod(fuzzers_path / file, 0o777)
        elif file.endswith("_seed_corpus.zip"):
            # "<name>_seed_corpus.zip" is unpacked into the "<name>.in" directory.
            seed_corpus_path = fuzzers_path / (
                file.removesuffix("_seed_corpus.zip") + ".in"
            )
            with zipfile.ZipFile(fuzzers_path / file, "r") as zfd:
                zfd.extractall(seed_corpus_path)

    result_path = temp_path / "result_path"
    result_path.mkdir(parents=True, exist_ok=True)

    run_log_path = result_path / "run.log"

    additional_envs = get_additional_envs(
        check_name, run_by_hash_num, run_by_hash_total
    )

    additional_envs.append(f"TIMEOUT={TIMEOUT}")

    ci_logs_credentials = CiLogsCredentials(Path(temp_path) / "export-logs-config.sh")
    ci_logs_args = ci_logs_credentials.get_docker_arguments(
        pr_info, stopwatch.start_time_str, check_name
    )

    run_command = get_run_command(
        fuzzers_path,
        repo_path,
        result_path,
        additional_envs,
        ci_logs_args,
        docker_image,
    )
    logging.info("Going to run libFuzzer tests: %s", run_command)

    # NOTE(review): TeePopen presumably mirrors the process output into
    # run_log_path -- confirm in tee_popen.py.
    with TeePopen(run_command, run_log_path) as process:
        retcode = process.wait()
        if retcode == 0:
            logging.info("Run successfully")
            # The shared corpus is only updated from master runs that are
            # not tied to a pull request.
            if (
                pr_info.number == 0
                and pr_info.base_ref == "master"
                and pr_info.head_ref == "master"
            ):
                logging.info("Uploading corpus - running in master")
                upload_corpus(fuzzers_path)
            else:
                logging.info("Not uploading corpus - running in PR")
        else:
            logging.info("Run failed")

    # results is [oks, errors, fails, test_results].
    results = process_results(result_path)

    # NOTE(review): the job status depends only on the status files; a
    # non-zero container exit code that leaves no *.status files behind is
    # still reported as SUCCESS -- confirm this is intended.
    success = results[1] == 0 and results[2] == 0

    JobReport(
        description=f"OK: {results[0]}, ERROR: {results[1]}, FAIL: {results[2]}",
        test_results=results[3],
        status=SUCCESS if success else FAILURE,
        start_time=stopwatch.start_time_str,
        duration=stopwatch.duration_seconds,
        additional_files=[],
    ).dump()

    if not success:
        sys.exit(1)


# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()