ClickHouse/tests/ci/build_download_helper.py

#!/usr/bin/env python3

import json
import logging
import os
import sys
import time
from pathlib import Path
from typing import Any, Callable, List, Union

import requests  # type: ignore

import get_robot_token as grt  # we need an updated ROBOT_TOKEN
from ci_config import CI_CONFIG

DOWNLOAD_RETRIES_COUNT = 5


class DownloadException(Exception):
    """Raised when build artifacts cannot be located or downloaded."""


def get_with_retries(
    url: str,
    retries: int = DOWNLOAD_RETRIES_COUNT,
    sleep: int = 3,
    **kwargs: Any,
) -> requests.Response:
    """
    GET `url`, repeating the request up to `retries` times.

    Extra keyword arguments are forwarded to `requests.get`. A response whose
    `raise_for_status()` fails counts as a failed attempt. Between attempts the
    function pauses for `sleep` seconds. When every attempt fails, the exception
    of the last attempt is raised; when `retries` allows no attempt at all, a
    placeholder `Exception` is raised instead.
    """
    logging.info(
        "Getting URL with %i tries and sleep %i in between: %s", retries, sleep, url
    )
    last_error = Exception("A placeholder to satisfy typing and avoid nesting")
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, **kwargs)
            response.raise_for_status()
        except Exception as err:
            last_error = err
            # No point in sleeping after the final attempt
            if attempt < retries:
                logging.info("Exception '%s' while getting, retry %i", err, attempt)
                time.sleep(sleep)
        else:
            return response

    raise last_error


def get_gh_api(
    url: str,
    retries: int = DOWNLOAD_RETRIES_COUNT,
    sleep: int = 3,
    **kwargs: Any,
) -> requests.Response:
    """
    Request the GH API without auth by default and fail over to the token from
    get_best_robot_token after a "403 rate limit exceeded" or "404 not found"
    response.

    The auth header is set up front when grt.ROBOT_TOKEN is already populated.
    An Authorization header supplied by the caller is never overwritten.
    After the failover the attempt counter starts over, so the authenticated
    request gets the full number of `retries`. Other failures are retried after
    a pause of `sleep` seconds, and the last exception is raised once the
    attempts are exhausted.
    """

    def set_auth_header():
        # Keep a caller-provided Authorization header, add the robot one otherwise
        headers = kwargs.setdefault("headers", {})
        if "Authorization" not in headers:
            headers["Authorization"] = f"Bearer {grt.get_best_robot_token()}"

    if grt.ROBOT_TOKEN is not None:
        set_auth_header()

    token_is_set = "Authorization" in kwargs.get("headers", {})
    last_error = Exception("A placeholder to satisfy typing and avoid nesting")
    attempt = 0
    while attempt < retries:
        attempt += 1
        try:
            response = requests.get(url, **kwargs)
            response.raise_for_status()
        except requests.HTTPError as http_error:
            last_error = http_error
            status = http_error.response.status_code
            ratelimit_exceeded = (
                status == 403
                and b"rate limit exceeded"
                in http_error.response._content  # pylint:disable=protected-access
            )
            try_auth = status == 404
            if not token_is_set and (ratelimit_exceeded or try_auth):
                logging.warning(
                    "Received rate limit exception, setting the auth header and retry"
                )
                set_auth_header()
                token_is_set = True
                # Start counting again: the authenticated request gets all retries
                attempt = 0
                continue
        except Exception as error:
            last_error = error
        else:
            return response

        if attempt < retries:
            logging.info(
                "Exception '%s' while getting, retry %i", last_error, attempt
            )
            time.sleep(sleep)

    raise last_error


def get_build_name_for_check(check_name: str) -> str:
    """Return the `required_build` that CI_CONFIG lists for the check `check_name`."""
    test_config = CI_CONFIG.test_configs[check_name]
    return test_config.required_build


def read_build_urls(build_name: str, reports_path: Union[Path, str]) -> List[str]:
    """
    Find the build report for `build_name` under `reports_path` and return its URLs.

    The directory tree is walked recursively; the first file whose name ends
    with `_{build_name}.json` is parsed as JSON and its "build_urls" value is
    returned. An empty list is returned when no such file is found.
    """
    report_suffix = f"_{build_name}.json"
    for directory, _, filenames in os.walk(reports_path):
        for filename in filenames:
            if not filename.endswith(report_suffix):
                continue
            logging.info("Found build report json %s", filename)
            report_path = os.path.join(directory, filename)
            with open(report_path, "r", encoding="utf-8") as report_file:
                report = json.load(report_file)
            return report["build_urls"]  # type: ignore
    return []


def download_build_with_progress(url: str, path: Path) -> None:
    """
    Download `url` into the file `path`, drawing a progress bar on a TTY.

    The download is skipped when `path` is already a file whose size equals the
    Content-Length reported by the server. Without a Content-Length the body is
    written in one piece; otherwise it is streamed in 4096-byte chunks. A failed
    attempt removes `path` and is repeated after 3 seconds, up to
    DOWNLOAD_RETRIES_COUNT attempts in total.

    Raises:
        DownloadException: when the last attempt fails as well.
    """
    logging.info("Downloading from %s to temp path %s", url, path)
    bar_width = 50
    for attempt in range(1, DOWNLOAD_RETRIES_COUNT + 1):
        try:
            # A single try here: the retries are handled by the enclosing loop
            response = get_with_retries(url, retries=1, stream=True)
            total_length = int(response.headers.get("content-length", 0))
            if path.is_file() and total_length and path.stat().st_size == total_length:
                logging.info(
                    "The file %s already exists and have a proper size %s",
                    path,
                    total_length,
                )
                return

            with open(path, "wb") as out:
                if total_length:
                    received = 0

                    logging.info("Content length is %ld bytes", total_length)
                    for chunk in response.iter_content(chunk_size=4096):
                        received += len(chunk)
                        out.write(chunk)
                        if not sys.stdout.isatty():
                            continue
                        filled = int(bar_width * received / total_length)
                        percent = int(100 * float(received) / total_length)
                        progress = "=" * filled + " " * (bar_width - filled)
                        sys.stdout.write(f"\r[{progress}] {percent}%")
                        sys.stdout.flush()
                else:
                    logging.info(
                        "No content-length, will download file without progress"
                    )
                    out.write(response.content)
            break
        except Exception as e:
            # Finish the progress bar line before anything else is printed
            if sys.stdout.isatty():
                sys.stdout.write("\n")
            # Do not leave a partially written file behind
            if os.path.exists(path):
                os.remove(path)

            if attempt >= DOWNLOAD_RETRIES_COUNT:
                raise DownloadException(
                    f"Cannot download dataset from {url}, all retries exceeded"
                ) from e
            time.sleep(3)

    if sys.stdout.isatty():
        sys.stdout.write("\n")
    logging.info("Downloading finished")


def download_builds(
    result_path: str, build_urls: List[str], filter_fn: Callable[[str], bool]
) -> None:
    """
    Download every URL from `build_urls` accepted by `filter_fn` into `result_path`.

    The local file name is the last path component of the URL with the
    "%2B" and "%20" escapes replaced by "+" and " " respectively.
    """
    for url in build_urls:
        if not filter_fn(url):
            continue
        decoded_url = url.replace("%2B", "+").replace("%20", " ")
        fname = os.path.basename(decoded_url)
        logging.info("Will download %s to %s", fname, result_path)
        download_build_with_progress(url, Path(result_path) / fname)


def download_builds_filter(
    check_name, reports_path, result_path, filter_fn=lambda _: True
):
    """
    Download the builds required by the check `check_name` into `result_path`.

    The build name is resolved from the check name, the build URLs are read
    from the report found under `reports_path` and printed, and the ones
    accepted by `filter_fn` (all of them by default) are downloaded.

    Raises:
        DownloadException: when no build URLs are found for the check.
    """
    urls = read_build_urls(get_build_name_for_check(check_name), reports_path)
    print(urls)

    if urls:
        download_builds(result_path, urls, filter_fn)
        return

    raise DownloadException("No build URLs found")


def download_all_deb_packages(check_name, reports_path, result_path):
    """Download the builds of the check whose URL ends with "deb"."""

    def is_deb(url):
        return url.endswith("deb")

    download_builds_filter(check_name, reports_path, result_path, is_deb)


def download_unit_tests(check_name, reports_path, result_path):
    """Download the builds of the check whose URL ends with "unit_tests_dbms"."""

    def is_unit_tests(url):
        return url.endswith("unit_tests_dbms")

    download_builds_filter(check_name, reports_path, result_path, is_unit_tests)


def download_clickhouse_binary(check_name, reports_path, result_path):
    """Download the builds of the check whose URL ends with "clickhouse"."""

    def is_clickhouse_binary(url):
        return url.endswith("clickhouse")

    download_builds_filter(
        check_name, reports_path, result_path, is_clickhouse_binary
    )


def download_performance_build(check_name, reports_path, result_path):
    """Download the builds of the check whose URL ends with "performance.tar.zst"."""

    def is_performance_archive(url):
        return url.endswith("performance.tar.zst")

    download_builds_filter(
        check_name, reports_path, result_path, is_performance_archive
    )


def download_fuzzers(check_name, reports_path, result_path):
    """
    Download the fuzzer-related builds of the check.

    A URL is accepted when it ends with "_fuzzer", ".dict", ".options" or
    "_seed_corpus.zip".
    """
    fuzzer_suffixes = ("_fuzzer", ".dict", ".options", "_seed_corpus.zip")

    def is_fuzzer_file(url):
        return url.endswith(fuzzer_suffixes)

    download_builds_filter(check_name, reports_path, result_path, is_fuzzer_file)
Get rid of build number 2021-11-12 11:07:54 +00:00			`#!/usr/bin/env python3`

			`import json`
			`import logging`
Add typing and order import 2022-09-07 17:20:22 +00:00			`import os`
Get rid of build number 2021-11-12 11:07:54 +00:00			`import sys`
			`import time`
Use Path in download helper, do not redownload exist files 2023-01-27 15:10:10 +00:00			`from pathlib import Path`
Migrate S3Helper to pathlib.Path 2023-08-29 14:35:53 +00:00			`from typing import Any, Callable, List, Union`
Get rid of build number 2021-11-12 11:07:54 +00:00
Apply black formatter to build_download_helper.py 2022-01-26 11:10:20 +00:00			`import requests # type: ignore`
Get rid of build number 2021-11-12 11:07:54 +00:00
Fix the global ROBOT_TOKEN, do not retry unnecessary 2023-05-02 10:37:05 +00:00			`import get_robot_token as grt # we need an updated ROBOT_TOKEN`
Get rid of build numbers and simplify builds paths in S3 2021-11-26 10:57:36 +00:00			`from ci_config import CI_CONFIG`
Get rid of build number 2021-11-12 11:07:54 +00:00
			`DOWNLOAD_RETRIES_COUNT = 5`

Apply black formatter to build_download_helper.py 2022-01-26 11:10:20 +00:00
Upload cargo cache for rust to S3 and reuse 2023-08-10 17:12:09 +00:00			`class DownloadException(Exception):`
			`pass`


Add get_with_retries helper to build_download_helper.py 2022-01-26 12:20:08 +00:00			`def get_with_retries(`
			`url: str,`
			`retries: int = DOWNLOAD_RETRIES_COUNT,`
			`sleep: int = 3,`
Fix run_check.py and dependencies 2022-11-10 16:11:23 +00:00			`**kwargs: Any,`
Add get_with_retries helper to build_download_helper.py 2022-01-26 12:20:08 +00:00			`) -> requests.Response:`
Fix hardcoded retries for get_with_retries 2022-06-30 09:58:24 +00:00			`logging.info(`
			`"Getting URL with %i tries and sleep %i in between: %s", retries, sleep, url`
			`)`
Add a fallback to authenticated requests to GH API 2023-04-28 16:26:50 +00:00			`exc = Exception("A placeholder to satisfy typing and avoid nesting")`
Fix hardcoded retries for get_with_retries 2022-06-30 09:58:24 +00:00			`for i in range(retries):`
Add get_with_retries helper to build_download_helper.py 2022-01-26 12:20:08 +00:00			`try:`
			`response = requests.get(url, **kwargs)`
			`response.raise_for_status()`
Add a fallback to authenticated requests to GH API 2023-04-28 16:26:50 +00:00			`return response`
Add get_with_retries helper to build_download_helper.py 2022-01-26 12:20:08 +00:00			`except Exception as e:`
Fix hardcoded retries for get_with_retries 2022-06-30 09:58:24 +00:00			`if i + 1 < retries:`
Add get_with_retries helper to build_download_helper.py 2022-01-26 12:20:08 +00:00			`logging.info("Exception '%s' while getting, retry %i", e, i + 1)`
			`time.sleep(sleep)`

			`exc = e`

Add a fallback to authenticated requests to GH API 2023-04-28 16:26:50 +00:00			`raise exc`


			`def get_gh_api(`
			`url: str,`
			`retries: int = DOWNLOAD_RETRIES_COUNT,`
			`sleep: int = 3,`
			`**kwargs: Any,`
			`) -> requests.Response:`
Revert "Revert "Integration check script fix ups"" This reverts commit 67b9407530bb15f7e6d49cd1c2bde7b6c441389b. 2023-10-17 08:18:17 +00:00			`"""`
			`Request GH api w/o auth by default, and failover to the get_best_robot_token in case of receiving`
			`"403 rate limit exceeded" or "404 not found" error`
fix docs 2023-10-11 17:25:36 +00:00			`It sets auth automatically when ROBOT_TOKEN is already set by get_best_robot_token`
Add a fallback to authenticated requests to GH API 2023-04-28 16:26:50 +00:00			`"""`

			`def set_auth_header():`
			`if "headers" in kwargs:`
			`if "Authorization" not in kwargs["headers"]:`
Fix the global ROBOT_TOKEN, do not retry unnecessary 2023-05-02 10:37:05 +00:00			`kwargs["headers"][`
			`"Authorization"`
			`] = f"Bearer {grt.get_best_robot_token()}"`
Add a fallback to authenticated requests to GH API 2023-04-28 16:26:50 +00:00			`else:`
Fix the global ROBOT_TOKEN, do not retry unnecessary 2023-05-02 10:37:05 +00:00			`kwargs["headers"] = {`
			`"Authorization": f"Bearer {grt.get_best_robot_token()}"`
			`}`
Add a fallback to authenticated requests to GH API 2023-04-28 16:26:50 +00:00
Fix the global ROBOT_TOKEN, do not retry unnecessary 2023-05-02 10:37:05 +00:00			`if grt.ROBOT_TOKEN is not None:`
Add a fallback to authenticated requests to GH API 2023-04-28 16:26:50 +00:00			`set_auth_header()`

Revert "Revert "Integration check script fix ups"" This reverts commit 67b9407530bb15f7e6d49cd1c2bde7b6c441389b. 2023-10-17 08:18:17 +00:00			`token_is_set = "Authorization" in kwargs.get("headers", {})`
			`exc = Exception("A placeholder to satisfy typing and avoid nesting")`
			`try_cnt = 0`
			`while try_cnt < retries:`
			`try_cnt += 1`
Add a fallback to authenticated requests to GH API 2023-04-28 16:26:50 +00:00			`try:`
Revert "Revert "Integration check script fix ups"" This reverts commit 67b9407530bb15f7e6d49cd1c2bde7b6c441389b. 2023-10-17 08:18:17 +00:00			`response = requests.get(url, **kwargs)`
Add a fallback to authenticated requests to GH API 2023-04-28 16:26:50 +00:00			`response.raise_for_status()`
			`return response`
Revert "Revert "Integration check script fix ups"" This reverts commit 67b9407530bb15f7e6d49cd1c2bde7b6c441389b. 2023-10-17 08:18:17 +00:00			`except requests.HTTPError as e:`
			`exc = e`
			`ratelimit_exceeded = (`
			`e.response.status_code == 403`
fix docs and script 2023-10-12 17:32:11 +00:00			`and b"rate limit exceeded"`
Revert "Revert "Integration check script fix ups"" This reverts commit 67b9407530bb15f7e6d49cd1c2bde7b6c441389b. 2023-10-17 08:18:17 +00:00			`in e.response._content # pylint:disable=protected-access`
			`)`
			`try_auth = e.response.status_code == 404`
			`if (ratelimit_exceeded or try_auth) and not token_is_set:`
Add a fallback to authenticated requests to GH API 2023-04-28 16:26:50 +00:00			`logging.warning(`
fix docs and script 2023-10-12 17:32:11 +00:00			`"Received rate limit exception, setting the auth header and retry"`
Add a fallback to authenticated requests to GH API 2023-04-28 16:26:50 +00:00			`)`
			`set_auth_header()`
Revert "Revert "Integration check script fix ups"" This reverts commit 67b9407530bb15f7e6d49cd1c2bde7b6c441389b. 2023-10-17 08:18:17 +00:00			`token_is_set = True`
			`try_cnt = 0`
			`continue`
			`except Exception as e:`
			`exc = e`
gh api request func update 2023-10-11 16:00:48 +00:00
Revert "Revert "Integration check script fix ups"" This reverts commit 67b9407530bb15f7e6d49cd1c2bde7b6c441389b. 2023-10-17 08:18:17 +00:00			`if try_cnt < retries:`
			`logging.info("Exception '%s' while getting, retry %i", exc, try_cnt)`
			`time.sleep(sleep)`

			`raise exc`
Add get_with_retries helper to build_download_helper.py 2022-01-26 12:20:08 +00:00

Fix run_check.py and dependencies 2022-11-10 16:11:23 +00:00			`def get_build_name_for_check(check_name: str) -> str:`
Refactor CI_CONFIG from dict to dataclasses 2023-08-02 16:27:14 +00:00			`return CI_CONFIG.test_configs[check_name].required_build`
Apply black formatter to build_download_helper.py 2022-01-26 11:10:20 +00:00
Get rid of build number 2021-11-12 11:07:54 +00:00
Migrate S3Helper to pathlib.Path 2023-08-29 14:35:53 +00:00			`def read_build_urls(build_name: str, reports_path: Union[Path, str]) -> List[str]:`
Get rid of build number 2021-11-12 11:07:54 +00:00			`for root, _, files in os.walk(reports_path):`
Better 2024-02-08 23:46:12 +00:00			`for file in files:`
			`if file.endswith(f"_{build_name}.json"):`
			`logging.info("Found build report json %s", file)`
			`with open(os.path.join(root, file), "r", encoding="utf-8") as file_handler:`
Get rid of build number 2021-11-12 11:07:54 +00:00			`build_report = json.load(file_handler)`
Fix run_check.py and dependencies 2022-11-10 16:11:23 +00:00			`return build_report["build_urls"] # type: ignore`
Get rid of build number 2021-11-12 11:07:54 +00:00			`return []`

Apply black formatter to build_download_helper.py 2022-01-26 11:10:20 +00:00
Use Path in download helper, do not redownload exist files 2023-01-27 15:10:10 +00:00			`def download_build_with_progress(url: str, path: Path) -> None:`
Get rid of build number 2021-11-12 11:07:54 +00:00			`logging.info("Downloading from %s to temp path %s", url, path)`
			`for i in range(DOWNLOAD_RETRIES_COUNT):`
			`try:`
Use Path in download helper, do not redownload exist files 2023-01-27 15:10:10 +00:00			`response = get_with_retries(url, retries=1, stream=True)`
			`total_length = int(response.headers.get("content-length", 0))`
			`if path.is_file() and total_length and path.stat().st_size == total_length:`
			`logging.info(`
			`"The file %s already exists and have a proper size %s",`
			`path,`
			`total_length,`
			`)`
			`return`

Apply black formatter to build_download_helper.py 2022-01-26 11:10:20 +00:00			`with open(path, "wb") as f:`
Use Path in download helper, do not redownload exist files 2023-01-27 15:10:10 +00:00			`if total_length == 0:`
Apply black formatter to build_download_helper.py 2022-01-26 11:10:20 +00:00			`logging.info(`
			`"No content-length, will download file without progress"`
			`)`
Get rid of build number 2021-11-12 11:07:54 +00:00			`f.write(response.content)`
			`else:`
			`dl = 0`
Use Path in download helper, do not redownload exist files 2023-01-27 15:10:10 +00:00
Get rid of build number 2021-11-12 11:07:54 +00:00			`logging.info("Content length is %ld bytes", total_length)`
			`for data in response.iter_content(chunk_size=4096):`
			`dl += len(data)`
			`f.write(data)`
			`if sys.stdout.isatty():`
			`done = int(50 * dl / total_length)`
			`percent = int(100 * float(dl) / total_length)`
Apply black formatter to build_download_helper.py 2022-01-26 11:10:20 +00:00			`eq_str = "=" * done`
			`space_str = " " * (50 - done)`
Get rid of build number 2021-11-12 11:07:54 +00:00			`sys.stdout.write(f"\r[{eq_str}{space_str}] {percent}%")`
			`sys.stdout.flush()`
			`break`
Fix DownloadException 2023-09-27 16:24:56 +00:00			`except Exception as e:`
Add get_with_retries helper to build_download_helper.py 2022-01-26 12:20:08 +00:00			`if sys.stdout.isatty():`
			`sys.stdout.write("\n")`
Get rid of build number 2021-11-12 11:07:54 +00:00			`if os.path.exists(path):`
			`os.remove(path)`
Fix DownloadException 2023-09-27 16:24:56 +00:00
			`if i + 1 < DOWNLOAD_RETRIES_COUNT:`
			`time.sleep(3)`
			`else:`
			`raise DownloadException(`
			`f"Cannot download dataset from {url}, all retries exceeded"`
			`) from e`
Get rid of build number 2021-11-12 11:07:54 +00:00
Add get_with_retries helper to build_download_helper.py 2022-01-26 12:20:08 +00:00			`if sys.stdout.isatty():`
			`sys.stdout.write("\n")`
Get rid of build number 2021-11-12 11:07:54 +00:00			`logging.info("Downloading finished")`


Use Path in download helper, do not redownload exist files 2023-01-27 15:10:10 +00:00			`def download_builds(`
			`result_path: str, build_urls: List[str], filter_fn: Callable[[str], bool]`
			`) -> None:`
Get rid of build number 2021-11-12 11:07:54 +00:00			`for url in build_urls:`
			`if filter_fn(url):`
Apply black formatter to build_download_helper.py 2022-01-26 11:10:20 +00:00			`fname = os.path.basename(url.replace("%2B", "+").replace("%20", " "))`
Get rid of build number 2021-11-12 11:07:54 +00:00			`logging.info("Will download %s to %s", fname, result_path)`
Use Path in download helper, do not redownload exist files 2023-01-27 15:10:10 +00:00			`download_build_with_progress(url, Path(result_path) / fname)`
Get rid of build number 2021-11-12 11:07:54 +00:00
Apply black formatter to build_download_helper.py 2022-01-26 11:10:20 +00:00
			`def download_builds_filter(`
			`check_name, reports_path, result_path, filter_fn=lambda _: True`
			`):`
Get rid of build numbers and simplify builds paths in S3 2021-11-26 10:57:36 +00:00			`build_name = get_build_name_for_check(check_name)`
Rename get_build_urls to read_build_urls 2022-09-07 13:06:44 +00:00			`urls = read_build_urls(build_name, reports_path)`
Debugging broken checks 2021-11-12 19:57:26 +00:00			`print(urls)`
Get rid of build number 2021-11-12 11:07:54 +00:00
			`if not urls:`
Upload cargo cache for rust to S3 and reuse 2023-08-10 17:12:09 +00:00			`raise DownloadException("No build URLs found")`
Get rid of build number 2021-11-12 11:07:54 +00:00
			`download_builds(result_path, urls, filter_fn)`

Apply black formatter to build_download_helper.py 2022-01-26 11:10:20 +00:00
Get rid of build number 2021-11-12 11:07:54 +00:00			`def download_all_deb_packages(check_name, reports_path, result_path):`
Apply black formatter to build_download_helper.py 2022-01-26 11:10:20 +00:00			`download_builds_filter(`
			`check_name, reports_path, result_path, lambda x: x.endswith("deb")`
			`)`

Get rid of build number 2021-11-12 11:07:54 +00:00
			`def download_unit_tests(check_name, reports_path, result_path):`
Apply black formatter to build_download_helper.py 2022-01-26 11:10:20 +00:00			`download_builds_filter(`
			`check_name, reports_path, result_path, lambda x: x.endswith("unit_tests_dbms")`
			`)`

Get rid of build number 2021-11-12 11:07:54 +00:00
			`def download_clickhouse_binary(check_name, reports_path, result_path):`
Apply black formatter to build_download_helper.py 2022-01-26 11:10:20 +00:00			`download_builds_filter(`
			`check_name, reports_path, result_path, lambda x: x.endswith("clickhouse")`
			`)`

Add ramdrive 2021-12-09 09:04:05 +00:00
			`def download_performance_build(check_name, reports_path, result_path):`
Apply black formatter to build_download_helper.py 2022-01-26 11:10:20 +00:00			`download_builds_filter(`
Automatic style fix 2023-01-09 01:08:38 +00:00			`check_name,`
			`reports_path,`
			`result_path,`
			`lambda x: x.endswith("performance.tar.zst"),`
Apply black formatter to build_download_helper.py 2022-01-26 11:10:20 +00:00			`)`
add libFuzzer tests, initial integration 2023-09-10 17:07:49 +00:00

			`def download_fuzzers(check_name, reports_path, result_path):`
			`download_builds_filter(`
add infrastructure files to the download filter 2023-09-11 22:45:50 +00:00			`check_name,`
			`reports_path,`
			`result_path,`
			`lambda x: x.endswith(("_fuzzer", ".dict", ".options", "_seed_corpus.zip")),`
add libFuzzer tests, initial integration 2023-09-10 17:07:49 +00:00			`)`