ClickHouse/tests/ci/github_helper.py

#!/usr/bin/env python
"""Helper for GitHub API requests"""
import logging
from datetime import date, datetime, timedelta
from pathlib import Path
from os import path as p
from time import sleep
from typing import List, Optional, Tuple

import github

# explicit reimport
# pylint: disable=useless-import-alias
from github.GithubException import (
    RateLimitExceededException as RateLimitExceededException,
)
from github.Issue import Issue as Issue
from github.NamedUser import NamedUser as NamedUser
from github.PullRequest import PullRequest as PullRequest
from github.Repository import Repository as Repository

# pylint: enable=useless-import-alias

# Module-level logger named after this module
logger = logging.getLogger(__name__)

# Default location of the on-disk cache: the `gh_cache` directory next to
# this file (the file path is resolved with `realpath` first)
CACHE_PATH = p.join(
    p.dirname(p.realpath(__file__)),
    "gh_cache",
)

# Shorthand aliases for the list types returned by the helpers below
Issues = List[Issue]
PullRequests = List[PullRequest]


class GitHub(github.Github):
    """`github.Github` client with rate-limit retries and an on-disk cache.

    On top of the parent class it adds:

    - `search_issues`, which retries on `RateLimitExceededException` and splits
      a date range in two when the search returns exactly 1000 results;
    - `get_pull_cached` / `get_user_cached`, which store fetched objects in
      files under `cache_path` and reuse them while they are considered fresh.
    """

    def __init__(self, *args, create_cache_dir=True, **kwargs):
        """Create the client.

        Args:
            *args: passed to `github.Github.__init__` unchanged.
            create_cache_dir: when true, the default cache directory
                (`CACHE_PATH`) is created immediately if it doesn't exist.
            **kwargs: passed to `github.Github.__init__` unchanged.
        """
        # Define the backing attribute first, so the `cache_path` property works
        self._cache_path = Path(CACHE_PATH)
        if create_cache_dir:
            # Re-assign through the property to apply the setter logic
            # (directory check / creation)
            self.cache_path = self.cache_path
        super().__init__(*args, **kwargs)
        # 0 means "not configured", the `retries` property maps it to a default
        self._retries = 0

    # pylint: disable=signature-differs
    def search_issues(self, *args, **kwargs) -> Issues:  # type: ignore
        """Wrapper around search method with throttling and splitting by date.

        The qualifiers `closed`, `created`, `merged` and `updated` may be given
        either as a single `date`/`datetime`/`str`, or as a pair of
        `date`/`datetime` objects, which is sent as the range `start..end`.

        When the search reports exactly 1000 results (the hard limit), the
        first range that can be split is divided into two non-overlapping
        halves, and each half is searched recursively. We split only by the
        first splittable argument.

        Returns:
            The list of all found issues.

        Raises:
            RateLimitExceededException: if the rate limit is still exceeded
                after `retries` attempts.
        """
        split_arg = None  # type: Optional[str]
        halves = None
        for arg, value in kwargs.items():
            if arg not in ("closed", "created", "merged", "updated"):
                continue
            if hasattr(value, "__iter__") and not isinstance(value, str):
                assert len(value) == 2
                # Every boundary must be a date, not just one of them
                assert all(isinstance(v, (date, datetime)) for v in value)
                # Only the value of an existing key is replaced, so it's safe
                # to do while iterating over the dict
                kwargs[arg] = f"{value[0].isoformat()}..{value[1].isoformat()}"
                if halves is None:
                    # We split only by the first met splittable argument
                    halves = self._split_range(value[0], value[1])
                    if halves is not None:
                        split_arg = arg
                continue
            assert isinstance(value, (date, datetime, str))

        # Zero-argument super() doesn't work inside the nested function
        parent_search = super().search_issues

        def fetch():
            """Return the found issues, or None if the range must be split"""
            logger.debug("Search issues, args=%s, kwargs=%s", args, kwargs)
            result = parent_search(*args, **kwargs)
            if halves is not None and result.totalCount == 1000:
                return None
            # Iterating the result may itself hit the rate limit. The list is
            # built from scratch on each attempt, so a retry can't leave
            # duplicated partial results behind
            return list(result)

        found = self._retry_on_rate_limit(fetch)
        if found is not None:
            return found

        # The hard limit is 1000. If it's splittable, then we make
        # two subrequests with less time frames
        logger.debug(
            "The search result contain exactly 1000 results, "
            "splitting %s=%s by middle point %s",
            split_arg,
            kwargs[split_arg],
            halves[0][1],
        )
        inter_result = []  # type: Issues
        for sub_range in halves:
            # Subrequests get their own kwargs and run outside of the retry
            # loop; each of them does its own retries
            sub_kwargs = dict(kwargs)
            sub_kwargs[split_arg] = list(sub_range)
            inter_result.extend(self.search_issues(*args, **sub_kwargs))
        return inter_result

    # pylint: enable=signature-differs
    @staticmethod
    def _split_range(start, end):
        """Split the inclusive range `start..end` into two halves.

        Both boundaries are either `date` or `datetime` objects. The halves
        don't overlap: the second one starts one step after the end of the
        first one. The step is one day for dates and one second for datetimes.
        2022-01-01..2022-01-03 is split to 2022-01-01..2022-01-02 and
        2022-01-03..2022-01-03; 2022-01-01..2022-01-02 isn't split.

        Returns:
            `((start, middle), (middle + step, end))`, or None when the range
            is too short to be split.
        """
        middle = start + (end - start) / 2
        # datetime is a subclass of date, so it must be checked first:
        # `isinstance(middle, date)` is true for both
        if isinstance(middle, datetime):
            # The one-second step relies on the GitHub timestamps having
            # one-second resolution, so the fraction of a second is dropped
            middle = middle.replace(microsecond=0)
            step = timedelta(seconds=1)
        else:
            step = timedelta(days=1)
        if not start < middle or middle + step > end:
            return None
        return (start, middle), (middle + step, end)

    def _retry_on_rate_limit(self, func):  # type: ignore
        """Call `func()` up to `retries` times (at least once).

        After a `RateLimitExceededException` we wait in `sleep_on_rate_limit`
        and try again. The exception from the last attempt is propagated to
        the caller without sleeping.
        """
        attempts = max(self.retries, 1)
        for _ in range(attempts - 1):
            try:
                return func()
            except RateLimitExceededException:
                self.sleep_on_rate_limit()
        # The last attempt: let the exception reach the caller
        return func()

    def get_pulls_from_search(self, *args, **kwargs) -> PullRequests:  # type: ignore
        """The search api returns actually issues, so we need to fetch PullRequests

        The arguments are the same as for `search_issues`. Every found issue is
        converted to a PullRequest by `get_pull_cached`."""
        issues = self.search_issues(*args, **kwargs)
        # Repository objects by their URL, to get `issue.repository` only once
        # per repository
        repos = {}
        prs = []  # type: PullRequests
        for issue in issues:
            # See https://github.com/PyGithub/PyGithub/issues/2202,
            # obj._rawData doesn't spend additional API requests
            # pylint: disable=protected-access
            repo_url = issue._rawData["repository_url"]  # type: ignore
            if repo_url not in repos:
                repos[repo_url] = issue.repository
            prs.append(
                self.get_pull_cached(repos[repo_url], issue.number, issue.updated_at)
            )
        return prs

    def sleep_on_rate_limit(self):
        """Sleep until the reset of the first found exhausted rate limit.

        Only the first limit with `remaining == 0` is processed. If nothing is
        exhausted, or the reset time has already passed, it returns
        immediately."""
        for limit, data in self.get_rate_limit().raw_data.items():
            if data["remaining"] == 0:
                # One additional second to be sure that the limit is reset
                sleep_time = data["reset"] - int(datetime.now().timestamp()) + 1
                if sleep_time > 0:
                    logger.warning(
                        "Faced rate limit for '%s' requests type, sleeping %s",
                        limit,
                        sleep_time,
                    )
                    sleep(sleep_time)
                return

    def get_pull_cached(
        self, repo: Repository, number: int, obj_updated_at: Optional[datetime] = None
    ) -> PullRequest:
        """Get the pull request from the cache, or from the API if it's stale.

        Args:
            repo: the repository to get the PR from when the cache can't be used.
            number: the PR number.
            obj_updated_at: the known time of the last PR update, if any; see
                `_is_cache_updated` for the logic.

        Raises:
            RateLimitExceededException: if the rate limit is still exceeded
                after `retries` attempts.
        """
        # NOTE(review): the file name doesn't contain the repository, so the
        # same PR number from different repositories shares one cache file
        cache_file = self.cache_path / f"pr-{number}.pickle"

        if cache_file.is_file():
            is_updated, cached_pr = self._is_cache_updated(cache_file, obj_updated_at)
            if is_updated:
                logger.debug("Getting PR #%s from cache", number)
                return cached_pr  # type: ignore
        logger.debug("Getting PR #%s from API", number)
        pr = self._retry_on_rate_limit(lambda: repo.get_pull(number))
        logger.debug("Caching PR #%s from API in %s", number, cache_file)
        with open(cache_file, "wb") as prfd:
            self.dump(pr, prfd)  # type: ignore
        return pr

    def get_user_cached(
        self, login: str, obj_updated_at: Optional[datetime] = None
    ) -> NamedUser:
        """Get the user from the cache, or from the API if it's stale.

        Args:
            login: the user's login.
            obj_updated_at: the known time of the last user update, if any; see
                `_is_cache_updated` for the logic.

        Raises:
            RateLimitExceededException: if the rate limit is still exceeded
                after `retries` attempts.
        """
        cache_file = self.cache_path / f"user-{login}.pickle"

        if cache_file.is_file():
            is_updated, cached_user = self._is_cache_updated(cache_file, obj_updated_at)
            if is_updated:
                logger.debug("Getting user %s from cache", login)
                return cached_user  # type: ignore
        logger.debug("Getting user %s from API", login)
        user = self._retry_on_rate_limit(lambda: self.get_user(login))
        logger.debug("Caching user %s from API in %s", login, cache_file)
        with open(cache_file, "wb") as userfd:
            self.dump(user, userfd)  # type: ignore
        return user

    def _get_cached(self, path: Path):  # type: ignore
        """Load an object from the cache file `path`"""
        # NOTE(review): `load` is inherited from the parent class; presumably
        # it unpickles the file, so the cache directory must be trusted
        with open(path, "rb") as ob_fd:
            return self.load(ob_fd)  # type: ignore

    def _is_cache_updated(
        self, cache_file: Path, obj_updated_at: Optional[datetime]
    ) -> Tuple[bool, object]:
        """Check if the cached object is fresh enough to be used.

        Args:
            cache_file: an existing cache file.
            obj_updated_at: the known time of the last object update, or None.

        Returns:
            A tuple of the flag "the cache is up to date" and the object loaded
            from the cache. The object is returned in both cases.
        """
        cached_obj = self._get_cached(cache_file)
        # We don't want the cache_updated being always old,
        # for example in cases when the user is not updated for ages
        # NOTE(review): fromtimestamp() and now() produce naive local time,
        # confirm that `updated_at` values use the same convention
        cache_updated = max(
            datetime.fromtimestamp(cache_file.stat().st_mtime), cached_obj.updated_at
        )
        if obj_updated_at is None:
            # When we don't know about the object is updated or not,
            # we update it once per hour
            obj_updated_at = datetime.now() - timedelta(hours=1)
        return obj_updated_at <= cache_updated, cached_obj

    @property
    def cache_path(self) -> Path:
        """The directory with the cache files"""
        return self._cache_path

    @cache_path.setter
    def cache_path(self, value: str) -> None:
        # Setting the path creates the directory with its parents when it's
        # missing; an existing path must be a directory
        self._cache_path = Path(value)
        if self._cache_path.exists():
            assert self._cache_path.is_dir()
        else:
            # exist_ok covers the case when another process creates
            # the directory between the check and mkdir
            self._cache_path.mkdir(parents=True, exist_ok=True)

    @property
    def retries(self):
        """The number of attempts for the API requests, 3 if it wasn't set"""
        if self._retries == 0:
            self._retries = 3
        return self._retries

    @retries.setter
    def retries(self, value: int) -> None:
        assert isinstance(value, int)
        self._retries = value
Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00			`#!/usr/bin/env python`
			`"""Helper for GitHub API requests"""`
			`import logging`
			`from datetime import date, datetime, timedelta`
			`from pathlib import Path`
			`from os import path as p`
			`from time import sleep`
Improve caching logic, add cached NamedUser 2022-07-21 13:45:38 +00:00			`from typing import List, Optional, Tuple`
Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00
			`import github`
Fix cherry-pick.py typing issues 2022-11-15 13:46:48 +00:00
			`# explicit reimport`
			`# pylint: disable=useless-import-alias`
			`from github.GithubException import (`
			`RateLimitExceededException as RateLimitExceededException,`
			`)`
			`from github.Issue import Issue as Issue`
			`from github.NamedUser import NamedUser as NamedUser`
			`from github.PullRequest import PullRequest as PullRequest`
			`from github.Repository import Repository as Repository`

			`# pylint: enable=useless-import-alias`
Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00
			`CACHE_PATH = p.join(p.dirname(p.realpath(__file__)), "gh_cache")`

			`logger = logging.getLogger(__name__)`

			`PullRequests = List[PullRequest]`
			`Issues = List[Issue]`


			`class GitHub(github.Github):`
Create GitHub cache directory optionally by default 2022-12-09 20:33:06 +00:00			`def __init__(self, args, create_cache_dir=True, *kwargs):`
Fix the CACHE_PATH creation for default value 2022-12-09 13:00:53 +00:00			`# Define meta attribute and apply setter logic`
Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00			`self._cache_path = Path(CACHE_PATH)`
Create GitHub cache directory optionally by default 2022-12-09 20:33:06 +00:00			`if create_cache_dir:`
			`self.cache_path = self.cache_path`
Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00			`# And set Path`
			`super().__init__(args, *kwargs)`
Fix found points during the review 2022-07-17 12:05:21 +00:00			`self._retries = 0`
Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00
			`# pylint: disable=signature-differs`
			`def search_issues(self, args, *kwargs) -> Issues: # type: ignore`
			`"""Wrapper around search method with throttling and splitting by date.`

			`We split only by the first"""`
			`splittable = False`
			`for arg, value in kwargs.items():`
			`if arg in ["closed", "created", "merged", "updated"]:`
Fix found points during the review 2022-07-17 12:05:21 +00:00			`if hasattr(value, "__iter__") and not isinstance(value, str):`
Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00			`assert [True for v in value if isinstance(v, (date, datetime))]`
			`assert len(value) == 2`
			`kwargs[arg] = f"{value[0].isoformat()}..{value[1].isoformat()}"`
Fix found points during the review 2022-07-17 12:05:21 +00:00			`if not splittable:`
			`# We split only by the first met splittable argument`
			`preserved_arg = arg`
			`preserved_value = value`
			`middle_value = value[0] + (value[1] - value[0]) / 2`
			`splittable = middle_value not in value`
Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00			`continue`
			`assert isinstance(value, (date, datetime, str))`

			`inter_result = [] # type: Issues`
Fix found points during the review 2022-07-17 12:05:21 +00:00			`for i in range(self.retries):`
Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00			`try:`
Fix found points during the review 2022-07-17 12:05:21 +00:00			`logger.debug("Search issues, args=%s, kwargs=%s", args, kwargs)`
Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00			`result = super().search_issues(args, *kwargs)`
			`if result.totalCount == 1000 and splittable:`
			`# The hard limit is 1000. If it's splittable, then we make`
			`# two subrequests requests with less time frames`
			`logger.debug(`
			`"The search result contain exactly 1000 results, "`
			`"splitting %s=%s by middle point %s",`
			`preserved_arg,`
			`kwargs[preserved_arg],`
			`middle_value,`
			`)`
			`kwargs[preserved_arg] = [preserved_value[0], middle_value]`
			`inter_result.extend(self.search_issues(args, *kwargs))`
			`if isinstance(middle_value, date):`
			`# When middle_value is a date, 2022-01-01..2022-01-03`
			`# is split to 2022-01-01..2022-01-02 and`
			`# 2022-01-02..2022-01-03, so we have results for`
			`# 2022-01-02 twicely. We split it to`
			`# 2022-01-01..2022-01-02 and 2022-01-03..2022-01-03.`
			`# 2022-01-01..2022-01-02 aren't split, see splittable`
			`middle_value += timedelta(days=1)`
			`kwargs[preserved_arg] = [middle_value, preserved_value[1]]`
			`inter_result.extend(self.search_issues(args, *kwargs))`
			`return inter_result`

			`inter_result.extend(result)`
			`return inter_result`
			`except RateLimitExceededException as e:`
Fix found points during the review 2022-07-17 12:05:21 +00:00			`if i == self.retries - 1:`
Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00			`exception = e`
			`self.sleep_on_rate_limit()`

			`raise exception`

			`# pylint: enable=signature-differs`
Add and fix typing for docker_pull_helper, github_helper and style_check 2022-11-14 19:02:33 +00:00			`def get_pulls_from_search(self, args, *kwargs) -> PullRequests: # type: ignore`
Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00			`"""The search api returns actually issues, so we need to fetch PullRequests"""`
			`issues = self.search_issues(args, *kwargs)`
			`repos = {}`
			`prs = [] # type: PullRequests`
			`for issue in issues:`
			`# See https://github.com/PyGithub/PyGithub/issues/2202,`
			`# obj._rawData doesn't spend additional API requests`
			`# pylint: disable=protected-access`
			`repo_url = issue._rawData["repository_url"] # type: ignore`
			`if repo_url not in repos:`
			`repos[repo_url] = issue.repository`
			`prs.append(`
			`self.get_pull_cached(repos[repo_url], issue.number, issue.updated_at)`
			`)`
			`return prs`

			`def sleep_on_rate_limit(self):`
			`for limit, data in self.get_rate_limit().raw_data.items():`
			`if data["remaining"] == 0:`
			`sleep_time = data["reset"] - int(datetime.now().timestamp()) + 1`
			`if sleep_time > 0:`
			`logger.warning(`
			`"Faced rate limit for '%s' requests type, sleeping %s",`
			`limit,`
			`sleep_time,`
			`)`
			`sleep(sleep_time)`
			`return`

			`def get_pull_cached(`
Improve caching logic, add cached NamedUser 2022-07-21 13:45:38 +00:00			`self, repo: Repository, number: int, obj_updated_at: Optional[datetime] = None`
Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00			`) -> PullRequest:`
Improve caching logic, add cached NamedUser 2022-07-21 13:45:38 +00:00			`cache_file = self.cache_path / f"pr-{number}.pickle"`
Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00
Improve caching logic, add cached NamedUser 2022-07-21 13:45:38 +00:00			`if cache_file.is_file():`
			`is_updated, cached_pr = self._is_cache_updated(cache_file, obj_updated_at)`
			`if is_updated:`
Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00			`logger.debug("Getting PR #%s from cache", number)`
Improve caching logic, add cached NamedUser 2022-07-21 13:45:38 +00:00			`return cached_pr # type: ignore`
Improve debug logging of github_helper 2022-07-18 08:32:45 +00:00			`logger.debug("Getting PR #%s from API", number)`
Fix found points during the review 2022-07-17 12:05:21 +00:00			`for i in range(self.retries):`
Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00			`try:`
			`pr = repo.get_pull(number)`
			`break`
			`except RateLimitExceededException:`
Fix found points during the review 2022-07-17 12:05:21 +00:00			`if i == self.retries - 1:`
Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00			`raise`
			`self.sleep_on_rate_limit()`
Improve caching logic, add cached NamedUser 2022-07-21 13:45:38 +00:00			`logger.debug("Caching PR #%s from API in %s", number, cache_file)`
			`with open(cache_file, "wb") as prfd:`
Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00			`self.dump(pr, prfd) # type: ignore`
			`return pr`

Improve caching logic, add cached NamedUser 2022-07-21 13:45:38 +00:00			`def get_user_cached(`
			`self, login: str, obj_updated_at: Optional[datetime] = None`
			`) -> NamedUser:`
			`cache_file = self.cache_path / f"user-{login}.pickle"`

			`if cache_file.is_file():`
			`is_updated, cached_user = self._is_cache_updated(cache_file, obj_updated_at)`
			`if is_updated:`
			`logger.debug("Getting user %s from cache", login)`
			`return cached_user # type: ignore`
			`logger.debug("Getting PR #%s from API", login)`
			`for i in range(self.retries):`
			`try:`
			`user = self.get_user(login)`
			`break`
			`except RateLimitExceededException:`
			`if i == self.retries - 1:`
			`raise`
			`self.sleep_on_rate_limit()`
			`logger.debug("Caching user %s from API in %s", login, cache_file)`
			`with open(cache_file, "wb") as prfd:`
			`self.dump(user, prfd) # type: ignore`
			`return user`

Add and fix typing for docker_pull_helper, github_helper and style_check 2022-11-14 19:02:33 +00:00			`def _get_cached(self, path: Path): # type: ignore`
Improve caching logic, add cached NamedUser 2022-07-21 13:45:38 +00:00			`with open(path, "rb") as ob_fd:`
			`return self.load(ob_fd) # type: ignore`

			`def _is_cache_updated(`
			`self, cache_file: Path, obj_updated_at: Optional[datetime]`
			`) -> Tuple[bool, object]:`
			`cached_obj = self._get_cached(cache_file)`
Add notes about _is_cache_updated logic 2022-08-02 16:44:49 +00:00			`# We don't want the cache_updated being always old,`
			`# for example in cases when the user is not updated for ages`
			`cache_updated = max(`
			`datetime.fromtimestamp(cache_file.stat().st_mtime), cached_obj.updated_at`
			`)`
Improve caching logic, add cached NamedUser 2022-07-21 13:45:38 +00:00			`if obj_updated_at is None:`
Add notes about _is_cache_updated logic 2022-08-02 16:44:49 +00:00			`# When we don't know about the object is updated or not,`
			`# we update it once per hour`
Improve caching logic, add cached NamedUser 2022-07-21 13:45:38 +00:00			`obj_updated_at = datetime.now() - timedelta(hours=1)`
			`if obj_updated_at <= cache_updated:`
			`return True, cached_obj`
			`return False, cached_obj`

Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00			`@property`
Add and fix typing for docker_pull_helper, github_helper and style_check 2022-11-14 19:02:33 +00:00			`def cache_path(self) -> Path:`
Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00			`return self._cache_path`

			`@cache_path.setter`
Add and fix typing for docker_pull_helper, github_helper and style_check 2022-11-14 19:02:33 +00:00			`def cache_path(self, value: str) -> None:`
Rewrite cherry_pick.py to PyGithub API 2022-07-14 18:57:03 +00:00			`self._cache_path = Path(value)`
			`if self._cache_path.exists():`
			`assert self._cache_path.is_dir()`
			`else:`
			`self._cache_path.mkdir(parents=True)`
Fix found points during the review 2022-07-17 12:05:21 +00:00
			`@property`
			`def retries(self):`
			`if self._retries == 0:`
			`self._retries = 3`
			`return self._retries`

			`@retries.setter`
Add and fix typing for docker_pull_helper, github_helper and style_check 2022-11-14 19:02:33 +00:00			`def retries(self, value: int) -> None:`
			`assert isinstance(value, int)`
Fix found points during the review 2022-07-17 12:05:21 +00:00			`self._retries = value`