Merge remote-tracking branch 'origin/master' into auto/v22.7.2.15-stable

Mikhail f. Shiryaev 2022-08-03 12:04:34 +02:00
commit c05526beef
5 changed files with 120 additions and 130 deletions


@@ -29,7 +29,7 @@ jobs:
fetch-depth: 0
- name: Generate versions
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_TOKEN: ${{ secrets.ROBOT_CLICKHOUSE_COMMIT_TOKEN }}
run: |
./utils/list-versions/list-versions.sh > ./utils/list-versions/version_date.tsv
GID=$(id -g "${UID}")


@@ -5,11 +5,12 @@ from datetime import date, datetime, timedelta
from pathlib import Path
from os import path as p
from time import sleep
from typing import List, Optional
from typing import List, Optional, Tuple
import github
from github.GithubException import RateLimitExceededException
from github.Issue import Issue
from github.NamedUser import NamedUser
from github.PullRequest import PullRequest
from github.Repository import Repository
@@ -120,21 +121,15 @@ class GitHub(github.Github):
return
def get_pull_cached(
self, repo: Repository, number: int, updated_at: Optional[datetime] = None
self, repo: Repository, number: int, obj_updated_at: Optional[datetime] = None
) -> PullRequest:
pr_cache_file = self.cache_path / f"{number}.pickle"
if updated_at is None:
updated_at = datetime.now() - timedelta(hours=-1)
cache_file = self.cache_path / f"pr-{number}.pickle"
def _get_pr(path: Path) -> PullRequest:
with open(path, "rb") as prfd:
return self.load(prfd) # type: ignore
if pr_cache_file.is_file():
cached_pr = _get_pr(pr_cache_file)
if updated_at <= cached_pr.updated_at:
if cache_file.is_file():
is_updated, cached_pr = self._is_cache_updated(cache_file, obj_updated_at)
if is_updated:
logger.debug("Getting PR #%s from cache", number)
return cached_pr
return cached_pr # type: ignore
logger.debug("Getting PR #%s from API", number)
for i in range(self.retries):
try:
@@ -144,11 +139,56 @@ class GitHub(github.Github):
if i == self.retries - 1:
raise
self.sleep_on_rate_limit()
logger.debug("Caching PR #%s from API in %s", number, pr_cache_file)
with open(pr_cache_file, "wb") as prfd:
logger.debug("Caching PR #%s from API in %s", number, cache_file)
with open(cache_file, "wb") as prfd:
self.dump(pr, prfd) # type: ignore
return pr
def get_user_cached(
self, login: str, obj_updated_at: Optional[datetime] = None
) -> NamedUser:
cache_file = self.cache_path / f"user-{login}.pickle"
if cache_file.is_file():
is_updated, cached_user = self._is_cache_updated(cache_file, obj_updated_at)
if is_updated:
logger.debug("Getting user %s from cache", login)
return cached_user # type: ignore
logger.debug("Getting PR #%s from API", login)
for i in range(self.retries):
try:
user = self.get_user(login)
break
except RateLimitExceededException:
if i == self.retries - 1:
raise
self.sleep_on_rate_limit()
logger.debug("Caching user %s from API in %s", login, cache_file)
with open(cache_file, "wb") as prfd:
self.dump(user, prfd) # type: ignore
return user
def _get_cached(self, path: Path):
with open(path, "rb") as ob_fd:
return self.load(ob_fd) # type: ignore
def _is_cache_updated(
self, cache_file: Path, obj_updated_at: Optional[datetime]
) -> Tuple[bool, object]:
cached_obj = self._get_cached(cache_file)
# We don't want cache_updated to always be old,
# for example when the user object has not been updated for ages
cache_updated = max(
datetime.fromtimestamp(cache_file.stat().st_mtime), cached_obj.updated_at
)
if obj_updated_at is None:
# When we don't know whether the object has been updated,
# we treat the cache as valid for one hour
obj_updated_at = datetime.now() - timedelta(hours=1)
if obj_updated_at <= cache_updated:
return True, cached_obj
return False, cached_obj
@property
def cache_path(self):
return self._cache_path
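
A minimal, self-contained sketch of the freshness check the new get_pull_cached/get_user_cached helpers rely on, assuming plain pickle for (de)serialization (the real helper goes through the class's own dump/load wrappers):

import pickle
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Tuple

def is_cache_updated(cache_file: Path, obj_updated_at: Optional[datetime]) -> Tuple[bool, object]:
    # Load the previously pickled GitHub object
    with open(cache_file, "rb") as fd:
        cached_obj = pickle.load(fd)
    # Take the newer of the file mtime and the object's own updated_at,
    # so an object that has not changed for ages does not look permanently stale
    cache_updated = max(
        datetime.fromtimestamp(cache_file.stat().st_mtime), cached_obj.updated_at
    )
    if obj_updated_at is None:
        # Upstream state unknown: treat the cache as fresh for one hour
        obj_updated_at = datetime.now() - timedelta(hours=1)
    return obj_updated_at <= cache_updated, cached_obj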


@@ -335,7 +335,7 @@ class Release:
yield
except (Exception, KeyboardInterrupt):
logging.warning("Rolling back checked out %s for %s", ref, orig_ref)
self.run(f"git reset --hard; git checkout {orig_ref}")
self.run(f"git reset --hard; git checkout -f {orig_ref}")
raise
else:
if with_checkout_back and need_rollback:
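
The added -f matters because git reset --hard only restores tracked files; forcing the checkout also throws away local changes and untracked files in the way, so a dirty working tree cannot block the rollback. A hedged sketch of the same pattern (run() and the ref handling are illustrative, not the Release class API):

import subprocess
from contextlib import contextmanager

def run(cmd: str) -> str:
    # Thin shell wrapper, similar in spirit to Release.run
    return subprocess.check_output(cmd, shell=True, text=True).strip()

@contextmanager
def checked_out(ref: str):
    orig_ref = run("git rev-parse --abbrev-ref HEAD")
    run(f"git checkout {ref}")
    try:
        yield
    except BaseException:
        # On any failure (including Ctrl+C), discard local changes and
        # force the checkout back to the original ref
        run(f"git reset --hard; git checkout -f {orig_ref}")
        raise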


@@ -6,20 +6,14 @@ import logging
import os.path as p
import os
import re
from datetime import date, datetime, timedelta
from queue import Empty, Queue
from datetime import date, timedelta
from subprocess import CalledProcessError, DEVNULL
from threading import Thread
from time import sleep
from typing import Dict, List, Optional, TextIO
from fuzzywuzzy.fuzz import ratio # type: ignore
from github import Github
from github_helper import GitHub, PullRequest, PullRequests, Repository
from github.GithubException import RateLimitExceededException, UnknownObjectException
from github.NamedUser import NamedUser
from github.Issue import Issue
from github.PullRequest import PullRequest
from github.Repository import Repository
from git_helper import is_shallow, git_runner as runner
# This array gives the preferred category order, and is also used to
@@ -39,7 +33,7 @@ categories_preferred_order = (
FROM_REF = ""
TO_REF = ""
SHA_IN_CHANGELOG = [] # type: List[str]
GitHub = Github()
gh = GitHub()
CACHE_PATH = p.join(p.dirname(p.realpath(__file__)), "gh_cache")
@@ -49,7 +43,7 @@ class Description:
):
self.number = number
self.html_url = html_url
self.user = user
self.user = gh.get_user_cached(user._rawData["login"]) # type: ignore
self.entry = entry
self.category = category
@@ -78,7 +72,7 @@ class Description:
user_name = self.user.login
break
except RateLimitExceededException:
sleep_on_rate_limit()
gh.sleep_on_rate_limit()
return (
f"* {entry} [#{self.number}]({self.html_url}) "
f"([{user_name}]({self.user.html_url}))."
@@ -94,85 +88,34 @@
return self.number < other.number
class Worker(Thread):
def __init__(self, request_queue: Queue, repo: Repository):
Thread.__init__(self)
self.queue = request_queue
self.repo = repo
self.response = [] # type: List[Description]
def run(self):
while not self.queue.empty():
try:
issue = self.queue.get() # type: Issue
except Empty:
break # possible race condition, just continue
api_pr = get_pull_cached(self.repo, issue.number, issue.updated_at)
in_changelog = False
merge_commit = api_pr.merge_commit_sha
try:
runner.run(f"git rev-parse '{merge_commit}'")
except CalledProcessError:
# It's possible that the commit is not in the repo, just continue
logging.info("PR %s does not belong to the repo", api_pr.number)
continue
in_changelog = merge_commit in SHA_IN_CHANGELOG
if in_changelog:
desc = generate_description(api_pr, self.repo)
if desc is not None:
self.response.append(desc)
self.queue.task_done()
def sleep_on_rate_limit(time: int = 20):
logging.warning("Faced rate limit, sleeping %s", time)
sleep(time)
def get_pull_cached(
repo: Repository, number: int, updated_at: Optional[datetime] = None
) -> PullRequest:
pr_cache_file = p.join(CACHE_PATH, f"{number}.pickle")
if updated_at is None:
updated_at = datetime.now() - timedelta(hours=-1)
if p.isfile(pr_cache_file):
cache_updated = datetime.fromtimestamp(p.getmtime(pr_cache_file))
if cache_updated > updated_at:
with open(pr_cache_file, "rb") as prfd:
return GitHub.load(prfd) # type: ignore
while True:
try:
pr = repo.get_pull(number)
break
except RateLimitExceededException:
sleep_on_rate_limit()
with open(pr_cache_file, "wb") as prfd:
GitHub.dump(pr, prfd) # type: ignore
return pr
def get_descriptions(
repo: Repository, issues: List[Issue], jobs: int
) -> Dict[str, List[Description]]:
workers = [] # type: List[Worker]
queue = Queue() # type: Queue[Issue]
for issue in issues:
queue.put(issue)
for _ in range(jobs):
worker = Worker(queue, repo)
worker.start()
workers.append(worker)
def get_descriptions(prs: PullRequests) -> Dict[str, List[Description]]:
descriptions = {} # type: Dict[str, List[Description]]
for worker in workers:
worker.join()
for desc in worker.response:
if desc.category not in descriptions:
descriptions[desc.category] = []
descriptions[desc.category].append(desc)
repos = {} # type: Dict[str, Repository]
for pr in prs:
# See https://github.com/PyGithub/PyGithub/issues/2202,
# obj._rawData doesn't spend additional API requests,
# so we'll save some requests
# pylint: disable=protected-access
repo_name = pr._rawData["base"]["repo"]["full_name"] # type: ignore
# pylint: enable=protected-access
if repo_name not in repos:
repos[repo_name] = pr.base.repo
in_changelog = False
merge_commit = pr.merge_commit_sha
try:
runner.run(f"git rev-parse '{merge_commit}'")
except CalledProcessError:
# It's possible that the commit is not in the repo, just continue
logging.info("PR %s does not belong to the repo", pr.number)
continue
in_changelog = merge_commit in SHA_IN_CHANGELOG
if in_changelog:
desc = generate_description(pr, repos[repo_name])
if desc is not None:
if desc.category not in descriptions:
descriptions[desc.category] = []
descriptions[desc.category].append(desc)
for descs in descriptions.values():
descs.sort()
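
The loop above leans on PyGithub's _rawData payload (the JSON already fetched with each PR) to resolve the repository name without extra API requests. A minimal sketch of that grouping step, with illustrative names rather than the exact helper:

from typing import Any, Dict, List

def group_prs_by_repo(prs: List[Any]) -> Dict[str, List[Any]]:
    grouped = {}  # type: Dict[str, List[Any]]
    for pr in prs:
        # _rawData is already in memory, so reading it costs no API call
        # pylint: disable=protected-access
        repo_name = pr._rawData["base"]["repo"]["full_name"]
        # pylint: enable=protected-access
        grouped.setdefault(repo_name, []).append(pr)
    return grouped
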
@@ -193,6 +136,11 @@ def parse_args() -> argparse.Namespace:
default=0,
help="set the script verbosity, could be used multiple",
)
parser.add_argument(
"--debug-helpers",
action="store_true",
help="add debug logging for git_helper and github_helper",
)
parser.add_argument(
"--output",
type=argparse.FileType("w"),
@@ -246,7 +194,7 @@ def generate_description(item: PullRequest, repo: Repository) -> Optional[Descri
branch_parts = item.head.ref.split("/")
if len(branch_parts) == 3:
try:
item = get_pull_cached(repo, int(branch_parts[-1]))
item = gh.get_pull_cached(repo, int(branch_parts[-1]))
except Exception as e:
logging.warning("unable to get backpoted PR, exception: %s", e)
else:
@@ -337,9 +285,13 @@ def generate_description(item: PullRequest, repo: Repository) -> Optional[Descri
def write_changelog(fd: TextIO, descriptions: Dict[str, List[Description]]):
year = date.today().year
to_commit = runner(f"git rev-parse {TO_REF}^{{}}")[:11]
from_commit = runner(f"git rev-parse {FROM_REF}^{{}}")[:11]
fd.write(
f"---\nsidebar_position: 1\nsidebar_label: {year}\n---\n\n# {year} Changelog\n\n"
f"### ClickHouse release {TO_REF} FIXME as compared to {FROM_REF}\n\n"
f"---\nsidebar_position: 1\nsidebar_label: {year}\n---\n\n"
f"# {year} Changelog\n\n"
f"### ClickHouse release {TO_REF} ({to_commit}) FIXME "
f"as compared to {FROM_REF} ({from_commit})\n\n"
)
seen_categories = [] # type: List[str]
@@ -391,12 +343,15 @@ def set_sha_in_changelog():
def main():
log_levels = [logging.CRITICAL, logging.WARN, logging.INFO, logging.DEBUG]
log_levels = [logging.WARN, logging.INFO, logging.DEBUG]
args = parse_args()
logging.basicConfig(
format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d]:\n%(message)s",
level=log_levels[min(args.verbose, 3)],
level=log_levels[min(args.verbose, 2)],
)
if args.debug_helpers:
logging.getLogger("github_helper").setLevel(logging.DEBUG)
logging.getLogger("git_helper").setLevel(logging.DEBUG)
# Create a cache directory
if not p.isdir(CACHE_PATH):
os.mkdir(CACHE_PATH, 0o700)
@@ -413,35 +368,29 @@ def main():
logging.info("Using %s..%s as changelog interval", FROM_REF, TO_REF)
# use the merge-base commit as a starting point, if the given ref is in another branch
base_commit = runner.run(f"git merge-base '{FROM_REF}^{{}}' '{TO_REF}^{{}}'")
# Get starting and ending dates for gathering PRs
# Add one day before and after to mitigate possible TZ issues
# The `tag^{}` format gives the commit ref when we have annotated tags
# The %cs format gives the committer date, which works better for cherry-picked commits
from_date = runner.run(f"git log -1 --format=format:%cs '{FROM_REF}^{{}}'")
from_date = (date.fromisoformat(from_date) - timedelta(1)).isoformat()
from_date = runner.run(f"git log -1 --format=format:%cs '{base_commit}'")
to_date = runner.run(f"git log -1 --format=format:%cs '{TO_REF}^{{}}'")
to_date = (date.fromisoformat(to_date) + timedelta(1)).isoformat()
merged = (
date.fromisoformat(from_date) - timedelta(1),
date.fromisoformat(to_date) + timedelta(1),
)
# Get all PRs for the given time frame
global GitHub
GitHub = Github(
global gh
gh = GitHub(
args.gh_user_or_token, args.gh_password, per_page=100, pool_size=args.jobs
)
query = f"type:pr repo:{args.repo} is:merged merged:{from_date}..{to_date}"
repo = GitHub.get_repo(args.repo)
api_prs = GitHub.search_issues(query=query, sort="created")
logging.info("Found %s PRs for the query: '%s'", api_prs.totalCount, query)
gh.cache_path = CACHE_PATH
query = f"type:pr repo:{args.repo} is:merged"
prs = gh.get_pulls_from_search(query=query, merged=merged, sort="created")
issues = [] # type: List[Issue]
while True:
try:
for issue in api_prs:
issues.append(issue)
break
except RateLimitExceededException:
sleep_on_rate_limit()
descriptions = get_descriptions(repo, issues, args.jobs)
descriptions = get_descriptions(prs)
write_changelog(args.output, descriptions)
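
The reworked main() derives the changelog interval from git itself: the merge-base of FROM_REF and TO_REF anchors the start, committer dates bound the search window, and a single is:merged search with a merged-date filter replaces the old per-issue iteration. A hedged sketch of the date-window computation, with illustrative refs and a plain subprocess wrapper standing in for git_runner:

import subprocess
from datetime import date, timedelta

def run(cmd: str) -> str:
    return subprocess.check_output(cmd, shell=True, text=True).strip()

FROM_REF, TO_REF = "v22.7.1-stable", "v22.7.2.15-stable"  # illustrative refs

# The merge-base anchors the interval even if FROM_REF lives on another branch
base_commit = run(f"git merge-base '{FROM_REF}^{{}}' '{TO_REF}^{{}}'")
# %cs is the committer date (YYYY-MM-DD); ref^{} peels annotated tags to commits
from_date = run(f"git log -1 --format=format:%cs '{base_commit}'")
to_date = run(f"git log -1 --format=format:%cs '{TO_REF}^{{}}'")
# Widen by one day on each side to absorb time-zone differences
merged = (
    date.fromisoformat(from_date) - timedelta(1),
    date.fromisoformat(to_date) + timedelta(1),
)
print("PRs merged between", merged[0], "and", merged[1])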


@@ -0,0 +1 @@
../../tests/ci/github_helper.py