Rewrite changelog generator to pure python

2024-11-21 15:12:02 +00:00 · 2022-05-09 21:14:35 +02:00 · 2022-05-09 21:14:35 +02:00 · 6cdab10d29
commit 6cdab10d29
parent 24b7a7538d
6 changed files with 387 additions and 269 deletions
--- a/utils/changelog/README.md
+++ b/utils/changelog/README.md
@ -5,17 +5,14 @@ Generate github token:

 Dependencies:
 ```
-sudo apt-get install git curl jq python3 python3-fuzzywuzzy 
-```
-
-Update information about tags:
-```
-git fetch --tags
+sudo apt-get update
+sudo apt-get install git python3 python3-fuzzywuzzy python3-github
+python3 changelog.py -h
 ```

 Usage example:

 ```
-export GITHUB_USER=... GITHUB_TOKEN=ghp_...
-./changelog.sh v21.5.6.6-stable v21.6.2.7-prestable
+python3 changelog.py --output=changelog-v22.4.1.2305-prestable.md --gh-user-or-token="$GITHUB_TOKEN" v21.6.2.7-prestable
+python3 changelog.py --output=changelog-v22.4.1.2305-prestable.md --gh-user-or-token="$USER" --gh-password="$PASSWORD" v21.6.2.7-prestable
 ```
--- a/utils/changelog/changelog.py
+++ b/utils/changelog/changelog.py
@ -0,0 +1,378 @@
+#!/usr/bin/env python3
+
+import argparse
+import logging
+import re
+from datetime import date, timedelta
+from queue import Empty, Queue
+from subprocess import CalledProcessError, DEVNULL
+from threading import Thread
+from typing import Dict, List, Optional, TextIO
+
+from fuzzywuzzy.fuzz import ratio  # type: ignore
+from github import Github
+from github.NamedUser import NamedUser
+from github.PullRequest import PullRequest
+from github.Repository import Repository
+from git_helper import is_shallow, git_runner as runner
+
+# Preferred order of the changelog sections. write_changelog() emits the
+# categories listed here first, in this order, and generate_description()
+# fuzzy-matches parsed category names against these entries to normalize them.
+categories_preferred_order = (
+    "Backward Incompatible Change",
+    "New Feature",
+    "Performance Improvement",
+    "Improvement",
+    "Bug Fix",
+    "Build/Testing/Packaging Improvement",
+    "Other",
+)
+
+# Boundaries of the changelog interval. Both are module-level globals that are
+# assigned by check_refs() and read by Worker.run() and write_changelog().
+FROM_REF = ""
+TO_REF = ""
+
+
+class Description:
+    """A single changelog entry built from one pull request.
+
+    Instances compare and sort by the pull request number.
+    """
+
+    def __init__(
+        self, number: int, user: "NamedUser", html_url: str, entry: str, category: str
+    ):
+        self.number = number
+        self.html_url = html_url
+        self.user = user
+        self.entry = entry
+        self.category = category
+
+    @property
+    def formatted_entry(self) -> str:
+        """Return the entry as a markdown list item with PR and author links."""
+        # Substitute issue links.
+        # 1) issue number w/o markdown link
+        entry = re.sub(
+            r"([^[])#([0-9]{4,})",
+            r"\1[#\2](https://github.com/ClickHouse/ClickHouse/issues/\2)",
+            self.entry,
+        )
+        # 2) issue URL w/o markdown link
+        entry = re.sub(
+            r"([^(])https://github.com/ClickHouse/ClickHouse/issues/([0-9]{4,})",
+            r"\1[#\2](https://github.com/ClickHouse/ClickHouse/issues/\2)",
+            entry,
+        )
+        # Fall back to the login when the user has no display name
+        user_name = self.user.name if self.user.name else self.user.login
+        return (
+            f"* {entry} [#{self.number}]({self.html_url}) "
+            f"([{user_name}]({self.user.html_url}))."
+        )
+
+    # Sort PR descriptions by numbers
+    def __eq__(self, other) -> bool:
+        # Check the type of `other`, not of `self`: the reversed check accepted
+        # any object (everything is an instance of `object`) and then failed
+        # with AttributeError on `other.number`.
+        if not isinstance(other, Description):
+            return NotImplemented
+        return self.number == other.number
+
+    def __lt__(self, other: "Description") -> bool:
+        if not isinstance(other, Description):
+            return NotImplemented
+        return self.number < other.number
+
+
+class Worker(Thread):
+    """Thread that takes PR numbers from a shared queue and collects descriptions.
+
+    Descriptions of the PRs that belong to the FROM_REF..TO_REF interval are
+    accumulated in `self.response`, to be read after the thread is joined.
+    """
+
+    def __init__(self, request_queue: Queue, repo: Repository):
+        Thread.__init__(self)
+        self.queue = request_queue
+        self.repo = repo
+        self.response = []  # type: List[Description]
+
+    def run(self):
+        while True:
+            try:
+                # Non-blocking get: a blocking get() never raises Empty, so a
+                # worker that lost the race for the last item would hang forever
+                number = self.queue.get_nowait()
+            except Empty:
+                break
+            try:
+                self._process(number)
+            finally:
+                # Every fetched item must be marked as done, including the
+                # skipped ones, otherwise Queue.join() would never return
+                self.queue.task_done()
+
+    def _process(self, number: int) -> None:
+        """Add the description of PR `number` to the response if it is in range."""
+        api_pr = self.repo.get_pull(number)
+        merge_commit = api_pr.merge_commit_sha
+        try:
+            runner.run(f"git rev-parse '{merge_commit}'")
+        except CalledProcessError:
+            # It's possible that commit not in the repo, just continue
+            logging.info("PR %s does not belong to the repo", api_pr.number)
+            return
+
+        try:
+            # The merge commit must be an ancestor of TO_REF
+            # and a descendant of FROM_REF
+            runner.run(
+                f"git merge-base --is-ancestor '{merge_commit}' '{TO_REF}'",
+                stderr=DEVNULL,
+            )
+            runner.run(
+                f"git merge-base --is-ancestor '{FROM_REF}' '{merge_commit}'",
+                stderr=DEVNULL,
+            )
+        except CalledProcessError:
+            # Commit is not between from and to refs
+            return
+
+        desc = generate_description(api_pr, self.repo)
+        if desc is not None:
+            self.response.append(desc)
+
+
+def get_descriptions(
+    repo: Repository, numbers: List[int], jobs: int
+) -> Dict[str, List[Description]]:
+    """Collect PR descriptions in `jobs` worker threads, grouped by category.
+
+    Returns a dict mapping a category name to its descriptions, each list
+    sorted by PR number.
+    """
+    pending = Queue()  # type: Queue
+    for pr_number in numbers:
+        pending.put(pr_number)
+
+    pool = [Worker(pending, repo) for _ in range(jobs)]
+    for thread in pool:
+        thread.start()
+
+    by_category = {}  # type: Dict[str, List[Description]]
+    for thread in pool:
+        # The response is complete only after the thread has finished
+        thread.join()
+        for desc in thread.response:
+            by_category.setdefault(desc.category, []).append(desc)
+
+    for group in by_category.values():
+        group.sort()
+
+    return by_category
+
+
+def _positive_int(value: str) -> int:
+    """argparse type: parse `value` as an integer that is at least 1."""
+    number = int(value)
+    if number < 1:
+        raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
+    return number
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command line arguments of the changelog generator."""
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description="Generate a changelog in MD format between given tags. "
+        "It fetches all tags and unshallows the git repository automatically",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="count",
+        default=0,
+        help="set the script verbosity, could be used multiple times",
+    )
+    parser.add_argument(
+        "--output",
+        type=argparse.FileType("w"),
+        default="-",
+        help="output file for changelog",
+    )
+    parser.add_argument(
+        "--repo",
+        default="ClickHouse/ClickHouse",
+        help="a repository to query for pull-requests from GitHub",
+    )
+    parser.add_argument(
+        "--jobs",
+        # With zero or negative jobs no worker would be started and
+        # the changelog would be silently empty
+        type=_positive_int,
+        default=10,
+        help="number of jobs to get pull-requests info from GitHub API",
+    )
+    parser.add_argument(
+        "--gh-user-or-token",
+        help="user name or GH token to authenticate",
+    )
+    parser.add_argument(
+        "--gh-password",
+        help="a password that should be used when user is given",
+    )
+    parser.add_argument(
+        "--from",
+        dest="from_ref",
+        help="git ref for a starting point of changelog, by default is calculated "
+        "automatically to match a previous tag in history",
+    )
+    parser.add_argument(
+        "to_ref",
+        metavar="TO_REF",
+        help="git ref for the changelog end",
+    )
+    return parser.parse_args()
+
+
+# This function mirrors the PR description checks in ClickhousePullRequestTrigger.
+# Returns None if the PR should not be mentioned in the changelog.
+def generate_description(item: PullRequest, repo: Repository) -> Optional[Description]:
+    """Build a changelog Description from the body of a pull request.
+
+    When the head branch looks like `backport/<...>/<number>`, the original PR
+    `<number>` is fetched from `repo` and described instead, and the entry is
+    prefixed with a reference to the backport PR.
+
+    Returns None when the parsed category marks the PR as not for changelog.
+    """
+    backport_number = item.number
+    if item.head.ref.startswith("backport/"):
+        branch_parts = item.head.ref.split("/")
+        if len(branch_parts) == 3:
+            # The last part of the branch name is the original PR number
+            item = repo.get_pull(int(branch_parts[-1]))
+        else:
+            logging.warning(
+                "The branch %s doesn't match backport template, using PR %s as is",
+                item.head.ref,
+                item.number,
+            )
+    description = item.body
+    # Don't skip empty lines because they delimit parts of description
+    lines = [x.strip() for x in (description.split("\n") if description else [])]
+    # Collapse every run of whitespace into a single space
+    lines = [re.sub(r"\s+", " ", ln) for ln in lines]
+
+    category = ""
+    entry = ""
+
+    if lines:
+        i = 0
+        while i < len(lines):
+            # Header of the category section, e.g. "Changelog category"
+            if re.match(r"(?i)^[#>*_ ]*change\s*log\s*category", lines[i]):
+                i += 1
+                if i >= len(lines):
+                    break
+                # Can have one empty line between header and the category itself.
+                # Filter it out.
+                if not lines[i]:
+                    i += 1
+                    if i >= len(lines):
+                        break
+                # Drop the leading list markers and spaces
+                category = re.sub(r"^[-*\s]*", "", lines[i])
+                i += 1
+            # Header of the entry section, e.g. "Changelog entry"
+            elif re.match(
+                r"(?i)^[#>*_ ]*(short\s*description|change\s*log\s*entry)", lines[i]
+            ):
+                i += 1
+                # Can have one empty line between header and the entry itself.
+                # Filter it out.
+                if i < len(lines) and not lines[i]:
+                    i += 1
+                # All following lines until empty one are the changelog entry.
+                entry_lines = []
+                while i < len(lines) and lines[i]:
+                    entry_lines.append(lines[i])
+                    i += 1
+                entry = " ".join(entry_lines)
+            else:
+                i += 1
+
+    if not category:
+        # Shouldn't happen, because description check in CI should catch such PRs.
+        # Fall through, so that it shows up in output and the user can fix it.
+        category = "NO CL CATEGORY"
+
+    # Filter out the PR categories that are not for changelog.
+    # re.match anchors at the start, so e.g. any category starting with "doc"
+    # is filtered out.
+    if re.match(
+        r"(?i)doc|((non|in|not|un)[-\s]*significant)|(not[ ]*for[ ]*changelog)",
+        category,
+    ):
+        return None
+
+    # The numbers differ only when the original PR was fetched for a backport
+    # NOTE(review): the prefix makes `entry` non-empty, so a backport without
+    # a parsed entry never reaches the "NO CL ENTRY" fallback below - confirm
+    # that this is intended.
+    if backport_number != item.number:
+        entry = f"Backported in #{backport_number}: {entry}"
+
+    if not entry:
+        # Shouldn't happen, because description check in CI should catch such PRs.
+        category = "NO CL ENTRY"
+        entry = "NO CL ENTRY:  '" + item.title + "'"
+
+    entry = entry.strip()
+    if entry[-1] != ".":
+        entry += "."
+
+    # Normalize the category name to the first sufficiently similar known one
+    for c in categories_preferred_order:
+        if ratio(category.lower(), c.lower()) >= 90:
+            category = c
+            break
+
+    return Description(item.number, item.user, item.html_url, entry, category)
+
+
+def write_changelog(fd: TextIO, descriptions: Dict[str, List[Description]]):
+    """Write the markdown changelog for the FROM_REF..TO_REF interval to `fd`.
+
+    Categories from `categories_preferred_order` are written first, in that
+    order; the remaining ones follow in the iteration order of `descriptions`.
+    """
+    fd.write(f"### ClickHouse release {TO_REF} FIXME as compared to {FROM_REF}\n\n")
+
+    # Well-known categories that are present in the input
+    ordered = [name for name in categories_preferred_order if name in descriptions]
+    for name in ordered:
+        fd.write(f"#### {name}\n")
+        for item in descriptions[name]:
+            fd.write(f"{item.formatted_entry}\n")
+        fd.write("\n")
+
+    # The rest of the categories; unlike above, their header is followed
+    # by an empty line
+    for name, items in descriptions.items():
+        if name in ordered:
+            continue
+        fd.write(f"#### {name}\n\n")
+        for item in items:
+            fd.write(f"{item.formatted_entry}\n")
+        fd.write("\n")
+
+
+def check_refs(from_ref: Optional[str], to_ref: str):
+    """Validate the changelog boundaries and store them in FROM_REF and TO_REF.
+
+    When `from_ref` is None, FROM_REF is set to the closest tag reachable from
+    the first parent of `to_ref`.
+
+    Raises CalledProcessError if git fails on a ref, and Exception if the
+    parents of `to_ref` lead to different previous tags.
+    """
+    global FROM_REF, TO_REF
+    TO_REF = to_ref
+
+    # Check TO_REF
+    runner.run(f"git rev-parse '{TO_REF}'")
+
+    if from_ref is not None:
+        # Validate the given ref itself and only then publish it: checking the
+        # FROM_REF global before the assignment verified an empty string
+        runner.run(f"git rev-parse '{from_ref}'")
+        FROM_REF = from_ref
+        return
+
+    FROM_REF = runner.run(f"git describe --abbrev=0 --tags '{TO_REF}~'")
+    # Check if the previous tag is different for merge commits
+    # I __assume__ we won't have octopus merges, at least for the tagged commits
+    try:
+        alternative_tag = runner.run(
+            f"git describe --abbrev=0 --tags '{TO_REF}^2'", stderr=DEVNULL
+        )
+    except CalledProcessError:
+        # git could not describe the second parent, presumably because TO_REF
+        # is not a merge commit; there is nothing to compare then
+        return
+
+    if FROM_REF != alternative_tag:
+        raise Exception(
+            f"Unable to get unified parent tag for {TO_REF}, "
+            f"define it manually, get {FROM_REF} and {alternative_tag}"
+        )
+
+
+def main():
+    """Generate the changelog for the refs given on the command line."""
+    # Every `-v` switches to the next level, anything above `-vvv` means DEBUG
+    log_levels = [logging.CRITICAL, logging.WARN, logging.INFO, logging.DEBUG]
+    args = parse_args()
+    logging.basicConfig(
+        format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d]:\n%(message)s",
+        level=log_levels[min(args.verbose, 3)],
+    )
+    # Get the full repo
+    if is_shallow():
+        logging.info("Unshallow repository")
+        runner.run("git fetch --unshallow", stderr=DEVNULL)
+    logging.info("Fetching all tags")
+    runner.run("git fetch --tags", stderr=DEVNULL)
+
+    # Sets the FROM_REF and TO_REF globals used below
+    check_refs(args.from_ref, args.to_ref)
+
+    logging.info("Using %s..%s as changelog interval", FROM_REF, TO_REF)
+
+    # Get starting and ending dates for gathering PRs
+    # Add one day after and before to mitigate TZ possible issues
+    # `tag^{}` format gives commit ref when we have annotated tags
+    # The output of `%as` is parsed below as an ISO date
+    from_date = runner.run(f"git log -1 --format=format:%as '{FROM_REF}^{{}}'")
+    from_date = (date.fromisoformat(from_date) - timedelta(1)).isoformat()
+    to_date = runner.run(f"git log -1 --format=format:%as '{TO_REF}^{{}}'")
+    to_date = (date.fromisoformat(to_date) + timedelta(1)).isoformat()
+
+    # Get all PRs for the given time frame
+    gh = Github(
+        args.gh_user_or_token, args.gh_password, per_page=100, pool_size=args.jobs
+    )
+    query = f"type:pr repo:{args.repo} is:merged merged:{from_date}..{to_date}"
+    repo = gh.get_repo(args.repo)
+    api_prs = gh.search_issues(query=query, sort="created")
+    logging.info("Found %s PRs for the query: '%s'", api_prs.totalCount, query)
+
+    pr_numbers = [pr.number for pr in api_prs]
+
+    # The date range is wider than the interval; the workers keep only the PRs
+    # whose merge commit lies between FROM_REF and TO_REF
+    descriptions = get_descriptions(repo, pr_numbers, args.jobs)
+
+    write_changelog(args.output, descriptions)
+
+
+if __name__ == "__main__":
+    main()
--- a/utils/changelog/changelog.sh
+++ b/utils/changelog/changelog.sh
@ -1,96 +0,0 @@
-#!/bin/bash
-set -e
-
-script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-from="$1"
-to="$2"
-log_command=(git log "$from..$to" --first-parent)
-
-"${log_command[@]}" > "changelog-log.txt"
-
-# Check for diamond merges.
-if "${log_command[@]}" --oneline --grep "Merge branch '" | grep ''
-then
-    # DO NOT ADD automated handling of diamond merges to this script.
-    # It is an unsustainable way to work with git, and it MUST be visible.
-    echo Warning: suspected diamond merges above.
-    echo Some commits will be missed, review these manually.
-fi
-
-# Search for PR numbers in commit messages. First variant is normal merge, and second
-# variant is squashed. Next are some backport message variants.
-find_prs=(sed -n "s/^.*merg[eding]*.*#\([[:digit:]]\+\).*$/\1/Ip;
-                  s/^.*(#\([[:digit:]]\+\))$/\1/p;
-                  s/^.*back[- ]*port[ed of]*.*#\([[:digit:]]\+\).*$/\1/Ip;
-                  s/^.*cherry[- ]*pick[ed of]*.*#\([[:digit:]]\+\).*$/\1/Ip")
-
-# awk is to filter out small task numbers from different task tracker, which are
-# referenced by documentation commits like '* DOCSUP-824: query log (#115)'.
-"${find_prs[@]}" "changelog-log.txt" | sort -rn | uniq | awk '$0 > 1000 { print $0 }' > "changelog-prs.txt"
-
-echo "$(wc -l < "changelog-prs.txt") PRs added between $from and $to."
-if [ $(wc -l < "changelog-prs.txt") -eq 0 ] ; then exit 0 ; fi
-
-function github_download()
-{
-    local url=${1}
-    local file=${2}
-    if ! [ -f "$file" ]
-    then
-        echo "curl -u \"$GITHUB_USER:***\" -sSf \"$url\" > \"$file\""
-
-        if ! curl -u "$GITHUB_USER:$GITHUB_TOKEN" \
-                -sSf "$url" \
-                > "$file"
-        then
-            >&2 echo "Failed to download '$url' to '$file'. Contents: '$(cat "$file")'."
-            rm "$file"
-            return 1
-        fi
-        sleep 0.1
-    fi
-}
-
-rm changelog-prs-filtered.txt &> /dev/null ||:
-for pr in $(cat "changelog-prs.txt")
-do
-    # Download PR info from github.
-    file="pr$pr.json"
-    github_download "https://api.github.com/repos/ClickHouse/ClickHouse/pulls/$pr" "$file" || continue
-
-    if ! [ "$pr" == "$(jq -r .number "$file")" ]
-    then
-        >&2 echo "Got wrong data for PR #$pr (please check and remove '$file')."
-        continue
-    fi
-
-    # Filter out PRs by bots.
-    user_login=$(jq -r .user.login "$file")
-
-    filter_bot=$(echo "$user_login" | grep -q "\[bot\]$" && echo "Skip." || echo "Ok." ||:)
-    filter_robot=$(echo "$user_login" | grep -q "robot-clickhouse" && echo "Skip." || echo "Ok." ||:)
-
-    if [ "Skip." == "$filter_robot" ] || [ "Skip." == "$filter_bot" ]
-    then
-        continue
-    fi
-
-    # Download author info from github.
-    user_id=$(jq -r .user.id "$file")
-    user_file="user$user_id.json"
-    github_download "$(jq -r .user.url "$file")" "$user_file" || continue
-
-    if ! [ "$user_id" == "$(jq -r .id "$user_file")" ]
-    then
-        >&2 echo "Got wrong data for user #$user_id (please check and remove '$user_file')."
-        continue
-    fi
-
-    echo "$pr" >> changelog-prs-filtered.txt
-done
-
-echo "### ClickHouse release $to FIXME as compared to $from
-" > changelog.md
-"$script_dir/format-changelog.py" changelog-prs-filtered.txt >> changelog.md
-cat changelog.md
--- a/utils/changelog/format-changelog.py
+++ b/utils/changelog/format-changelog.py
@ -1,165 +0,0 @@
-#!/usr/bin/python3
-
-import argparse
-import collections
-import fuzzywuzzy.fuzz
-import itertools
-import json
-import os
-import re
-import sys
-
-parser = argparse.ArgumentParser(description="Format changelog for given PRs.")
-parser.add_argument(
-    "file",
-    metavar="FILE",
-    type=argparse.FileType("r", encoding="utf-8"),
-    nargs="?",
-    default=sys.stdin,
-    help="File with PR numbers, one per line.",
-)
-args = parser.parse_args()
-
-# This function mirrors the PR description checks in ClickhousePullRequestTrigger.
-# Returns False if the PR should not be mentioned changelog.
-def parse_one_pull_request(item):
-    description = item["body"]
-    # Don't skip empty lines because they delimit parts of description
-    lines = [
-        line
-        for line in [
-            x.strip() for x in (description.split("\n") if description else [])
-        ]
-    ]
-    lines = [re.sub(r"\s+", " ", l) for l in lines]
-
-    category = ""
-    entry = ""
-
-    if lines:
-        i = 0
-        while i < len(lines):
-            if re.match(r"(?i)^[>*_ ]*change\s*log\s*category", lines[i]):
-                i += 1
-                if i >= len(lines):
-                    break
-                # Can have one empty line between header and the category itself. Filter it out.
-                if not lines[i]:
-                    i += 1
-                    if i >= len(lines):
-                        break
-                category = re.sub(r"^[-*\s]*", "", lines[i])
-                i += 1
-            elif re.match(
-                r"(?i)^[>*_ ]*(short\s*description|change\s*log\s*entry)", lines[i]
-            ):
-                i += 1
-                # Can have one empty line between header and the entry itself. Filter it out.
-                if i < len(lines) and not lines[i]:
-                    i += 1
-                # All following lines until empty one are the changelog entry.
-                entry_lines = []
-                while i < len(lines) and lines[i]:
-                    entry_lines.append(lines[i])
-                    i += 1
-                entry = " ".join(entry_lines)
-            else:
-                i += 1
-
-    if not category:
-        # Shouldn't happen, because description check in CI should catch such PRs.
-        # Fall through, so that it shows up in output and the user can fix it.
-        category = "NO CL CATEGORY"
-
-    # Filter out the PR categories that are not for changelog.
-    if re.match(
-        r"(?i)doc|((non|in|not|un)[-\s]*significant)|(not[ ]*for[ ]*changelog)",
-        category,
-    ):
-        return False
-
-    if not entry:
-        # Shouldn't happen, because description check in CI should catch such PRs.
-        category = "NO CL ENTRY"
-        entry = "NO CL ENTRY:  '" + item["title"] + "'"
-
-    entry = entry.strip()
-    if entry[-1] != ".":
-        entry += "."
-
-    item["entry"] = entry
-    item["category"] = category
-
-    return True
-
-
-# This array gives the preferred category order, and is also used to
-# normalize category names.
-categories_preferred_order = [
-    "Backward Incompatible Change",
-    "New Feature",
-    "Performance Improvement",
-    "Improvement",
-    "Bug Fix",
-    "Build/Testing/Packaging Improvement",
-    "Other",
-]
-
-category_to_pr = collections.defaultdict(lambda: [])
-users = {}
-for line in args.file:
-    pr = json.loads(open(f"pr{line.strip()}.json").read())
-    assert pr["number"]
-    if not parse_one_pull_request(pr):
-        continue
-
-    assert pr["category"]
-
-    # Normalize category name
-    for c in categories_preferred_order:
-        if fuzzywuzzy.fuzz.ratio(pr["category"].lower(), c.lower()) >= 90:
-            pr["category"] = c
-            break
-
-    category_to_pr[pr["category"]].append(pr)
-    user_id = pr["user"]["id"]
-    users[user_id] = json.loads(open(f"user{user_id}.json").read())
-
-
-def print_category(category):
-    print(("#### " + category))
-    print()
-    for pr in category_to_pr[category]:
-        user = users[pr["user"]["id"]]
-        user_name = user["name"] if user["name"] else user["login"]
-
-        # Substitute issue links.
-        # 1) issue number w/o markdown link
-        pr["entry"] = re.sub(
-            r"([^[])#([0-9]{4,})",
-            r"\1[#\2](https://github.com/ClickHouse/ClickHouse/issues/\2)",
-            pr["entry"],
-        )
-        # 2) issue URL w/o markdown link
-        pr["entry"] = re.sub(
-            r"([^(])https://github.com/ClickHouse/ClickHouse/issues/([0-9]{4,})",
-            r"\1[#\2](https://github.com/ClickHouse/ClickHouse/issues/\2)",
-            pr["entry"],
-        )
-
-        print(
-            f'* {pr["entry"]} [#{pr["number"]}]({pr["html_url"]}) ([{user_name}]({user["html_url"]})).'
-        )
-
-    print()
-
-
-# Print categories in preferred order
-for category in categories_preferred_order:
-    if category in category_to_pr:
-        print_category(category)
-        category_to_pr.pop(category)
-
-# Print the rest of the categories
-for category in category_to_pr:
-    print_category(category)
--- a/utils/changelog/git_helper.py
+++ b/utils/changelog/git_helper.py
@ -0,0 +1 @@
+../../tests/ci/git_helper.py
--- a/utils/changelog/requirements.txt
+++ b/utils/changelog/requirements.txt
@ -0,0 +1,3 @@
+fuzzywuzzy
+PyGitHub
+python-Levenshtein