ClickHouse/utils/changelog/format-changelog.py

#!/usr/bin/python3

import argparse
import collections
import fuzzywuzzy.fuzz
import itertools
import json
import os
import re
import sys

# Command-line interface: a single optional positional argument naming the
# file that lists PR numbers; when it is omitted the list is read from stdin.
parser = argparse.ArgumentParser(
    description="Format changelog for given PRs.",
)
parser.add_argument(
    "file",
    nargs="?",
    default=sys.stdin,
    type=argparse.FileType("r", encoding="utf-8"),
    metavar="FILE",
    help="File with PR numbers, one per line.",
)
args = parser.parse_args()

def parse_one_pull_request(item):
    """Extract the changelog category and entry from a PR description.

    This function mirrors the PR description checks in
    ClickhousePullRequestTrigger.

    ``item`` is a dict with at least the keys ``"body"`` (the PR description,
    may be ``None`` or empty) and ``"title"``. On success the dict is updated
    in place with the keys ``"entry"`` and ``"category"``.

    Returns False if the PR should not be mentioned in the changelog (its
    category looks like documentation / non-significant / not for changelog),
    True otherwise. PRs without a category or without an entry are kept and
    labelled with a "NO CL ..." marker so that they show up in the output and
    the user can fix them.
    """
    category_header = re.compile(r"(?i)^[>*_ ]*change\s*log\s*category")
    entry_header = re.compile(
        r"(?i)^[>*_ ]*(short\s*description|change\s*log\s*entry)"
    )

    body = item["body"]
    raw_lines = body.split("\n") if body else []
    # Empty lines are kept on purpose: they delimit the parts of the
    # description. Each line is trimmed and its inner whitespace collapsed.
    text = [re.sub(r"\s+", " ", raw.strip()) for raw in raw_lines]
    total = len(text)

    category = ""
    entry = ""

    pos = 0
    while pos < total:
        current = text[pos]
        pos += 1

        if category_header.match(current):
            # At most one empty line may separate the header from the
            # category itself.
            if pos < total and not text[pos]:
                pos += 1
            if pos >= total:
                break
            category = re.sub(r"^[-*\s]*", "", text[pos])
            pos += 1
        elif entry_header.match(current):
            # At most one empty line may separate the header from the entry.
            if pos < total and not text[pos]:
                pos += 1
            # Everything up to the next empty line is the changelog entry.
            first = pos
            while pos < total and text[pos]:
                pos += 1
            entry = " ".join(text[first:pos])

    if not category:
        # Shouldn't happen, because description check in CI should catch such
        # PRs. Fall through, so that it shows up in output and the user can
        # fix it.
        category = "NO CL CATEGORY"

    # Drop the PR categories that are not meant for the changelog.
    not_for_changelog = re.match(
        r"(?i)doc|((non|in|not|un)[-\s]*significant)|(not[ ]*for[ ]*changelog)",
        category,
    )
    if not_for_changelog:
        return False

    if not entry:
        # Shouldn't happen, because description check in CI should catch such
        # PRs.
        category = "NO CL ENTRY"
        entry = "NO CL ENTRY:  '" + item["title"] + "'"

    # The entry is never empty at this point, so it always has a last char.
    entry = entry.strip()
    if not entry.endswith("."):
        entry += "."

    item["entry"] = entry
    item["category"] = category

    return True


# This array gives the preferred category order, and is also used to
# normalize category names.
categories_preferred_order = [
    "Backward Incompatible Change",
    "New Feature",
    "Performance Improvement",
    "Improvement",
    "Bug Fix",
    "Build/Testing/Packaging Improvement",
    "Other",
]


def _load_json(path):
    """Read and decode one JSON file as UTF-8, closing the handle promptly."""
    with open(path, encoding="utf-8") as json_file:
        return json.load(json_file)


# Maps a (normalized) category name to the list of PR dicts in that category.
category_to_pr = collections.defaultdict(list)
# Maps a user id (as found in pr["user"]["id"]) to the decoded user JSON.
users = {}
for line in args.file:
    pr_number = line.strip()
    if not pr_number:
        # A blank (e.g. trailing) line would otherwise make us look for
        # "pr.json" and crash.
        continue

    # PR descriptions are expected in "pr<NUMBER>.json" in the current
    # working directory.
    pr = _load_json(f"pr{pr_number}.json")
    assert pr["number"]
    if not parse_one_pull_request(pr):
        continue

    assert pr["category"]

    # Normalize category name: snap it to the first preferred name that is
    # at least 90% similar, so that small typos do not create new sections.
    for c in categories_preferred_order:
        if fuzzywuzzy.fuzz.ratio(pr["category"].lower(), c.lower()) >= 90:
            pr["category"] = c
            break

    category_to_pr[pr["category"]].append(pr)
    user_id = pr["user"]["id"]
    # User info is expected in "user<ID>.json"; read each file only once.
    if user_id not in users:
        users[user_id] = _load_json(f"user{user_id}.json")


def print_category(category):
    """Print one changelog section in Markdown to stdout.

    Emits a ``#### <category>`` heading, a blank line, one bullet per PR
    found in the module-level ``category_to_pr[category]`` and a closing
    blank line. The author of each PR is looked up in the module-level
    ``users`` mapping by ``pr["user"]["id"]``; the user's ``name`` is shown,
    falling back to ``login`` when the name is empty.

    Side effect: each PR's ``"entry"`` is rewritten in place so that bare
    issue references become Markdown links.
    """
    print("#### " + category)
    print()
    for pr in category_to_pr[category]:
        user = users[pr["user"]["id"]]
        user_name = user["name"] or user["login"]

        # Substitute issue links. Lookbehinds (rather than a consumed
        # "previous character" group) are used so that a reference at the
        # very beginning of the entry is linkified as well.
        # 1) issue number w/o markdown link (not already inside "[...]")
        pr["entry"] = re.sub(
            r"(?<!\[)#([0-9]{4,})",
            r"[#\1](https://github.com/ClickHouse/ClickHouse/issues/\1)",
            pr["entry"],
        )
        # 2) issue URL w/o markdown link (not already inside "(...)")
        pr["entry"] = re.sub(
            r"(?<!\()https://github\.com/ClickHouse/ClickHouse/issues/([0-9]{4,})",
            r"[#\1](https://github.com/ClickHouse/ClickHouse/issues/\1)",
            pr["entry"],
        )

        print(
            f'* {pr["entry"]} [#{pr["number"]}]({pr["html_url"]}) ([{user_name}]({user["html_url"]})).'
        )

    print()


# First the well-known categories, in their preferred order. Each printed
# category is removed from the mapping so that only the unrecognized ones
# are left for the second pass.
for category in categories_preferred_order:
    if category not in category_to_pr:
        continue
    print_category(category)
    del category_to_pr[category]

# Then whatever categories remain, in the order they were first encountered.
for category in category_to_pr:
    print_category(category)
simple backport script 2020-04-13 21:15:58 +00:00			`#!/usr/bin/python3`

			`import argparse`
			`import collections`
Changelog for 20.5 2020-07-03 08:57:38 +00:00			`import fuzzywuzzy.fuzz`
			`import itertools`
			`import json`
			`import os`
simple backport script 2020-04-13 21:15:58 +00:00			`import re`
Changelog for 20.5 2020-07-03 08:57:38 +00:00			`import sys`
simple backport script 2020-04-13 21:15:58 +00:00
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`parser = argparse.ArgumentParser(description="Format changelog for given PRs.")`
			`parser.add_argument(`
			`"file",`
			`metavar="FILE",`
			`type=argparse.FileType("r", encoding="utf-8"),`
			`nargs="?",`
			`default=sys.stdin,`
			`help="File with PR numbers, one per line.",`
			`)`
simple backport script 2020-04-13 21:15:58 +00:00			`args = parser.parse_args()`

			`# This function mirrors the PR description checks in ClickhousePullRequestTrigger.`
			`# Returns False if the PR should not be mentioned changelog.`
			`def parse_one_pull_request(item):`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`description = item["body"]`
simple backport script 2020-04-13 21:15:58 +00:00			`# Don't skip empty lines because they delimit parts of description`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`lines = [`
			`line`
			`for line in [`
			`x.strip() for x in (description.split("\n") if description else [])`
			`]`
			`]`
			`lines = [re.sub(r"\s+", " ", l) for l in lines]`
simple backport script 2020-04-13 21:15:58 +00:00
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`category = ""`
			`entry = ""`
simple backport script 2020-04-13 21:15:58 +00:00
			`if lines:`
			`i = 0`
			`while i < len(lines):`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`if re.match(r"(?i)^[>_ ]change\slog\scategory", lines[i]):`
simple backport script 2020-04-13 21:15:58 +00:00			`i += 1`
			`if i >= len(lines):`
			`break`
simple changelog script 2020-04-14 11:28:27 +00:00			`# Can have one empty line between header and the category itself. Filter it out.`
			`if not lines[i]:`
			`i += 1`
			`if i >= len(lines):`
			`break`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`category = re.sub(r"^[-\s]", "", lines[i])`
simple backport script 2020-04-13 21:15:58 +00:00			`i += 1`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`elif re.match(`
			`r"(?i)^[>_ ](short\sdescription\|change\slog\s*entry)", lines[i]`
			`):`
simple backport script 2020-04-13 21:15:58 +00:00			`i += 1`
			`# Can have one empty line between header and the entry itself. Filter it out.`
			`if i < len(lines) and not lines[i]:`
			`i += 1`
			`# All following lines until empty one are the changelog entry.`
			`entry_lines = []`
			`while i < len(lines) and lines[i]:`
			`entry_lines.append(lines[i])`
			`i += 1`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`entry = " ".join(entry_lines)`
simple backport script 2020-04-13 21:15:58 +00:00			`else:`
			`i += 1`

			`if not category:`
			`# Shouldn't happen, because description check in CI should catch such PRs.`
			`# Fall through, so that it shows up in output and the user can fix it.`
			`category = "NO CL CATEGORY"`

			`# Filter out the PR categories that are not for changelog.`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`if re.match(`
			`r"(?i)doc\|((non\|in\|not\|un)[-\s]significant)\|(not[ ]for[ ]*changelog)",`
			`category,`
			`):`
simple backport script 2020-04-13 21:15:58 +00:00			`return False`

			`if not entry:`
			`# Shouldn't happen, because description check in CI should catch such PRs.`
			`category = "NO CL ENTRY"`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`entry = "NO CL ENTRY: '" + item["title"] + "'"`
simple backport script 2020-04-13 21:15:58 +00:00
			`entry = entry.strip()`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`if entry[-1] != ".":`
			`entry += "."`
simple backport script 2020-04-13 21:15:58 +00:00
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`item["entry"] = entry`
			`item["category"] = category`
simple backport script 2020-04-13 21:15:58 +00:00
			`return True`

Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00
Changelog for 20.5 2020-07-03 08:57:38 +00:00			`# This array gives the preferred category order, and is also used to`
			`# normalize category names.`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`categories_preferred_order = [`
			`"Backward Incompatible Change",`
			`"New Feature",`
			`"Performance Improvement",`
			`"Improvement",`
			`"Bug Fix",`
			`"Build/Testing/Packaging Improvement",`
			`"Other",`
			`]`
simple backport script 2020-04-13 21:15:58 +00:00
			`category_to_pr = collections.defaultdict(lambda: [])`
			`users = {}`
simple changelog script 2020-04-14 09:11:09 +00:00			`for line in args.file:`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`pr = json.loads(open(f"pr{line.strip()}.json").read())`
			`assert pr["number"]`
simple backport script 2020-04-13 21:15:58 +00:00			`if not parse_one_pull_request(pr):`
			`continue`

Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`assert pr["category"]`
Changelog for 20.5 2020-07-03 08:57:38 +00:00
			`# Normalize category name`
			`for c in categories_preferred_order:`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`if fuzzywuzzy.fuzz.ratio(pr["category"].lower(), c.lower()) >= 90:`
			`pr["category"] = c`
Changelog for 20.5 2020-07-03 08:57:38 +00:00			`break`

Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`category_to_pr[pr["category"]].append(pr)`
			`user_id = pr["user"]["id"]`
			`users[user_id] = json.loads(open(f"user{user_id}.json").read())`

simple backport script 2020-04-13 21:15:58 +00:00
			`def print_category(category):`
Convert to python3 (#15007) 2020-10-02 16:54:07 +00:00			`print(("#### " + category))`
simple backport script 2020-04-13 21:15:58 +00:00			`print()`
			`for pr in category_to_pr[category]:`
			`user = users[pr["user"]["id"]]`
			`user_name = user["name"] if user["name"] else user["login"]`

Some changelogs 2020-11-13 06:28:36 +00:00			`# Substitute issue links.`
			`# 1) issue number w/o markdown link`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`pr["entry"] = re.sub(`
			`r"([^[])#([0-9]{4,})",`
			`r"\1[#\2](https://github.com/ClickHouse/ClickHouse/issues/\2)",`
			`pr["entry"],`
			`)`
Some changelogs 2020-11-13 06:28:36 +00:00			`# 2) issue URL w/o markdown link`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`pr["entry"] = re.sub(`
			`r"([^(])https://github.com/ClickHouse/ClickHouse/issues/([0-9]{4,})",`
			`r"\1[#\2](https://github.com/ClickHouse/ClickHouse/issues/\2)",`
			`pr["entry"],`
			`)`
simple backport script 2020-04-13 21:15:58 +00:00
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`print(`
			`f'* {pr["entry"]} [#{pr["number"]}]({pr["html_url"]}) ([{user_name}]({user["html_url"]})).'`
			`)`
simple backport script 2020-04-13 21:15:58 +00:00
			`print()`

Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00
simple backport script 2020-04-13 21:15:58 +00:00			`# Print categories in preferred order`
			`for category in categories_preferred_order:`
			`if category in category_to_pr:`
			`print_category(category)`
			`category_to_pr.pop(category)`

			`# Print the rest of the categories`
			`for category in category_to_pr:`
			`print_category(category)`