ClickHouse/utils/make_changelog.py

499 lines
18 KiB
Python
Executable File

#!/usr/bin/env python
# Note: should work with python 2 and 3
from __future__ import print_function
import requests
import json
import subprocess
import re
import os
import time
import logging
import codecs
import argparse
GITHUB_API_URL = 'https://api.github.com/'
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
def http_get_json(url, token, max_retries, retry_timeout):
    """GET ``url`` and return the decoded JSON body, retrying transient failures.

    A request is retried (after sleeping ``retry_timeout`` seconds) when the
    server answers with a 5xx status, or with 403 and a JSON ``message`` that
    starts with 'API rate limit exceeded'. Any other non-200 answer, or running
    out of attempts, raises ``Exception`` with the status code and body text.

    :param url: full URL to request.
    :param token: GitHub token; when empty, no Authorization header is sent.
    :param max_retries: maximum number of attempts (at least one is always made).
    :param retry_timeout: seconds to sleep between attempts.
    """
    headers = {"Authorization": "token {}".format(token)} if token else None
    # Always try at least once, so that a non-positive max_retries cannot make
    # the function fall through and silently return None.
    attempts = max(1, max_retries)

    for attempt in range(attempts):
        resp = requests.get(url, headers=headers)
        if resp.status_code == 200:
            return resp.json()

        msg = "Request {} failed with code {}.\n{}\n".format(url, resp.status_code, resp.text)

        # Decide on the status code first: a 5xx body is frequently not JSON,
        # and parsing it must not prevent the retry.
        retryable = resp.status_code >= 500
        if not retryable and resp.status_code == 403:
            try:
                retryable = resp.json()['message'].startswith('API rate limit exceeded')
            except (ValueError, KeyError, TypeError, AttributeError):
                # Body is not the expected JSON error object: not a rate limit.
                retryable = False

        if retryable and attempt + 1 < attempts:
            logging.warning(msg)
            time.sleep(retry_timeout)
            continue

        raise Exception(msg)
def github_api_get_json(query, token, max_retries, retry_timeout):
    """Run a GitHub API query given relative to GITHUB_API_URL and return its JSON.

    ``query`` is appended to GITHUB_API_URL unchanged; the remaining arguments
    are forwarded to http_get_json as they are.
    """
    full_url = GITHUB_API_URL + query
    return http_get_json(full_url, token, max_retries, retry_timeout)
def check_sha(sha):
    """Raise ``Exception`` unless ``sha`` looks like a (possibly abbreviated) git sha.

    A valid value consists solely of hexadecimal digits and is at least
    7 characters long.
    """
    # Only 0-9 and a-f are hexadecimal digits; \Z (rather than $) also rejects
    # a trailing newline.
    if not re.match(r'^[0-9a-fA-F]{7,}\Z', sha):
        raise Exception("String " + sha + " doesn't look like git sha.")
def get_merge_base(first, second, project_root):
    """Return the sha printed by ``git merge-base first second``.

    The command is run with ``project_root`` as the working directory. Any
    failure (git exiting with an error, empty output, output that does not look
    like a sha) is logged and the original exception is re-raised.
    """
    try:
        # Argument list instead of a shell string: revisions are passed to git
        # verbatim, and check_output waits for git and raises on a non-zero
        # exit code (the same call rev_parse relies on).
        output = subprocess.check_output(['git', 'merge-base', first, second], cwd=project_root)
        text = output.decode('utf-8', 'ignore')
        sha = text.split()[0]
        check_sha(sha)
        return sha
    except Exception:
        logging.error('Cannot find merge base for %s and %s', first, second)
        raise
def rev_parse(rev, project_root):
    """Return the sha printed by ``git rev-parse rev``.

    The command is run with ``project_root`` as the working directory. Any
    failure (git exiting with an error, empty output, output that does not look
    like a sha) is logged and the original exception is re-raised.
    """
    try:
        # Argument list instead of a shell string, so that the revision name is
        # handed to git verbatim and is never interpreted by a shell.
        output = subprocess.check_output(['git', 'rev-parse', rev], cwd=project_root)
        text = output.decode('utf-8', 'ignore')
        sha = text.split()[0]
        check_sha(sha)
        return sha
    except Exception:
        logging.error('Cannot find revision %s', rev)
        raise
# Get list of commits from branch to base_sha. Update commits_info.
def get_commits_from_branch(repo, branch, base_sha, commits_info, max_pages, token, max_retries, retry_timeout):
    """Return the shas on ``branch`` that come before ``base_sha`` in the API listing.

    Commits are requested page by page from the GitHub commits API. Every commit
    seen on a fetched page (including ``base_sha`` and anything after it on that
    page) is recorded in ``commits_info`` under its sha, unless already present.

    :param commits_info: dict sha -> commit object; updated in place.
    :param max_pages: maximum number of pages to request.
    :raises Exception: if ``base_sha`` is not found within the fetched pages.
    """
    def get_commits_from_page(page):
        query = 'repos/{}/commits?sha={}&page={}'.format(repo, branch, page)
        resp = github_api_get_json(query, token, max_retries, retry_timeout)
        for commit in resp:
            commits_info.setdefault(commit['sha'], commit)
        return [commit['sha'] for commit in resp]

    commits = []
    found_base_commit = False
    pages_checked = 0

    # GitHub pagination is 1-based. Starting at 0 would presumably return the
    # first page twice and duplicate its commits in the result.
    for page in range(1, max_pages + 1):
        page_commits = get_commits_from_page(page)
        pages_checked += 1

        if not page_commits:
            # An empty page means the history is exhausted; later pages are empty too.
            break

        for sha in page_commits:
            if sha == base_sha:
                found_base_commit = True
                break
            commits.append(sha)

        if found_base_commit:
            break

    if not found_base_commit:
        raise Exception("Can't found base commit sha {} in branch {}. Checked {} commits on {} pages.\nCommits: {}"
                        .format(base_sha, branch, len(commits), pages_checked, ' '.join(commits)))

    return commits
# Get list of commits a specified commit is cherry-picked from. Can return an empty list.
def parse_original_commits_from_cherry_pick_message(commit_message):
    """Extract the commit ids listed in a '(cherry picked from commit[s] ...)' note.

    The plural marker is searched first, then the singular one. The text between
    the marker and the next ')' is split on commas; blank pieces are dropped.
    Returns an empty list when there is no marker or no closing parenthesis.
    """
    for marker in ('(cherry picked from commits', '(cherry picked from commit'):
        start = commit_message.find(marker)
        if start != -1:
            break
    else:
        # Neither form of the marker is present.
        return []

    start += len(marker)
    stop = commit_message.find(')', start)
    if stop == -1:
        return []

    pieces = (piece.strip() for piece in commit_message[start:stop].split(','))
    return [piece for piece in pieces if piece]
# Use GitHub search api to check if commit from any pull request. Update pull_requests info.
def find_pull_request_for_commit(commit_info, pull_requests, token, max_retries, retry_timeout):
    """Search GitHub for pull requests mentioning a commit and record them.

    The search covers the commit's own sha plus any shas named in a
    'cherry picked from' note in its message. Every hit that carries a
    'pull_request' field is stored in ``pull_requests`` (keyed by number, with
    title, description and user login) unless that number is already there.

    Returns True if at least one pull request was found, otherwise False.
    """
    shas = [commit_info['sha']]
    shas.extend(parse_original_commits_from_cherry_pick_message(commit_info['commit']['message']))

    # NOTE: `repo` is not a parameter of this function; it is resolved as a
    # module-level name at call time (the script assigns it in its __main__ section).
    query = 'search/issues?q={}+type:pr+repo:{}&sort=created&order=asc'.format(' '.join(shas), repo)
    response = github_api_get_json(query, token, max_retries, retry_timeout)

    matched = False
    for issue in response['items']:
        if 'pull_request' not in issue:
            continue
        matched = True
        pr_number = issue['number']
        if pr_number in pull_requests:
            continue
        pull_requests[pr_number] = {
            'title': issue['title'],
            'description': issue['body'],
            'user': issue['user']['login'],
        }

    return matched
# Find pull requests from list of commits. If no pull request found, add commit to not_found_commits list.
def find_pull_requests(commits, commits_info, token, max_retries, retry_timeout):
    """Look up the pull requests behind each commit in ``commits``.

    Returns a pair ``(not_found_commits, pull_requests)``: the shas for which no
    pull request was found, and a dict of the pull requests that were found.
    Progress is logged after every tenth commit.
    """
    pull_requests = {}
    not_found_commits = []

    for processed, sha in enumerate(commits, 1):
        if processed % 10 == 0:
            logging.info('Processed %d commits', processed)
        has_pull_request = find_pull_request_for_commit(
            commits_info[sha], pull_requests, token, max_retries, retry_timeout)
        if not has_pull_request:
            not_found_commits.append(sha)

    return not_found_commits, pull_requests
# Find pull requests by list of numbers
def find_pull_requests_by_num(pull_requests_nums, token, max_retries, retry_timeout):
    """Fetch the listed pull requests from the GitHub API.

    Returns a dict keyed by the pull request number reported by the API; each
    value holds the title, description (body) and user login. A number that
    shows up more than once keeps its first entry.
    """
    found = {}
    for requested_num in pull_requests_nums:
        # NOTE: `repo` is not a parameter of this function; it is resolved as a
        # module-level name at call time (the script assigns it in its __main__ section).
        endpoint = 'repos/{}/pulls/{}'.format(repo, requested_num)
        pr = github_api_get_json(endpoint, token, max_retries, retry_timeout)
        pr_number = pr['number']
        if pr_number in found:
            continue
        found[pr_number] = {
            'title': pr['title'],
            'description': pr['body'],
            'user': pr['user']['login'],
        }
    return found
# Get users for all unknown commits and pull requests.
def get_users_info(pull_requests, commits_info, token, max_retries, retry_timeout):
    """Fetch GitHub user objects for pull request users and commit authors.

    Returns a dict login -> user object as returned by the 'users/<login>' API
    query. Each login is requested at most once. Commits without a GitHub author
    login are skipped with a warning.
    """
    users = {}

    def update_user(user):
        if user not in users:
            query = 'users/{}'.format(user)
            users[user] = github_api_get_json(query, token, max_retries, retry_timeout)

    for pull_request in pull_requests.values():
        update_user(pull_request['user'])

    for commit_info in commits_info.values():
        # Use the *author* login: that is the key process_unknown_commits uses
        # to look users up, and the committer may be None even when the author
        # is present.
        author = commit_info.get('author')
        if author and 'login' in author:
            update_user(author['login'])
        else:
            logging.warning('Not found author for commit %s.', commit_info['html_url'])

    return users
# List of unknown commits -> text description.
def process_unknown_commits(commits, commits_info, users):
    """Render the commits that belong to no pull request as a text section.

    Each commit becomes a 'Commit / Author / Message' paragraph. The author is
    'Unknown' when the commit has no GitHub author; otherwise it is a markdown
    link labelled with the GitHub user's name, else the name recorded in the
    commit, else the login ('Unknown' when the author has no login).
    """
    entry_template = u'Commit: [{}]({})\nAuthor: {}\nMessage: {}'

    def describe_author(info):
        github_author = info['author']
        if not github_author:
            return 'Unknown'

        display_name = None
        if 'login' in github_author:
            login = github_author['login']
            # Preferred source: the name from the GitHub user object.
            try:
                display_name = users[login]['name']
            except KeyError:
                pass
        else:
            login = 'Unknown'

        # Fallback: the author name stored in the commit itself.
        if not display_name:
            try:
                display_name = info['commit']['author']['name']
            except KeyError:
                pass

        return '[{}]({})'.format(display_name or login, github_author['html_url'])

    entries = []
    for sha in commits:
        info = commits_info[sha]
        html_url = info['html_url']
        message = info['commit']['message']
        entries.append(entry_template.format(sha, html_url, describe_author(info), message))

    header = 'Commits which are not from any pull request:\n\n'
    return header + '\n\n'.join(entries)
# This function mirrors the PR description checks in ClickhousePullRequestTrigger.
# Returns False if the PR should not be mentioned in the changelog.
def parse_one_pull_request(item):
    """Extract the changelog category and entry text from a PR description.

    The description is scanned line by line (blank lines dropped, whitespace
    collapsed) for a 'category ...:' header, a 'Short description' /
    'Changelog entry' header and a 'Detailed description' header. The line
    following a header is taken as its value.

    On success ``item['category']`` and ``item['entry']`` are set and True is
    returned. The entry is the first line of the short description (with ' ...'
    appended when more lines follow it), or the PR title when there is no
    usable short description.

    Returns False when the description is missing, has no category header, or
    the category is a documentation / non-significant one.
    """
    # A PR without a body has a None description; treat it as empty text.
    description = item['description'] or ''
    lines = [line.strip() for line in description.split('\n')]
    lines = [re.sub(r'\s+', ' ', line) for line in lines if line]

    cat_pos = None
    short_descr_pos = None
    long_descr_pos = None

    # The last line cannot be a header: a header needs a following value line.
    for i, line in enumerate(lines[:-1]):
        if re.match(r'(?i).*category.*:$', line):
            cat_pos = i
        if re.match(r'(?i)^\**\s*(Short description|Change\s*log entry)', line):
            short_descr_pos = i
        if re.match(r'(?i)^\**\s*Detailed description', line):
            long_descr_pos = i

    if cat_pos is None:
        return False

    cat = lines[cat_pos + 1]
    cat = re.sub(r'^[-*\s]*', '', cat)

    # Filter out the PR categories that are not for changelog.
    if re.match(r'(?i)doc|((non|in|not|un)[-\s]*significant)', cat):
        return False

    short_descr = ''
    # Compare against None: position 0 is a valid header position.
    if short_descr_pos is not None:
        # The detailed-description header only ends the short description when
        # it comes after the short-description header.
        if long_descr_pos is not None and long_descr_pos > short_descr_pos:
            short_descr_end = long_descr_pos
        else:
            short_descr_end = len(lines)
        short_descr = lines[short_descr_pos + 1]
        if short_descr_pos + 2 != short_descr_end:
            short_descr += ' ...'

    # If we have nothing meaningful
    if not re.match(r'\w', short_descr):
        short_descr = item['title']

    # TODO: Add detailed description somewhere

    item['entry'] = short_descr
    item['category'] = cat

    return True
# List of pull requests -> text description.
def process_pull_requests(pull_requests, users, repo):
    """Render pull requests as markdown sections grouped by category.

    Pull requests rejected by parse_one_pull_request are skipped. Every other
    one becomes a bullet with its entry text, a link to the pull request and an
    author link ('author not found' when the user is not in ``users``).
    Categories from the preferred list come first, in that order; the remaining
    ones follow, ordered by lower-cased name.
    """
    entry_template = u"{} [#{}]({}) ({})"
    by_category = {}

    for pr_number, pr in pull_requests.items():
        if not parse_one_pull_request(pr):
            continue

        url = 'https://github.com/{}/pull/{}'.format(repo, pr_number)
        if pr['user'] in users:
            # TODO get user from any commit if no user name on github
            profile = users[pr['user']]
            credit = u'[{}]({})'.format(profile['name'] or profile['login'], profile['html_url'])
        else:
            credit = 'author not found'

        bucket = by_category.setdefault(pr['category'], [])
        bucket.append(entry_template.format(pr['entry'], pr_number, url, credit))

    preferred = ['Backward Incompatible Change', 'New Feature', 'Bug Fix', 'Improvement', 'Performance Improvement', 'Build/Testing/Packaging Improvement', 'Other']
    # Preferred categories sort by their zero-padded position in the list.
    rank = {}
    for position, category in enumerate(preferred):
        rank[category] = str(position).zfill(3)

    def sort_key(pair):
        category = pair[0]
        if category in rank:
            return rank[category]
        return category.lower()

    sections = []
    for category, entries in sorted(by_category.items(), key=sort_key):
        bullets = '\n'.join(u'* {}'.format(entry) for entry in entries)
        sections.append(u'### {}\n{}'.format(category if category else u'[No category]', bullets))

    return '\n\n'.join(sections)
# Load inner state. For debug purposes.
def load_state(state_file, base_sha, new_tag, prev_tag):
    """Load the cached state from ``state_file`` if it matches the current run.

    Returns the loaded dict only when it contains 'base_sha', 'new_tag' and
    'prev_tag' and all three equal the given arguments. In every other case
    (no file name, missing or unreadable file, malformed content, different
    tags) an empty dict is returned so that the caller starts from scratch.
    """
    state = {}
    if state_file:
        try:
            if os.path.exists(state_file):
                logging.info('Reading state from %s', state_file)
                with codecs.open(state_file, encoding='utf-8') as f:
                    state = json.loads(f.read())
            else:
                logging.info('State file does not exist. Will create new one.')
        except Exception as e:
            logging.warning('Cannot load state from %s. Reason: %s', state_file, str(e))
            state = {}

    if not state:
        # Covers an empty object as well as falsy non-dict JSON such as null.
        return {}

    required_keys = ('base_sha', 'new_tag', 'prev_tag')
    if not isinstance(state, dict) or any(key not in state for key in required_keys):
        # Actually discard the invalid state, as the message promises, so that
        # stale data from it is not reused.
        logging.warning('Invalid state. Will create new one.')
        return {}

    if (state['base_sha'], state['new_tag'], state['prev_tag']) != (base_sha, new_tag, prev_tag):
        logging.info('Loaded state has different tags or merge base sha. Will create new state.')
        return {}

    logging.info('State loaded.')
    return state
# Save inner state. For debug purposes.
def save_state(state_file, state):
    """Write ``state`` to ``state_file`` as indented UTF-8 JSON.

    Does nothing when ``state_file`` is empty or None, mirroring load_state,
    which also treats a missing file name as "no state file".
    """
    if not state_file:
        return
    with codecs.open(state_file, 'w', encoding='utf-8') as f:
        f.write(json.dumps(state, indent=4, separators=(',', ': ')))
def make_changelog(new_tag, prev_tag, pull_requests_nums, repo, repo_folder, state_file, token, max_retries, retry_timeout):
    """Build the changelog text and print it to stdout.

    With both tags given, the commits between their merge base and ``new_tag``
    are collected and mapped to pull requests. Otherwise the pull requests named
    in ``pull_requests_nums`` (comma-separated numbers) are fetched directly.
    Intermediate results are saved to ``state_file`` after each step and reused
    from it when load_state accepts the stored state.
    """
    base_sha = None
    if new_tag and prev_tag:
        base_sha = get_merge_base(new_tag, prev_tag, repo_folder)
        logging.info('Base sha: %s', base_sha)

    # Step 1. Get commits from merge_base to new_tag HEAD.
    #    Result is a list of commits + map with commits info (author, message)
    commits_info = {}
    commits = []
    is_commits_loaded = False

    # Step 2. For each commit check if it is from any pull request (using github search api).
    #    Result is a list of unknown commits + map with pull request info (author, description).
    unknown_commits = []
    pull_requests = {}
    is_pull_requests_loaded = False

    # Step 3. Map users with their info (Name)
    users = {}
    is_users_loaded = False

    # Step 4. Make changelog text from data above.

    state = load_state(state_file, base_sha, new_tag, prev_tag)

    if state:
        if 'commits' in state and 'commits_info' in state:
            logging.info('Loading commits from %s', state_file)
            commits_info = state['commits_info']
            commits = state['commits']
            is_commits_loaded = True

        if 'pull_requests' in state and 'unknown_commits' in state:
            logging.info('Loading pull requests from %s', state_file)
            unknown_commits = state['unknown_commits']
            pull_requests = state['pull_requests']
            is_pull_requests_loaded = True

        if 'users' in state:
            logging.info('Loading users requests from %s', state_file)
            users = state['users']
            is_users_loaded = True

    if base_sha:
        state['base_sha'] = base_sha
        state['new_tag'] = new_tag
        state['prev_tag'] = prev_tag

        if not is_commits_loaded:
            logging.info('Getting commits using github api.')
            commits = get_commits_from_branch(repo, new_tag, base_sha, commits_info, 100, token, max_retries, retry_timeout)
            state['commits'] = commits
            state['commits_info'] = commits_info

        logging.info('Found %d commits from %s to %s.\n', len(commits), new_tag, base_sha)
        save_state(state_file, state)

        if not is_pull_requests_loaded:
            logging.info('Searching for pull requests using github api.')
            unknown_commits, pull_requests = find_pull_requests(commits, commits_info, token, max_retries, retry_timeout)
            state['unknown_commits'] = unknown_commits
            state['pull_requests'] = pull_requests
    else:
        # Tolerate spaces and empty items in the list, e.g. "123, 456,".
        numbers = [num.strip() for num in pull_requests_nums.split(',')]
        numbers = [num for num in numbers if num]
        pull_requests = find_pull_requests_by_num(numbers, token, max_retries, retry_timeout)

    logging.info('Found %d pull requests and %d unknown commits.\n', len(pull_requests), len(unknown_commits))
    save_state(state_file, state)

    if not is_users_loaded:
        logging.info('Getting users info using github api.')
        users = get_users_info(pull_requests, commits_info, token, max_retries, retry_timeout)
        state['users'] = users

    logging.info('Found %d users.', len(users))
    save_state(state_file, state)

    changelog = u'{}\n\n{}'.format(process_pull_requests(pull_requests, users, repo), process_unknown_commits(unknown_commits, commits_info, users))

    # Substitute links to issues
    changelog = re.sub(r'(?<!\[)#(\d{4,})(?!\])', r'[#\1](https://github.com/{}/issues/\1)'.format(repo), changelog)

    # Strip trailing spaces on every line (MULTILINE makes $ match at each line end),
    # then collapse runs of spaces into a single space. Replacing a run with an
    # empty string would glue the neighbouring words together.
    changelog = re.sub(r' +$', '', changelog, flags=re.MULTILINE)
    changelog = re.sub(r' {2,}', ' ', changelog)

    print(changelog)
if __name__ == '__main__':
    # Command-line entry point: parse arguments, resolve the release tags to
    # shas (when tags are used) and generate the changelog.
    parser = argparse.ArgumentParser(description='Make changelog.')
    parser.add_argument('prev_release_tag', nargs='?', help='Git tag from previous release.')
    parser.add_argument('new_release_tag', nargs='?', help='Git tag for new release.')
    parser.add_argument('--pull-requests', help='Process the specified list of pull-request numbers (comma separated) instead of commits between tags.')
    parser.add_argument('--token', help='Github token. Use it to increase github api query limit.')
    parser.add_argument('--directory', help='ClickHouse repo directory. Script dir by default.')
    parser.add_argument('--state', help='File to dump inner states result.', default='changelog_state.json')
    parser.add_argument('--repo', help='ClickHouse repo on GitHub.', default='ClickHouse/ClickHouse')
    parser.add_argument('--max_retry', default=100, type=int,
                        help='Max number of retries pre api query in case of API rate limit exceeded error.')
    parser.add_argument('--retry_timeout', help='Timeout after retry in seconds.', type=int, default=5)

    args = parser.parse_args()

    # NOTE: these assignments create module-level names on purpose. Some
    # functions in this file (find_pull_request_for_commit,
    # find_pull_requests_by_num) read `repo` as a global, so this code must not
    # be moved into a function without passing `repo` to them explicitly.
    prev_release_tag = args.prev_release_tag
    new_release_tag = args.new_release_tag
    token = args.token or ''
    repo_folder = args.directory or SCRIPT_DIR
    state_file = args.state
    repo = args.repo
    max_retry = args.max_retry
    retry_timeout = args.retry_timeout
    pull_requests = args.pull_requests

    if (not prev_release_tag or not new_release_tag) and not pull_requests:
        raise Exception('Either release tags or --pull-requests must be specified')

    if prev_release_tag and new_release_tag and pull_requests:
        raise Exception('Either release tags or --pull-requests must be specified')

    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

    repo_folder = os.path.expanduser(repo_folder)

    # Resolve the tags only when both are given. In --pull-requests mode they
    # are None, and running `git rev-parse` on None would abort the script.
    if new_release_tag and prev_release_tag:
        new_release_tag = rev_parse(new_release_tag, repo_folder)
        prev_release_tag = rev_parse(prev_release_tag, repo_folder)

    make_changelog(new_release_tag, prev_release_tag, pull_requests, repo, repo_folder, state_file, token, max_retry, retry_timeout)