import json
import locale
import os
import re
import subprocess
from collections import namedtuple
from os.path import expanduser

import requests

Features = namedtuple(
    "Features",
    [
        "title",
        "body",
        "pr_number",
        "files_changed",
        "labels",
    ],
)


def dict_to_features(dct):
    return Features(
        title=dct["title"],
        body=dct["body"],
        pr_number=dct["pr_number"],
        files_changed=dct["files_changed"],
        labels=dct["labels"],
    )


def features_to_dict(features):
    return dict(features._asdict())


def run(command):
    """Run `command` in a shell and return (return_code, stdout, stderr) as strings."""
    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    output, err = p.communicate()
    rc = p.returncode
    enc = locale.getpreferredencoding()
    output = output.decode(enc)
    err = err.decode(enc)
    return rc, output.strip(), err.strip()
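
# Illustrative usage (not part of the original script; the output depends on the
# local repository state):
#
#     rc, out, err = run("git rev-parse --short HEAD")
#     # rc == 0, out is the abbreviated hash of HEAD, err == ""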


def commit_body(commit_hash):
    """Return the body of the commit message, or None if git fails."""
    cmd = f"git log -n 1 --pretty=format:%b {commit_hash}"
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_title(commit_hash):
    """Return the subject line of the commit message, or None if git fails."""
    cmd = f"git log -n 1 --pretty=format:%s {commit_hash}"
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_files_changed(commit_hash):
    """Return the list of files touched by the commit, or None if git fails."""
    cmd = f"git diff-tree --no-commit-id --name-only -r {commit_hash}"
    ret, out, err = run(cmd)
    return out.split("\n") if ret == 0 else None


def parse_pr_number(body, commit_hash, title):
    """Extract the PR number from a squash-merge commit title such as "... (#1234)"."""
    regex = r"(#[0-9]+)"
    matches = re.findall(regex, title)
    if len(matches) == 0:
        if "revert" not in title.lower() and "updating submodules" not in title.lower():
            print(f"[{commit_hash}: {title}] Could not parse PR number, ignoring PR")
        return None
    if len(matches) > 1:
        print(f"[{commit_hash}: {title}] Got multiple PR numbers, using the last one")
        return matches[-1][1:]
    return matches[0][1:]
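
# Illustrative example (hypothetical title, not taken from the repository):
#
#     parse_pr_number("", "b33e38ec47", "Improve transforms docs (#1234)")
#     # -> "1234"; the commit body argument is accepted but not used by the parser.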


def get_ghstack_token():
    """Read the GitHub OAuth token from the user's ~/.ghstackrc file."""
    pattern = "github_oauth = (.*)"
    with open(expanduser("~/.ghstackrc"), "r+") as f:
        config = f.read()
    matches = re.findall(pattern, config)
    if len(matches) == 0:
        raise RuntimeError("Can't find a github oauth token")
    return matches[0]


token = get_ghstack_token()
headers = {"Authorization": f"token {token}"}


def run_query(query):
    response = requests.post("https://api.github.com/graphql", json={"query": query}, headers=headers)
    if response.status_code == 200:
        return response.json()
    else:
        raise Exception(f"Query failed with return code {response.status_code}. {query}")


def gh_labels(pr_number):
    query = f"""
    {{
      repository(owner: "pytorch", name: "vision") {{
        pullRequest(number: {pr_number}) {{
          labels(first: 10) {{
            edges {{
              node {{
                name
              }}
            }}
          }}
        }}
      }}
    }}
    """
    query = run_query(query)
    edges = query["data"]["repository"]["pullRequest"]["labels"]["edges"]
    return [edge["node"]["name"] for edge in edges]
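
# Illustrative example (hypothetical PR number and labels; real values come from the
# GitHub GraphQL API and require a valid token in ~/.ghstackrc):
#
#     gh_labels("1234")
#     # -> e.g. ["enhancement", "module: transforms"]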


def get_features(commit_hash, return_dict=False):
    title, body, files_changed = (
        commit_title(commit_hash),
        commit_body(commit_hash),
        commit_files_changed(commit_hash),
    )
    pr_number = parse_pr_number(body, commit_hash, title)
    labels = []
    if pr_number is not None:
        labels = gh_labels(pr_number)
    result = Features(title, body, pr_number, files_changed, labels)
    if return_dict:
        return features_to_dict(result)
    return result
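
# Illustrative result shape (fields as defined by the Features namedtuple above;
# the hash and values below are placeholders):
#
#     get_features("<commit sha>")
#     # -> Features(title="...", body="...", pr_number="1234",
#     #             files_changed=["..."], labels=["enhancement"])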


class CommitDataCache:
    """Cache of commit hash -> Features, persisted to a json file on disk."""

    def __init__(self, path="results/data.json"):
        self.path = path
        self.data = {}
        if os.path.exists(path):
            self.data = self.read_from_disk()

    def get(self, commit):
        if commit not in self.data:
            # Fetch and cache the data
            self.data[commit] = get_features(commit)
            self.write_to_disk()
        return self.data[commit]

    def read_from_disk(self):
        with open(self.path) as f:
            data = json.load(f)
            data = {commit: dict_to_features(dct) for commit, dct in data.items()}
        return data

    def write_to_disk(self):
        data = {commit: features._asdict() for commit, features in self.data.items()}
        with open(self.path, "w") as f:
            json.dump(data, f)
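
# Illustrative usage (path and hash are placeholders): repeated calls are served
# from the json cache instead of re-querying git and GitHub.
#
#     cdc = CommitDataCache("data.json")
#     features = cdc.get("<commit sha>")  # fetched once, then cached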


def get_commits_between(base_version, new_version):
    cmd = f"git merge-base {base_version} {new_version}"
    rc, merge_base, _ = run(cmd)
    assert rc == 0

    # Returns a list of something like
    # b33e38ec47 Allow a higher-precision step type for Vec256::arange (#34555)
    cmd = f"git log --reverse --oneline {merge_base}..{new_version}"
    rc, commits, _ = run(cmd)
    assert rc == 0

    log_lines = commits.split("\n")
    hashes, titles = zip(*[log_line.split(" ", 1) for log_line in log_lines])
    return hashes, titles
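
# Illustrative output (refs are placeholders; hashes and titles come from git log):
#
#     hashes, titles = get_commits_between("tags/v0.9.0", "<new ref>")
#     # hashes -> ("b33e38ec47", ...), titles -> ("Allow a higher-precision ... (#34555)", ...)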


def convert_to_dataframes(feature_list):
    import pandas as pd

    df = pd.DataFrame.from_records(feature_list, columns=Features._fields)
    return df
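
# Illustrative usage (assumes a cache populated by main() below):
#
#     df = convert_to_dataframes(list(cdc.data.values()))
#     # df has one row per commit and one column per Features field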


def main(base_version, new_version):
    hashes, titles = get_commits_between(base_version, new_version)

    cdc = CommitDataCache("data.json")
    for idx, commit in enumerate(hashes):
        if idx % 10 == 0:
            print(f"{idx} / {len(hashes)}")
        cdc.get(commit)

    return cdc


if __name__ == "__main__":
    # d = get_features('2ab93592529243862ce8ad5b6acf2628ef8d0dc8')
    # print(d)
    # hashes, titles = get_commits_between("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15")

    # Usage: change the tag/hash below to match the current release, then save the
    # json with cdc.write_to_disk().
    # You can then use classify_prs.py (as a notebook) to open the json and generate
    # the release notes semi-automatically.
    cdc = main("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15")

    from IPython import embed

    embed()