123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138 |
- # In[1]:
- # imports and set configuration
- import pandas as pd
- from retrieve_prs_data import run
# Release range covered by these notes (git refs, also used to list contributors).
previous_release = "v10.0"
current_release = "v11.0-rc2"

# JSON file holding the PR data for that release range.
data_filename = "10.0_to_11.0-rc2.json"

# When True, PRs labelled "prototype" are left out of the generated notes.
exclude_prototype = True
# In[2]:
# Load the PR data; the frame is transposed right after loading
# (presumably the JSON is keyed by PR, giving one row per PR -- confirm against the data file).
df = pd.read_json(data_filename).T
df.tail()

# In[3]:
# Union of the labels found on any PR.
all_labels = set().union(*df["labels"])
all_labels

# In[4]:
# Add one boolean column per label: True when the PR carries that label.
for label in all_labels:
    df[label] = df["labels"].map(lambda pr_labels: label in pr_labels)
df.head()
# In[5]:
# Add a clean "module" column. It contains tuples since PRs can have more than one module.
# Maybe we should include "topics" in that column as well?

# Mapping: full label name -> clean module name (everything after the first
# space-separated word, with the remaining spaces removed).
# ``all_labels`` is a set, so its iteration order is arbitrary. Going through the
# sorted labels keeps the order of the modules inside each tuple (and therefore the
# sorted index and the generated notes) identical from one run to the next.
all_modules = {
    label: "".join(label.split(" ")[1:])
    for label in sorted(all_labels)
    if label.startswith("module")
}

# Build each PR's tuple directly from its labels rather than mutating per-row lists
# through ``iterrows()``, which hands out copies of the rows.
df["module"] = df["labels"].apply(
    lambda pr_labels: tuple(
        clean_name
        for full_name, clean_name in all_modules.items()
        if full_name in pr_labels
    )
)
df.head()
# In[6]:
# Re-index the PRs by their module tuple and order the rows by that index.
mod_df = df.set_index("module")
mod_df = mod_df.sort_index()
mod_df.tail()

# In[7]:
# All improvement PRs
mod_df.loc[mod_df["enhancement"]].head()

# In[8]:
# Improvements of a given module.
# Note: don't filter on the module name through the index, as the index contains
# tuples with non-exclusive values. Use the boolean label column instead.
mod_df.loc[mod_df["module: transforms"] & mod_df["enhancement"]]
- # In[9]:
def format_prs(mod_df, skip_prototype=None):
    """Format the PRs of ``mod_df`` as release-note lines.

    Each kept PR becomes one line of the form ``[module1, module2] PR title``.

    Args:
        mod_df: DataFrame whose index holds tuples of clean module names and
            which has a ``title`` column. A boolean ``prototype`` column is
            honoured when present.
        skip_prototype: Whether to leave out PRs flagged as prototype. When
            ``None`` (the default), the module-level ``exclude_prototype``
            setting is used.

    Returns:
        The formatted lines joined by newlines (an empty string when no PR
        is kept).
    """
    if skip_prototype is None:
        skip_prototype = exclude_prototype

    # Display names for modules whose clean name should not be printed as is.
    # The renaming is applied to whole module names: a substring replacement on
    # the joined string would also mangle names that merely contain "code".
    display_names = {
        "referencescripts": "reference scripts",
        "code": "reference scripts",
    }

    out = []
    for idx, row in mod_df.iterrows():
        # The frame may have no "prototype" column (e.g. when no PR carries that
        # label); such PRs are treated as non-prototype.
        if skip_prototype and row.get("prototype", False):
            continue
        modules = list(idx)
        # Put "documentation" and "tests" last so that the other modules lead
        # the line, which makes sorting the output decent.
        for last_module in ("documentation", "tests"):
            if last_module in modules:
                modules = [m for m in modules if m != last_module] + [last_module]
        # Two modules can map to the same display name: keep the first occurrence only.
        modules = list(dict.fromkeys(display_names.get(m, m) for m in modules))
        out.append(f"[{', '.join(modules)}] {row['title']}")
    return "\n".join(out)
# In[10]:
# If labels are accurate, this should generate most of the release notes already.
# We keep track of the included PRs to figure out which ones are missing.
included_frames = []
# Each entry is (section heading, name of the boolean label column selecting its PRs).
for section_title, module_idx in (
    ("Backward-incompatible changes", "bc-breaking"),
    ("Deprecations", "deprecation"),
    ("New Features", "new feature"),
    ("Improvements", "enhancement"),
    ("Bug Fixes", "bug"),
    ("Code Quality", "code quality"),
):
    print(f"## {section_title}")
    print()
    if module_idx in mod_df.columns:
        tmp_df = mod_df[mod_df[module_idx]]
    else:
        # No column for this label in the data: the section is simply empty
        # instead of failing with a KeyError.
        tmp_df = mod_df.iloc[0:0]
    included_frames.append(tmp_df)
    print(format_prs(tmp_df))
    print()
# Concatenate once at the end rather than growing a DataFrame inside the loop.
included_prs = pd.concat(included_frames)
# In[11]:
# PRs that did not end up in any section: classify these manually.
# All PRs are stacked on top of the included ones and every pr_number occurring more
# than once is discarded, which leaves the PRs that were never included
# (assuming pr_number is unique within mod_df).
all_and_included = pd.concat([mod_df, included_prs])
missing_prs = all_and_included.drop_duplicates(subset="pr_number", keep=False)
print(format_prs(missing_prs))
# In[12]:
# Generate list of contributors
print()
print("## Contributors")
release_range = f"{previous_release}..{current_release}"
# Names from the shortlog of the release range.
authors_cmd = f"git shortlog -s {release_range} | cut -f2-"
# Names taken from the "Co-authored" lines of the log, stripped of the e-mail part
# and of surrounding spaces.
coauthors_cmd = (
    f"git log -s {release_range} | grep Co-authored | cut -f2- -d: "
    "| cut -f1 -d\\< | sed 's/^ *//;s/ *$//'"
)
# De-duplicate the names, then turn the one-name-per-line output into a wrapped,
# ", "-separated list.
postprocess_cmd = "sort --ignore-case | uniq | tr '\\n' ';' | sed 's/;/, /g;s/, $//' | fold -s"
command_to_run = f"{{ {authors_cmd} & {coauthors_cmd} ; }} | {postprocess_cmd}"
# NOTE(review): ``run`` presumably returns (return code, stdout, stderr) -- confirm
# against retrieve_prs_data.
rc, output, err = run(command_to_run)
if rc:
    # Surface failures instead of silently printing an empty contributor list.
    print(f"WARNING: contributors command exited with code {rc}: {err}")
print(output)
|