# classify_prs.py
  1. # In[1]:
  2. # imports and set configuration
  3. import pandas as pd
  4. from retrieve_prs_data import run
  5. exclude_prototype = True
  6. data_filename = "10.0_to_11.0-rc2.json"
  7. previous_release = "v10.0"
  8. current_release = "v11.0-rc2"
  9. # In[2]:
  10. df = pd.read_json(data_filename).T
  11. df.tail()
  12. # In[3]:
  13. all_labels = {lbl for labels in df["labels"] for lbl in labels}
  14. all_labels
  15. # In[4]:
  16. # Add one column per label
  17. for label in all_labels:
  18. df[label] = df["labels"].apply(lambda labels_list: label in labels_list)
  19. df.head()
  20. # In[5]:
  21. # Add a clean "module" column. It contains tuples since PRs can have more than one module.
  22. # Maybe we should include "topics" in that column as well?
  23. all_modules = { # mapping: full name -> clean name
  24. label: "".join(label.split(" ")[1:]) for label in all_labels if label.startswith("module")
  25. }
  26. # We use an ugly loop, but whatever ¯\_(ツ)_/¯
  27. df["module"] = [[] for _ in range(len(df))]
  28. for i, row in df.iterrows():
  29. for full_name, clean_name in all_modules.items():
  30. if full_name in row["labels"]:
  31. row["module"].append(clean_name)
  32. df["module"] = df.module.apply(tuple)
  33. df.head()
  34. # In[6]:
  35. mod_df = df.set_index("module").sort_index()
  36. mod_df.tail()
  37. # In[7]:
  38. # All improvement PRs
  39. mod_df[mod_df["enhancement"]].head()
  40. # In[8]:
  41. # improvement f module
  42. # note: don't filter module name on the index as the index contain tuples with non-exclusive values
  43. # Use the boolean column instead
  44. mod_df[mod_df["enhancement"] & mod_df["module: transforms"]]
  45. # In[9]:
  46. def format_prs(mod_df):
  47. out = []
  48. for idx, row in mod_df.iterrows():
  49. if exclude_prototype and row["prototype"]:
  50. continue
  51. modules = idx
  52. # Put "documentation" and "tests" first for sorting to be dece
  53. for last_module in ("documentation", "tests"):
  54. if last_module in modules:
  55. modules = [m for m in modules if m != last_module] + [last_module]
  56. module = f"[{', '.join(modules)}]"
  57. module = module.replace("referencescripts", "reference scripts")
  58. module = module.replace("code", "reference scripts")
  59. out.append(f"{module} {row['title']}")
  60. return "\n".join(out)
  61. # In[10]:
  62. included_prs = pd.DataFrame()
  63. # If labels are accurate, this shouhld generate most of the release notes already
  64. # We keep track of the included PRs to figure out which ones are missing
  65. for section_title, module_idx in (
  66. ("Backward-incompatible changes", "bc-breaking"),
  67. ("Deprecations", "deprecation"),
  68. ("New Features", "new feature"),
  69. ("Improvements", "enhancement"),
  70. ("Bug Fixes", "bug"),
  71. ("Code Quality", "code quality"),
  72. ):
  73. print(f"## {section_title}")
  74. print()
  75. tmp_df = mod_df[mod_df[module_idx]]
  76. included_prs = pd.concat([included_prs, tmp_df])
  77. print(format_prs(tmp_df))
  78. print()
  79. # In[11]:
  80. # Missing PRs are these ones... classify them manually
  81. missing_prs = pd.concat([mod_df, included_prs]).drop_duplicates(subset="pr_number", keep=False)
  82. print(format_prs(missing_prs))
  83. # In[12]:
  84. # Generate list of contributors
  85. print()
  86. print("## Contributors")
  87. command_to_run = f"{{ git shortlog -s {previous_release}..{current_release} | cut -f2- & git log -s {previous_release}..{current_release} | grep Co-authored | cut -f2- -d: | cut -f1 -d\\< | sed 's/^ *//;s/ *$//' ; }} | sort --ignore-case | uniq | tr '\\n' ';' | sed 's/;/, /g;s/, $//' | fold -s"
  88. rc, output, err = run(command_to_run)
  89. print(output)