benchmark_utils.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. import time
  2. import os
  3. import json
  4. import torch
  5. from torch.profiler import profile, ProfilerActivity
  6. def synchronize():
  7. pass
  8. class NullContext:
  9. def __enter__(self):
  10. pass
  11. def __exit__(self, exc_type, exc_val, exc_tb):
  12. pass
  13. def dump_chrome_trace(f, input, trace_filename, optimize_ctx, activities, num_runs=1,
  14. devices=None, kwargs_for_f=None, kwargs_for_profiler=None):
  15. """
  16. Output the chrome trace of running f(input, **kwargs_for_f) with [optimize_ctx]
  17. [num_runs] times to [trace_filename].
  18. [activities] are the activities that the profiler will record, e.g. ProfilerActivity.CUDA.
  19. Return total runtime without the profiler
  20. Outputs to trace_filename
  21. """
  22. if devices is None:
  23. devices = ["cuda"]
  24. global synchronize
  25. if devices != ["cpu"] and torch.cuda.is_available():
  26. synchronize = torch.cuda.synchronize
  27. if kwargs_for_f is None:
  28. kwargs_for_f = {}
  29. if kwargs_for_profiler is None:
  30. kwargs_for_profiler = {}
  31. with optimize_ctx:
  32. torch.manual_seed(1337)
  33. for _ in range(5): # warmup runs
  34. f(input, **kwargs_for_f)
  35. synchronize()
  36. torch.manual_seed(1337)
  37. t0 = time.perf_counter()
  38. for _ in range(num_runs):
  39. f(input, **kwargs_for_f)
  40. synchronize()
  41. t1 = time.perf_counter()
  42. timing = t1 - t0
  43. with profile(activities=activities, **kwargs_for_profiler) as prof:
  44. with optimize_ctx:
  45. synchronize()
  46. torch.manual_seed(1337)
  47. for _ in range(num_runs):
  48. f(input, **kwargs_for_f)
  49. synchronize()
  50. prof.export_chrome_trace(trace_filename)
  51. return timing
  52. def get_chrome_trace_events(filename):
  53. f = open(filename)
  54. data = json.load(f)
  55. events = data["traceEvents"]
  56. return events
  57. def is_gpu_compute_event(event):
  58. global gpu_pids
  59. return "pid" in event and event["pid"] in gpu_pids and "ph" in event and event["ph"] == "X"
  60. def get_sorted_gpu_events(events):
  61. sorted_gpu_events = []
  62. for event in events:
  63. if(not is_gpu_compute_event(event)):
  64. continue
  65. sorted_gpu_events.append(event)
  66. return sorted(sorted_gpu_events, key=lambda x: x["ts"])
  67. def get_duration(sorted_gpu_events):
  68. if len(sorted_gpu_events) == 0:
  69. return 0
  70. event = sorted_gpu_events[0]
  71. current_end_time = event["ts"] + event["dur"]
  72. total_duration = event["dur"]
  73. for event in sorted_gpu_events[1:]:
  74. start_time = max(event["ts"], current_end_time)
  75. end_time = event["ts"] + event["dur"]
  76. total_duration = total_duration + max(end_time - start_time, 0)
  77. current_end_time = max(current_end_time, end_time)
  78. return total_duration
  79. def get_sorted_gpu_mm_conv_events(events):
  80. def is_mm_conv_event(event):
  81. return "name" in event and ("gemm" in event["name"] or "conv" in event["name"]
  82. or "cutlass" in event["name"] or "wgrad" in event["name"])
  83. gpu_events = get_sorted_gpu_events(events)
  84. sorted_events = []
  85. for event in gpu_events:
  86. if(not is_mm_conv_event(event)):
  87. continue
  88. sorted_events.append(event)
  89. return sorted_events
# pids of GPU processes in the current trace; populated by compute_utilization()
# and read by is_gpu_compute_event().
gpu_pids = []
  91. def compute_utilization(filename: str, total_length: float):
  92. """
  93. Process the chrome traces outputs by the pytorch profiler to compute GPU Utilization
  94. and percent of times spent on matmal and convolution
  95. Args:
  96. filename(str): Name of chrome traces file produced by pytorch profiler
  97. total_length(float): total length of the process without profiler in second
  98. Return:
  99. tuple: (GPU Utilization, percent of time spent on matmal and convolution)
  100. """
  101. events = get_chrome_trace_events(filename)
  102. # get pids of GPU events
  103. global gpu_pids
  104. gpu_pids = []
  105. for event in events:
  106. if "name" not in event:
  107. continue
  108. if event["name"] == 'process_labels' and "GPU" in event["args"]["labels"]:
  109. gpu_pids.append(event["pid"])
  110. total_length = total_length * 1e6
  111. sorted_gpu_events = get_sorted_gpu_events(events)
  112. utilization = get_duration(sorted_gpu_events) / total_length
  113. sorted_gpu_mm_conv_events = get_sorted_gpu_mm_conv_events(events)
  114. mm_conv_utilization = get_duration(sorted_gpu_mm_conv_events) / total_length
  115. return utilization, mm_conv_utilization
  116. def benchmark_utilization(f, input, trace_folder, optimize_ctx=None, trace_file_name="tmp_chrome_trace", num_runs=1):
  117. """
  118. Benchmark the GPU Utilization and percent of time spent on matmal and convolution operations of
  119. running f(input, **kwargs_for_f) with [optimize_ctx] [num_runs] times.
  120. It will produce a chrome trace file in trace_folder/trace_file_name.json
  121. Example:
  122. ```
  123. def f(a):
  124. return a.sum()
  125. a = torch.rand(2**20, device="cuda")
  126. utilization, mm_conv_utilization = benchmark_utilization(f, a, "tmp", trace_file_name = "tmp_chrome_trace")
  127. ```
  128. Args:
  129. f: function to benchmark
  130. input: input to :attr:`f`
  131. trace_folder: name of the folder to store the chrome trace
  132. optimize_ctx: the context in which f will run
  133. trace_file_name: name of the dumped chrome trace file, default to "tmp_chrome_trace"
  134. num_runs: number of times to run f, excluding the warm-up runs, default to 1.
  135. Return:
  136. tuple: (GPU Utilization, percent of time spent on matmal and convolution)
  137. """
  138. isExist = os.path.exists(trace_folder)
  139. if not isExist:
  140. os.makedirs(trace_folder)
  141. print("create folder " + trace_folder)
  142. if optimize_ctx is None:
  143. optimize_ctx = NullContext()
  144. chrome_trace_file_name = os.path.join(trace_folder, trace_file_name + ".json")
  145. total_length = dump_chrome_trace(f, input, chrome_trace_file_name, optimize_ctx,
  146. [ProfilerActivity.CUDA], num_runs=num_runs, devices="cuda")
  147. utilization, mm_conv_utilization = compute_utilization(chrome_trace_file_name, total_length)
  148. return utilization, mm_conv_utilization