# profiler_util.py

import itertools
import torch
from torch.autograd import DeviceType
from collections import defaultdict, namedtuple
from operator import attrgetter
from typing import Dict, List, Tuple, Optional
import bisect
import math

__all__ = ["EventList", "FormattedTimesMixin", "Interval", "Kernel", "FunctionEvent", "FunctionEventAvg",
           "StringTable", "MemRecordsAcc"]


class EventList(list):
    """A list of Events (for pretty printing)."""
    def __init__(self, *args, **kwargs):
        use_cuda = kwargs.pop('use_cuda', True)
        profile_memory = kwargs.pop('profile_memory', False)
        with_flops = kwargs.pop('with_flops', False)
        super().__init__(*args, **kwargs)
        self._use_cuda = use_cuda
        self._profile_memory = profile_memory
        self._tree_built = False
        self._with_flops = with_flops

    def _build_tree(self):
        self._populate_cpu_children()
        self._remove_dup_nodes()
        self._set_backward_stacktraces()
        self._tree_built = True

    def __str__(self):
        return self.table()

    def _remove_dup_nodes(self):
        while True:
            to_delete = set()
            for idx in range(len(self)):
                if (self[idx].cpu_parent is not None and
                        self[idx].cpu_parent.name == self[idx].name and
                        len(self[idx].cpu_parent.cpu_children) == 1):
                    self[idx].cpu_parent.cpu_children = self[idx].cpu_children
                    self[idx].cpu_parent.kernels = self[idx].kernels  # lift kernels up
                    for ch in self[idx].cpu_children:
                        ch.cpu_parent = self[idx].cpu_parent
                    to_delete.add(idx)
            if len(to_delete) == 0:
                break
            new_evts = [ev for ind, ev in enumerate(self) if ind not in to_delete]
            self.clear()
            self.extend(new_evts)

    def _populate_cpu_children(self):
        """Populates child events into each underlying FunctionEvent object.

        One event is a child of another if [s1, e1) is inside [s2, e2), where
        s1 and e1 are the start and end of the child event's interval, and
        s2 and e2 are the start and end of the parent event's interval.

        Example: the event list [[0, 10], [1, 3], [3, 4]] would make [0, 10]
        the parent of the two other intervals.

        If for any reason two intervals intersect only partially, this function
        will not record a parent-child relationship between them.
        """
        # Some events can be async (i.e. start and end on different threads);
        # since it's generally undefined how to attribute children ranges to
        # async ranges, we do not use them when calculating nested ranges and stats.
        sync_events = [evt for evt in self if not evt.is_async and evt.device_type == DeviceType.CPU]
        events = sorted(
            sync_events,
            key=attrgetter("thread"),
        )
        # Group by both thread and node_id, so that events that happen to have
        # the same thread_id but are from different nodes aren't incorrectly
        # grouped together.
        threads = itertools.groupby(
            events, key=lambda event: (event.thread, event.node_id)
        )
        # For each thread we keep a stack of current nested parents.
        # We maintain the invariant that each interval is a subset of all other
        # intervals lower in the stack.
        #
        # First we sort the intervals by their start time. Then we iterate over them.
        # Every time we see a new interval we remove several parents from
        # the top until we restore the invariant. Then a parent-child relationship
        # is recorded if the stack is not empty.
        # Finally we add the new interval to the stack.
        #
        # The algorithm has O(N * log(N)) complexity, where N is the number of
        # intervals.
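        #
        # For example, with sorted intervals [0, 10], [1, 3], [3, 4]: after
        # [1, 3] the stack is [[0, 10], [1, 3]]; when [3, 4] arrives, [1, 3]
        # is popped (it ends at or before 3) and [0, 10] is recorded as the
        # parent of [3, 4].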
        for thread_id, thread_events in threads:
            thread_events_ = sorted(
                thread_events,
                key=lambda event: [event.time_range.start, -event.time_range.end],
            )
            current_events: List[FunctionEvent] = []
            cur_end = 0
            for event in thread_events_:
                while len(current_events) > 0:
                    parent = current_events[-1]
                    if event.time_range.start >= parent.time_range.end or \
                            event.time_range.end > parent.time_range.end:
                        # this can't be a parent
                        current_events.pop()
                    else:
                        parent.append_cpu_child(event)
                        assert (
                            event.cpu_parent is None
                        ), "There is already a CPU parent event for {}".format(
                            event.key
                        )
                        event.set_cpu_parent(parent)
                        break
                current_events.append(event)

    def _set_backward_stacktraces(self):
        def bw_parent(evt):
            if evt is None:
                return None
            elif evt.scope == 1:  # BACKWARD_FUNCTION
                return evt
            else:
                return bw_parent(evt.cpu_parent)

        fwd_stacks = {}
        for evt in self:
            if bw_parent(evt) is None and evt.stack is not None:
                t = (evt.sequence_nr, evt.thread)
                if t not in fwd_stacks:
                    fwd_stacks[t] = evt.stack

        for evt in self:
            p = bw_parent(evt)
            if p is not None:
                assert p.fwd_thread is not None
                t = (p.sequence_nr, p.fwd_thread)
                if t in fwd_stacks:
                    evt.stack = fwd_stacks[t]
                else:
                    evt.stack = []
    @property
    def self_cpu_time_total(self):
        return sum([event.self_cpu_time_total for event in self])

    def table(
            self,
            sort_by=None,
            row_limit=100,
            max_src_column_width=75,
            max_name_column_width=55,
            max_shapes_column_width=80,
            header=None,
            top_level_events_only=False):
        """Prints an EventList as a nicely formatted table.

        Args:
            sort_by (str, optional): Attribute used to sort entries. By default
                they are printed in the same order as they were registered.
                Valid keys include: ``cpu_time``, ``cuda_time``, ``cpu_time_total``,
                ``cuda_time_total``, ``cpu_memory_usage``, ``cuda_memory_usage``,
                ``self_cpu_memory_usage``, ``self_cuda_memory_usage``, ``count``.
            top_level_events_only (bool, optional): Boolean flag to determine the
                selection of events to display. If true, the profiler will only
                display top-level events, like the top-level invocations of python
                ``lstm``, python ``add`` or other functions; nested events, like
                low-level CPU/CUDA ops, are omitted for readability.

        Returns:
            A string containing the table.
        """
        return _build_table(
            self,
            sort_by=sort_by,
            row_limit=row_limit,
            max_src_column_width=max_src_column_width,
            max_name_column_width=max_name_column_width,
            max_shapes_column_width=max_shapes_column_width,
            header=header,
            profile_memory=self._profile_memory,
            with_flops=self._with_flops,
            top_level_events_only=top_level_events_only)
    def export_chrome_trace(self, path):
        """Exports an EventList as a Chrome tracing tools file.

        The checkpoint can be later loaded and inspected under the
        ``chrome://tracing`` URL.

        Args:
            path (str): Path where the trace will be written.
        """
        import os
        with open(path, 'w') as f:
            chrome_events = []
            next_id = 0
            # Use file IO instead of json.dump, since JSON dumping is very
            # slow and this technique is proven to give a 4x speedup.
            f.write("[")
            for evt in self:
                if evt.trace_name is None:
                    continue
                f.write(
                    '{"name": "%s", '
                    '"ph": "X", '
                    '"ts": %s, '
                    '"dur": %s, '
                    '"tid": %s, '
                    '"pid": "CPU functions", '
                    '"args": {}}, '
                    % (
                        evt.trace_name,
                        evt.time_range.start,
                        evt.time_range.elapsed_us(),
                        evt.thread
                        if not evt.is_remote
                        else f'" node_id:{evt.node_id}, thread_id:{evt.thread} "',
                    )
                )
                for k in evt.kernels:
                    # 's' and 'f' draw Flow arrows from
                    # the CPU launch to the GPU kernel
                    f.write('{"name": "%s", '
                            '"ph": "s", '
                            '"ts": %s, '
                            '"tid": %s, '
                            '"pid": "CPU functions", '
                            '"id": %s, '
                            '"cat": "cpu_to_cuda", '
                            '"args": {}}, ' % (evt.trace_name, evt.time_range.start,
                                               evt.thread, next_id))
                    # Note: use torch.profiler to get device kernel trace
                    next_id += 1
            if len(self) > 0:
                # remove trailing whitespace and comma
                f.seek(f.tell() - 2, os.SEEK_SET)
                f.truncate()
            f.write("]")
    def supported_export_stacks_metrics(self):
        return ["self_cpu_time_total", "self_cuda_time_total"]

    def export_stacks(self, path: str, metric: str):
        if metric not in self.supported_export_stacks_metrics():
            raise ValueError("metric should be one of: " + str(self.supported_export_stacks_metrics()))
        translate_table = str.maketrans(" ;\t\n", "____")
        with open(path, 'w') as f:
            for evt in self:
                if evt.stack and len(evt.stack) > 0:
                    metric_value = getattr(evt, metric)
                    if int(metric_value) > 0:
                        stack_str = ""
                        for entry in reversed(evt.stack):
                            stack_str += entry.translate(translate_table)
                            stack_str += ";"
                        stack_str = stack_str[:-1] + " " + str(int(metric_value))
                        f.write(stack_str + "\n")
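
    # Each line export_stacks() writes is in the collapsed-stack format
    # consumed by tools like flamegraph.pl and speedscope: semicolon-joined
    # frames, a space, then the integer metric, e.g. (hypothetical frames):
    #   my_script.py(42):_forward;aten::linear 1523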
    def key_averages(self, group_by_input_shapes=False, group_by_stack_n=0):
        """Averages all function events over their keys.

        Args:
            group_by_input_shapes: group entries by
                (event name, input shapes) rather than just event name.
                This is useful to see which input shapes contribute to the runtime
                the most and may help with size-specific optimizations or
                choosing the best candidates for quantization (aka fitting a roof line).
            group_by_stack_n: group by top n stack trace entries.

        Returns:
            An EventList containing FunctionEventAvg objects.
        """
        assert self._tree_built
        stats: Dict[Tuple[str, ...], FunctionEventAvg] = defaultdict(FunctionEventAvg)

        def get_key(event, group_by_input_shapes, group_by_stack_n) -> Tuple[str, ...]:
            key = [str(event.key), str(event.node_id), str(event.device_type), str(event.is_legacy)]
            if group_by_input_shapes:
                key.append(str(event.input_shapes))
            if group_by_stack_n > 0:
                key += event.stack[:group_by_stack_n]
            return tuple(key)

        for evt in self:
            stats[get_key(evt, group_by_input_shapes, group_by_stack_n)].add(evt)

        avg_list = EventList(
            stats.values(),
            use_cuda=self._use_cuda,
            profile_memory=self._profile_memory,
            with_flops=self._with_flops)
        for evt in avg_list:
            evt.stack = evt.stack[:group_by_stack_n]
            if not group_by_input_shapes:
                evt.input_shapes = ""
        return avg_list

    def total_average(self):
        """Averages all events.

        Returns:
            A FunctionEventAvg object.
        """
        total_stat = FunctionEventAvg()
        for evt in self:
            total_stat += evt
            # reset the key each iteration so FunctionEventAvg.add() accepts
            # events with differing keys
            total_stat.key = None
        total_stat.key = 'Total'
        return total_stat
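

# --- Illustrative usage (editorial addition, not part of the module's API) --
# A minimal sketch of assembling an EventList by hand; in practice the list is
# produced by torch.autograd.profiler.profile(). The event ids, names, and
# times below are made up for illustration.
def _example_event_list():
    outer = FunctionEvent(id=1, name="outer", thread=0, start_us=0, end_us=100)
    inner = FunctionEvent(id=2, name="inner", thread=0, start_us=10, end_us=30)
    events = EventList([outer, inner])
    events._build_tree()  # normally done by the profiler itself
    # "outer" spans 100us in total but reports only 80us of self CPU time,
    # because 20us is attributed to its nested child "inner".
    assert outer.self_cpu_time_total == 80
    return events.table(sort_by="self_cpu_time_total")
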
def _format_time(time_us):
    """Defines how to format time in FunctionEvent."""
    US_IN_SECOND = 1000.0 * 1000.0
    US_IN_MS = 1000.0
    if time_us >= US_IN_SECOND:
        return '{:.3f}s'.format(time_us / US_IN_SECOND)
    if time_us >= US_IN_MS:
        return '{:.3f}ms'.format(time_us / US_IN_MS)
    return '{:.3f}us'.format(time_us)


def _format_time_share(time_us, total_time_us):
    """Defines how to format a time share in FunctionEvent."""
    if total_time_us == 0:
        assert time_us == 0, "Expected time_us == 0 but got {}".format(time_us)
        return "NaN"
    return '{:.2f}%'.format(time_us * 100.0 / total_time_us)


def _format_memory(nbytes):
    """Returns a formatted memory size string."""
    KB = 1024
    MB = 1024 * KB
    GB = 1024 * MB
    if (abs(nbytes) >= GB):
        return '{:.2f} Gb'.format(nbytes * 1.0 / GB)
    elif (abs(nbytes) >= MB):
        return '{:.2f} Mb'.format(nbytes * 1.0 / MB)
    elif (abs(nbytes) >= KB):
        return '{:.2f} Kb'.format(nbytes * 1.0 / KB)
    else:
        return str(nbytes) + ' b'
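

# For reference (values derived from the formatters above):
#   _format_time(1_500_000) -> '1.500s'; _format_time(2_500) -> '2.500ms'
#   _format_time_share(25, 100) -> '25.00%'
#   _format_memory(3 * 1024 ** 2) -> '3.00 Mb'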
def _attr_formatter(name):
    return property(lambda self: _format_time(getattr(self, name)))


class FormattedTimesMixin:
    """Helpers for FunctionEvent and FunctionEventAvg.

    The subclass should define `*_time_total` and `count` attributes.
    """
    cpu_time_str = _attr_formatter('cpu_time')
    cuda_time_str = _attr_formatter('cuda_time')
    cpu_time_total_str = _attr_formatter('cpu_time_total')
    cuda_time_total_str = _attr_formatter('cuda_time_total')
    self_cpu_time_total_str = _attr_formatter('self_cpu_time_total')
    self_cuda_time_total_str = _attr_formatter('self_cuda_time_total')

    @property
    def cpu_time(self):
        return 0.0 if self.count == 0 else 1.0 * self.cpu_time_total / self.count  # type: ignore[attr-defined]

    @property
    def cuda_time(self):
        return 0.0 if self.count == 0 else 1.0 * self.cuda_time_total / self.count  # type: ignore[attr-defined]
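

# For instance, an event with cpu_time_total == 300.0 (us) and count == 3
# reports cpu_time == 100.0 and cpu_time_str == '100.000us'.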
class Interval:
    def __init__(self, start, end):
        self.start = start
        self.end = end

    def elapsed_us(self):
        return self.end - self.start


Kernel = namedtuple('Kernel', ['name', 'device', 'duration'])

class FunctionEvent(FormattedTimesMixin):
    """Profiling information about a single function."""
    def __init__(
            self, id, name, thread, start_us, end_us, fwd_thread=None, input_shapes=None,
            stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False,
            is_remote=False, sequence_nr=-1, node_id=-1, device_type=DeviceType.CPU, device_index=0,
            is_legacy=False, flops=None, trace_name=None):
        self.id: int = id
        self.node_id: int = node_id
        self.name: str = name
        self.trace_name: str = trace_name
        self.time_range: Interval = Interval(start_us, end_us)
        self.thread: int = thread
        self.fwd_thread: Optional[int] = fwd_thread
        self.kernels: List[Kernel] = []
        self.count: int = 1
        self.cpu_children: List[FunctionEvent] = []
        self.cpu_parent: Optional[FunctionEvent] = None
        self.input_shapes: Tuple[int, ...] = input_shapes
        self.stack: List = stack
        self.scope: int = scope
        self.cpu_memory_usage: int = cpu_memory_usage
        self.cuda_memory_usage: int = cuda_memory_usage
        self.is_async: bool = is_async
        self.is_remote: bool = is_remote
        self.sequence_nr: int = sequence_nr
        self.device_type: DeviceType = device_type
        self.device_index: int = device_index
        self.is_legacy: bool = is_legacy
        self.flops: Optional[int] = flops

    def append_kernel(self, name, device, duration):
        assert self.device_type == DeviceType.CPU
        self.kernels.append(Kernel(name, device, duration))

    def append_cpu_child(self, child):
        """Append a CPU child of type FunctionEvent.

        One is supposed to append only direct children to the event to have
        the correct self CPU time reported.
        """
        assert(self.device_type == DeviceType.CPU)
        assert(isinstance(child, FunctionEvent))
        assert(child.device_type == DeviceType.CPU)
        self.cpu_children.append(child)

    def set_cpu_parent(self, parent):
        """Set the immediate CPU parent of type FunctionEvent.

        One profiling FunctionEvent should have only one CPU parent, such that
        the child's range interval is completely inside the parent's. We use
        this connection to determine whether the event is a top-level op or not.
        """
        assert(self.device_type == DeviceType.CPU)
        assert(isinstance(parent, FunctionEvent))
        assert(parent.device_type == DeviceType.CPU)
        self.cpu_parent = parent

    # Note: async events don't have children, are not used when computing 'self'
    # metrics of other events, and have only total cpu time
    @property
    def self_cpu_memory_usage(self):
        if self.is_async or self.device_type != DeviceType.CPU:
            return 0
        return self.cpu_memory_usage - sum(
            [child.cpu_memory_usage for child in self.cpu_children]
        )

    @property
    def self_cuda_memory_usage(self):
        if self.is_async or self.device_type != DeviceType.CPU:
            return 0
        return self.cuda_memory_usage - sum(
            [child.cuda_memory_usage for child in self.cpu_children]
        )

    @property
    def self_cpu_time_total(self):
        if self.is_async or self.device_type != DeviceType.CPU:
            return 0
        return self.cpu_time_total - sum(
            [child.cpu_time_total for child in self.cpu_children]
        )

    @property
    def cuda_time_total(self):
        if self.is_async:
            return 0
        if self.device_type == DeviceType.CPU:
            if not self.is_legacy:
                # account for the kernels in the children ops
                return (sum(kinfo.duration for kinfo in self.kernels) +
                        sum(ch.cuda_time_total for ch in self.cpu_children))
            else:
                # each legacy CPU event has a single (fake) kernel
                return sum(kinfo.duration for kinfo in self.kernels)
        else:
            assert self.device_type == DeviceType.CUDA
            return self.time_range.elapsed_us()

    @property
    def self_cuda_time_total(self):
        if self.is_async:
            return 0
        if self.device_type == DeviceType.CPU:
            return self.cuda_time_total - \
                sum([child.cuda_time_total for child in self.cpu_children])
        else:
            assert(self.device_type == DeviceType.CUDA)
            return self.cuda_time_total

    @property
    def cpu_time_total(self):
        if self.device_type == DeviceType.CPU:
            return self.time_range.elapsed_us()
        else:
            return 0

    @property
    def key(self):
        return self.name

    def __repr__(self):
        return (
            '<FunctionEvent id={} name={} device_type={} node_id={} cpu_time={} start_us={} end_us={} '
            'cpu_children={} cuda_time={} thread={} input_shapes={} '
            'cpu_memory_usage={} cuda_memory_usage={} is_async={} is_remote={} seq_nr={} is_legacy={}>'.format(
                self.id,
                self.name,
                self.device_type,
                self.node_id,
                self.cpu_time_str,
                self.time_range.start,
                self.time_range.end,
                str([child.id for child in self.cpu_children]),
                self.cuda_time_str,
                self.thread,
                str(self.input_shapes),
                self.cpu_memory_usage,
                self.cuda_memory_usage,
                self.is_async,
                self.is_remote,
                self.sequence_nr,
                self.is_legacy,
            )
        )
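

# A minimal sketch (editorial addition) of how kernel time rolls up into
# cuda_time_total for a non-legacy CPU-side op; the kernel name and durations
# below are hypothetical.
def _example_function_event_kernels():
    evt = FunctionEvent(id=3, name="aten::mm", thread=0, start_us=0, end_us=50)
    evt.append_kernel("gemm_kernel", 0, 40)  # hypothetical kernel, 40us on device 0
    # For a non-legacy CPU event, cuda_time_total sums its own kernels plus
    # those of its cpu_children: 40us here.
    assert evt.cuda_time_total == 40
    assert evt.self_cuda_time_total == 40  # no children to subtract
    return evt
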
class FunctionEventAvg(FormattedTimesMixin):
    """Used to average stats over multiple FunctionEvent objects."""
    def __init__(self):
        self.key: Optional[str] = None
        self.count: int = 0
        self.node_id: int = 0
        self.is_async: bool = False
        self.is_remote: bool = False
        self.cpu_time_total: int = 0
        self.cuda_time_total: int = 0
        self.self_cpu_time_total: int = 0
        self.self_cuda_time_total: int = 0
        self.input_shapes: Optional[List[List[int]]] = None
        self.stack: Optional[List] = None
        self.scope: Optional[int] = None
        self.cpu_memory_usage: int = 0
        self.cuda_memory_usage: int = 0
        self.self_cpu_memory_usage: int = 0
        self.self_cuda_memory_usage: int = 0
        self.cpu_children: Optional[List[FunctionEvent]] = None
        self.cpu_parent: Optional[FunctionEvent] = None
        self.device_type: DeviceType = DeviceType.CPU
        self.is_legacy: bool = False
        self.flops: int = 0

    def add(self, other):
        if self.key is None:
            # First function being recorded as part of FunctionEventAvg, propagate
            # fields.
            self.key = other.key
            self.node_id = other.node_id
            self.is_async = other.is_async
            self.is_remote = other.is_remote
            self.cpu_parent = other.cpu_parent
            self.cpu_children = other.cpu_children
            self.input_shapes = other.input_shapes
            self.stack = other.stack
            self.scope = other.scope
            self.device_type = other.device_type
            self.is_legacy = other.is_legacy

        assert isinstance(other, (FunctionEvent, FunctionEventAvg))
        assert other.key == self.key
        self.cpu_time_total += other.cpu_time_total
        self.cuda_time_total += other.cuda_time_total
        self.self_cpu_time_total += other.self_cpu_time_total
        self.self_cuda_time_total += other.self_cuda_time_total
        self.cpu_memory_usage += other.cpu_memory_usage
        self.cuda_memory_usage += other.cuda_memory_usage
        self.self_cpu_memory_usage += other.self_cpu_memory_usage
        self.self_cuda_memory_usage += other.self_cuda_memory_usage
        self.count += other.count
        if self.flops is None:
            self.flops = other.flops
        elif other.flops is not None:
            self.flops += other.flops
        return self

    def __iadd__(self, other):
        return self.add(other)

    def __repr__(self):
        return (
            '<FunctionEventAvg key={} self_cpu_time={} cpu_time={} '
            'self_cuda_time={} cuda_time={} input_shapes={} '
            'cpu_memory_usage={} cuda_memory_usage={}>'.format(
                self.key,
                self.self_cpu_time_total_str,
                self.cpu_time_str,
                self.self_cuda_time_total_str,
                self.cuda_time_str,
                str(self.input_shapes),
                self.cpu_memory_usage,
                self.cuda_memory_usage,
            )
        )
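

# A minimal sketch (editorial addition) of averaging two calls of the same op;
# the event times are made up.
def _example_function_event_avg():
    avg = FunctionEventAvg()
    for start_us, end_us in [(0, 10), (20, 50)]:
        avg += FunctionEvent(id=0, name="aten::add", thread=0,
                             start_us=start_us, end_us=end_us)
    # Two calls totaling 40us of CPU time average out to 20us per call.
    assert avg.count == 2 and avg.cpu_time == 20.0
    return avg
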
class StringTable(defaultdict):
    def __missing__(self, key):
        # manage cases like 't' (demangled to 'unsigned short') separately;
        # for now simply check the length to avoid unexpected results for
        # short sequences
        self[key] = torch._C._demangle(key) if len(key) > 1 else key
        return self[key]


class MemRecordsAcc:
    """Acceleration structure for accessing mem_records in interval."""
    def __init__(self, mem_records):
        self._mem_records = mem_records
        self._start_uses = []
        self._indices = []
        if len(mem_records) > 0:
            tmp = sorted([(r[0].start_us(), i) for i, r in enumerate(mem_records)])
            self._start_uses, self._indices = zip(*tmp)

    def in_interval(self, start_us, end_us):
        start_idx = bisect.bisect_left(self._start_uses, start_us)
        end_idx = bisect.bisect_right(self._start_uses, end_us)
        for i in range(start_idx, end_idx):
            yield self._mem_records[self._indices[i]]
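

# A minimal sketch (editorial addition) of the binary-search lookup above.
# _Rec is a hypothetical stand-in exposing the start_us() interface that
# MemRecordsAcc expects on the first element of each record.
def _example_mem_records_acc():
    class _Rec:
        def __init__(self, t):
            self._t = t

        def start_us(self):
            return self._t

    records = [(_Rec(t), None) for t in (5, 15, 25)]
    acc = MemRecordsAcc(records)
    # Only the record starting at 15us falls inside [10, 20].
    assert [r[0].start_us() for r in acc.in_interval(10, 20)] == [15]
    return acc
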
def _filter_stack_entry(entry):
    filtered_entries = [
        ("autograd/__init__", "_make_grads"),
        ("autograd/__init__", "backward"),
        ("torch/tensor", "backward"),
        ("_internal/common_utils", "prof_callable"),
        ("_internal/common_utils", "prof_func_call"),
        ("_internal/common_utils", "prof_meth_call"),
    ]
    return all([not (f[0] in entry and f[1] in entry) for f in filtered_entries])


MEMORY_EVENT_NAME = "[memory]"
OUT_OF_MEMORY_EVENT_NAME = "[OutOfMemory]"


def _filter_name(name):
    # ignoring the following utility ops
    filtered_out_names = [
        MEMORY_EVENT_NAME,  # used only for the top-level memory events
        OUT_OF_MEMORY_EVENT_NAME,
        "profiler::_record_function_enter",
        "profiler::_record_function_enter_new",
        "profiler::_record_function_exit",
        "aten::is_leaf",
        "aten::output_nr",
        "aten::_version",
    ]
    return name in filtered_out_names


# Demangles and optionally rewrites the provided event name;
# with_wildcard - whether to replace certain numbered event names
# with a wildcard name, to aggregate them together in the profiler
# table output
def _rewrite_name(name, with_wildcard=False):
    string_table = StringTable()
    name = string_table[name]
    if with_wildcard:
        if name.startswith("ProfilerStep#"):
            name = "ProfilerStep*"
    return name
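

# For example, with_wildcard=True collapses per-iteration step markers:
# _rewrite_name("ProfilerStep#12", with_wildcard=True) -> "ProfilerStep*"
# (assuming the name passes through demangling unchanged, as plain names do),
# so all numbered steps aggregate into one row of the table.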
def _build_table(
        events,
        sort_by=None,
        header=None,
        row_limit=100,
        max_src_column_width=75,
        max_name_column_width=55,
        max_shapes_column_width=80,
        with_flops=False,
        profile_memory=False,
        top_level_events_only=False):
    """Prints a summary of events (which can be a list of FunctionEvent or FunctionEventAvg)."""
    if len(events) == 0:
        return ""

    has_cuda_time = any([event.self_cuda_time_total > 0 for event in events])
    has_cuda_mem = any([event.self_cuda_memory_usage > 0 for event in events])
    has_input_shapes = any(
        [(event.input_shapes is not None and len(event.input_shapes) > 0) for event in events])

    if sort_by is not None:
        events = EventList(sorted(
            events, key=lambda evt: getattr(evt, sort_by), reverse=True
        ), use_cuda=has_cuda_time, profile_memory=profile_memory, with_flops=with_flops)

    name_column_width = max([len(evt.key) for evt in events]) + 4
    if max_name_column_width is not None:
        name_column_width = min(name_column_width, max_name_column_width)

    shapes_column_width = max([len(str(evt.input_shapes)) for evt in events]) + 4
    if max_shapes_column_width is not None:
        shapes_column_width = min(shapes_column_width, max_shapes_column_width)

    DEFAULT_COLUMN_WIDTH = 12
    flops_column_width = DEFAULT_COLUMN_WIDTH

    src_column_width = None
    stacks = []
    for evt in events:
        if evt.stack is not None and len(evt.stack) > 0:
            stacks.append(evt.stack)
    has_stack = len(stacks) > 0
    if has_stack:
        src_column_width = max([max([len(entry) for entry in stack]) for stack in stacks]) + 4
        if max_src_column_width is not None:
            src_column_width = min(src_column_width, max_src_column_width)

    headers = [
        'Name',
        'Self CPU %',
        'Self CPU',
        'CPU total %',
        'CPU total',
        'CPU time avg',
    ]
    if has_cuda_time:
        headers.extend([
            'Self CUDA',
            'Self CUDA %',
            'CUDA total',
            'CUDA time avg',
        ])
    if profile_memory:
        headers.extend([
            'CPU Mem',
            'Self CPU Mem',
        ])
        if has_cuda_mem:
            headers.extend([
                'CUDA Mem',
                'Self CUDA Mem',
            ])
    headers.append(
        '# of Calls'
    )
    # Only append Node ID if any event has a valid (>= 0) Node ID
    append_node_id = any([evt.node_id != -1 for evt in events])
    if append_node_id:
        headers.append('Node ID')

    # Have to use a list because nonlocal is Py3 only...
    SPACING_SIZE = 2
    row_format_lst = [""]
    header_sep_lst = [""]
    line_length_lst = [-SPACING_SIZE]
    MAX_STACK_ENTRY = 5

    def add_column(padding, text_dir='>'):
        row_format_lst[0] += '{: ' + text_dir + str(padding) + '}' + (' ' * SPACING_SIZE)
        header_sep_lst[0] += '-' * padding + (' ' * SPACING_SIZE)
        line_length_lst[0] += padding + SPACING_SIZE

    def auto_scale_flops(flops):
        flop_headers = [
            'FLOPs',
            'KFLOPs',
            'MFLOPs',
            'GFLOPs',
            'TFLOPs',
            'PFLOPs',
        ]
        assert flops > 0
        log_flops = max(0, min(math.log10(flops) / 3, float(len(flop_headers) - 1)))
        assert log_flops >= 0 and log_flops < len(flop_headers)
        return (pow(10, (math.floor(log_flops) * -3.0)), flop_headers[int(log_flops)])
    add_column(name_column_width)
    for _ in headers[1:]:
        add_column(DEFAULT_COLUMN_WIDTH)

    if has_input_shapes:
        headers.append('Input Shapes')
        add_column(shapes_column_width)

    if has_stack:
        headers.append('Source Location')
        add_column(src_column_width, text_dir='<')

    if with_flops:
        # Auto-scaling of flops header
        raw_flops = []
        for evt in events:
            if evt.flops > 0:
                raw_flops.append(evt.flops)
        if len(raw_flops) != 0:
            (flops_scale, flops_header) = auto_scale_flops(min(raw_flops))
            headers.append('Total {}'.format(flops_header))
            add_column(flops_column_width)
        else:
            with_flops = False  # can't find any valid flops

    row_format = row_format_lst[0]
    header_sep = header_sep_lst[0]
    line_length = line_length_lst[0]
    add_column = None  # type: ignore[assignment]

    # Have to use a list because nonlocal is Py3 only...
    result = []

    def append(s):
        result.append(s)
        result.append('\n')  # Yes, newline after the end as well
    sum_self_cpu_time_total = sum([event.self_cpu_time_total for event in events])
    sum_self_cuda_time_total = 0
    for evt in events:
        if evt.device_type == DeviceType.CPU:
            # in the legacy profiler, kernel info is stored in cpu events
            if evt.is_legacy:
                sum_self_cuda_time_total += evt.self_cuda_time_total
        elif evt.device_type == DeviceType.CUDA:
            # in the kineto profiler, there are events with the correct device type (e.g. CUDA)
            sum_self_cuda_time_total += evt.self_cuda_time_total

    # Actual printing
    if header is not None:
        append('=' * line_length)
        append(header)
    if top_level_events_only:
        append('=' * line_length)
        append('This report only displays top-level ops statistics')
    append(header_sep)
    append(row_format.format(*headers))
    append(header_sep)
    def trim_path(path, src_column_width):
        if len(path) > src_column_width:
            offset = len(path) - src_column_width
            path = path[offset:]
            if len(path) > 3:
                path = "..." + path[3:]
        return path

    event_limit = 0
    for evt in events:
        if event_limit == row_limit:
            break
        if top_level_events_only and evt.cpu_parent is not None:
            continue
        else:
            event_limit += 1
        name = evt.key
        if max_name_column_width is not None and len(name) >= max_name_column_width - 3:
            name = name[:(max_name_column_width - 3)] + "..."
        row_values = [
            name,
            # Self CPU total %, 0 for async events.
            _format_time_share(evt.self_cpu_time_total, sum_self_cpu_time_total),
            evt.self_cpu_time_total_str,  # Self CPU total
            # CPU total %, 0 for async events.
            _format_time_share(evt.cpu_time_total, sum_self_cpu_time_total) if not evt.is_async else 0,
            evt.cpu_time_total_str,  # CPU total
            evt.cpu_time_str,  # CPU time avg
        ]
        if has_cuda_time:
            row_values.extend([
                evt.self_cuda_time_total_str,
                # CUDA time total %
                _format_time_share(evt.self_cuda_time_total, sum_self_cuda_time_total),
                evt.cuda_time_total_str,
                evt.cuda_time_str,  # Cuda time avg
            ])
        if profile_memory:
            row_values.extend([
                # CPU Mem Total
                _format_memory(evt.cpu_memory_usage),
                # Self CPU Mem Total
                _format_memory(evt.self_cpu_memory_usage),
            ])
            if has_cuda_mem:
                row_values.extend([
                    # CUDA Mem Total
                    _format_memory(evt.cuda_memory_usage),
                    # Self CUDA Mem Total
                    _format_memory(evt.self_cuda_memory_usage),
                ])
        row_values.append(
            evt.count,  # Number of calls
        )

        if append_node_id:
            row_values.append(evt.node_id)
        if has_input_shapes:
            row_values.append(str(evt.input_shapes)[:shapes_column_width])
        if with_flops:
            if evt.flops <= 0:
                row_values.append("--")
            else:
                row_values.append('{0:8.3f}'.format(evt.flops * flops_scale))
        if has_stack:
            src_field = ""
            if len(evt.stack) > 0:
                src_field = trim_path(evt.stack[0], src_column_width)
            row_values.append(src_field)
        append(row_format.format(*row_values))

        if has_stack:
            empty_headers = [""] * (len(headers) - 1)
            for entry in evt.stack[1:MAX_STACK_ENTRY]:
                append(row_format.format(*(empty_headers + [trim_path(entry, src_column_width)])))
            empty_headers.append("")
            append(row_format.format(*empty_headers))

    append(header_sep)
    append("Self CPU time total: {}".format(_format_time(sum_self_cpu_time_total)))
    if has_cuda_time:
        append("Self CUDA time total: {}".format(_format_time(sum_self_cuda_time_total)))
    return ''.join(result)
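

# Hypothetical entry point (editorial addition): exercises the illustrative
# example helpers above when this file is run directly.
if __name__ == "__main__":
    print(_example_event_list())
    _example_function_event_kernels()
    _example_function_event_avg()
    _example_mem_records_acc()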