# NOTE: stray line-number gutter from source extraction removed.
import bisect
import itertools
import math
from collections import defaultdict, namedtuple
from operator import attrgetter
from typing import Dict, List, Optional, Tuple

import torch
from torch.autograd import DeviceType
# Public API of this module.
__all__ = [
    "EventList",
    "FormattedTimesMixin",
    "Interval",
    "Kernel",
    "FunctionEvent",
    "FunctionEventAvg",
    "StringTable",
    "MemRecordsAcc",
]
class EventList(list):
    """A list of Events (for pretty printing)"""
    def __init__(self, *args, **kwargs):
        # Pop profiler-specific options before forwarding the remaining
        # arguments to the builtin list constructor.
        use_cuda = kwargs.pop('use_cuda', True)
        profile_memory = kwargs.pop('profile_memory', False)
        with_flops = kwargs.pop('with_flops', False)
        super().__init__(*args, **kwargs)
        self._use_cuda = use_cuda
        self._profile_memory = profile_memory
        self._tree_built = False
        self._with_flops = with_flops

    def _build_tree(self):
        # Establish parent/child links, collapse duplicate nodes and attach
        # forward stack traces to backward events.  key_averages() asserts
        # that this has run (see self._tree_built).
        self._populate_cpu_children()
        self._remove_dup_nodes()
        self._set_backward_stacktraces()
        self._tree_built = True

    def __str__(self):
        return self.table()

    def _remove_dup_nodes(self):
        # Collapse chains where a parent has exactly one child with the same
        # name (e.g. nested identical scopes).  Repeat until a fixed point is
        # reached, since collapsing can create new single-child chains.
        while True:
            to_delete = set()
            for idx in range(len(self)):
                if (self[idx].cpu_parent is not None and
                        self[idx].cpu_parent.name == self[idx].name and
                        len(self[idx].cpu_parent.cpu_children) == 1):
                    self[idx].cpu_parent.cpu_children = self[idx].cpu_children
                    self[idx].cpu_parent.kernels = self[idx].kernels  # lift kernels up
                    for ch in self[idx].cpu_children:
                        ch.cpu_parent = self[idx].cpu_parent
                    to_delete.add(idx)
            if len(to_delete) == 0:
                break
            new_evts = [ev for ind, ev in enumerate(self) if ind not in to_delete]
            self.clear()
            self.extend(new_evts)

    def _populate_cpu_children(self):
        """Populates child events into each underlying FunctionEvent object.
        One event is a child of another if [s1, e1) is inside [s2, e2). Where
        s1 and e1 would be start and end of the child event's interval. And
        s2 and e2 start and end of the parent event's interval

        Example: In event list [[0, 10], [1, 3], [3, 4]] would have make [0, 10]
        be a parent of two other intervals.

        If for any reason two intervals intersect only partially, this function
        will not record a parent child relationship between then.
        """
        # Some events can be async (i.e. start and end on different threads),
        # since it's generally undefined how to attribute children ranges to
        # async ranges, we do not use them when calculating nested ranges and stats
        sync_events = [evt for evt in self if not evt.is_async and evt.device_type == DeviceType.CPU]
        # groupby requires its input sorted by the grouping key, hence the
        # sort by thread first.
        events = sorted(
            sync_events,
            key=attrgetter("thread"),
        )
        # Group by both thread and node_id, so that events that happen to have
        # the same thread_id but are from different nodes aren't incorrectly
        # grouped together.
        threads = itertools.groupby(
            events, key=lambda event: (event.thread, event.node_id)
        )

        # For each thread we keep a stack of current nested parents.
        # We maintain the invariant that each interval is a subset of all other
        # intervals lower in the stack.
        #
        # First we sort the intervals by their start time. Then we iterate over them.
        # Every time we see a new interval we remove several parents from
        # the top until we restore the invariant. Then parent child relationship
        # if recorded if the stack is not empty.
        # Finally we add new interval to the list
        #
        # Algorithm has O(N * log(N)) complexity where N is number of
        # intervals
        for thread_id, thread_events in threads:
            # Sort by start ascending; ties broken by end descending so that
            # an enclosing interval precedes the intervals it contains.
            thread_events_ = sorted(
                thread_events,
                key=lambda event: [event.time_range.start, -event.time_range.end],
            )
            current_events: List[FunctionEvent] = []
            cur_end = 0  # NOTE(review): appears unused — candidate for removal
            for event in thread_events_:
                while len(current_events) > 0:
                    parent = current_events[-1]
                    if event.time_range.start >= parent.time_range.end or \
                            event.time_range.end > parent.time_range.end:
                        # this can't be a parent
                        current_events.pop()
                    else:
                        parent.append_cpu_child(event)
                        assert (
                            event.cpu_parent is None
                        ), "There is already a CPU parent event for {}".format(
                            event.key
                        )
                        event.set_cpu_parent(parent)
                        break

                current_events.append(event)

    def _set_backward_stacktraces(self):
        # Copy the stack trace recorded on a forward op onto the backward ops
        # that share its (sequence_nr, thread) pair.
        def bw_parent(evt):
            # Walk up the parent chain until a BACKWARD_FUNCTION scope (or None).
            if evt is None:
                return None
            elif evt.scope == 1:  # BACKWARD_FUNCTION
                return evt
            else:
                return bw_parent(evt.cpu_parent)

        fwd_stacks = {}
        for evt in self:
            if bw_parent(evt) is None and evt.stack is not None:
                t = (evt.sequence_nr, evt.thread)
                if t not in fwd_stacks:
                    fwd_stacks[t] = evt.stack

        for evt in self:
            p = bw_parent(evt)
            if p is not None:
                assert p.fwd_thread is not None
                t = (p.sequence_nr, p.fwd_thread)
                if t in fwd_stacks:
                    evt.stack = fwd_stacks[t]
                else:
                    evt.stack = []

    @property
    def self_cpu_time_total(self):
        # Total self CPU time over all contained events, in microseconds.
        return sum([event.self_cpu_time_total for event in self])

    def table(
            self,
            sort_by=None,
            row_limit=100,
            max_src_column_width=75,
            max_name_column_width=55,
            max_shapes_column_width=80,
            header=None,
            top_level_events_only=False
    ):
        """Prints an EventList as a nicely formatted table.

        Args:
            sort_by (str, optional): Attribute used to sort entries. By default
                they are printed in the same order as they were registered.
                Valid keys include: ``cpu_time``, ``cuda_time``, ``cpu_time_total``,
                ``cuda_time_total``, ``cpu_memory_usage``, ``cuda_memory_usage``,
                ``self_cpu_memory_usage``, ``self_cuda_memory_usage``, ``count``.
            top_level_events_only(bool, optional): Boolean flag to determine the
                selection of events to display. If true, the profiler will only
                display events at top level like top-level invocation of python
                `lstm`, python `add` or other functions, nested events like low-level
                cpu/cuda ops events are omitted for profiler result readability.

        Returns:
            A string containing the table.
        """
        return _build_table(
            self,
            sort_by=sort_by,
            row_limit=row_limit,
            max_src_column_width=max_src_column_width,
            max_name_column_width=max_name_column_width,
            max_shapes_column_width=max_shapes_column_width,
            header=header,
            profile_memory=self._profile_memory,
            with_flops=self._with_flops,
            top_level_events_only=top_level_events_only)

    def export_chrome_trace(self, path):
        """Exports an EventList as a Chrome tracing tools file.

        The checkpoint can be later loaded and inspected under ``chrome://tracing`` URL.

        Args:
            path (str): Path where the trace will be written.
        """
        import os
        with open(path, 'w') as f:
            chrome_events = []
            next_id = 0
            # Use file IO over using json.dump since JSON dumping is very slow and
            # this technique is proven to give a 4x speedup.
            f.write("[")
            for evt in self:
                if evt.trace_name is None:
                    continue
                # Complete ("X") event for the CPU-side range.
                f.write(
                    '{"name": "%s", '
                    '"ph": "X", '
                    '"ts": %s, '
                    '"dur": %s, '
                    '"tid": %s, '
                    '"pid": "CPU functions", '
                    '"args": {}}, '
                    % (
                        evt.trace_name,
                        evt.time_range.start,
                        evt.time_range.elapsed_us(),
                        evt.thread
                        if not evt.is_remote
                        else f'" node_id:{evt.node_id}, thread_id:{evt.thread} "',
                    )
                )

                for k in evt.kernels:
                    # 's' and 'f' draw Flow arrows from
                    # the CPU launch to the GPU kernel
                    f.write('{"name": "%s", '
                            '"ph": "s", '
                            '"ts": %s, '
                            '"tid": %s, '
                            '"pid": "CPU functions", '
                            '"id": %s, '
                            '"cat": "cpu_to_cuda", '
                            '"args": {}}, ' % (evt.trace_name, evt.time_range.start,
                                               evt.thread, next_id))
                    # Note: use torch.profiler to get device kernel trace
                    next_id += 1
            if len(self) > 0:
                # remove trailing whitespace and comma
                # (every record above ends with ", " — back up two bytes
                # so the closing "]" produces valid JSON)
                f.seek(f.tell() - 2, os.SEEK_SET)
                f.truncate()
            f.write("]")

    def supported_export_stacks_metrics(self):
        # Metric names accepted by export_stacks().
        return ["self_cpu_time_total", "self_cuda_time_total"]

    def export_stacks(self, path: str, metric: str):
        """Write events as collapsed stack lines ("frame1;frame2 value"),
        a format suitable for flame-graph tools."""
        if metric not in self.supported_export_stacks_metrics():
            raise ValueError("metric should be one of: " + str(self.supported_export_stacks_metrics()))
        # Replace separator characters inside frame names so they don't break
        # the "frames space value" line format.
        translate_table = str.maketrans(" ;\t\n", "____")
        with open(path, 'w') as f:
            for evt in self:
                if evt.stack and len(evt.stack) > 0:
                    metric_value = getattr(evt, metric)
                    if int(metric_value) > 0:
                        stack_str = ""
                        for entry in reversed(evt.stack):
                            stack_str += entry.translate(translate_table)
                            stack_str += ";"
                        stack_str = stack_str[:-1] + " " + str(int(metric_value))
                        f.write(stack_str + "\n")

    def key_averages(self, group_by_input_shapes=False, group_by_stack_n=0):
        """Averages all function events over their keys.

        Args:
            group_by_input_shapes: group entries by
                (event name, input shapes) rather than just event name.
                This is useful to see which input shapes contribute to the runtime
                the most and may help with size-specific optimizations or
                choosing the best candidates for quantization (aka fitting a roof line)

            group_by_stack_n: group by top n stack trace entries

        Returns:
            An EventList containing FunctionEventAvg objects.
        """
        assert self._tree_built
        stats: Dict[Tuple[str, ...], FunctionEventAvg] = defaultdict(FunctionEventAvg)

        def get_key(event, group_by_input_shapes, group_by_stack_n) -> Tuple[str, ...]:
            # Grouping key: name/node/device/legacy flag, optionally extended
            # with input shapes and the top-n stack entries.
            key = [str(event.key), str(event.node_id), str(event.device_type), str(event.is_legacy)]
            if group_by_input_shapes:
                key.append(str(event.input_shapes))
            if group_by_stack_n > 0:
                key += event.stack[:group_by_stack_n]
            return tuple(key)
        for evt in self:
            stats[get_key(evt, group_by_input_shapes, group_by_stack_n)].add(evt)

        avg_list = EventList(
            stats.values(),
            use_cuda=self._use_cuda,
            profile_memory=self._profile_memory,
            with_flops=self._with_flops)
        for evt in avg_list:
            evt.stack = evt.stack[:group_by_stack_n]
            if not group_by_input_shapes:
                evt.input_shapes = ""
        return avg_list

    def total_average(self):
        """Averages all events.

        Returns:
            A FunctionEventAvg object.
        """
        total_stat = FunctionEventAvg()
        for evt in self:
            total_stat += evt
            # NOTE(review): resetting key to None then relabeling on every
            # iteration looks redundant — the first assignment is immediately
            # overwritten; kept as-is to preserve behavior.
            total_stat.key = None
            total_stat.key = 'Total'
        return total_stat
- def _format_time(time_us):
- """Defines how to format time in FunctionEvent"""
- US_IN_SECOND = 1000.0 * 1000.0
- US_IN_MS = 1000.0
- if time_us >= US_IN_SECOND:
- return '{:.3f}s'.format(time_us / US_IN_SECOND)
- if time_us >= US_IN_MS:
- return '{:.3f}ms'.format(time_us / US_IN_MS)
- return '{:.3f}us'.format(time_us)
- def _format_time_share(time_us, total_time_us):
- """Defines how to format time in FunctionEvent"""
- if total_time_us == 0:
- assert time_us == 0, "Expected time_us == 0 but got {}".format(time_us)
- return "NaN"
- return '{:.2f}%'.format(time_us * 100.0 / total_time_us)
- def _format_memory(nbytes):
- """Returns a formatted memory size string"""
- KB = 1024
- MB = 1024 * KB
- GB = 1024 * MB
- if (abs(nbytes) >= GB):
- return '{:.2f} Gb'.format(nbytes * 1.0 / GB)
- elif (abs(nbytes) >= MB):
- return '{:.2f} Mb'.format(nbytes * 1.0 / MB)
- elif (abs(nbytes) >= KB):
- return '{:.2f} Kb'.format(nbytes * 1.0 / KB)
- else:
- return str(nbytes) + ' b'
def _attr_formatter(name):
    """Return a read-only property that renders attribute *name* via _format_time."""
    def formatted(self):
        return _format_time(getattr(self, name))
    return property(formatted)
class FormattedTimesMixin:
    """Helpers for FunctionEvent and FunctionEventAvg.

    The subclass should define `*_time_total` and `count` attributes.
    """
    cpu_time_str = _attr_formatter('cpu_time')
    cuda_time_str = _attr_formatter('cuda_time')
    cpu_time_total_str = _attr_formatter('cpu_time_total')
    cuda_time_total_str = _attr_formatter('cuda_time_total')
    self_cpu_time_total_str = _attr_formatter('self_cpu_time_total')
    self_cuda_time_total_str = _attr_formatter('self_cuda_time_total')

    @property
    def cpu_time(self):
        # Average CPU time per call; 0.0 when nothing was recorded.
        if self.count == 0:  # type: ignore[attr-defined]
            return 0.0
        return 1.0 * self.cpu_time_total / self.count  # type: ignore[attr-defined]

    @property
    def cuda_time(self):
        # Average CUDA time per call; 0.0 when nothing was recorded.
        if self.count == 0:  # type: ignore[attr-defined]
            return 0.0
        return 1.0 * self.cuda_time_total / self.count  # type: ignore[attr-defined]
class Interval:
    """A [start, end] time span, in microseconds."""

    def __init__(self, start, end):
        self.start = start
        self.end = end

    def elapsed_us(self):
        """Length of the interval in microseconds."""
        return self.end - self.start
- Kernel = namedtuple('Kernel', ['name', 'device', 'duration'])
class FunctionEvent(FormattedTimesMixin):
    """Profiling information about a single function."""
    def __init__(
            self, id, name, thread, start_us, end_us, fwd_thread=None, input_shapes=None,
            stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False,
            is_remote=False, sequence_nr=-1, node_id=-1, device_type=DeviceType.CPU, device_index=0,
            is_legacy=False, flops=None, trace_name=None):
        self.id: int = id
        self.node_id: int = node_id
        self.name: str = name
        self.trace_name: str = trace_name
        # Start/end timestamps (microseconds) of this op's range.
        self.time_range: Interval = Interval(start_us, end_us)
        self.thread: int = thread
        # Thread of the matching forward op, for backward events.
        self.fwd_thread: Optional[int] = fwd_thread
        self.kernels: List[Kernel] = []
        self.count: int = 1
        # Parent/child links are filled in by EventList._populate_cpu_children.
        self.cpu_children: List[FunctionEvent] = []
        self.cpu_parent: Optional[FunctionEvent] = None
        self.input_shapes: Tuple[int, ...] = input_shapes
        self.stack: List = stack
        self.scope: int = scope
        self.cpu_memory_usage: int = cpu_memory_usage
        self.cuda_memory_usage: int = cuda_memory_usage
        self.is_async: bool = is_async
        self.is_remote: bool = is_remote
        self.sequence_nr: int = sequence_nr
        self.device_type: DeviceType = device_type
        self.device_index: int = device_index
        self.is_legacy: bool = is_legacy
        self.flops: Optional[int] = flops

    def append_kernel(self, name, device, duration):
        """Attach a device kernel record; only valid on CPU-side events."""
        assert self.device_type == DeviceType.CPU
        self.kernels.append(Kernel(name, device, duration))

    def append_cpu_child(self, child):
        """Append a CPU child of type FunctionEvent.

        One is supposed to append only direct children to the event to have
        correct self cpu time being reported.
        """
        assert(self.device_type == DeviceType.CPU)
        assert(isinstance(child, FunctionEvent))
        assert(child.device_type == DeviceType.CPU)
        self.cpu_children.append(child)

    def set_cpu_parent(self, parent):
        """Set the immediate CPU parent of type FunctionEvent

        One profiling FunctionEvent should have only one CPU parent such that
        the child's range interval is completely inside the parent's. We use
        this connection to determine the event is from top-level op or not.
        """
        assert(self.device_type == DeviceType.CPU)
        assert(isinstance(parent, FunctionEvent))
        assert(parent.device_type == DeviceType.CPU)
        self.cpu_parent = parent

    # Note: async events don't have children, are not used when computing 'self'
    # metrics of other events, have only total cpu time
    @property
    def self_cpu_memory_usage(self):
        # Memory attributed to this event alone (total minus children).
        if self.is_async or self.device_type != DeviceType.CPU:
            return 0
        return self.cpu_memory_usage - sum(
            [child.cpu_memory_usage for child in self.cpu_children]
        )

    @property
    def self_cuda_memory_usage(self):
        # CUDA memory attributed to this event alone (total minus children).
        if self.is_async or self.device_type != DeviceType.CPU:
            return 0
        return self.cuda_memory_usage - sum(
            [child.cuda_memory_usage for child in self.cpu_children]
        )

    @property
    def self_cpu_time_total(self):
        # CPU time spent in this event itself, excluding child ranges.
        if self.is_async or self.device_type != DeviceType.CPU:
            return 0
        return self.cpu_time_total - sum(
            [child.cpu_time_total for child in self.cpu_children]
        )

    @property
    def cuda_time_total(self):
        if self.is_async:
            return 0
        if self.device_type == DeviceType.CPU:
            if not self.is_legacy:
                # account for the kernels in the children ops
                return (sum(kinfo.duration for kinfo in self.kernels) +
                        sum(ch.cuda_time_total for ch in self.cpu_children))
            else:
                # each legacy cpu events has a single (fake) kernel
                return sum(kinfo.duration for kinfo in self.kernels)
        else:
            assert self.device_type == DeviceType.CUDA
            return self.time_range.elapsed_us()

    @property
    def self_cuda_time_total(self):
        # CUDA time of this event excluding CUDA time of its CPU children.
        if self.is_async:
            return 0
        if self.device_type == DeviceType.CPU:
            return self.cuda_time_total - \
                sum([child.cuda_time_total for child in self.cpu_children])
        else:
            assert(self.device_type == DeviceType.CUDA)
            return self.cuda_time_total

    @property
    def cpu_time_total(self):
        # Wall-clock span of the event for CPU events, 0 otherwise.
        if self.device_type == DeviceType.CPU:
            return self.time_range.elapsed_us()
        else:
            return 0

    @property
    def key(self):
        # Grouping key used by EventList.key_averages().
        return self.name

    def __repr__(self):
        return (
            '<FunctionEvent id={} name={} device_type={} node_id={} cpu_time={} start_us={} end_us={} '
            'cpu_children={} cuda_time={} name={} thread={} input_shapes={} '
            'cpu_memory_usage={} cuda_memory_usage={} is_async={} is_remote={} seq_nr={} is_legacy={}>'.format(
                self.id,
                self.name,
                self.device_type,
                self.node_id,
                self.cpu_time_str,
                self.time_range.start,
                self.time_range.end,
                str([child.id for child in self.cpu_children]),
                self.cuda_time_str,
                self.name,
                self.thread,
                str(self.input_shapes),
                self.cpu_memory_usage,
                self.cuda_memory_usage,
                self.is_async,
                self.is_remote,
                self.sequence_nr,
                self.is_legacy,
            )
        )
class FunctionEventAvg(FormattedTimesMixin):
    """Used to average stats over multiple FunctionEvent objects."""
    def __init__(self):
        self.key: Optional[str] = None
        self.count: int = 0
        self.node_id: int = 0
        self.is_async: bool = False
        self.is_remote: bool = False
        self.cpu_time_total: int = 0
        self.cuda_time_total: int = 0
        self.self_cpu_time_total: int = 0
        self.self_cuda_time_total: int = 0
        self.input_shapes: Optional[List[List[int]]] = None
        self.stack: Optional[List] = None
        self.scope: Optional[int] = None
        self.cpu_memory_usage: int = 0
        self.cuda_memory_usage: int = 0
        self.self_cpu_memory_usage: int = 0
        self.self_cuda_memory_usage: int = 0
        self.cpu_children: Optional[List[FunctionEvent]] = None
        self.cpu_parent: Optional[FunctionEvent] = None
        self.device_type: DeviceType = DeviceType.CPU
        self.is_legacy: bool = False
        self.flops: int = 0

    def add(self, other):
        """Accumulate *other* (FunctionEvent or FunctionEventAvg) into this
        average; all events added must share the same key."""
        if self.key is None:
            # First function being recorded as part of FunctionEventAvg, propagate
            # fields.
            self.key = other.key
            self.node_id = other.node_id
            self.is_async = other.is_async
            self.is_remote = other.is_remote
            self.cpu_parent = other.cpu_parent
            self.cpu_children = other.cpu_children

            self.input_shapes = other.input_shapes
            self.stack = other.stack
            self.scope = other.scope
            self.device_type = other.device_type
            self.is_legacy = other.is_legacy

        assert isinstance(other, (FunctionEvent, FunctionEventAvg))
        assert other.key == self.key
        self.cpu_time_total += other.cpu_time_total
        self.cuda_time_total += other.cuda_time_total
        self.self_cpu_time_total += other.self_cpu_time_total
        self.self_cuda_time_total += other.self_cuda_time_total
        self.cpu_memory_usage += other.cpu_memory_usage
        self.cuda_memory_usage += other.cuda_memory_usage
        self.self_cpu_memory_usage += other.self_cpu_memory_usage
        self.self_cuda_memory_usage += other.self_cuda_memory_usage
        self.count += other.count
        # flops may be None on the incoming event; only accumulate known values.
        if self.flops is None:
            self.flops = other.flops
        elif other.flops is not None:
            self.flops += other.flops
        return self

    def __iadd__(self, other):
        # Supports `avg += event` (used by EventList.total_average).
        return self.add(other)

    def __repr__(self):
        return (
            '<FunctionEventAvg key={} self_cpu_time={} cpu_time={} '
            ' self_cuda_time={} cuda_time={} input_shapes={} '
            'cpu_memory_usage={} cuda_memory_usage={}>'.format(
                self.key,
                self.self_cpu_time_total_str,
                self.cpu_time_str,
                self.self_cuda_time_total_str,
                self.cuda_time_str,
                str(self.input_shapes),
                self.cpu_memory_usage,
                self.cuda_memory_usage,
            )
        )
class StringTable(defaultdict):
    """Memoizing map from (possibly mangled) C++ names to demangled ones."""

    def __missing__(self, key):
        # manage cases like 't' (demangled to 'unsigned short') separately,
        # for now simply check the length to avoid unexpected results for
        # the short sequences
        demangled = torch._C._demangle(key) if len(key) > 1 else key
        self[key] = demangled
        return demangled
class MemRecordsAcc:
    """Acceleration structure for accessing mem_records in interval"""

    def __init__(self, mem_records):
        self._mem_records = mem_records
        self._start_uses = []
        self._indices = []
        if mem_records:
            # Keep start times sorted alongside the original indices so
            # lookups can binary-search by time.
            ordered = sorted((rec[0].start_us(), pos) for pos, rec in enumerate(mem_records))
            self._start_uses, self._indices = zip(*ordered)

    def in_interval(self, start_us, end_us):
        """Yield records whose start time falls within [start_us, end_us]."""
        lo = bisect.bisect_left(self._start_uses, start_us)
        hi = bisect.bisect_right(self._start_uses, end_us)
        for pos in range(lo, hi):
            yield self._mem_records[self._indices[pos]]
- def _filter_stack_entry(entry):
- filtered_entries = [
- ("autograd/__init__", "_make_grads"),
- ("autograd/__init__", "backward"),
- ("torch/tensor", "backward"),
- ("_internal/common_utils", "prof_callable"),
- ("_internal/common_utils", "prof_func_call"),
- ("_internal/common_utils", "prof_meth_call"),
- ]
- return all([not (f[0] in entry and f[1] in entry) for f in filtered_entries])
- MEMORY_EVENT_NAME = "[memory]"
- OUT_OF_MEMORY_EVENT_NAME = "[OutOfMemory]"
- def _filter_name(name):
- # ignoring the following utility ops
- filtered_out_names = [
- MEMORY_EVENT_NAME, # used only for the top-level memory events
- OUT_OF_MEMORY_EVENT_NAME,
- "profiler::_record_function_enter",
- "profiler::_record_function_enter_new",
- "profiler::_record_function_exit",
- "aten::is_leaf",
- "aten::output_nr",
- "aten::_version",
- ]
- return name in filtered_out_names
def _rewrite_name(name, with_wildcard=False):
    """Demangle *name* and, when *with_wildcard* is set, replace numbered
    "ProfilerStep#<n>" event names with the wildcard "ProfilerStep*" so the
    profiler table aggregates them together."""
    demangled = StringTable()[name]
    if with_wildcard and demangled.startswith("ProfilerStep#"):
        demangled = "ProfilerStep*"
    return demangled
def _build_table(
        events,
        sort_by=None,
        header=None,
        row_limit=100,
        max_src_column_width=75,
        max_name_column_width=55,
        max_shapes_column_width=80,
        with_flops=False,
        profile_memory=False,
        top_level_events_only=False):
    """Prints a summary of events (which can be a list of FunctionEvent or FunctionEventAvg)."""
    if len(events) == 0:
        return ""

    # Feature detection: only show CUDA/shape columns when some event uses them.
    has_cuda_time = any([event.self_cuda_time_total > 0 for event in events])
    has_cuda_mem = any([event.self_cuda_memory_usage > 0 for event in events])
    has_input_shapes = any(
        [(event.input_shapes is not None and len(event.input_shapes) > 0) for event in events])

    if sort_by is not None:
        events = EventList(sorted(
            events, key=lambda evt: getattr(evt, sort_by), reverse=True
        ), use_cuda=has_cuda_time, profile_memory=profile_memory, with_flops=with_flops)

    # Column widths are fitted to the data, then capped by the max_* arguments.
    name_column_width = max([len(evt.key) for evt in events]) + 4
    if max_name_column_width is not None:
        name_column_width = min(name_column_width, max_name_column_width)

    shapes_column_width = max([len(str(evt.input_shapes)) for evt in events]) + 4
    if max_shapes_column_width is not None:
        shapes_column_width = min(shapes_column_width, max_shapes_column_width)

    DEFAULT_COLUMN_WIDTH = 12
    flops_column_width = DEFAULT_COLUMN_WIDTH

    src_column_width = None
    stacks = []
    for evt in events:
        if evt.stack is not None and len(evt.stack) > 0:
            stacks.append(evt.stack)
    has_stack = len(stacks) > 0
    if has_stack:
        src_column_width = max([max([len(entry) for entry in stack]) for stack in stacks]) + 4
        if max_src_column_width is not None:
            src_column_width = min(src_column_width, max_src_column_width)

    headers = [
        'Name',
        'Self CPU %',
        'Self CPU',
        'CPU total %',
        'CPU total',
        'CPU time avg',
    ]
    if has_cuda_time:
        headers.extend([
            'Self CUDA',
            'Self CUDA %',
            'CUDA total',
            'CUDA time avg',
        ])
    if profile_memory:
        headers.extend([
            'CPU Mem',
            'Self CPU Mem',
        ])
        if has_cuda_mem:
            headers.extend([
                'CUDA Mem',
                'Self CUDA Mem',
            ])
    headers.append(
        '# of Calls'
    )
    # Only append Node ID if any event has a valid (>= 0) Node ID
    append_node_id = any([evt.node_id != -1 for evt in events])
    if append_node_id:
        headers.append('Node ID')

    # Have to use a list because nonlocal is Py3 only...
    SPACING_SIZE = 2
    row_format_lst = [""]
    header_sep_lst = [""]
    line_length_lst = [-SPACING_SIZE]
    MAX_STACK_ENTRY = 5

    def add_column(padding, text_dir='>'):
        # Grow the row format string, separator line and total width together.
        row_format_lst[0] += '{: ' + text_dir + str(padding) + '}' + (' ' * SPACING_SIZE)
        header_sep_lst[0] += '-' * padding + (' ' * SPACING_SIZE)
        line_length_lst[0] += padding + SPACING_SIZE

    def auto_scale_flops(flops):
        # Choose a FLOPs unit (K/M/G/...) from the magnitude of *flops* and
        # return (scale multiplier, header label).
        flop_headers = [
            'FLOPs',
            'KFLOPs',
            'MFLOPs',
            'GFLOPs',
            'TFLOPs',
            'PFLOPs',
        ]
        assert flops > 0
        log_flops = max(0, min(math.log10(flops) / 3, float(len(flop_headers) - 1)))
        assert log_flops >= 0 and log_flops < len(flop_headers)
        return (pow(10, (math.floor(log_flops) * -3.0)), flop_headers[int(log_flops)])

    add_column(name_column_width)
    for _ in headers[1:]:
        add_column(DEFAULT_COLUMN_WIDTH)

    if has_input_shapes:
        headers.append('Input Shapes')
        add_column(shapes_column_width)

    if has_stack:
        headers.append('Source Location')
        add_column(src_column_width, text_dir='<')

    if with_flops:
        # Auto-scaling of flops header
        raw_flops = []
        for evt in events:
            if evt.flops > 0:
                raw_flops.append(evt.flops)
        if len(raw_flops) != 0:
            (flops_scale, flops_header) = auto_scale_flops(min(raw_flops))
            headers.append('Total {}'.format(flops_header))
            add_column(flops_column_width)
        else:
            with_flops = False  # can't find any valid flops

    row_format = row_format_lst[0]
    header_sep = header_sep_lst[0]
    line_length = line_length_lst[0]
    add_column = None  # type: ignore[assignment]

    # Have to use a list because nonlocal is Py3 only...
    result = []

    def append(s):
        result.append(s)
        result.append('\n')  # Yes, newline after the end as well

    sum_self_cpu_time_total = sum([event.self_cpu_time_total for event in events])
    sum_self_cuda_time_total = 0
    for evt in events:
        if evt.device_type == DeviceType.CPU:
            # in legacy profiler, kernel info is stored in cpu events
            if evt.is_legacy:
                sum_self_cuda_time_total += evt.self_cuda_time_total
        elif evt.device_type == DeviceType.CUDA:
            # in kineto profiler, there're events with the correct device type (e.g. CUDA)
            sum_self_cuda_time_total += evt.self_cuda_time_total

    # Actual printing
    if header is not None:
        append('=' * line_length)
        append(header)
    if top_level_events_only:
        append('=' * line_length)
        append('This report only display top-level ops statistics')
    append(header_sep)
    append(row_format.format(*headers))

    append(header_sep)

    def trim_path(path, src_column_width):
        # Keep the tail of an over-long path and mark the truncation with "...".
        if len(path) > src_column_width:
            offset = len(path) - src_column_width
            path = path[offset:]
            if len(path) > 3:
                path = "..." + path[3:]
        return path

    event_limit = 0
    for evt in events:
        if event_limit == row_limit:
            break
        if top_level_events_only and evt.cpu_parent is not None:
            continue
        else:
            event_limit += 1
        name = evt.key
        if max_name_column_width is not None and len(name) >= max_name_column_width - 3:
            name = name[:(max_name_column_width - 3)] + "..."
        row_values = [
            name,
            # Self CPU total %, 0 for async events.
            _format_time_share(evt.self_cpu_time_total, sum_self_cpu_time_total),
            evt.self_cpu_time_total_str,  # Self CPU total
            # CPU total %, 0 for async events.
            _format_time_share(evt.cpu_time_total, sum_self_cpu_time_total) if not evt.is_async else 0,
            evt.cpu_time_total_str,  # CPU total
            evt.cpu_time_str,  # CPU time avg
        ]
        if has_cuda_time:
            row_values.extend([
                evt.self_cuda_time_total_str,
                # CUDA time total %
                _format_time_share(evt.self_cuda_time_total, sum_self_cuda_time_total),
                evt.cuda_time_total_str,
                evt.cuda_time_str,  # Cuda time avg
            ])
        if profile_memory:
            row_values.extend([
                # CPU Mem Total
                _format_memory(evt.cpu_memory_usage),
                # Self CPU Mem Total
                _format_memory(evt.self_cpu_memory_usage),
            ])
            if has_cuda_mem:
                row_values.extend([
                    # CUDA Mem Total
                    _format_memory(evt.cuda_memory_usage),
                    # Self CUDA Mem Total
                    _format_memory(evt.self_cuda_memory_usage),
                ])
        row_values.append(
            evt.count,  # Number of calls
        )

        if append_node_id:
            row_values.append(evt.node_id)
        if has_input_shapes:
            row_values.append(str(evt.input_shapes)[:shapes_column_width])
        if with_flops:
            if evt.flops <= 0:
                row_values.append("--")
            else:
                row_values.append('{0:8.3f}'.format(evt.flops * flops_scale))
        if has_stack:
            src_field = ""
            if len(evt.stack) > 0:
                src_field = trim_path(evt.stack[0], src_column_width)
            row_values.append(src_field)
        append(row_format.format(*row_values))

        if has_stack:
            # Remaining stack entries are printed on their own rows, padded
            # with empty cells in every other column.
            empty_headers = [""] * (len(headers) - 1)
            for entry in evt.stack[1:MAX_STACK_ENTRY]:
                append(row_format.format(*(empty_headers + [trim_path(entry, src_column_width)])))
            empty_headers.append("")
            append(row_format.format(*empty_headers))

    append(header_sep)
    append("Self CPU time total: {}".format(_format_time(sum_self_cpu_time_total)))
    if has_cuda_time:
        append("Self CUDA time total: {}".format(_format_time(sum_self_cuda_time_total)))
    return ''.join(result)