import torch
import torch.cuda
from torch.autograd.profiler_util import (
    EventList, FunctionEvent, MEMORY_EVENT_NAME,
    _filter_name, _filter_stack_entry, _rewrite_name,
)
from torch.autograd import (
    DeviceType, ProfilerConfig, ProfilerState,
    _disable_profiler_legacy, _enable_profiler_legacy,
)

import itertools
from warnings import warn

__all__ = ["profile"]


class profile:
    """DEPRECATED: use torch.profiler instead"""

    def __init__(
        self,
        enabled=True,
        *,
        use_cuda=False,
        record_shapes=False,
        with_flops=False,
        profile_memory=False,
        with_stack=False,
        with_modules=False,
    ):
        self.enabled: bool = enabled
        if not self.enabled:
            return
        self.use_cuda = use_cuda
        self.function_events = None
        self.entered = False
        self.record_shapes = record_shapes
        self.with_flops = with_flops
        # FLOPs estimation relies on recorded input shapes, so with_flops
        # implies record_shapes
        self.record_shapes |= self.with_flops
        self.profile_memory = profile_memory
        self.with_stack = with_stack
        self.with_modules = with_modules

        if self.use_cuda and not torch.cuda.is_available():
            warn("CUDA is not available, disabling CUDA profiling")
            self.use_cuda = False

        if self.use_cuda:
            self.profiler_kind = ProfilerState.CUDA
        else:
            self.profiler_kind = ProfilerState.CPU

    def config(self):
        return ProfilerConfig(
            self.profiler_kind,
            self.record_shapes,
            self.profile_memory,
            self.with_stack,
            self.with_flops,
            self.with_modules,
            # avoid exposing _ExperimentalConfig in the legacy public API
            torch._C._profiler._ExperimentalConfig(),
        )

    def __enter__(self):
        if not self.enabled:
            return
        if self.entered:
            raise RuntimeError("Profiler context manager is not reentrant")
        self.entered = True
        self._start_trace()
        return self

    def _start_trace(self):
        _enable_profiler_legacy(self.config())

    def __exit__(self, exc_type, exc_val, exc_tb):
        if not self.enabled:
            return
        if self.use_cuda:
            # wait for outstanding CUDA work so its time is captured in the trace
            torch.cuda.synchronize()

        records = _disable_profiler_legacy()
        parsed_results = _parse_legacy_records(records)
        self.function_events = EventList(
            parsed_results,
            use_cuda=self.use_cuda,
            profile_memory=self.profile_memory,
            with_flops=self.with_flops,
        )
        self.function_events._build_tree()
        return False

    def __repr__(self):
        if self.function_events is None:
            return '<unfinished profiler_legacy.profile>'
        return repr(self.function_events)

    def __str__(self):
        if self.function_events is None:
            return '<unfinished profiler_legacy.profile>'
        return str(self.function_events)

    def _check_finish(self):
        if self.function_events is None:
            raise RuntimeError("Profiler didn't finish running")

    def table(
        self,
        sort_by=None,
        row_limit=100,
        max_src_column_width=75,
        max_name_column_width=55,
        max_shapes_column_width=80,
        header=None,
        top_level_events_only=False,
    ):
        self._check_finish()
        assert self.function_events is not None
        return self.function_events.table(
            sort_by=sort_by,
            row_limit=row_limit,
            max_src_column_width=max_src_column_width,
            max_name_column_width=max_name_column_width,
            max_shapes_column_width=max_shapes_column_width,
            header=header,
            top_level_events_only=top_level_events_only,
        )

    table.__doc__ = EventList.table.__doc__

    def export_chrome_trace(self, path):
        self._check_finish()
        assert self.function_events is not None
        return self.function_events.export_chrome_trace(path)

    export_chrome_trace.__doc__ = EventList.export_chrome_trace.__doc__

    def export_stacks(self, path: str, metric: str = "self_cpu_time_total"):
        self._check_finish()
        assert self.function_events is not None, "Expected profiling results"
        assert self.with_stack, "export_stacks() requires with_stack=True"
        return self.function_events.export_stacks(path, metric)

    def key_averages(self, group_by_input_shape=False, group_by_stack_n=0):
        self._check_finish()
        assert self.function_events is not None, "Expected profiling results"
        return self.function_events.key_averages(group_by_input_shape, group_by_stack_n)

    key_averages.__doc__ = EventList.key_averages.__doc__

    def total_average(self):
        self._check_finish()
        assert self.function_events is not None, "Expected profiling results"
        return self.function_events.total_average()

    total_average.__doc__ = EventList.total_average.__doc__

    @property
    def self_cpu_time_total(self):
        """Return the total time spent on CPU, obtained as the sum of
        all self times across all events.
        """
        self._check_finish()
        assert self.function_events is not None
        return self.function_events.self_cpu_time_total


def _parse_legacy_records(thread_records):
    def _get_record_key(record):
        """
        Returns a tuple to be used by _parse_legacy_records for correlating
        start and end records.
        """
        return (record.handle(), record.node_id())

    next_id = 0
    start_record = None
    functions = []
    record_stack = []

    # '__start_profile' is not guaranteed to be first, so we must find it here
    for record in itertools.chain(*thread_records):
        name = record.name()
        if start_record is None and name == '__start_profile':
            start_record = record

    assert start_record is not None and not start_record.is_remote()

    for thread_record_list in thread_records:
        # accumulated memory allocations per handle
        cpu_memory_allocs = {}
        cuda_memory_allocs = {}
        # ranges per handle
        range_starts = {}

        filtered_handles = set()
        prev_record = None
        for record in thread_record_list:
            record_key = _get_record_key(record)
            if _filter_name(record.name()) or record_key in filtered_handles:
                filtered_handles.add(record_key)
                continue

            if record.kind() == 'push':
                # workaround to reduce double logging from operator
                # wrappers and redispatch
                if prev_record is not None:
                    duplicate = (
                        prev_record.name() == record.name()
                        and prev_record.kind() == record.kind()
                        and prev_record.node_id() == record.node_id()
                    )
                    if duplicate:
                        filtered_handles.add(record_key)
                        continue

                range_starts[record_key] = record
                cpu_memory_allocs[record_key] = 0
                cuda_memory_allocs[record_key] = 0
            elif record.kind() == 'pop':
                assert (
                    record_key in range_starts
                ), """Expected record with key {} to exist in range_starts.
                    This means that the pop event did not have a corresponding push.""".format(
                    record_key
                )

                start = range_starts[record_key]

                cpu_memory_usage = cpu_memory_allocs[record_key]
                cuda_memory_usage = cuda_memory_allocs[record_key]
                is_async = start.is_async() or (start.thread_id() != record.thread_id())
                is_remote_event = record.is_remote()
                start_flops = start.flops()

                fe = FunctionEvent(
                    id=record.handle(),
                    node_id=record.node_id(),
                    name=_rewrite_name(name=start.name(), with_wildcard=True),
                    trace_name=_rewrite_name(name=start.name(), with_wildcard=False),
                    thread=start.thread_id(),
                    start_us=start_record.cpu_elapsed_us(start),
                    end_us=start_record.cpu_elapsed_us(record),
                    fwd_thread=start.fwd_thread_id(),
                    input_shapes=start.shapes(),
                    stack=[entry for entry in start.stack() if _filter_stack_entry(entry)],
                    scope=start.scope(),
                    cpu_memory_usage=cpu_memory_usage,
                    cuda_memory_usage=cuda_memory_usage,
                    is_async=is_async,
                    is_remote=is_remote_event,
                    sequence_nr=start.sequence_nr(),
                    device_type=DeviceType.CPU,
                    is_legacy=True,
                    flops=start_flops,
                )
                # note: async events have only cpu total time
                if not is_async and start.has_cuda():
                    duration = start.cuda_elapsed_us(record)
                    if duration > 0:
                        fe.append_kernel(start.name(), start.device(), duration)
                functions.append(fe)
                del range_starts[record_key]
                del cpu_memory_allocs[record_key]
                del cuda_memory_allocs[record_key]
            elif record.kind() == 'memory_alloc':
                num_open_handles_cpu = len(cpu_memory_allocs)
                num_open_handles_cuda = len(cuda_memory_allocs)
                assert num_open_handles_cpu == num_open_handles_cuda
                # attribute the allocation to every range that is currently open
                for handle in cpu_memory_allocs.keys():
                    cpu_memory_allocs[handle] += record.cpu_memory_usage()
                for handle in cuda_memory_allocs.keys():
                    cuda_memory_allocs[handle] += record.cuda_memory_usage()
                if num_open_handles_cpu == 0:
                    # output event as a top-level memory event
                    fe = FunctionEvent(
                        id=0,
                        name=MEMORY_EVENT_NAME,
                        trace_name=None,
                        thread=0,
                        start_us=0,
                        end_us=0,
                        stack=[],
                        cpu_memory_usage=record.cpu_memory_usage(),
                        cuda_memory_usage=record.cuda_memory_usage(),
                        is_legacy=True,
                    )
                    functions.append(fe)
            prev_record = record

    # Sort functions by start time, then by end time in descending order
    # (via the negated end). When nested events share the same start time
    # (which can happen due to the granularity of the clock tick), this
    # lists the outermost call first and keeps the ordering of
    # FunctionEvents stable.
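    # For example, an outer event spanning [0, 10] and an inner event spanning
    # [0, 4] get the keys [0, -10] and [0, -4]; [0, -10] sorts first, so the
    # enclosing event precedes the event it contains.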
    functions.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end])
    return functions
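

# A minimal sketch (not from the original module) of the push/pop pairing that
# _parse_legacy_records performs: each 'push' record opens an interval under
# its correlation key and the matching 'pop' closes it. The (kind, key) tuples
# below are hypothetical stand-ins for real profiler records.
#
#     range_starts = {}
#     paired = []
#     for kind, key in [('push', 'outer'), ('push', 'inner'),
#                       ('pop', 'inner'), ('pop', 'outer')]:
#         if kind == 'push':
#             range_starts[key] = kind            # remember the opening record
#         else:
#             start = range_starts.pop(key)       # pair the pop with its push
#             paired.append((key, start, kind))
#     # paired == [('inner', 'push', 'pop'), ('outer', 'push', 'pop')]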