- from typing import Any, Dict, List, Optional
- from collections import defaultdict
- from warnings import warn
- import torch
- import torch.cuda
- from torch._C._profiler import _ExperimentalConfig
- from torch.autograd import (
- _disable_profiler,
- _enable_profiler,
- _kineto_step,
- _prepare_profiler,
- _ProfilerResult,
- _supported_activities,
- DeviceType,
- kineto_available,
- ProfilerActivity,
- ProfilerConfig,
- ProfilerState,
- )
- from torch.autograd.profiler_util import (
- _filter_name,
- _filter_stack_entry,
- _rewrite_name,
- EventList,
- FunctionEvent,
- MEMORY_EVENT_NAME,
- MemRecordsAcc,
- OUT_OF_MEMORY_EVENT_NAME,
- )
- from torch.futures import Future
- __all__ = ["profile", "record_function", "emit_itt", "emit_nvtx", "load_nvprof", "EnforceUnique",
- "parse_nvprof_trace", "KinetoStepTracker", "EventList", "FunctionEvent", "MemRecordsAcc"]
- try:
- # Available in Python >= 3.2
- from contextlib import ContextDecorator as _ContextDecorator
- except ImportError:
- import functools
- class _ContextDecorator: # type: ignore[no-redef]
- def __enter__(self):
- raise NotImplementedError
- def __exit__(self, exc_type, exc_val, exc_tb):
- raise NotImplementedError
- def __call__(self, func):
- @functools.wraps(func)
- def wrapped(*args, **kwargs):
- with self:
- return func(*args, **kwargs)
- return wrapped
- class profile:
- """Context manager that manages autograd profiler state and holds a summary of results.
- Under the hood it just records events of functions being executed in C++ and
- exposes those events to Python. You can wrap any code into it and it will
- only report runtime of PyTorch functions.
- Note: profiler is thread local and is automatically propagated into async tasks
- Args:
- enabled (bool, optional): Setting this to False makes this context manager a no-op.
- use_cuda (bool, optional): Enables timing of CUDA events as well using the cudaEvent API.
- Adds approximately 4us of overhead to each tensor operation.
- record_shapes (bool, optional): If shapes recording is set, information
- about input dimensions will be collected. This allows one to see which
- dimensions have been used under the hood and further group by them
- using prof.key_averages(group_by_input_shape=True). Please note that
- shape recording might skew your profiling data. It is recommended to
- use separate runs with and without shape recording to validate the timing.
- Most likely the skew will be negligible for the bottommost events (in the case
- of nested function calls). But for higher-level functions the total
- self CPU time might be artificially increased because of the shape
- collection.
- with_flops (bool, optional): If with_flops is set, the profiler will estimate
- the FLOPs (floating point operations) value using the operator's input shape.
- This allows one to estimate the hardware performance. Currently,
- this option only works for the matrix multiplication and 2D convolution operators.
- profile_memory (bool, optional): track tensor memory allocation/deallocation.
- with_stack (bool, optional): record source information (file and line number) for the ops.
- with_modules (bool): record module hierarchy (including function names)
- corresponding to the callstack of the op, e.g. if module A's forward calls
- module B's forward, which contains an aten::add op,
- then aten::add's module hierarchy is A.B.
- Note that this support exists, at the moment, only for TorchScript models
- and not eager mode models.
- use_kineto (bool, optional): experimental, enable profiling with Kineto profiler.
- use_cpu (bool, optional): profile CPU events; setting to ``False`` requires
- ``use_kineto=True`` and can be used to lower the overhead for GPU-only profiling.
- experimental_config (_ExperimentalConfig) : A set of experimental options
- used by profiler libraries like Kineto. Note, backward compatibility is not guaranteed.
- .. warning::
- Enabling memory profiling or source attribution incurs additional profiler
- overhead
- .. warning::
- This context manager should not be called recursively, i.e. no nested
- instances are allowed
- .. warning::
- Due to some CUDA multiprocessing limitations (multiprocessing-cuda-note_),
- one cannot use the profiler with ``use_cuda = True`` to benchmark
- DataLoaders with ``num_workers > 0``. If you wish to benchmark data loading,
- please use ``use_cuda = False`` or ``num_workers = 0``.
- Example:
- >>> # xdoctest: +SKIP
- >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD_PROFILER)
- >>> x = torch.randn((1, 1), requires_grad=True)
- >>> with torch.autograd.profiler.profile() as prof:
- >>> for _ in range(100): # any normal python code, really!
- >>> y = x ** 2
- >>> y.backward()
- >>> # NOTE: some columns were removed for brevity
- >>> print(prof.key_averages().table(sort_by="self_cpu_time_total"))
- ----------------------------------- --------------- --------------- ---------------
- Name Self CPU total CPU time avg Number of Calls
- ----------------------------------- --------------- --------------- ---------------
- mul 32.048ms 32.048ms 200
- pow 27.041ms 27.041ms 200
- PowBackward0 9.727ms 55.483ms 100
- torch::autograd::AccumulateGrad 9.148ms 9.148ms 100
- torch::autograd::GraphRoot 691.816us 691.816us 100
- ----------------------------------- --------------- --------------- ---------------
- """
- def __init__(
- self,
- enabled=True,
- *,
- use_cuda=False,
- record_shapes=False,
- with_flops=False,
- profile_memory=False,
- with_stack=False,
- with_modules=False,
- use_kineto=False,
- use_cpu=True,
- experimental_config=None):
- self.enabled: bool = enabled
- if not self.enabled:
- return
- self.use_cuda = use_cuda
- self.function_events: Optional[EventList] = None
- self.entered = False
- self.record_shapes = record_shapes
- self.with_flops = with_flops
- self.record_shapes |= self.with_flops
- self.profile_memory = profile_memory
- self.with_stack = with_stack
- self.with_modules = with_modules
- self.use_cpu = use_cpu
- if experimental_config is None:
- experimental_config = _ExperimentalConfig()
- self.experimental_config = experimental_config
- self.kineto_results: Optional[_ProfilerResult] = None
- if not self.use_cpu:
- assert use_kineto, \
- "Device-only events supported only with Kineto (use_kineto=True)"
- if self.use_cuda and not torch.cuda.is_available():
- warn("CUDA is not available, disabling CUDA profiling")
- self.use_cuda = False
- self.kineto_activities = set()
- if self.use_cpu:
- self.kineto_activities.add(ProfilerActivity.CPU)
- self.profiler_kind = ProfilerState.KINETO
- if self.use_cuda:
- if (not use_kineto or ProfilerActivity.CUDA not in
- _supported_activities()):
- assert self.use_cpu, "Legacy CUDA profiling requires use_cpu=True"
- self.profiler_kind = ProfilerState.KINETO_GPU_FALLBACK
- else:
- self.kineto_activities.add(ProfilerActivity.CUDA)
- assert len(self.kineto_activities) > 0, \
- "No activities specified for the profiler"
- def config(self):
- return ProfilerConfig(
- self.profiler_kind,
- self.record_shapes,
- self.profile_memory,
- self.with_stack,
- self.with_flops,
- self.with_modules,
- self.experimental_config)
- def __enter__(self):
- if not self.enabled:
- return
- if self.entered:
- raise RuntimeError("Profiler context manager is not reentrant")
- self._prepare_trace()
- self._start_trace()
- return self
- def _prepare_trace(self):
- self.entered = True
- _prepare_profiler(self.config(), self.kineto_activities)
- def _start_trace(self):
- self.entered = True
- _enable_profiler(self.config(), self.kineto_activities)
- def __exit__(self, exc_type, exc_val, exc_tb):
- if not self.enabled:
- return
- if self.use_cuda:
- torch.cuda.synchronize()
- self.kineto_results = _disable_profiler()
- parsed_results = self._parse_kineto_results(self.kineto_results)
- self.function_events = EventList(
- parsed_results,
- use_cuda=self.use_cuda,
- profile_memory=self.profile_memory,
- with_flops=self.with_flops)
- self.function_events._build_tree()
- return False
- def __repr__(self):
- if self.function_events is None:
- return '<unfinished torch.autograd.profile>'
- return repr(self.function_events)
- def __str__(self):
- if self.function_events is None:
- return '<unfinished torch.autograd.profile>'
- return str(self.function_events)
- def _check_finish(self):
- if self.function_events is None:
- raise RuntimeError("Profiler didn't finish running")
- def table(
- self,
- sort_by=None,
- row_limit=100,
- max_src_column_width=75,
- max_name_column_width=55,
- max_shapes_column_width=80,
- header=None,
- top_level_events_only=False
- ):
- self._check_finish()
- assert self.function_events is not None
- return self.function_events.table(
- sort_by=sort_by,
- row_limit=row_limit,
- max_src_column_width=max_src_column_width,
- max_name_column_width=max_name_column_width,
- max_shapes_column_width=max_shapes_column_width,
- header=header,
- top_level_events_only=top_level_events_only
- )
- table.__doc__ = EventList.table.__doc__
- def export_chrome_trace(self, path):
- self._check_finish()
- if kineto_available():
- self.kineto_results.save(path) # type: ignore[union-attr]
- else:
- return self.function_events.export_chrome_trace(path) # type: ignore[union-attr]
- export_chrome_trace.__doc__ = EventList.export_chrome_trace.__doc__
- def export_stacks(self, path: str, metric: str = "self_cpu_time_total"):
- self._check_finish()
- assert self.function_events is not None, "Expected profiling results"
- assert self.with_stack, "export_stacks() requires with_stack=True"
- return self.function_events.export_stacks(path, metric)
- def key_averages(self, group_by_input_shape=False, group_by_stack_n=0):
- self._check_finish()
- assert self.function_events is not None, "Expected profiling results"
- return self.function_events.key_averages(group_by_input_shape, group_by_stack_n)
- key_averages.__doc__ = EventList.key_averages.__doc__
- def total_average(self):
- self._check_finish()
- assert self.function_events is not None, "Expected profiling results"
- return self.function_events.total_average()
- total_average.__doc__ = EventList.total_average.__doc__
- @property
- def self_cpu_time_total(self):
- """ Returns total time spent on CPU obtained as a sum of
- all self times across all the events.
- """
- self._check_finish()
- assert self.function_events is not None
- return self.function_events.self_cpu_time_total
- def _parse_kineto_results(self, result):
- # result.events() has most of the events - PyTorch op-level and device-level events
- trace_start_us = result.trace_start_us()
- mem_records = [[evt, False] for evt in result.events() if evt.name() == MEMORY_EVENT_NAME]
- oom_records = [evt for evt in result.events() if evt.name() == OUT_OF_MEMORY_EVENT_NAME]
- mem_records_acc = MemRecordsAcc(mem_records)
- def _cpu_memory_usage(mem_record):
- return mem_record.nbytes() if \
- mem_record.device_type() in [DeviceType.CPU, DeviceType.MKLDNN, DeviceType.IDEEP] \
- else 0
- def _cuda_memory_usage(mem_record):
- return mem_record.nbytes() if \
- mem_record.device_type() in [DeviceType.CUDA, DeviceType.HIP] \
- else 0
- # Create and return FunctionEvent list
- function_events = []
- cuda_corr_map: Dict[int, List[FunctionEvent]] = {}
- max_evt_id = 0
- for kineto_event in result.events():
- if _filter_name(kineto_event.name()):
- continue
- rel_start_us = kineto_event.start_us() - trace_start_us
- rel_end_us = rel_start_us + kineto_event.duration_us()
- abs_end_us = kineto_event.start_us() + kineto_event.duration_us()
- cpu_memory_usage = 0
- cuda_memory_usage = 0
- if kineto_event.device_type() == DeviceType.CPU:
- # find the corresponding memory allocation events
- for mem_record in mem_records_acc.in_interval(kineto_event.start_us(), abs_end_us):
- cpu_memory_usage += _cpu_memory_usage(mem_record[0])
- cuda_memory_usage += _cuda_memory_usage(mem_record[0])
- mem_record[1] = True
- is_async = kineto_event.is_async() or (
- kineto_event.start_thread_id() != kineto_event.end_thread_id()
- )
- fe = FunctionEvent(
- id=kineto_event.correlation_id(),
- name=_rewrite_name(name=kineto_event.name(), with_wildcard=True),
- trace_name=_rewrite_name(name=kineto_event.name(), with_wildcard=False),
- thread=kineto_event.start_thread_id(),
- start_us=rel_start_us,
- end_us=rel_end_us,
- fwd_thread=kineto_event.fwd_thread_id(),
- input_shapes=kineto_event.shapes(),
- stack=[entry for entry in kineto_event.stack() if _filter_stack_entry(entry)],
- scope=kineto_event.scope(),
- cpu_memory_usage=cpu_memory_usage,
- cuda_memory_usage=cuda_memory_usage,
- is_async=is_async,
- sequence_nr=kineto_event.sequence_nr(),
- device_type=kineto_event.device_type(),
- device_index=kineto_event.device_index(),
- flops=kineto_event.flops(),
- )
- max_evt_id = fe.id if fe.id > max_evt_id else max_evt_id
- if fe.device_type == DeviceType.CPU and not fe.is_async:
- # Check if we have CUDA time as a fallback
- cuda_time = kineto_event.cuda_elapsed_us()
- if cuda_time > 0:
- fe.append_kernel(
- fe.name,
- fe.device_index,
- cuda_time)
- fe.is_legacy = True
- function_events.append(fe)
- corr_id = kineto_event.linked_correlation_id()
- if corr_id > 0:
- if corr_id not in cuda_corr_map:
- cuda_corr_map[corr_id] = []
- cuda_corr_map[corr_id].append(fe)
- # associate CUDA kernels and CUDA runtime (CPU) with CPU events
- for fe in function_events:
- if (fe.device_type == DeviceType.CPU and not fe.is_async and
- fe.id in cuda_corr_map):
- for f_evt in cuda_corr_map[fe.id]:
- if f_evt.device_type == DeviceType.CUDA:
- fe.append_kernel(
- f_evt.name,
- f_evt.device_index,
- f_evt.time_range.end - f_evt.time_range.start)
- elif f_evt.device_type == DeviceType.CPU:
- # make sure that 'thread' of a CPU Kineto (e.g. CUDA Runtime) event is associated
- # with the 'thread' of the corresponding linked PyTorch event to properly track
- # parents and children
- f_evt.thread = fe.thread
- def createFunctionEventForMemoryEvents(evt):
- rel_start_us = evt.start_us() - trace_start_us
- fe = FunctionEvent(
- id=max_evt_id,
- name=evt.name(),
- trace_name=None, # not outputting in the trace
- thread=evt.start_thread_id(),
- start_us=rel_start_us,
- end_us=rel_start_us, # no duration
- fwd_thread=evt.start_thread_id(),
- input_shapes=[],
- stack=[],
- scope=0, # RecordScope::FUNCTION
- cpu_memory_usage=_cpu_memory_usage(evt),
- cuda_memory_usage=_cuda_memory_usage(evt),
- is_async=False,
- sequence_nr=-1,
- device_type=DeviceType.CPU,
- device_index=0,
- )
- return fe
- # output top-level memory events
- for mem_record in mem_records:
- if not mem_record[1]:
- max_evt_id += 1
- fe = createFunctionEventForMemoryEvents(mem_record[0])
- function_events.append(fe)
- for oom_record in oom_records:
- max_evt_id += 1
- fe = createFunctionEventForMemoryEvents(oom_record)
- function_events.append(fe)
- function_events.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end])
- return function_events
- class record_function(_ContextDecorator):
- """Context manager/function decorator that adds a label to a block of
- Python code (or function) when running the autograd profiler. It is
- useful for identifying the corresponding region when inspecting the profiler trace.
- Args:
- name (str): Label assigned to the block of code.
- node_id (int): ID of node, for distributed profiling. Unset in
- non-distributed cases.
- Example:
- >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD_PROFILER)
- >>> x = torch.randn((1, 1), requires_grad=True)
- >>> with torch.autograd.profiler.profile() as prof:
- ... y = x ** 2
- ... with torch.autograd.profiler.record_function("label-z"): # label the block
- ... z = y ** 3
- ... y.backward()
- ...
- >>> # xdoctest: +IGNORE_WANT
- >>> # NOTE: some columns were removed for brevity
- >>> print(prof.key_averages().table(sort_by="self_cpu_time_total"))
- ----------------------------------- --------------- --------------- ---------------
- Name Self CPU total % CPU time avg Number of Calls
- ----------------------------------- --------------- --------------- ---------------
- pow 60.77% 47.470us 3
- mul 21.73% 25.465us 2
- PowBackward0 12.03% 121.891us 1
- torch::autograd::AccumulateGrad 2.70% 6.324us 1
- label-z 2.13% 12.421us 1
- torch::autograd::GraphRoot 0.64% 1.503us 1
- ----------------------------------- --------------- --------------- ---------------
- Self CPU time total: 234.344us
- CUDA time total: 0.000us
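- Because ``record_function`` is also a context decorator, it can label an entire
- function. A minimal sketch (``square`` is a hypothetical user-defined function):
- >>> # xdoctest: +SKIP
- >>> @torch.autograd.profiler.record_function("square")
- ... def square(x):
- ...     return x ** 2
- >>> with torch.autograd.profiler.profile() as prof:
- ...     y = square(torch.randn(4))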
- """
- def __init__(self, name: str, args: Optional[str] = None):
- self.name: str = name
- self.args: Optional[str] = args
- # Whether or not we should run record function's end callbacks when exiting.
- self.run_callbacks_on_exit: bool = True
- # TODO: TorchScript ignores standard type annotation here
- # self.record: Optional["torch.classes.profiler._RecordFunction"] = None
- self.record = torch.jit.annotate(Optional["torch.classes.profiler._RecordFunction"], None)
- def __enter__(self):
- self.record = torch.ops.profiler._record_function_enter_new(self.name, self.args)
- return self
- def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any):
- if not self.run_callbacks_on_exit:
- return
- # Local variable is needed by TorchScript to refine Optional[T] to T
- record = self.record
- assert record is not None
- # TODO: Too slow with __torch_function__ handling enabled
- # See https://github.com/pytorch/pytorch/issues/76410
- if not torch.jit.is_scripting():
- with torch._C.DisableTorchFunctionSubclass():
- torch.ops.profiler._record_function_exit._RecordFunction(record)
- else:
- torch.ops.profiler._record_function_exit(record)
- def _call_end_callbacks_on_future(self, fut: Future[Any]) -> Future[Any]:
- """
- _call_end_callbacks_on_future is meant to be used for profiling async
- calls that return a future. Calling this function will extend recording
- beyond this scope, until the future is satisfied. It is useful for profiling
- the end-to-end time of asynchronous calls. This function should only be called
- once to attach the callback onto the future, and will throw if called multiple
- times.
- Args:
- fut (torch._C.Future): future for which to schedule a
- callback.
- Returns:
- A future that completes with the value of the passed-in future when
- the profiling callbacks have run.
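- Example:
- An illustrative sketch; ``slow_fn`` and ``x`` are assumed to be defined, and
- ``torch.jit.fork`` is used here only to obtain a future to attach the callbacks to:
- >>> # xdoctest: +SKIP
- >>> with torch.autograd.profiler.profile() as prof:
- ...     with torch.autograd.profiler.record_function("forked_work") as rf:
- ...         fut = torch.jit.fork(slow_fn, x)
- ...         fut = rf._call_end_callbacks_on_future(fut)
- ...     result = torch.jit.wait(fut)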
- """
- # Throw if we have already attached a callback onto the future.
- if not self.run_callbacks_on_exit:
- raise RuntimeError("_call_end_callbacks_on_future can only be called once.")
- # We are scheduling to run this RecordFunction's end callbacks when the
- # passed in future completes, so don't run end callbacks on exit.
- self.run_callbacks_on_exit = False
- # Local variable is needed by TorchScript to refine Optional[T] to T
- record = self.record
- assert record is not None
- # TODO: Too slow with __torch_function__ handling enabled
- # See https://github.com/pytorch/pytorch/issues/76410
- if not torch.jit.is_scripting():
- with torch._C.DisableTorchFunctionSubclass():
- profiled_future = torch.ops.profiler._call_end_callbacks_on_jit_fut._RecordFunction(
- record, fut)
- else:
- profiled_future = torch.ops.profiler._call_end_callbacks_on_jit_fut(record, fut)
- return profiled_future
- class emit_itt:
- """Context manager that makes every autograd operation emit an ITT range.
- It is useful when running the program under Intel(R) VTune Profiler::
- vtune <--vtune-flags> <regular command here>
- The Instrumentation and Tracing Technology (ITT) API enables your application to generate and
- control the collection of trace data during its execution across different Intel tools.
- This context manager annotates the Intel(R) VTune Profiler trace. With the help of this context manager,
- you will be able to see labeled ranges in the Intel(R) VTune Profiler GUI.
- .. warning::
- This context manager should not be called recursively, i.e. at most one
- instance should be enabled at any given time.
- Args:
- enabled (bool, optional): Setting ``enabled=False`` makes this context manager a no-op.
- Default: ``True``.
- record_shapes (bool, optional): If ``record_shapes=True``, the itt range wrapping
- each autograd op will append information about the sizes of Tensor arguments received
- by that op, in the following format:
- ``[[arg0.size(0), arg0.size(1), ...], [arg1.size(0), arg1.size(1), ...], ...]``
- Non-tensor arguments will be represented by ``[]``.
- Arguments will be listed in the order they are received by the backend op.
- Please note that this order may not match the order in which those arguments were passed
- on the Python side. Also note that shape recording may increase the overhead of itt range creation.
- Default: ``False``
- Example:
- >>> # xdoctest: +SKIP("Undefined variables")
- >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD_PROFILER)
- >>> with torch.autograd.profiler.emit_itt():
- ... model(x)
- """
- def __init__(self, enabled=True, record_shapes=False):
- self.enabled = enabled
- self.entered = False
- self.record_shapes = record_shapes
- def __enter__(self):
- if not self.enabled:
- return
- if self.entered:
- raise RuntimeError("ITT annotation context manager is not reentrant")
- self.entered = True
- _enable_profiler(
- ProfilerConfig(
- ProfilerState.ITT,
- self.record_shapes,
- False,
- False,
- False,
- False,
- _ExperimentalConfig()),
- set()
- )
- return self
- def __exit__(self, exc_type, exc_val, exc_tb):
- if not self.enabled:
- return
- _disable_profiler()
- return False
- class emit_nvtx:
- """Context manager that makes every autograd operation emit an NVTX range.
- It is useful when running the program under nvprof::
- nvprof --profile-from-start off -o trace_name.prof -- <regular command here>
- Unfortunately, there's no way to force nvprof to flush the data it collected
- to disk, so for CUDA profiling one has to use this context manager to annotate
- nvprof traces and wait for the process to exit before inspecting them.
- Then, either NVIDIA Visual Profiler (nvvp) can be used to visualize the timeline, or
- :func:`torch.autograd.profiler.load_nvprof` can load the results for inspection
- e.g. in Python REPL.
- .. warning::
- This context manager should not be called recursively, i.e. at most one
- instance should be enabled at any given time.
- Args:
- enabled (bool, optional): Setting ``enabled=False`` makes this context manager a no-op.
- Default: ``True``.
- record_shapes (bool, optional): If ``record_shapes=True``, the nvtx range wrapping
- each autograd op will append information about the sizes of Tensor arguments received
- by that op, in the following format:
- ``[[arg0.size(0), arg0.size(1), ...], [arg1.size(0), arg1.size(1), ...], ...]``
- Non-tensor arguments will be represented by ``[]``.
- Arguments will be listed in the order they are received by the backend op.
- Please note that this order may not match the order in which those arguments were passed
- on the Python side. Also note that shape recording may increase the overhead of nvtx range creation.
- Default: ``False``
- Example:
- >>> # xdoctest: +SKIP("undefined variables")
- >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD_PROFILER)
- >>> with torch.cuda.profiler.profile():
- ... model(x) # Warmup CUDA memory allocator and profiler
- ... with torch.autograd.profiler.emit_nvtx():
- ... model(x)
- **Forward-backward correlation**
- When viewing a profile created using :class:`emit_nvtx` in the Nvidia Visual Profiler,
- correlating each backward-pass op with the corresponding forward-pass op can be difficult.
- To ease this task, :class:`emit_nvtx` appends sequence number information to the ranges it
- generates.
- During the forward pass, each function range is decorated with ``seq=<N>``. ``seq`` is a running
- counter, incremented each time a new backward Function object is created and stashed for backward.
- Thus, the ``seq=<N>`` annotation associated with each forward function range tells you that
- if a backward Function object is created by this forward function,
- the backward object will receive sequence number N.
- During the backward pass, the top-level range wrapping each C++ backward Function's
- ``apply()`` call is decorated with ``stashed seq=<M>``. ``M`` is the sequence number that
- the backward object was created with. By comparing ``stashed seq`` numbers in backward with ``seq``
- numbers in forward, you can track down which forward op created each backward Function.
- Any functions executed during the backward pass are also decorated with ``seq=<N>``. During
- default backward (with ``create_graph=False``) this information is irrelevant, and in fact,
- ``N`` may simply be 0 for all such functions. Only the top-level ranges associated with
- backward Function objects' ``apply()`` methods are useful, as a way to correlate these Function
- objects with the earlier forward pass.
- **Double-backward**
- If, on the other hand, a backward pass with ``create_graph=True`` is underway (in other words,
- if you are setting up for a double-backward), each function's execution during backward
- is given a nonzero, useful ``seq=<N>``. Those functions may themselves create Function objects
- to be executed later during double-backward, just as the original functions in the forward pass did.
- The relationship between backward and double-backward is conceptually the same as the relationship
- between forward and backward: The functions still emit current-sequence-number-tagged ranges,
- the Function objects they create still stash those sequence numbers, and during the eventual
- double-backward, the Function objects' ``apply()`` ranges are still tagged with ``stashed seq``
- numbers, which can be compared to ``seq`` numbers from the backward pass.
- .. warning::
- The sequence number is thread-local, and some forward functions don't create an associated
- backward Function object (instead delegating that to sub-functions further down the call chain).
- For these reasons, the correspondence of stashed sequence numbers in
- backward Function ``apply()`` ranges with ``seq`` numbers in forward-pass ranges is
- not guaranteed to be 1 to 1. The sequence numbers alone may not be enough to fully
- disambiguate which forward function created which
- backward Function object. You may need to make a judgment based on analytic knowledge of what
- the expected correspondence should be.
- """
- def __init__(self, enabled=True, record_shapes=False):
- self.enabled = enabled
- self.entered = False
- self.record_shapes = record_shapes
- def __enter__(self):
- if not self.enabled:
- return
- if self.entered:
- raise RuntimeError("NVTX annotation context manager is not reentrant")
- self.entered = True
- torch.cuda.synchronize()
- _enable_profiler(
- ProfilerConfig(
- ProfilerState.NVTX,
- self.record_shapes,
- False,
- False,
- False,
- False,
- _ExperimentalConfig()),
- set()
- )
- return self
- def __exit__(self, exc_type, exc_val, exc_tb):
- if not self.enabled:
- return
- torch.cuda.synchronize()
- _disable_profiler()
- return False
- def load_nvprof(path):
- """Opens an nvprof trace file and parses autograd annotations.
- Args:
- path (str): path to nvprof trace
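- Example:
- An illustrative sketch; ``trace_name.prof`` is assumed to have been produced by
- running the program under ``nvprof --profile-from-start off -o trace_name.prof``
- with :class:`emit_nvtx` enabled:
- >>> # xdoctest: +SKIP
- >>> events = torch.autograd.profiler.load_nvprof("trace_name.prof")
- >>> print(events.table())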
- """
- return EventList(parse_nvprof_trace(path))
- class EnforceUnique:
- """Raises an error if a key is seen more than once."""
- def __init__(self):
- self.seen = set()
- def see(self, *key):
- if key in self.seen:
- raise RuntimeError('duplicate key: ' + str(key))
- self.seen.add(key)
- def parse_nvprof_trace(path):
- import sqlite3
- conn = sqlite3.connect(path)
- conn.row_factory = sqlite3.Row
- # Parse strings table
- strings = {}
- for r in conn.execute("SELECT _id_ as id, value FROM StringTable"):
- strings[r["id"]] = torch._C._demangle(r["value"])
- # First, find all functions and create FunctionEvents for them
- marker_query = """
- SELECT
- start.id AS marker_id, start.name, start.timestamp AS start_time, end.timestamp AS end_time
- FROM
- CUPTI_ACTIVITY_KIND_MARKER AS start INNER JOIN CUPTI_ACTIVITY_KIND_MARKER AS end
- ON start.id = end.id
- WHERE
- start.name != 0 AND end.name = 0
- """
- functions = []
- functions_map = {}
- unique = EnforceUnique()
- for row in conn.execute(marker_query):
- unique.see(row['marker_id'])
- evt = FunctionEvent(id=row['marker_id'],
- node_id=0, # the nvprof trace does not provide a node_id; pass 0 so that
- # FunctionEvent construction does not fail
- name=strings[row['name']],
- start_us=row['start_time'],
- end_us=row['end_time'],
- thread=0) # TODO: find in sqlite database
- functions.append(evt)
- functions_map[evt.id] = evt
- # Now, correlate all kernels with FunctionEvents
- kernel_query = """
- SELECT
- start.id AS marker_id, start.name, start.timestamp, end.timestamp,
- runtime._id_ AS runtime_id, runtime.cbid, runtime.start AS runtime_start, runtime.end AS runtime_end,
- kernel.start AS kernel_start, kernel.end AS kernel_end, kernel.name AS kernel_name
- FROM
- CUPTI_ACTIVITY_KIND_MARKER AS start
- INNER JOIN CUPTI_ACTIVITY_KIND_MARKER AS end
- ON start.id = end.id
- INNER JOIN CUPTI_ACTIVITY_KIND_RUNTIME as runtime
- ON (start.timestamp < runtime.start AND runtime.end < end.timestamp)
- INNER JOIN CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL AS kernel
- ON kernel.correlationId = runtime.correlationId
- """
- unique = EnforceUnique()
- for row in conn.execute(kernel_query):
- unique.see(row['marker_id'], row['runtime_id'])
- # 211 is cudaKernelLaunch for cuda >= 9.2
- assert (row['cbid'] == 211)
- evt = functions_map[row['marker_id']]
- evt.append_kernel(row['kernel_name'],
- 0,
- row['kernel_end'] - row['kernel_start'])
- functions.sort(key=lambda evt: evt.time_range.start)
- return functions
- class KinetoStepTracker:
- """Provides an abstraction for incrementing the step count globally.
- Previously, the only way to mark that a step() has occurred in the program
- was the pytorch profiler step(). We will now add step hooks
- in the Optimizer class (https://github.com/pytorch/pytorch/issues/88446):
- - This could mean programs that already call profiler.step() every
- iteration can end up double incrementing step count.
- - If a model uses multiple optimizers we can also have double or more
- counting of the step.
- We fix this by adding a layer of abstraction before calling step()
- to the kineto library. The idea is to maintain steps per requester in a dict:
- ```
- {
- "ProfilerStep": 100, # triggered by profiler step() call
- "Optimizer1Step": 100, # Optimizer 1 or 2 are just examples, could be SGD, Adam etc
- "Optimizer2Step": 100,
- }
- ```
- To figure out the global step count, just take the max of the dict values (100).
- If one of the counts increments, the max will go up.
- ```
- {
- "ProfilerStep": 100,
- "Optimizer1Step": 101, # Optimizer1 got incremented first say
- "Optimizer2Step": 100,
- }
- ```
- Then the global step count is 101.
- We only call the kineto step() function when the global count increments.
- NOTE: Please do not use the KinetoStepTracker in modules beside the Optimizer
- for now. The result could be incorrect increments of the step count.
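- A minimal usage sketch of the API surface (illustrative only; ``"MyRequester"``
- is an arbitrary requester name):
- ```
- KinetoStepTracker.init_step_count("MyRequester")
- KinetoStepTracker.increment_step("MyRequester")  # calls _kineto_step() if the global max grew
- KinetoStepTracker.current_step()                 # -> global step count
- KinetoStepTracker.erase_step_count("MyRequester")
- ```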
- """
- _current_step = -1
- _step_dict: Dict[str, int] = defaultdict(int)
- @classmethod
- def init_step_count(cls, requester: str):
- cls._step_dict[requester] = cls._current_step
- @classmethod
- def erase_step_count(cls, requester: str) -> bool:
- return cls._step_dict.pop(requester, None) is not None
- @classmethod
- def increment_step(cls, requester: str) -> int:
- """Increments the step count for the requester.
- Additionally if the max over all step counts has incremented then
- trigger the _kineto_step()
- returns global step count
- """
- if requester not in cls._step_dict:
- cls.init_step_count(requester)
- cls._step_dict[requester] += 1
- new_step = max(cls._step_dict.values())
- if new_step > cls._current_step:
- delta = new_step - cls._current_step
- if delta > 1:
- warn("Profiler step count has increased more than 1 - "
- f"current_step = {cls._current_step} step dict = {cls._step_dict}")
- for _ in range(0, delta):
- _kineto_step()
- cls._current_step = new_step
- return cls._current_step
- @classmethod
- def current_step(cls) -> int:
- return cls._current_step