# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""Per-layer profilers."""
import copy
import time
from typing import Any, Generator, List, Sequence, Union

import torch
from torch import Tensor
import torch.nn as nn

from ..microbatch import Batch

__all__: List[str] = []


Device = Union[torch.device, int, str]

Tensors = Sequence[Tensor]
TensorOrTensors = Union[Tensor, Tensors]


def layerwise_sandbox(module: nn.Sequential, device: torch.device) -> Generator[nn.Module, None, None]:
    """Copies layers so they can be profiled in isolation. The given module
    itself is never modified.
    """
    for layer in module:
        layer_copy = copy.deepcopy(layer)
        layer_copy.to(device)
        layer_copy.train()
        yield layer_copy
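

# Illustrative usage note (not part of the original module), kept as comments so
# importing this file stays side-effect free: because the sandbox yields deep
# copies, profiling never mutates the caller's layers. A minimal sketch,
# assuming any small nn.Sequential:
#
#     model = nn.Sequential(nn.Linear(8, 8), nn.ReLU())
#     first_copy = next(layerwise_sandbox(model, torch.device("cpu")))
#     assert first_copy is not model[0]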


def detach(batch: Batch) -> None:
    """Detaches every tensor in the batch from the autograd graph while
    preserving its ``requires_grad`` flag.
    """
    for i, x in enumerate(batch):
        batch[i] = x.detach().requires_grad_(x.requires_grad)


def profile_times(module: nn.Sequential, sample: Union[List[Any], Tensor], timeout: float, device: torch.device) -> List[int]:
    """Profiles elapsed times per layer."""
    if any(p.grad is not None for p in module.parameters()):
        raise ValueError("some parameter already has gradient")

    _batch = Batch(sample)
    for i, x in enumerate(_batch):
        _batch[i] = x.detach().to(device).requires_grad_(x.requires_grad)

    time_bufs: List[List[float]] = [[] for _ in module]
    begun_at = time.time()

    while time.time() - begun_at < timeout:
        batch = _batch

        for i, layer in enumerate(layerwise_sandbox(module, device)):
            detach(batch)

            if device.type == "cuda":
                torch.cuda.synchronize(device)
            tick = time.time()

            # Forward pass through this single layer.
            batch = batch.call(layer)

            # Backward pass, using the outputs themselves as incoming gradients.
            backward_tensors = tuple(y for y in batch if y.requires_grad)
            if backward_tensors:
                torch.autograd.backward(backward_tensors, backward_tensors)

            if device.type == "cuda":
                torch.cuda.synchronize(device)
            tock = time.time()

            time_bufs[i].append(tock - tick)

    us = 1_000_000  # seconds -> microseconds
    return [sum(int(t * us) for t in buf) for buf in time_bufs]
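

# Illustrative usage sketch (not part of the original module), kept as comments
# so importing this file stays side-effect free. The model, sample shape, and
# timeout below are made-up example values; any nn.Sequential works.
#
#     model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 10))
#     sample = torch.rand(128, 64, requires_grad=True)
#     times = profile_times(model, sample, timeout=1.0, device=torch.device("cpu"))
#     # ``times`` holds one integer per layer: the accumulated forward+backward
#     # time in microseconds measured within the timeout window.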


def profile_sizes(
    module: nn.Sequential,
    input: Union[List[Any], Tensor],
    chunks: int,
    param_scale: float,
    device: torch.device,
) -> List[int]:
    """Profiles CUDA memory usage per layer."""
    if device.type != "cuda":
        raise ValueError("size profiler supports only CUDA device")

    batch = Batch(input)
    sizes: List[int] = []

    # Profile with a single sample, then scale activations back up to the
    # micro-batch size (full batch size divided by the number of chunks).
    latent_scale = batch[0].size(0) / chunks
    for i, x in enumerate(batch):
        batch[i] = x[:1].detach().to(device).requires_grad_(x.requires_grad)

    for layer in layerwise_sandbox(module, device):
        detach(batch)

        # Detect memory usage at forward.
        torch._C._cuda_clearCublasWorkspaces()
        memory_before = torch.cuda.memory_allocated(device)
        batch = batch.call(layer)
        torch._C._cuda_clearCublasWorkspaces()
        memory_after = torch.cuda.memory_allocated(device)
        latent_size = memory_after - memory_before

        # Size of this layer's parameters in bytes.
        param_size = sum(p._typed_storage()._nbytes() for p in layer.parameters())

        # Combine parameter and activation sizes with their normalizing scales.
        size = latent_size * latent_scale + param_size * param_scale
        sizes.append(int(size))

    return sizes
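

if __name__ == "__main__":
    # Hedged demo, not part of the original module: a rough sketch of how
    # ``profile_sizes`` might be driven. It assumes a CUDA device is available
    # (the function rejects anything else) and uses made-up layer sizes,
    # ``chunks``, and ``param_scale`` values. Because this file uses a relative
    # import, the demo only runs when the module is executed with ``python -m``
    # from inside its package.
    if torch.cuda.is_available():
        demo_device = torch.device("cuda")
        demo_model = nn.Sequential(nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 64))
        demo_input = torch.rand(32, 256, requires_grad=True)
        demo_sizes = profile_sizes(demo_model, demo_input, chunks=4, param_scale=2.0, device=demo_device)
        print("estimated bytes per layer:", demo_sizes)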