profile.py

# Copyright 2019 Kakao Brain
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
"""Per-layer profilers."""
import copy
import time
from typing import Any, Generator, List, Union, Sequence

import torch
from torch import Tensor
import torch.nn as nn

from ..microbatch import Batch

__all__: List[str] = []


Device = Union[torch.device, int, str]

Tensors = Sequence[Tensor]
TensorOrTensors = Union[Tensor, Tensors]


def layerwise_sandbox(
    module: nn.Sequential,
    device: torch.device,
) -> Generator[nn.Module, None, None]:
    """Copies layers for ease of profiling. It doesn't modify the given
    module.
    """
    for layer in module:
        layer_copy = copy.deepcopy(layer)
        layer_copy.to(device)
        layer_copy.train()
        yield layer_copy
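

# Note (added for illustration, not in the original file): the sandbox yields
# fresh, trainable copies placed on ``device``, so a caller can run forward and
# backward passes without mutating the module being profiled, e.g.
#
#     for layer_copy in layerwise_sandbox(model, torch.device("cpu")):
#         out = layer_copy(torch.rand(1, 128))  # hypothetical 128-feature input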


def detach(batch: Batch) -> None:
    """Detaches from autograd graph."""
    for i, x in enumerate(batch):
        batch[i] = x.detach().requires_grad_(x.requires_grad)


def profile_times(
    module: nn.Sequential,
    sample: Union[List[Any], Tensor],
    timeout: float,
    device: torch.device,
) -> List[int]:
    """Profiles elapsed times per layer."""
    if any(p.grad is not None for p in module.parameters()):
        raise ValueError("some parameter already has gradient")

    _batch = Batch(sample)
    for i, x in enumerate(_batch):
        _batch[i] = x.detach().to(device).requires_grad_(x.requires_grad)

    time_bufs: List[List[float]] = [[] for _ in module]
    begun_at = time.time()

    while time.time() - begun_at < timeout:
        batch = _batch

        for i, layer in enumerate(layerwise_sandbox(module, device)):
            detach(batch)

            if device.type == "cuda":
                torch.cuda.synchronize(device)
            tick = time.time()

            # Forward
            batch = batch.call(layer)

            # Backward: reuse the outputs themselves as surrogate gradients so
            # each timing sample covers a full forward+backward pass.
            backward_tensors = tuple(y for y in batch if y.requires_grad)
            if backward_tensors:
                torch.autograd.backward(backward_tensors, backward_tensors)

            if device.type == "cuda":
                torch.cuda.synchronize(device)
            tock = time.time()

            time_bufs[i].append(tock - tick)

    # Accumulate the per-layer times, converted from seconds to microseconds.
    us = 1_000_000
    return [sum(int(t * us) for t in buf) for buf in time_bufs]
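

# Usage sketch (an illustration, not part of the original module): the model,
# sample shape, and timeout below are assumptions. ``profile_times`` returns one
# accumulated microsecond figure per layer of the ``nn.Sequential``:
#
#     model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 10))
#     times = profile_times(model, torch.rand(8, 64), timeout=1.0,
#                           device=torch.device("cpu"))
#     assert len(times) == len(model)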


def profile_sizes(
    module: nn.Sequential,
    input: Union[List[Any], Tensor],
    chunks: int,
    param_scale: float,
    device: torch.device,
) -> List[int]:
    """Profiles CUDA memory usage per layer."""
    if device.type != "cuda":
        raise ValueError("size profiler supports only CUDA device")

    batch = Batch(input)
    sizes: List[int] = []

    # The batch is truncated to a single sample below, so latent_scale rescales
    # the measured activation size back to the size of one micro-batch.
    latent_scale = batch[0].size(0) / chunks
    for i, x in enumerate(batch):
        batch[i] = x[:1].detach().to(device).requires_grad_(x.requires_grad)

    for layer in layerwise_sandbox(module, device):
        detach(batch)

        # Detect memory usage at forward. Clearing the cuBLAS workspaces keeps
        # their allocations out of the before/after measurement.
        torch._C._cuda_clearCublasWorkspaces()
        memory_before = torch.cuda.memory_allocated(device)
        batch = batch.call(layer)
        torch._C._cuda_clearCublasWorkspaces()
        memory_after = torch.cuda.memory_allocated(device)
        latent_size = memory_after - memory_before

        # Analyze size of parameters.
        param_size = sum(p._typed_storage()._nbytes() for p in layer.parameters())

        # Combine size of parameters and activations with normalized scales.
        size = latent_size * latent_scale + param_size * param_scale
        sizes.append(int(size))

    return sizes
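

# Usage sketch (an illustration, not part of the original module): a CUDA device
# is assumed, and the model and batch below are hypothetical. With ``chunks=4``
# the measured single-sample activation size is rescaled to a micro-batch of 8,
# and ``param_scale=1.0`` counts each layer's parameters once:
#
#     cuda = torch.device("cuda")
#     model = nn.Sequential(nn.Linear(128, 128), nn.ReLU(), nn.Linear(128, 10))
#     sizes = profile_sizes(model, torch.rand(32, 128), chunks=4,
#                           param_scale=1.0, device=cuda)
#     # ``sizes[i]`` estimates the byte footprint of layer ``i``.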