api.py 3.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. import abc
  2. import torch.nn as nn
  3. from dataclasses import dataclass
  4. from typing import Dict, List, Optional, Union
  5. from torch.distributed._shard.sharder import Sharder
  6. from torch.distributed._shard.sharding_spec import ShardingSpec
  7. @dataclass
  8. class ShardingPlan:
  9. """
  10. Representation of a sharding plan, describes how to shard a module
  11. across hosts. `plan` is used to shard module parameters according to the spec provided,
  12. `output_plan` and `return_local_tensor` are optional, they are used to specify the output
  13. layout of a module with a spec, and when to convert back to data parallel fashion.
  14. Args:
  15. plan (Dict[str, Union[:class:`torch.distributed._shard.sharding_spec.ShardingSpec`,
  16. :class:`torch.distributed._shard.sharder.Sharder`]):
  17. a dict describes how to shard a module, there're currently two ways to shard a module:
  18. 1. directly shard a module parameter by a `ShardingSpec`, keyed by the name of
  19. a parameter to a `ShardingSpec`.
  20. 2. shard a submodule by applying a `Sharder` on it, keyed by the name of a module
  21. to a `Sharder` object.
  22. output_plan (Dict[str, :class:`torch.distributed._shard.sharding_spec.ShardingSpec`), optional):
  23. a dict specifies the layout of a module's output which produces a ShardedTensor,
  24. keyed by the name of module to ShardingSpec("" in key means the root module).
  25. Default: `None`
  26. return_local_tensor (List[str], optional): a list of string, each element enables
  27. a module's sharded output to be returned as a Tensor from its local shards to
  28. ensure further processsing in a data parallel fashion. ("" in list means the
  29. root module).
  30. Default: None
  31. Example:
  32. Suppose we want to shard a module with two linear layers and then run it with DDP, we also
  33. want to convert the output of the second linear layer back to DDP, we can do it as follows:
  34. >>> # xdoctest: +REQUIRES(module:torch._C._distributed_c10d)
  35. >>> class MyModule(nn.Module):
  36. >>> def __init__(self):
  37. >>> super().__init__()
  38. >>> self.fc1 = nn.Linear()
  39. >>> self.gelu = nn.GELU()
  40. >>> self.fc2 = nn.Linear()
  41. >>> self.relu = nn.Linear()
  42. >>>
  43. >>> def forward(self, input):
  44. >>> return self.relu(self.fc2(self.gelu(self.fc1(input))))
  45. >>> # xdoctest: +SKIP("Undefined spec1, spec2)
  46. >>> sharding_plan = ShardingPlan(
  47. >>> plan={
  48. >>> "fc1.weight": spec1,
  49. >>> "fc2.weight": spec2
  50. >>> },
  51. >>> output_plan={
  52. >>> "fc2": output_spec
  53. >>> },
  54. >>> return_local_tensor=["fc2"]
  55. >>> )
  56. """
  57. plan: Dict[str, Union[ShardingSpec, Sharder]]
  58. output_plan: Optional[Dict[str, ShardingSpec]] = None
  59. return_local_tensor: Optional[List[str]] = None
  60. class ShardingPlanner(abc.ABC):
  61. """
  62. Default ShardingPlanner interface, can be extended and
  63. implement advanced sharding strategies.
  64. """
  65. @abc.abstractmethod
  66. def build_plan(self, module: nn.Module) -> ShardingPlan:
  67. """
  68. Given a nn.Module, define how to shard the module across
  69. ranks, return a ShardingPlan
  70. Args:
  71. module (:class:`torch.nn.Module`):
  72. The module to apply sharding to.
  73. Returns:
  74. A :class:`torch.distributed._shard.sharding_plan.ShardingPlan` object that
  75. represents how to shard the module.
  76. """
  77. pass