remote_module.py 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774
  1. #!/usr/bin/python3
  2. import collections
  3. import io
  4. import sys
  5. import types
  6. from typing import (
  7. Any,
  8. Callable,
  9. Dict,
  10. Iterator,
  11. List,
  12. Mapping,
  13. Optional,
  14. Set,
  15. Tuple,
  16. Type,
  17. TypeVar,
  18. Union,
  19. )
  20. import torch
  21. import torch.distributed.rpc as rpc
  22. from torch import Tensor, device, dtype, nn
  23. from torch.distributed.nn.jit import instantiator
  24. from torch.distributed import _remote_device
  25. from torch.distributed.rpc.internal import _internal_rpc_pickler
  26. from torch.nn import Module
  27. from torch.nn.parameter import Parameter
  28. from torch.utils.hooks import RemovableHandle
  29. __all__ = ["RemoteModule"]
  30. _grad_t = Union[Tuple[Tensor, ...], Tensor]
  31. # See https://mypy.readthedocs.io/en/latest/generics.html#generic-methods-and-generic-self for the use
  32. # of `T` to annotate `self`. Many methods of `Module` return `self` and we want those return values to be
  33. # the type of the subclass, not the looser type of `Module`.
  34. T = TypeVar("T", bound="Module")
  35. _NON_SCRIPTABLE_REMOTE_MODULE_MODULE = (
  36. instantiator.instantiate_non_scriptable_remote_module_template()
  37. )
  38. _REMOTE_MODULE_PICKLED_ATTRIBUTES = (
  39. "on",
  40. "device",
  41. "is_device_map_set",
  42. "is_scriptable",
  43. "generated_methods",
  44. "module_rref",
  45. )
  46. _SerializedRemoteModule = collections.namedtuple("_SerializedRemoteModule", _REMOTE_MODULE_PICKLED_ATTRIBUTES) # type: ignore[misc]
  47. # These attributes are mostly from RemoteModule's parent class and are intentionally not pickled.
  48. # A new attribute of RemoteModule should be either in _REMOTE_MODULE_PICKLED_ATTRIBUTES
  49. # or _REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING.
  50. # Otherwise, it will not be pickled.
  51. _REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING = (
  52. "training",
  53. "_parameters",
  54. "_buffers",
  55. "_non_persistent_buffers_set",
  56. "_backward_hooks",
  57. "_backward_pre_hooks",
  58. "_is_full_backward_hook",
  59. "_forward_hooks",
  60. "_forward_hooks_with_kwargs",
  61. "_forward_pre_hooks",
  62. "_forward_pre_hooks_with_kwargs",
  63. "_state_dict_hooks",
  64. "_state_dict_pre_hooks",
  65. "_load_state_dict_pre_hooks",
  66. "_load_state_dict_post_hooks",
  67. "_state_dict_pre_hooks",
  68. "_modules",
  69. # The two attributes below are generated methods, not available at pickling time.
  70. "forward_async",
  71. "forward",
  72. )
  73. # RPC handler.
  74. def _instantiate_template(module_interface_cls, enable_moving_cpu_tensors_to_cuda):
  75. instantiator.instantiate_scriptable_remote_module_template(
  76. module_interface_cls, enable_moving_cpu_tensors_to_cuda
  77. )
  78. def _create_module(module_cls, args, kwargs, device):
  79. module = module_cls(*args, **kwargs)
  80. if not isinstance(module, nn.Module):
  81. raise ValueError(
  82. "Expect `module_cls(*args, **kwargs)` returns an instance of <class nn.Module>, "
  83. f"but it returns an instance of {type(module)}."
  84. )
  85. module.to(device)
  86. return module
  87. def _create_module_with_interface(
  88. module_cls, args, kwargs, device, module_interface_cls
  89. ):
  90. module = _create_module(module_cls, args, kwargs, device)
  91. if module_interface_cls is not None:
  92. module = torch.jit.script(module)
  93. return rpc.RRef(module, module_interface_cls)
  94. def _param_rrefs(module_rref, recurse) -> List[rpc.RRef[Parameter]]:
  95. ret: List[rpc.RRef[Parameter]] = []
  96. for param in module_rref.local_value().parameters(recurse):
  97. ret.append(rpc.RRef(param))
  98. return ret
  99. def _raise_not_supported(name: str) -> None:
  100. raise ValueError("Method ``{}`` not supported for RemoteModule".format(name))
  101. class _RemoteModule(nn.Module):
  102. def __new__(cls, *args, **kwargs):
  103. # Use __new__ for logging purposes.
  104. torch._C._log_api_usage_once("torch.distributed.nn.api.remote_module")
  105. return super(_RemoteModule, cls).__new__(cls)
  106. def __init__(
  107. self,
  108. remote_device: str,
  109. module_cls: Type[nn.Module],
  110. args: Tuple = None,
  111. kwargs: Dict[str, Any] = None,
  112. _module_interface_cls: Any = None,
  113. ):
  114. """
  115. A RemoteModule instance can only be created after RPC initialization.
  116. It creates a user-specified module on a specified remote node.
  117. It behaves like a regular ``nn.Module`` except that the ``forward`` method is
  118. executed on the remote node.
  119. It takes care of autograd recording to ensure the backward pass propagates
  120. gradients back to the corresponding remote module.
  121. It can be shared across processors using `RPC framework <https://pytorch.org/docs/stable/rpc.html>`__,
  122. without incurring any overheads of copying the actual module,
  123. which is equivalent to an :class:`~torch.distributed.rpc.RRef`
  124. pointing to the remote module.
  125. The arguments of ``forward_async`` and ``forward`` are the same as
  126. the ``forward`` method of the module returned by the ``module_cls``.
  127. Apart from ``forward_async`` and ``forward``, no other methods are supported from nn.Module for now.
  128. Particularly, to create a hybrid model, typically the local modules should be
  129. created outside of remote modules, rather than as submodules of any remote module (by calling ``add_module``).
  130. Hybrid Example:
  131. >>> class HybridModel(nn.Module):
  132. >>> def __init__(self):
  133. >>> nn.Module.__init__(self)
  134. >>> self.remote_embedding = RemoteModule(...)
  135. >>> self.local_linear = nn.Linear(...)
  136. For example, if ``module_cls`` returns an instance of ``nn.Linear``,
  137. that has ``forward`` method signature, ``def forward(input: Tensor) -> Tensor:``,
  138. the generated ``RemoteModule`` will have 2 methods in signature of
  139. ``def forward(input: Tensor) -> Tensor:`` and
  140. ``def forward_async(input: Tensor) -> Future[Tensor]:``.
  141. .. note::
  142. If the remote module is placed on a cuda device,
  143. any input CPU tensors will be automatically moved to the same cuda device,
  144. and GPU tensors are returned over the wire according to the device map of the remote worker on TensorPipe RPC backend.
  145. Args:
  146. remote_device (str): Device on the destination worker where we'd like to place this module.
  147. The device can be a local device or a remote device specified by one of the following remote
  148. formats:
  149. 1. "rank:<rank>/<device>" (ex: "rank:0/cuda:0").
  150. 2. "<worker_name>/<device>" (ex: "trainer0/cuda:0").
  151. In addition, the device field can be optional and the default value is "cpu".
  152. module_cls (nn.Module): For example,
  153. >>> class MyModule(nn.Module):
  154. >>> def forward(input):
  155. >>> return input + 1
  156. >>>
  157. >>> module_cls = MyModule
  158. args (Sequence, optional): args to be passed to ``module_cls``.
  159. kwargs (Dict, optional): kwargs to be passed to ``module_cls``.
  160. _module_interface_cls (type, optional): The TorchScript interface type for the module
  161. to be created. The type object should be decorated by @torch.jit.interface.
  162. If not provided, the generated RemoteModule is not torchscript-able.
  163. Warning, this is an experimental API and susceptible to frequent changes.
  164. Returns:
  165. A remote module instance which wraps the :class:`~nn.Module` created by the
  166. user-provided ``module_cls``, it has a blocking ``forward`` method and an
  167. asynchronous ``forward_async`` method that returns a future of the ``forward`` call
  168. on the user-provided module on the remote side.
  169. Example::
  170. Run the following code in two different processes:
  171. >>> # xdoctest: +SKIP("distributed")
  172. >>> # On worker 0:
  173. >>> import torch
  174. >>> import torch.distributed.rpc as rpc
  175. >>> from torch import nn, Tensor
  176. >>> from torch.distributed.nn.api.remote_module import RemoteModule
  177. >>>
  178. >>> rpc.init_rpc("worker0", rank=0, world_size=2)
  179. >>> remote_linear_module = RemoteModule(
  180. >>> "worker1/cpu", nn.Linear, args=(20, 30),
  181. >>> )
  182. >>> input = torch.randn(128, 20)
  183. >>> ret_fut = remote_linear_module.forward_async(input)
  184. >>> ret = ret_fut.wait()
  185. >>> rpc.shutdown()
  186. >>> # On worker 1:
  187. >>> import torch
  188. >>> import torch.distributed.rpc as rpc
  189. >>>
  190. >>> rpc.init_rpc("worker1", rank=1, world_size=2)
  191. >>> rpc.shutdown()
  192. """
  193. super().__init__()
  194. enable_moving_cpu_tensors_to_cuda = self._prepare_init(remote_device)
  195. # Default arguments preperation.
  196. args = args if args is not None else ()
  197. kwargs = kwargs if kwargs is not None else {}
  198. if _module_interface_cls is not None:
  199. # Users reply on this field to know if this generated RemoteModule is TorchScript-able.
  200. self.is_scriptable = True
  201. # Instantiate template on remote side.
  202. fut = rpc.rpc_async(
  203. self.on,
  204. _instantiate_template,
  205. (_module_interface_cls, enable_moving_cpu_tensors_to_cuda),
  206. )
  207. self._init_template(
  208. _module_interface_cls, enable_moving_cpu_tensors_to_cuda
  209. )
  210. # Instantiate template on remote side.
  211. fut = rpc.rpc_async(
  212. self.on,
  213. _instantiate_template,
  214. (_module_interface_cls, enable_moving_cpu_tensors_to_cuda),
  215. )
  216. # Create the module on the remote side.
  217. fut.wait() # Ensure remote_module_cls is available on remote side.
  218. # TODO: We need to change this to rpc.remote, and make it async (see the else branch below).
  219. # For that we need to be able to apply _module_interface_cls to the RRef returned by rpc.remote
  220. # See https://github.com/pytorch/pytorch/issues/58098 for more context.
  221. self.module_rref = rpc.rpc_sync(
  222. self.on,
  223. _create_module_with_interface,
  224. (module_cls, args, kwargs, self.device, _module_interface_cls),
  225. )
  226. else:
  227. self.is_scriptable = False
  228. self.generated_methods = (
  229. _NON_SCRIPTABLE_REMOTE_MODULE_MODULE._generated_methods
  230. )
  231. # Create the module on the remote side.
  232. self.module_rref = rpc.remote(
  233. self.on,
  234. _create_module,
  235. (module_cls, args, kwargs, self.device),
  236. )
  237. self._install_generated_methods()
  238. self._check_attribute_picklability()
  239. def remote_parameters(self, recurse: bool = True) -> List[rpc.RRef[Parameter]]:
  240. """
  241. Returns a list of :class:`~torch.distributed.rpc.RRef` pointing to the
  242. remote module's parameters. This can typically be used in conjuction
  243. with :class:`~torch.distributed.optim.DistributedOptimizer`.
  244. Args:
  245. recurse (bool): if True, then returns parameters of the remote
  246. module and all submodules of the remote module. Otherwise,
  247. returns only parameters that are direct members of the
  248. remote module.
  249. Returns:
  250. A list of :class:`~torch.distributed.rpc.RRef` (``List[RRef[nn.Parameter]]``)
  251. to remote module's parameters.
  252. """
  253. return rpc.rpc_sync(self.on, _param_rrefs, args=(self.module_rref, recurse))
  254. def get_module_rref(self) -> rpc.RRef[nn.Module]:
  255. """
  256. Returns an :class:`~torch.distributed.rpc.RRef` (``RRef[nn.Module]``)
  257. pointing to the remote module.
  258. """
  259. return self.module_rref
  260. @torch.jit.export
  261. def __getstate__(self):
  262. raise RuntimeError(
  263. "Cannot pickle RemoteModule in python pickler. RemoteModule can only be pickled when using RPC"
  264. )
  265. @torch.jit.export
  266. def __setstate__(self, state):
  267. raise RuntimeError(
  268. "Cannot unpickle RemoteModule in python pickler. RemoteModule can only be unpickled when using RPC"
  269. )
  270. def register_buffer(
  271. self, name: str, tensor: Optional[Tensor], persistent: bool = True
  272. ) -> None:
  273. _raise_not_supported(self.register_buffer.__name__)
  274. def register_parameter(self, name: str, param: Optional[Parameter]) -> None:
  275. _raise_not_supported(self.register_parameter.__name__)
  276. def add_module(self, name: str, module: Optional[Module]) -> None:
  277. _raise_not_supported(self.add_module.__name__)
  278. def apply(self: T, fn: Callable[[Module], None]) -> T: # type: ignore[return]
  279. _raise_not_supported(self.apply.__name__)
  280. def cuda(self: T, device: Optional[Union[int, device]] = None) -> T: # type: ignore[return]
  281. _raise_not_supported(self.cuda.__name__)
  282. def ipu(self: T, device: Optional[Union[int, device]] = None) -> T: # type: ignore[return]
  283. _raise_not_supported(self.ipu.__name__)
  284. def xpu(self: T, device: Optional[Union[int, device]] = None) -> T: # type: ignore[return]
  285. _raise_not_supported(self.xpu.__name__)
  286. def cpu(self: T) -> T: # type: ignore[return]
  287. _raise_not_supported(self.cpu.__name__)
  288. def type(self: T, dst_type: Union[dtype, str]) -> T: # type: ignore[return]
  289. _raise_not_supported(self.type.__name__)
  290. def float(self: T) -> T: # type: ignore[return]
  291. _raise_not_supported(self.float.__name__)
  292. def double(self: T) -> T: # type: ignore[return]
  293. _raise_not_supported(self.double.__name__)
  294. def half(self: T) -> T: # type: ignore[return]
  295. _raise_not_supported(self.half.__name__)
  296. def bfloat16(self: T) -> T: # type: ignore[return]
  297. _raise_not_supported(self.bfloat16.__name__)
  298. def to(self, *args, **kwargs) -> T: # type: ignore[return]
  299. _raise_not_supported(self.to.__name__)
  300. def register_backward_hook( # type: ignore[return]
  301. self, hook: Callable[[Module, _grad_t, _grad_t], Union[None, _grad_t]]
  302. ) -> RemovableHandle:
  303. _raise_not_supported(self.register_backward_hook.__name__)
  304. def register_forward_pre_hook( # type: ignore[return]
  305. self,
  306. hook: Union[
  307. Callable[[T, Tuple[Any, ...]], Optional[Any]],
  308. Callable[[T, Tuple[Any, ...], Dict[str, Any]], Optional[Tuple[Any, Dict[str, Any]]]],
  309. ],
  310. prepend: bool = False,
  311. with_kwargs: bool = False,
  312. ) -> RemovableHandle:
  313. _raise_not_supported(self.register_forward_pre_hook.__name__)
  314. def register_forward_hook( # type: ignore[return]
  315. self,
  316. hook: Union[
  317. Callable[[T, Tuple[Any, ...], Any], Optional[Any]],
  318. Callable[[T, Tuple[Any, ...], Dict[str, Any], Any], Optional[Any]],
  319. ],
  320. prepend: bool = False,
  321. with_kwargs: bool = False,
  322. ) -> RemovableHandle:
  323. _raise_not_supported(self.register_forward_hook.__name__)
  324. def state_dict(self, *args, **kwargs):
  325. _raise_not_supported(self.state_dict.__name__)
  326. def load_state_dict(
  327. self,
  328. state_dict: Mapping[str, Any],
  329. strict: bool = True,
  330. ):
  331. _raise_not_supported(self.load_state_dict.__name__)
  332. def parameters(self, recurse: bool = True) -> Iterator[Parameter]:
  333. raise ValueError(
  334. "Method ``parameters`` not supported for RemoteModule. Please use ``remote_parameters`` instead."
  335. )
  336. def named_parameters( # type: ignore[return]
  337. self,
  338. prefix: str = "",
  339. recurse: bool = True,
  340. remove_duplicate: bool = True
  341. ) -> Iterator[Tuple[str, Parameter]]:
  342. _raise_not_supported(self.named_parameters.__name__)
  343. def buffers(self, recurse: bool = True) -> Iterator[Tensor]: # type: ignore[return]
  344. _raise_not_supported(self.buffers.__name__)
  345. def named_buffers( # type: ignore[return]
  346. self,
  347. prefix: str = "",
  348. recurse: bool = True,
  349. remove_duplicate: bool = True
  350. ) -> Iterator[Tuple[str, Tensor]]:
  351. _raise_not_supported(self.named_buffers.__name__)
  352. def children(self) -> Iterator[Module]: # type: ignore[return]
  353. _raise_not_supported(self.children.__name__)
  354. def named_children(self) -> Iterator[Tuple[str, Module]]: # type: ignore[return]
  355. _raise_not_supported(self.named_children.__name__)
  356. def modules(self) -> Iterator[Module]: # type: ignore[return]
  357. _raise_not_supported(self.modules.__name__)
  358. def named_modules(
  359. self,
  360. memo: Optional[Set[Module]] = None,
  361. prefix: str = "",
  362. remove_duplicate: bool = True,
  363. ):
  364. _raise_not_supported(self.named_modules.__name__)
  365. def train(self: T, mode: bool = True) -> T:
  366. return self.module_rref.rpc_sync().train() # type: ignore[operator, union-attr]
  367. def eval(self: T) -> T:
  368. return self.module_rref.rpc_sync().eval() # type: ignore[operator, union-attr]
  369. def requires_grad_(self: T, requires_grad: bool = True) -> T: # type: ignore[return]
  370. _raise_not_supported(self.requires_grad_.__name__)
  371. def zero_grad(self, set_to_none: bool = True) -> None:
  372. _raise_not_supported(self.zero_grad.__name__)
  373. def share_memory(self: T) -> T: # type: ignore[return]
  374. _raise_not_supported(self.share_memory.__name__)
  375. def extra_repr(self) -> str: # type: ignore[return]
  376. _raise_not_supported(self.extra_repr.__name__)
  377. def _prepare_init(self, remote_device_str: str) -> bool:
  378. """
  379. Prepares the initializaiton and returns whether to enable automatically moving CPU tensors to CUDA devices.
  380. """
  381. # Sanity check.
  382. assert rpc._is_current_rpc_agent_set(), "RemoteModule only works in RPC."
  383. remote_device = _remote_device(remote_device_str)
  384. self.on = remote_device.worker_name() if remote_device.worker_name() is not None else remote_device.rank()
  385. self.device = str(remote_device.device())
  386. agent = rpc._get_current_rpc_agent()
  387. # If the device map of the remote worker is set,
  388. # then enable moving any input CPU tensors to the same cuda device.
  389. self.is_device_map_set = bool(
  390. agent._get_device_map(agent.get_worker_info(self.on)) # type: ignore[arg-type]
  391. )
  392. # ``enable_moving_cpu_tensors_to_cuda`` is less strict than ``is_device_map_set``:
  393. # If ``enable_moving_cpu_tensors_to_cuda`` is true, but the device map is not set,
  394. # then any CPU tensors can still be moved to a cuda device to run forward,
  395. # but the output must be moved back to CPU before being sent over the wire.
  396. enable_moving_cpu_tensors_to_cuda = torch.device(self.device).type == "cuda"
  397. return enable_moving_cpu_tensors_to_cuda
  398. def _init_template(self, module_interface_cls, enable_moving_cpu_tensors_to_cuda):
  399. """
  400. Instantiates template on local side.
  401. """
  402. generated_module = instantiator.instantiate_scriptable_remote_module_template(
  403. module_interface_cls, enable_moving_cpu_tensors_to_cuda
  404. )
  405. self.generated_methods = generated_module._generated_methods
  406. def _check_attribute_picklability(self):
  407. """
  408. Checks if all the attribute has explicitly defined whether to be pickled (i.e., picklability).
  409. """
  410. for k in self.__dict__.keys():
  411. if (
  412. k not in _REMOTE_MODULE_PICKLED_ATTRIBUTES
  413. and k not in _REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING
  414. ):
  415. raise AttributeError(
  416. "Attribute {} must be either in ``_REMOTE_MODULE_PICKLED_ATTRIBUTES`` or "
  417. "``_REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING``.".format(k)
  418. )
  419. def _install_generated_methods(self):
  420. for method in self.generated_methods:
  421. method_name = method.__name__
  422. method = torch.jit.export(method)
  423. setattr(self, method_name, types.MethodType(method, self))
  424. @staticmethod
  425. def init_from_module_rref(
  426. remote_device: str,
  427. module_rref: rpc.RRef[nn.Module],
  428. _module_interface_cls: Any = None,
  429. ):
  430. """
  431. Besides the constructor, a RemoteModule instance can also be initialized given a module RRef.
  432. This alternate initiailization method can be particularly useful if we want to create multiple
  433. RemoteModule instances that share the same underlying module and reduce memory consumption.
  434. Moreover, this also provides a workaround for passing script RemoteModule over RPC,
  435. which is not supported. The recommended way is as follows:
  436. 1. the sender creates a RemoteModule;
  437. 2. the sender sends its ``module_rref`` over RPC;
  438. 3. the receiver calls this method to initialize another RemoteModule using the same ``module_rref``.
  439. Example::
  440. Run the following code in two different processes:
  441. >>> # xdoctest: +SKIP("distributed")
  442. >>> # On worker 0:
  443. >>> import torch
  444. >>> import torch.distributed.rpc as rpc
  445. >>> from torch import nn, Tensor
  446. >>> from torch.distributed.nn.api.remote_module import RemoteModule
  447. >>>
  448. >>> rpc.init_rpc("worker0", rank=0, world_size=2)
  449. >>> remote_module = RemoteModule(
  450. >>> "worker1/cpu", nn.Linear, args=(20, 30),
  451. >>> )
  452. >>>
  453. >>> remote_module1 = rpc.rpc_sync(
  454. >>> "worker1/cpu",
  455. >>> RemoteModule.init_from_module_rref,
  456. >>> ("worker1/cpu", remote_module1.get_module_rref()),
  457. >>> )
  458. >>> rpc.shutdown()
  459. >>> # On worker 1:
  460. >>> import torch
  461. >>> import torch.distributed.rpc as rpc
  462. >>>
  463. >>> rpc.init_rpc("worker1", rank=1, world_size=2)
  464. >>> rpc.shutdown()
  465. Args:
  466. remote_device (str): Device on the destination worker where we'd like to place this module.
  467. The device can be a local device or a remote device specified by one of the following remote
  468. formats:
  469. 1. "rank:<rank>/<device>" (ex: "rank:0/cuda:0").
  470. 2. "<worker_name>/<device>" (ex: "trainer0/cuda:0").
  471. In addition, the device field can be optional and the default value is "cpu".
  472. module_rref (RRef[nn.Module]): The module reference shared by both the caller and
  473. the created remote module.
  474. _module_interface_cls (type, optional): The TorchScript interface type for the module
  475. to be created. The type object should be decorated by @torch.jit.interface.
  476. If not provided, the generated RemoteModule is not torchscript-able.
  477. Warning, this is an experimental API and susceptible to frequent changes.
  478. Returns:
  479. A remote module instance which wraps the :class:`~nn.Module` created by the
  480. user-provided ``module_rref``, it has a blocking ``forward`` method and an
  481. asynchronous ``forward_async`` method that returns a future of the ``forward`` call
  482. on the user-provided module on the remote side.
  483. """
  484. # NOTE: if a new attribute is added to this class, also need to add it
  485. # to ``_REMOTE_MODULE_PICKLED_ATTRIBUTES`` for pickling/unpickling.
  486. remote_module = object.__new__(RemoteModule)
  487. enable_moving_cpu_tensors_to_cuda = remote_module._prepare_init(remote_device)
  488. if _module_interface_cls is not None:
  489. # Users reply on this field to know if this generated RemoteModule is TorchScript-able.
  490. remote_module.is_scriptable = True
  491. remote_module._init_template(
  492. _module_interface_cls, enable_moving_cpu_tensors_to_cuda
  493. )
  494. else:
  495. remote_module.is_scriptable = False
  496. remote_module.generated_methods = (
  497. _NON_SCRIPTABLE_REMOTE_MODULE_MODULE._generated_methods
  498. )
  499. remote_module.module_rref = module_rref
  500. remote_module._install_generated_methods()
  501. remote_module._check_attribute_picklability()
  502. return remote_module
  503. class RemoteModule(_RemoteModule):
  504. """
  505. A RemoteModule instance can only be created after RPC initialization.
  506. It creates a user-specified module on a specified remote node.
  507. It behaves like a regular ``nn.Module`` except that the ``forward`` method is
  508. executed on the remote node.
  509. It takes care of autograd recording to ensure the backward pass propagates
  510. gradients back to the corresponding remote module.
  511. It generates two methods ``forward_async`` and ``forward`` based on the
  512. signature of the ``forward`` method of ``module_cls``. ``forward_async``
  513. runs asynchronously and returns a Future. The arguments of ``forward_async``
  514. and ``forward`` are the same as the ``forward`` method of the module
  515. returned by the ``module_cls``.
  516. For example, if ``module_cls`` returns an instance of ``nn.Linear``,
  517. that has ``forward`` method signature: ``def forward(input: Tensor) -> Tensor:``,
  518. the generated ``RemoteModule`` will have 2 methods with the signatures:
  519. | ``def forward(input: Tensor) -> Tensor:``
  520. | ``def forward_async(input: Tensor) -> Future[Tensor]:``
  521. Args:
  522. remote_device (str): Device on the destination worker where we'd like to place this module.
  523. The format should be "<workername>/<device>", where the device field can be parsed as torch.device type.
  524. E.g., "trainer0/cpu", "trainer0", "ps0/cuda:0".
  525. In addition, the device field can be optional and the default value is "cpu".
  526. module_cls (nn.Module): Class for the module to be created remotely. For example,
  527. >>> class MyModule(nn.Module):
  528. >>> def forward(input):
  529. >>> return input + 1
  530. >>>
  531. >>> module_cls = MyModule
  532. args (Sequence, optional): args to be passed to ``module_cls``.
  533. kwargs (Dict, optional): kwargs to be passed to ``module_cls``.
  534. Returns:
  535. A remote module instance which wraps the :class:`~nn.Module` created by the
  536. user-provided ``module_cls``, it has a blocking ``forward`` method and an
  537. asynchronous ``forward_async`` method that returns a future of the ``forward`` call
  538. on the user-provided module on the remote side.
  539. Example::
  540. Run the following code in two different processes:
  541. >>> # xdoctest: +SKIP("distributed")
  542. >>> # On worker 0:
  543. >>> import torch
  544. >>> import torch.distributed.rpc as rpc
  545. >>> from torch import nn, Tensor
  546. >>> from torch.distributed.nn.api.remote_module import RemoteModule
  547. >>>
  548. >>> rpc.init_rpc("worker0", rank=0, world_size=2)
  549. >>> remote_linear_module = RemoteModule(
  550. >>> "worker1/cpu", nn.Linear, args=(20, 30),
  551. >>> )
  552. >>> input = torch.randn(128, 20)
  553. >>> ret_fut = remote_linear_module.forward_async(input)
  554. >>> ret = ret_fut.wait()
  555. >>> rpc.shutdown()
  556. >>> # On worker 1:
  557. >>> import torch
  558. >>> import torch.distributed.rpc as rpc
  559. >>>
  560. >>> rpc.init_rpc("worker1", rank=1, world_size=2)
  561. >>> rpc.shutdown()
  562. Furthermore, a more practical example that is combined with
  563. `DistributedDataParallel <https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel>`__ (DDP)
  564. can be found in this `tutorial <https://pytorch.org/tutorials/advanced/rpc_ddp_tutorial.html>`__.
  565. """
  566. def __init__(
  567. self,
  568. remote_device: str,
  569. module_cls: Type[nn.Module],
  570. args: Tuple = None,
  571. kwargs: Dict[str, Any] = None,
  572. ):
  573. super().__init__(remote_device, module_cls, args, kwargs)
  574. def _remote_module_receiver(
  575. *remote_module_pickled_attrs,
  576. ):
  577. """
  578. Deserializes a RemoteModule.
  579. """
  580. serialized_remote_module = _SerializedRemoteModule._make(
  581. remote_module_pickled_attrs
  582. )
  583. m = object.__new__(RemoteModule)
  584. m.__dict__.update(serialized_remote_module._asdict())
  585. # Unpickling the attribute `module_rref` must invoke RRef's `_deserialize()` method.
  586. m.module_rref = rpc.PyRRef._deserialize(m.module_rref)
  587. # Install generated methods when unpickled.
  588. for method in m.generated_methods:
  589. method_name = method.__name__
  590. method = torch.jit.export(method)
  591. setattr(m, method_name, types.MethodType(method, m))
  592. return m
  593. def _remote_module_reducer(remote_module):
  594. """
  595. Serializes a RemoteModule.
  596. """
  597. pickled_attrs = {}
  598. for k, v in remote_module.__dict__.items():
  599. # Pickling the attribute `module_rref` must invoke RRef's `_serialize()` method.
  600. if k == "module_rref":
  601. pickled_attrs[k] = v._serialize()
  602. elif k in _REMOTE_MODULE_PICKLED_ATTRIBUTES:
  603. pickled_attrs[k] = v
  604. # Check if unpickled attributes are all in _REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING.
  605. elif k not in _REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING:
  606. print(
  607. "The new attribute ``{}`` of RemoteModule is ignored during RPC pickling. "
  608. "To pickle this attribute, please add it to ``_REMOTE_MODULE_PICKLED_ATTRIBUTES``. "
  609. "Otherwise, please explicitly add it to ``_REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING``.".format(
  610. k
  611. ),
  612. file=sys.stderr,
  613. )
  614. return (
  615. _remote_module_receiver,
  616. tuple(pickled_attrs.values()),
  617. )
  618. def _recursive_script_module_receiver(
  619. recursive_script_module_serialized,
  620. ):
  621. """
  622. Deserializes a RecursiveScirptModule that does not contain a script RemoteModule.
  623. """
  624. f = io.BytesIO(recursive_script_module_serialized)
  625. m = torch.jit.load(f)
  626. return m
  627. def _recursive_script_module_reducer(recursive_script_module):
  628. """
  629. Serializes a RecursiveScirptModule that does not contain a script RemoteModule,
  630. and raises an error otherwise.
  631. """
  632. if hasattr(recursive_script_module._c, "module_rref"):
  633. raise RuntimeError(
  634. "Passing a script RemoteModule over RPC is not supported. Please create a RemoteModule in the sender, "
  635. "send the `module_rref` to the receiver, and create a new instance on the receiver end by passing this `module_rref`."
  636. )
  637. f = io.BytesIO()
  638. torch.jit.save(recursive_script_module, f)
  639. return (_recursive_script_module_receiver, (f.getvalue(),))
  640. _internal_rpc_pickler._register_reducer(RemoteModule, _remote_module_reducer)
  641. _internal_rpc_pickler._register_reducer(
  642. torch.jit.RecursiveScriptModule, _recursive_script_module_reducer
  643. )