api.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270
  1. #!/usr/bin/env python3
  2. # Copyright (c) Facebook, Inc. and its affiliates.
  3. # All rights reserved.
  4. #
  5. # This source code is licensed under the BSD-style license found in the
  6. # LICENSE file in the root directory of this source tree.
  7. import sys
  8. import uuid
  9. from dataclasses import dataclass, field
  10. from typing import Any, Callable, Dict, List, Optional, Tuple, Union
  11. import torch.distributed.elastic.rendezvous.registry as rdzv_registry
  12. from torch.distributed.elastic import events, metrics
  13. from torch.distributed.elastic.agent.server.api import WorkerSpec
  14. from torch.distributed.elastic.agent.server.local_elastic_agent import LocalElasticAgent
  15. from torch.distributed.elastic.multiprocessing import SignalException, Std
  16. from torch.distributed.elastic.multiprocessing.errors import ChildFailedError
  17. from torch.distributed.elastic.rendezvous import RendezvousParameters
  18. from torch.distributed.elastic.rendezvous.utils import parse_rendezvous_endpoint
  19. from torch.distributed.elastic.utils.logging import get_logger
  20. __all__ = ['LaunchConfig', 'elastic_launch', 'launch_agent']
  21. logger = get_logger()
  22. @dataclass
  23. class LaunchConfig:
  24. """
  25. Creates a rendezvous config.
  26. Args:
  27. min_nodes: Minimum amount of nodes that the user function will
  28. be launched on. Elastic agent ensures that the user
  29. function start only when the min_nodes amount enters
  30. the rendezvous.
  31. max_nodes: Maximum amount of nodes that the user function
  32. will be launched on.
  33. nproc_per_node: On each node the elastic agent will launch
  34. this amount of workers that will execute user
  35. defined function.
  36. rdzv_backend: rdzv_backend to use in the rendezvous (zeus-adapter, etcd).
  37. rdzv_endpoint: The endpoint of the rdzv sync. storage.
  38. rdzv_configs: Key, value pair that specifies rendezvous specific configuration.
  39. rdzv_timeout: Legacy argument that specifies timeout for the rendezvous. It is going
  40. to be removed in future versions, see the note below. The default timeout is 900 seconds.
  41. run_id: The unique run id of the job (if not passed a unique one will be
  42. deduced from run environment - flow workflow id in flow - or auto generated).
  43. role: User defined role of the worker (defaults to "trainer").
  44. max_restarts: The maximum amount of restarts that elastic agent will conduct
  45. on workers before failure.
  46. monitor_interval: The interval in seconds that is used by the elastic_agent
  47. as a period of monitoring workers.
  48. start_method: The method is used by the elastic agent to start the
  49. workers (spawn, fork, forkserver).
  50. log_dir: base log directory where log files are written. If not set,
  51. one is created in a tmp dir but NOT removed on exit.
  52. redirects: configuration to redirect stdout/stderr to log files.
  53. Pass a single ``Std`` enum to redirect all workers,
  54. or a mapping keyed by local_rank to selectively redirect.
  55. tee: configuration to "tee" stdout/stderr to console + log file.
  56. metrics_cfg: configuration to initialize metrics.
  57. local_addr: address of the local node if any. If not set, a lookup on the local
  58. machine's FQDN will be performed.
  59. ..note:
  60. `rdzv_timeout` is a legacy argument that will be removed in future.
  61. Set the timeout via `rdzv_configs['timeout']`
  62. """
  63. min_nodes: int
  64. max_nodes: int
  65. nproc_per_node: int
  66. run_id: str = ""
  67. role: str = "default_role"
  68. rdzv_endpoint: str = ""
  69. rdzv_backend: str = "etcd"
  70. rdzv_configs: Dict[str, Any] = field(default_factory=dict)
  71. rdzv_timeout: int = -1
  72. max_restarts: int = 3
  73. monitor_interval: float = 30
  74. start_method: str = "spawn"
  75. log_dir: Optional[str] = None
  76. redirects: Union[Std, Dict[int, Std]] = Std.NONE
  77. tee: Union[Std, Dict[int, Std]] = Std.NONE
  78. metrics_cfg: Dict[str, str] = field(default_factory=dict)
  79. local_addr: Optional[str] = None
  80. def __post_init__(self):
  81. default_timeout = 900
  82. if self.rdzv_timeout != -1:
  83. self.rdzv_configs["timeout"] = self.rdzv_timeout
  84. elif "timeout" not in self.rdzv_configs:
  85. self.rdzv_configs["timeout"] = default_timeout
  86. class elastic_launch:
  87. """
  88. Launches an torchelastic agent on the container that invoked the entrypoint.
  89. 1. Pass the ``entrypoint`` arguments as non ``kwargs`` (e.g. no named parameters)/
  90. ``entrypoint`` can be a function or a command.
  91. 2. The return value is a map of each worker's output mapped
  92. by their respective global rank.
  93. Usage
  94. ::
  95. def worker_fn(foo):
  96. # ...
  97. def main():
  98. # entrypoint is a function.
  99. outputs = elastic_launch(LaunchConfig, worker_fn)(foo)
  100. # return rank 0's output
  101. return outputs[0]
  102. # entrypoint is a command and ``script.py`` is the python module.
  103. outputs = elastic_launch(LaunchConfig, "script.py")(args)
  104. outputs = elastic_launch(LaunchConfig, "python")("script.py")
  105. """
  106. def __init__(
  107. self,
  108. config: LaunchConfig,
  109. entrypoint: Union[Callable, str, None],
  110. ):
  111. self._config = config
  112. self._entrypoint = entrypoint
  113. def __call__(self, *args):
  114. return launch_agent(self._config, self._entrypoint, list(args))
  115. def _get_entrypoint_name(
  116. entrypoint: Union[Callable, str, None], args: List[Any]
  117. ) -> str:
  118. """Retrive entrypoint name with the rule:
  119. 1. If entrypoint is a function, use ``entrypont.__qualname__``.
  120. 2. If entrypoint is a string, check its value:
  121. 2.1 if entrypoint equals to ``sys.executable`` (like "python"), use the first element from ``args``
  122. which does not start with hifen letter (for example, "-u" will be skipped).
  123. 2.2 otherwise, use ``entrypoint`` value.
  124. 3. Otherwise, return empty string.
  125. """
  126. if isinstance(entrypoint, Callable): # type: ignore[arg-type]
  127. return entrypoint.__name__ # type: ignore[union-attr]
  128. elif isinstance(entrypoint, str):
  129. if entrypoint == sys.executable:
  130. return next((arg for arg in args if arg[0] != "-"), "")
  131. else:
  132. return entrypoint
  133. else:
  134. return ""
  135. def _get_addr_and_port(
  136. rdzv_parameters: RendezvousParameters,
  137. ) -> Tuple[Optional[str], Optional[int]]:
  138. if rdzv_parameters.backend != "static":
  139. return (None, None)
  140. endpoint = rdzv_parameters.endpoint
  141. endpoint = endpoint.strip()
  142. if not endpoint:
  143. raise ValueError(
  144. "Endpoint is missing in endpoint. Try to add --master-addr and --master-port"
  145. )
  146. master_addr, master_port = parse_rendezvous_endpoint(endpoint, default_port=-1)
  147. if master_port == -1:
  148. raise ValueError(
  149. f"port is missing in endpoint: {endpoint}. Try to specify --master-port"
  150. )
  151. return (master_addr, master_port)
  152. def launch_agent(
  153. config: LaunchConfig,
  154. entrypoint: Union[Callable, str, None],
  155. args: List[Any],
  156. ) -> Dict[int, Any]:
  157. if not config.run_id:
  158. run_id = str(uuid.uuid4().int)
  159. logger.warning(f"config has no run_id, generated a random run_id: {run_id}")
  160. config.run_id = run_id
  161. entrypoint_name = _get_entrypoint_name(entrypoint, args)
  162. logger.info(
  163. f"Starting elastic_operator with launch configs:\n"
  164. f" entrypoint : {entrypoint_name}\n"
  165. f" min_nodes : {config.min_nodes}\n"
  166. f" max_nodes : {config.max_nodes}\n"
  167. f" nproc_per_node : {config.nproc_per_node}\n"
  168. f" run_id : {config.run_id}\n"
  169. f" rdzv_backend : {config.rdzv_backend}\n"
  170. f" rdzv_endpoint : {config.rdzv_endpoint}\n"
  171. f" rdzv_configs : {config.rdzv_configs}\n"
  172. f" max_restarts : {config.max_restarts}\n"
  173. f" monitor_interval : {config.monitor_interval}\n"
  174. f" log_dir : {config.log_dir}\n"
  175. f" metrics_cfg : {config.metrics_cfg}\n"
  176. )
  177. rdzv_parameters = RendezvousParameters(
  178. backend=config.rdzv_backend,
  179. endpoint=config.rdzv_endpoint,
  180. run_id=config.run_id,
  181. min_nodes=config.min_nodes,
  182. max_nodes=config.max_nodes,
  183. local_addr=config.local_addr,
  184. **config.rdzv_configs,
  185. )
  186. master_addr, master_port = _get_addr_and_port(rdzv_parameters)
  187. spec = WorkerSpec(
  188. role=config.role,
  189. local_world_size=config.nproc_per_node,
  190. entrypoint=entrypoint,
  191. args=tuple(args),
  192. rdzv_handler=rdzv_registry.get_rendezvous_handler(rdzv_parameters),
  193. max_restarts=config.max_restarts,
  194. monitor_interval=config.monitor_interval,
  195. redirects=config.redirects,
  196. tee=config.tee,
  197. master_addr=master_addr,
  198. master_port=master_port,
  199. local_addr=config.local_addr,
  200. )
  201. agent = LocalElasticAgent(
  202. spec=spec, start_method=config.start_method, log_dir=config.log_dir
  203. )
  204. shutdown_rdzv = True
  205. try:
  206. metrics.initialize_metrics(metrics.MetricsConfig(config.metrics_cfg))
  207. result = agent.run()
  208. # records that agent.run() has succeeded NOT that workers have succeeded
  209. events.record(agent.get_event_succeeded())
  210. if result.is_failed():
  211. # ChildFailedError is treated specially by @record
  212. # if the error files for the failed children exist
  213. # @record will copy the first error (root cause)
  214. # to the error file of the launcher process.
  215. raise ChildFailedError(
  216. name=entrypoint_name,
  217. failures=result.failures,
  218. )
  219. return result.return_values
  220. except ChildFailedError:
  221. raise
  222. except SignalException:
  223. # when the agent dies with a signal do NOT shutdown the rdzv_handler
  224. # since this closes the rendezvous on this rdzv_id permanently and
  225. # prevents any additional scaling events
  226. shutdown_rdzv = False
  227. events.record(agent.get_event_failed())
  228. raise
  229. except Exception:
  230. events.record(agent.get_event_failed())
  231. raise
  232. finally:
  233. if shutdown_rdzv:
  234. spec.rdzv_handler.shutdown()