__init__.py 1.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344
  1. # Copyright (c) Facebook, Inc. and its affiliates.
  2. # All rights reserved.
  3. #
  4. # This source code is licensed under the BSD-style license found in the
  5. # LICENSE file in the root directory of this source tree.
  6. """
  7. Expiration timers are set up on the same process as the agent and
  8. used from your script to deal with stuck workers. When you go into
  9. a code-block that has the potential to get stuck you can acquire
  10. an expiration timer, which instructs the timer server to kill the
  11. process if it does not release the timer by the self-imposed expiration
  12. deadline.
  13. Usage::
  14. import torchelastic.timer as timer
  15. import torchelastic.agent.server as agent
  16. def main():
  17. start_method = "spawn"
  18. message_queue = mp.get_context(start_method).Queue()
  19. server = timer.LocalTimerServer(message_queue, max_interval=0.01)
  20. server.start() # non-blocking
  21. spec = WorkerSpec(
  22. fn=trainer_func,
  23. args=(message_queue,),
  24. ...<OTHER_PARAMS...>)
  25. elastic_agent = agent.LocalElasticAgent(spec, start_method)
  26. elastic_agent.run()
  27. def trainer_func(message_queue):
  28. timer.configure(timer.LocalTimerClient(message_queue))
  29. with timer.expires(after=60): # 60 second expiry
  30. # do some work
  31. In the example above if ``trainer_func`` takes more than 60 seconds to
  32. complete, then the worker process is killed and the agent retries the worker group.
  33. """
  34. from .api import TimerClient, TimerRequest, TimerServer, configure, expires # noqa: F401
  35. from .local_timer import LocalTimerClient, LocalTimerServer # noqa: F401
  36. from .file_based_local_timer import FileTimerClient, FileTimerServer, FileTimerRequest # noqa: F401