api.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. #!/usr/bin/env python3
  2. # Copyright (c) Facebook, Inc. and its affiliates.
  3. # All rights reserved.
  4. #
  5. # This source code is licensed under the BSD-style license found in the
  6. # LICENSE file in the root directory of this source tree.
  7. import json
  8. from dataclasses import asdict, dataclass, field
  9. from enum import Enum
  10. from typing import Dict, Union, Optional
  11. __all__ = ['EventSource', 'Event', 'NodeState', 'RdzvEvent']
  12. EventMetadataValue = Union[str, int, float, bool, None]
  13. class EventSource(str, Enum):
  14. """
  15. Known identifiers of the event producers.
  16. """
  17. AGENT = "AGENT"
  18. WORKER = "WORKER"
  19. @dataclass
  20. class Event:
  21. """
  22. The class represents the generic event that occurs during the torchelastic
  23. job execution. The event can be any kind of meaningful action.
  24. Args:
  25. name: event name.
  26. source: the event producer, e.g. agent or worker
  27. timestamp: timestamp in milliseconds when event occured.
  28. metadata: additional data that is associated with the event.
  29. """
  30. name: str
  31. source: EventSource
  32. timestamp: int = 0
  33. metadata: Dict[str, EventMetadataValue] = field(default_factory=dict)
  34. def __str__(self):
  35. return self.serialize()
  36. @staticmethod
  37. def deserialize(data: Union[str, "Event"]) -> "Event":
  38. if isinstance(data, Event):
  39. return data
  40. if isinstance(data, str):
  41. data_dict = json.loads(data)
  42. data_dict["source"] = EventSource[data_dict["source"]]
  43. return Event(**data_dict)
  44. def serialize(self) -> str:
  45. return json.dumps(asdict(self))
  46. class NodeState(str, Enum):
  47. """
  48. The states that a node can be in rendezvous.
  49. """
  50. INIT = "INIT"
  51. RUNNING = "RUNNING"
  52. SUCCEEDED = "SUCCEEDED"
  53. FAILED = "FAILED"
  54. @dataclass
  55. class RdzvEvent:
  56. """
  57. Dataclass to represent any rendezvous event.
  58. Args:
  59. name: Event name. (E.g. Current action being performed)
  60. run_id: The run id of the rendezvous
  61. message: The message describing the event
  62. hostname: Hostname of the node
  63. pid: The process id of the node
  64. node_state: The state of the node (INIT, RUNNING, SUCCEEDED, FAILED)
  65. master_endpoint: The master endpoint for the rendezvous store, if known
  66. rank: The rank of the node, if known
  67. local_id: The local_id of the node, if defined in dynamic_rendezvous.py
  68. error_trace: Error stack trace, if this is an error event.
  69. """
  70. name: str
  71. run_id: str
  72. message: str
  73. hostname: str
  74. pid: int
  75. node_state: NodeState
  76. master_endpoint: str = ""
  77. rank: Optional[int] = None
  78. local_id: Optional[int] = None
  79. error_trace: str = ""
  80. def __str__(self):
  81. return self.serialize()
  82. @staticmethod
  83. def deserialize(data: Union[str, "RdzvEvent"]) -> "RdzvEvent":
  84. if isinstance(data, RdzvEvent):
  85. return data
  86. if isinstance(data, str):
  87. data_dict = json.loads(data)
  88. data_dict["node_state"] = NodeState[data_dict["node_state"]]
  89. return RdzvEvent(**data_dict)
  90. def serialize(self) -> str:
  91. return json.dumps(asdict(self))