// CUDADeviceAssertionHost.h
// Host-side support for CUDA Device-Side Assertions (DSA).
  1. #pragma once
  2. #include <c10/cuda/CUDAMacros.h>
  3. #include <memory>
  4. #include <mutex>
  5. #include <string>
  6. #include <vector>
  7. #ifdef USE_CUDA
  8. #define TORCH_USE_CUDA_DSA
  9. #endif
  10. /// Number of assertion failure messages we can store. If this is too small
  11. /// threads will fail silently.
  12. constexpr int C10_CUDA_DSA_ASSERTION_COUNT = 10;
  13. constexpr int C10_CUDA_DSA_MAX_STR_LEN = 512;
namespace c10 {
namespace cuda {

/// Holds information about a single device-side assertion that failed.
/// Held in managed memory and accessed by both the CPU and the GPU.
struct DeviceAssertionData {
  /// Stringification of the assertion
  char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN];
  /// File the assertion was in
  char filename[C10_CUDA_DSA_MAX_STR_LEN];
  /// Name of the function the assertion was in
  char function_name[C10_CUDA_DSA_MAX_STR_LEN];
  /// Line number the assertion was at
  int line_number;
  /// Number uniquely identifying the kernel launch that triggered the
  /// assertion
  uint32_t caller;
  /// block_id of the thread that failed the assertion
  int32_t block_id[3];
  /// thread_id of the thread that failed the assertion
  int32_t thread_id[3];
};
/// Used to hold assertions generated by the device.
/// Held in managed memory and accessed by both the CPU and the GPU.
struct DeviceAssertionsData {
  /// Total number of assertions found; a subset of these will be recorded
  /// in `assertions` (at most C10_CUDA_DSA_ASSERTION_COUNT entries)
  int32_t assertion_count;
  /// An array of assertions that will be written to in a race-free manner
  DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT];
};
/// Used to hold info about kernel launches so that we can run kernels
/// asynchronously and still associate launches with device-side
/// assertion failures.
struct CUDAKernelLaunchInfo {
  /// Filename of the code where the kernel was launched from
  const char* launch_filename;
  /// Function from which the kernel was launched
  const char* launch_function;
  /// Line number of where the code was launched from
  uint32_t launch_linenum;
  /// Backtrace of where the kernel was launched from, only populated if
  /// CUDAKernelLaunchRegistry::gather_launch_stacktrace is True
  std::string launch_stacktrace;
  /// Kernel that was launched
  const char* kernel_name;
  /// Device the kernel was launched on
  int device;
  /// Stream the kernel was launched on
  int32_t stream;
  /// A number that uniquely identifies the kernel launch
  uint64_t generation_number;
};
/// Circular buffer used to hold information about kernel launches.
/// This is later used to reconstruct how a device-side kernel assertion
/// failure occurred. CUDAKernelLaunchRegistry is used as a singleton.
class C10_CUDA_API CUDAKernelLaunchRegistry {
 private:
  /// Assume that this is the max number of kernel launches that might ever be
  /// enqueued across all streams on a single device
  static constexpr int max_kernel_launches = 1024;
  /// How many kernel launch infos we've inserted. Used to ensure that circular
  /// queue doesn't provide false information by always increasing, but also to
  /// mark where we are inserting into the queue
#ifdef TORCH_USE_CUDA_DSA
  uint64_t generation_number = 0;
#endif
  /// Shared mutex between writer and accessor to ensure multi-threaded safety.
  mutable std::mutex read_write_mutex;
  /// Used to prevent race conditions in GPU memory allocation
  mutable std::mutex gpu_alloc_mutex;
  /// Pointer to managed memory keeping track of device-side assertions. There
  /// is one entry for each possible device the process might work with. Unused
  /// entries are nullptrs. We could also use an unordered_set here, but this
  /// vector design will be faster and the wasted memory is small since we
  /// expect the number of GPUs per node will always be small.
  /// NOTE: the custom deleter frees the managed allocation; see the .cpp.
  std::vector<
      std::unique_ptr<DeviceAssertionsData, void (*)(DeviceAssertionsData*)>>
      uvm_assertions;
  /// A single circular buffer holds information about every kernel launch the
  /// process makes across all devices.
  std::vector<CUDAKernelLaunchInfo> kernel_launches;
  /// Reads an environment variable to decide whether to collect launch
  /// stacktraces (see gather_launch_stacktrace); defined in the .cpp.
  bool check_env_for_enable_launch_stacktracing() const;
  /// Reads an environment variable to decide whether host-side DSA is
  /// enabled at run-time (see enabled_at_runtime); defined in the .cpp.
  bool check_env_for_dsa_enabled() const;

 public:
  CUDAKernelLaunchRegistry();
  /// Register a new kernel launch and obtain a generation number back to be
  /// passed to the kernel
  uint32_t insert(
      const char* launch_filename,
      const char* launch_function,
      const uint32_t launch_linenum,
      const char* kernel_name,
      const int32_t stream_id);
  /// Get copies of the kernel launch registry and each device's assertion
  /// failure buffer so they can be inspected without raising race conditions
  std::
      pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>>
      snapshot() const;
  /// Get a pointer to the current device's assertion failure buffer. If no such
  /// buffer exists then one is created. This means that the first kernel launch
  /// made on each device will be slightly slower because memory allocations are
  /// required
  DeviceAssertionsData* get_uvm_assertions_ptr_for_current_device();
  /// Gets the global singleton of the registry
  static CUDAKernelLaunchRegistry& get_singleton_ref();
  /// If not all devices support DSA, we disable it
  const bool do_all_devices_support_managed_memory = false;
  /// Whether or not to gather stack traces when launching kernels
  bool gather_launch_stacktrace = false;
  /// Whether or not host-side DSA is enabled or disabled at run-time
  /// Note: Device-side code cannot be enabled/disabled at run-time
  bool enabled_at_runtime = false;
  /// Whether or not a device has indicated a failure
  bool has_failed() const;
#ifdef TORCH_USE_CUDA_DSA
  const bool enabled_at_compile_time = true;
#else
  const bool enabled_at_compile_time = false;
#endif
};
/// Builds a string describing recorded device-side assertion failures.
/// NOTE(review): declaration only — exact report format is determined by the
/// implementation in the corresponding .cpp file.
std::string c10_retrieve_device_side_assertion_info();

} // namespace cuda
} // namespace c10
// Each kernel launched with TORCH_DSA_KERNEL_LAUNCH
// requires the same input arguments. We introduce the following macro to
// standardize these: a pointer to the managed-memory assertion buffer and
// the generation number identifying this particular launch.
#define TORCH_DSA_KERNEL_ARGS                                              \
  [[maybe_unused]] c10::cuda::DeviceAssertionsData *const assertions_data, \
      [[maybe_unused]] uint32_t assertion_caller_id

// This macro can be used to pass the DSA arguments onward to another
// function
#define TORCH_DSA_KERNEL_ARGS_PASS assertions_data, assertion_caller_id