CUDADeviceAssertion.h 4.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. #pragma once
  2. #include <c10/cuda/CUDAException.h>
  3. #include <c10/macros/Macros.h>
  4. namespace c10 {
  5. namespace cuda {
  6. #ifdef TORCH_USE_CUDA_DSA
  7. // Copy string from `src` to `dst`
  8. static __device__ void dstrcpy(char* dst, const char* src) {
  9. int i = 0;
  10. // Copy string from source to destination, ensuring that it
  11. // isn't longer than `C10_CUDA_DSA_MAX_STR_LEN-1`
  12. while (*src != '\0' && i++ < C10_CUDA_DSA_MAX_STR_LEN - 1) {
  13. *dst++ = *src++;
  14. }
  15. *dst = '\0';
  16. }
  17. static __device__ void dsa_add_new_assertion_failure(
  18. DeviceAssertionsData* assertions_data,
  19. const char* assertion_msg,
  20. const char* filename,
  21. const char* function_name,
  22. const int line_number,
  23. const uint32_t caller,
  24. const dim3 block_id,
  25. const dim3 thread_id) {
  26. // `assertions_data` may be nullptr if device-side assertion checking
  27. // is disabled at run-time. If it is disabled at compile time this
  28. // function will never be called
  29. if (!assertions_data) {
  30. return;
  31. }
  32. // Atomically increment so other threads can fail at the same time
  33. // Note that incrementing this means that the CPU can observe that
  34. // a failure has happened and can begin to respond before we've
  35. // written information about that failure out to the buffer.
  36. const auto nid = atomicAdd(&(assertions_data->assertion_count), 1);
  37. if (nid >= C10_CUDA_DSA_ASSERTION_COUNT) {
  38. // At this point we're ran out of assertion buffer space.
  39. // We could print a message about this, but that'd get
  40. // spammy if a lot of threads did it, so we just silently
  41. // ignore any other assertion failures. In most cases the
  42. // failures will all probably be analogous anyway.
  43. return;
  44. }
  45. // Write information about the assertion failure to memory.
  46. // Note that this occurs only after the `assertion_count`
  47. // increment broadcasts that there's been a problem.
  48. auto& self = assertions_data->assertions[nid];
  49. dstrcpy(self.assertion_msg, assertion_msg);
  50. dstrcpy(self.filename, filename);
  51. dstrcpy(self.function_name, function_name);
  52. self.line_number = line_number;
  53. self.caller = caller;
  54. self.block_id[0] = block_id.x;
  55. self.block_id[1] = block_id.y;
  56. self.block_id[2] = block_id.z;
  57. self.thread_id[0] = thread_id.x;
  58. self.thread_id[1] = thread_id.y;
  59. self.thread_id[2] = thread_id.z;
  60. }
  61. // Emulates a kernel assertion. The assertion won't stop the kernel's progress,
  62. // so you should assume everything the kernel produces is garbage if there's an
  63. // assertion failure.
  64. // NOTE: This assumes that `assertions_data` and `assertion_caller_id` are
  65. // arguments of the kernel and therefore accessible.
  66. #define CUDA_KERNEL_ASSERT2(condition) \
  67. do { \
  68. if (C10_UNLIKELY(!(condition))) { \
  69. /* Has an atomic element so threads can fail at the same time */ \
  70. c10::cuda::dsa_add_new_assertion_failure( \
  71. assertions_data, \
  72. C10_STRINGIZE(condition), \
  73. __FILE__, \
  74. __FUNCTION__, \
  75. __LINE__, \
  76. assertion_caller_id, \
  77. blockIdx, \
  78. threadIdx); \
  79. /* Now that the kernel has failed we early exit the kernel, but */ \
  80. /* otherwise keep going and rely on the host to check UVM and */ \
  81. /* determine we've had a problem */ \
  82. return; \
  83. } \
  84. } while (false)
  85. #else
  86. #define CUDA_KERNEL_ASSERT2(condition) assert(condition)
  87. #endif
  88. } // namespace cuda
  89. } // namespace c10