1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798 |
- #pragma once
- #include <c10/cuda/CUDAException.h>
- #include <c10/macros/Macros.h>
- namespace c10 {
- namespace cuda {
- #ifdef TORCH_USE_CUDA_DSA
- // Copy string from `src` to `dst`
- static __device__ void dstrcpy(char* dst, const char* src) {
- int i = 0;
- // Copy string from source to destination, ensuring that it
- // isn't longer than `C10_CUDA_DSA_MAX_STR_LEN-1`
- while (*src != '\0' && i++ < C10_CUDA_DSA_MAX_STR_LEN - 1) {
- *dst++ = *src++;
- }
- *dst = '\0';
- }
- static __device__ void dsa_add_new_assertion_failure(
- DeviceAssertionsData* assertions_data,
- const char* assertion_msg,
- const char* filename,
- const char* function_name,
- const int line_number,
- const uint32_t caller,
- const dim3 block_id,
- const dim3 thread_id) {
- // `assertions_data` may be nullptr if device-side assertion checking
- // is disabled at run-time. If it is disabled at compile time this
- // function will never be called
- if (!assertions_data) {
- return;
- }
- // Atomically increment so other threads can fail at the same time
- // Note that incrementing this means that the CPU can observe that
- // a failure has happened and can begin to respond before we've
- // written information about that failure out to the buffer.
- const auto nid = atomicAdd(&(assertions_data->assertion_count), 1);
- if (nid >= C10_CUDA_DSA_ASSERTION_COUNT) {
- // At this point we're ran out of assertion buffer space.
- // We could print a message about this, but that'd get
- // spammy if a lot of threads did it, so we just silently
- // ignore any other assertion failures. In most cases the
- // failures will all probably be analogous anyway.
- return;
- }
- // Write information about the assertion failure to memory.
- // Note that this occurs only after the `assertion_count`
- // increment broadcasts that there's been a problem.
- auto& self = assertions_data->assertions[nid];
- dstrcpy(self.assertion_msg, assertion_msg);
- dstrcpy(self.filename, filename);
- dstrcpy(self.function_name, function_name);
- self.line_number = line_number;
- self.caller = caller;
- self.block_id[0] = block_id.x;
- self.block_id[1] = block_id.y;
- self.block_id[2] = block_id.z;
- self.thread_id[0] = thread_id.x;
- self.thread_id[1] = thread_id.y;
- self.thread_id[2] = thread_id.z;
- }
- // Emulates a kernel assertion. The assertion won't stop the kernel's progress,
- // so you should assume everything the kernel produces is garbage if there's an
- // assertion failure.
- // NOTE: This assumes that `assertions_data` and `assertion_caller_id` are
- // arguments of the kernel and therefore accessible.
- #define CUDA_KERNEL_ASSERT2(condition) \
- do { \
- if (C10_UNLIKELY(!(condition))) { \
- /* Has an atomic element so threads can fail at the same time */ \
- c10::cuda::dsa_add_new_assertion_failure( \
- assertions_data, \
- C10_STRINGIZE(condition), \
- __FILE__, \
- __FUNCTION__, \
- __LINE__, \
- assertion_caller_id, \
- blockIdx, \
- threadIdx); \
- /* Now that the kernel has failed we early exit the kernel, but */ \
- /* otherwise keep going and rely on the host to check UVM and */ \
- /* determine we've had a problem */ \
- return; \
- } \
- } while (false)
- #else
- #define CUDA_KERNEL_ASSERT2(condition) assert(condition)
- #endif
- } // namespace cuda
- } // namespace c10
|