- #pragma once
- #include <c10/cuda/CUDAMacros.h>
- #include <memory>
- #include <mutex>
- #include <string>
- #include <vector>
#ifdef USE_CUDA
// NOTE(review): this unconditionally turns on device-side assertions (DSA)
// whenever CUDA is in use — confirm this is intended rather than leaving
// TORCH_USE_CUDA_DSA to be supplied by the build system.
#define TORCH_USE_CUDA_DSA
#endif
/// Number of assertion failure messages we can store. If this is too small
/// threads will fail silently.
constexpr int C10_CUDA_DSA_ASSERTION_COUNT = 10;
/// Maximum length of the fixed-size strings stored per assertion failure
/// (assertion text, filename, function name) in DeviceAssertionData.
constexpr int C10_CUDA_DSA_MAX_STR_LEN = 512;
- namespace c10 {
- namespace cuda {
- /// Holds information about any device-side assertions that fail.
- /// Held in managed memory and access by both the CPU and the GPU.
/// Holds information about a single device-side assertion failure.
/// Held in managed memory and accessed by both the CPU and the GPU;
/// field layout is therefore shared across host and device compilation.
struct DeviceAssertionData {
  /// Stringification of the assertion
  char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN];
  /// File the assertion was in
  char filename[C10_CUDA_DSA_MAX_STR_LEN];
  /// Name of the function the assertion was in
  char function_name[C10_CUDA_DSA_MAX_STR_LEN];
  /// Line number the assertion was at
  int line_number;
  /// Number uniquely identifying the kernel launch that triggered the
  /// assertion (see CUDAKernelLaunchRegistry::insert)
  uint32_t caller;
  /// block_id of the thread that failed the assertion
  int32_t block_id[3];
  /// thread_id of the thread that failed the assertion
  int32_t thread_id[3];
};
- /// Used to hold assertions generated by the device
- /// Held in managed memory and access by both the CPU and the GPU.
/// Used to hold assertions generated by the device.
/// Held in managed memory and accessed by both the CPU and the GPU.
struct DeviceAssertionsData {
  /// Total number of assertions found; a subset of these will be recorded
  /// in `assertions` (at most C10_CUDA_DSA_ASSERTION_COUNT)
  int32_t assertion_count;
  /// An array of assertions that will be written to in a race-free manner
  DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT];
};
- /// Use to hold info about kernel launches so that we can run kernels
- /// asynchronously and still associate launches with device-side
- /// assertion failures
/// Used to hold info about kernel launches so that we can run kernels
/// asynchronously and still associate launches with device-side
/// assertion failures.
struct CUDAKernelLaunchInfo {
  /// Filename of the code where the kernel was launched from
  const char* launch_filename;
  /// Function from which the kernel was launched
  const char* launch_function;
  /// Line number of where the code was launched from
  uint32_t launch_linenum;
  /// Backtrace of where the kernel was launched from, only populated if
  /// CUDAKernelLaunchRegistry::gather_launch_stacktrace is True
  std::string launch_stacktrace;
  /// Kernel that was launched
  const char* kernel_name;
  /// Device the kernel was launched on
  int device;
  /// Stream the kernel was launched on
  int32_t stream;
  /// A number that uniquely identifies the kernel launch
  uint64_t generation_number;
};
- /// Circular buffer used to hold information about kernel launches
- /// this is later used to reconstruct how a device-side kernel assertion failure
- /// occurred CUDAKernelLaunchRegistry is used as a singleton
/// Circular buffer used to hold information about kernel launches.
/// This is later used to reconstruct how a device-side kernel assertion
/// failure occurred. CUDAKernelLaunchRegistry is used as a singleton.
class C10_CUDA_API CUDAKernelLaunchRegistry {
 private:
  /// Assume that this is the max number of kernel launches that might ever be
  /// enqueued across all streams on a single device
  static constexpr int max_kernel_launches = 1024;
  /// How many kernel launch infos we've inserted. Used to ensure that circular
  /// queue doesn't provide false information by always increasing, but also to
  /// mark where we are inserting into the queue
#ifdef TORCH_USE_CUDA_DSA
  uint64_t generation_number = 0;
#endif
  /// Shared mutex between writer and accessor to ensure multi-threaded safety.
  mutable std::mutex read_write_mutex;
  /// Used to prevent race conditions in GPU memory allocation
  mutable std::mutex gpu_alloc_mutex;
  /// Pointer to managed memory keeping track of device-side assertions. There
  /// is one entry for each possible device the process might work with. Unused
  /// entries are nullptrs. We could also use an unordered_set here, but this
  /// vector design will be faster and the wasted memory is small since we
  /// expect the number of GPUs per node will always be small
  std::vector<
      std::unique_ptr<DeviceAssertionsData, void (*)(DeviceAssertionsData*)>>
      uvm_assertions;
  /// A single circular buffer holds information about every kernel launch the
  /// process makes across all devices.
  std::vector<CUDAKernelLaunchInfo> kernel_launches;
  /// Checks the environment to decide whether launch stacktraces should be
  /// gathered (implementation elsewhere).
  bool check_env_for_enable_launch_stacktracing() const;
  /// Checks the environment to decide whether DSA is enabled at run-time
  /// (implementation elsewhere).
  bool check_env_for_dsa_enabled() const;

 public:
  CUDAKernelLaunchRegistry();
  /// Register a new kernel launch and obtain a generation number back to be
  /// passed to the kernel.
  /// NOTE(review): returns uint32_t while `generation_number` is uint64_t —
  /// presumably the id is narrowed for device-side storage (cf.
  /// DeviceAssertionData::caller); confirm truncation is intended.
  uint32_t insert(
      const char* launch_filename,
      const char* launch_function,
      const uint32_t launch_linenum,
      const char* kernel_name,
      const int32_t stream_id);
  /// Get copies of the kernel launch registry and each device's assertion
  /// failure buffer so they can be inspected without raising race conditions
  std::
      pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>>
      snapshot() const;
  /// Get a pointer to the current device's assertion failure buffer. If no such
  /// buffer exists then one is created. This means that the first kernel launch
  /// made on each device will be slightly slower because memory allocations are
  /// required
  DeviceAssertionsData* get_uvm_assertions_ptr_for_current_device();
  /// Gets the global singleton of the registry
  static CUDAKernelLaunchRegistry& get_singleton_ref();
  /// If not all devices support DSA, we disable it
  const bool do_all_devices_support_managed_memory = false;
  /// Whether or not to gather stack traces when launching kernels
  bool gather_launch_stacktrace = false;
  /// Whether or not host-side DSA is enabled or disabled at run-time
  /// Note: Device-side code cannot be enabled/disabled at run-time
  bool enabled_at_runtime = false;
  /// Whether or not a device has indicated a failure
  bool has_failed() const;
#ifdef TORCH_USE_CUDA_DSA
  const bool enabled_at_compile_time = true;
#else
  const bool enabled_at_compile_time = false;
#endif
};
/// Retrieves a human-readable summary of recorded device-side assertion
/// failures (declaration only; implementation elsewhere).
std::string c10_retrieve_device_side_assertion_info();
- } // namespace cuda
- } // namespace c10
// Each kernel launched with TORCH_DSA_KERNEL_LAUNCH
// requires the same input arguments. We introduce the following macro to
// standardize these. Marked [[maybe_unused]] so kernels that never assert
// compile without warnings.
#define TORCH_DSA_KERNEL_ARGS                                              \
  [[maybe_unused]] c10::cuda::DeviceAssertionsData *const assertions_data, \
  [[maybe_unused]] uint32_t assertion_caller_id
// This macro can be used to pass the DSA arguments (as declared by
// TORCH_DSA_KERNEL_ARGS) onward to another function.
#define TORCH_DSA_KERNEL_ARGS_PASS assertions_data, assertion_caller_id
|