// CachingHostAllocator.h (1.4 KB)
  1. #pragma once
  2. #include <c10/core/Allocator.h>
  3. #include <c10/cuda/CUDAStream.h>
  4. namespace at {
  5. namespace cuda {
  6. //
  7. // A caching allocator for CUDA host allocations (pinned memory).
  8. //
  9. // This provides a drop-in replacement for THCudaHostAllocator, which re-uses
  10. // freed pinned (page-locked) memory allocations. This avoids device
  11. // synchronizations due to cudaFreeHost calls.
  12. //
  13. // To ensure correct behavior, THCCachingHostAllocator_recordEvent must be
  14. // called anytime a pointer from this allocator is used in a cudaMemcpyAsync
  15. // call between host and device, and passed the corresponding context from the
  16. // allocation. This is currently invoked by at::native::copy_kernel_cuda.
  17. //
  18. // Note that this allocator does not split larger allocations into smaller
  19. // blocks, unlike the caching device allocator.
  20. //
  21. TORCH_CUDA_CPP_API c10::Allocator* getCachingHostAllocator();
  22. // Records an event in the specified stream. The allocation corresponding to the
  23. // input `ptr`/`ctx` will not be re-used until the event has occurred.
  24. TORCH_CUDA_CPP_API bool
  25. CachingHostAllocator_recordEvent(void* ptr, void* ctx, c10::cuda::CUDAStream stream);
  26. // Releases cached pinned memory allocations via cudaHostFree
  27. TORCH_CUDA_CPP_API void CachingHostAllocator_emptyCache();
  28. inline TORCH_CUDA_CPP_API at::DataPtr HostAlloc(size_t size) {
  29. return getCachingHostAllocator()->allocate(size);
  30. }
  31. } // namespace cuda
  32. } // namespace at