ApplyGridUtils.cuh 1.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647
  1. #include <ATen/cuda/CUDAContext.h>
  2. #include <cuda_runtime.h>
  3. namespace at { namespace cuda {
  /**
   * Ceiling division: computes ceil(a / b).
   *
   * Evaluated as (a + b - 1) / b. Adding (b - 1) to the numerator makes the
   * truncating division round up for positive integral operands. The
   * intermediate sum is not guarded against overflow, so callers passing values
   * close to the maximum of T must account for that themselves.
   *
   * Callable from both host and device code.
   */
  template <typename T>
  __host__ __device__ __forceinline__ T ATenCeilDiv(T a, T b) {
    // `auto` keeps the promoted arithmetic type of the expression, so narrow
    // integer types are not truncated before the division takes place.
    const auto biasedNumerator = a + b - 1;
    return biasedNumerator / b;
  }
  11. namespace {
  // Launch-shape constants for the apply kernels.
  // FIXME: use occupancy calculator instead

  // Threads per block; also the default for the `max_threads_per_block`
  // arguments of getApplyGrid() and getApplyBlock().
  constexpr uint32_t AT_APPLY_THREADS_PER_BLOCK{512u};

  // Blocks-per-SM figure returned by getApplyBlocksPerSM().
  constexpr uint32_t AT_APPLY_BLOCKS_PER_SM{4u};
  /**
   * Computes a one-dimensional launch grid for an apply kernel.
   *
   * Each block is budgeted `max_threads_per_block * step` elements. The block
   * count is ceil(totalElements / budget), clamped to the device's
   * maxGridSize[0].
   *
   * @tparam step                 Multiplier applied to the per-block element
   *                              budget; must be positive.
   * @param totalElements         Number of elements the launch has to cover.
   * @param grid                  [out] Set to dim3(numBlocks) on success; left
   *                              untouched when the function returns false.
   * @param curDevice             Index of the device whose properties are
   *                              queried; -1 makes the function return false.
   * @param max_threads_per_block Threads per block used for the budget; must be
   *                              positive.
   * @return true when `grid` was written; false when curDevice is -1 or the
   *         per-block budget would not be positive.
   *
   * Note: totalElements == 0 yields a grid of 0 blocks and still returns true.
   */
  template <int step = 1>
  inline bool getApplyGrid(uint64_t totalElements, dim3& grid, int64_t curDevice, int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) {
    if (curDevice == -1) return false;

    // A zero budget would divide by zero below, and a negative factor would
    // wrap to a huge value when converted to uint64_t. Reject both up front.
    if (max_threads_per_block <= 0 || step <= 0) return false;

    const uint64_t elemsPerBlock =
        static_cast<uint64_t>(max_threads_per_block) * static_cast<uint64_t>(step);

    // Ceiling division as quotient plus remainder flag: unlike
    // (a + b - 1) / b this cannot wrap for totalElements near UINT64_MAX.
    uint64_t numBlocks = totalElements / elemsPerBlock;
    if (totalElements % elemsPerBlock != 0) {
      ++numBlocks;
    }

    // Never request more blocks along x than the device reports it supports.
    const uint64_t maxGridX = at::cuda::getDeviceProperties(curDevice)->maxGridSize[0];
    if (numBlocks > maxGridX) {
      numBlocks = maxGridX;
    }

    // numBlocks is now bounded by maxGridX, which originates from an int, so the
    // conversion to dim3's unsigned int component is lossless.
    grid = dim3(static_cast<unsigned int>(numBlocks));
    return true;
  }
  // Returns AT_APPLY_BLOCKS_PER_SM converted to int; usable in constant
  // expressions.
  constexpr int getApplyBlocksPerSM() {
    return static_cast<int>(AT_APPLY_BLOCKS_PER_SM);
  }
  // Returns AT_APPLY_THREADS_PER_BLOCK converted to int; usable in constant
  // expressions.
  constexpr int getApplyBlockSize() {
    return static_cast<int>(AT_APPLY_THREADS_PER_BLOCK);
  }
  // Builds the block dimensions for an apply kernel launch. Only the x
  // component is supplied (max_threads_per_block); y and z are left to dim3's
  // defaults.
  inline dim3 getApplyBlock(int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) {
    const dim3 blockDims(max_threads_per_block);
    return blockDims;
  }
  36. }
  37. }} // namespace at::cuda