#include #include namespace at { namespace cuda { /** Computes ceil(a / b) */ template __host__ __device__ __forceinline__ T ATenCeilDiv(T a, T b) { return (a + b - 1) / b; } namespace { // Threads per block for our apply kernel // FIXME: use occupancy calculator instead constexpr uint32_t AT_APPLY_THREADS_PER_BLOCK = 512; constexpr uint32_t AT_APPLY_BLOCKS_PER_SM = 4; template inline bool getApplyGrid(uint64_t totalElements, dim3& grid, int64_t curDevice, int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) { if (curDevice == -1) return false; uint64_t numel_per_thread = static_cast(max_threads_per_block) * static_cast(step); uint64_t numBlocks = ATenCeilDiv(totalElements, numel_per_thread); uint64_t maxGridX = at::cuda::getDeviceProperties(curDevice)->maxGridSize[0]; if (numBlocks > maxGridX) numBlocks = maxGridX; grid = dim3(numBlocks); return true; } constexpr int getApplyBlocksPerSM() { return AT_APPLY_BLOCKS_PER_SM; } constexpr int getApplyBlockSize() { return AT_APPLY_THREADS_PER_BLOCK; } inline dim3 getApplyBlock(int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) { return dim3(max_threads_per_block); } } }} // namespace at::cuda