AsmUtils.cuh 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. #pragma once
  2. #include <cstdint>
  3. // Collection of direct PTX functions
  4. namespace at {
  5. namespace cuda {
  6. template <typename T>
  7. struct Bitfield {};
  8. template <>
  9. struct Bitfield<unsigned int> {
  10. static __device__ __host__ __forceinline__
  11. unsigned int getBitfield(unsigned int val, int pos, int len) {
  12. #if !defined(__CUDA_ARCH__)
  13. pos &= 0xff;
  14. len &= 0xff;
  15. unsigned int m = (1u << len) - 1u;
  16. return (val >> pos) & m;
  17. #else
  18. unsigned int ret;
  19. asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
  20. return ret;
  21. #endif
  22. }
  23. static __device__ __host__ __forceinline__
  24. unsigned int setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) {
  25. #if !defined(__CUDA_ARCH__)
  26. pos &= 0xff;
  27. len &= 0xff;
  28. unsigned int m = (1u << len) - 1u;
  29. toInsert &= m;
  30. toInsert <<= pos;
  31. m <<= pos;
  32. return (val & ~m) | toInsert;
  33. #else
  34. unsigned int ret;
  35. asm("bfi.b32 %0, %1, %2, %3, %4;" :
  36. "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
  37. return ret;
  38. #endif
  39. }
  40. };
  41. template <>
  42. struct Bitfield<uint64_t> {
  43. static __device__ __host__ __forceinline__
  44. uint64_t getBitfield(uint64_t val, int pos, int len) {
  45. #if !defined(__CUDA_ARCH__)
  46. pos &= 0xff;
  47. len &= 0xff;
  48. uint64_t m = (1u << len) - 1u;
  49. return (val >> pos) & m;
  50. #else
  51. uint64_t ret;
  52. asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
  53. return ret;
  54. #endif
  55. }
  56. static __device__ __host__ __forceinline__
  57. uint64_t setBitfield(uint64_t val, uint64_t toInsert, int pos, int len) {
  58. #if !defined(__CUDA_ARCH__)
  59. pos &= 0xff;
  60. len &= 0xff;
  61. uint64_t m = (1u << len) - 1u;
  62. toInsert &= m;
  63. toInsert <<= pos;
  64. m <<= pos;
  65. return (val & ~m) | toInsert;
  66. #else
  67. uint64_t ret;
  68. asm("bfi.b64 %0, %1, %2, %3, %4;" :
  69. "=l"(ret) : "l"(toInsert), "l"(val), "r"(pos), "r"(len));
  70. return ret;
  71. #endif
  72. }
  73. };
  74. __device__ __forceinline__ int getLaneId() {
  75. #if defined(USE_ROCM)
  76. return __lane_id();
  77. #else
  78. int laneId;
  79. asm("mov.s32 %0, %%laneid;" : "=r"(laneId) );
  80. return laneId;
  81. #endif
  82. }
  83. #if defined(USE_ROCM)
  84. __device__ __forceinline__ unsigned long long int getLaneMaskLt() {
  85. const std::uint64_t m = (1ull << getLaneId()) - 1ull;
  86. return m;
  87. }
  88. #else
  89. __device__ __forceinline__ unsigned getLaneMaskLt() {
  90. unsigned mask;
  91. asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
  92. return mask;
  93. }
  94. #endif
  95. #if defined (USE_ROCM)
  96. __device__ __forceinline__ unsigned long long int getLaneMaskLe() {
  97. std::uint64_t m = UINT64_MAX >> (sizeof(std::uint64_t) * CHAR_BIT - (getLaneId() + 1));
  98. return m;
  99. }
  100. #else
  101. __device__ __forceinline__ unsigned getLaneMaskLe() {
  102. unsigned mask;
  103. asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
  104. return mask;
  105. }
  106. #endif
  107. #if defined(USE_ROCM)
  108. __device__ __forceinline__ unsigned long long int getLaneMaskGt() {
  109. const std::uint64_t m = getLaneMaskLe();
  110. return m ? ~m : m;
  111. }
  112. #else
  113. __device__ __forceinline__ unsigned getLaneMaskGt() {
  114. unsigned mask;
  115. asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));
  116. return mask;
  117. }
  118. #endif
  119. #if defined(USE_ROCM)
  120. __device__ __forceinline__ unsigned long long int getLaneMaskGe() {
  121. const std::uint64_t m = getLaneMaskLt();
  122. return ~m;
  123. }
  124. #else
  125. __device__ __forceinline__ unsigned getLaneMaskGe() {
  126. unsigned mask;
  127. asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));
  128. return mask;
  129. }
  130. #endif
  131. }} // namespace at::cuda