IntegerDivider.cuh 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. #pragma once
  2. #include <assert.h>
  3. #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
  4. #include <cuda_runtime.h>
  5. #endif
  6. namespace at {
  7. namespace cuda {
  8. namespace detail {
  9. // A utility class to implement integer division by multiplication, given a fixed
  10. // divisor.
  11. //
  12. // WARNING: The fast divider algorithm is only implemented for unsigned int;
  13. // otherwise we default to plain integer division. For unsigned int,
  14. // we further assume that the dividend is at most INT32_MAX. Thus,
  15. // IntDivider must NOT be used for general integer division.
  16. //
  17. // This reduced range is enough for our purpose, and it allows us to
  18. // slightly simplify the computation.
  19. //
  20. // (NOTE: Below, "2^k" denotes exponentiation, i.e., 1<<k.)
  21. //
  22. // For any N-bit unsigned integer d (> 0), we can find a "magic number" m (2^N
  23. // <= m < 2^(N+1)) and shift s such that:
  24. //
  25. // \floor(n / d) = \floor((m * n) / 2^(N+s)).
  26. //
  27. // Given such m and s, the integer division can be then implemented as:
  28. //
  29. // let m' = m - 2^N // 0 <= m' < 2^N
  30. //
  31. // fast_integer_division(n):
  32. // // Multiply two N-bit unsigned integers: the result is a 2N-bit unsigned
  33. // // integer. Then take the higher N bits.
  34. // t = (m' * n) >> N
  35. //
  36. // // Here we use the fact that n is less than 2^(N-1): otherwise the value
  37. // // of (t + n) may not fit in an N-bit integer.
  38. // return (t + n) >> s
  39. //
  40. // Finding such a magic number is surprisingly easy:
  41. //
  42. // s = \ceil(\log_2 d)
  43. // m' = \floor(2^N * (2^s - d) / d) + 1 // Need 2N-bit integer arithmetic.
  44. //
  45. // See also:
  46. // - Division by Invariant Integers Using Multiplication,
  47. // Torbjörn Granlund and Peter L. Montgomery, 1994.
  48. //
  49. // - http://www.hackersdelight.org/magic.htm
  50. //
  51. // - http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
  52. // Result of div/mod operation stored together.
  53. template <typename Value>
  54. struct DivMod {
  55. Value div, mod;
  56. C10_HOST_DEVICE DivMod(Value div, Value mod) : div(div), mod(mod) { }
  57. };
  58. // Base case: we only have an implementation for uint32_t for now. For
  59. // everything else, we use plain division.
  60. template <typename Value>
  61. struct IntDivider {
  62. IntDivider() = default;
  63. IntDivider(Value d) : divisor(d) { }
  64. C10_HOST_DEVICE inline Value div(Value n) const { return n / divisor; }
  65. C10_HOST_DEVICE inline Value mod(Value n) const { return n % divisor; }
  66. C10_HOST_DEVICE inline DivMod<Value> divmod(Value n) const {
  67. return DivMod<Value>(n / divisor, n % divisor);
  68. }
  69. Value divisor;
  70. };
  71. // Implement fast integer division.
  72. template <>
  73. struct IntDivider<unsigned int> {
  74. static_assert(sizeof(unsigned int) == 4, "Assumes 32-bit unsigned int.");
  75. IntDivider() = default;
  76. IntDivider(unsigned int d) : divisor(d) {
  77. assert(divisor >= 1 && divisor <= INT32_MAX);
  78. // TODO: gcc/clang has __builtin_clz() but it's not portable.
  79. for (shift = 0; shift < 32; shift++) if ((1U << shift) >= divisor) break;
  80. uint64_t one = 1;
  81. uint64_t magic = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
  82. m1 = magic;
  83. assert(m1 > 0 && m1 == magic); // m1 must fit in 32 bits.
  84. }
  85. C10_HOST_DEVICE inline unsigned int div(unsigned int n) const {
  86. #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
  87. // 't' is the higher 32-bits of unsigned 32-bit multiplication of 'n' and
  88. // 'm1'.
  89. unsigned int t = __umulhi(n, m1);
  90. return (t + n) >> shift;
  91. #else
  92. // Using uint64_t so that the addition does not overflow.
  93. uint64_t t = ((uint64_t) n * m1) >> 32;
  94. return (t + n) >> shift;
  95. #endif
  96. }
  97. C10_HOST_DEVICE inline unsigned int mod(unsigned int n) const {
  98. return n - div(n) * divisor;
  99. }
  100. C10_HOST_DEVICE inline DivMod<unsigned int> divmod(unsigned int n) const {
  101. unsigned int q = div(n);
  102. return DivMod<unsigned int>(q, n - q * divisor);
  103. }
  104. unsigned int divisor; // d above.
  105. unsigned int m1; // Magic number: m' above.
  106. unsigned int shift; // Shift amounts.
  107. };
  108. }}} // namespace at::cuda::detail