/*
 * fxdiv.h — FXdiv: fast unsigned integer division by a runtime-constant
 * divisor, implemented as multiply-high by a precomputed "magic" constant
 * followed by shifts (no hardware divide on the hot path).
 */
#pragma once
#ifndef FXDIV_H
#define FXDIV_H

#if defined(__cplusplus) && (__cplusplus >= 201103L)
/* C++11 and later: use the C++ spellings of the C standard headers. */
#include <cstddef>
#include <cstdint>
#include <climits>
#elif !defined(__OPENCL_VERSION__)
/* Plain C. OpenCL C provides the fixed-width types natively, so it skips
 * these headers entirely. */
#include <stddef.h>
#include <stdint.h>
#include <limits.h>
#endif

#if defined(_MSC_VER)
#include <intrin.h>   /* _BitScanReverse(64), __emulu, __umulh, _MulUnsignedHigh */
#if defined(_M_IX86) || defined(_M_X64)
#include <immintrin.h> /* x86/x64 intrinsics; presumably for _udiv64/_udiv128 — TODO confirm */
#endif
#endif

/* Opt-in switch: set to 1 to use GCC-style inline assembly (BSR/DIV) on
 * x86/x86-64 instead of compiler builtins. Off by default. */
#ifndef FXDIV_USE_INLINE_ASSEMBLY
#define FXDIV_USE_INLINE_ASSEMBLY 0
#endif
  22. static inline uint64_t fxdiv_mulext_uint32_t(uint32_t a, uint32_t b) {
  23. #if defined(_MSC_VER) && defined(_M_IX86)
  24. return (uint64_t) __emulu((unsigned int) a, (unsigned int) b);
  25. #else
  26. return (uint64_t) a * (uint64_t) b;
  27. #endif
  28. }
  29. static inline uint32_t fxdiv_mulhi_uint32_t(uint32_t a, uint32_t b) {
  30. #if defined(__OPENCL_VERSION__)
  31. return mul_hi(a, b);
  32. #elif defined(__CUDA_ARCH__)
  33. return (uint32_t) __umulhi((unsigned int) a, (unsigned int) b);
  34. #elif defined(_MSC_VER) && defined(_M_IX86)
  35. return (uint32_t) (__emulu((unsigned int) a, (unsigned int) b) >> 32);
  36. #elif defined(_MSC_VER) && defined(_M_ARM)
  37. return (uint32_t) _MulUnsignedHigh((unsigned long) a, (unsigned long) b);
  38. #else
  39. return (uint32_t) (((uint64_t) a * (uint64_t) b) >> 32);
  40. #endif
  41. }
  42. static inline uint64_t fxdiv_mulhi_uint64_t(uint64_t a, uint64_t b) {
  43. #if defined(__OPENCL_VERSION__)
  44. return mul_hi(a, b);
  45. #elif defined(__CUDA_ARCH__)
  46. return (uint64_t) __umul64hi((unsigned long long) a, (unsigned long long) b);
  47. #elif defined(_MSC_VER) && defined(_M_X64)
  48. return (uint64_t) __umulh((unsigned __int64) a, (unsigned __int64) b);
  49. #elif defined(__GNUC__) && defined(__SIZEOF_INT128__)
  50. return (uint64_t) (((((unsigned __int128) a) * ((unsigned __int128) b))) >> 64);
  51. #else
  52. const uint32_t a_lo = (uint32_t) a;
  53. const uint32_t a_hi = (uint32_t) (a >> 32);
  54. const uint32_t b_lo = (uint32_t) b;
  55. const uint32_t b_hi = (uint32_t) (b >> 32);
  56. const uint64_t t = fxdiv_mulext_uint32_t(a_hi, b_lo) +
  57. (uint64_t) fxdiv_mulhi_uint32_t(a_lo, b_lo);
  58. return fxdiv_mulext_uint32_t(a_hi, b_hi) + (t >> 32) +
  59. ((fxdiv_mulext_uint32_t(a_lo, b_hi) + (uint64_t) (uint32_t) t) >> 32);
  60. #endif
  61. }
  62. static inline size_t fxdiv_mulhi_size_t(size_t a, size_t b) {
  63. #if SIZE_MAX == UINT32_MAX
  64. return (size_t) fxdiv_mulhi_uint32_t((uint32_t) a, (uint32_t) b);
  65. #elif SIZE_MAX == UINT64_MAX
  66. return (size_t) fxdiv_mulhi_uint64_t((uint64_t) a, (uint64_t) b);
  67. #else
  68. #error Unsupported platform
  69. #endif
  70. }
/* Precomputed division parameters for a 32-bit divisor. */
struct fxdiv_divisor_uint32_t {
	uint32_t value; /* the original divisor d */
	uint32_t m;     /* magic multiplier; quotient uses mulhi(n, m) */
	uint8_t s1;     /* intermediate shift: 0 when d == 1, otherwise 1 */
	uint8_t s2;     /* final shift: 0 when d == 1, otherwise floor(log2(d - 1)) */
};
/* Quotient and remainder of one 32-bit division. */
struct fxdiv_result_uint32_t {
	uint32_t quotient;
	uint32_t remainder;
};
/* Precomputed division parameters for a 64-bit divisor (fields as above). */
struct fxdiv_divisor_uint64_t {
	uint64_t value;
	uint64_t m;
	uint8_t s1;
	uint8_t s2;
};
/* Quotient and remainder of one 64-bit division. */
struct fxdiv_result_uint64_t {
	uint64_t quotient;
	uint64_t remainder;
};
/* Precomputed division parameters for a size_t divisor (fields as above). */
struct fxdiv_divisor_size_t {
	size_t value;
	size_t m;
	uint8_t s1;
	uint8_t s2;
};
/* Quotient and remainder of one size_t division. */
struct fxdiv_result_size_t {
	size_t quotient;
	size_t remainder;
};
/*
 * Precompute (m, s1, s2) for divisor d such that for any 32-bit n:
 *   n / d == (t + ((n - t) >> s1)) >> s2, where t = mulhi(n, m)
 * (see fxdiv_quotient_uint32_t). For d > 1 this is the round-up variant of
 * the Granlund-Montgomery method: m = floor(2^32 * (2^l - d) / d) + 1 with
 * l = ceil(log2(d)).
 * NOTE(review): assumes d != 0 — d == 0 would make d - 1 wrap and the
 * division below divide by zero; confirm callers never pass 0.
 */
static inline struct fxdiv_divisor_uint32_t fxdiv_init_uint32_t(uint32_t d) {
	struct fxdiv_divisor_uint32_t result = { d };
	if (d == 1) {
		/* Dividing by 1 degenerates to identity: with m=1, s1=s2=0 the
		 * quotient expression reduces to n itself. */
		result.m = UINT32_C(1);
		result.s1 = 0;
		result.s2 = 0;
	} else {
		/* l_minus_1 = floor(log2(d - 1)) = ceil(log2(d)) - 1 for d > 1. */
#if defined(__OPENCL_VERSION__)
		const uint32_t l_minus_1 = 31 - clz(d - 1);
#elif defined(__CUDA_ARCH__)
		const uint32_t l_minus_1 = 31 - __clz((int) (d - 1));
#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64))
		unsigned long l_minus_1;
		_BitScanReverse(&l_minus_1, (unsigned long) (d - 1));
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && FXDIV_USE_INLINE_ASSEMBLY
		uint32_t l_minus_1;
		/* BSR returns the index of the highest set bit, i.e. floor(log2). */
		__asm__("BSRL %[d_minus_1], %[l_minus_1]"
			: [l_minus_1] "=r" (l_minus_1)
			: [d_minus_1] "r" (d - 1)
			: "cc");
#elif defined(__GNUC__)
		const uint32_t l_minus_1 = 31 - __builtin_clz(d - 1);
#else
		/* Based on Algorithm 2 from Hacker's delight: binary search for the
		 * highest set bit of d - 1, narrowing x while accumulating the index. */
		uint32_t l_minus_1 = 0;
		uint32_t x = d - 1;
		uint32_t y = x >> 16;
		if (y != 0) {
			l_minus_1 += 16;
			x = y;
		}
		y = x >> 8;
		if (y != 0) {
			l_minus_1 += 8;
			x = y;
		}
		y = x >> 4;
		if (y != 0) {
			l_minus_1 += 4;
			x = y;
		}
		y = x >> 2;
		if (y != 0) {
			l_minus_1 += 2;
			x = y;
		}
		/* x is now in [1, 3]; the remaining contribution is 1 iff x >= 2. */
		if ((x & 2) != 0) {
			l_minus_1 += 1;
		}
#endif
		/* u_hi = 2^l - d (fits in 32 bits because 2^(l-1) < d <= 2^l). */
		uint32_t u_hi = (UINT32_C(2) << (uint32_t) l_minus_1) - d;
		/* Division of 64-bit number u_hi:UINT32_C(0) by 32-bit number d, 32-bit quotient output q */
#if defined(__GNUC__) && defined(__i386__) && FXDIV_USE_INLINE_ASSEMBLY
		uint32_t q;
		/* DIVL divides EDX:EAX by the operand; EDX=u_hi, EAX=0 on input. */
		__asm__("DIVL %[d]"
			: "=a" (q), "+d" (u_hi)
			: [d] "r" (d), "a" (0)
			: "cc");
#elif (defined(_MSC_VER) && _MSC_VER >= 1920) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (defined(_M_IX86) || defined(_M_X64))
		unsigned int remainder;
		const uint32_t q = (uint32_t) _udiv64((unsigned __int64) ((uint64_t) u_hi << 32), (unsigned int) d, &remainder);
#else
		/* Portable fallback: plain 64-by-32 division. */
		const uint32_t q = ((uint64_t) u_hi << 32) / d;
#endif
		/* Round the reciprocal up by one; compensated by the (n - t) >> 1 step
		 * in the quotient computation. */
		result.m = q + UINT32_C(1);
		result.s1 = 1;
		result.s2 = (uint8_t) l_minus_1;
	}
	return result;
}
/*
 * Precompute (m, s1, s2) for a 64-bit divisor d; same construction as
 * fxdiv_init_uint32_t but with a 128-by-64 division to compute the magic
 * multiplier. nlz_d (number of leading zeros of d) is only needed by the
 * portable long-division fallback; branches that use hardware 128/64
 * division do not compute it.
 * NOTE(review): assumes d != 0 — confirm callers never pass 0.
 */
static inline struct fxdiv_divisor_uint64_t fxdiv_init_uint64_t(uint64_t d) {
	struct fxdiv_divisor_uint64_t result = { d };
	if (d == 1) {
		/* Identity division: m=1, s1=s2=0 makes the quotient expression n. */
		result.m = UINT64_C(1);
		result.s1 = 0;
		result.s2 = 0;
	} else {
		/* l_minus_1 = floor(log2(d - 1)) = ceil(log2(d)) - 1 for d > 1. */
#if defined(__OPENCL_VERSION__)
		const uint32_t nlz_d = clz(d);
		const uint32_t l_minus_1 = 63 - clz(d - 1);
#elif defined(__CUDA_ARCH__)
		const uint32_t nlz_d = __clzll((long long) d);
		const uint32_t l_minus_1 = 63 - __clzll((long long) (d - 1));
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
		unsigned long l_minus_1;
		_BitScanReverse64(&l_minus_1, (unsigned __int64) (d - 1));
		unsigned long bsr_d;
		_BitScanReverse64(&bsr_d, (unsigned __int64) d);
		/* bsr ^ 0x3F == 63 - bsr for bsr in [0, 63]. */
		const uint32_t nlz_d = bsr_d ^ 0x3F;
#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_ARM))
		const uint64_t d_minus_1 = d - 1;
		const uint8_t d_is_power_of_2 = (d & d_minus_1) == 0;
		unsigned long l_minus_1;
		/* 32-bit targets: scan the high word first, fall back to the low word. */
		if ((uint32_t) (d_minus_1 >> 32) == 0) {
			_BitScanReverse(&l_minus_1, (unsigned long) d_minus_1);
		} else {
			_BitScanReverse(&l_minus_1, (unsigned long) (uint32_t) (d_minus_1 >> 32));
			l_minus_1 += 32;
		}
		/* nlz(d) = 63 - floor(log2(d)); floor(log2(d)) exceeds
		 * floor(log2(d - 1)) by exactly 1 when d is a power of two. */
		const uint32_t nlz_d = ((uint8_t) l_minus_1 ^ UINT8_C(0x3F)) - d_is_power_of_2;
#elif defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY
		uint64_t l_minus_1;
		/* BSR gives the index of the highest set bit (floor(log2)).
		 * nlz_d is not needed here: this branch pairs with the DIVQ branch below. */
		__asm__("BSRQ %[d_minus_1], %[l_minus_1]"
			: [l_minus_1] "=r" (l_minus_1)
			: [d_minus_1] "r" (d - 1)
			: "cc");
#elif defined(__GNUC__)
		const uint32_t l_minus_1 = 63 - __builtin_clzll(d - 1);
		const uint32_t nlz_d = __builtin_clzll(d);
#else
		/* Based on Algorithm 2 from Hacker's delight: binary search for the
		 * highest set bit of d - 1. */
		const uint64_t d_minus_1 = d - 1;
		const uint32_t d_is_power_of_2 = (d & d_minus_1) == 0;
		uint32_t l_minus_1 = 0;
		uint32_t x = (uint32_t) d_minus_1;
		uint32_t y = d_minus_1 >> 32;
		if (y != 0) {
			l_minus_1 += 32;
			x = y;
		}
		y = x >> 16;
		if (y != 0) {
			l_minus_1 += 16;
			x = y;
		}
		y = x >> 8;
		if (y != 0) {
			l_minus_1 += 8;
			x = y;
		}
		y = x >> 4;
		if (y != 0) {
			l_minus_1 += 4;
			x = y;
		}
		y = x >> 2;
		if (y != 0) {
			l_minus_1 += 2;
			x = y;
		}
		/* x is now in [1, 3]; add 1 iff x >= 2. */
		if ((x & 2) != 0) {
			l_minus_1 += 1;
		}
		/* Same power-of-two correction as the 32-bit MSVC branch above. */
		const uint32_t nlz_d = (l_minus_1 ^ UINT32_C(0x3F)) - d_is_power_of_2;
#endif
		/* u_hi = 2^l - d (fits in 64 bits because 2^(l-1) < d <= 2^l). */
		uint64_t u_hi = (UINT64_C(2) << (uint32_t) l_minus_1) - d;
		/* Division of 128-bit number u_hi:UINT64_C(0) by 64-bit number d, 64-bit quotient output q */
#if defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY
		uint64_t q;
		/* DIVQ divides RDX:RAX by the operand; RDX=u_hi, RAX=0 on input. */
		__asm__("DIVQ %[d]"
			: "=a" (q), "+d" (u_hi)
			: [d] "r" (d), "a" (UINT64_C(0))
			: "cc");
#elif 0 && defined(__GNUC__) && defined(__SIZEOF_INT128__)
		/* GCC, Clang, and Intel Compiler fail to inline optimized implementation and call into support library for 128-bit division */
		const uint64_t q = (uint64_t) (((unsigned __int128) u_hi << 64) / ((unsigned __int128) d));
#elif (defined(_MSC_VER) && _MSC_VER >= 1920) && !defined(__clang__) && !defined(__INTEL_COMPILER) && defined(_M_X64)
		unsigned __int64 remainder;
		const uint64_t q = (uint64_t) _udiv128((unsigned __int64) u_hi, 0, (unsigned __int64) d, &remainder);
#else
		/* Implementation based on code from Hacker's delight */
		/* Normalize divisor and shift divident left */
		d <<= nlz_d;
		u_hi <<= nlz_d;
		/* Break divisor up into two 32-bit digits */
		const uint64_t d_hi = (uint32_t) (d >> 32);
		const uint32_t d_lo = (uint32_t) d;
		/* Compute the first quotient digit, q1 */
		uint64_t q1 = u_hi / d_hi;
		uint64_t r1 = u_hi - q1 * d_hi;
		/* Correct the estimate downward (at most twice) until it no longer
		 * overshoots; exit early once the remainder overflows 32 bits. */
		while ((q1 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q1, d_lo) > (r1 << 32)) {
			q1 -= 1;
			r1 += d_hi;
			if ((r1 >> 32) != 0) {
				break;
			}
		}
		/* Multiply and subtract. */
		u_hi = (u_hi << 32) - q1 * d;
		/* Compute the second quotient digit, q0 */
		uint64_t q0 = u_hi / d_hi;
		uint64_t r0 = u_hi - q0 * d_hi;
		/* Same correction loop for the low digit. */
		while ((q0 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q0, d_lo) > (r0 << 32)) {
			q0 -= 1;
			r0 += d_hi;
			if ((r0 >> 32) != 0) {
				break;
			}
		}
		const uint64_t q = (q1 << 32) | (uint32_t) q0;
#endif
		/* Round the reciprocal up by one; compensated in the quotient step. */
		result.m = q + UINT64_C(1);
		result.s1 = 1;
		result.s2 = (uint8_t) l_minus_1;
	}
	return result;
}
  298. static inline struct fxdiv_divisor_size_t fxdiv_init_size_t(size_t d) {
  299. #if SIZE_MAX == UINT32_MAX
  300. const struct fxdiv_divisor_uint32_t uint_result = fxdiv_init_uint32_t((uint32_t) d);
  301. #elif SIZE_MAX == UINT64_MAX
  302. const struct fxdiv_divisor_uint64_t uint_result = fxdiv_init_uint64_t((uint64_t) d);
  303. #else
  304. #error Unsupported platform
  305. #endif
  306. struct fxdiv_divisor_size_t size_result = {
  307. (size_t) uint_result.value,
  308. (size_t) uint_result.m,
  309. uint_result.s1,
  310. uint_result.s2
  311. };
  312. return size_result;
  313. }
  314. static inline uint32_t fxdiv_quotient_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
  315. const uint32_t t = fxdiv_mulhi_uint32_t(n, divisor.m);
  316. return (t + ((n - t) >> divisor.s1)) >> divisor.s2;
  317. }
  318. static inline uint64_t fxdiv_quotient_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
  319. const uint64_t t = fxdiv_mulhi_uint64_t(n, divisor.m);
  320. return (t + ((n - t) >> divisor.s1)) >> divisor.s2;
  321. }
  322. static inline size_t fxdiv_quotient_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
  323. #if SIZE_MAX == UINT32_MAX
  324. const struct fxdiv_divisor_uint32_t uint32_divisor = {
  325. (uint32_t) divisor.value,
  326. (uint32_t) divisor.m,
  327. divisor.s1,
  328. divisor.s2
  329. };
  330. return fxdiv_quotient_uint32_t((uint32_t) n, uint32_divisor);
  331. #elif SIZE_MAX == UINT64_MAX
  332. const struct fxdiv_divisor_uint64_t uint64_divisor = {
  333. (uint64_t) divisor.value,
  334. (uint64_t) divisor.m,
  335. divisor.s1,
  336. divisor.s2
  337. };
  338. return fxdiv_quotient_uint64_t((uint64_t) n, uint64_divisor);
  339. #else
  340. #error Unsupported platform
  341. #endif
  342. }
  343. static inline uint32_t fxdiv_remainder_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
  344. const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor);
  345. return n - quotient * divisor.value;
  346. }
  347. static inline uint64_t fxdiv_remainder_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
  348. const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor);
  349. return n - quotient * divisor.value;
  350. }
  351. static inline size_t fxdiv_remainder_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
  352. const size_t quotient = fxdiv_quotient_size_t(n, divisor);
  353. return n - quotient * divisor.value;
  354. }
  355. static inline uint32_t fxdiv_round_down_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t granularity) {
  356. const uint32_t quotient = fxdiv_quotient_uint32_t(n, granularity);
  357. return quotient * granularity.value;
  358. }
  359. static inline uint64_t fxdiv_round_down_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t granularity) {
  360. const uint64_t quotient = fxdiv_quotient_uint64_t(n, granularity);
  361. return quotient * granularity.value;
  362. }
  363. static inline size_t fxdiv_round_down_size_t(size_t n, const struct fxdiv_divisor_size_t granularity) {
  364. const size_t quotient = fxdiv_quotient_size_t(n, granularity);
  365. return quotient * granularity.value;
  366. }
  367. static inline struct fxdiv_result_uint32_t fxdiv_divide_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
  368. const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor);
  369. const uint32_t remainder = n - quotient * divisor.value;
  370. struct fxdiv_result_uint32_t result = { quotient, remainder };
  371. return result;
  372. }
  373. static inline struct fxdiv_result_uint64_t fxdiv_divide_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
  374. const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor);
  375. const uint64_t remainder = n - quotient * divisor.value;
  376. struct fxdiv_result_uint64_t result = { quotient, remainder };
  377. return result;
  378. }
  379. static inline struct fxdiv_result_size_t fxdiv_divide_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
  380. const size_t quotient = fxdiv_quotient_size_t(n, divisor);
  381. const size_t remainder = n - quotient * divisor.value;
  382. struct fxdiv_result_size_t result = { quotient, remainder };
  383. return result;
  384. }
  385. #endif /* FXDIV_H */