SharedReduceOps.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545
  1. #pragma once
  2. // Please note that this file is
  3. // used across both CPU and GPU.
  4. #include <type_traits>
  5. #include <complex>
  6. #include <c10/macros/Macros.h>
  7. #include <ATen/detail/FunctionTraits.h>
  8. #include <ATen/NumericUtils.h>
  9. #if defined(__CUDACC__)
  10. #include <ATen/cuda/DeviceUtils.cuh>
  11. #include <ATen/native/cuda/DeviceSqrt.cuh>
  12. #elif defined(__HIPCC__)
  13. #include <ATen/hip/DeviceUtils.cuh>
  14. #include <ATen/native/hip/DeviceSqrt.cuh>
  15. #endif
  16. #if defined(__CUDACC__) || defined(__HIPCC__)
  17. #include <thrust/pair.h>
  18. #else
  19. #include <cmath>
  20. #define device_sqrt std::sqrt
  21. #endif
  22. #if defined(__CUDACC__) || defined(__HIPCC__)
  23. template <typename scalar_t>
  24. inline C10_DEVICE scalar_t max_propagate_nan(scalar_t a, scalar_t b) {
  25. #if defined(__HIPCC__)
  26. // TODO: remove this special case for HIP when issue is fixed:
  27. // https://github.com/ROCm-Developer-Tools/HIP/issues/2209
  28. scalar_t max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max(a, b));
  29. #else
  30. scalar_t max = at::_isnan(b) ? b : std::max(a, b);
  31. #endif
  32. return max;
  33. }
  34. template <typename scalar_t>
  35. inline C10_DEVICE scalar_t min_propagate_nan(scalar_t a, scalar_t b) {
  36. #if defined(__HIPCC__)
  37. // TODO: remove this special case for HIP when issue is fixed:
  38. // https://github.com/ROCm-Developer-Tools/HIP/issues/2209
  39. scalar_t min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min(a, b));
  40. #else
  41. scalar_t min = at::_isnan(b) ? b : std::min(a, b);
  42. #endif
  43. return min;
  44. }
  45. #define MAX(X, Y) max_propagate_nan(X,Y)
  46. #define MIN(X, Y) min_propagate_nan(X,Y)
  47. #else
  48. #include <ATen/native/cpu/zmath.h>
  49. #define MAX(X, Y) max_impl(X,Y)
  50. #define MIN(X, Y) min_impl(X,Y)
  51. #endif
  52. // ROCM hcc doesn't work well with using std:: in kernel functions
  53. #if defined(__CUDA_ARCH__)
  54. #include <c10/cuda/CUDAMathCompat.h>
  55. #define compat_pow c10::cuda::compat::pow
  56. #elif defined(__HIPCC__)
  57. #include <c10/hip/HIPMathCompat.h>
  58. #define compat_pow c10::hip::compat::pow
  59. #else
  60. #define compat_pow std::pow
  61. #endif
  62. namespace at { namespace native {
  63. namespace detail {
  64. #if defined(__CUDACC__) || defined(__HIPCC__)
  65. template <typename T1, typename T2> using pair = thrust::pair<T1, T2>;
  66. #else
  67. template <typename T1, typename T2> using pair = std::pair<T1, T2>;
  68. #endif
  69. } // namespace detail
  70. template <typename scalar_t, typename index_t>
  71. struct WelfordData {
  72. scalar_t mean;
  73. scalar_t m2;
  74. index_t n;
  75. scalar_t nf;
  76. C10_HOST_DEVICE WelfordData() : mean(0), m2(0), n(0), nf(0) {}
  77. C10_HOST_DEVICE WelfordData(
  78. scalar_t mean,
  79. scalar_t m2,
  80. index_t n,
  81. scalar_t nf)
  82. : mean(mean), m2(m2), n(n), nf(nf) {}
  83. };
  84. template <typename scalar_t, typename acc_scalar_t, typename index_t, typename res_t>
  85. struct WelfordOps {
  86. index_t correction;
  87. bool take_sqrt;
  88. public:
  89. using acc_t = WelfordData<acc_scalar_t, index_t>;
  90. inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, index_t /*idx*/) const {
  91. // We accumulate n in index_t to avoid cumulative rounding error, but still
  92. // need nf for use in combine where int32 may overflow.
  93. index_t new_n = acc.n + 1;
  94. acc_scalar_t new_nf = static_cast<acc_scalar_t>(new_n);
  95. acc_scalar_t delta = data - acc.mean;
  96. acc_scalar_t new_mean = acc.mean + delta / new_nf;
  97. acc_scalar_t new_delta = data - new_mean;
  98. return {
  99. new_mean,
  100. acc.m2 + delta * new_delta,
  101. new_n,
  102. new_nf,
  103. };
  104. }
  105. inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
  106. if (a.nf == 0) {
  107. return b;
  108. }
  109. if (b.nf == 0) {
  110. return a;
  111. }
  112. acc_scalar_t delta = b.mean - a.mean;
  113. acc_scalar_t new_count = a.nf + b.nf;
  114. acc_scalar_t nb_over_n = b.nf / new_count;
  115. return {
  116. a.mean + delta * nb_over_n,
  117. a.m2 + b.m2 + delta * delta * a.nf * nb_over_n,
  118. // setting acc.n as -1 since acc.n might not be able to represent the count
  119. // correctly within its range, setting it to -1 to avoid confusion
  120. -1,
  121. new_count
  122. };
  123. }
  124. inline C10_DEVICE res_t project(acc_t acc) const __ubsan_ignore_float_divide_by_zero__ {
  125. const auto mean = static_cast<scalar_t>(acc.mean);
  126. const auto divisor = acc.nf > correction ? acc.nf - correction : 0;
  127. const auto var = acc.m2 / divisor;
  128. res_t results(take_sqrt ? device_sqrt(var) : var, mean);
  129. return results;
  130. }
  131. static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
  132. return acc;
  133. }
  134. #if defined(__CUDACC__) || defined(__HIPCC__)
  135. inline __device__ acc_t warp_shfl_down(acc_t acc, int offset) const {
  136. return {
  137. WARP_SHFL_DOWN(acc.mean, offset)
  138. , WARP_SHFL_DOWN(acc.m2, offset)
  139. , WARP_SHFL_DOWN(acc.n, offset)
  140. , WARP_SHFL_DOWN(acc.nf, offset)
  141. };
  142. }
  143. #endif
  144. C10_HOST_DEVICE WelfordOps(index_t correction, bool take_sqrt)
  145. : correction(correction), take_sqrt(take_sqrt) {}
  146. };
  147. template <typename acc_t, typename factor_t>
  148. struct MeanOps {
  149. factor_t factor;
  150. inline C10_DEVICE acc_t reduce(acc_t a, acc_t b, int64_t /*idx*/) const {
  151. return combine(a, b);
  152. }
  153. inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
  154. return a + b;
  155. }
  156. inline C10_DEVICE acc_t project(acc_t a) const {
  157. return a * factor;
  158. }
  159. static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
  160. return acc;
  161. }
  162. #if defined(__CUDACC__) || defined(__HIPCC__)
  163. inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const {
  164. return WARP_SHFL_DOWN(data, offset);
  165. }
  166. #endif
  167. MeanOps(factor_t factor): factor(factor) {
  168. }
  169. };
  170. // This accumulator template is used to calculate the minimum absolute value of
  171. // a set of numbers.
  172. // `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
  173. // value. These types differ for complex number input support.
  174. template <typename scalar_t, typename acc_t=scalar_t>
  175. struct AbsMinOps {
  176. inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
  177. return MIN(acc, static_cast<acc_t>(std::abs(data)));
  178. }
  179. inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
  180. return MIN(a, b);
  181. }
  182. inline C10_DEVICE acc_t project(acc_t a) const {
  183. return a;
  184. }
  185. static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
  186. return acc;
  187. }
  188. #if defined(__CUDACC__) || defined(__HIPCC__)
  189. inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
  190. return WARP_SHFL_DOWN(acc, offset);
  191. }
  192. #endif
  193. };
  194. // This accumulator template is used to calculate the maximum absolute value of
  195. // a set of numbers.
  196. // `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
  197. // value. These types differ for complex number input support.
  198. template <typename scalar_t, typename acc_t=scalar_t>
  199. struct AbsMaxOps {
  200. inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
  201. return MAX(acc, static_cast<acc_t>(std::abs(data)));
  202. }
  203. inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
  204. return MAX(a, b);
  205. }
  206. inline C10_DEVICE acc_t project(acc_t a) const {
  207. return a;
  208. }
  209. static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
  210. return acc;
  211. }
  212. #if defined(__CUDACC__) || defined(__HIPCC__)
  213. inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
  214. return WARP_SHFL_DOWN(acc, offset);
  215. }
  216. #endif
  217. };
  218. // This accumulator template is used to calculate the norm of the absolute value
  219. // of a set of numbers.
  220. // `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
  221. // value. These types differ for complex number input support.
  222. template <typename scalar_t, typename acc_t=scalar_t>
  223. struct NormOps {
  224. acc_t norm_;
  225. inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
  226. return acc + compat_pow(static_cast<acc_t>(std::abs(data)), norm_);
  227. }
  228. inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
  229. return a + b;
  230. }
  231. inline C10_DEVICE acc_t project(acc_t a) const {
  232. return compat_pow(a, static_cast<acc_t>(1.0) / norm_);
  233. }
  234. static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
  235. return acc;
  236. }
  237. #if defined(__CUDACC__) || defined(__HIPCC__)
  238. inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
  239. return WARP_SHFL_DOWN(acc, offset);
  240. }
  241. #endif
  242. NormOps(acc_t norm_): norm_(norm_) {
  243. }
  244. };
  245. // This accumulator template is used to calculate the order zero norm of the
  246. // absolute value of a set of numbers.
  247. // `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
  248. // value. These types differ for complex number input support.
  249. template <typename scalar_t, typename acc_t=scalar_t>
  250. struct NormZeroOps {
  251. inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
  252. return acc + (data == static_cast<scalar_t>(0) ? static_cast<acc_t>(0) : static_cast<acc_t>(1));
  253. }
  254. inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
  255. return a + b;
  256. }
  257. inline C10_DEVICE acc_t project(acc_t a) const {
  258. return a;
  259. }
  260. static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
  261. return acc;
  262. }
  263. #if defined(__CUDACC__) || defined(__HIPCC__)
  264. inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
  265. return WARP_SHFL_DOWN(acc, offset);
  266. }
  267. #endif
  268. };
  269. // This accumulator template is used to calculate the order one norm of the
  270. // absolute value of a set of numbers.
  271. // `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
  272. // value. These types differ for complex number input support.
  273. template <typename scalar_t, typename acc_t=scalar_t>
  274. struct NormOneOps {
  275. inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
  276. return acc + static_cast<acc_t>(std::abs(data));
  277. }
  278. inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
  279. return a + b;
  280. }
  281. inline C10_DEVICE acc_t project(acc_t a) const {
  282. return a;
  283. }
  284. static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
  285. return acc;
  286. }
  287. #if defined(__CUDACC__) || defined(__HIPCC__)
  288. inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
  289. return WARP_SHFL_DOWN(acc, offset);
  290. }
  291. #endif
  292. };
  293. template<typename acc_t>
  294. struct AbsSwitch {};
  295. template<typename scalar_t, typename acc_t>
  296. inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch<acc_t>) {
  297. return static_cast<acc_t>(data);
  298. }
  299. template<typename scalar_t, typename acc_t>
  300. inline C10_DEVICE acc_t abs_if_complex(std::complex<scalar_t> data, AbsSwitch<acc_t>) {
  301. return static_cast<acc_t>(std::abs(data));
  302. }
  303. template<typename scalar_t, typename acc_t>
  304. inline C10_DEVICE acc_t abs_if_complex(c10::complex<scalar_t> data, AbsSwitch<acc_t>) {
  305. return static_cast<acc_t>(std::abs(data));
  306. }
  307. // This accumulator template is used to calculate the order two norm of the
  308. // absolute value of a set of numbers.
  309. // `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
  310. // value. These types differ for complex number input support.
  311. template <typename scalar_t, typename acc_t=scalar_t>
  312. struct NormTwoOps {
  313. inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
  314. acc_t data_ = abs_if_complex(data, AbsSwitch<acc_t>());
  315. return acc + data_ * data_;
  316. }
  317. inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
  318. return a + b;
  319. }
  320. inline C10_DEVICE acc_t project(acc_t a) const {
  321. return device_sqrt(a);
  322. }
  323. static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
  324. return acc;
  325. }
  326. #if defined(__CUDACC__) || defined(__HIPCC__)
  327. inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
  328. return WARP_SHFL_DOWN(acc, offset);
  329. }
  330. #endif
  331. };
  332. template <typename acc_t, typename data_t>
  333. struct NanSumOps {
  334. inline C10_DEVICE acc_t reduce(acc_t a, data_t b, int64_t /*idx*/) const {
  335. return a + (at::_isnan(b) ? acc_t{0.} : acc_t{b});
  336. }
  337. inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
  338. return a + b;
  339. }
  340. inline C10_DEVICE data_t project(acc_t a) const {
  341. return data_t{a};
  342. }
  343. static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
  344. return acc;
  345. }
  346. #if defined(__CUDACC__) || defined(__HIPCC__)
  347. inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const {
  348. return WARP_SHFL_DOWN(data, offset);
  349. }
  350. #endif
  351. };
  352. namespace detail {
  353. template <typename scalar_t>
  354. struct LessOrNan {
  355. C10_DEVICE bool operator () (scalar_t a, scalar_t b, int64_t idx_a, int64_t idx_b) const {
  356. // If (a == b), then choose the one with lower idx, else min(a, b)
  357. if (at::_isnan(a)) {
  358. if (at::_isnan(b)) {
  359. return idx_a < idx_b;
  360. }
  361. return true;
  362. }
  363. return (a == b) ? idx_a < idx_b : (a < b);
  364. }
  365. };
  366. template <typename scalar_t>
  367. struct GreaterOrNan {
  368. C10_DEVICE bool operator () (scalar_t a, scalar_t b, int64_t idx_a, int64_t idx_b) const {
  369. // If (a == b), then choose the one with lower idx, else max(a, b)
  370. if (at::_isnan(a)) {
  371. if (at::_isnan(b)) {
  372. return idx_a < idx_b;
  373. }
  374. return true;
  375. }
  376. return (a == b) ? idx_a < idx_b : (a > b);
  377. }
  378. };
  379. template <typename comp_t>
  380. struct MinMaxReductionOps {
  381. using scalar_t = typename binary_function_traits<comp_t>::arg1_t;
  382. using index_t = int64_t;
  383. using arg_t = detail::pair<scalar_t, index_t>;
  384. static C10_DEVICE arg_t project(arg_t arg) {
  385. return arg;
  386. }
  387. static C10_DEVICE arg_t reduce(arg_t arg, scalar_t val, int64_t idx) {
  388. return comp_t{}(arg.first, val, arg.second, idx) ? arg : arg_t(val, idx);
  389. }
  390. static C10_DEVICE arg_t combine(arg_t a, arg_t b) {
  391. return comp_t{}(a.first, b.first, a.second, b.second) ? a : b;
  392. }
  393. static C10_DEVICE arg_t translate_idx(arg_t a, int64_t base_idx) {
  394. return {a.first, a.second + base_idx};
  395. }
  396. #if defined(__CUDACC__) || defined(__HIPCC__)
  397. static C10_DEVICE arg_t warp_shfl_down(arg_t arg, int offset) {
  398. return arg_t(WARP_SHFL_DOWN(arg.first, offset),
  399. WARP_SHFL_DOWN(arg.second, offset));
  400. }
  401. #endif
  402. };
  403. template <typename comp_t>
  404. struct ArgReductionOps : public MinMaxReductionOps<comp_t> {
  405. using typename MinMaxReductionOps<comp_t>::scalar_t;
  406. using typename MinMaxReductionOps<comp_t>::index_t;
  407. using typename MinMaxReductionOps<comp_t>::arg_t;
  408. static C10_DEVICE index_t project(arg_t arg) {
  409. return arg.second;
  410. }
  411. };
  412. } // namespace detail
  413. template <typename scalar_t>
  414. struct ArgMaxOps :
  415. public detail::ArgReductionOps<detail::GreaterOrNan<scalar_t>> {
  416. };
  417. template <typename scalar_t>
  418. struct ArgMinOps :
  419. public detail::ArgReductionOps<detail::LessOrNan<scalar_t>> {
  420. };
  421. template <typename scalar_t>
  422. struct MinOps :
  423. public detail::MinMaxReductionOps<detail::LessOrNan<scalar_t>> {
  424. };
  425. template <typename scalar_t>
  426. struct MaxOps :
  427. public detail::MinMaxReductionOps<detail::GreaterOrNan<scalar_t>> {
  428. };
  429. template <typename scalar_t, typename acc_scalar_t, typename index_t>
  430. struct MinMaxOps {
  431. using acc_t = detail::pair<acc_scalar_t, acc_scalar_t>;
  432. inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, index_t /*idx*/) const {
  433. return combine(acc, {data, data});
  434. }
  435. inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
  436. auto min_val = (at::_isnan(a.first) || a.first < b.first) ? a.first : b.first;
  437. auto max_val = (at::_isnan(a.second) || a.second > b.second) ? a.second : b.second;
  438. return {min_val, max_val};
  439. }
  440. inline C10_DEVICE acc_t project(acc_t acc) const {
  441. return acc;
  442. }
  443. static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
  444. return acc;
  445. }
  446. #if defined(__CUDACC__) || defined(__HIPCC__)
  447. inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
  448. return {
  449. WARP_SHFL_DOWN(acc.first, offset), WARP_SHFL_DOWN(acc.second, offset)
  450. };
  451. }
  452. #endif
  453. };
  454. }} // namespace at::native
  455. #undef MAX
  456. #undef MIN