// cpp_prefix.h
  #include <algorithm>
  #include <atomic>
  #include <cmath>
  #include <cstdlib>
  #include <cstring>
  #include <limits>
  #include <omp.h>
  #include <ATen/core/PhiloxRNGEngine.h>
  #if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2)
  #include <ATen/cpu/vec/functional.h>
  #include <ATen/cpu/vec/vec.h>
  #endif
  #include <c10/util/BFloat16.h>
  #include <c10/util/Half.h>
  14. typedef at::Half half;
  15. typedef at::BFloat16 bfloat16;
  // Remainder of a / b.
  // The primary template applies the built-in `%` operator; the float and
  // double specializations forward to std::fmod because `%` is not defined
  // for floating-point operands.
  template <typename T>
  inline T mod(T a, T b) {
    return a % b;
  }

  template <>
  inline float mod<float>(float a, float b) {
    return std::fmod(a, b);
  }

  template <>
  inline double mod<double>(double a, double b) {
    return std::fmod(a, b);
  }
  // Maps a 32-bit random word to a float: the top bit is masked off and the
  // remaining 31-bit value is multiplied by a fixed scale factor.
  constexpr float uint32_to_uniform_float(uint32_t value) {
    // maximum value such that `MAX_INT * scale < 1.0` (with float rounding)
    constexpr float kScale = 4.6566127342e-10;
    // Only the low 31 bits take part in the conversion.
    const uint32_t low_bits = value & 0x7FFFFFFF;
    return static_cast<float>(low_bits) * kScale;
  }
  24. float normalized_rand_cpu(uint32_t seed, uint32_t offset) {
  25. return uint32_to_uniform_float(at::Philox4_32(seed, 0, offset)());
  26. }
  27. float randn_cpu(uint32_t seed, uint32_t offset) {
  28. at::Philox4_32 engine(seed, 0, offset);
  29. return engine.randn(10);
  30. }
  // Maps a value type to the integer type used to carry its bit pattern
  // through std::atomic. float and double map to the unsigned integer of the
  // same width; every other type maps to itself.
  template <typename T> struct AsIntegerType { typedef T type; };
  template <> struct AsIntegerType<float> { typedef uint32_t type; };
  template <> struct AsIntegerType<double> { typedef uint64_t type; };

  // Atomically performs `*addr += offset` with a compare-and-swap loop using
  // relaxed memory ordering.
  //
  // addr   - location to update; it is accessed as a
  //          std::atomic<AsIntegerType<T>::type>, so it must be suitably sized
  //          (checked by static_assert) and aligned for that atomic type.
  // offset - value added to the current content of *addr.
  template <typename T> void atomic_add(volatile T *addr, T offset) {
    typedef typename AsIntegerType<T>::type alt_type;
    static_assert(sizeof(std::atomic<alt_type>) == sizeof(T),
                  "std::atomic issue");
    // The bit patterns are moved with memcpy below, so both sides must have
    // exactly the same size.
    static_assert(sizeof(alt_type) == sizeof(T), "std::atomic issue");

    std::atomic<alt_type> *atomic_addr =
        reinterpret_cast<std::atomic<alt_type> *>(const_cast<T *>(addr));

    // Read the starting value through the atomic object instead of a plain
    // load of *addr, so the read cannot race with concurrent CAS writers.
    alt_type expected = atomic_addr->load(std::memory_order_relaxed);
    alt_type desired;
    do {
      // memcpy moves the bits between T and its integer carrier without the
      // aliasing violation of casting one pointer type to the other.
      T val;
      std::memcpy(&val, &expected, sizeof(T));
      T sum = val + offset;
      std::memcpy(&desired, &sum, sizeof(T));
      // On failure compare_exchange_weak stores the value it observed into
      // `expected`, so the next iteration recomputes from fresh data without
      // re-reading *addr.
    } while (!atomic_addr->compare_exchange_weak(expected, desired,
                                                 std::memory_order_relaxed));
  }
  // This function is used to convert bool or uint8 to float mask for
  // vectorization. The caller needs to make sure the src represents TRUE/FALSE
  // correctly.
  //
  // For every i in [0, n), dst[i] receives the bit pattern 0xFFFFFFFF when
  // src[i] converts to true and 0x00000000 otherwise.
  //
  // src - n flag values.
  // dst - output buffer with room for n floats.
  // n   - number of elements to convert; nothing is written when n <= 0.
  template <typename T>
  void flag_to_float(const T* src, float* dst, int64_t n) {
  #pragma unroll
    for (int64_t i = 0; i < n; i++) {
      const uint32_t mask_bits = src[i] ? 0xFFFFFFFFu : 0u;
      // Copy the bits into the float slot; storing through a uint32_t* that
      // aliases float storage would violate the aliasing rules.
      std::memcpy(dst + i, &mask_bits, sizeof(mask_bits));
    }
  }
  // Scalar overload: fills dst[0..n) with one mask value derived from a single
  // bool / uint8_t flag. Every element receives the bit pattern 0xFFFFFFFF
  // when src converts to true and 0x00000000 otherwise.
  //
  // src - the flag; the overload only participates for T = bool or uint8_t.
  // dst - output buffer with room for n floats.
  // n   - number of elements to fill; nothing is written when n <= 0.
  template <typename T, std::enable_if_t<std::is_same<T, bool>::value || std::is_same<T, uint8_t>::value, bool> = true>
  void flag_to_float(T src, float* dst, int64_t n) {
    // The mask does not depend on the loop index, so compute it once.
    const uint32_t mask_bits = src ? 0xFFFFFFFFu : 0u;
  #pragma unroll
    for (int64_t i = 0; i < n; i++) {
      // Copy the bits into the float slot; storing through a uint32_t* that
      // aliases float storage would violate the aliasing rules.
      std::memcpy(dst + i, &mask_bits, sizeof(mask_bits));
    }
  }
  #if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2)
  // Converts a vector of flags into a float mask vector: each output lane
  // holds the bit pattern 0xFFFFFFFF when the matching source lane converts to
  // true and 0x00000000 otherwise (the same encoding flag_to_float writes).
  //
  // Precondition (asserted): Vectorized<SRC> has as many lanes as
  // Vectorized<float>.
  //
  // The lanes are staged through plain arrays so the mask can be written as
  // raw bits; assigning the integer 0xFFFFFFFF to a float lane would convert
  // it numerically instead of producing an all-ones pattern.
  template <typename SRC>
  inline at::vec::Vectorized<float> to_float_mask(at::vec::Vectorized<SRC>& src) {
    assert(
        at::vec::Vectorized<float>::size() == at::vec::Vectorized<SRC>::size());
    SRC src_lanes[at::vec::Vectorized<SRC>::size()];
    float mask_lanes[at::vec::Vectorized<float>::size()];
    src.store(src_lanes);
  #pragma unroll
    for (int i = 0; i < at::vec::Vectorized<float>::size(); i++) {
      const uint32_t mask_bits = src_lanes[i] ? 0xFFFFFFFFu : 0u;
      std::memcpy(mask_lanes + i, &mask_bits, sizeof(mask_bits));
    }
    return at::vec::Vectorized<float>::loadu(mask_lanes);
  }

  // Specialization for integer vectors: the lanes are reinterpreted as floats
  // bit-for-bit, so an all-ones integer lane stays an all-ones float lane.
  // A value conversion (cvtepi32_ps) would turn -1 into -1.0f, which is not an
  // all-ones mask.
  // NOTE(review): assumes the caller's int lanes are already 0 / all-ones
  // (e.g. the result of a vector comparison) -- confirm against callers.
  template <>
  inline at::vec::Vectorized<float> to_float_mask(at::vec::Vectorized<int>& src) {
  #if defined(CPU_CAPABILITY_AVX2)
    return at::vec::Vectorized<float>(_mm256_castsi256_ps(src));
  #else
    return at::vec::Vectorized<float>(_mm512_castsi512_ps(src));
  #endif
  }
  #endif