123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103 |
#include <algorithm>
#include <atomic>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <omp.h>
#include <ATen/core/PhiloxRNGEngine.h>
#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2)
#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#endif
#include <c10/util/BFloat16.h>
#include <c10/util/Half.h>
- typedef at::Half half;
- typedef at::BFloat16 bfloat16;
// Remainder of `a` divided by `b`.
//
// The primary template applies the built-in `%` operator, so it is only
// usable for types that support it. The `float` and `double`
// specializations delegate to std::fmod instead.
template <typename T>
inline T mod(T a, T b) {
  const T remainder = a % b;
  return remainder;
}

template <>
inline float mod<float>(float a, float b) {
  return std::fmod(a, b);
}

template <>
inline double mod<double>(double a, double b) {
  return std::fmod(a, b);
}
// Maps a 32-bit word to a float: the top bit is discarded, the remaining
// 31-bit value is converted to float and multiplied by a fixed scale.
constexpr float uint32_to_uniform_float(uint32_t value) {
  // maximum value such that `MAX_INT * scale < 1.0` (with float rounding)
  constexpr float kScale = 4.6566127342e-10;
  // Keep only the low 31 bits of the input word.
  const uint32_t low31 = value & 0x7FFFFFFF;
  return static_cast<float>(low31) * kScale;
}
- float normalized_rand_cpu(uint32_t seed, uint32_t offset) {
- return uint32_to_uniform_float(at::Philox4_32(seed, 0, offset)());
- }
- float randn_cpu(uint32_t seed, uint32_t offset) {
- at::Philox4_32 engine(seed, 0, offset);
- return engine.randn(10);
- }
// Maps a value type to the same-sized integer type used to carry its bit
// pattern through std::atomic compare-exchange. Types without a
// specialization map to themselves.
template <typename T> struct AsIntegerType { using type = T; };
template <> struct AsIntegerType<float> { using type = uint32_t; };
template <> struct AsIntegerType<double> { using type = uint64_t; };

// Atomically performs `*addr += offset` with a compare-exchange loop.
//
// addr:   location to update; it is accessed as a
//         std::atomic<AsIntegerType<T>::type>, which the static_assert checks
//         has the same size as T.
// offset: amount to add.
//
// All atomic operations use std::memory_order_relaxed, so the update itself
// is atomic but imposes no ordering on surrounding memory accesses.
template <typename T> void atomic_add(volatile T *addr, T offset) {
  using alt_type = typename AsIntegerType<T>::type;

  static_assert(sizeof(std::atomic<alt_type>) == sizeof(T),
                "std::atomic issue");

  auto *atomic_addr =
      reinterpret_cast<std::atomic<alt_type> *>(const_cast<T *>(addr));

  // Read the starting value through the atomic view rather than with a plain
  // load, so the read cannot tear against concurrent compare-exchanges.
  alt_type expected = atomic_addr->load(std::memory_order_relaxed);
  alt_type desired;
  do {
    // Move bit patterns between T and alt_type with memcpy: writing a float
    // through a pointer to an integer object violates strict aliasing.
    T current;
    std::memcpy(&current, &expected, sizeof(T));
    const T updated = static_cast<T>(current + offset);
    std::memcpy(&desired, &updated, sizeof(T));
    // On failure compare_exchange_weak stores the value it observed into
    // `expected`, so the next iteration already starts from fresh data.
  } while (!atomic_addr->compare_exchange_weak(expected, desired,
                                               std::memory_order_relaxed));
}
// Converts an array of flags (bool or uint8) into a float mask for
// vectorization: the 32-bit pattern 0xFFFFFFFF is stored into dst[i] when
// src[i] is truthy, and all-zero bits otherwise. It is the caller's
// responsibility that src encodes TRUE/FALSE correctly.
//
// src: n input flags.
// dst: n output slots, written as raw 32-bit patterns (not numeric floats).
// n:   number of elements to convert; nothing is written when n <= 0.
template <typename T>
void flag_to_float(const T* src, float* dst, int64_t n) {
  // The mask is written bit-for-bit, so address the output as 32-bit words.
  uint32_t* const mask_words = reinterpret_cast<uint32_t*>(dst);
#pragma unroll
  for (int64_t idx = 0; idx < n; ++idx) {
    mask_words[idx] = src[idx] ? 0xFFFFFFFF : 0;
  }
}
// Scalar overload (enabled only for bool and uint8_t): fills dst[0..n) with
// the 32-bit pattern 0xFFFFFFFF when src is truthy, and with all-zero bits
// otherwise. Nothing is written when n <= 0.
template <typename T, std::enable_if_t<std::is_same<T, bool>::value || std::is_same<T, uint8_t>::value, bool> = true>
void flag_to_float(T src, float* dst, int64_t n) {
  // Every slot receives the same word, so pick it once up front.
  const uint32_t fill_word = src ? 0xFFFFFFFF : 0;
  uint32_t* const mask_words = reinterpret_cast<uint32_t*>(dst);
#pragma unroll
  for (int64_t idx = 0; idx < n; ++idx) {
    mask_words[idx] = fill_word;
  }
}
#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2)
// Converts a vector of flags into a float mask vector: each output lane holds
// the 32-bit pattern 0xFFFFFFFF when the matching input lane is truthy, and
// all-zero bits otherwise (the same encoding flag_to_float produces).
//
// The lanes are written as raw bit patterns. Assigning the integer
// 0xFFFFFFFF to a float lane would instead store the numeric value
// 4294967296.0f, which is not an all-ones mask.
//
// NOTE(review): the scratch-array bounds rely on Vectorized<T>::size() being
// usable in a constant expression, and on Vectorized providing
// store(ptr) / static loadu(ptr) — confirm against ATen/cpu/vec.
template <typename SRC>
inline at::vec::Vectorized<float> to_float_mask(at::vec::Vectorized<SRC>& src) {
  assert(
      at::vec::Vectorized<float>::size() == at::vec::Vectorized<SRC>::size());
  // Spill the input lanes so they can be inspected one by one.
  SRC src_lanes[at::vec::Vectorized<SRC>::size()];
  float mask_lanes[at::vec::Vectorized<float>::size()];
  src.store(src_lanes);
#pragma unroll
  for (int i = 0; i < at::vec::Vectorized<float>::size(); i++) {
    const uint32_t lane_bits = src_lanes[i] ? 0xFFFFFFFF : 0;
    // memcpy keeps the exact bit pattern without aliasing float as uint32_t.
    std::memcpy(&mask_lanes[i], &lane_bits, sizeof(lane_bits));
  }
  return at::vec::Vectorized<float>::loadu(mask_lanes);
}

// Specialization for int lanes: reinterprets the integer lanes as float
// lanes bit-for-bit. A value conversion (cvtepi32_ps) would turn an all-ones
// lane (-1) into -1.0f, which is not an all-ones float mask.
// NOTE(review): assumes every lane of src is already either 0 or all-ones
// (e.g. the result of a vector comparison) — confirm with callers.
template <>
inline at::vec::Vectorized<float> to_float_mask(at::vec::Vectorized<int>& src) {
#if defined(CPU_CAPABILITY_AVX2)
  return at::vec::Vectorized<float>(_mm256_castsi256_ps(src));
#else
  return at::vec::Vectorized<float>(_mm512_castsi512_ps(src));
#endif
}
#endif
|