#pragma once

#include <algorithm>
#include <atomic>
#include <cmath>
#include <cstdlib>
#include <limits>
#include <omp.h>

#include <ATen/core/PhiloxRNGEngine.h>
#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2)
#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#endif
#include <c10/util/BFloat16.h>
#include <c10/util/Half.h>

typedef at::Half half;
typedef at::BFloat16 bfloat16;

// Integral types use operator%, floating-point types dispatch to std::fmod.
template <typename T> inline T mod(T a, T b) { return a % b; }
template <> inline float mod(float a, float b) { return std::fmod(a, b); }
template <> inline double mod(double a, double b) { return std::fmod(a, b); }

constexpr float uint32_to_uniform_float(uint32_t value) {
  // maximum value such that `MAX_INT * scale < 1.0` (with float rounding)
  constexpr float scale = 4.6566127342e-10;
  return static_cast<float>(value & 0x7FFFFFFF) * scale;
}

float normalized_rand_cpu(uint32_t seed, uint32_t offset) {
  return uint32_to_uniform_float(at::Philox4_32(seed, 0, offset)());
}

float randn_cpu(uint32_t seed, uint32_t offset) {
  at::Philox4_32 engine(seed, 0, offset);
  return engine.randn(10);
}

// Maps a floating-point type to an unsigned integer type of the same width,
// so atomic_add below can run its compare-exchange loop on the bit pattern.
template <typename T> struct AsIntegerType { typedef T type; };
template <> struct AsIntegerType<float> { typedef uint32_t type; };
template <> struct AsIntegerType<double> { typedef uint64_t type; };

// Emulates an atomic fetch-add for types (e.g. float/double) that lack a
// native std::atomic fetch_add, by CAS-looping on the integer image.
template <typename T> void atomic_add(volatile T *addr, T offset) {
  typedef typename AsIntegerType<T>::type alt_type;

  static_assert(sizeof(std::atomic<alt_type>) == sizeof(T),
                "std::atomic issue");

  alt_type expected;
  alt_type desired;

  std::atomic<alt_type> *atomic_addr = (std::atomic<alt_type> *)addr;
  do {
    T val = *addr;
    reinterpret_cast<T *>(&expected)[0] = val;
    reinterpret_cast<T *>(&desired)[0] = val + offset;
  } while (!atomic_addr->compare_exchange_weak(
      expected, desired, std::memory_order_relaxed));
}

// This function is used to convert bool or uint8 to float mask for
// vectorization. The caller needs to make sure the src represents TRUE/FALSE
// correctly.
template <typename T>
void flag_to_float(const T* src, float* dst, int64_t n) {
#pragma unroll
  for (int64_t i = 0; i < n; i++) {
    uint32_t* dst_u32 = (uint32_t*)dst;
    dst_u32[i] = *(src + i) ? 0xFFFFFFFF : 0;
  }
}

// Broadcast overload: fills the whole mask from a single scalar flag.
template <typename T,
          std::enable_if_t<std::is_same<T, bool>::value ||
                               std::is_same<T, uint8_t>::value,
                           bool> = true>
void flag_to_float(T src, float* dst, int64_t n) {
#pragma unroll
  for (int64_t i = 0; i < n; i++) {
    uint32_t* dst_u32 = (uint32_t*)dst;
    dst_u32[i] = src ? 0xFFFFFFFF : 0;
  }
}

#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2)
template <typename SRC>
inline at::vec::Vectorized<float> to_float_mask(at::vec::Vectorized<SRC>& src) {
  assert(
      at::vec::Vectorized<float>::size() == at::vec::Vectorized<SRC>::size());
  at::vec::Vectorized<float> res_vec(0);
#pragma unroll
  for (int i = 0; i < at::vec::Vectorized<float>::size(); i++) {
    res_vec[i] = src[i] ? 0xFFFFFFFF : 0;
  }
  return res_vec;
}

template <>
inline at::vec::Vectorized<float> to_float_mask(at::vec::Vectorized<int>& src) {
  // Bitcast rather than value-convert: the int lanes already hold the
  // all-ones/all-zeros mask pattern, and a cvtepi32 conversion would turn
  // -1 into -1.0f and break blend-based masking.
#if defined(CPU_CAPABILITY_AVX2)
  return at::vec::Vectorized<float>(_mm256_castsi256_ps(src));
#else
  return at::vec::Vectorized<float>(_mm512_castsi512_ps(src));
#endif
}
#endif
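
// ---------------------------------------------------------------------------
// Usage sketch (an illustration, not part of the generated prefix): the
// pattern an Inductor-generated kernel follows when it scatters into a shared
// output buffer under OpenMP. `scatter_add_example`, `mask_example` and their
// parameters are hypothetical names; only atomic_add() and flag_to_float()
// above come from this header.
inline void scatter_add_example(const float* src, const int64_t* index,
                                float* out, int64_t n) {
#pragma omp parallel for
  for (int64_t i = 0; i < n; i++) {
    // Concurrent accumulation into out[index[i]] across threads: atomic_add()
    // emulates atomic fetch-add for float via a compare-exchange loop on the
    // 32-bit integer image of the value.
    atomic_add(&out[index[i]], src[i]);
  }
}

// Expanding a bool buffer into a float bit-mask that vectorized select/blend
// code can consume: each true flag becomes the all-ones pattern 0xFFFFFFFF
// (NaN when viewed as a float), each false flag becomes 0x00000000.
inline void mask_example(const bool* flags, float* fmask, int64_t n) {
  flag_to_float(flags, fmask, n);
}
// ---------------------------------------------------------------------------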