#pragma once

// DO NOT DEFINE STATIC DATA IN THIS HEADER!
// See Note [Do not compile initializers with AVX]

#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>

#if !(defined(__VSX__) || defined(CPU_CAPABILITY_VSX) || defined(CPU_CAPABILITY_ZVECTOR))
#include <ATen/cpu/vec/vec256/vec256_float.h>
#include <ATen/cpu/vec/vec256/vec256_float_neon.h>
#include <ATen/cpu/vec/vec256/vec256_bfloat16.h>
#include <ATen/cpu/vec/vec256/vec256_double.h>
#include <ATen/cpu/vec/vec256/vec256_int.h>
#include <ATen/cpu/vec/vec256/vec256_qint.h>
#include <ATen/cpu/vec/vec256/vec256_complex_float.h>
#include <ATen/cpu/vec/vec256/vec256_complex_double.h>
#elif defined(__VSX__) || defined(CPU_CAPABILITY_VSX)
#include <ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h>
#else
#include <ATen/cpu/vec/vec256/zarch/vec256_zarch.h>
#include <ATen/cpu/vec/vec256/vec256_bfloat16.h>
#endif

#include <algorithm>
#include <cstddef>
#include <cstring>
#include <ostream>
#include <utility>

namespace at {
namespace vec {

// Note [CPU_CAPABILITY namespace]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// This header, and all of its subheaders, will be compiled with
// different architecture flags for each supported set of vector
// intrinsics. So we need to make sure they aren't inadvertently
// linked together. We do this by declaring objects in an `inline
// namespace` which changes the name mangling, but can still be
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {

inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
  stream << val.val_;
  return stream;
}
inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
  stream << static_cast<int>(val.val_);
  return stream;
}
inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
  stream << static_cast<unsigned int>(val.val_);
  return stream;
}

template <typename T>
std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
  T buf[Vectorized<T>::size()];
  vec.store(buf);
  stream << "vec[";
  for (int i = 0; i != Vectorized<T>::size(); i++) {
    if (i != 0) {
      stream << ", ";
    }
    stream << buf[i];
  }
  stream << "]";
  return stream;
}

#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

template<>
inline Vectorized<float> cast<float, double>(const Vectorized<double>& src) {
  return _mm256_castpd_ps(src);
}

template<>
inline Vectorized<double> cast<double, float>(const Vectorized<float>& src) {
  return _mm256_castps_pd(src);
}

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

template<int64_t scale = 1>
std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
inline gather(const double* base_addr, const Vectorized<int64_t>& vindex) {
  return _mm256_i64gather_pd(base_addr, vindex, scale);
}

template<int64_t scale = 1>
std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<float>>
inline gather(const float* base_addr, const Vectorized<int32_t>& vindex) {
  return _mm256_i32gather_ps(base_addr, vindex, scale);
}

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

template<int64_t scale = 1>
std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
inline mask_gather(const Vectorized<double>& src, const double* base_addr,
                   const Vectorized<int64_t>& vindex, const Vectorized<double>& mask) {
  return _mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale);
}

template<int64_t scale = 1>
std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<float>>
inline mask_gather(const Vectorized<float>& src, const float* base_addr,
                   const Vectorized<int32_t>& vindex, const Vectorized<float>& mask) {
  return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale);
}

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

// Only works for inputs in the range: [-2^51, 2^51]
// From: https://stackoverflow.com/a/41148578
template<>
Vectorized<int64_t>
inline convert_to_int_of_same_size<double>(const Vectorized<double>& src) {
  auto x = _mm256_add_pd(src, _mm256_set1_pd(0x0018000000000000));
  return _mm256_sub_epi64(
      _mm256_castpd_si256(x),
      _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000))
  );
}

template<>
Vectorized<int32_t>
inline convert_to_int_of_same_size<float>(const Vectorized<float>& src) {
  return _mm256_cvttps_epi32(src);
}
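// A worked example of the double -> int64_t bit trick above (an illustrative
// note, not part of the original header; the concrete numbers follow from the
// IEEE-754 double layout): the literal 0x0018000000000000 equals 2^52 + 2^51,
// so _mm256_set1_pd broadcasts the double 6755399441055744.0. For any
// |x| <= 2^51, the sum x + 2^52 + 2^51 keeps a fixed exponent, so x lands
// verbatim in the low mantissa bits, and subtracting the magic number's bit
// pattern as a 64-bit integer recovers x. For x = 3.0:
//   bits(6755399441055744.0)       = 0x4338000000000000
//   bits(3.0 + 6755399441055744.0) = 0x4338000000000003
//   0x4338000000000003 - 0x4338000000000000 = 3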
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

template <>
std::pair<Vectorized<double>, Vectorized<double>>
inline interleave2<double>(const Vectorized<double>& a, const Vectorized<double>& b) {
  // inputs:
  //   a = {a0, a1, a2, a3}
  //   b = {b0, b1, b2, b3}

  // swap lanes:
  //   a_swapped = {a0, a1, b0, b1}
  //   b_swapped = {a2, a3, b2, b3}
  auto a_swapped = _mm256_permute2f128_pd(a, b, 0b0100000);  // 0, 2.   4 bits apart
  auto b_swapped = _mm256_permute2f128_pd(a, b, 0b0110001);  // 1, 3.   4 bits apart

  // group cols crossing lanes:
  //   return {a0, b0, a1, b1}
  //          {a2, b2, a3, b3}
  return std::make_pair(_mm256_permute4x64_pd(a_swapped, 0b11011000),   // 0, 2, 1, 3
                        _mm256_permute4x64_pd(b_swapped, 0b11011000));  // 0, 2, 1, 3
}

template <>
std::pair<Vectorized<float>, Vectorized<float>>
inline interleave2<float>(const Vectorized<float>& a, const Vectorized<float>& b) {
  // inputs:
  //   a = {a0, a1, a2, a3, a4, a5, a6, a7}
  //   b = {b0, b1, b2, b3, b4, b5, b6, b7}

  // swap lanes:
  //   a_swapped = {a0, a1, a2, a3, b0, b1, b2, b3}
  //   b_swapped = {a4, a5, a6, a7, b4, b5, b6, b7}
  // TODO: can we support caching this?
  auto a_swapped = _mm256_permute2f128_ps(a, b, 0b0100000);  // 0, 2.   4 bits apart
  auto b_swapped = _mm256_permute2f128_ps(a, b, 0b0110001);  // 1, 3.   4 bits apart

  // group cols crossing lanes:
  //   return {a0, b0, a1, b1, a2, b2, a3, b3}
  //          {a4, b4, a5, b5, a6, b6, a7, b7}
  const __m256i group_ctrl = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
  return std::make_pair(_mm256_permutevar8x32_ps(a_swapped, group_ctrl),
                        _mm256_permutevar8x32_ps(b_swapped, group_ctrl));
}

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

template <>
std::pair<Vectorized<double>, Vectorized<double>>
inline deinterleave2<double>(const Vectorized<double>& a, const Vectorized<double>& b) {
  // inputs:
  //   a = {a0, b0, a1, b1}
  //   b = {a2, b2, a3, b3}

  // group cols crossing lanes:
  //   a_grouped = {a0, a1, b0, b1}
  //   b_grouped = {a2, a3, b2, b3}
  auto a_grouped = _mm256_permute4x64_pd(a, 0b11011000);  // 0, 2, 1, 3
  auto b_grouped = _mm256_permute4x64_pd(b, 0b11011000);  // 0, 2, 1, 3

  // swap lanes:
  //   return {a0, a1, a2, a3}
  //          {b0, b1, b2, b3}
  return std::make_pair(_mm256_permute2f128_pd(a_grouped, b_grouped, 0b0100000),   // 0, 2.   4 bits apart
                        _mm256_permute2f128_pd(a_grouped, b_grouped, 0b0110001));  // 1, 3.   4 bits apart
}

template <>
std::pair<Vectorized<float>, Vectorized<float>>
inline deinterleave2<float>(const Vectorized<float>& a, const Vectorized<float>& b) {
  // inputs:
  //   a = {a0, b0, a1, b1, a2, b2, a3, b3}
  //   b = {a4, b4, a5, b5, a6, b6, a7, b7}

  // group cols crossing lanes:
  //   a_grouped = {a0, a1, a2, a3, b0, b1, b2, b3}
  //   b_grouped = {a4, a5, a6, a7, b4, b5, b6, b7}
  // TODO: can we support caching this?
  const __m256i group_ctrl = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
  auto a_grouped = _mm256_permutevar8x32_ps(a, group_ctrl);
  auto b_grouped = _mm256_permutevar8x32_ps(b, group_ctrl);

  // swap lanes:
  //   return {a0, a1, a2, a3, a4, a5, a6, a7}
  //          {b0, b1, b2, b3, b4, b5, b6, b7}
  return std::make_pair(_mm256_permute2f128_ps(a_grouped, b_grouped, 0b0100000),   // 0, 2.   4 bits apart
                        _mm256_permute2f128_ps(a_grouped, b_grouped, 0b0110001));  // 1, 3.   4 bits apart
}
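// Usage sketch (illustrative, not part of the original header; it only assumes
// Vectorized<T>::loadu from vec_base.h): interleave2 and deinterleave2 are
// inverses of each other, e.g. for float:
//   float xs[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//   float ys[8] = {10, 11, 12, 13, 14, 15, 16, 17};
//   auto a = Vectorized<float>::loadu(xs);
//   auto b = Vectorized<float>::loadu(ys);
//   auto [lo, hi] = interleave2(a, b);
//   // lo = {0, 10, 1, 11, 2, 12, 3, 13}, hi = {4, 14, 5, 15, 6, 16, 7, 17}
//   auto [a2, b2] = deinterleave2(lo, hi);  // a2 == a, b2 == b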
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

template<>
inline Vectorized<float> flip(const Vectorized<float>& v) {
  const __m256i mask_float = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
  return _mm256_permutevar8x32_ps(v, mask_float);
}

template<>
inline Vectorized<double> flip(const Vectorized<double>& v) {
  return _mm256_permute4x64_pd(v, 27);  // 27 == _MM_SHUFFLE(0, 1, 2, 3)
}

template<>
inline Vectorized<int64_t> flip(const Vectorized<int64_t>& v) {
  return _mm256_permute4x64_epi64(v, 27);  // 27 == _MM_SHUFFLE(0, 1, 2, 3)
}

template<>
inline Vectorized<int32_t> flip(const Vectorized<int32_t>& v) {
  const __m256i mask_int32 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
  return _mm256_permutevar8x32_epi32(v, mask_int32);
}

template<>
inline Vectorized<int16_t> flip(const Vectorized<int16_t>& v) {
  const __m256i mask = _mm256_set_epi8(
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  );
  auto reversed = _mm256_shuffle_epi8(v, mask);
  return _mm256_permute2x128_si256(reversed, reversed, 1);
}

inline __m256i flip8(const __m256i& v) {
  const __m256i mask_int8 = _mm256_set_epi8(
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  );
  auto reversed = _mm256_shuffle_epi8(v, mask_int8);
  return _mm256_permute2x128_si256(reversed, reversed, 1);
}

template<>
inline Vectorized<int8_t> flip(const Vectorized<int8_t>& v) {
  return flip8(v);
}

template<>
inline Vectorized<uint8_t> flip(const Vectorized<uint8_t>& v) {
  return flip8(v);
}

#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)

} // namespace CPU_CAPABILITY
} // namespace vec
} // namespace at
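// Usage sketch for flip (illustrative, not part of the original header):
// flip reverses the element order of one 256-bit vector, e.g. when
// vectorizing a torch.flip-style kernel:
//   float xs[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//   auto v = at::vec::Vectorized<float>::loadu(xs);
//   auto r = at::vec::flip(v);  // r = {7, 6, 5, 4, 3, 2, 1, 0}
//   float out[8];
//   r.store(out);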