123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320 |
- #pragma once
- // DO NOT DEFINE STATIC DATA IN THIS HEADER!
- // See Note [Do not compile initializers with AVX]
- #include <ATen/cpu/vec/vec.h>
- #include <c10/util/irange.h>
- namespace at { namespace vec {
- // slow path
- template <typename scalar_t, typename Op>
- inline scalar_t vec_reduce_all(
- const Op& vec_fun,
- vec::Vectorized<scalar_t> acc_vec,
- int64_t size) {
- using Vec = vec::Vectorized<scalar_t>;
- scalar_t acc_arr[Vec::size()];
- acc_vec.store(acc_arr);
- for (const auto i : c10::irange(1, size)) {
- std::array<scalar_t, Vec::size()> acc_arr_next = {0};
- acc_arr_next[0] = acc_arr[i];
- Vec acc_vec_next = Vec::loadu(acc_arr_next.data());
- acc_vec = vec_fun(acc_vec, acc_vec_next);
- }
- acc_vec.store(acc_arr);
- return acc_arr[0];
- }
- template <typename scalar_t, typename Op>
- struct VecReduceAllSIMD {
- static inline scalar_t apply(const Op& vec_fun, Vectorized<scalar_t> acc_vec) {
- return vec_reduce_all(vec_fun, acc_vec, Vectorized<scalar_t>::size());
- }
- };
- #if defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && !defined(C10_MOBILE)
- #if defined(CPU_CAPABILITY_AVX2)
- template <typename Op>
- struct VecReduceAllSIMD<float, Op> {
- static inline float apply(const Op& vec_fun, Vectorized<float> acc_vec) {
- using Vec = Vectorized<float>;
- Vec v = acc_vec;
- // 128-bit shuffle
- Vec v1 = _mm256_permute2f128_ps(v, v, 0x1);
- v = vec_fun(v, v1);
- // 64-bit shuffle
- v1 = _mm256_shuffle_ps(v, v, 0x4E);
- v = vec_fun(v, v1);
- // 32-bit shuffle
- v1 = _mm256_shuffle_ps(v, v, 0xB1);
- v = vec_fun(v, v1);
- return _mm256_cvtss_f32(v);
- }
- };
- #endif // defined(CPU_CAPABILITY_AVX2)
- #if defined(CPU_CAPABILITY_AVX512)
- template <typename Op>
- struct VecReduceAllSIMD<float, Op> {
- static inline float apply(const Op& vec_fun, Vectorized<float> acc_vec) {
- using Vec = Vectorized<float>;
- Vec v = acc_vec;
- // 256-bit shuffle
- Vec v1 = _mm512_shuffle_f32x4(v, v, 0x4E);
- v = vec_fun(v, v1);
- // 128-bit shuffle
- v1 = _mm512_shuffle_f32x4(v, v, 0xB1);
- v = vec_fun(v, v1);
- // 64-bit shuffle
- v1 = _mm512_shuffle_ps(v, v, 0x4E);
- v = vec_fun(v, v1);
- // 32-bit shuffle
- v1 = _mm512_shuffle_ps(v, v, 0xB1);
- v = vec_fun(v, v1);
- return _mm512_cvtss_f32(v);
- }
- };
- #endif // defined(CPU_CAPABILITY_AVX512)
- #endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && !defined(C10_MOBILE)
- template <typename scalar_t, typename Op>
- inline scalar_t vec_reduce_all(const Op& vec_fun, Vectorized<scalar_t> acc_vec) {
- return VecReduceAllSIMD<scalar_t, Op>::apply(vec_fun, acc_vec);
- }
- template <typename scalar_t, typename Op>
- inline scalar_t reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size) {
- using Vec = vec::Vectorized<scalar_t>;
- if (size < Vec::size())
- return vec_reduce_all(vec_fun, Vec::loadu(data, size), size);
- int64_t d = Vec::size();
- Vec acc_vec = Vec::loadu(data);
- for (; d < size - (size % Vec::size()); d += Vec::size()) {
- Vec data_vec = Vec::loadu(data + d);
- acc_vec = vec_fun(acc_vec, data_vec);
- }
- if (size - d > 0) {
- Vec data_vec = Vec::loadu(data + d, size - d);
- acc_vec = Vec::set(acc_vec, vec_fun(acc_vec, data_vec), size - d);
- }
- return vec_reduce_all(vec_fun, acc_vec);
- }
- // similar to reduce_all, but reduces into two outputs
- template <typename scalar_t, typename Op1, typename Op2>
- inline std::pair<scalar_t, scalar_t> reduce2_all(const Op1& vec_fun1, const Op2& vec_fun2,
- const scalar_t* data, int64_t size) {
- using Vec = vec::Vectorized<scalar_t>;
- if (size < Vec::size()) {
- auto loaded_data = Vec::loadu(data, size);
- return std::pair<scalar_t, scalar_t>(
- vec_reduce_all(vec_fun1, loaded_data, size),
- vec_reduce_all(vec_fun2, loaded_data, size));
- }
- int64_t d = Vec::size();
- Vec acc_vec1 = Vec::loadu(data);
- Vec acc_vec2 = Vec::loadu(data);
- for (; d < size - (size % Vec::size()); d += Vec::size()) {
- Vec data_vec = Vec::loadu(data + d);
- acc_vec1 = vec_fun1(acc_vec1, data_vec);
- acc_vec2 = vec_fun2(acc_vec2, data_vec);
- }
- if (size - d > 0) {
- Vec data_vec = Vec::loadu(data + d, size - d);
- acc_vec1 = Vec::set(acc_vec1, vec_fun1(acc_vec1, data_vec), size - d);
- acc_vec2 = Vec::set(acc_vec2, vec_fun2(acc_vec2, data_vec), size - d);
- }
- return std::pair<scalar_t, scalar_t>(
- vec_reduce_all(vec_fun1, acc_vec1),
- vec_reduce_all(vec_fun2, acc_vec2));
- }
- template <typename scalar_t, typename MapOp, typename ReduceOp>
- inline scalar_t map_reduce_all(
- const MapOp& map_fun,
- const ReduceOp& red_fun,
- const scalar_t* data,
- int64_t size) {
- using Vec = vec::Vectorized<scalar_t>;
- if (size < Vec::size())
- return vec_reduce_all(red_fun, map_fun(Vec::loadu(data, size)), size);
- int64_t d = Vec::size();
- Vec acc_vec = map_fun(Vec::loadu(data));
- for (; d < size - (size % Vec::size()); d += Vec::size()) {
- Vec data_vec = Vec::loadu(data + d);
- data_vec = map_fun(data_vec);
- acc_vec = red_fun(acc_vec, data_vec);
- }
- if (size - d > 0) {
- Vec data_vec = Vec::loadu(data + d, size - d);
- data_vec = map_fun(data_vec);
- acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
- }
- return vec_reduce_all(red_fun, acc_vec);
- }
- template <typename scalar_t, typename MapOp, typename ReduceOp>
- inline scalar_t map2_reduce_all(
- const MapOp& map_fun,
- const ReduceOp& red_fun,
- const scalar_t* data,
- const scalar_t* data2,
- int64_t size) {
- using Vec = vec::Vectorized<scalar_t>;
- if (size < Vec::size()) {
- Vec data_vec = Vec::loadu(data, size);
- Vec data2_vec = Vec::loadu(data2, size);
- data_vec = map_fun(data_vec, data2_vec);
- return vec_reduce_all(red_fun, data_vec, size);
- }
- int64_t d = Vec::size();
- Vec acc_vec = map_fun(Vec::loadu(data), Vec::loadu(data2));
- for (; d < size - (size % Vec::size()); d += Vec::size()) {
- Vec data_vec = Vec::loadu(data + d);
- Vec data2_vec = Vec::loadu(data2 + d);
- data_vec = map_fun(data_vec, data2_vec);
- acc_vec = red_fun(acc_vec, data_vec);
- }
- if (size - d > 0) {
- Vec data_vec = Vec::loadu(data + d, size - d);
- Vec data2_vec = Vec::loadu(data2 + d, size - d);
- data_vec = map_fun(data_vec, data2_vec);
- acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
- }
- return vec_reduce_all(red_fun, acc_vec);
- }
- template <typename scalar_t, typename MapOp, typename ReduceOp>
- inline scalar_t map3_reduce_all(
- const MapOp& map_fun,
- const ReduceOp& red_fun,
- const scalar_t* data,
- const scalar_t* data2,
- const scalar_t* data3,
- int64_t size) {
- using Vec = vec::Vectorized<scalar_t>;
- if (size < Vec::size()) {
- Vec data_vec = Vec::loadu(data, size);
- Vec data2_vec = Vec::loadu(data2, size);
- Vec data3_vec = Vec::loadu(data3, size);
- data_vec = map_fun(data_vec, data2_vec, data3_vec);
- return vec_reduce_all(red_fun, data_vec, size);
- }
- int64_t d = Vec::size();
- Vec acc_vec = map_fun(Vec::loadu(data), Vec::loadu(data2), Vec::loadu(data3));
- for (; d < size - (size % Vec::size()); d += Vec::size()) {
- Vec data_vec = Vec::loadu(data + d);
- Vec data2_vec = Vec::loadu(data2 + d);
- Vec data3_vec = Vec::loadu(data3 + d);
- data_vec = map_fun(data_vec, data2_vec, data3_vec);
- acc_vec = red_fun(acc_vec, data_vec);
- }
- if (size - d > 0) {
- Vec data_vec = Vec::loadu(data + d, size - d);
- Vec data2_vec = Vec::loadu(data2 + d, size - d);
- Vec data3_vec = Vec::loadu(data3 + d, size - d);
- data_vec = map_fun(data_vec, data2_vec, data3_vec);
- acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
- }
- return vec_reduce_all(red_fun, acc_vec);
- }
- template <typename scalar_t, typename Op>
- inline void map(
- const Op& vec_fun,
- scalar_t* output_data,
- const scalar_t* input_data,
- int64_t size) {
- using Vec = vec::Vectorized<scalar_t>;
- int64_t d = 0;
- for (; d < size - (size % Vec::size()); d += Vec::size()) {
- Vec output_vec = vec_fun(Vec::loadu(input_data + d));
- output_vec.store(output_data + d);
- }
- if (size - d > 0) {
- Vec output_vec = vec_fun(Vec::loadu(input_data + d, size - d));
- output_vec.store(output_data + d, size - d);
- }
- }
- template <typename scalar_t, typename Op>
- inline void map2(
- const Op& vec_fun,
- scalar_t* output_data,
- const scalar_t* input_data,
- const scalar_t* input_data2,
- int64_t size) {
- using Vec = vec::Vectorized<scalar_t>;
- int64_t d = 0;
- for (; d < size - (size % Vec::size()); d += Vec::size()) {
- Vec data_vec = Vec::loadu(input_data + d);
- Vec data_vec2 = Vec::loadu(input_data2 + d);
- Vec output_vec = vec_fun(data_vec, data_vec2);
- output_vec.store(output_data + d);
- }
- if (size - d > 0) {
- Vec data_vec = Vec::loadu(input_data + d, size - d);
- Vec data_vec2 = Vec::loadu(input_data2 + d, size - d);
- Vec output_vec = vec_fun(data_vec, data_vec2);
- output_vec.store(output_data + d, size - d);
- }
- }
- template <typename scalar_t, typename Op>
- inline void map3(
- const Op& vec_fun,
- scalar_t* output_data,
- const scalar_t* input_data1,
- const scalar_t* input_data2,
- const scalar_t* input_data3,
- int64_t size) {
- using Vec = vec::Vectorized<scalar_t>;
- int64_t d = 0;
- for (; d < size - (size % Vec::size()); d += Vec::size()) {
- Vec data_vec1 = Vec::loadu(input_data1 + d);
- Vec data_vec2 = Vec::loadu(input_data2 + d);
- Vec data_vec3 = Vec::loadu(input_data3 + d);
- Vec output_vec = vec_fun(data_vec1, data_vec2, data_vec3);
- output_vec.store(output_data + d);
- }
- if (size - d > 0) {
- Vec data_vec1 = Vec::loadu(input_data1 + d, size - d);
- Vec data_vec2 = Vec::loadu(input_data2 + d, size - d);
- Vec data_vec3 = Vec::loadu(input_data3 + d, size - d);
- Vec output_vec = vec_fun(data_vec1, data_vec2, data_vec3);
- output_vec.store(output_data + d, size - d);
- }
- }
- template <typename scalar_t, typename Op>
- inline void map4(
- const Op& vec_fun,
- scalar_t* output_data,
- const scalar_t* input_data1,
- const scalar_t* input_data2,
- const scalar_t* input_data3,
- const scalar_t* input_data4,
- int64_t size) {
- using Vec = vec::Vectorized<scalar_t>;
- int64_t d = 0;
- for (; d < size - (size % Vec::size()); d += Vec::size()) {
- Vec data_vec1 = Vec::loadu(input_data1 + d);
- Vec data_vec2 = Vec::loadu(input_data2 + d);
- Vec data_vec3 = Vec::loadu(input_data3 + d);
- Vec data_vec4 = Vec::loadu(input_data4 + d);
- Vec output_vec = vec_fun(data_vec1, data_vec2, data_vec3, data_vec4);
- output_vec.store(output_data + d);
- }
- if (size - d > 0) {
- Vec data_vec1 = Vec::loadu(input_data1 + d, size - d);
- Vec data_vec2 = Vec::loadu(input_data2 + d, size - d);
- Vec data_vec3 = Vec::loadu(input_data3 + d, size - d);
- Vec data_vec4 = Vec::loadu(input_data4 + d, size - d);
- Vec output_vec = vec_fun(data_vec1, data_vec2, data_vec3, data_vec4);
- output_vec.store(output_data + d, size - d);
- }
- }
- }} // namespace at::vec
|