Loops.h

#pragma once

// This file provides two functions to help write elementwise kernels:
//
//   cpu_kernel(TensorIterator iter, <lambda>)
//   cpu_kernel_vec(TensorIterator iter, <lambda>, <vec_lambda>)
//
// Both functions may generate vectorized code. The cpu_kernel implementation
// relies on the compiler's auto-vectorization. The cpu_kernel_vec
// implementation uses x86 SIMD intrinsics when available. These functions
// are only intended to be used in the ATen/native/cpu subdirectory, since files
// in other directories are not compiled with AVX/AVX2 enabled. See README.md
// for more details.
//
// For example, to write a multiplication kernel for float:
//
//   cpu_kernel(iter, [](float a, float b) { return a * b; });
//
// Or you may write:
//
//   cpu_kernel_vec(iter,
//     [](float a, float b) { return a * b; },
//     [](Vectorized<float> a, Vectorized<float> b) { return a * b; });
//
// See BinaryOpsKernel.cpp for the complete implementation.
//
#include <stdint.h>
#include <c10/util/C++17.h>
#include <c10/util/Load.h>
#include <c10/util/irange.h>
#include <ATen/detail/FunctionTraits.h>
#include <ATen/native/cpu/IsContiguous.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/TensorIteratorDynamicCasting.h>
#include <ATen/cpu/vec/vec.h>

#include <utility>

namespace at { namespace native { inline namespace CPU_CAPABILITY {

using namespace vec;
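
// `dereference` loads the `arity` arguments of an elementwise op from the raw
// byte pointers in `data[]`, using the per-argument byte `strides`, and packs
// them into the op's argument tuple for element index `i`.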
template <typename traits, std::size_t... INDEX>
typename traits::ArgsTuple
dereference_impl(char* C10_RESTRICT data[], const int64_t* strides, int64_t i,
                 std::index_sequence<INDEX...>) {
  return std::make_tuple(
      c10::load<typename traits::template arg<INDEX>::type>(
          data[INDEX] + i * strides[INDEX])...);
}

template <typename traits>
typename traits::ArgsTuple
dereference(char* C10_RESTRICT data[], const int64_t* strides, int64_t i) {
  using Indices = std::make_index_sequence<traits::arity>;
  return dereference_impl<traits>(data, strides, i, Indices{});
}
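
// `dereference_vec` is the vectorized counterpart: it loads one Vectorized
// value per argument starting at element `i`. If `S` is non-zero, argument
// `S - 1` is not loaded from memory; the pre-broadcast scalar vector
// `opt_scalar` is used instead.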
template <typename traits, std::size_t... INDEX>
typename traits::ArgsTuple
dereference_vec_impl(char* C10_RESTRICT data[],
                     const typename traits::result_type& opt_scalar,
                     size_t S,
                     int64_t i,
                     std::index_sequence<INDEX...>) {
  using Vec = typename traits::result_type;
  using scalar_t = typename Vec::value_type;
  return std::make_tuple(
      S == INDEX + 1 ?
      opt_scalar :
      Vec::loadu(data[INDEX] + i * sizeof(scalar_t))...);
}

template <typename traits>
typename traits::ArgsTuple
dereference_vec(char* C10_RESTRICT data[], const typename traits::result_type& opt_scalar, size_t S, int64_t i) {
  using Indices = std::make_index_sequence<traits::arity>;
  return dereference_vec_impl<traits>(data, opt_scalar, S, i, Indices{});
}
template <typename func_t,
    typename std::enable_if<!std::is_void<typename function_traits<func_t>::result_type>::value>::type* = nullptr>
static inline void
execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) {
  using traits = function_traits<func_t>;
  using result_type = typename traits::result_type;
  for (; i < n; i++) {
    result_type* out_ptr = (result_type*)(data[0] + i * strides[0]);
    *out_ptr = c10::guts::apply(std::forward<func_t>(op), dereference<traits>(
        &data[1],
        &strides[1],
        i));
  }
}

template <typename func_t,
    typename std::enable_if<std::is_void<typename function_traits<func_t>::result_type>::value>::type* = nullptr>
static inline void
execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) {
  using traits = function_traits<func_t>;
  for (; i < n; i++) {
    c10::guts::apply(std::forward<func_t>(op), dereference<traits>(
        &data[0],
        &strides[0],
        i));
  }
}
// Basic loop operation (one output, N inputs). May be auto-vectorized
// by the compiler. Supports inputs and outputs of different types.
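// The expected layout is data[0] = output and data[1..N] = inputs, with
// strides[k] holding the byte stride of tensor k. For example (illustrative
// sketch; in practice the pointers and strides come from TensorIterator):
//
//   // char* data[3]; int64_t strides[3]; filled in by the caller
//   basic_loop(data, strides, 0, n, [](float a, float b) { return a + b; });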
template <typename func_t>
static inline void
basic_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t&& op) {
  using traits = function_traits<func_t>;
  constexpr int ntensors = traits::arity + 1;

  // Copying strides to temporary array helps auto vectorization in older GCC
  // versions.
  int64_t strides[ntensors];
  for (const auto arg : c10::irange(ntensors)) {
    strides[arg] = strides_[arg];
  }

  execute_op(data, strides, i, n, std::forward<func_t>(op));
}
// The recursive variadic template for iterating over the returned tuple.
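// TupleOutput<T, N> writes tuple element k to data[k] (offset by
// i * strides[k]) for k = 0..N-1, so the k-th tuple member ends up in the
// k-th output tensor.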
template<class T, size_t N>
struct TupleOutput {
  static void handle(char *C10_RESTRICT data[], const int64_t *strides, int64_t i,
                     const T &tuple) {
    TupleOutput<T, N - 1>::handle(data, strides, i, tuple);

    auto output = std::get<N - 1>(tuple);
    using output_type = decltype(output);
    output_type * out_ptr = (output_type *)(data[N - 1] + i * strides[N - 1]);
    *out_ptr = output;
  }
};

// Base case for the above recursive template
template<class T>
struct TupleOutput<T, 1> {
  static void handle(char *C10_RESTRICT data[], const int64_t *strides, int64_t i,
                     const T &tuple) {
    auto output = std::get<0>(tuple);
    using output_type = decltype(output);
    output_type* out_ptr = (output_type *)(data[0] + i * strides[0]);
    *out_ptr = output;
  }
};

template<class... Args>
void handle_tuple_outputs(char* C10_RESTRICT data[],
                          const int64_t* strides,
                          int64_t i,
                          const std::tuple<Args...> &tuple) {
  TupleOutput<decltype(tuple), sizeof...(Args)>::handle(data, strides, i, tuple);
}
// Loop operation for `cpu_kernel_multiple_outputs`.
// 1. Use `c10::guts::apply` to invoke the lambda passed to
//    `cpu_kernel_multiple_outputs` with the dereferenced inputs.
// 2. Iterate over the members of the returned tuple and write each one to the
//    corresponding output tensor via `handle_tuple_outputs`.
template <typename func_t>
static inline void
multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t&& op) {
  using traits = function_traits<func_t>;

  using result_type = typename traits::result_type;
  constexpr int num_outputs = std::tuple_size<result_type>::value;
  constexpr int ntensors = traits::arity + num_outputs;

  // Copying strides to temporary array helps auto vectorization in older GCC
  // versions.
  int64_t strides[ntensors];
  for (const auto arg : c10::irange(ntensors)) {
    strides[arg] = strides_[arg];
  }

  for (; i < n; i++) {
    auto output = c10::guts::apply(op, dereference<traits>(
        &data[num_outputs],
        &strides[num_outputs],
        i));
    handle_tuple_outputs(data, strides, i, output);
  }
}
// Explicitly vectorized loop implementation. All inputs and outputs must be
// the same type and contiguous with one exception: a single input may be
// a scalar (stride 0). Its position is indicated by the argument `S`. If `S`
// is 0, then there are no scalar inputs.
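// The main loop below processes two Vec::size()-wide vectors per iteration;
// any remaining tail elements are handled by falling back to `basic_loop`
// with element-sized (or zero, for the scalar argument) strides.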
template <typename func_t, typename vec_func_t>
static inline void
vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, vec_func_t&& vop) {
  using traits = function_traits<vec_func_t>;
  using scalar_t = typename function_traits<func_t>::result_type;
  using Vec = Vectorized<scalar_t>;
  constexpr int ntensors = traits::arity + 1;

  char* C10_RESTRICT data[ntensors];
  for (const auto arg : c10::irange(ntensors)) {
    data[arg] = data_[arg];
  }

  Vec opt_scalar = Vec(S > 0 ? *(scalar_t*)data[S] : scalar_t(0));
  int64_t i = 0;
  for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
    auto args1 = dereference_vec<traits>(&data[1], opt_scalar, S, i);
    auto args2 = dereference_vec<traits>(&data[1], opt_scalar, S, i + Vec::size());
    auto out1 = c10::guts::apply(std::forward<vec_func_t>(vop), std::move(args1));
    auto out2 = c10::guts::apply(std::forward<vec_func_t>(vop), std::move(args2));
    out1.store(data[0] + i * sizeof(scalar_t));
    out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t));
  }
  if (i < n) {
    int64_t strides[ntensors];
    for (const auto arg : c10::irange(ntensors)) {
      strides[arg] = (S > 0 && arg == S) ? 0 : sizeof(scalar_t);
    }
    basic_loop(data, strides, i, n, std::forward<func_t>(op));
  }
}
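
// Walks the argument indices and invokes `cb` with the 1-based index of the
// first argument whose strides match the "contiguous except for one scalar
// (stride-0) input" pattern (see is_contiguous_scalar), or with 0 if no
// argument matches.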
template <typename traits, typename cb_t>
static inline void unroll_contiguous_scalar_checks(
    const int64_t* /*strides*/,
    std::index_sequence<>,
    cb_t&& cb) {
  cb(0);
}

template <typename traits, typename cb_t, size_t INDEX0, size_t ...INDEX>
static inline void unroll_contiguous_scalar_checks(
    const int64_t* strides,
    std::index_sequence<INDEX0, INDEX...>,
    cb_t&& cb) {
  if (is_contiguous_scalar<traits, INDEX0 + 1>(strides)) {
    cb(INDEX0 + 1);
  } else {
    unroll_contiguous_scalar_checks<traits>(strides, std::index_sequence<INDEX...>{}, std::forward<cb_t>(cb));
  }
}
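
// 2-D loop functor passed to TensorIterator::for_each. Each call handles a
// (size0 x size1) tile: the inner dimension uses `vectorized_loop` when the
// inner strides are contiguous (optionally with one stride-0 scalar input),
// and falls back to `basic_loop` otherwise; `outer_strides` advance the data
// pointers between inner rows.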
template <typename op_t, typename vop_t>
struct VectorizedLoop2d {
  op_t op;
  vop_t vop;

  using traits = function_traits<op_t>;
  static constexpr int ntensors = traits::arity + 1;
  using data_t = std::array<char*, ntensors>;

  VectorizedLoop2d(const op_t &op, vop_t vop):
    op(op), vop(std::move(vop)) {}

  static void advance(data_t &data, const int64_t *outer_strides) {
    for (const auto arg : c10::irange(data.size())) {
      data[arg] += outer_strides[arg];
    }
  }

  void operator()(char** base, const int64_t *strides, int64_t size0, int64_t size1) {
    data_t data;
    std::copy_n(base, ntensors, data.data());
    const int64_t *outer_strides = &strides[ntensors];

    if (is_contiguous<traits>(strides)) {
      for (const auto i C10_UNUSED : c10::irange(size1)) {
        vectorized_loop(data.data(), size0, 0, op, vop);
        advance(data, outer_strides);
      }
    } else {
      using Indices = std::make_index_sequence<traits::arity>;
      unroll_contiguous_scalar_checks<traits>(strides, Indices{}, [&](size_t idx) {
        if (idx) {
          for (const auto i C10_UNUSED : c10::irange(size1)) {
            vectorized_loop(data.data(), size0, idx, op, vop);
            advance(data, outer_strides);
          }
        } else {
          for (const auto i C10_UNUSED : c10::irange(size1)) {
            basic_loop(data.data(), strides, 0, size0, op);
            advance(data, outer_strides);
          }
        }
      });
    }
  }
};

template <typename op_t, typename vop_t>
VectorizedLoop2d<op_t, vop_t> make_vectorized_loop2d(
    const op_t &op, const vop_t &vop) {
  return VectorizedLoop2d<op_t, vop_t>(op, vop);
}
template <typename func_t>
void cpu_kernel(TensorIteratorBase& iter, func_t&& op, int64_t grain_size = at::internal::GRAIN_SIZE) {
  using traits = function_traits<func_t>;
  // this could be extended to work with void return types
  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
  // dynamic casting not currently supported on CPU
  TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));

  iter.for_each([&](char** data, const int64_t* strides, int64_t n) {
    // basic_loop can handle 1d slices with arbitrary strides, and 1d slices
    // are all that iter.for_each ever sends to the loop lambda
    basic_loop(data, strides, 0, n, std::forward<func_t>(op));
  }, grain_size);
  iter.cast_outputs();
}
// This function helps write elementwise kernels that require multiple outputs.
// It follows a structure similar to cpu_kernel.
// Instead of the `basic_loop` function, a `multiple_outputs_loop` function is
// used to handle the multiple return values.
// For now the `needs_dynamic_casting` check is not added, because the lambda
// (`func_t`) passed to `multiple_outputs_loop` returns a `std::tuple` instead
// of a `scalar_t`. `gpu_kernel_multiple_outputs` is also implemented without
// this check. We could extend `needs_dynamic_casting` to support both
// `std::tuple` and `thrust::tuple` in the future.
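//
// For example (illustrative sketch only, in the spirit of the examples at the
// top of this file), a kernel producing both the sum and the difference of
// two float inputs could be written as:
//
//   cpu_kernel_multiple_outputs(iter,
//     [](float a, float b) { return std::tuple<float, float>(a + b, a - b); });
//
// where `iter` has been configured with two outputs followed by two inputs;
// tuple element k is written to output k.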
template <typename func_t>
void cpu_kernel_multiple_outputs(TensorIteratorBase& iter, func_t&& op, int64_t grain_size = at::internal::GRAIN_SIZE) {
  using traits = function_traits<func_t>;
  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);

  iter.for_each([&](char** data, const int64_t* strides, int64_t n) {
    multiple_outputs_loop(data, strides, 0, n, std::forward<func_t>(op));
  }, grain_size);
  iter.cast_outputs();
}
template <bool check_dynamic_cast=true, typename func_t, typename vec_func_t>
void cpu_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, int64_t grain_size = at::internal::GRAIN_SIZE) {
  using traits = function_traits<func_t>;
  // this could be extended to work with void return types
  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
  // dynamic casting not currently supported on CPU, but some kernels (like Fill)
  // explicitly dynamic_cast, so we provide an opt-out of the check
  c10::guts::if_constexpr<check_dynamic_cast>([&] {
    TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
  });

  iter.for_each(make_vectorized_loop2d(op, vop), grain_size);
  iter.cast_outputs();
}
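
// Serial variants: these run on the calling thread only (via serial_for_each)
// over the given index Range; the overloads without a Range default to the
// full {0, iter.numel()} range.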
template <typename func_t>
void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op, const Range& range) {
  using traits = function_traits<func_t>;
  constexpr bool result_void = std::is_void<typename traits::result_type>::value;
  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity &&
                        ((result_void && iter.noutputs() == 0) || (!result_void && iter.noutputs() == 1)));
  // dynamic casting not currently supported on CPU
  TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));

  iter.serial_for_each([&](char** data, const int64_t* strides, int64_t n) {
    basic_loop(data, strides, 0, n, std::forward<func_t>(op));
  }, range);
  iter.cast_outputs();
}

template <typename func_t>
void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op) {
  cpu_serial_kernel(iter, op, {0, iter.numel()});
}
template <typename func_t, typename vec_func_t>
void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, const Range& range) {
  using traits = function_traits<func_t>;
  // this could be extended to work with void return types
  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
  // dynamic casting not currently supported on CPU
  TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));

  iter.serial_for_each(make_vectorized_loop2d(op, vop), range);
  iter.cast_outputs();
}

template <typename func_t, typename vec_func_t>
void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop) {
  cpu_serial_kernel_vec(iter, op, vop, {0, iter.numel()});
}

}}} // namespace at::native::<anonymous>