functional_bfloat16.h 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500
  1. #pragma once
  2. // DO NOT DEFINE STATIC DATA IN THIS HEADER!
  3. // See Note [Do not compile initializers with AVX]
  4. #include <ATen/cpu/vec/vec.h>
  5. namespace at { namespace vec {
  6. // BFloat16 specification
  7. template <typename scalar_t> struct VecScalarType { using type = scalar_t; };
  8. template <> struct VecScalarType<BFloat16> { using type = float; };
  9. // This is different from at::acc_type since we only need to specialize BFloat16
  10. template <typename scalar_t>
  11. using vec_scalar_t = typename VecScalarType<scalar_t>::type;
  12. // Note that we already have specialized member of Vectorized<scalar_t> for BFloat16
  13. // so the following functions would run smoothly:
  14. // using Vec = Vectorized<BFloat16>;
  15. // Vec one = Vec(BFloat16(1));
  16. // vec::map([](Vec x) { return one / (one + x.exp()); }, y_ptr, x_ptr, N);
  17. //
  18. // Then why we still need to specialize "funtional"?
  19. // If we do specialization at Vectorized<> level, the above example would need 3 pairs of
  20. // conversion of bf16->fp32/fp32->bf16, each for ".exp()", "+" and "/".
  21. // If we do specialization at vec::map<>() level, we have only 1 pair of conversion
  22. // of bf16->fp32/fp32->bf16, for the input and output BFloat16 vector only.
  23. //
  24. // The following BFloat16 functionality will only do data type conversion for input
  25. // and output vector (reduce functionality will only convert the final scalar back to bf16).
  26. // Compared to Vectorized<> specialization,
  27. // 1. better performance since we have less data type conversion;
  28. // 2. less rounding error since immediate results are kept in fp32;
  29. // 3. accumulation done on data type of fp32.
  30. //
  31. // If you plan to extend this file, please ensure adding unit tests at
  32. // aten/src/ATen/test/vec_test_all_types.cpp
  33. //
  34. template <typename scalar_t = BFloat16, typename Op>
  35. inline BFloat16 reduce_all(const Op& vec_fun, const BFloat16* data, int64_t size) {
  36. using bVec = vec::Vectorized<BFloat16>;
  37. using fVec = vec::Vectorized<float>;
  38. if (size < bVec::size()) {
  39. bVec data_bvec = bVec::loadu(data, size);
  40. fVec data_fvec0, data_fvec1;
  41. std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec);
  42. if (size > fVec::size()) {
  43. data_fvec0 = fVec::set(data_fvec0, vec_fun(data_fvec0, data_fvec1), size - fVec::size());
  44. return vec_reduce_all<float>(vec_fun, data_fvec0, fVec::size());
  45. } else {
  46. return vec_reduce_all<float>(vec_fun, data_fvec0, size);
  47. }
  48. }
  49. int64_t d = bVec::size();
  50. bVec acc_bvec = bVec::loadu(data);
  51. fVec acc_fvec0, acc_fvec1;
  52. std::tie(acc_fvec0, acc_fvec1) = convert_bfloat16_float(acc_bvec);
  53. for (; d < size - (size % bVec::size()); d += bVec::size()) {
  54. bVec data_bvec = bVec::loadu(data + d);
  55. fVec data_fvec0, data_fvec1;
  56. std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec);
  57. acc_fvec0 = vec_fun(acc_fvec0, data_fvec0);
  58. acc_fvec1 = vec_fun(acc_fvec1, data_fvec1);
  59. }
  60. if (size - d > 0) {
  61. bVec data_bvec = bVec::loadu(data + d, size - d);
  62. fVec data_fvec0, data_fvec1;
  63. std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec);
  64. if (size - d > fVec::size()) {
  65. acc_fvec0 = vec_fun(acc_fvec0, data_fvec0);
  66. acc_fvec1 = fVec::set(acc_fvec1, vec_fun(acc_fvec1, data_fvec1), size - d - fVec::size());
  67. } else {
  68. acc_fvec0 = fVec::set(acc_fvec0, vec_fun(acc_fvec0, data_fvec0), size - d);
  69. }
  70. }
  71. acc_fvec0 = vec_fun(acc_fvec0, acc_fvec1);
  72. return vec_reduce_all<float>(vec_fun, acc_fvec0);
  73. }
  74. template <typename scalar_t = BFloat16, typename Op1, typename Op2>
  75. inline std::pair<BFloat16, BFloat16> reduce2_all(const Op1& vec_fun1, const Op2& vec_fun2,
  76. const BFloat16* data, int64_t size) {
  77. using bVec = vec::Vectorized<BFloat16>;
  78. using fVec = vec::Vectorized<float>;
  79. if (size < bVec::size()) {
  80. bVec data_bvec = bVec::loadu(data, size);
  81. fVec data_fvec0, data_fvec1;
  82. std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec);
  83. if (size > fVec::size()) {
  84. fVec acc1_fvec = fVec::set(data_fvec0, vec_fun1(data_fvec0, data_fvec1), size - fVec::size());
  85. fVec acc2_fvec = fVec::set(data_fvec0, vec_fun2(data_fvec0, data_fvec1), size - fVec::size());
  86. return std::pair<BFloat16, BFloat16>(
  87. vec_reduce_all<float>(vec_fun1, acc1_fvec, fVec::size()),
  88. vec_reduce_all<float>(vec_fun2, acc2_fvec, fVec::size()));
  89. } else {
  90. return std::pair<BFloat16, BFloat16>(
  91. vec_reduce_all<float>(vec_fun1, data_fvec0, size),
  92. vec_reduce_all<float>(vec_fun2, data_fvec0, size));
  93. }
  94. }
  95. int64_t d = bVec::size();
  96. bVec acc_bvec = bVec::loadu(data);
  97. fVec acc1_fvec0, acc1_fvec1;
  98. std::tie(acc1_fvec0, acc1_fvec1) = convert_bfloat16_float(acc_bvec);
  99. fVec acc2_fvec0, acc2_fvec1;
  100. std::tie(acc2_fvec0, acc2_fvec1) = convert_bfloat16_float(acc_bvec);
  101. for (; d < size - (size % bVec::size()); d += bVec::size()) {
  102. bVec data_bvec = bVec::loadu(data + d);
  103. fVec data_fvec0, data_fvec1;
  104. std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec);
  105. acc1_fvec0 = vec_fun1(acc1_fvec0, data_fvec0);
  106. acc1_fvec1 = vec_fun1(acc1_fvec1, data_fvec1);
  107. acc2_fvec0 = vec_fun2(acc2_fvec0, data_fvec0);
  108. acc2_fvec1 = vec_fun2(acc2_fvec1, data_fvec1);
  109. }
  110. if (size - d > 0) {
  111. bVec data_bvec = bVec::loadu(data + d, size - d);
  112. fVec data_fvec0, data_fvec1;
  113. std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec);
  114. if (size - d > fVec::size()) {
  115. acc1_fvec0 = vec_fun1(acc1_fvec0, data_fvec0);
  116. acc1_fvec1 = fVec::set(acc1_fvec1, vec_fun1(acc1_fvec1, data_fvec1), size - d - fVec::size());
  117. acc2_fvec0 = vec_fun2(acc2_fvec0, data_fvec0);
  118. acc2_fvec1 = fVec::set(acc2_fvec1, vec_fun2(acc2_fvec1, data_fvec1), size - d - fVec::size());
  119. } else {
  120. acc1_fvec0 = fVec::set(acc1_fvec0, vec_fun1(acc1_fvec0, data_fvec0), size - d);
  121. acc2_fvec0 = fVec::set(acc2_fvec0, vec_fun2(acc2_fvec0, data_fvec0), size - d);
  122. }
  123. }
  124. acc1_fvec0 = vec_fun1(acc1_fvec0, acc1_fvec1);
  125. acc2_fvec0 = vec_fun2(acc2_fvec0, acc2_fvec1);
  126. return std::pair<BFloat16, BFloat16>(
  127. vec_reduce_all<float>(vec_fun1, acc1_fvec0),
  128. vec_reduce_all<float>(vec_fun2, acc2_fvec0));
  129. }
  130. template <typename scalar_t = BFloat16, typename MapOp, typename ReduceOp>
  131. inline BFloat16 map_reduce_all(
  132. const MapOp& map_fun,
  133. const ReduceOp& red_fun,
  134. const BFloat16* data,
  135. int64_t size) {
  136. using bVec = vec::Vectorized<BFloat16>;
  137. using fVec = vec::Vectorized<float>;
  138. if (size < bVec::size()) {
  139. bVec data_bvec = bVec::loadu(data, size);
  140. fVec data_fvec0, data_fvec1;
  141. std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec);
  142. if (size > fVec::size()) {
  143. data_fvec0 = map_fun(data_fvec0);
  144. data_fvec1 = map_fun(data_fvec1);
  145. data_fvec0 = fVec::set(data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size());
  146. return vec_reduce_all<float>(red_fun, data_fvec0, fVec::size());
  147. } else {
  148. data_fvec0 = map_fun(data_fvec0);
  149. return vec_reduce_all<float>(red_fun, data_fvec0, size);
  150. }
  151. }
  152. int64_t d = bVec::size();
  153. bVec acc_bvec = bVec::loadu(data);
  154. fVec acc_fvec0, acc_fvec1;
  155. std::tie(acc_fvec0, acc_fvec1) = convert_bfloat16_float(acc_bvec);
  156. acc_fvec0 = map_fun(acc_fvec0);
  157. acc_fvec1 = map_fun(acc_fvec1);
  158. for (; d < size - (size % bVec::size()); d += bVec::size()) {
  159. bVec data_bvec = bVec::loadu(data + d);
  160. fVec data_fvec0, data_fvec1;
  161. std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec);
  162. data_fvec0 = map_fun(data_fvec0);
  163. data_fvec1 = map_fun(data_fvec1);
  164. acc_fvec0 = red_fun(acc_fvec0, data_fvec0);
  165. acc_fvec1 = red_fun(acc_fvec1, data_fvec1);
  166. }
  167. if (size - d > 0) {
  168. bVec data_bvec = bVec::loadu(data + d, size - d);
  169. fVec data_fvec0, data_fvec1;
  170. std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec);
  171. if (size - d > fVec::size()) {
  172. data_fvec0 = map_fun(data_fvec0);
  173. data_fvec1 = map_fun(data_fvec1);
  174. acc_fvec0 = red_fun(acc_fvec0, data_fvec0);
  175. acc_fvec1 = fVec::set(acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size());
  176. } else {
  177. data_fvec0 = map_fun(data_fvec0);
  178. acc_fvec0 = fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d);
  179. }
  180. }
  181. acc_fvec0 = red_fun(acc_fvec0, acc_fvec1);
  182. return vec_reduce_all<float>(red_fun, acc_fvec0);
  183. }
  184. template <typename scalar_t = BFloat16, typename MapOp, typename ReduceOp>
  185. inline BFloat16 map2_reduce_all(
  186. const MapOp& map_fun,
  187. const ReduceOp& red_fun,
  188. const BFloat16* data,
  189. const BFloat16* data2,
  190. int64_t size) {
  191. using bVec = vec::Vectorized<BFloat16>;
  192. using fVec = vec::Vectorized<float>;
  193. if (size < bVec::size()) {
  194. bVec data_bvec = bVec::loadu(data, size);
  195. fVec data_fvec0, data_fvec1;
  196. std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec);
  197. bVec data2_bvec = bVec::loadu(data2, size);
  198. fVec data2_fvec0, data2_fvec1;
  199. std::tie(data2_fvec0, data2_fvec1) = convert_bfloat16_float(data2_bvec);
  200. if (size > fVec::size()) {
  201. data_fvec0 = map_fun(data_fvec0, data2_fvec0);
  202. data_fvec1 = map_fun(data_fvec1, data2_fvec1);
  203. data_fvec0 = fVec::set(data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size());
  204. return vec_reduce_all<float>(red_fun, data_fvec0, fVec::size());
  205. } else {
  206. data_fvec0 = map_fun(data_fvec0, data2_fvec0);
  207. return vec_reduce_all<float>(red_fun, data_fvec0, size);
  208. }
  209. }
  210. int64_t d = bVec::size();
  211. bVec acc_bvec = bVec::loadu(data);
  212. fVec acc_fvec0, acc_fvec1;
  213. std::tie(acc_fvec0, acc_fvec1) = convert_bfloat16_float(acc_bvec);
  214. bVec acc2_bvec = bVec::loadu(data2);
  215. fVec acc2_fvec0, acc2_fvec1;
  216. std::tie(acc2_fvec0, acc2_fvec1) = convert_bfloat16_float(acc2_bvec);
  217. acc_fvec0 = map_fun(acc_fvec0, acc2_fvec0);
  218. acc_fvec1 = map_fun(acc_fvec1, acc2_fvec1);
  219. for (; d < size - (size % bVec::size()); d += bVec::size()) {
  220. bVec data_bvec = bVec::loadu(data + d);
  221. fVec data_fvec0, data_fvec1;
  222. std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec);
  223. bVec data2_bvec = bVec::loadu(data2 + d);
  224. fVec data2_fvec0, data2_fvec1;
  225. std::tie(data2_fvec0, data2_fvec1) = convert_bfloat16_float(data2_bvec);
  226. data_fvec0 = map_fun(data_fvec0, data2_fvec0);
  227. data_fvec1 = map_fun(data_fvec1, data2_fvec1);
  228. acc_fvec0 = red_fun(acc_fvec0, data_fvec0);
  229. acc_fvec1 = red_fun(acc_fvec1, data_fvec1);
  230. }
  231. if (size - d > 0) {
  232. bVec data_bvec = bVec::loadu(data + d, size - d);
  233. fVec data_fvec0, data_fvec1;
  234. std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec);
  235. bVec data2_bvec = bVec::loadu(data2 + d, size - d);
  236. fVec data2_fvec0, data2_fvec1;
  237. std::tie(data2_fvec0, data2_fvec1) = convert_bfloat16_float(data2_bvec);
  238. if (size - d > fVec::size()) {
  239. data_fvec0 = map_fun(data_fvec0, data2_fvec0);
  240. data_fvec1 = map_fun(data_fvec1, data2_fvec1);
  241. acc_fvec0 = red_fun(acc_fvec0, data_fvec0);
  242. acc_fvec1 = fVec::set(acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size());
  243. } else {
  244. data_fvec0 = map_fun(data_fvec0, data2_fvec0);
  245. acc_fvec0 = fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d);
  246. }
  247. }
  248. acc_fvec0 = red_fun(acc_fvec0, acc_fvec1);
  249. return vec_reduce_all<float>(red_fun, acc_fvec0);
  250. }
  251. template <typename scalar_t = BFloat16, typename MapOp, typename ReduceOp>
  252. inline BFloat16 map3_reduce_all(
  253. const MapOp& map_fun,
  254. const ReduceOp& red_fun,
  255. const BFloat16* data,
  256. const BFloat16* data2,
  257. const BFloat16* data3,
  258. int64_t size) {
  259. using bVec = vec::Vectorized<BFloat16>;
  260. using fVec = vec::Vectorized<float>;
  261. if (size < bVec::size()) {
  262. bVec data_bvec = bVec::loadu(data, size);
  263. fVec data_fvec0, data_fvec1;
  264. std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec);
  265. bVec data2_bvec = bVec::loadu(data2, size);
  266. fVec data2_fvec0, data2_fvec1;
  267. std::tie(data2_fvec0, data2_fvec1) = convert_bfloat16_float(data2_bvec);
  268. bVec data3_bvec = bVec::loadu(data3, size);
  269. fVec data3_fvec0, data3_fvec1;
  270. std::tie(data3_fvec0, data3_fvec1) = convert_bfloat16_float(data3_bvec);
  271. if (size > fVec::size()) {
  272. data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0);
  273. data_fvec1 = map_fun(data_fvec1, data2_fvec1, data3_fvec1);
  274. data_fvec0 = fVec::set(data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size());
  275. return vec_reduce_all<float>(red_fun, data_fvec0, fVec::size());
  276. } else {
  277. data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0);
  278. return vec_reduce_all<float>(red_fun, data_fvec0, size);
  279. }
  280. }
  281. int64_t d = bVec::size();
  282. bVec acc_bvec = bVec::loadu(data);
  283. fVec acc_fvec0, acc_fvec1;
  284. std::tie(acc_fvec0, acc_fvec1) = convert_bfloat16_float(acc_bvec);
  285. bVec acc2_bvec = bVec::loadu(data2);
  286. fVec acc2_fvec0, acc2_fvec1;
  287. std::tie(acc2_fvec0, acc2_fvec1) = convert_bfloat16_float(acc2_bvec);
  288. bVec acc3_bvec = bVec::loadu(data3);
  289. fVec acc3_fvec0, acc3_fvec1;
  290. std::tie(acc3_fvec0, acc3_fvec1) = convert_bfloat16_float(acc3_bvec);
  291. acc_fvec0 = map_fun(acc_fvec0, acc2_fvec0, acc3_fvec0);
  292. acc_fvec1 = map_fun(acc_fvec1, acc2_fvec1, acc3_fvec1);
  293. for (; d < size - (size % bVec::size()); d += bVec::size()) {
  294. bVec data_bvec = bVec::loadu(data + d);
  295. fVec data_fvec0, data_fvec1;
  296. std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec);
  297. bVec data2_bvec = bVec::loadu(data2 + d);
  298. fVec data2_fvec0, data2_fvec1;
  299. std::tie(data2_fvec0, data2_fvec1) = convert_bfloat16_float(data2_bvec);
  300. bVec data3_bvec = bVec::loadu(data3 + d);
  301. fVec data3_fvec0, data3_fvec1;
  302. std::tie(data3_fvec0, data3_fvec1) = convert_bfloat16_float(data3_bvec);
  303. data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0);
  304. data_fvec1 = map_fun(data_fvec1, data2_fvec1, data3_fvec1);
  305. acc_fvec0 = red_fun(acc_fvec0, data_fvec0);
  306. acc_fvec1 = red_fun(acc_fvec1, data_fvec1);
  307. }
  308. if (size - d > 0) {
  309. bVec data_bvec = bVec::loadu(data + d, size - d);
  310. fVec data_fvec0, data_fvec1;
  311. std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec);
  312. bVec data2_bvec = bVec::loadu(data2 + d, size - d);
  313. fVec data2_fvec0, data2_fvec1;
  314. std::tie(data2_fvec0, data2_fvec1) = convert_bfloat16_float(data2_bvec);
  315. bVec data3_bvec = bVec::loadu(data3 + d, size - d);
  316. fVec data3_fvec0, data3_fvec1;
  317. std::tie(data3_fvec0, data3_fvec1) = convert_bfloat16_float(data3_bvec);
  318. if (size - d > fVec::size()) {
  319. data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0);
  320. data_fvec1 = map_fun(data_fvec1, data2_fvec1, data3_fvec1);
  321. acc_fvec0 = red_fun(acc_fvec0, data_fvec0);
  322. acc_fvec1 = fVec::set(acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size());
  323. } else {
  324. data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0);
  325. acc_fvec0 = fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d);
  326. }
  327. }
  328. acc_fvec0 = red_fun(acc_fvec0, acc_fvec1);
  329. return vec_reduce_all<float>(red_fun, acc_fvec0);
  330. }
  331. template <typename scalar_t = BFloat16, typename Op>
  332. inline void map(
  333. const Op& vec_fun,
  334. BFloat16* output_data,
  335. const BFloat16* input_data,
  336. int64_t size) {
  337. using bVec = vec::Vectorized<BFloat16>;
  338. using fVec = vec::Vectorized<float>;
  339. int64_t d = 0;
  340. for (; d < size - (size % bVec::size()); d += bVec::size()) {
  341. bVec data_bvec = bVec::loadu(input_data + d);
  342. fVec data_fvec0, data_fvec1;
  343. std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec);
  344. fVec output_fvec0 = vec_fun(data_fvec0);
  345. fVec output_fvec1 = vec_fun(data_fvec1);
  346. bVec output_bvec = convert_float_bfloat16(output_fvec0, output_fvec1);
  347. output_bvec.store(output_data + d);
  348. }
  349. if (size - d > 0) {
  350. bVec data_bvec = bVec::loadu(input_data + d, size - d);
  351. fVec data_fvec0, data_fvec1;
  352. std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec);
  353. fVec output_fvec0 = vec_fun(data_fvec0);
  354. fVec output_fvec1 = vec_fun(data_fvec1);
  355. bVec output_bvec = convert_float_bfloat16(output_fvec0, output_fvec1);
  356. output_bvec.store(output_data + d, size - d);
  357. }
  358. }
  359. template <typename scalar_t = BFloat16, typename Op>
  360. inline void map2(
  361. const Op& vec_fun,
  362. BFloat16* output_data,
  363. const BFloat16* input_data,
  364. const BFloat16* input_data2,
  365. int64_t size) {
  366. using bVec = vec::Vectorized<BFloat16>;
  367. using fVec = vec::Vectorized<float>;
  368. int64_t d = 0;
  369. for (; d < size - (size % bVec::size()); d += bVec::size()) {
  370. bVec data_bvec = bVec::loadu(input_data + d);
  371. fVec data_fvec0, data_fvec1;
  372. std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec);
  373. bVec data2_bvec = bVec::loadu(input_data2 + d);
  374. fVec data2_fvec0, data2_fvec1;
  375. std::tie(data2_fvec0, data2_fvec1) = convert_bfloat16_float(data2_bvec);
  376. fVec output_fvec0 = vec_fun(data_fvec0, data2_fvec0);
  377. fVec output_fvec1 = vec_fun(data_fvec1, data2_fvec1);
  378. bVec output_bvec = convert_float_bfloat16(output_fvec0, output_fvec1);
  379. output_bvec.store(output_data + d);
  380. }
  381. if (size - d > 0) {
  382. bVec data_bvec = bVec::loadu(input_data + d, size - d);
  383. fVec data_fvec0, data_fvec1;
  384. std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec);
  385. bVec data2_bvec = bVec::loadu(input_data2 + d, size - d);
  386. fVec data2_fvec0, data2_fvec1;
  387. std::tie(data2_fvec0, data2_fvec1) = convert_bfloat16_float(data2_bvec);
  388. fVec output_fvec0 = vec_fun(data_fvec0, data2_fvec0);
  389. fVec output_fvec1 = vec_fun(data_fvec1, data2_fvec1);
  390. bVec output_bvec = convert_float_bfloat16(output_fvec0, output_fvec1);
  391. output_bvec.store(output_data + d, size - d);
  392. }
  393. }
  394. template <typename scalar_t = BFloat16, typename Op>
  395. inline void map3(
  396. const Op& vec_fun,
  397. BFloat16* output_data,
  398. const BFloat16* input_data1,
  399. const BFloat16* input_data2,
  400. const BFloat16* input_data3,
  401. int64_t size) {
  402. using bVec = vec::Vectorized<BFloat16>;
  403. using fVec = vec::Vectorized<float>;
  404. int64_t d = 0;
  405. for (; d < size - (size % bVec::size()); d += bVec::size()) {
  406. bVec data1_bvec = bVec::loadu(input_data1 + d);
  407. fVec data1_fvec0, data1_fvec1;
  408. std::tie(data1_fvec0, data1_fvec1) = convert_bfloat16_float(data1_bvec);
  409. bVec data2_bvec = bVec::loadu(input_data2 + d);
  410. fVec data2_fvec0, data2_fvec1;
  411. std::tie(data2_fvec0, data2_fvec1) = convert_bfloat16_float(data2_bvec);
  412. bVec data3_bvec = bVec::loadu(input_data3 + d);
  413. fVec data3_fvec0, data3_fvec1;
  414. std::tie(data3_fvec0, data3_fvec1) = convert_bfloat16_float(data3_bvec);
  415. fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0);
  416. fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1);
  417. bVec output_bvec = convert_float_bfloat16(output_fvec0, output_fvec1);
  418. output_bvec.store(output_data + d);
  419. }
  420. if (size - d > 0) {
  421. bVec data1_bvec = bVec::loadu(input_data1 + d, size - d);
  422. fVec data1_fvec0, data1_fvec1;
  423. std::tie(data1_fvec0, data1_fvec1) = convert_bfloat16_float(data1_bvec);
  424. bVec data2_bvec = bVec::loadu(input_data2 + d, size - d);
  425. fVec data2_fvec0, data2_fvec1;
  426. std::tie(data2_fvec0, data2_fvec1) = convert_bfloat16_float(data2_bvec);
  427. bVec data3_bvec = bVec::loadu(input_data3 + d, size - d);
  428. fVec data3_fvec0, data3_fvec1;
  429. std::tie(data3_fvec0, data3_fvec1) = convert_bfloat16_float(data3_bvec);
  430. fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0);
  431. fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1);
  432. bVec output_bvec = convert_float_bfloat16(output_fvec0, output_fvec1);
  433. output_bvec.store(output_data + d, size - d);
  434. }
  435. }
  436. template <typename scalar_t = BFloat16, typename Op>
  437. inline void map4(
  438. const Op& vec_fun,
  439. BFloat16* output_data,
  440. const BFloat16* input_data1,
  441. const BFloat16* input_data2,
  442. const BFloat16* input_data3,
  443. const BFloat16* input_data4,
  444. int64_t size) {
  445. using bVec = vec::Vectorized<BFloat16>;
  446. using fVec = vec::Vectorized<float>;
  447. int64_t d = 0;
  448. for (; d < size - (size % bVec::size()); d += bVec::size()) {
  449. bVec data1_bvec = bVec::loadu(input_data1 + d);
  450. fVec data1_fvec0, data1_fvec1;
  451. std::tie(data1_fvec0, data1_fvec1) = convert_bfloat16_float(data1_bvec);
  452. bVec data2_bvec = bVec::loadu(input_data2 + d);
  453. fVec data2_fvec0, data2_fvec1;
  454. std::tie(data2_fvec0, data2_fvec1) = convert_bfloat16_float(data2_bvec);
  455. bVec data3_bvec = bVec::loadu(input_data3 + d);
  456. fVec data3_fvec0, data3_fvec1;
  457. std::tie(data3_fvec0, data3_fvec1) = convert_bfloat16_float(data3_bvec);
  458. bVec data4_bvec = bVec::loadu(input_data4 + d);
  459. fVec data4_fvec0, data4_fvec1;
  460. std::tie(data4_fvec0, data4_fvec1) = convert_bfloat16_float(data4_bvec);
  461. fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0, data4_fvec0);
  462. fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1, data4_fvec1);
  463. bVec output_bvec = convert_float_bfloat16(output_fvec0, output_fvec1);
  464. output_bvec.store(output_data + d);
  465. }
  466. if (size - d > 0) {
  467. bVec data1_bvec = bVec::loadu(input_data1 + d, size - d);
  468. fVec data1_fvec0, data1_fvec1;
  469. std::tie(data1_fvec0, data1_fvec1) = convert_bfloat16_float(data1_bvec);
  470. bVec data2_bvec = bVec::loadu(input_data2 + d, size - d);
  471. fVec data2_fvec0, data2_fvec1;
  472. std::tie(data2_fvec0, data2_fvec1) = convert_bfloat16_float(data2_bvec);
  473. bVec data3_bvec = bVec::loadu(input_data3 + d, size - d);
  474. fVec data3_fvec0, data3_fvec1;
  475. std::tie(data3_fvec0, data3_fvec1) = convert_bfloat16_float(data3_bvec);
  476. bVec data4_bvec = bVec::loadu(input_data4 + d, size - d);
  477. fVec data4_fvec0, data4_fvec1;
  478. std::tie(data4_fvec0, data4_fvec1) = convert_bfloat16_float(data4_bvec);
  479. fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0, data4_fvec0);
  480. fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1, data4_fvec1);
  481. bVec output_bvec = convert_float_bfloat16(output_fvec0, output_fvec1);
  482. output_bvec.store(output_data + d, size - d);
  483. }
  484. }
  485. }} // namespace at::vec