/*
The Python Imaging Library (PIL) is
Copyright © 1997-2011 by Secret Labs AB
Copyright © 1995-2011 by Fredrik Lundh
Pillow is the friendly PIL fork. It is
Copyright © 2010-2022 by Alex Clark and contributors
Like PIL, Pillow is licensed under the open source HPND License
*/

// This code is heavily inspired by PILLOW-SIMD's implementation:
// https://github.com/uploadcare/pillow-simd/blob/simd/master/src/libImaging/Resample.c

#pragma once
#ifdef CPU_CAPABILITY_AVX2
// TODO: This file only supports AVX2. We could split the AVX kernels into
// smaller logical blocks in order to port them into the Vec.h logic. This
// would allow us to support other vectorization architectures and perhaps
// also support the non-vectorized fallback (we'd need to make sure it's not
// slower than the current fallback).

#include <ATen/core/Tensor.h>
#include <ATen/cpu/vec/intrinsics.h>
#include <c10/util/irange.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/empty.h>
#endif

namespace {
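// Load the 4 packed uint8 values of one RGBA pixel and zero-extend each of
// them into its own 32-bit lane of an __m128i:
// [r g b a] (4 x uint8) -> [r, g, b, a] (4 x int32).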
static __m128i inline mm_cvtepu8_epi32(const uint32_t* C10_RESTRICT ptr) {
  return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)ptr));
}

// TODO: We may want to hard-code an unrolled version for the case where
// num_channels=3 to hint the compiler to vectorize this (look at the original
// PIL-SIMD code).
at::Tensor unpack_rgb(const at::Tensor& packed_tensor) {
  // Convert a "packed" tensor (typically RGBRGBRGB if channels_last) into
  // RGBARGBARGBA format where A is hard-coded to 0. Each pixel is encoded as
  // 32 bits. This generalizes to num_channels <= 4 and also works for
  // non-channels_last tensors.
  const uint8_t* packed = (const uint8_t*)packed_tensor.data_ptr<uint8_t>();
  auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2);
  auto num_channels = packed_tensor.size(0);

  constexpr int rgba_size = 4;
  auto unpacked_tensor = at::empty({rgba_size, packed_tensor.size(1), packed_tensor.size(2)}, at::CPU(at::kByte));
  uint8_t* unpacked = (uint8_t*) unpacked_tensor.data_ptr<uint8_t>();

  auto stride_i = packed_tensor.stride(2);
  auto stride_j = packed_tensor.stride(0);

  for (const auto i : c10::irange(num_pixels)) {
    for (const auto j : c10::irange(rgba_size)) {
      unpacked[rgba_size * i + j] = (j < num_channels) ? packed[stride_i * i + stride_j * j] : 0;
    }
  }
  return unpacked_tensor;
}
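// For illustration: with num_channels = 3 and a channels_last input,
// unpack_rgb turns a row of two pixels stored as [R0 G0 B0 R1 G1 B1] into
// [R0 G0 B0 0 R1 G1 B1 0], so each pixel occupies exactly one uint32 and the
// AVX kernels below can treat it as a single 32-bit lane. pack_rgb below
// performs the inverse and simply never reads the padding byte.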
void pack_rgb(
    const at::Tensor& unpacked_tensor, // IN
    const at::Tensor& packed_tensor // OUT
) {
  constexpr int rgba_size = 4;
  uint8_t* unpacked = (uint8_t*)unpacked_tensor.data_ptr<uint8_t>();
  uint8_t* packed = (uint8_t*)packed_tensor.data_ptr<uint8_t>();
  auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2);
  auto num_channels = packed_tensor.size(0);

  auto packed_increment = packed_tensor.stride(2);
  auto packed_stride = packed_tensor.stride(0);

  for (const auto i C10_UNUSED : c10::irange(num_pixels)) {
    for (const auto j : c10::irange(num_channels)) {
      packed[j * packed_stride] = unpacked[j];
    }
    unpacked += rgba_size;
    packed += packed_increment;
  }
}
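// Forward declarations of the AVX kernels defined at the bottom of this file.
// The "horizontal" kernels resample along the last (width) dimension and the
// "vertical" kernel resamples along the height dimension; the "4x" variant
// processes 4 rows per call, sharing the coefficient loads across rows.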
void ImagingResampleHorizontalConvolution8u4x(
    uint32_t* C10_RESTRICT lineOut0,
    uint32_t* C10_RESTRICT lineOut1,
    uint32_t* C10_RESTRICT lineOut2,
    uint32_t* C10_RESTRICT lineOut3,
    const uint32_t* C10_RESTRICT lineIn0,
    const uint32_t* C10_RESTRICT lineIn1,
    const uint32_t* C10_RESTRICT lineIn2,
    const uint32_t* C10_RESTRICT lineIn3,
    int xsize,
    int* xbounds,
    int16_t* kk,
    int kmax,
    int coefs_precision);

void ImagingResampleHorizontalConvolution8u(
    uint32_t* C10_RESTRICT lineOut,
    const uint32_t* C10_RESTRICT lineIn,
    int xsize,
    int* xbounds,
    int16_t* kk,
    int kmax,
    int coefs_precision);

void ImagingResampleVerticalConvolution8u(
    uint32_t* C10_RESTRICT lineOut,
    const uint32_t* C10_RESTRICT imIn,
    int xmin,
    int xmax,
    int16_t* k,
    int coefs_precision,
    int xin);
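// Layout of the horiz/vert_indices_weights vectors consumed below (as produced
// by F::compute_indices_int16_weights_aa): entry 0 holds, for each output
// element, the index of its first contributing input element; entry 1 holds
// the number of contributing input elements; entry 3 holds the int16 weights,
// ksize of them per output element, stored in a double tensor and
// reinterpreted via the casts below. Entry 2 is not used in this file.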
void ImagingResampleHorizontal(
    const at::Tensor & unpacked_output,
    const at::Tensor & unpacked_input,
    int ksize,
    const std::vector<at::Tensor>& horiz_indices_weights,
    unsigned int horiz_weights_precision) {
  // TODO: we may want to merge that into the fallback code (currently called
  // basic_loop_aa_horizontal<uint8_t>)
  // Although this may not be needed if / when we port all this code to use
  // Vec.h since this would potentially give us another fallback implementation

  int yy;

  int16_t* kk = (int16_t*)(horiz_indices_weights[3].data_ptr<double>());

  auto xout = unpacked_output.size(2);
  auto yout = unpacked_output.size(1);
  auto xin = unpacked_input.size(2);

  std::vector<int> bounds_vec(2 * xout, 0);
  int* bounds = bounds_vec.data();

  int64_t* idx_ptr_xmin = horiz_indices_weights[0].data_ptr<int64_t>();
  int64_t* idx_ptr_size = horiz_indices_weights[1].data_ptr<int64_t>();
  for (int i = 0; i < xout; i++) {
    bounds[2 * i + 0] = idx_ptr_xmin[i];
    bounds[2 * i + 1] = idx_ptr_size[i];
  }

  uint32_t* unpacked_input_p = (uint32_t*) unpacked_input.data_ptr<uint8_t>();
  uint32_t* unpacked_output_p = (uint32_t*) unpacked_output.data_ptr<uint8_t>();

  yy = 0;
  for (; yy < yout - 3; yy += 4) {
    ImagingResampleHorizontalConvolution8u4x(
        unpacked_output_p + yy * xout,
        unpacked_output_p + (yy + 1) * xout,
        unpacked_output_p + (yy + 2) * xout,
        unpacked_output_p + (yy + 3) * xout,
        unpacked_input_p + yy * xin,
        unpacked_input_p + (yy + 1) * xin,
        unpacked_input_p + (yy + 2) * xin,
        unpacked_input_p + (yy + 3) * xin,
        xout,
        bounds,
        kk,
        ksize,
        (int)horiz_weights_precision);
  }
  for (; yy < yout; yy++) {
    ImagingResampleHorizontalConvolution8u(
        unpacked_output_p + yy * xout,
        unpacked_input_p + yy * xin,
        xout,
        bounds,
        kk,
        ksize,
        (int)horiz_weights_precision);
  }
}
void ImagingResampleVertical(
    const at::Tensor & unpacked_output,
    const at::Tensor & unpacked_input,
    int ksize,
    const std::vector<at::Tensor>& vert_indices_weights,
    unsigned int vert_weights_precision) {
  // TODO: we may want to merge that into the fallback code (currently called
  // basic_loop_aa_vertical<uint8_t>)
  // Although this may not be needed if / when we port all this code to use
  // Vec.h since this would potentially give us another fallback implementation

  int ymin, ymax;
  int16_t* k = nullptr;

  int16_t* kk = (int16_t*)(vert_indices_weights[3].data_ptr<double>());

  int64_t* idx_ptr_xmin = vert_indices_weights[0].data_ptr<int64_t>();
  int64_t* idx_ptr_size = vert_indices_weights[1].data_ptr<int64_t>();

  uint32_t* unpacked_output_p = (uint32_t*) unpacked_output.data_ptr<uint8_t>();
  uint32_t* unpacked_input_p = (uint32_t*) unpacked_input.data_ptr<uint8_t>();

  auto xout = unpacked_output.size(2);
  auto yout = unpacked_output.size(1);

  for (const auto yy : c10::irange(yout)) {
    k = &kk[yy * ksize];
    ymin = idx_ptr_xmin[yy];
    ymax = idx_ptr_size[yy];
    ImagingResampleVerticalConvolution8u(
        unpacked_output_p + yy * xout,
        unpacked_input_p,
        ymin,
        ymax,
        k,
        (int)vert_weights_precision,
        xout);
  }
}
// This is the only public entry point in this file. It supports bilinear mode
// for uint8 dtype when C <= 4, with or without antialias. The implementation
// is based on PIL-SIMD.
// Its equivalent implementation (fallback) for when AVX isn't supported or
// when C > 4 is separable_upsample_generic_Nd_kernel_impl(). There are a bunch
// of future improvements that could be made: look for the TODOs in this file.
// For details on how the weights are computed and how the multiplications are
// run on int (instead of float weights), see
// [ Weights computation for uint8_t and multiplication trick ]
// For details on how the AVX kernels are implemented, see
// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
// See also [ Support for antialias=False as a subcase of antialias=True ] to
// learn more about how the antialias=False case is computed. The same holds
// here: all these kernels are general enough to handle an arbitrary number of
// weights, but when aa=False they could be optimized further.
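// Worked example of the int-weights trick (numbers are illustrative, the
// actual precision is chosen by compute_indices_int16_weights_aa): with
// coefs_precision = 8, a float weight of 0.375 is roughly stored as the int16
// value round(0.375 * (1 << 8)) = 96. The kernels below accumulate
// sum_x(pix[x] * k[x]) in int32 lanes, seed the accumulator with the rounding
// term 1 << (coefs_precision - 1) = 128, arithmetic-shift right by
// coefs_precision, and finally saturate to [0, 255], which approximates
// round(sum_x(pix[x] * w_float[x])) without any float math.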
template <typename scale_type, class F>
void upsample_avx_bilinear_uint8(
    const at::Tensor& input,
    const at::Tensor& output,
    bool align_corners,
    const scale_type& scales,
    bool antialias) {
  auto batch_size = input.size(0);
  auto num_channels = input.size(1);
  auto xin = input.size(3);
  auto yin = input.size(2);
  auto xout = output.size(3);
  auto yout = output.size(2);

  if (xin == xout && yin == yout) {
    output.copy_(input);
    return;
  }

  auto need_horizontal = xout != xin;
  auto need_vertical = yout != yin;

  int ksize_horiz, ksize_vert;
  std::vector<at::Tensor> horiz_indices_weights, vert_indices_weights;
  unsigned int horiz_weights_precision, vert_weights_precision;

  if (need_horizontal) {
    int interp_dim = 3;
    std::tie(horiz_indices_weights, ksize_horiz, horiz_weights_precision) =
        F::compute_indices_int16_weights_aa(
            /*input_size=*/xin,
            /*output_size=*/xout,
            /*stride=*/1,
            /*ndims=*/4,
            /*reshape_dim=*/interp_dim,
            /*align_corners=*/align_corners,
            /*opt_scale=*/scales[interp_dim - 2],
            /*antialias=*/antialias,
            /*align_i32=*/true);
  }

  if (need_vertical) {
    int interp_dim = 2;
    std::tie(vert_indices_weights, ksize_vert, vert_weights_precision) =
        F::compute_indices_int16_weights_aa(
            /*input_size=*/yin,
            /*output_size=*/yout,
            /*stride=*/1,
            /*ndims=*/4,
            /*reshape_dim=*/interp_dim,
            /*align_corners=*/align_corners,
            /*opt_scale=*/scales[interp_dim - 2],
            /*antialias=*/antialias,
            /*align_i32=*/true);
  }

  bool is_rgba = num_channels == 4 && input.is_contiguous(at::MemoryFormat::ChannelsLast);
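  // When the input has exactly 4 channels and is channels_last, each pixel is
  // already a packed 32-bit RGBA value, so the unpack/pack copies below can be
  // skipped and the kernels can read from / write to the input and output
  // tensors directly. Otherwise, intermediate RGBA buffers are allocated for
  // the horizontal and/or vertical passes.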
  at::Tensor buffer_horiz, buffer_vert;
  if (need_horizontal && !(is_rgba && !need_vertical)) {
    buffer_horiz = at::empty({4, yin, xout}, input.options());
  }
  if (need_vertical && !is_rgba) {
    buffer_vert = at::empty({4, yout, xout}, input.options());
  }

  // TODO: The unpack / pack operations create a copy of the original input and
  // output tensor. There should be a way to avoid these copies by instead
  // modifying the low-level kernels. Or maybe at least avoid copying the entire
  // tensors and just copy part of them (line by line).
  for (const auto i : c10::irange(batch_size)) {
    at::Tensor unpacked_input = (is_rgba) ? input[i] : unpack_rgb(input[i]);
    at::Tensor unpacked_output;

    if (need_horizontal) {
      at::Tensor unpacked_output_temp = (is_rgba && !need_vertical) ? output[i] : buffer_horiz;

      ImagingResampleHorizontal(
          unpacked_output_temp,
          unpacked_input,
          ksize_horiz,
          horiz_indices_weights,
          horiz_weights_precision);
      unpacked_output = unpacked_input = unpacked_output_temp;
    }
    if (need_vertical) {
      unpacked_output = (is_rgba) ? output[i] : buffer_vert;

      ImagingResampleVertical(
          unpacked_output,
          unpacked_input,
          ksize_vert,
          vert_indices_weights,
          vert_weights_precision);
    }

    TORCH_INTERNAL_ASSERT(unpacked_output.defined());

    if (!is_rgba) {
      pack_rgb(unpacked_output, output[i]);
    }
  }
}
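
// For reference only: a scalar sketch of the per-pixel computation that the
// AVX horizontal kernels below implement. This helper is illustrative (its
// name and existence are not part of the original PIL-SIMD code) and is not
// called anywhere; it assumes the RGBA-unpacked layout and the fixed-point
// rounding scheme described above.
[[maybe_unused]] static void horizontal_convolution_scalar_sketch(
    uint32_t* lineOut,
    const uint32_t* lineIn,
    int xsize,
    const int* xbounds,
    const int16_t* kk,
    int kmax,
    int coefs_precision) {
  for (int xx = 0; xx < xsize; xx++) {
    int xmin = xbounds[xx * 2 + 0];
    int xmax = xbounds[xx * 2 + 1];
    const int16_t* k = &kk[xx * kmax];
    uint8_t* out = (uint8_t*)&lineOut[xx];
    for (int c = 0; c < 4; c++) {
      // Seed with the rounding term, accumulate weighted channel values, then
      // shift back and saturate, mirroring the packs/packus sequence below.
      int32_t ss = 1 << (coefs_precision - 1);
      for (int x = 0; x < xmax; x++) {
        const uint8_t* pix = (const uint8_t*)&lineIn[x + xmin];
        ss += pix[c] * k[x];
      }
      ss >>= coefs_precision;
      out[c] = (uint8_t)(ss < 0 ? 0 : (ss > 255 ? 255 : ss));
    }
  }
}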

// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
void ImagingResampleHorizontalConvolution8u4x(
    uint32_t* C10_RESTRICT lineOut0,
    uint32_t* C10_RESTRICT lineOut1,
    uint32_t* C10_RESTRICT lineOut2,
    uint32_t* C10_RESTRICT lineOut3,
    const uint32_t* C10_RESTRICT lineIn0,
    const uint32_t* C10_RESTRICT lineIn1,
    const uint32_t* C10_RESTRICT lineIn2,
    const uint32_t* C10_RESTRICT lineIn3,
    int xsize,
    int* xbounds,
    int16_t* kk,
    int kmax,
    int coefs_precision) {
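  // Each iteration of the loop below produces one output pixel in each of the
  // 4 rows. The 4-tap and 2-tap blocks rely on the same trick: the bytes of
  // two adjacent input pixels are shuffled into 16-bit pairs
  // (R0,R1, G0,G1, B0,B1, A0,A1) so that a single _mm256_madd_epi16 with the
  // coefficient pair (k[x], k[x+1]) adds, per channel, the partial sum
  // pix[x] * k[x] + pix[x+1] * k[x+1] into a 32-bit lane. Rows 0/1 go through
  // the low/high 128-bit halves of sss0, rows 2/3 through sss1.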
  int xmin, xmax, x;
  int16_t* k;

  for (const auto xx : c10::irange(xsize)) {
    xmin = xbounds[xx * 2 + 0];
    xmax = xbounds[xx * 2 + 1];
    k = &kk[xx * kmax];
    x = 0;

    __m256i sss0, sss1;
    __m256i zero = _mm256_setzero_si256();
    __m256i initial = _mm256_set1_epi32(1 << (coefs_precision - 1));
    sss0 = initial;
    sss1 = initial;

    for (; x < xmax - 3; x += 4) {
      __m256i pix, mmk0, mmk1, source;

      mmk0 = _mm256_set1_epi32(*(int32_t*)&k[x]);
      mmk1 = _mm256_set1_epi32(*(int32_t*)&k[x + 2]);

      source = _mm256_inserti128_si256(
          _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lineIn0[x + xmin])),
          _mm_loadu_si128((__m128i*)&lineIn1[x + xmin]),
          1);

      // clang-format off
      pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
          -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
          -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk0));
      pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
          -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
          -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk1));

      source = _mm256_inserti128_si256(
          _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lineIn2[x + xmin])),
          _mm_loadu_si128((__m128i*)&lineIn3[x + xmin]),
          1);
      pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
          -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
          -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk0));
      pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
          -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
          -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk1));
    }

    for (; x < xmax - 1; x += 2) {
      __m256i pix, mmk;

      mmk = _mm256_set1_epi32(*(int32_t*)&k[x]);

      pix = _mm256_inserti128_si256(
          _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lineIn0[x + xmin])),
          _mm_loadl_epi64((__m128i*)&lineIn1[x + xmin]),
          1);
      pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8(
          -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
          -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));

      pix = _mm256_inserti128_si256(
          _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lineIn2[x + xmin])),
          _mm_loadl_epi64((__m128i*)&lineIn3[x + xmin]),
          1);
      pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8(
          -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
          -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
      // clang-format on
    }

    for (; x < xmax; x++) {
      __m256i pix, mmk;

      // [16] xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0
      mmk = _mm256_set1_epi32(k[x]);

      // [16] xx a0 xx b0 xx g0 xx r0 xx a0 xx b0 xx g0 xx r0
      pix = _mm256_inserti128_si256(
          _mm256_castsi128_si256(mm_cvtepu8_epi32(&lineIn0[x + xmin])),
          mm_cvtepu8_epi32(&lineIn1[x + xmin]),
          1);
      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));

      pix = _mm256_inserti128_si256(
          _mm256_castsi128_si256(mm_cvtepu8_epi32(&lineIn2[x + xmin])),
          mm_cvtepu8_epi32(&lineIn3[x + xmin]),
          1);
      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
    }

    sss0 = _mm256_srai_epi32(sss0, coefs_precision);
    sss1 = _mm256_srai_epi32(sss1, coefs_precision);
    sss0 = _mm256_packs_epi32(sss0, zero);
    sss1 = _mm256_packs_epi32(sss1, zero);
    sss0 = _mm256_packus_epi16(sss0, zero);
    sss1 = _mm256_packus_epi16(sss1, zero);
    lineOut0[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 0));
    lineOut1[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 1));
    lineOut2[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 0));
    lineOut3[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 1));
  }
}

// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
void ImagingResampleHorizontalConvolution8u(
    uint32_t* C10_RESTRICT lineOut,
    const uint32_t* C10_RESTRICT lineIn,
    int xsize,
    int* xbounds,
    int16_t* kk,
    int kmax,
    int coefs_precision) {
  int xmin, xmax, x;
  int16_t* k;

  for (const auto xx : c10::irange(xsize)) {
    __m128i sss;
    xmin = xbounds[xx * 2 + 0];
    xmax = xbounds[xx * 2 + 1];
    k = &kk[xx * kmax];
    x = 0;

    if (xmax < 8) {
      sss = _mm_set1_epi32(1 << (coefs_precision - 1));
    } else {
      // The accumulation is split across the two 128-bit halves of sss256,
      // which are summed together at the end, so each half is seeded with
      // only half of the rounding term.
      __m256i sss256 = _mm256_set1_epi32(1 << (coefs_precision - 2));

      for (; x < xmax - 7; x += 8) {
        __m256i pix, mmk, source;
        __m128i tmp = _mm_loadu_si128((__m128i*)&k[x]);
        __m256i ksource =
            _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1);

        // clang-format off
        source = _mm256_loadu_si256((__m256i*)&lineIn[x + xmin]);
        pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
            -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
            -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
        mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
            11,10, 9,8, 11,10, 9,8, 11,10, 9,8, 11,10, 9,8,
            3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
        sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));

        pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
            -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
            -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
        mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
            15,14, 13,12, 15,14, 13,12, 15,14, 13,12, 15,14, 13,12,
            7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4));
        sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
        // clang-format on
      }

      for (; x < xmax - 3; x += 4) {
        __m256i pix, mmk, source;
        __m128i tmp = _mm_loadl_epi64((__m128i*)&k[x]);
        __m256i ksource =
            _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1);

        tmp = _mm_loadu_si128((__m128i*)&lineIn[x + xmin]);
        source = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1);

        // clang-format off
        pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
            -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
            -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
        mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
            7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4,
            3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
        sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
        // clang-format on
      }

      sss = _mm_add_epi32(
          _mm256_extracti128_si256(sss256, 0),
          _mm256_extracti128_si256(sss256, 1));
    }

    for (; x < xmax - 1; x += 2) {
      __m128i mmk = _mm_set1_epi32(*(int32_t*)&k[x]);
      __m128i source = _mm_loadl_epi64((__m128i*)&lineIn[x + xmin]);
      __m128i pix = _mm_shuffle_epi8(
          source,
          _mm_set_epi8(-1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0));
      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
    }

    for (; x < xmax; x++) {
      __m128i pix = mm_cvtepu8_epi32(&lineIn[x + xmin]);
      __m128i mmk = _mm_set1_epi32(k[x]);
      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
    }
    sss = _mm_srai_epi32(sss, coefs_precision);
    sss = _mm_packs_epi32(sss, sss);
    lineOut[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss, sss));
  }
}
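
// For reference only: a scalar sketch of what
// ImagingResampleVerticalConvolution8u below computes for one output row. As
// with the horizontal sketch above, this helper is illustrative, is not called
// anywhere, and its name is not part of the original PIL-SIMD code. imIn is
// the full unpacked image with xin uint32 pixels per row; rows
// [xmin, xmin + xmax) contribute with weights k.
[[maybe_unused]] static void vertical_convolution_scalar_sketch(
    uint32_t* lineOut,
    const uint32_t* imIn,
    int xmin,
    int xmax,
    const int16_t* k,
    int coefs_precision,
    int xin) {
  for (int xx = 0; xx < xin; xx++) {
    uint8_t* out = (uint8_t*)&lineOut[xx];
    for (int c = 0; c < 4; c++) {
      int32_t ss = 1 << (coefs_precision - 1); // rounding term
      for (int x = 0; x < xmax; x++) {
        const uint8_t* pix = (const uint8_t*)&imIn[(x + xmin) * xin + xx];
        ss += pix[c] * k[x];
      }
      ss >>= coefs_precision;
      out[c] = (uint8_t)(ss < 0 ? 0 : (ss > 255 ? 255 : ss));
    }
  }
}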

// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
void ImagingResampleVerticalConvolution8u(
    uint32_t* C10_RESTRICT lineOut,
    const uint32_t* C10_RESTRICT imIn,
    int xmin,
    int xmax,
    int16_t* k,
    int coefs_precision,
    int xin) {
  int x;
  int xx = 0;
  int xsize = xin;

  __m128i initial = _mm_set1_epi32(1 << (coefs_precision - 1));
  __m256i initial_256 = _mm256_set1_epi32(1 << (coefs_precision - 1));

  for (; xx < xsize - 7; xx += 8) {
    __m256i sss0 = initial_256;
    __m256i sss1 = initial_256;
    __m256i sss2 = initial_256;
    __m256i sss3 = initial_256;
    x = 0;
    for (; x < xmax - 1; x += 2) {
      __m256i source, source1, source2;
      __m256i pix, mmk;

      // Load two coefficients at once
      mmk = _mm256_set1_epi32(*(int32_t*)&k[x]);

      // Load 2 lines
      // (__m256i *) &imIn->image32[x + xmin][xx]
      source1 = _mm256_loadu_si256((__m256i*)(imIn + (x + xmin) * xin + xx));
      // (__m256i *) &imIn->image32[x + 1 + xmin][xx]
      source2 =
          _mm256_loadu_si256((__m256i*)(imIn + (x + 1 + xmin) * xin + xx));

      source = _mm256_unpacklo_epi8(source1, source2);
      pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
      pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));

      source = _mm256_unpackhi_epi8(source1, source2);
      pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
      sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix, mmk));
      pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
      sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix, mmk));
    }
    for (; x < xmax; x += 1) {
      __m256i source, source1, pix, mmk;
      mmk = _mm256_set1_epi32(k[x]);

      // (__m256i *) &imIn->image32[x + xmin][xx]
      source1 = _mm256_loadu_si256((__m256i*)(imIn + (x + xmin) * xin + xx));

      source = _mm256_unpacklo_epi8(source1, _mm256_setzero_si256());
      pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
      pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));

      source = _mm256_unpackhi_epi8(source1, _mm256_setzero_si256());
      pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
      sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix, mmk));
      pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
      sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix, mmk));
    }
    sss0 = _mm256_srai_epi32(sss0, coefs_precision);
    sss1 = _mm256_srai_epi32(sss1, coefs_precision);
    sss2 = _mm256_srai_epi32(sss2, coefs_precision);
    sss3 = _mm256_srai_epi32(sss3, coefs_precision);

    sss0 = _mm256_packs_epi32(sss0, sss1);
    sss2 = _mm256_packs_epi32(sss2, sss3);
    sss0 = _mm256_packus_epi16(sss0, sss2);
    _mm256_storeu_si256((__m256i*)&lineOut[xx], sss0);
  }

  for (; xx < xsize - 1; xx += 2) {
    __m128i sss0 = initial; // left output column
    __m128i sss1 = initial; // right output column
    x = 0;
    for (; x < xmax - 1; x += 2) {
      __m128i source, source1, source2;
      __m128i pix, mmk;

      // Load two coefficients at once
      mmk = _mm_set1_epi32(*(int32_t*)&k[x]);

      // Load 2 lines
      // (__m128i *) &imIn->image32[x + xmin][xx]
      source1 = _mm_loadl_epi64((__m128i*)(imIn + (x + xmin) * xin + xx));
      // (__m128i *) &imIn->image32[x + 1 + xmin][xx]
      source2 = _mm_loadl_epi64((__m128i*)(imIn + (x + 1 + xmin) * xin + xx));

      source = _mm_unpacklo_epi8(source1, source2);
      pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
      sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
      pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
      sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
    }
    for (; x < xmax; x += 1) {
      __m128i source, source1, pix, mmk;
      mmk = _mm_set1_epi32(k[x]);

      // (__m128i *) &imIn->image32[x + xmin][xx]
      source1 = _mm_loadl_epi64((__m128i*)(imIn + (x + xmin) * xin + xx));

      source = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
      pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
      sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
      pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
      sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
    }
    sss0 = _mm_srai_epi32(sss0, coefs_precision);
    sss1 = _mm_srai_epi32(sss1, coefs_precision);

    sss0 = _mm_packs_epi32(sss0, sss1);
    sss0 = _mm_packus_epi16(sss0, sss0);
    _mm_storel_epi64((__m128i*)&lineOut[xx], sss0);
  }

  for (; xx < xsize; xx++) {
    __m128i sss = initial;
    x = 0;
    for (; x < xmax - 1; x += 2) {
      __m128i source, source1, source2;
      __m128i pix, mmk;

      // Load two coefficients at once
      mmk = _mm_set1_epi32(*(int32_t*)&k[x]);

      // Load 2 lines
      // *(int *) &imIn->image32[x + xmin][xx]
      source1 = _mm_cvtsi32_si128(*(int*)(imIn + (x + xmin) * xin + xx));
      // *(int *) &imIn->image32[x + 1 + xmin][xx]
      source2 = _mm_cvtsi32_si128(*(int*)(imIn + (x + 1 + xmin) * xin + xx));

      source = _mm_unpacklo_epi8(source1, source2);
      pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
    }
    for (; x < xmax; x++) {
      // &imIn->image32[x + xmin][xx]
      __m128i pix = mm_cvtepu8_epi32(imIn + (x + xmin) * xin + xx);
      __m128i mmk = _mm_set1_epi32(k[x]);
      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
    }
    sss = _mm_srai_epi32(sss, coefs_precision);
    sss = _mm_packs_epi32(sss, sss);
    lineOut[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss, sss));
  }
}
} // anonymous namespace
#endif // CPU_CAPABILITY_AVX2