psimd.h 44 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384
  1. #pragma once
  2. #ifndef PSIMD_H
  3. #define PSIMD_H
  4. #if defined(__CUDA_ARCH__)
  5. /* CUDA compiler */
  6. #define PSIMD_INTRINSIC __forceinline__ __device__
  7. #elif defined(__OPENCL_VERSION__)
  8. /* OpenCL compiler */
  9. #define PSIMD_INTRINSIC inline static
  10. #elif defined(__INTEL_COMPILER)
  11. /* Intel compiler, even on Windows */
  12. #define PSIMD_INTRINSIC inline static __attribute__((__always_inline__))
  13. #elif defined(__GNUC__)
  14. /* GCC-compatible compiler (gcc/clang/icc) */
  15. #define PSIMD_INTRINSIC inline static __attribute__((__always_inline__))
  16. #elif defined(_MSC_VER)
  17. /* MSVC-compatible compiler (cl/icl/clang-cl) */
  18. #define PSIMD_INTRINSIC __forceinline static
  19. #elif defined(__cplusplus)
  20. /* Generic C++ compiler */
  21. #define PSIMD_INTRINSIC inline static
  22. #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
  23. /* Generic C99 compiler */
  24. #define PSIMD_INTRINSIC inline static
  25. #else
  26. /* Generic C compiler */
  27. #define PSIMD_INTRINSIC static
  28. #endif
  29. #if defined(__GNUC__) || defined(__clang__)
  30. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  31. #include <arm_neon.h>
  32. #endif
  33. #if defined(__SSE2__)
  34. #include <emmintrin.h>
  35. #endif
  36. #if defined(__SSE3__)
  37. #include <pmmintrin.h>
  38. #endif
  39. #if defined(__SSSE3__)
  40. #include <tmmintrin.h>
  41. #endif
  42. #if defined(__SSE4_1__)
  43. #include <smmintrin.h>
  44. #endif
  45. #if defined(__SSE4_2__)
  46. #include <nmmintrin.h>
  47. #endif
  48. #if defined(__AVX__)
  49. #include <immintrin.h>
  50. #endif
  51. #elif defined(_MSC_VER)
  52. #include <intrin.h>
  53. #endif
  54. #if defined(__cplusplus)
  55. #define PSIMD_CXX_SYNTAX
  56. #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
  57. #define PSIMD_C11_SYNTAX
  58. #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
  59. #define PSIMD_C99_SYNTAX
  60. #else
  61. #define PSIMD_C89_SYNTAX
  62. #endif
  63. #if defined(__cplusplus) && (__cplusplus >= 201103L)
  64. #include <cstddef>
  65. #include <cstdint>
  66. #elif !defined(__OPENCL_VERSION__)
  67. #include <stddef.h>
  68. #include <stdint.h>
  69. #endif
  70. #if defined(__GNUC__) || defined(__clang__)
  71. #define PSIMD_HAVE_F64 0
  72. #define PSIMD_HAVE_F32 1
  73. #define PSIMD_HAVE_U8 1
  74. #define PSIMD_HAVE_S8 1
  75. #define PSIMD_HAVE_U16 1
  76. #define PSIMD_HAVE_S16 1
  77. #define PSIMD_HAVE_U32 1
  78. #define PSIMD_HAVE_S32 1
  79. #define PSIMD_HAVE_U64 0
  80. #define PSIMD_HAVE_S64 0
  81. typedef int8_t psimd_s8 __attribute__((vector_size(16), aligned(1)));
  82. typedef uint8_t psimd_u8 __attribute__((vector_size(16), aligned(1)));
  83. typedef int16_t psimd_s16 __attribute__((vector_size(16), aligned(2)));
  84. typedef uint16_t psimd_u16 __attribute__((vector_size(16), aligned(2)));
  85. typedef int32_t psimd_s32 __attribute__((vector_size(16), aligned(4)));
  86. typedef uint32_t psimd_u32 __attribute__((vector_size(16), aligned(4)));
  87. typedef float psimd_f32 __attribute__((vector_size(16), aligned(4)));
  88. typedef struct {
  89. psimd_s8 lo;
  90. psimd_s8 hi;
  91. } psimd_s8x2;
  92. typedef struct {
  93. psimd_u8 lo;
  94. psimd_u8 hi;
  95. } psimd_u8x2;
  96. typedef struct {
  97. psimd_s16 lo;
  98. psimd_s16 hi;
  99. } psimd_s16x2;
  100. typedef struct {
  101. psimd_u16 lo;
  102. psimd_u16 hi;
  103. } psimd_u16x2;
  104. typedef struct {
  105. psimd_s32 lo;
  106. psimd_s32 hi;
  107. } psimd_s32x2;
  108. typedef struct {
  109. psimd_u32 lo;
  110. psimd_u32 hi;
  111. } psimd_u32x2;
  112. typedef struct {
  113. psimd_f32 lo;
  114. psimd_f32 hi;
  115. } psimd_f32x2;
  116. /* Bit casts */
  117. PSIMD_INTRINSIC psimd_u32x2 psimd_cast_s32x2_u32x2(psimd_s32x2 v) {
  118. return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
  119. }
  120. PSIMD_INTRINSIC psimd_f32x2 psimd_cast_s32x2_f32x2(psimd_s32x2 v) {
  121. return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
  122. }
  123. PSIMD_INTRINSIC psimd_s32x2 psimd_cast_u32x2_s32x2(psimd_u32x2 v) {
  124. return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
  125. }
  126. PSIMD_INTRINSIC psimd_f32x2 psimd_cast_u32x2_f32x2(psimd_u32x2 v) {
  127. return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
  128. }
  129. PSIMD_INTRINSIC psimd_s32x2 psimd_cast_f32x2_s32x2(psimd_f32x2 v) {
  130. return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
  131. }
  132. PSIMD_INTRINSIC psimd_u32x2 psimd_cast_f32x2_u32x2(psimd_f32x2 v) {
  133. return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
  134. }
  135. /* Swap */
  136. PSIMD_INTRINSIC void psimd_swap_s8(psimd_s8 a[1], psimd_s8 b[1]) {
  137. const psimd_s8 new_a = *b;
  138. const psimd_s8 new_b = *a;
  139. *a = new_a;
  140. *b = new_b;
  141. }
  142. PSIMD_INTRINSIC void psimd_swap_u8(psimd_u8 a[1], psimd_u8 b[1]) {
  143. const psimd_u8 new_a = *b;
  144. const psimd_u8 new_b = *a;
  145. *a = new_a;
  146. *b = new_b;
  147. }
  148. PSIMD_INTRINSIC void psimd_swap_s16(psimd_s16 a[1], psimd_s16 b[1]) {
  149. const psimd_s16 new_a = *b;
  150. const psimd_s16 new_b = *a;
  151. *a = new_a;
  152. *b = new_b;
  153. }
  154. PSIMD_INTRINSIC void psimd_swap_u16(psimd_u16 a[1], psimd_u16 b[1]) {
  155. const psimd_u16 new_a = *b;
  156. const psimd_u16 new_b = *a;
  157. *a = new_a;
  158. *b = new_b;
  159. }
  160. PSIMD_INTRINSIC void psimd_swap_s32(psimd_s32 a[1], psimd_s32 b[1]) {
  161. const psimd_s32 new_a = *b;
  162. const psimd_s32 new_b = *a;
  163. *a = new_a;
  164. *b = new_b;
  165. }
  166. PSIMD_INTRINSIC void psimd_swap_u32(psimd_u32 a[1], psimd_u32 b[1]) {
  167. const psimd_u32 new_a = *b;
  168. const psimd_u32 new_b = *a;
  169. *a = new_a;
  170. *b = new_b;
  171. }
  172. PSIMD_INTRINSIC void psimd_swap_f32(psimd_f32 a[1], psimd_f32 b[1]) {
  173. const psimd_f32 new_a = *b;
  174. const psimd_f32 new_b = *a;
  175. *a = new_a;
  176. *b = new_b;
  177. }
  178. /* Zero-initialization */
  179. PSIMD_INTRINSIC psimd_s8 psimd_zero_s8(void) {
  180. return (psimd_s8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  181. }
  182. PSIMD_INTRINSIC psimd_u8 psimd_zero_u8(void) {
  183. return (psimd_u8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  184. }
  185. PSIMD_INTRINSIC psimd_s16 psimd_zero_s16(void) {
  186. return (psimd_s16) { 0, 0, 0, 0, 0, 0, 0, 0 };
  187. }
  188. PSIMD_INTRINSIC psimd_u16 psimd_zero_u16(void) {
  189. return (psimd_u16) { 0, 0, 0, 0, 0, 0, 0, 0 };
  190. }
  191. PSIMD_INTRINSIC psimd_s32 psimd_zero_s32(void) {
  192. return (psimd_s32) { 0, 0, 0, 0 };
  193. }
  194. PSIMD_INTRINSIC psimd_u32 psimd_zero_u32(void) {
  195. return (psimd_u32) { 0, 0, 0, 0 };
  196. }
  197. PSIMD_INTRINSIC psimd_f32 psimd_zero_f32(void) {
  198. return (psimd_f32) { 0.0f, 0.0f, 0.0f, 0.0f };
  199. }
  200. /* Initialization to the same constant */
  201. PSIMD_INTRINSIC psimd_s8 psimd_splat_s8(int8_t c) {
  202. return (psimd_s8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
  203. }
  204. PSIMD_INTRINSIC psimd_u8 psimd_splat_u8(uint8_t c) {
  205. return (psimd_u8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
  206. }
  207. PSIMD_INTRINSIC psimd_s16 psimd_splat_s16(int16_t c) {
  208. return (psimd_s16) { c, c, c, c, c, c, c, c };
  209. }
  210. PSIMD_INTRINSIC psimd_u16 psimd_splat_u16(uint16_t c) {
  211. return (psimd_u16) { c, c, c, c, c, c, c, c };
  212. }
  213. PSIMD_INTRINSIC psimd_s32 psimd_splat_s32(int32_t c) {
  214. return (psimd_s32) { c, c, c, c };
  215. }
  216. PSIMD_INTRINSIC psimd_u32 psimd_splat_u32(uint32_t c) {
  217. return (psimd_u32) { c, c, c, c };
  218. }
  219. PSIMD_INTRINSIC psimd_f32 psimd_splat_f32(float c) {
  220. return (psimd_f32) { c, c, c, c };
  221. }
  222. /* Load vector */
  223. PSIMD_INTRINSIC psimd_s8 psimd_load_s8(const void* address) {
  224. return *((const psimd_s8*) address);
  225. }
  226. PSIMD_INTRINSIC psimd_u8 psimd_load_u8(const void* address) {
  227. return *((const psimd_u8*) address);
  228. }
  229. PSIMD_INTRINSIC psimd_s16 psimd_load_s16(const void* address) {
  230. return *((const psimd_s16*) address);
  231. }
  232. PSIMD_INTRINSIC psimd_u16 psimd_load_u16(const void* address) {
  233. return *((const psimd_u16*) address);
  234. }
  235. PSIMD_INTRINSIC psimd_s32 psimd_load_s32(const void* address) {
  236. return *((const psimd_s32*) address);
  237. }
  238. PSIMD_INTRINSIC psimd_u32 psimd_load_u32(const void* address) {
  239. return *((const psimd_u32*) address);
  240. }
  241. PSIMD_INTRINSIC psimd_f32 psimd_load_f32(const void* address) {
  242. return *((const psimd_f32*) address);
  243. }
  244. PSIMD_INTRINSIC psimd_s8 psimd_load_splat_s8(const void* address) {
  245. return psimd_splat_s8(*((const int8_t*) address));
  246. }
  247. PSIMD_INTRINSIC psimd_u8 psimd_load_splat_u8(const void* address) {
  248. return psimd_splat_u8(*((const uint8_t*) address));
  249. }
  250. PSIMD_INTRINSIC psimd_s16 psimd_load_splat_s16(const void* address) {
  251. return psimd_splat_s16(*((const int16_t*) address));
  252. }
  253. PSIMD_INTRINSIC psimd_u16 psimd_load_splat_u16(const void* address) {
  254. return psimd_splat_u16(*((const uint16_t*) address));
  255. }
  256. PSIMD_INTRINSIC psimd_s32 psimd_load_splat_s32(const void* address) {
  257. return psimd_splat_s32(*((const int32_t*) address));
  258. }
  259. PSIMD_INTRINSIC psimd_u32 psimd_load_splat_u32(const void* address) {
  260. return psimd_splat_u32(*((const uint32_t*) address));
  261. }
  262. PSIMD_INTRINSIC psimd_f32 psimd_load_splat_f32(const void* address) {
  263. return psimd_splat_f32(*((const float*) address));
  264. }
  265. PSIMD_INTRINSIC psimd_s32 psimd_load1_s32(const void* address) {
  266. return (psimd_s32) { *((const int32_t*) address), 0, 0, 0 };
  267. }
  268. PSIMD_INTRINSIC psimd_u32 psimd_load1_u32(const void* address) {
  269. return (psimd_u32) { *((const uint32_t*) address), 0, 0, 0 };
  270. }
  271. PSIMD_INTRINSIC psimd_f32 psimd_load1_f32(const void* address) {
  272. return (psimd_f32) { *((const float*) address), 0.0f, 0.0f, 0.0f };
  273. }
  274. PSIMD_INTRINSIC psimd_s32 psimd_load2_s32(const void* address) {
  275. const int32_t* address_s32 = (const int32_t*) address;
  276. return (psimd_s32) { address_s32[0], address_s32[1], 0, 0 };
  277. }
  278. PSIMD_INTRINSIC psimd_u32 psimd_load2_u32(const void* address) {
  279. const uint32_t* address_u32 = (const uint32_t*) address;
  280. return (psimd_u32) { address_u32[0], address_u32[1], 0, 0 };
  281. }
  282. PSIMD_INTRINSIC psimd_f32 psimd_load2_f32(const void* address) {
  283. const float* address_f32 = (const float*) address;
  284. return (psimd_f32) { address_f32[0], address_f32[1], 0.0f, 0.0f };
  285. }
  286. PSIMD_INTRINSIC psimd_s32 psimd_load3_s32(const void* address) {
  287. const int32_t* address_s32 = (const int32_t*) address;
  288. return (psimd_s32) { address_s32[0], address_s32[1], address_s32[2], 0 };
  289. }
  290. PSIMD_INTRINSIC psimd_u32 psimd_load3_u32(const void* address) {
  291. const uint32_t* address_u32 = (const uint32_t*) address;
  292. return (psimd_u32) { address_u32[0], address_u32[1], address_u32[2], 0 };
  293. }
  294. PSIMD_INTRINSIC psimd_f32 psimd_load3_f32(const void* address) {
  295. const float* address_f32 = (const float*) address;
  296. return (psimd_f32) { address_f32[0], address_f32[1], address_f32[2], 0.0f };
  297. }
  298. PSIMD_INTRINSIC psimd_s32 psimd_load4_s32(const void* address) {
  299. return psimd_load_s32(address);
  300. }
  301. PSIMD_INTRINSIC psimd_u32 psimd_load4_u32(const void* address) {
  302. return psimd_load_u32(address);
  303. }
  304. PSIMD_INTRINSIC psimd_f32 psimd_load4_f32(const void* address) {
  305. return psimd_load_f32(address);
  306. }
  307. PSIMD_INTRINSIC psimd_f32 psimd_load_stride2_f32(const void* address) {
  308. const psimd_f32 v0x1x = psimd_load_f32(address);
  309. const psimd_f32 vx2x3 = psimd_load_f32((const float*) address + 3);
  310. #if defined(__clang__)
  311. return __builtin_shufflevector(v0x1x, vx2x3, 0, 2, 5, 7);
  312. #else
  313. return __builtin_shuffle(v0x1x, vx2x3, (psimd_s32) { 0, 2, 5, 7 });
  314. #endif
  315. }
  316. PSIMD_INTRINSIC psimd_f32 psimd_load1_stride2_f32(const void* address) {
  317. return psimd_load_f32(address);
  318. }
  319. PSIMD_INTRINSIC psimd_f32 psimd_load2_stride2_f32(const void* address) {
  320. const float* address_f32 = (const float*) address;
  321. return (psimd_f32) { address_f32[0], address_f32[2], 0.0f, 0.0f };
  322. }
  323. PSIMD_INTRINSIC psimd_f32 psimd_load3_stride2_f32(const void* address) {
  324. const psimd_f32 v0x1x = psimd_load_f32(address);
  325. const psimd_f32 v2zzz = psimd_load1_f32((const float*) address + 2);
  326. #if defined(__clang__)
  327. return __builtin_shufflevector(v0x1x, v2zzz, 0, 2, 4, 6);
  328. #else
  329. return __builtin_shuffle(v0x1x, v2zzz, (psimd_s32) { 0, 2, 4, 6 });
  330. #endif
  331. }
  332. PSIMD_INTRINSIC psimd_f32 psimd_load4_stride2_f32(const void* address) {
  333. return psimd_load_stride2_f32(address);
  334. }
  335. PSIMD_INTRINSIC psimd_f32 psimd_load_stride_f32(const void* address, size_t stride) {
  336. const float* address0_f32 = (const float*) address;
  337. const float* address1_f32 = address0_f32 + stride;
  338. const float* address2_f32 = address1_f32 + stride;
  339. const float* address3_f32 = address2_f32 + stride;
  340. return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, *address3_f32 };
  341. }
  342. PSIMD_INTRINSIC psimd_f32 psimd_load1_stride_f32(const void* address, size_t stride) {
  343. return psimd_load1_f32(address);
  344. }
  345. PSIMD_INTRINSIC psimd_f32 psimd_load2_stride_f32(const void* address, size_t stride) {
  346. const float* address_f32 = (const float*) address;
  347. return (psimd_f32) { address_f32[0], address_f32[stride], 0.0f, 0.0f };
  348. }
  349. PSIMD_INTRINSIC psimd_f32 psimd_load3_stride_f32(const void* address, size_t stride) {
  350. const float* address0_f32 = (const float*) address;
  351. const float* address1_f32 = address0_f32 + stride;
  352. const float* address2_f32 = address1_f32 + stride;
  353. return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, 0.0f };
  354. }
  355. PSIMD_INTRINSIC psimd_f32 psimd_load4_stride_f32(const void* address, size_t stride) {
  356. return psimd_load_stride_f32(address, stride);
  357. }
  358. /* Store vector */
  359. PSIMD_INTRINSIC void psimd_store_s8(void* address, psimd_s8 value) {
  360. *((psimd_s8*) address) = value;
  361. }
  362. PSIMD_INTRINSIC void psimd_store_u8(void* address, psimd_u8 value) {
  363. *((psimd_u8*) address) = value;
  364. }
  365. PSIMD_INTRINSIC void psimd_store_s16(void* address, psimd_s16 value) {
  366. *((psimd_s16*) address) = value;
  367. }
  368. PSIMD_INTRINSIC void psimd_store_u16(void* address, psimd_u16 value) {
  369. *((psimd_u16*) address) = value;
  370. }
  371. PSIMD_INTRINSIC void psimd_store_s32(void* address, psimd_s32 value) {
  372. *((psimd_s32*) address) = value;
  373. }
  374. PSIMD_INTRINSIC void psimd_store_u32(void* address, psimd_u32 value) {
  375. *((psimd_u32*) address) = value;
  376. }
  377. PSIMD_INTRINSIC void psimd_store_f32(void* address, psimd_f32 value) {
  378. *((psimd_f32*) address) = value;
  379. }
  380. PSIMD_INTRINSIC void psimd_store1_s32(void* address, psimd_s32 value) {
  381. *((int32_t*) address) = value[0];
  382. }
  383. PSIMD_INTRINSIC void psimd_store1_u32(void* address, psimd_u32 value) {
  384. *((uint32_t*) address) = value[0];
  385. }
  386. PSIMD_INTRINSIC void psimd_store1_f32(void* address, psimd_f32 value) {
  387. *((float*) address) = value[0];
  388. }
  389. PSIMD_INTRINSIC void psimd_store2_s32(void* address, psimd_s32 value) {
  390. int32_t* address_s32 = (int32_t*) address;
  391. address_s32[0] = value[0];
  392. address_s32[1] = value[1];
  393. }
  394. PSIMD_INTRINSIC void psimd_store2_u32(void* address, psimd_u32 value) {
  395. uint32_t* address_u32 = (uint32_t*) address;
  396. address_u32[0] = value[0];
  397. address_u32[1] = value[1];
  398. }
  399. PSIMD_INTRINSIC void psimd_store2_f32(void* address, psimd_f32 value) {
  400. float* address_f32 = (float*) address;
  401. address_f32[0] = value[0];
  402. address_f32[1] = value[1];
  403. }
  404. PSIMD_INTRINSIC void psimd_store3_s32(void* address, psimd_s32 value) {
  405. int32_t* address_s32 = (int32_t*) address;
  406. address_s32[0] = value[0];
  407. address_s32[1] = value[1];
  408. address_s32[2] = value[2];
  409. }
  410. PSIMD_INTRINSIC void psimd_store3_u32(void* address, psimd_u32 value) {
  411. uint32_t* address_u32 = (uint32_t*) address;
  412. address_u32[0] = value[0];
  413. address_u32[1] = value[1];
  414. address_u32[2] = value[2];
  415. }
  416. PSIMD_INTRINSIC void psimd_store3_f32(void* address, psimd_f32 value) {
  417. float* address_f32 = (float*) address;
  418. address_f32[0] = value[0];
  419. address_f32[1] = value[1];
  420. address_f32[2] = value[2];
  421. }
  422. PSIMD_INTRINSIC void psimd_store4_s32(void* address, psimd_s32 value) {
  423. psimd_store_s32(address, value);
  424. }
  425. PSIMD_INTRINSIC void psimd_store4_u32(void* address, psimd_u32 value) {
  426. psimd_store_u32(address, value);
  427. }
  428. PSIMD_INTRINSIC void psimd_store4_f32(void* address, psimd_f32 value) {
  429. psimd_store_f32(address, value);
  430. }
  431. PSIMD_INTRINSIC void psimd_store_stride_f32(void* address, size_t stride, psimd_f32 value) {
  432. float* address0_f32 = (float*) address;
  433. float* address1_f32 = address0_f32 + stride;
  434. float* address2_f32 = address1_f32 + stride;
  435. float* address3_f32 = address2_f32 + stride;
  436. *address0_f32 = value[0];
  437. *address1_f32 = value[1];
  438. *address2_f32 = value[2];
  439. *address3_f32 = value[3];
  440. }
  441. PSIMD_INTRINSIC void psimd_store1_stride_f32(void* address, size_t stride, psimd_f32 value) {
  442. psimd_store1_f32(address, value);
  443. }
  444. PSIMD_INTRINSIC void psimd_store2_stride_f32(void* address, size_t stride, psimd_f32 value) {
  445. float* address_f32 = (float*) address;
  446. address_f32[0] = value[0];
  447. address_f32[stride] = value[1];
  448. }
  449. PSIMD_INTRINSIC void psimd_store3_stride_f32(void* address, size_t stride, psimd_f32 value) {
  450. float* address0_f32 = (float*) address;
  451. float* address1_f32 = address0_f32 + stride;
  452. float* address2_f32 = address1_f32 + stride;
  453. *address0_f32 = value[0];
  454. *address1_f32 = value[1];
  455. *address2_f32 = value[2];
  456. }
  457. /* Vector addition */
  458. PSIMD_INTRINSIC psimd_s8 psimd_add_s8(psimd_s8 a, psimd_s8 b) {
  459. return a + b;
  460. }
  461. PSIMD_INTRINSIC psimd_u8 psimd_add_u8(psimd_u8 a, psimd_u8 b) {
  462. return a + b;
  463. }
  464. PSIMD_INTRINSIC psimd_s16 psimd_add_s16(psimd_s16 a, psimd_s16 b) {
  465. return a + b;
  466. }
  467. PSIMD_INTRINSIC psimd_u16 psimd_add_u16(psimd_u16 a, psimd_u16 b) {
  468. return a + b;
  469. }
  470. PSIMD_INTRINSIC psimd_s32 psimd_add_s32(psimd_s32 a, psimd_s32 b) {
  471. return a + b;
  472. }
  473. PSIMD_INTRINSIC psimd_u32 psimd_add_u32(psimd_u32 a, psimd_u32 b) {
  474. return a + b;
  475. }
  476. PSIMD_INTRINSIC psimd_f32 psimd_add_f32(psimd_f32 a, psimd_f32 b) {
  477. #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
  478. return (psimd_f32) vaddq_f32((float32x4_t) a, (float32x4_t) b);
  479. #else
  480. return a + b;
  481. #endif
  482. }
  483. /* Vector subtraction */
  484. PSIMD_INTRINSIC psimd_s8 psimd_sub_s8(psimd_s8 a, psimd_s8 b) {
  485. return a - b;
  486. }
  487. PSIMD_INTRINSIC psimd_u8 psimd_sub_u8(psimd_u8 a, psimd_u8 b) {
  488. return a - b;
  489. }
  490. PSIMD_INTRINSIC psimd_s16 psimd_sub_s16(psimd_s16 a, psimd_s16 b) {
  491. return a - b;
  492. }
  493. PSIMD_INTRINSIC psimd_u16 psimd_sub_u16(psimd_u16 a, psimd_u16 b) {
  494. return a - b;
  495. }
  496. PSIMD_INTRINSIC psimd_s32 psimd_sub_s32(psimd_s32 a, psimd_s32 b) {
  497. return a - b;
  498. }
  499. PSIMD_INTRINSIC psimd_u32 psimd_sub_u32(psimd_u32 a, psimd_u32 b) {
  500. return a - b;
  501. }
  502. PSIMD_INTRINSIC psimd_f32 psimd_sub_f32(psimd_f32 a, psimd_f32 b) {
  503. #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
  504. return (psimd_f32) vsubq_f32((float32x4_t) a, (float32x4_t) b);
  505. #else
  506. return a - b;
  507. #endif
  508. }
  509. /* Vector multiplication */
  510. PSIMD_INTRINSIC psimd_s8 psimd_mul_s8(psimd_s8 a, psimd_s8 b) {
  511. return a * b;
  512. }
  513. PSIMD_INTRINSIC psimd_u8 psimd_mul_u8(psimd_u8 a, psimd_u8 b) {
  514. return a * b;
  515. }
  516. PSIMD_INTRINSIC psimd_s16 psimd_mul_s16(psimd_s16 a, psimd_s16 b) {
  517. return a * b;
  518. }
  519. PSIMD_INTRINSIC psimd_u16 psimd_mul_u16(psimd_u16 a, psimd_u16 b) {
  520. return a * b;
  521. }
  522. PSIMD_INTRINSIC psimd_s32 psimd_mul_s32(psimd_s32 a, psimd_s32 b) {
  523. return a * b;
  524. }
  525. PSIMD_INTRINSIC psimd_u32 psimd_mul_u32(psimd_u32 a, psimd_u32 b) {
  526. return a * b;
  527. }
  528. PSIMD_INTRINSIC psimd_f32 psimd_mul_f32(psimd_f32 a, psimd_f32 b) {
  529. #if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
  530. return (psimd_f32) vmulq_f32((float32x4_t) a, (float32x4_t) b);
  531. #else
  532. return a * b;
  533. #endif
  534. }
  535. /* Quasi-Fused Multiply-Add */
  536. PSIMD_INTRINSIC psimd_f32 psimd_qfma_f32(psimd_f32 a, psimd_f32 b, psimd_f32 c) {
  537. #if defined(__aarch64__) || defined(__ARM_NEON__) && defined(__ARM_FEATURE_FMA)
  538. return (psimd_f32) vfmaq_f32((float32x4_t) a, (float32x4_t) b, (float32x4_t) c);
  539. #elif (defined(__x86_64__) || defined(__i386__) || defined(__i686__)) && defined(__FMA__)
  540. return (psimd_f32) _mm_fmadd_ps((__m128) b, (__m128) c, (__m128) a);
  541. #elif (defined(__x86_64__) || defined(__i386__) || defined(__i686__)) && defined(__FMA4__)
  542. return (psimd_f32) _mm_macc_ps((__m128) b, (__m128) c, (__m128) a);
  543. #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) && PSIMD_ENABLE_WASM_QFMA
  544. return (psimd_f32) __builtin_wasm_qfma_f32x4(a, b, c);
  545. #else
  546. return a + b * c;
  547. #endif
  548. }
  549. PSIMD_INTRINSIC psimd_f32 psimd_div_f32(psimd_f32 a, psimd_f32 b) {
  550. return a / b;
  551. }
  552. /* Vector and */
  553. PSIMD_INTRINSIC psimd_f32 psimd_andmask_f32(psimd_s32 mask, psimd_f32 v) {
  554. return (psimd_f32) (mask & (psimd_s32) v);
  555. }
  556. /* Vector and-not */
  557. PSIMD_INTRINSIC psimd_f32 psimd_andnotmask_f32(psimd_s32 mask, psimd_f32 v) {
  558. return (psimd_f32) (~mask & (psimd_s32) v);
  559. }
  560. /* Vector blend */
  561. PSIMD_INTRINSIC psimd_s8 psimd_blend_s8(psimd_s8 mask, psimd_s8 a, psimd_s8 b) {
  562. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  563. return (psimd_s8) vbslq_s8((uint8x16_t) mask, (int8x16_t) a, (int8x16_t) b);
  564. #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
  565. return (psimd_s8) __builtin_wasm_bitselect(a, b, mask);
  566. #else
  567. return (mask & a) | (~mask & b);
  568. #endif
  569. }
  570. PSIMD_INTRINSIC psimd_u8 psimd_blend_u8(psimd_s8 mask, psimd_u8 a, psimd_u8 b) {
  571. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  572. return (psimd_u8) vbslq_u8((uint8x16_t) mask, (uint8x16_t) a, (uint8x16_t) b);
  573. #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
  574. return (psimd_u8) __builtin_wasm_bitselect(a, b, mask);
  575. #else
  576. return (psimd_u8) ((mask & (psimd_s8) a) | (~mask & (psimd_s8) b));
  577. #endif
  578. }
  579. PSIMD_INTRINSIC psimd_s16 psimd_blend_s16(psimd_s16 mask, psimd_s16 a, psimd_s16 b) {
  580. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  581. return (psimd_s16) vbslq_s16((uint16x8_t) mask, (int16x8_t) a, (int16x8_t) b);
  582. #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
  583. return (psimd_s16) __builtin_wasm_bitselect(a, b, mask);
  584. #else
  585. return (mask & a) | (~mask & b);
  586. #endif
  587. }
  588. PSIMD_INTRINSIC psimd_u16 psimd_blend_u16(psimd_s16 mask, psimd_u16 a, psimd_u16 b) {
  589. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  590. return (psimd_u16) vbslq_u16((uint16x8_t) mask, (uint16x8_t) a, (uint16x8_t) b);
  591. #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
  592. return (psimd_u16) __builtin_wasm_bitselect(a, b, mask);
  593. #else
  594. return (psimd_u16) ((mask & (psimd_s16) a) | (~mask & (psimd_s16) b));
  595. #endif
  596. }
  597. PSIMD_INTRINSIC psimd_s32 psimd_blend_s32(psimd_s32 mask, psimd_s32 a, psimd_s32 b) {
  598. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  599. return (psimd_s32) vbslq_s32((uint32x4_t) mask, (int32x4_t) a, (int32x4_t) b);
  600. #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
  601. return (psimd_s32) __builtin_wasm_bitselect(a, b, mask);
  602. #else
  603. return (mask & a) | (~mask & b);
  604. #endif
  605. }
  606. PSIMD_INTRINSIC psimd_u32 psimd_blend_u32(psimd_s32 mask, psimd_u32 a, psimd_u32 b) {
  607. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  608. return (psimd_u32) vbslq_u32((uint32x4_t) mask, (uint32x4_t) a, (uint32x4_t) b);
  609. #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
  610. return (psimd_u32) __builtin_wasm_bitselect(a, b, mask);
  611. #else
  612. return (psimd_u32) ((mask & (psimd_s32) a) | (~mask & (psimd_s32) b));
  613. #endif
  614. }
  615. PSIMD_INTRINSIC psimd_f32 psimd_blend_f32(psimd_s32 mask, psimd_f32 a, psimd_f32 b) {
  616. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  617. return (psimd_f32) vbslq_f32((uint32x4_t) mask, (float32x4_t) a, (float32x4_t) b);
  618. #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
  619. return (psimd_f32) __builtin_wasm_bitselect(a, b, mask);
  620. #else
  621. return (psimd_f32) ((mask & (psimd_s32) a) | (~mask & (psimd_s32) b));
  622. #endif
  623. }
  624. /* Vector blend on sign */
  625. PSIMD_INTRINSIC psimd_s8 psimd_signblend_s8(psimd_s8 x, psimd_s8 a, psimd_s8 b) {
  626. return psimd_blend_s8(x >> psimd_splat_s8(7), a, b);
  627. }
  628. PSIMD_INTRINSIC psimd_u8 psimd_signblend_u8(psimd_s8 x, psimd_u8 a, psimd_u8 b) {
  629. return psimd_blend_u8((x >> psimd_splat_s8(7)), a, b);
  630. }
  631. PSIMD_INTRINSIC psimd_s16 psimd_signblend_s16(psimd_s16 x, psimd_s16 a, psimd_s16 b) {
  632. return psimd_blend_s16(x >> psimd_splat_s16(15), a, b);
  633. }
  634. PSIMD_INTRINSIC psimd_u16 psimd_signblend_u16(psimd_s16 x, psimd_u16 a, psimd_u16 b) {
  635. return psimd_blend_u16((x >> psimd_splat_s16(15)), a, b);
  636. }
  637. PSIMD_INTRINSIC psimd_s32 psimd_signblend_s32(psimd_s32 x, psimd_s32 a, psimd_s32 b) {
  638. return psimd_blend_s32(x >> psimd_splat_s32(31), a, b);
  639. }
  640. PSIMD_INTRINSIC psimd_u32 psimd_signblend_u32(psimd_s32 x, psimd_u32 a, psimd_u32 b) {
  641. return psimd_blend_u32((x >> psimd_splat_s32(31)), a, b);
  642. }
  643. PSIMD_INTRINSIC psimd_f32 psimd_signblend_f32(psimd_f32 x, psimd_f32 a, psimd_f32 b) {
  644. const psimd_s32 mask = (psimd_s32) x >> psimd_splat_s32(31);
  645. return psimd_blend_f32(mask, a, b);
  646. }
  647. /* Vector absolute value */
  648. PSIMD_INTRINSIC psimd_f32 psimd_abs_f32(psimd_f32 v) {
  649. const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
  650. return (psimd_f32) ((psimd_s32) v & ~mask);
  651. }
  652. /* Vector negation */
  653. PSIMD_INTRINSIC psimd_f32 psimd_neg_f32(psimd_f32 v) {
  654. const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
  655. return (psimd_f32) ((psimd_s32) v ^ mask);
  656. }
  657. /* Vector maximum */
  658. PSIMD_INTRINSIC psimd_s8 psimd_max_s8(psimd_s8 a, psimd_s8 b) {
  659. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  660. return (psimd_s8) vmaxq_s8((int8x16_t) a, (int8x16_t) b);
  661. #else
  662. return psimd_blend_s8(a > b, a, b);
  663. #endif
  664. }
  665. PSIMD_INTRINSIC psimd_u8 psimd_max_u8(psimd_u8 a, psimd_u8 b) {
  666. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  667. return (psimd_u8) vmaxq_u8((uint8x16_t) a, (uint8x16_t) b);
  668. #else
  669. return psimd_blend_u8(a > b, a, b);
  670. #endif
  671. }
  672. PSIMD_INTRINSIC psimd_s16 psimd_max_s16(psimd_s16 a, psimd_s16 b) {
  673. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  674. return (psimd_s16) vmaxq_s16((int16x8_t) a, (int16x8_t) b);
  675. #else
  676. return psimd_blend_s16(a > b, a, b);
  677. #endif
  678. }
  679. PSIMD_INTRINSIC psimd_u16 psimd_max_u16(psimd_u16 a, psimd_u16 b) {
  680. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  681. return (psimd_u16) vmaxq_u16((uint16x8_t) a, (uint16x8_t) b);
  682. #else
  683. return psimd_blend_u16(a > b, a, b);
  684. #endif
  685. }
  686. PSIMD_INTRINSIC psimd_s32 psimd_max_s32(psimd_s32 a, psimd_s32 b) {
  687. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  688. return (psimd_s32) vmaxq_s32((int32x4_t) a, (int32x4_t) b);
  689. #else
  690. return psimd_blend_s32(a > b, a, b);
  691. #endif
  692. }
  693. PSIMD_INTRINSIC psimd_u32 psimd_max_u32(psimd_u32 a, psimd_u32 b) {
  694. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  695. return (psimd_u32) vmaxq_u32((uint32x4_t) a, (uint32x4_t) b);
  696. #else
  697. return psimd_blend_u32(a > b, a, b);
  698. #endif
  699. }
  700. PSIMD_INTRINSIC psimd_f32 psimd_max_f32(psimd_f32 a, psimd_f32 b) {
  701. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  702. return (psimd_f32) vmaxq_f32((float32x4_t) a, (float32x4_t) b);
  703. #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
  704. return __builtin_wasm_max_f32x4(a, b);
  705. #else
  706. return psimd_blend_f32(a > b, a, b);
  707. #endif
  708. }
  709. /* Vector minimum */
  710. PSIMD_INTRINSIC psimd_s8 psimd_min_s8(psimd_s8 a, psimd_s8 b) {
  711. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  712. return (psimd_s8) vminq_s8((int8x16_t) a, (int8x16_t) b);
  713. #else
  714. return psimd_blend_s8(a < b, a, b);
  715. #endif
  716. }
  717. PSIMD_INTRINSIC psimd_u8 psimd_min_u8(psimd_u8 a, psimd_u8 b) {
  718. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  719. return (psimd_u8) vminq_u8((uint8x16_t) a, (uint8x16_t) b);
  720. #else
  721. return psimd_blend_u8(a < b, a, b);
  722. #endif
  723. }
  724. PSIMD_INTRINSIC psimd_s16 psimd_min_s16(psimd_s16 a, psimd_s16 b) {
  725. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  726. return (psimd_s16) vminq_s16((int16x8_t) a, (int16x8_t) b);
  727. #else
  728. return psimd_blend_s16(a < b, a, b);
  729. #endif
  730. }
  731. PSIMD_INTRINSIC psimd_u16 psimd_min_u16(psimd_u16 a, psimd_u16 b) {
  732. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  733. return (psimd_u16) vminq_u16((uint16x8_t) a, (uint16x8_t) b);
  734. #else
  735. return psimd_blend_u16(a < b, a, b);
  736. #endif
  737. }
  738. PSIMD_INTRINSIC psimd_s32 psimd_min_s32(psimd_s32 a, psimd_s32 b) {
  739. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  740. return (psimd_s32) vminq_s32((int32x4_t) a, (int32x4_t) b);
  741. #else
  742. return psimd_blend_s32(a < b, a, b);
  743. #endif
  744. }
  745. PSIMD_INTRINSIC psimd_u32 psimd_min_u32(psimd_u32 a, psimd_u32 b) {
  746. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  747. return (psimd_u32) vminq_u32((uint32x4_t) a, (uint32x4_t) b);
  748. #else
  749. return psimd_blend_u32(a < b, a, b);
  750. #endif
  751. }
  752. PSIMD_INTRINSIC psimd_f32 psimd_min_f32(psimd_f32 a, psimd_f32 b) {
  753. #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  754. return (psimd_f32) vminq_f32((float32x4_t) a, (float32x4_t) b);
  755. #elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
  756. return __builtin_wasm_min_f32x4(a, b);
  757. #else
  758. return psimd_blend_f32(a < b, a, b);
  759. #endif
  760. }
  761. PSIMD_INTRINSIC psimd_f32 psimd_cvt_s32_f32(psimd_s32 v) {
  762. #if defined(__clang__)
  763. return __builtin_convertvector(v, psimd_f32);
  764. #elif defined(__ARM_NEON__) || defined(__ARM_NEON)
  765. return (psimd_f32) vcvtq_f32_s32((int32x4_t) v);
  766. #elif defined(__SSE2__)
  767. return (psimd_f32) _mm_cvtepi32_ps((__m128i) v);
  768. #else
  769. return (psimd_f32) { (float) v[0], (float) v[1], (float) v[2], (float) v[3] };
  770. #endif
  771. }
  772. /* Broadcast vector element */
  773. #if defined(__clang__)
  774. PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
  775. return __builtin_shufflevector(v, v, 0, 0, 0, 0);
  776. }
  777. PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
  778. return __builtin_shufflevector(v, v, 1, 1, 1, 1);
  779. }
  780. PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
  781. return __builtin_shufflevector(v, v, 2, 2, 2, 2);
  782. }
  783. PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
  784. return __builtin_shufflevector(v, v, 3, 3, 3, 3);
  785. }
  786. #else
  787. PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
  788. return __builtin_shuffle(v, (psimd_s32) { 0, 0, 0, 0 });
  789. }
  790. PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
  791. return __builtin_shuffle(v, (psimd_s32) { 1, 1, 1, 1 });
  792. }
  793. PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
  794. return __builtin_shuffle(v, (psimd_s32) { 2, 2, 2, 2 });
  795. }
  796. PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
  797. return __builtin_shuffle(v, (psimd_s32) { 3, 3, 3, 3 });
  798. }
  799. #endif
  800. /* Reversal of vector elements */
  801. #if defined(__clang__)
  802. PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) {
  803. return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  804. }
  805. PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) {
  806. return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  807. }
  808. PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) {
  809. return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
  810. }
  811. PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) {
  812. return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
  813. }
  814. PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) {
  815. return __builtin_shufflevector(v, v, 3, 2, 1, 0);
  816. }
  817. PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) {
  818. return __builtin_shufflevector(v, v, 3, 2, 1, 0);
  819. }
  820. PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) {
  821. return __builtin_shufflevector(v, v, 3, 2, 1, 0);
  822. }
  823. #else
  824. PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) {
  825. return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 });
  826. }
  827. PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) {
  828. return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 });
  829. }
  830. PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) {
  831. return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 });
  832. }
  833. PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) {
  834. return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 });
  835. }
  836. PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) {
  837. return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
  838. }
  839. PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) {
  840. return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
  841. }
  842. PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) {
  843. return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
  844. }
  845. #endif
  846. /* Interleaving of vector elements */
  847. #if defined(__clang__)
  848. PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) {
  849. return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
  850. }
  851. PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) {
  852. return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
  853. }
  854. PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) {
  855. return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
  856. }
  857. PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) {
  858. return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
  859. }
  860. PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) {
  861. return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
  862. }
  863. PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) {
  864. return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
  865. }
  866. PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) {
  867. return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
  868. }
  869. PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) {
  870. return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
  871. }
  872. PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
  873. return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
  874. }
  875. PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
  876. return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
  877. }
  878. #else
  879. PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) {
  880. return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
  881. }
  882. PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) {
  883. return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
  884. }
  885. PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) {
  886. return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
  887. }
  888. PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) {
  889. return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
  890. }
  891. PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) {
  892. return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
  893. }
  894. PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) {
  895. return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
  896. }
  897. PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) {
  898. return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
  899. }
  900. PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) {
  901. return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
  902. }
  903. PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
  904. return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
  905. }
  906. PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
  907. return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
  908. }
  909. #endif
  910. /* Concatenation of low/high vector elements */
  911. #if defined(__clang__)
  912. PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) {
  913. return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3);
  914. }
  915. PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) {
  916. return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7);
  917. }
  918. PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) {
  919. return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3);
  920. }
  921. PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) {
  922. return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7);
  923. }
  924. PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) {
  925. return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
  926. }
  927. PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) {
  928. return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
  929. }
  930. PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) {
  931. return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
  932. }
  933. PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) {
  934. return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
  935. }
  936. PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
  937. return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
  938. }
  939. PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
  940. return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
  941. }
  942. #else
  943. PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) {
  944. return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 });
  945. }
  946. PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) {
  947. return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 });
  948. }
  949. PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) {
  950. return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 });
  951. }
  952. PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) {
  953. return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 });
  954. }
  955. PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) {
  956. return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
  957. }
  958. PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) {
  959. return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
  960. }
  961. PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) {
  962. return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
  963. }
  964. PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) {
  965. return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
  966. }
  967. PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
  968. return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
  969. }
  970. PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
  971. return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
  972. }
  973. #endif
  974. /* Concatenation of even/odd vector elements */
  975. #if defined(__clang__)
  976. PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) {
  977. return __builtin_shufflevector(a, b,
  978. 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14);
  979. }
  980. PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) {
  981. return __builtin_shufflevector(a, b,
  982. 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15);
  983. }
  984. PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) {
  985. return __builtin_shufflevector(a, b,
  986. 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14);
  987. }
  988. PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) {
  989. return __builtin_shufflevector(a, b,
  990. 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15);
  991. }
  992. PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) {
  993. return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6);
  994. }
  995. PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) {
  996. return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7);
  997. }
  998. PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) {
  999. return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6);
  1000. }
  1001. PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) {
  1002. return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7);
  1003. }
  1004. PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) {
  1005. return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
  1006. }
  1007. PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) {
  1008. return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
  1009. }
  1010. PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) {
  1011. return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
  1012. }
  1013. PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) {
  1014. return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
  1015. }
  1016. PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) {
  1017. return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
  1018. }
  1019. PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) {
  1020. return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
  1021. }
  1022. #else
  1023. PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) {
  1024. return __builtin_shuffle(a, b,
  1025. (psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 });
  1026. }
  1027. PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) {
  1028. return __builtin_shuffle(a, b,
  1029. (psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 });
  1030. }
  1031. PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) {
  1032. return __builtin_shuffle(a, b,
  1033. (psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 });
  1034. }
  1035. PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) {
  1036. return __builtin_shuffle(a, b,
  1037. (psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 });
  1038. }
  1039. PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) {
  1040. return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 });
  1041. }
  1042. PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) {
  1043. return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 });
  1044. }
  1045. PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) {
  1046. return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 });
  1047. }
  1048. PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) {
  1049. return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 });
  1050. }
  1051. PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) {
  1052. return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
  1053. }
  1054. PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) {
  1055. return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
  1056. }
  1057. PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) {
  1058. return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
  1059. }
  1060. PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) {
  1061. return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
  1062. }
  1063. PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) {
  1064. return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
  1065. }
  1066. PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) {
  1067. return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
  1068. }
  1069. #endif
  1070. /* Vector reduce */
  1071. #if defined(__clang__)
  1072. PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
  1073. const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, 0, 1);
  1074. return temp + __builtin_shufflevector(temp, temp, 1, 0, 3, 2);
  1075. }
  1076. PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
  1077. const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
  1078. return psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
  1079. }
  1080. PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
  1081. const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
  1082. return psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
  1083. }
  1084. PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
  1085. const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, -1, -1);
  1086. const psimd_f32 result = temp + __builtin_shufflevector(temp, temp, 1, -1, -1, -1);
  1087. return result[0];
  1088. }
  1089. PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
  1090. const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
  1091. const psimd_f32 result = psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
  1092. return result[0];
  1093. }
  1094. PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
  1095. const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
  1096. const psimd_f32 result = psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
  1097. return result[0];
  1098. }
  1099. #else
  1100. PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
  1101. const psimd_f32 temp = v + __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 });
  1102. return temp + __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 });
  1103. }
  1104. PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
  1105. const psimd_f32 temp = psimd_max_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
  1106. return psimd_max_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
  1107. }
  1108. PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
  1109. const psimd_f32 temp = psimd_min_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
  1110. return psimd_min_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
  1111. }
  1112. PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
  1113. const psimd_f32 result = psimd_allreduce_sum_f32(v);
  1114. return result[0];
  1115. }
  1116. PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
  1117. const psimd_f32 result = psimd_allreduce_max_f32(v);
  1118. return result[0];
  1119. }
  1120. PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
  1121. const psimd_f32 result = psimd_allreduce_min_f32(v);
  1122. return result[0];
  1123. }
  1124. #endif
  1125. #endif
  1126. #endif /* PSIMD_H */